summaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/6lowpan/iphc.c1
-rw-r--r--net/8021q/Makefile1
-rw-r--r--net/8021q/vlan.c13
-rw-r--r--net/8021q/vlan_dev.c3
-rw-r--r--net/9p/client.c119
-rw-r--r--net/9p/mod.c4
-rw-r--r--net/9p/protocol.c2
-rw-r--r--net/9p/trans_fd.c22
-rw-r--r--net/9p/trans_rdma.c12
-rw-r--r--net/9p/trans_virtio.c66
-rw-r--r--net/9p/trans_xen.c3
-rw-r--r--net/9p/util.c1
-rw-r--r--net/Kconfig13
-rw-r--r--net/atm/common.c2
-rw-r--r--net/atm/mpoa_proc.c6
-rw-r--r--net/ax25/ax25_addr.c1
-rw-r--r--net/ax25/ax25_ds_in.c1
-rw-r--r--net/ax25/ax25_ds_subr.c1
-rw-r--r--net/ax25/ax25_ip.c1
-rw-r--r--net/ax25/ax25_out.c1
-rw-r--r--net/batman-adv/Kconfig19
-rw-r--r--net/batman-adv/Makefile3
-rw-r--r--net/batman-adv/bat_iv_ogm.c330
-rw-r--r--net/batman-adv/bat_iv_ogm.h6
-rw-r--r--net/batman-adv/bat_v_elp.c10
-rw-r--r--net/batman-adv/bat_v_ogm.h6
-rw-r--r--net/batman-adv/bridge_loop_avoidance.c12
-rw-r--r--net/batman-adv/debugfs.c39
-rw-r--r--net/batman-adv/debugfs.h6
-rw-r--r--net/batman-adv/gateway_client.c11
-rw-r--r--net/batman-adv/hard-interface.c47
-rw-r--r--net/batman-adv/icmp_socket.c3
-rw-r--r--net/batman-adv/log.c20
-rw-r--r--net/batman-adv/main.h2
-rw-r--r--net/batman-adv/network-coding.c27
-rw-r--r--net/batman-adv/originator.c124
-rw-r--r--net/batman-adv/originator.h4
-rw-r--r--net/batman-adv/soft-interface.c27
-rw-r--r--net/batman-adv/sysfs.c30
-rw-r--r--net/batman-adv/trace.c22
-rw-r--r--net/batman-adv/trace.h78
-rw-r--r--net/batman-adv/translation-table.c6
-rw-r--r--net/batman-adv/tvlv.c8
-rw-r--r--net/batman-adv/types.h69
-rw-r--r--net/bluetooth/af_bluetooth.c2
-rw-r--r--net/bluetooth/bnep/core.c7
-rw-r--r--net/bluetooth/cmtp/core.c14
-rw-r--r--net/bluetooth/hci_conn.c189
-rw-r--r--net/bluetooth/hci_core.c170
-rw-r--r--net/bluetooth/hci_debugfs.c19
-rw-r--r--net/bluetooth/hci_event.c660
-rw-r--r--net/bluetooth/hci_request.c616
-rw-r--r--net/bluetooth/hci_request.h8
-rw-r--r--net/bluetooth/hidp/core.c19
-rw-r--r--net/bluetooth/l2cap_core.c102
-rw-r--r--net/bluetooth/leds.c6
-rw-r--r--net/bluetooth/mgmt.c409
-rw-r--r--net/bluetooth/rfcomm/tty.c12
-rw-r--r--net/bluetooth/sco.c3
-rw-r--r--net/bluetooth/smp.c68
-rw-r--r--net/bluetooth/smp.h3
-rw-r--r--net/bpf/test_run.c36
-rw-r--r--net/bpfilter/Kconfig1
-rw-r--r--net/bpfilter/Makefile4
-rw-r--r--net/bpfilter/bpfilter_kern.c11
-rw-r--r--net/bridge/Kconfig2
-rw-r--r--net/bridge/br.c20
-rw-r--r--net/bridge/br_arp_nd_proxy.c15
-rw-r--r--net/bridge/br_device.c8
-rw-r--r--net/bridge/br_fdb.c24
-rw-r--r--net/bridge/br_forward.c16
-rw-r--r--net/bridge/br_if.c71
-rw-r--r--net/bridge/br_input.c2
-rw-r--r--net/bridge/br_mdb.c36
-rw-r--r--net/bridge/br_multicast.c67
-rw-r--r--net/bridge/br_netfilter_hooks.c11
-rw-r--r--net/bridge/br_netlink.c73
-rw-r--r--net/bridge/br_private.h76
-rw-r--r--net/bridge/br_switchdev.c9
-rw-r--r--net/bridge/br_sysfs_br.c49
-rw-r--r--net/bridge/br_sysfs_if.c94
-rw-r--r--net/bridge/br_vlan.c88
-rw-r--r--net/bridge/netfilter/ebtable_filter.c1
-rw-r--r--net/bridge/netfilter/ebtable_nat.c1
-rw-r--r--net/bridge/netfilter/nft_reject_bridge.c3
-rw-r--r--net/caif/caif_socket.c2
-rw-r--r--net/caif/cfrfml.c3
-rw-r--r--net/ceph/Kconfig1
-rw-r--r--net/ceph/Makefile1
-rw-r--r--net/ceph/auth.c16
-rw-r--r--net/ceph/auth_none.c1
-rw-r--r--net/ceph/auth_none.h1
-rw-r--r--net/ceph/auth_x.c239
-rw-r--r--net/ceph/auth_x.h3
-rw-r--r--net/ceph/auth_x_protocol.h7
-rw-r--r--net/ceph/ceph_common.c13
-rw-r--r--net/ceph/cls_lock_client.c4
-rw-r--r--net/ceph/crush/mapper.c4
-rw-r--r--net/ceph/messenger.c113
-rw-r--r--net/ceph/mon_client.c2
-rw-r--r--net/ceph/osd_client.c27
-rw-r--r--net/ceph/pagevec.c1
-rw-r--r--net/compat.c16
-rw-r--r--net/core/Makefile2
-rw-r--r--net/core/datagram.c2
-rw-r--r--net/core/dev.c935
-rw-r--r--net/core/dev_ioctl.c7
-rw-r--r--net/core/devlink.c1379
-rw-r--r--net/core/dst.c1
-rw-r--r--net/core/ethtool.c203
-rw-r--r--net/core/fib_rules.c39
-rw-r--r--net/core/filter.c1431
-rw-r--r--net/core/flow_dissector.c215
-rw-r--r--net/core/gen_estimator.c21
-rw-r--r--net/core/gen_stats.c73
-rw-r--r--net/core/link_watch.c2
-rw-r--r--net/core/lwt_bpf.c2
-rw-r--r--net/core/neighbour.c220
-rw-r--r--net/core/net-sysfs.c159
-rw-r--r--net/core/net_namespace.c50
-rw-r--r--net/core/netpoll.c60
-rw-r--r--net/core/pktgen.c14
-rw-r--r--net/core/rtnetlink.c464
-rw-r--r--net/core/secure_seq.c1
-rw-r--r--net/core/skbuff.c96
-rw-r--r--net/core/skmsg.c802
-rw-r--r--net/core/sock.c180
-rw-r--r--net/core/sock_diag.c2
-rw-r--r--net/core/sock_map.c1003
-rw-r--r--net/core/sock_reuseport.c92
-rw-r--r--net/core/utils.c2
-rw-r--r--net/core/xdp.c112
-rw-r--r--net/dcb/dcbnl.c97
-rw-r--r--net/dccp/input.c4
-rw-r--r--net/dccp/ipv4.c4
-rw-r--r--net/dccp/proto.c2
-rw-r--r--net/decnet/Kconfig1
-rw-r--r--net/decnet/Makefile1
-rw-r--r--net/decnet/TODO5
-rw-r--r--net/decnet/dn_dev.c2
-rw-r--r--net/decnet/dn_fib.c2
-rw-r--r--net/decnet/dn_nsp_in.c2
-rw-r--r--net/decnet/dn_nsp_out.c1
-rw-r--r--net/decnet/dn_route.c5
-rw-r--r--net/decnet/dn_rules.c2
-rw-r--r--net/decnet/netfilter/Makefile1
-rw-r--r--net/decnet/netfilter/dn_rtmsg.c1
-rw-r--r--net/dns_resolver/dns_key.c68
-rw-r--r--net/dns_resolver/dns_query.c5
-rw-r--r--net/dsa/Kconfig3
-rw-r--r--net/dsa/Makefile1
-rw-r--r--net/dsa/dsa.c51
-rw-r--r--net/dsa/dsa2.c14
-rw-r--r--net/dsa/dsa_priv.h4
-rw-r--r--net/dsa/legacy.c9
-rw-r--r--net/dsa/slave.c37
-rw-r--r--net/dsa/switch.c22
-rw-r--r--net/dsa/tag_gswip.c109
-rw-r--r--net/ethernet/eth.c12
-rw-r--r--net/ieee802154/6lowpan/reassembly.c10
-rw-r--r--net/ieee802154/6lowpan/tx.c21
-rw-r--r--net/ieee802154/core.c1
-rw-r--r--net/ieee802154/nl_policy.c1
-rw-r--r--net/ieee802154/socket.c17
-rw-r--r--net/ipv4/Kconfig4
-rw-r--r--net/ipv4/Makefile3
-rw-r--r--net/ipv4/af_inet.c19
-rw-r--r--net/ipv4/ah4.c4
-rw-r--r--net/ipv4/arp.c2
-rw-r--r--net/ipv4/bpfilter/Makefile1
-rw-r--r--net/ipv4/cipso_ipv4.c11
-rw-r--r--net/ipv4/datagram.c2
-rw-r--r--net/ipv4/devinet.c213
-rw-r--r--net/ipv4/esp4.c11
-rw-r--r--net/ipv4/esp4_offload.c10
-rw-r--r--net/ipv4/fib_frontend.c154
-rw-r--r--net/ipv4/fib_semantics.c87
-rw-r--r--net/ipv4/fib_trie.c37
-rw-r--r--net/ipv4/fou.c20
-rw-r--r--net/ipv4/gre_demux.c7
-rw-r--r--net/ipv4/gre_offload.c8
-rw-r--r--net/ipv4/icmp.c13
-rw-r--r--net/ipv4/igmp.c21
-rw-r--r--net/ipv4/inet_connection_sock.c14
-rw-r--r--net/ipv4/inet_fragment.c17
-rw-r--r--net/ipv4/inet_hashtables.c19
-rw-r--r--net/ipv4/ip_forward.c3
-rw-r--r--net/ipv4/ip_fragment.c376
-rw-r--r--net/ipv4/ip_gre.c30
-rw-r--r--net/ipv4/ip_input.c143
-rw-r--r--net/ipv4/ip_output.c26
-rw-r--r--net/ipv4/ip_sockglue.c3
-rw-r--r--net/ipv4/ip_tunnel.c9
-rw-r--r--net/ipv4/ip_vti.c7
-rw-r--r--net/ipv4/ipcomp.c4
-rw-r--r--net/ipv4/ipip.c5
-rw-r--r--net/ipv4/ipmr.c79
-rw-r--r--net/ipv4/ipmr_base.c126
-rw-r--r--net/ipv4/metrics.c30
-rw-r--r--net/ipv4/netfilter.c53
-rw-r--r--net/ipv4/netfilter/Kconfig30
-rw-r--r--net/ipv4/netfilter/Makefile6
-rw-r--r--net/ipv4/netfilter/ipt_rpfilter.c17
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c472
-rw-r--r--net/ipv4/netfilter/nf_log_ipv4.c8
-rw-r--r--net/ipv4/netfilter/nf_nat_l3proto_ipv4.c1
-rw-r--r--net/ipv4/netfilter/nf_nat_masquerade_ipv4.c22
-rw-r--r--net/ipv4/netfilter/nf_nat_snmp_basic_main.c1
-rw-r--r--net/ipv4/netfilter/nft_fib_ipv4.c27
-rw-r--r--net/ipv4/ping.c18
-rw-r--r--net/ipv4/proc.c3
-rw-r--r--net/ipv4/raw.c13
-rw-r--r--net/ipv4/route.c61
-rw-r--r--net/ipv4/syncookies.c2
-rw-r--r--net/ipv4/sysctl_net_ipv4.c32
-rw-r--r--net/ipv4/tcp.c131
-rw-r--r--net/ipv4/tcp_bbr.c134
-rw-r--r--net/ipv4/tcp_bpf.c668
-rw-r--r--net/ipv4/tcp_cdg.c2
-rw-r--r--net/ipv4/tcp_dctcp.c55
-rw-r--r--net/ipv4/tcp_dctcp.h40
-rw-r--r--net/ipv4/tcp_input.c135
-rw-r--r--net/ipv4/tcp_ipv4.c17
-rw-r--r--net/ipv4/tcp_minisocks.c218
-rw-r--r--net/ipv4/tcp_offload.c17
-rw-r--r--net/ipv4/tcp_output.c181
-rw-r--r--net/ipv4/tcp_rate.c19
-rw-r--r--net/ipv4/tcp_recovery.c7
-rw-r--r--net/ipv4/tcp_timer.c53
-rw-r--r--net/ipv4/tcp_ulp.c79
-rw-r--r--net/ipv4/udp.c75
-rw-r--r--net/ipv4/udp_offload.c15
-rw-r--r--net/ipv4/xfrm4_input.c1
-rw-r--r--net/ipv4/xfrm4_mode_transport.c4
-rw-r--r--net/ipv6/Kconfig2
-rw-r--r--net/ipv6/addrconf.c342
-rw-r--r--net/ipv6/addrlabel.c34
-rw-r--r--net/ipv6/af_inet6.c26
-rw-r--r--net/ipv6/datagram.c6
-rw-r--r--net/ipv6/esp6.c7
-rw-r--r--net/ipv6/esp6_offload.c10
-rw-r--r--net/ipv6/icmp.c32
-rw-r--r--net/ipv6/ila/Makefile2
-rw-r--r--net/ipv6/ila/ila.h27
-rw-r--r--net/ipv6/ila/ila_common.c31
-rw-r--r--net/ipv6/ila/ila_main.c121
-rw-r--r--net/ipv6/ila/ila_xlat.c292
-rw-r--r--net/ipv6/inet6_hashtables.c14
-rw-r--r--net/ipv6/ip6_fib.c77
-rw-r--r--net/ipv6/ip6_flowlabel.c3
-rw-r--r--net/ipv6/ip6_gre.c35
-rw-r--r--net/ipv6/ip6_input.c134
-rw-r--r--net/ipv6/ip6_offload.c17
-rw-r--r--net/ipv6/ip6_output.c46
-rw-r--r--net/ipv6/ip6_tunnel.c33
-rw-r--r--net/ipv6/ip6_vti.c21
-rw-r--r--net/ipv6/ip6mr.c75
-rw-r--r--net/ipv6/ipv6_sockglue.c14
-rw-r--r--net/ipv6/mcast.c26
-rw-r--r--net/ipv6/ndisc.c4
-rw-r--r--net/ipv6/netfilter.c62
-rw-r--r--net/ipv6/netfilter/Kconfig27
-rw-r--r--net/ipv6/netfilter/Makefile6
-rw-r--r--net/ipv6/netfilter/ip6t_ipv6header.c5
-rw-r--r--net/ipv6/netfilter/ip6t_rpfilter.c12
-rw-r--r--net/ipv6/netfilter/ip6t_rt.c10
-rw-r--r--net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c460
-rw-r--r--net/ipv6/netfilter/nf_conntrack_reasm.c25
-rw-r--r--net/ipv6/netfilter/nf_defrag_ipv6_hooks.c4
-rw-r--r--net/ipv6/netfilter/nf_log_ipv6.c8
-rw-r--r--net/ipv6/netfilter/nf_nat_masquerade_ipv6.c19
-rw-r--r--net/ipv6/ping.c7
-rw-r--r--net/ipv6/raw.c47
-rw-r--r--net/ipv6/reassembly.c111
-rw-r--r--net/ipv6/route.c306
-rw-r--r--net/ipv6/seg6.c1
-rw-r--r--net/ipv6/seg6_hmac.c1
-rw-r--r--net/ipv6/seg6_local.c54
-rw-r--r--net/ipv6/sit.c6
-rw-r--r--net/ipv6/tcpv6_offload.c4
-rw-r--r--net/ipv6/udp.c82
-rw-r--r--net/ipv6/udp_offload.c6
-rw-r--r--net/ipv6/xfrm6_input.c1
-rw-r--r--net/ipv6/xfrm6_mode_ro.c2
-rw-r--r--net/ipv6/xfrm6_mode_transport.c4
-rw-r--r--net/ipv6/xfrm6_output.c2
-rw-r--r--net/ipv6/xfrm6_policy.c4
-rw-r--r--net/iucv/af_iucv.c88
-rw-r--r--net/iucv/iucv.c2
-rw-r--r--net/kcm/Kconfig1
-rw-r--r--net/kcm/kcmsock.c1
-rw-r--r--net/key/af_key.c6
-rw-r--r--net/l2tp/l2tp_core.c86
-rw-r--r--net/l2tp/l2tp_core.h73
-rw-r--r--net/l2tp/l2tp_debugfs.c8
-rw-r--r--net/l2tp/l2tp_eth.c32
-rw-r--r--net/l2tp/l2tp_ip.c4
-rw-r--r--net/l2tp/l2tp_ip6.c15
-rw-r--r--net/l2tp/l2tp_netlink.c37
-rw-r--r--net/l2tp/l2tp_ppp.c560
-rw-r--r--net/llc/Kconfig2
-rw-r--r--net/llc/Makefile2
-rw-r--r--net/llc/af_llc.c11
-rw-r--r--net/llc/llc_conn.c1
-rw-r--r--net/llc/llc_core.c4
-rw-r--r--net/llc/llc_if.c1
-rw-r--r--net/mac80211/Kconfig17
-rw-r--r--net/mac80211/Makefile12
-rw-r--r--net/mac80211/agg-rx.c10
-rw-r--r--net/mac80211/agg-tx.c19
-rw-r--r--net/mac80211/cfg.c153
-rw-r--r--net/mac80211/debugfs.c4
-rw-r--r--net/mac80211/debugfs_sta.c364
-rw-r--r--net/mac80211/driver-ops.h26
-rw-r--r--net/mac80211/ethtool.c6
-rw-r--r--net/mac80211/he.c55
-rw-r--r--net/mac80211/ht.c2
-rw-r--r--net/mac80211/ibss.c26
-rw-r--r--net/mac80211/ieee80211_i.h58
-rw-r--r--net/mac80211/iface.c7
-rw-r--r--net/mac80211/key.c131
-rw-r--r--net/mac80211/led.c20
-rw-r--r--net/mac80211/main.c142
-rw-r--r--net/mac80211/mesh.c5
-rw-r--r--net/mac80211/mesh.h3
-rw-r--r--net/mac80211/mesh_hwmp.c13
-rw-r--r--net/mac80211/mlme.c488
-rw-r--r--net/mac80211/offchannel.c2
-rw-r--r--net/mac80211/rate.h13
-rw-r--r--net/mac80211/rc80211_minstrel.c163
-rw-r--r--net/mac80211/rc80211_minstrel.h35
-rw-r--r--net/mac80211/rc80211_minstrel_debugfs.c68
-rw-r--r--net/mac80211/rc80211_minstrel_ht.c298
-rw-r--r--net/mac80211/rc80211_minstrel_ht.h20
-rw-r--r--net/mac80211/rc80211_minstrel_ht_debugfs.c58
-rw-r--r--net/mac80211/rx.c183
-rw-r--r--net/mac80211/scan.c56
-rw-r--r--net/mac80211/spectmgmt.c5
-rw-r--r--net/mac80211/sta_info.c128
-rw-r--r--net/mac80211/sta_info.h20
-rw-r--r--net/mac80211/status.c30
-rw-r--r--net/mac80211/tdls.c8
-rw-r--r--net/mac80211/trace.h25
-rw-r--r--net/mac80211/tx.c156
-rw-r--r--net/mac80211/util.c336
-rw-r--r--net/mac80211/vht.c20
-rw-r--r--net/mac802154/tx.c15
-rw-r--r--net/mpls/af_mpls.c138
-rw-r--r--net/mpls/mpls_iptunnel.c2
-rw-r--r--net/ncsi/Kconfig6
-rw-r--r--net/ncsi/internal.h21
-rw-r--r--net/ncsi/ncsi-cmd.c38
-rw-r--r--net/ncsi/ncsi-manage.c98
-rw-r--r--net/ncsi/ncsi-netlink.c209
-rw-r--r--net/ncsi/ncsi-netlink.h12
-rw-r--r--net/ncsi/ncsi-pkt.h22
-rw-r--r--net/ncsi/ncsi-rsp.c150
-rw-r--r--net/netfilter/Kconfig76
-rw-r--r--net/netfilter/Makefile13
-rw-r--r--net/netfilter/core.c15
-rw-r--r--net/netfilter/ipset/ip_set_hash_gen.h2
-rw-r--r--net/netfilter/ipvs/Kconfig8
-rw-r--r--net/netfilter/ipvs/ip_vs_conn.c89
-rw-r--r--net/netfilter/ipvs/ip_vs_core.c18
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_mh.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_proto.c19
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_sctp.c2
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_tcp.c2
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_udp.c2
-rw-r--r--net/netfilter/ipvs/ip_vs_sync.c18
-rw-r--r--net/netfilter/nf_conncount.c386
-rw-r--r--net/netfilter/nf_conntrack_broadcast.c2
-rw-r--r--net/netfilter/nf_conntrack_core.c410
-rw-r--r--net/netfilter/nf_conntrack_expect.c6
-rw-r--r--net/netfilter/nf_conntrack_helper.c10
-rw-r--r--net/netfilter/nf_conntrack_l3proto_generic.c66
-rw-r--r--net/netfilter/nf_conntrack_netlink.c193
-rw-r--r--net/netfilter/nf_conntrack_proto.c938
-rw-r--r--net/netfilter/nf_conntrack_proto_dccp.c202
-rw-r--r--net/netfilter/nf_conntrack_proto_generic.c60
-rw-r--r--net/netfilter/nf_conntrack_proto_gre.c60
-rw-r--r--net/netfilter/nf_conntrack_proto_icmp.c (renamed from net/ipv4/netfilter/nf_conntrack_proto_icmp.c)91
-rw-r--r--net/netfilter/nf_conntrack_proto_icmpv6.c (renamed from net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c)101
-rw-r--r--net/netfilter/nf_conntrack_proto_sctp.c302
-rw-r--r--net/netfilter/nf_conntrack_proto_tcp.c310
-rw-r--r--net/netfilter/nf_conntrack_proto_udp.c288
-rw-r--r--net/netfilter/nf_conntrack_standalone.c37
-rw-r--r--net/netfilter/nf_conntrack_timeout.c21
-rw-r--r--net/netfilter/nf_flow_table_core.c65
-rw-r--r--net/netfilter/nf_flow_table_ip.c6
-rw-r--r--net/netfilter/nf_log_common.c5
-rw-r--r--net/netfilter/nf_nat_core.c18
-rw-r--r--net/netfilter/nf_nat_helper.c4
-rw-r--r--net/netfilter/nf_nat_redirect.c4
-rw-r--r--net/netfilter/nf_osf.c218
-rw-r--r--net/netfilter/nf_tables_api.c364
-rw-r--r--net/netfilter/nf_tables_core.c44
-rw-r--r--net/netfilter/nfnetlink.c23
-rw-r--r--net/netfilter/nfnetlink_acct.c29
-rw-r--r--net/netfilter/nfnetlink_cttimeout.c135
-rw-r--r--net/netfilter/nfnetlink_osf.c432
-rw-r--r--net/netfilter/nfnetlink_queue.c3
-rw-r--r--net/netfilter/nft_chain_filter.c18
-rw-r--r--net/netfilter/nft_cmp.c6
-rw-r--r--net/netfilter/nft_compat.c24
-rw-r--r--net/netfilter/nft_connlimit.c36
-rw-r--r--net/netfilter/nft_ct.c234
-rw-r--r--net/netfilter/nft_dup_netdev.c2
-rw-r--r--net/netfilter/nft_dynset.c25
-rw-r--r--net/netfilter/nft_flow_offload.c2
-rw-r--r--net/netfilter/nft_fwd_netdev.c4
-rw-r--r--net/netfilter/nft_lookup.c26
-rw-r--r--net/netfilter/nft_meta.c131
-rw-r--r--net/netfilter/nft_numgen.c4
-rw-r--r--net/netfilter/nft_objref.c20
-rw-r--r--net/netfilter/nft_osf.c127
-rw-r--r--net/netfilter/nft_reject.c6
-rw-r--r--net/netfilter/nft_rt.c11
-rw-r--r--net/netfilter/nft_set_bitmap.c6
-rw-r--r--net/netfilter/nft_set_hash.c46
-rw-r--r--net/netfilter/nft_set_rbtree.c42
-rw-r--r--net/netfilter/nft_socket.c22
-rw-r--r--net/netfilter/nft_tproxy.c318
-rw-r--r--net/netfilter/nft_tunnel.c566
-rw-r--r--net/netfilter/nft_xfrm.c294
-rw-r--r--net/netfilter/utils.c131
-rw-r--r--net/netfilter/x_tables.c7
-rw-r--r--net/netfilter/xt_AUDIT.c2
-rw-r--r--net/netfilter/xt_CHECKSUM.c22
-rw-r--r--net/netfilter/xt_CT.c8
-rw-r--r--net/netfilter/xt_IDLETIMER.c4
-rw-r--r--net/netfilter/xt_SECMARK.c2
-rw-r--r--net/netfilter/xt_TEE.c80
-rw-r--r--net/netfilter/xt_TPROXY.c9
-rw-r--r--net/netfilter/xt_cgroup.c78
-rw-r--r--net/netfilter/xt_cluster.c14
-rw-r--r--net/netfilter/xt_connlimit.c4
-rw-r--r--net/netfilter/xt_hashlimit.c18
-rw-r--r--net/netfilter/xt_nat.c2
-rw-r--r--net/netfilter/xt_osf.c155
-rw-r--r--net/netfilter/xt_owner.c2
-rw-r--r--net/netfilter/xt_recent.c3
-rw-r--r--net/netfilter/xt_socket.c8
-rw-r--r--net/netlabel/netlabel_unlabeled.c3
-rw-r--r--net/netlabel/netlabel_user.c2
-rw-r--r--net/netlink/af_netlink.c52
-rw-r--r--net/netlink/af_netlink.h1
-rw-r--r--net/nfc/hci/core.c10
-rw-r--r--net/nfc/llcp_sock.c2
-rw-r--r--net/nfc/nci/uart.c7
-rw-r--r--net/openvswitch/actions.c33
-rw-r--r--net/openvswitch/conntrack.c38
-rw-r--r--net/openvswitch/datapath.c20
-rw-r--r--net/openvswitch/flow.c22
-rw-r--r--net/openvswitch/flow_netlink.c80
-rw-r--r--net/openvswitch/vport-internal_dev.c5
-rw-r--r--net/packet/af_packet.c48
-rw-r--r--net/rds/Kconfig4
-rw-r--r--net/rds/Makefile1
-rw-r--r--net/rds/af_rds.c205
-rw-r--r--net/rds/bind.c141
-rw-r--r--net/rds/cong.c23
-rw-r--r--net/rds/connection.c283
-rw-r--r--net/rds/ib.c133
-rw-r--r--net/rds/ib.h55
-rw-r--r--net/rds/ib_cm.c320
-rw-r--r--net/rds/ib_frmr.c12
-rw-r--r--net/rds/ib_mr.h2
-rw-r--r--net/rds/ib_rdma.c26
-rw-r--r--net/rds/ib_recv.c39
-rw-r--r--net/rds/ib_send.c19
-rw-r--r--net/rds/loop.c7
-rw-r--r--net/rds/message.c1
-rw-r--r--net/rds/rdma.c6
-rw-r--r--net/rds/rdma_transport.c95
-rw-r--r--net/rds/rdma_transport.h5
-rw-r--r--net/rds/rds.h90
-rw-r--r--net/rds/recv.c97
-rw-r--r--net/rds/send.c129
-rw-r--r--net/rds/tcp.c153
-rw-r--r--net/rds/tcp.h2
-rw-r--r--net/rds/tcp_connect.c68
-rw-r--r--net/rds/tcp_listen.c87
-rw-r--r--net/rds/tcp_recv.c9
-rw-r--r--net/rds/tcp_send.c4
-rw-r--r--net/rds/threads.c69
-rw-r--r--net/rds/transport.c16
-rw-r--r--net/rfkill/core.c8
-rw-r--r--net/rfkill/rfkill-gpio.c1
-rw-r--r--net/rxrpc/af_rxrpc.c19
-rw-r--r--net/rxrpc/ar-internal.h64
-rw-r--r--net/rxrpc/call_accept.c68
-rw-r--r--net/rxrpc/call_event.c2
-rw-r--r--net/rxrpc/call_object.c7
-rw-r--r--net/rxrpc/conn_client.c17
-rw-r--r--net/rxrpc/conn_event.c43
-rw-r--r--net/rxrpc/conn_object.c21
-rw-r--r--net/rxrpc/input.c346
-rw-r--r--net/rxrpc/local_event.c7
-rw-r--r--net/rxrpc/local_object.c62
-rw-r--r--net/rxrpc/net_ns.c3
-rw-r--r--net/rxrpc/output.c99
-rw-r--r--net/rxrpc/peer_event.c64
-rw-r--r--net/rxrpc/peer_object.c77
-rw-r--r--net/rxrpc/proc.c148
-rw-r--r--net/rxrpc/protocol.h15
-rw-r--r--net/rxrpc/recvmsg.c99
-rw-r--r--net/rxrpc/rxkad.c31
-rw-r--r--net/rxrpc/skbuff.c15
-rw-r--r--net/rxrpc/sysctl.c1
-rw-r--r--net/rxrpc/utils.c23
-rw-r--r--net/sched/Kconfig50
-rw-r--r--net/sched/Makefile6
-rw-r--r--net/sched/act_api.c488
-rw-r--r--net/sched/act_bpf.c49
-rw-r--r--net/sched/act_connmark.c42
-rw-r--r--net/sched/act_csum.c71
-rw-r--r--net/sched/act_gact.c53
-rw-r--r--net/sched/act_ife.c149
-rw-r--r--net/sched/act_ipt.c47
-rw-r--r--net/sched/act_mirred.c185
-rw-r--r--net/sched/act_nat.c46
-rw-r--r--net/sched/act_pedit.c155
-rw-r--r--net/sched/act_police.c243
-rw-r--r--net/sched/act_sample.c60
-rw-r--r--net/sched/act_simple.c38
-rw-r--r--net/sched/act_skbedit.c175
-rw-r--r--net/sched/act_skbmod.c68
-rw-r--r--net/sched/act_tunnel_key.c334
-rw-r--r--net/sched/act_vlan.c87
-rw-r--r--net/sched/cls_api.c939
-rw-r--r--net/sched/cls_basic.c1
-rw-r--r--net/sched/cls_bpf.c43
-rw-r--r--net/sched/cls_flower.c652
-rw-r--r--net/sched/cls_matchall.c34
-rw-r--r--net/sched/cls_tcindex.c8
-rw-r--r--net/sched/cls_u32.c248
-rw-r--r--net/sched/sch_api.c69
-rw-r--r--net/sched/sch_atm.c2
-rw-r--r--net/sched/sch_cake.c3034
-rw-r--r--net/sched/sch_cbq.c2
-rw-r--r--net/sched/sch_cbs.c134
-rw-r--r--net/sched/sch_drr.c4
-rw-r--r--net/sched/sch_dsmark.c2
-rw-r--r--net/sched/sch_etf.c484
-rw-r--r--net/sched/sch_fifo.c2
-rw-r--r--net/sched/sch_fq.c103
-rw-r--r--net/sched/sch_fq_codel.c2
-rw-r--r--net/sched/sch_generic.c66
-rw-r--r--net/sched/sch_hfsc.c2
-rw-r--r--net/sched/sch_hhf.c2
-rw-r--r--net/sched/sch_htb.c129
-rw-r--r--net/sched/sch_mq.c4
-rw-r--r--net/sched/sch_mqprio.c4
-rw-r--r--net/sched/sch_multiq.c6
-rw-r--r--net/sched/sch_netem.c89
-rw-r--r--net/sched/sch_pie.c36
-rw-r--r--net/sched/sch_prio.c6
-rw-r--r--net/sched/sch_qfq.c4
-rw-r--r--net/sched/sch_red.c4
-rw-r--r--net/sched/sch_sfb.c4
-rw-r--r--net/sched/sch_skbprio.c320
-rw-r--r--net/sched/sch_taprio.c962
-rw-r--r--net/sched/sch_tbf.c6
-rw-r--r--net/sctp/Kconfig4
-rw-r--r--net/sctp/associola.c18
-rw-r--r--net/sctp/chunk.c6
-rw-r--r--net/sctp/input.c2
-rw-r--r--net/sctp/ipv6.c20
-rw-r--r--net/sctp/output.c6
-rw-r--r--net/sctp/outqueue.c21
-rw-r--r--net/sctp/proc.c8
-rw-r--r--net/sctp/protocol.c16
-rw-r--r--net/sctp/sm_sideeffect.c1
-rw-r--r--net/sctp/socket.c351
-rw-r--r--net/sctp/stream.c153
-rw-r--r--net/sctp/stream_interleave.c20
-rw-r--r--net/sctp/stream_sched.c13
-rw-r--r--net/sctp/stream_sched_prio.c22
-rw-r--r--net/sctp/stream_sched_rr.c8
-rw-r--r--net/sctp/transport.c12
-rw-r--r--net/sctp/ulpqueue.c2
-rw-r--r--net/smc/Makefile2
-rw-r--r--net/smc/af_smc.c332
-rw-r--r--net/smc/smc.h9
-rw-r--r--net/smc/smc_cdc.c113
-rw-r--r--net/smc/smc_cdc.h86
-rw-r--r--net/smc/smc_clc.c211
-rw-r--r--net/smc/smc_clc.h99
-rw-r--r--net/smc/smc_close.c14
-rw-r--r--net/smc/smc_core.c350
-rw-r--r--net/smc/smc_core.h85
-rw-r--r--net/smc/smc_diag.c33
-rw-r--r--net/smc/smc_ib.c173
-rw-r--r--net/smc/smc_ib.h7
-rw-r--r--net/smc/smc_ism.c348
-rw-r--r--net/smc/smc_ism.h48
-rw-r--r--net/smc/smc_llc.c80
-rw-r--r--net/smc/smc_llc.h7
-rw-r--r--net/smc/smc_pnet.c173
-rw-r--r--net/smc/smc_pnet.h19
-rw-r--r--net/smc/smc_rx.c21
-rw-r--r--net/smc/smc_tx.c242
-rw-r--r--net/smc/smc_tx.h6
-rw-r--r--net/smc/smc_wr.c41
-rw-r--r--net/smc/smc_wr.h3
-rw-r--r--net/socket.c81
-rw-r--r--net/strparser/Kconfig4
-rw-r--r--net/strparser/strparser.c30
-rw-r--r--net/sunrpc/auth.c4
-rw-r--r--net/sunrpc/auth_gss/auth_gss.c50
-rw-r--r--net/sunrpc/auth_gss/gss_generic_token.c1
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_crypto.c15
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_keys.c1
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_seal.c1
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_unseal.c1
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_wrap.c3
-rw-r--r--net/sunrpc/auth_gss/gss_rpc_upcall.c70
-rw-r--r--net/sunrpc/auth_gss/svcauth_gss.c2
-rw-r--r--net/sunrpc/auth_null.c2
-rw-r--r--net/sunrpc/auth_unix.c2
-rw-r--r--net/sunrpc/backchannel_rqst.c1
-rw-r--r--net/sunrpc/clnt.c30
-rw-r--r--net/sunrpc/rpcb_clnt.c2
-rw-r--r--net/sunrpc/stats.c55
-rw-r--r--net/sunrpc/sunrpc.h1
-rw-r--r--net/sunrpc/svc.c78
-rw-r--r--net/sunrpc/xprt.c2
-rw-r--r--net/sunrpc/xprtrdma/fmr_ops.c4
-rw-r--r--net/sunrpc/xprtrdma/frwr_ops.c7
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma.c2
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_recvfrom.c12
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_rw.c35
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_sendto.c7
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_transport.c5
-rw-r--r--net/sunrpc/xprtrdma/verbs.c10
-rw-r--r--net/sunrpc/xprtsock.c1
-rw-r--r--net/tipc/bcast.c6
-rw-r--r--net/tipc/bearer.c14
-rw-r--r--net/tipc/diag.c2
-rw-r--r--net/tipc/group.c42
-rw-r--r--net/tipc/group.h1
-rw-r--r--net/tipc/link.c201
-rw-r--r--net/tipc/link.h5
-rw-r--r--net/tipc/monitor.c3
-rw-r--r--net/tipc/msg.c109
-rw-r--r--net/tipc/msg.h11
-rw-r--r--net/tipc/name_distr.c14
-rw-r--r--net/tipc/name_table.c13
-rw-r--r--net/tipc/name_table.h10
-rw-r--r--net/tipc/netlink.c2
-rw-r--r--net/tipc/netlink_compat.c5
-rw-r--r--net/tipc/node.c104
-rw-r--r--net/tipc/node.h16
-rw-r--r--net/tipc/socket.c321
-rw-r--r--net/tipc/socket.h3
-rw-r--r--net/tipc/topsrv.c16
-rw-r--r--net/tipc/udp_media.c18
-rw-r--r--net/tls/Kconfig1
-rw-r--r--net/tls/tls_device.c310
-rw-r--r--net/tls/tls_device_fallback.c13
-rw-r--r--net/tls/tls_main.c131
-rw-r--r--net/tls/tls_sw.c1694
-rw-r--r--net/unix/af_unix.c13
-rw-r--r--net/wimax/Makefile2
-rw-r--r--net/wimax/debugfs.c2
-rw-r--r--net/wimax/op-msg.c1
-rw-r--r--net/wimax/stack.c4
-rw-r--r--net/wireless/core.c104
-rw-r--r--net/wireless/core.h16
-rw-r--r--net/wireless/lib80211_crypt_tkip.c114
-rw-r--r--net/wireless/lib80211_crypt_wep.c52
-rw-r--r--net/wireless/nl80211.c1067
-rw-r--r--net/wireless/rdev-ops.h15
-rw-r--r--net/wireless/reg.c208
-rw-r--r--net/wireless/scan.c58
-rw-r--r--net/wireless/sysfs.c4
-rw-r--r--net/wireless/trace.h235
-rw-r--r--net/wireless/util.c249
-rw-r--r--net/wireless/wext-compat.c22
-rw-r--r--net/x25/Kconfig2
-rw-r--r--net/x25/x25_subr.c1
-rw-r--r--net/xdp/xdp_umem.c130
-rw-r--r--net/xdp/xdp_umem.h12
-rw-r--r--net/xdp/xdp_umem_props.h14
-rw-r--r--net/xdp/xsk.c56
-rw-r--r--net/xdp/xsk_queue.c60
-rw-r--r--net/xdp/xsk_queue.h16
-rw-r--r--net/xfrm/Kconfig9
-rw-r--r--net/xfrm/Makefile1
-rw-r--r--net/xfrm/xfrm_device.c27
-rw-r--r--net/xfrm/xfrm_hash.h5
-rw-r--r--net/xfrm/xfrm_input.c8
-rw-r--r--net/xfrm/xfrm_interface.c975
-rw-r--r--net/xfrm/xfrm_output.c9
-rw-r--r--net/xfrm/xfrm_policy.c326
-rw-r--r--net/xfrm/xfrm_state.c48
-rw-r--r--net/xfrm/xfrm_user.c112
699 files changed, 41327 insertions, 14877 deletions
diff --git a/net/6lowpan/iphc.c b/net/6lowpan/iphc.c
index 6b1042e21656..52fad5dad9f7 100644
--- a/net/6lowpan/iphc.c
+++ b/net/6lowpan/iphc.c
@@ -770,6 +770,7 @@ int lowpan_header_decompress(struct sk_buff *skb, const struct net_device *dev,
hdr.hop_limit, &hdr.daddr);
skb_push(skb, sizeof(hdr));
+ skb_reset_mac_header(skb);
skb_reset_network_header(skb);
skb_copy_to_linear_data(skb, &hdr, sizeof(hdr));
diff --git a/net/8021q/Makefile b/net/8021q/Makefile
index 9b703454b93e..e05d4d7aab35 100644
--- a/net/8021q/Makefile
+++ b/net/8021q/Makefile
@@ -9,4 +9,3 @@ obj-$(CONFIG_VLAN_8021Q) += 8021q.o
8021q-$(CONFIG_VLAN_8021Q_GVRP) += vlan_gvrp.o
8021q-$(CONFIG_VLAN_8021Q_MVRP) += vlan_mvrp.o
8021q-$(CONFIG_PROC_FS) += vlanproc.o
-
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 8ccee3d01822..5e9950453955 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -647,13 +647,14 @@ out:
return err;
}
-static struct sk_buff **vlan_gro_receive(struct sk_buff **head,
- struct sk_buff *skb)
+static struct sk_buff *vlan_gro_receive(struct list_head *head,
+ struct sk_buff *skb)
{
- struct sk_buff *p, **pp = NULL;
- struct vlan_hdr *vhdr;
- unsigned int hlen, off_vlan;
const struct packet_offload *ptype;
+ unsigned int hlen, off_vlan;
+ struct sk_buff *pp = NULL;
+ struct vlan_hdr *vhdr;
+ struct sk_buff *p;
__be16 type;
int flush = 1;
@@ -675,7 +676,7 @@ static struct sk_buff **vlan_gro_receive(struct sk_buff **head,
flush = 0;
- for (p = *head; p; p = p->next) {
+ list_for_each_entry(p, head, list) {
struct vlan_hdr *vhdr2;
if (!NAPI_GRO_CB(p)->same_flow)
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 546af0e73ac3..ff720f1ebf73 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -756,8 +756,7 @@ static void vlan_dev_netpoll_cleanup(struct net_device *dev)
return;
vlan->netpoll = NULL;
-
- __netpoll_free_async(netpoll);
+ __netpoll_free(netpoll);
}
#endif /* CONFIG_NET_POLL_CONTROLLER */
diff --git a/net/9p/client.c b/net/9p/client.c
index 5c1343195292..deae53a7dffc 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -283,8 +283,9 @@ p9_tag_alloc(struct p9_client *c, u16 tag, unsigned int max_size)
return ERR_PTR(-ENOMEM);
}
for (col = 0; col < P9_ROW_MAXTAG; col++) {
- c->reqs[row][col].status = REQ_STATUS_IDLE;
- c->reqs[row][col].tc = NULL;
+ req = &c->reqs[row][col];
+ req->status = REQ_STATUS_IDLE;
+ init_waitqueue_head(&req->wq);
}
c->max_tag += P9_ROW_MAXTAG;
}
@@ -294,13 +295,6 @@ p9_tag_alloc(struct p9_client *c, u16 tag, unsigned int max_size)
col = tag % P9_ROW_MAXTAG;
req = &c->reqs[row][col];
- if (!req->wq) {
- req->wq = kmalloc(sizeof(wait_queue_head_t), GFP_NOFS);
- if (!req->wq)
- goto grow_failed;
- init_waitqueue_head(req->wq);
- }
-
if (!req->tc)
req->tc = p9_fcall_alloc(alloc_msize);
if (!req->rc)
@@ -320,9 +314,7 @@ grow_failed:
pr_err("Couldn't grow tag array\n");
kfree(req->tc);
kfree(req->rc);
- kfree(req->wq);
req->tc = req->rc = NULL;
- req->wq = NULL;
return ERR_PTR(-ENOMEM);
}
@@ -341,7 +333,7 @@ struct p9_req_t *p9_tag_lookup(struct p9_client *c, u16 tag)
* buffer to read the data into */
tag++;
- if(tag >= c->max_tag)
+ if (tag >= c->max_tag)
return NULL;
row = tag / P9_ROW_MAXTAG;
@@ -410,7 +402,6 @@ static void p9_tag_cleanup(struct p9_client *c)
/* free requests associated with tags */
for (row = 0; row < (c->max_tag/P9_ROW_MAXTAG); row++) {
for (col = 0; col < P9_ROW_MAXTAG; col++) {
- kfree(c->reqs[row][col].wq);
kfree(c->reqs[row][col].tc);
kfree(c->reqs[row][col].rc);
}
@@ -448,12 +439,12 @@ void p9_client_cb(struct p9_client *c, struct p9_req_t *req, int status)
/*
* This barrier is needed to make sure any change made to req before
- * the other thread wakes up will indeed be seen by the waiting side.
+ * the status change is visible to another thread
*/
smp_wmb();
req->status = status;
- wake_up(req->wq);
+ wake_up(&req->wq);
p9_debug(P9_DEBUG_MUX, "wakeup: %d\n", req->tc->tag);
}
EXPORT_SYMBOL(p9_client_cb);
@@ -478,20 +469,11 @@ p9_parse_header(struct p9_fcall *pdu, int32_t *size, int8_t *type, int16_t *tag,
int err;
pdu->offset = 0;
- if (pdu->size == 0)
- pdu->size = 7;
err = p9pdu_readf(pdu, 0, "dbw", &r_size, &r_type, &r_tag);
if (err)
goto rewind_and_exit;
- pdu->size = r_size;
- pdu->id = r_type;
- pdu->tag = r_tag;
-
- p9_debug(P9_DEBUG_9P, "<<< size=%d type: %d tag: %d\n",
- pdu->size, pdu->id, pdu->tag);
-
if (type)
*type = r_type;
if (tag)
@@ -499,6 +481,16 @@ p9_parse_header(struct p9_fcall *pdu, int32_t *size, int8_t *type, int16_t *tag,
if (size)
*size = r_size;
+ if (pdu->size != r_size || r_size < 7) {
+ err = -EINVAL;
+ goto rewind_and_exit;
+ }
+
+ pdu->id = r_type;
+ pdu->tag = r_tag;
+
+ p9_debug(P9_DEBUG_9P, "<<< size=%d type: %d tag: %d\n",
+ pdu->size, pdu->id, pdu->tag);
rewind_and_exit:
if (rewind)
@@ -525,6 +517,12 @@ static int p9_check_errors(struct p9_client *c, struct p9_req_t *req)
int ecode;
err = p9_parse_header(req->rc, NULL, &type, NULL, 0);
+ if (req->rc->size >= c->msize) {
+ p9_debug(P9_DEBUG_ERROR,
+ "requested packet size too big: %d\n",
+ req->rc->size);
+ return -EIO;
+ }
/*
* dump the response from server
* This should be after check errors which poplulate pdu_fcall.
@@ -774,7 +772,7 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...)
}
again:
/* Wait for the response */
- err = wait_event_killable(*req->wq, req->status >= REQ_STATUS_RCVD);
+ err = wait_event_killable(req->wq, req->status >= REQ_STATUS_RCVD);
/*
* Make sure our req is coherent with regard to updates in other
@@ -909,34 +907,31 @@ static struct p9_fid *p9_fid_create(struct p9_client *clnt)
{
int ret;
struct p9_fid *fid;
- unsigned long flags;
p9_debug(P9_DEBUG_FID, "clnt %p\n", clnt);
fid = kmalloc(sizeof(struct p9_fid), GFP_KERNEL);
if (!fid)
- return ERR_PTR(-ENOMEM);
-
- ret = p9_idpool_get(clnt->fidpool);
- if (ret < 0) {
- ret = -ENOSPC;
- goto error;
- }
- fid->fid = ret;
+ return NULL;
memset(&fid->qid, 0, sizeof(struct p9_qid));
fid->mode = -1;
fid->uid = current_fsuid();
fid->clnt = clnt;
fid->rdir = NULL;
- spin_lock_irqsave(&clnt->lock, flags);
- list_add(&fid->flist, &clnt->fidlist);
- spin_unlock_irqrestore(&clnt->lock, flags);
+ fid->fid = 0;
- return fid;
+ idr_preload(GFP_KERNEL);
+ spin_lock_irq(&clnt->lock);
+ ret = idr_alloc_u32(&clnt->fids, fid, &fid->fid, P9_NOFID - 1,
+ GFP_NOWAIT);
+ spin_unlock_irq(&clnt->lock);
+ idr_preload_end();
+
+ if (!ret)
+ return fid;
-error:
kfree(fid);
- return ERR_PTR(ret);
+ return NULL;
}
static void p9_fid_destroy(struct p9_fid *fid)
@@ -946,9 +941,8 @@ static void p9_fid_destroy(struct p9_fid *fid)
p9_debug(P9_DEBUG_FID, "fid %d\n", fid->fid);
clnt = fid->clnt;
- p9_idpool_put(fid->fid, clnt->fidpool);
spin_lock_irqsave(&clnt->lock, flags);
- list_del(&fid->flist);
+ idr_remove(&clnt->fids, fid->fid);
spin_unlock_irqrestore(&clnt->lock, flags);
kfree(fid->rdir);
kfree(fid);
@@ -958,7 +952,7 @@ static int p9_client_version(struct p9_client *c)
{
int err = 0;
struct p9_req_t *req;
- char *version;
+ char *version = NULL;
int msize;
p9_debug(P9_DEBUG_9P, ">>> TVERSION msize %d protocol %d\n",
@@ -1031,7 +1025,7 @@ struct p9_client *p9_client_create(const char *dev_name, char *options)
memcpy(clnt->name, client_id, strlen(client_id) + 1);
spin_lock_init(&clnt->lock);
- INIT_LIST_HEAD(&clnt->fidlist);
+ idr_init(&clnt->fids);
err = p9_tag_init(clnt);
if (err < 0)
@@ -1051,18 +1045,12 @@ struct p9_client *p9_client_create(const char *dev_name, char *options)
goto destroy_tagpool;
}
- clnt->fidpool = p9_idpool_create();
- if (IS_ERR(clnt->fidpool)) {
- err = PTR_ERR(clnt->fidpool);
- goto put_trans;
- }
-
p9_debug(P9_DEBUG_MUX, "clnt %p trans %p msize %d protocol %d\n",
clnt, clnt->trans_mod, clnt->msize, clnt->proto_version);
err = clnt->trans_mod->create(clnt, dev_name, options);
if (err)
- goto destroy_fidpool;
+ goto put_trans;
if (clnt->msize > clnt->trans_mod->maxsize)
clnt->msize = clnt->trans_mod->maxsize;
@@ -1075,8 +1063,6 @@ struct p9_client *p9_client_create(const char *dev_name, char *options)
close_trans:
clnt->trans_mod->close(clnt);
-destroy_fidpool:
- p9_idpool_destroy(clnt->fidpool);
put_trans:
v9fs_put_trans(clnt->trans_mod);
destroy_tagpool:
@@ -1089,7 +1075,8 @@ EXPORT_SYMBOL(p9_client_create);
void p9_client_destroy(struct p9_client *clnt)
{
- struct p9_fid *fid, *fidptr;
+ struct p9_fid *fid;
+ int id;
p9_debug(P9_DEBUG_MUX, "clnt %p\n", clnt);
@@ -1098,14 +1085,11 @@ void p9_client_destroy(struct p9_client *clnt)
v9fs_put_trans(clnt->trans_mod);
- list_for_each_entry_safe(fid, fidptr, &clnt->fidlist, flist) {
+ idr_for_each_entry(&clnt->fids, fid, id) {
pr_info("Found fid %d not clunked\n", fid->fid);
p9_fid_destroy(fid);
}
- if (clnt->fidpool)
- p9_idpool_destroy(clnt->fidpool);
-
p9_tag_cleanup(clnt);
kfree(clnt);
@@ -1138,9 +1122,8 @@ struct p9_fid *p9_client_attach(struct p9_client *clnt, struct p9_fid *afid,
p9_debug(P9_DEBUG_9P, ">>> TATTACH afid %d uname %s aname %s\n",
afid ? afid->fid : -1, uname, aname);
fid = p9_fid_create(clnt);
- if (IS_ERR(fid)) {
- err = PTR_ERR(fid);
- fid = NULL;
+ if (!fid) {
+ err = -ENOMEM;
goto error;
}
fid->uid = n_uname;
@@ -1189,9 +1172,8 @@ struct p9_fid *p9_client_walk(struct p9_fid *oldfid, uint16_t nwname,
clnt = oldfid->clnt;
if (clone) {
fid = p9_fid_create(clnt);
- if (IS_ERR(fid)) {
- err = PTR_ERR(fid);
- fid = NULL;
+ if (!fid) {
+ err = -ENOMEM;
goto error;
}
@@ -1576,7 +1558,7 @@ p9_client_read(struct p9_fid *fid, u64 offset, struct iov_iter *to, int *err)
int count = iov_iter_count(to);
int rsize, non_zc = 0;
char *dataptr;
-
+
rsize = fid->iounit;
if (!rsize || rsize > clnt->msize-P9_IOHDRSZ)
rsize = clnt->msize - P9_IOHDRSZ;
@@ -1790,7 +1772,7 @@ struct p9_stat_dotl *p9_client_getattr_dotl(struct p9_fid *fid,
"<<< st_mtime_sec=%lld st_mtime_nsec=%lld\n"
"<<< st_ctime_sec=%lld st_ctime_nsec=%lld\n"
"<<< st_btime_sec=%lld st_btime_nsec=%lld\n"
- "<<< st_gen=%lld st_data_version=%lld",
+ "<<< st_gen=%lld st_data_version=%lld\n",
ret->st_result_mask, ret->qid.type, ret->qid.path,
ret->qid.version, ret->st_mode, ret->st_nlink,
from_kuid(&init_user_ns, ret->st_uid),
@@ -2019,9 +2001,8 @@ struct p9_fid *p9_client_xattrwalk(struct p9_fid *file_fid,
err = 0;
clnt = file_fid->clnt;
attr_fid = p9_fid_create(clnt);
- if (IS_ERR(attr_fid)) {
- err = PTR_ERR(attr_fid);
- attr_fid = NULL;
+ if (!attr_fid) {
+ err = -ENOMEM;
goto error;
}
p9_debug(P9_DEBUG_9P,
diff --git a/net/9p/mod.c b/net/9p/mod.c
index eb9777f05755..253ba824a325 100644
--- a/net/9p/mod.c
+++ b/net/9p/mod.c
@@ -171,13 +171,11 @@ void v9fs_put_trans(struct p9_trans_module *m)
*/
static int __init init_p9(void)
{
- int ret = 0;
-
p9_error_init();
pr_info("Installing 9P2000 support\n");
p9_trans_fd_init();
- return ret;
+ return 0;
}
/**
diff --git a/net/9p/protocol.c b/net/9p/protocol.c
index 931ea00c4fed..4a1e1dd30b52 100644
--- a/net/9p/protocol.c
+++ b/net/9p/protocol.c
@@ -156,7 +156,7 @@ p9pdu_vreadf(struct p9_fcall *pdu, int proto_version, const char *fmt,
*sptr = kmalloc(len + 1, GFP_NOFS);
if (*sptr == NULL) {
- errcode = -EFAULT;
+ errcode = -ENOMEM;
break;
}
if (pdu_read(pdu, *sptr, len)) {
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
index 588bf88c3305..e2ef3c782c53 100644
--- a/net/9p/trans_fd.c
+++ b/net/9p/trans_fd.c
@@ -185,6 +185,8 @@ static void p9_mux_poll_stop(struct p9_conn *m)
spin_lock_irqsave(&p9_poll_lock, flags);
list_del_init(&m->poll_pending_link);
spin_unlock_irqrestore(&p9_poll_lock, flags);
+
+ flush_work(&p9_poll_work);
}
/**
@@ -197,15 +199,14 @@ static void p9_mux_poll_stop(struct p9_conn *m)
static void p9_conn_cancel(struct p9_conn *m, int err)
{
struct p9_req_t *req, *rtmp;
- unsigned long flags;
LIST_HEAD(cancel_list);
p9_debug(P9_DEBUG_ERROR, "mux %p err %d\n", m, err);
- spin_lock_irqsave(&m->client->lock, flags);
+ spin_lock(&m->client->lock);
if (m->err) {
- spin_unlock_irqrestore(&m->client->lock, flags);
+ spin_unlock(&m->client->lock);
return;
}
@@ -217,7 +218,6 @@ static void p9_conn_cancel(struct p9_conn *m, int err)
list_for_each_entry_safe(req, rtmp, &m->unsent_req_list, req_list) {
list_move(&req->req_list, &cancel_list);
}
- spin_unlock_irqrestore(&m->client->lock, flags);
list_for_each_entry_safe(req, rtmp, &cancel_list, req_list) {
p9_debug(P9_DEBUG_ERROR, "call back req %p\n", req);
@@ -226,6 +226,7 @@ static void p9_conn_cancel(struct p9_conn *m, int err)
req->t_err = err;
p9_client_cb(m->client, req, REQ_STATUS_ERROR);
}
+ spin_unlock(&m->client->lock);
}
static __poll_t
@@ -324,7 +325,9 @@ static void p9_read_work(struct work_struct *work)
if ((!m->req) && (m->rc.offset == m->rc.capacity)) {
p9_debug(P9_DEBUG_TRANS, "got new header\n");
- err = p9_parse_header(&m->rc, NULL, NULL, NULL, 0);
+ /* Header size */
+ m->rc.size = 7;
+ err = p9_parse_header(&m->rc, &m->rc.size, NULL, NULL, 0);
if (err) {
p9_debug(P9_DEBUG_ERROR,
"error parsing header: %d\n", err);
@@ -369,12 +372,14 @@ static void p9_read_work(struct work_struct *work)
*/
if ((m->req) && (m->rc.offset == m->rc.capacity)) {
p9_debug(P9_DEBUG_TRANS, "got new packet\n");
+ m->req->rc->size = m->rc.offset;
spin_lock(&m->client->lock);
if (m->req->status != REQ_STATUS_ERROR)
status = REQ_STATUS_RCVD;
list_del(&m->req->req_list);
- spin_unlock(&m->client->lock);
+ /* update req->status while holding client->lock */
p9_client_cb(m->client, m->req, status);
+ spin_unlock(&m->client->lock);
m->rc.sdata = NULL;
m->rc.offset = 0;
m->rc.capacity = 0;
@@ -940,7 +945,7 @@ p9_fd_create_tcp(struct p9_client *client, const char *addr, char *args)
if (err < 0)
return err;
- if (valid_ipaddr4(addr) < 0)
+ if (addr == NULL || valid_ipaddr4(addr) < 0)
return -EINVAL;
csocket = NULL;
@@ -990,6 +995,9 @@ p9_fd_create_unix(struct p9_client *client, const char *addr, char *args)
csocket = NULL;
+ if (addr == NULL)
+ return -EINVAL;
+
if (strlen(addr) >= UNIX_PATH_MAX) {
pr_err("%s (%d): address too long: %s\n",
__func__, task_pid_nr(current), addr);
diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c
index 3d414acb7015..b513cffeeb3c 100644
--- a/net/9p/trans_rdma.c
+++ b/net/9p/trans_rdma.c
@@ -320,6 +320,7 @@ recv_done(struct ib_cq *cq, struct ib_wc *wc)
if (wc->status != IB_WC_SUCCESS)
goto err_out;
+ c->rc->size = wc->byte_len;
err = p9_parse_header(c->rc, NULL, NULL, &tag, 1);
if (err)
goto err_out;
@@ -396,7 +397,7 @@ static int
post_recv(struct p9_client *client, struct p9_rdma_context *c)
{
struct p9_trans_rdma *rdma = client->trans;
- struct ib_recv_wr wr, *bad_wr;
+ struct ib_recv_wr wr;
struct ib_sge sge;
c->busa = ib_dma_map_single(rdma->cm_id->device,
@@ -415,7 +416,7 @@ post_recv(struct p9_client *client, struct p9_rdma_context *c)
wr.wr_cqe = &c->cqe;
wr.sg_list = &sge;
wr.num_sge = 1;
- return ib_post_recv(rdma->qp, &wr, &bad_wr);
+ return ib_post_recv(rdma->qp, &wr, NULL);
error:
p9_debug(P9_DEBUG_ERROR, "EIO\n");
@@ -425,7 +426,7 @@ post_recv(struct p9_client *client, struct p9_rdma_context *c)
static int rdma_request(struct p9_client *client, struct p9_req_t *req)
{
struct p9_trans_rdma *rdma = client->trans;
- struct ib_send_wr wr, *bad_wr;
+ struct ib_send_wr wr;
struct ib_sge sge;
int err = 0;
unsigned long flags;
@@ -520,7 +521,7 @@ dont_need_post_recv:
* status in case of a very fast reply.
*/
req->status = REQ_STATUS_SENT;
- err = ib_post_send(rdma->qp, &wr, &bad_wr);
+ err = ib_post_send(rdma->qp, &wr, NULL);
if (err)
goto send_error;
@@ -644,6 +645,9 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args)
struct rdma_conn_param conn_param;
struct ib_qp_init_attr qp_attr;
+ if (addr == NULL)
+ return -EINVAL;
+
/* Parse the transport specific mount options */
err = parse_opts(args, &opts);
if (err < 0)
diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
index 05006cbb3361..7728b0acde09 100644
--- a/net/9p/trans_virtio.c
+++ b/net/9p/trans_virtio.c
@@ -89,10 +89,8 @@ struct virtio_chan {
unsigned long p9_max_pages;
/* Scatterlist: can be too big for stack. */
struct scatterlist sg[VIRTQUEUE_NUM];
-
- int tag_len;
/*
- * tag name to identify a mount Non-null terminated
+ * tag name to identify a mount null terminated
*/
char *tag;
@@ -144,24 +142,27 @@ static void req_done(struct virtqueue *vq)
struct virtio_chan *chan = vq->vdev->priv;
unsigned int len;
struct p9_req_t *req;
+ bool need_wakeup = false;
unsigned long flags;
p9_debug(P9_DEBUG_TRANS, ": request done\n");
- while (1) {
- spin_lock_irqsave(&chan->lock, flags);
- req = virtqueue_get_buf(chan->vq, &len);
- if (req == NULL) {
- spin_unlock_irqrestore(&chan->lock, flags);
- break;
+ spin_lock_irqsave(&chan->lock, flags);
+ while ((req = virtqueue_get_buf(chan->vq, &len)) != NULL) {
+ if (!chan->ring_bufs_avail) {
+ chan->ring_bufs_avail = 1;
+ need_wakeup = true;
}
- chan->ring_bufs_avail = 1;
- spin_unlock_irqrestore(&chan->lock, flags);
- /* Wakeup if anyone waiting for VirtIO ring space. */
- wake_up(chan->vc_wq);
- if (len)
+
+ if (len) {
+ req->rc->size = len;
p9_client_cb(chan->client, req, REQ_STATUS_RCVD);
+ }
}
+ spin_unlock_irqrestore(&chan->lock, flags);
+ /* Wakeup if anyone waiting for VirtIO ring space. */
+ if (need_wakeup)
+ wake_up(chan->vc_wq);
}
/**
@@ -188,7 +189,7 @@ static int pack_sg_list(struct scatterlist *sg, int start,
s = rest_of_page(data);
if (s > count)
s = count;
- BUG_ON(index > limit);
+ BUG_ON(index >= limit);
/* Make sure we don't terminate early. */
sg_unmark_end(&sg[index]);
sg_set_buf(&sg[index++], data, s);
@@ -233,6 +234,7 @@ pack_sg_list_p(struct scatterlist *sg, int start, int limit,
s = PAGE_SIZE - data_off;
if (s > count)
s = count;
+ BUG_ON(index >= limit);
/* Make sure we don't terminate early. */
sg_unmark_end(&sg[index]);
sg_set_page(&sg[index++], pdata[i++], s, data_off);
@@ -382,8 +384,8 @@ static int p9_get_mapped_pages(struct virtio_chan *chan,
* p9_virtio_zc_request - issue a zero copy request
* @client: client instance issuing the request
* @req: request to be issued
- * @uidata: user bffer that should be ued for zero copy read
- * @uodata: user buffer that shoud be user for zero copy write
+ * @uidata: user buffer that should be used for zero copy read
+ * @uodata: user buffer that should be used for zero copy write
* @inlen: read buffer size
* @outlen: write buffer size
* @in_hdr_len: reader header size, This is the size of response protocol data
@@ -406,6 +408,7 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req,
p9_debug(P9_DEBUG_TRANS, "virtio request\n");
if (uodata) {
+ __le32 sz;
int n = p9_get_mapped_pages(chan, &out_pages, uodata,
outlen, &offs, &need_drop);
if (n < 0)
@@ -416,6 +419,12 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req,
memcpy(&req->tc->sdata[req->tc->size - 4], &v, 4);
outlen = n;
}
+ /* The size field of the message must include the length of the
+ * header and the length of the data. We didn't actually know
+ * the length of the data until this point so add it in now.
+ */
+ sz = cpu_to_le32(req->tc->size + outlen);
+ memcpy(&req->tc->sdata[0], &sz, sizeof(sz));
} else if (uidata) {
int n = p9_get_mapped_pages(chan, &in_pages, uidata,
inlen, &offs, &need_drop);
@@ -446,7 +455,7 @@ req_retry_pinned:
out += pack_sg_list_p(chan->sg, out, VIRTQUEUE_NUM,
out_pages, out_nr_pages, offs, outlen);
}
-
+
/*
* Take care of in data
* For example TREAD have 11.
@@ -490,7 +499,7 @@ req_retry_pinned:
virtqueue_kick(chan->vq);
spin_unlock_irqrestore(&chan->lock, flags);
p9_debug(P9_DEBUG_TRANS, "virtio request kicked\n");
- err = wait_event_killable(*req->wq, req->status >= REQ_STATUS_RCVD);
+ err = wait_event_killable(req->wq, req->status >= REQ_STATUS_RCVD);
/*
* Non kernel buffers are pinned, unpin them
*/
@@ -517,14 +526,15 @@ static ssize_t p9_mount_tag_show(struct device *dev,
{
struct virtio_chan *chan;
struct virtio_device *vdev;
+ int tag_len;
vdev = dev_to_virtio(dev);
chan = vdev->priv;
+ tag_len = strlen(chan->tag);
- memcpy(buf, chan->tag, chan->tag_len);
- buf[chan->tag_len] = 0;
+ memcpy(buf, chan->tag, tag_len + 1);
- return chan->tag_len + 1;
+ return tag_len + 1;
}
static DEVICE_ATTR(mount_tag, 0444, p9_mount_tag_show, NULL);
@@ -563,7 +573,7 @@ static int p9_virtio_probe(struct virtio_device *vdev)
chan->vq = virtio_find_single_vq(vdev, req_done, "requests");
if (IS_ERR(chan->vq)) {
err = PTR_ERR(chan->vq);
- goto out_free_vq;
+ goto out_free_chan;
}
chan->vq->vdev->priv = chan;
spin_lock_init(&chan->lock);
@@ -577,7 +587,7 @@ static int p9_virtio_probe(struct virtio_device *vdev)
err = -EINVAL;
goto out_free_vq;
}
- tag = kmalloc(tag_len, GFP_KERNEL);
+ tag = kzalloc(tag_len + 1, GFP_KERNEL);
if (!tag) {
err = -ENOMEM;
goto out_free_vq;
@@ -586,7 +596,6 @@ static int p9_virtio_probe(struct virtio_device *vdev)
virtio_cread_bytes(vdev, offsetof(struct virtio_9p_config, tag),
tag, tag_len);
chan->tag = tag;
- chan->tag_len = tag_len;
err = sysfs_create_file(&(vdev->dev.kobj), &dev_attr_mount_tag.attr);
if (err) {
goto out_free_tag;
@@ -616,6 +625,7 @@ out_free_tag:
kfree(tag);
out_free_vq:
vdev->config->del_vqs(vdev);
+out_free_chan:
kfree(chan);
fail:
return err;
@@ -643,10 +653,12 @@ p9_virtio_create(struct p9_client *client, const char *devname, char *args)
int ret = -ENOENT;
int found = 0;
+ if (devname == NULL)
+ return -EINVAL;
+
mutex_lock(&virtio_9p_lock);
list_for_each_entry(chan, &virtio_chan_list, chan_list) {
- if (!strncmp(devname, chan->tag, chan->tag_len) &&
- strlen(devname) == chan->tag_len) {
+ if (!strcmp(devname, chan->tag)) {
if (!chan->inuse) {
chan->inuse = true;
found = 1;
diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c
index 2e2b8bca54f3..c2d54ac76bfd 100644
--- a/net/9p/trans_xen.c
+++ b/net/9p/trans_xen.c
@@ -94,6 +94,9 @@ static int p9_xen_create(struct p9_client *client, const char *addr, char *args)
{
struct xen_9pfs_front_priv *priv;
+ if (addr == NULL)
+ return -EINVAL;
+
read_lock(&xen_9pfs_lock);
list_for_each_entry(priv, &xen_9pfs_devs, list) {
if (!strcmp(priv->tag, addr)) {
diff --git a/net/9p/util.c b/net/9p/util.c
index 59f278e64f58..55ad98277e85 100644
--- a/net/9p/util.c
+++ b/net/9p/util.c
@@ -138,4 +138,3 @@ int p9_idpool_check(int id, struct p9_idpool *p)
return idr_find(&p->pool, id) != NULL;
}
EXPORT_SYMBOL(p9_idpool_check);
-
diff --git a/net/Kconfig b/net/Kconfig
index f738a6f27665..f235edb593ba 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -12,7 +12,7 @@ menuconfig NET
The reason is that some programs need kernel networking support even
when running on a stand-alone machine that isn't connected to any
other computer.
-
+
If you are upgrading from an older kernel, you
should consider updating your networking tools too because changes
in the kernel and the tools often go hand in hand. The tools are
@@ -300,8 +300,11 @@ config BPF_JIT
config BPF_STREAM_PARSER
bool "enable BPF STREAM_PARSER"
+ depends on INET
depends on BPF_SYSCALL
+ depends on CGROUP_BPF
select STREAM_PARSER
+ select NET_SOCK_MSG
---help---
Enabling this allows a stream parser to be used with
BPF_MAP_TYPE_SOCKMAP.
@@ -413,6 +416,14 @@ config GRO_CELLS
config SOCK_VALIDATE_XMIT
bool
+config NET_SOCK_MSG
+ bool
+ default n
+ help
+ The NET_SOCK_MSG provides a framework for plain sockets (e.g. TCP) or
+ ULPs (upper layer modules, e.g. TLS) to process L7 application data
+ with the help of BPF programs.
+
config NET_DEVLINK
tristate "Network physical/parent device Netlink interface"
help
diff --git a/net/atm/common.c b/net/atm/common.c
index a7a68e509628..a38c174fc766 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -653,7 +653,7 @@ __poll_t vcc_poll(struct file *file, struct socket *sock, poll_table *wait)
struct atm_vcc *vcc;
__poll_t mask;
- sock_poll_wait(file, sk_sleep(sk), wait);
+ sock_poll_wait(file, sock, wait);
mask = 0;
vcc = ATM_SD(sock);
diff --git a/net/atm/mpoa_proc.c b/net/atm/mpoa_proc.c
index b93cc0f18292..46d6cd9a36ae 100644
--- a/net/atm/mpoa_proc.c
+++ b/net/atm/mpoa_proc.c
@@ -307,9 +307,3 @@ void mpc_proc_clean(void)
}
#endif /* CONFIG_PROC_FS */
-
-
-
-
-
-
diff --git a/net/ax25/ax25_addr.c b/net/ax25/ax25_addr.c
index ac2542b7be88..a14cfa736b63 100644
--- a/net/ax25/ax25_addr.c
+++ b/net/ax25/ax25_addr.c
@@ -304,4 +304,3 @@ void ax25_digi_invert(const ax25_digi *in, ax25_digi *out)
}
}
}
-
diff --git a/net/ax25/ax25_ds_in.c b/net/ax25/ax25_ds_in.c
index 891596e74278..488fc2d7085a 100644
--- a/net/ax25/ax25_ds_in.c
+++ b/net/ax25/ax25_ds_in.c
@@ -299,4 +299,3 @@ int ax25_ds_frame_in(ax25_cb *ax25, struct sk_buff *skb, int type)
return queued;
}
-
diff --git a/net/ax25/ax25_ds_subr.c b/net/ax25/ax25_ds_subr.c
index 28827e81ba2b..bc0329f43013 100644
--- a/net/ax25/ax25_ds_subr.c
+++ b/net/ax25/ax25_ds_subr.c
@@ -205,4 +205,3 @@ void ax25_dama_off(ax25_cb *ax25)
ax25->condition &= ~AX25_COND_DAMA_MODE;
ax25_dev_dama_off(ax25->ax25_dev);
}
-
diff --git a/net/ax25/ax25_ip.c b/net/ax25/ax25_ip.c
index 183b1c583d56..70417e9b932d 100644
--- a/net/ax25/ax25_ip.c
+++ b/net/ax25/ax25_ip.c
@@ -249,4 +249,3 @@ const struct header_ops ax25_header_ops = {
EXPORT_SYMBOL(ax25_header_ops);
EXPORT_SYMBOL(ax25_ip_xmit);
-
diff --git a/net/ax25/ax25_out.c b/net/ax25/ax25_out.c
index b11a5f466fcc..3e5afc8dc93e 100644
--- a/net/ax25/ax25_out.c
+++ b/net/ax25/ax25_out.c
@@ -394,4 +394,3 @@ int ax25_check_iframes_acked(ax25_cb *ax25, unsigned short nr)
}
return 0;
}
-
diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig
index de8034d80623..f75816f58107 100644
--- a/net/batman-adv/Kconfig
+++ b/net/batman-adv/Kconfig
@@ -24,7 +24,6 @@ config BATMAN_ADV
depends on NET
select CRC16
select LIBCRC32C
- default n
help
B.A.T.M.A.N. (better approach to mobile ad-hoc networking) is
a routing protocol for multi-hop ad-hoc mesh networks. The
@@ -33,7 +32,7 @@ config BATMAN_ADV
tools.
config BATMAN_ADV_BATMAN_V
- bool "B.A.T.M.A.N. V protocol (experimental)"
+ bool "B.A.T.M.A.N. V protocol"
depends on BATMAN_ADV && !(CFG80211=m && BATMAN_ADV=y)
default y
help
@@ -60,7 +59,7 @@ config BATMAN_ADV_BLA
config BATMAN_ADV_DAT
bool "Distributed ARP Table"
depends on BATMAN_ADV && INET
- default n
+ default y
help
This option enables DAT (Distributed ARP Table), a DHT based
mechanism that increases ARP reliability on sparse wireless
@@ -70,7 +69,6 @@ config BATMAN_ADV_DAT
config BATMAN_ADV_NC
bool "Network Coding"
depends on BATMAN_ADV
- default n
help
This option enables network coding, a mechanism that aims to
increase the overall network throughput by fusing multiple
@@ -84,7 +82,6 @@ config BATMAN_ADV_NC
config BATMAN_ADV_MCAST
bool "Multicast optimisation"
depends on BATMAN_ADV && INET && !(BRIDGE=m && BATMAN_ADV=y)
- default n
help
This option enables the multicast optimisation which aims to
reduce the air overhead while improving the reliability of
@@ -94,7 +91,6 @@ config BATMAN_ADV_DEBUGFS
bool "batman-adv debugfs entries"
depends on BATMAN_ADV
depends on DEBUG_FS
- default n
help
Enable this to export routing related debug tables via debugfs.
The information for each soft-interface and used hard-interface can be
@@ -110,3 +106,14 @@ config BATMAN_ADV_DEBUG
say N here. This enables compilation of support for
outputting debugging information to the kernel log. The
output is controlled via the module parameter debug.
+
+config BATMAN_ADV_TRACING
+ bool "B.A.T.M.A.N. tracing support"
+ depends on BATMAN_ADV
+ depends on EVENT_TRACING
+ help
+ This is an option for use by developers; most people should
+ say N here. Select this option to gather traces like the debug
+ messages using the generic tracing infrastructure of the kernel.
+ BATMAN_ADV_DEBUG must also be selected to get trace events for
+ batadv_dbg.
diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile
index b97ba6fb8353..9b58160fe485 100644
--- a/net/batman-adv/Makefile
+++ b/net/batman-adv/Makefile
@@ -42,6 +42,9 @@ batman-adv-y += routing.o
batman-adv-y += send.o
batman-adv-y += soft-interface.o
batman-adv-y += sysfs.o
+batman-adv-$(CONFIG_BATMAN_ADV_TRACING) += trace.o
batman-adv-y += tp_meter.o
batman-adv-y += translation-table.o
batman-adv-y += tvlv.o
+
+CFLAGS_trace.o := -I$(src)
diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index 73bf6a93a3cf..d2227091029f 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -138,169 +138,6 @@ static u8 batadv_ring_buffer_avg(const u8 lq_recv[])
}
/**
- * batadv_iv_ogm_orig_free() - free the private resources allocated for this
- * orig_node
- * @orig_node: the orig_node for which the resources have to be free'd
- */
-static void batadv_iv_ogm_orig_free(struct batadv_orig_node *orig_node)
-{
- kfree(orig_node->bat_iv.bcast_own);
- kfree(orig_node->bat_iv.bcast_own_sum);
-}
-
-/**
- * batadv_iv_ogm_orig_add_if() - change the private structures of the orig_node
- * to include the new hard-interface
- * @orig_node: the orig_node that has to be changed
- * @max_if_num: the current amount of interfaces
- *
- * Return: 0 on success, a negative error code otherwise.
- */
-static int batadv_iv_ogm_orig_add_if(struct batadv_orig_node *orig_node,
- unsigned int max_if_num)
-{
- void *data_ptr;
- size_t old_size;
- int ret = -ENOMEM;
-
- spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock);
-
- old_size = (max_if_num - 1) * sizeof(unsigned long) * BATADV_NUM_WORDS;
- data_ptr = kmalloc_array(max_if_num,
- BATADV_NUM_WORDS * sizeof(unsigned long),
- GFP_ATOMIC);
- if (!data_ptr)
- goto unlock;
-
- memcpy(data_ptr, orig_node->bat_iv.bcast_own, old_size);
- kfree(orig_node->bat_iv.bcast_own);
- orig_node->bat_iv.bcast_own = data_ptr;
-
- data_ptr = kmalloc_array(max_if_num, sizeof(u8), GFP_ATOMIC);
- if (!data_ptr)
- goto unlock;
-
- memcpy(data_ptr, orig_node->bat_iv.bcast_own_sum,
- (max_if_num - 1) * sizeof(u8));
- kfree(orig_node->bat_iv.bcast_own_sum);
- orig_node->bat_iv.bcast_own_sum = data_ptr;
-
- ret = 0;
-
-unlock:
- spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock);
-
- return ret;
-}
-
-/**
- * batadv_iv_ogm_drop_bcast_own_entry() - drop section of bcast_own
- * @orig_node: the orig_node that has to be changed
- * @max_if_num: the current amount of interfaces
- * @del_if_num: the index of the interface being removed
- */
-static void
-batadv_iv_ogm_drop_bcast_own_entry(struct batadv_orig_node *orig_node,
- unsigned int max_if_num,
- unsigned int del_if_num)
-{
- size_t chunk_size;
- size_t if_offset;
- void *data_ptr;
-
- lockdep_assert_held(&orig_node->bat_iv.ogm_cnt_lock);
-
- chunk_size = sizeof(unsigned long) * BATADV_NUM_WORDS;
- data_ptr = kmalloc_array(max_if_num, chunk_size, GFP_ATOMIC);
- if (!data_ptr)
- /* use old buffer when new one could not be allocated */
- data_ptr = orig_node->bat_iv.bcast_own;
-
- /* copy first part */
- memmove(data_ptr, orig_node->bat_iv.bcast_own, del_if_num * chunk_size);
-
- /* copy second part */
- if_offset = (del_if_num + 1) * chunk_size;
- memmove((char *)data_ptr + del_if_num * chunk_size,
- (uint8_t *)orig_node->bat_iv.bcast_own + if_offset,
- (max_if_num - del_if_num) * chunk_size);
-
- /* bcast_own was shrunk down in new buffer; free old one */
- if (orig_node->bat_iv.bcast_own != data_ptr) {
- kfree(orig_node->bat_iv.bcast_own);
- orig_node->bat_iv.bcast_own = data_ptr;
- }
-}
-
-/**
- * batadv_iv_ogm_drop_bcast_own_sum_entry() - drop section of bcast_own_sum
- * @orig_node: the orig_node that has to be changed
- * @max_if_num: the current amount of interfaces
- * @del_if_num: the index of the interface being removed
- */
-static void
-batadv_iv_ogm_drop_bcast_own_sum_entry(struct batadv_orig_node *orig_node,
- unsigned int max_if_num,
- unsigned int del_if_num)
-{
- size_t if_offset;
- void *data_ptr;
-
- lockdep_assert_held(&orig_node->bat_iv.ogm_cnt_lock);
-
- data_ptr = kmalloc_array(max_if_num, sizeof(u8), GFP_ATOMIC);
- if (!data_ptr)
- /* use old buffer when new one could not be allocated */
- data_ptr = orig_node->bat_iv.bcast_own_sum;
-
- memmove(data_ptr, orig_node->bat_iv.bcast_own_sum,
- del_if_num * sizeof(u8));
-
- if_offset = (del_if_num + 1) * sizeof(u8);
- memmove((char *)data_ptr + del_if_num * sizeof(u8),
- orig_node->bat_iv.bcast_own_sum + if_offset,
- (max_if_num - del_if_num) * sizeof(u8));
-
- /* bcast_own_sum was shrunk down in new buffer; free old one */
- if (orig_node->bat_iv.bcast_own_sum != data_ptr) {
- kfree(orig_node->bat_iv.bcast_own_sum);
- orig_node->bat_iv.bcast_own_sum = data_ptr;
- }
-}
-
-/**
- * batadv_iv_ogm_orig_del_if() - change the private structures of the orig_node
- * to exclude the removed interface
- * @orig_node: the orig_node that has to be changed
- * @max_if_num: the current amount of interfaces
- * @del_if_num: the index of the interface being removed
- *
- * Return: 0 on success, a negative error code otherwise.
- */
-static int batadv_iv_ogm_orig_del_if(struct batadv_orig_node *orig_node,
- unsigned int max_if_num,
- unsigned int del_if_num)
-{
- spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock);
-
- if (max_if_num == 0) {
- kfree(orig_node->bat_iv.bcast_own);
- kfree(orig_node->bat_iv.bcast_own_sum);
- orig_node->bat_iv.bcast_own = NULL;
- orig_node->bat_iv.bcast_own_sum = NULL;
- } else {
- batadv_iv_ogm_drop_bcast_own_entry(orig_node, max_if_num,
- del_if_num);
- batadv_iv_ogm_drop_bcast_own_sum_entry(orig_node, max_if_num,
- del_if_num);
- }
-
- spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock);
-
- return 0;
-}
-
-/**
* batadv_iv_ogm_orig_get() - retrieve or create (if does not exist) an
* originator
* @bat_priv: the bat priv with all the soft interface information
@@ -315,7 +152,6 @@ batadv_iv_ogm_orig_get(struct batadv_priv *bat_priv, const u8 *addr)
{
struct batadv_orig_node *orig_node;
int hash_added;
- size_t size;
orig_node = batadv_orig_hash_find(bat_priv, addr);
if (orig_node)
@@ -327,16 +163,6 @@ batadv_iv_ogm_orig_get(struct batadv_priv *bat_priv, const u8 *addr)
spin_lock_init(&orig_node->bat_iv.ogm_cnt_lock);
- size = bat_priv->num_ifaces * sizeof(unsigned long) * BATADV_NUM_WORDS;
- orig_node->bat_iv.bcast_own = kzalloc(size, GFP_ATOMIC);
- if (!orig_node->bat_iv.bcast_own)
- goto free_orig_node;
-
- size = bat_priv->num_ifaces * sizeof(u8);
- orig_node->bat_iv.bcast_own_sum = kzalloc(size, GFP_ATOMIC);
- if (!orig_node->bat_iv.bcast_own_sum)
- goto free_orig_node;
-
kref_get(&orig_node->refcount);
hash_added = batadv_hash_add(bat_priv->orig_hash, batadv_compare_orig,
batadv_choose_orig, orig_node,
@@ -347,8 +173,9 @@ batadv_iv_ogm_orig_get(struct batadv_priv *bat_priv, const u8 *addr)
return orig_node;
free_orig_node_hash:
+ /* reference for batadv_hash_add */
batadv_orig_node_put(orig_node);
-free_orig_node:
+ /* reference from batadv_orig_node_new */
batadv_orig_node_put(orig_node);
return NULL;
@@ -893,26 +720,30 @@ batadv_iv_ogm_slide_own_bcast_window(struct batadv_hard_iface *hard_iface)
struct batadv_hashtable *hash = bat_priv->orig_hash;
struct hlist_head *head;
struct batadv_orig_node *orig_node;
+ struct batadv_orig_ifinfo *orig_ifinfo;
unsigned long *word;
u32 i;
- size_t word_index;
u8 *w;
- unsigned int if_num;
for (i = 0; i < hash->size; i++) {
head = &hash->table[i];
rcu_read_lock();
hlist_for_each_entry_rcu(orig_node, head, hash_entry) {
- spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock);
- word_index = hard_iface->if_num * BATADV_NUM_WORDS;
- word = &orig_node->bat_iv.bcast_own[word_index];
-
- batadv_bit_get_packet(bat_priv, word, 1, 0);
- if_num = hard_iface->if_num;
- w = &orig_node->bat_iv.bcast_own_sum[if_num];
- *w = bitmap_weight(word, BATADV_TQ_LOCAL_WINDOW_SIZE);
- spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock);
+ hlist_for_each_entry_rcu(orig_ifinfo,
+ &orig_node->ifinfo_list,
+ list) {
+ if (orig_ifinfo->if_outgoing != hard_iface)
+ continue;
+
+ spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock);
+ word = orig_ifinfo->bat_iv.bcast_own;
+ batadv_bit_get_packet(bat_priv, word, 1, 0);
+ w = &orig_ifinfo->bat_iv.bcast_own_sum;
+ *w = bitmap_weight(word,
+ BATADV_TQ_LOCAL_WINDOW_SIZE);
+ spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock);
+ }
}
rcu_read_unlock();
}
@@ -1000,6 +831,35 @@ out:
}
/**
+ * batadv_iv_orig_ifinfo_sum() - Get bcast_own sum for originator over iterface
+ * @orig_node: originator which reproadcasted the OGMs directly
+ * @if_outgoing: interface which transmitted the original OGM and received the
+ * direct rebroadcast
+ *
+ * Return: Number of replied (rebroadcasted) OGMs which were transmitted by
+ * an originator and directly (without intermediate hop) received by a specific
+ * interface
+ */
+static u8 batadv_iv_orig_ifinfo_sum(struct batadv_orig_node *orig_node,
+ struct batadv_hard_iface *if_outgoing)
+{
+ struct batadv_orig_ifinfo *orig_ifinfo;
+ u8 sum;
+
+ orig_ifinfo = batadv_orig_ifinfo_get(orig_node, if_outgoing);
+ if (!orig_ifinfo)
+ return 0;
+
+ spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock);
+ sum = orig_ifinfo->bat_iv.bcast_own_sum;
+ spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock);
+
+ batadv_orig_ifinfo_put(orig_ifinfo);
+
+ return sum;
+}
+
+/**
* batadv_iv_ogm_orig_update() - use OGM to update corresponding data in an
* originator
* @bat_priv: the bat priv with all the soft interface information
@@ -1026,8 +886,6 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv,
struct batadv_neigh_node *neigh_node = NULL;
struct batadv_neigh_node *tmp_neigh_node = NULL;
struct batadv_neigh_node *router = NULL;
- struct batadv_orig_node *orig_node_tmp;
- unsigned int if_num;
u8 sum_orig, sum_neigh;
u8 *neigh_addr;
u8 tq_avg;
@@ -1132,18 +990,10 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv,
*/
if (router_ifinfo &&
neigh_ifinfo->bat_iv.tq_avg == router_ifinfo->bat_iv.tq_avg) {
- orig_node_tmp = router->orig_node;
- spin_lock_bh(&orig_node_tmp->bat_iv.ogm_cnt_lock);
- if_num = router->if_incoming->if_num;
- sum_orig = orig_node_tmp->bat_iv.bcast_own_sum[if_num];
- spin_unlock_bh(&orig_node_tmp->bat_iv.ogm_cnt_lock);
-
- orig_node_tmp = neigh_node->orig_node;
- spin_lock_bh(&orig_node_tmp->bat_iv.ogm_cnt_lock);
- if_num = neigh_node->if_incoming->if_num;
- sum_neigh = orig_node_tmp->bat_iv.bcast_own_sum[if_num];
- spin_unlock_bh(&orig_node_tmp->bat_iv.ogm_cnt_lock);
-
+ sum_orig = batadv_iv_orig_ifinfo_sum(router->orig_node,
+ router->if_incoming);
+ sum_neigh = batadv_iv_orig_ifinfo_sum(neigh_node->orig_node,
+ neigh_node->if_incoming);
if (sum_orig >= sum_neigh)
goto out;
}
@@ -1186,7 +1036,6 @@ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node,
u8 total_count;
u8 orig_eq_count, neigh_rq_count, neigh_rq_inv, tq_own;
unsigned int neigh_rq_inv_cube, neigh_rq_max_cube;
- unsigned int if_num;
unsigned int tq_asym_penalty, inv_asym_penalty;
unsigned int combined_tq;
unsigned int tq_iface_penalty;
@@ -1227,9 +1076,7 @@ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node,
orig_node->last_seen = jiffies;
/* find packet count of corresponding one hop neighbor */
- spin_lock_bh(&orig_neigh_node->bat_iv.ogm_cnt_lock);
- if_num = if_incoming->if_num;
- orig_eq_count = orig_neigh_node->bat_iv.bcast_own_sum[if_num];
+ orig_eq_count = batadv_iv_orig_ifinfo_sum(orig_neigh_node, if_incoming);
neigh_ifinfo = batadv_neigh_ifinfo_new(neigh_node, if_outgoing);
if (neigh_ifinfo) {
neigh_rq_count = neigh_ifinfo->bat_iv.real_packet_count;
@@ -1237,7 +1084,6 @@ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node,
} else {
neigh_rq_count = 0;
}
- spin_unlock_bh(&orig_neigh_node->bat_iv.ogm_cnt_lock);
/* pay attention to not get a value bigger than 100 % */
if (orig_eq_count > neigh_rq_count)
@@ -1622,6 +1468,49 @@ out:
}
/**
+ * batadv_iv_ogm_process_reply() - Check OGM for direct reply and process it
+ * @ogm_packet: rebroadcast OGM packet to process
+ * @if_incoming: the interface where this packet was received
+ * @orig_node: originator which reproadcasted the OGMs
+ * @if_incoming_seqno: OGM sequence number when rebroadcast was received
+ */
+static void batadv_iv_ogm_process_reply(struct batadv_ogm_packet *ogm_packet,
+ struct batadv_hard_iface *if_incoming,
+ struct batadv_orig_node *orig_node,
+ u32 if_incoming_seqno)
+{
+ struct batadv_orig_ifinfo *orig_ifinfo;
+ s32 bit_pos;
+ u8 *weight;
+
+ /* neighbor has to indicate direct link and it has to
+ * come via the corresponding interface
+ */
+ if (!(ogm_packet->flags & BATADV_DIRECTLINK))
+ return;
+
+ if (!batadv_compare_eth(if_incoming->net_dev->dev_addr,
+ ogm_packet->orig))
+ return;
+
+ orig_ifinfo = batadv_orig_ifinfo_get(orig_node, if_incoming);
+ if (!orig_ifinfo)
+ return;
+
+ /* save packet seqno for bidirectional check */
+ spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock);
+ bit_pos = if_incoming_seqno - 2;
+ bit_pos -= ntohl(ogm_packet->seqno);
+ batadv_set_bit(orig_ifinfo->bat_iv.bcast_own, bit_pos);
+ weight = &orig_ifinfo->bat_iv.bcast_own_sum;
+ *weight = bitmap_weight(orig_ifinfo->bat_iv.bcast_own,
+ BATADV_TQ_LOCAL_WINDOW_SIZE);
+ spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock);
+
+ batadv_orig_ifinfo_put(orig_ifinfo);
+}
+
+/**
* batadv_iv_ogm_process() - process an incoming batman iv OGM
* @skb: the skb containing the OGM
* @ogm_offset: offset to the OGM which should be processed (for aggregates)
@@ -1705,37 +1594,13 @@ static void batadv_iv_ogm_process(const struct sk_buff *skb, int ogm_offset,
}
if (is_my_orig) {
- unsigned long *word;
- size_t offset;
- s32 bit_pos;
- unsigned int if_num;
- u8 *weight;
-
orig_neigh_node = batadv_iv_ogm_orig_get(bat_priv,
ethhdr->h_source);
if (!orig_neigh_node)
return;
- /* neighbor has to indicate direct link and it has to
- * come via the corresponding interface
- * save packet seqno for bidirectional check
- */
- if (has_directlink_flag &&
- batadv_compare_eth(if_incoming->net_dev->dev_addr,
- ogm_packet->orig)) {
- if_num = if_incoming->if_num;
- offset = if_num * BATADV_NUM_WORDS;
-
- spin_lock_bh(&orig_neigh_node->bat_iv.ogm_cnt_lock);
- word = &orig_neigh_node->bat_iv.bcast_own[offset];
- bit_pos = if_incoming_seqno - 2;
- bit_pos -= ntohl(ogm_packet->seqno);
- batadv_set_bit(word, bit_pos);
- weight = &orig_neigh_node->bat_iv.bcast_own_sum[if_num];
- *weight = bitmap_weight(word,
- BATADV_TQ_LOCAL_WINDOW_SIZE);
- spin_unlock_bh(&orig_neigh_node->bat_iv.ogm_cnt_lock);
- }
+ batadv_iv_ogm_process_reply(ogm_packet, if_incoming,
+ orig_neigh_node, if_incoming_seqno);
batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
"Drop packet: originator packet from myself (via neighbor)\n");
@@ -2844,9 +2709,6 @@ static struct batadv_algo_ops batadv_batman_iv __read_mostly = {
.print = batadv_iv_ogm_orig_print,
#endif
.dump = batadv_iv_ogm_orig_dump,
- .free = batadv_iv_ogm_orig_free,
- .add_if = batadv_iv_ogm_orig_add_if,
- .del_if = batadv_iv_ogm_orig_del_if,
},
.gw = {
.init_sel_class = batadv_iv_init_sel_class,
diff --git a/net/batman-adv/bat_iv_ogm.h b/net/batman-adv/bat_iv_ogm.h
index 317cafd302cf..3dc6a7a43eb7 100644
--- a/net/batman-adv/bat_iv_ogm.h
+++ b/net/batman-adv/bat_iv_ogm.h
@@ -16,11 +16,11 @@
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
-#ifndef _BATMAN_ADV_BATADV_IV_OGM_H_
-#define _BATMAN_ADV_BATADV_IV_OGM_H_
+#ifndef _NET_BATMAN_ADV_BAT_IV_OGM_H_
+#define _NET_BATMAN_ADV_BAT_IV_OGM_H_
#include "main.h"
int batadv_iv_init(void);
-#endif /* _BATMAN_ADV_BATADV_IV_OGM_H_ */
+#endif /* _NET_BATMAN_ADV_BAT_IV_OGM_H_ */
diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c
index 71c20c1d4002..9f481cfdf77d 100644
--- a/net/batman-adv/bat_v_elp.c
+++ b/net/batman-adv/bat_v_elp.c
@@ -241,7 +241,7 @@ batadv_v_elp_wifi_neigh_probe(struct batadv_hardif_neigh_node *neigh)
* the packet to be exactly of that size to make the link
* throughput estimation effective.
*/
- skb_put(skb, probe_len - hard_iface->bat_v.elp_skb->len);
+ skb_put_zero(skb, probe_len - hard_iface->bat_v.elp_skb->len);
batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
"Sending unicast (probe) ELP packet on interface %s to %pM\n",
@@ -268,6 +268,7 @@ static void batadv_v_elp_periodic_work(struct work_struct *work)
struct batadv_priv *bat_priv;
struct sk_buff *skb;
u32 elp_interval;
+ bool ret;
bat_v = container_of(work, struct batadv_hard_iface_bat_v, elp_wq.work);
hard_iface = container_of(bat_v, struct batadv_hard_iface, bat_v);
@@ -329,8 +330,11 @@ static void batadv_v_elp_periodic_work(struct work_struct *work)
* may sleep and that is not allowed in an rcu protected
* context. Therefore schedule a task for that.
*/
- queue_work(batadv_event_workqueue,
- &hardif_neigh->bat_v.metric_work);
+ ret = queue_work(batadv_event_workqueue,
+ &hardif_neigh->bat_v.metric_work);
+
+ if (!ret)
+ batadv_hardif_neigh_put(hardif_neigh);
}
rcu_read_unlock();
diff --git a/net/batman-adv/bat_v_ogm.h b/net/batman-adv/bat_v_ogm.h
index ed36c5e79fde..e5be14c908c6 100644
--- a/net/batman-adv/bat_v_ogm.h
+++ b/net/batman-adv/bat_v_ogm.h
@@ -16,8 +16,8 @@
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
-#ifndef _BATMAN_ADV_BATADV_V_OGM_H_
-#define _BATMAN_ADV_BATADV_V_OGM_H_
+#ifndef _NET_BATMAN_ADV_BAT_V_OGM_H_
+#define _NET_BATMAN_ADV_BAT_V_OGM_H_
#include "main.h"
@@ -34,4 +34,4 @@ void batadv_v_ogm_primary_iface_set(struct batadv_hard_iface *primary_iface);
int batadv_v_ogm_packet_recv(struct sk_buff *skb,
struct batadv_hard_iface *if_incoming);
-#endif /* _BATMAN_ADV_BATADV_V_OGM_H_ */
+#endif /* _NET_BATMAN_ADV_BAT_V_OGM_H_ */
diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c
index a2de5a44bd41..5f1aeeded0e3 100644
--- a/net/batman-adv/bridge_loop_avoidance.c
+++ b/net/batman-adv/bridge_loop_avoidance.c
@@ -1449,7 +1449,7 @@ static void batadv_bla_periodic_work(struct work_struct *work)
* detection frames. Set the locally administered bit to avoid
* collisions with users mac addresses.
*/
- random_ether_addr(bat_priv->bla.loopdetect_addr);
+ eth_random_addr(bat_priv->bla.loopdetect_addr);
bat_priv->bla.loopdetect_addr[0] = 0xba;
bat_priv->bla.loopdetect_addr[1] = 0xbe;
bat_priv->bla.loopdetect_lasttime = jiffies;
@@ -1772,6 +1772,7 @@ batadv_bla_loopdetect_check(struct batadv_priv *bat_priv, struct sk_buff *skb,
{
struct batadv_bla_backbone_gw *backbone_gw;
struct ethhdr *ethhdr;
+ bool ret;
ethhdr = eth_hdr(skb);
@@ -1795,8 +1796,13 @@ batadv_bla_loopdetect_check(struct batadv_priv *bat_priv, struct sk_buff *skb,
if (unlikely(!backbone_gw))
return true;
- queue_work(batadv_event_workqueue, &backbone_gw->report_work);
- /* backbone_gw is unreferenced in the report work function function */
+ ret = queue_work(batadv_event_workqueue, &backbone_gw->report_work);
+
+ /* backbone_gw is unreferenced in the report work function function
+ * if queue_work() call was successful
+ */
+ if (!ret)
+ batadv_backbone_gw_put(backbone_gw);
return true;
}
diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c
index 87479c60670e..8b608a2e2653 100644
--- a/net/batman-adv/debugfs.c
+++ b/net/batman-adv/debugfs.c
@@ -47,8 +47,24 @@
static struct dentry *batadv_debugfs;
+/**
+ * batadv_debugfs_deprecated() - Log use of deprecated batadv debugfs access
+ * @file: file which was accessed
+ * @alt: explanation what can be used as alternative
+ */
+void batadv_debugfs_deprecated(struct file *file, const char *alt)
+{
+ struct dentry *dentry = file_dentry(file);
+ const char *name = dentry->d_name.name;
+
+ pr_warn_ratelimited(DEPRECATED "%s (pid %d) Use of debugfs file \"%s\".\n%s",
+ current->comm, task_pid_nr(current), name, alt);
+}
+
static int batadv_algorithms_open(struct inode *inode, struct file *file)
{
+ batadv_debugfs_deprecated(file,
+ "Use genl command BATADV_CMD_GET_ROUTING_ALGOS instead\n");
return single_open(file, batadv_algo_seq_print_text, NULL);
}
@@ -56,6 +72,8 @@ static int neighbors_open(struct inode *inode, struct file *file)
{
struct net_device *net_dev = (struct net_device *)inode->i_private;
+ batadv_debugfs_deprecated(file,
+ "Use genl command BATADV_CMD_GET_NEIGHBORS instead\n");
return single_open(file, batadv_hardif_neigh_seq_print_text, net_dev);
}
@@ -63,6 +81,8 @@ static int batadv_originators_open(struct inode *inode, struct file *file)
{
struct net_device *net_dev = (struct net_device *)inode->i_private;
+ batadv_debugfs_deprecated(file,
+ "Use genl command BATADV_CMD_GET_ORIGINATORS instead\n");
return single_open(file, batadv_orig_seq_print_text, net_dev);
}
@@ -79,6 +99,8 @@ static int batadv_originators_hardif_open(struct inode *inode,
{
struct net_device *net_dev = (struct net_device *)inode->i_private;
+ batadv_debugfs_deprecated(file,
+ "Use genl command BATADV_CMD_GET_HARDIFS instead\n");
return single_open(file, batadv_orig_hardif_seq_print_text, net_dev);
}
@@ -86,6 +108,8 @@ static int batadv_gateways_open(struct inode *inode, struct file *file)
{
struct net_device *net_dev = (struct net_device *)inode->i_private;
+ batadv_debugfs_deprecated(file,
+ "Use genl command BATADV_CMD_GET_GATEWAYS instead\n");
return single_open(file, batadv_gw_client_seq_print_text, net_dev);
}
@@ -93,6 +117,8 @@ static int batadv_transtable_global_open(struct inode *inode, struct file *file)
{
struct net_device *net_dev = (struct net_device *)inode->i_private;
+ batadv_debugfs_deprecated(file,
+ "Use genl command BATADV_CMD_GET_TRANSTABLE_GLOBAL instead\n");
return single_open(file, batadv_tt_global_seq_print_text, net_dev);
}
@@ -101,6 +127,8 @@ static int batadv_bla_claim_table_open(struct inode *inode, struct file *file)
{
struct net_device *net_dev = (struct net_device *)inode->i_private;
+ batadv_debugfs_deprecated(file,
+ "Use genl command BATADV_CMD_GET_BLA_CLAIM instead\n");
return single_open(file, batadv_bla_claim_table_seq_print_text,
net_dev);
}
@@ -110,6 +138,8 @@ static int batadv_bla_backbone_table_open(struct inode *inode,
{
struct net_device *net_dev = (struct net_device *)inode->i_private;
+ batadv_debugfs_deprecated(file,
+ "Use genl command BATADV_CMD_GET_BLA_BACKBONE instead\n");
return single_open(file, batadv_bla_backbone_table_seq_print_text,
net_dev);
}
@@ -118,7 +148,7 @@ static int batadv_bla_backbone_table_open(struct inode *inode,
#ifdef CONFIG_BATMAN_ADV_DAT
/**
- * batadv_dat_cache_open() - Prepare file handler for reads from dat_chache
+ * batadv_dat_cache_open() - Prepare file handler for reads from dat_cache
* @inode: inode which was opened
* @file: file handle to be initialized
*
@@ -128,6 +158,8 @@ static int batadv_dat_cache_open(struct inode *inode, struct file *file)
{
struct net_device *net_dev = (struct net_device *)inode->i_private;
+ batadv_debugfs_deprecated(file,
+ "Use genl command BATADV_CMD_GET_DAT_CACHE instead\n");
return single_open(file, batadv_dat_cache_seq_print_text, net_dev);
}
#endif
@@ -136,6 +168,8 @@ static int batadv_transtable_local_open(struct inode *inode, struct file *file)
{
struct net_device *net_dev = (struct net_device *)inode->i_private;
+ batadv_debugfs_deprecated(file,
+ "Use genl command BATADV_CMD_GET_TRANSTABLE_LOCAL instead\n");
return single_open(file, batadv_tt_local_seq_print_text, net_dev);
}
@@ -149,6 +183,7 @@ static int batadv_nc_nodes_open(struct inode *inode, struct file *file)
{
struct net_device *net_dev = (struct net_device *)inode->i_private;
+ batadv_debugfs_deprecated(file, "");
return single_open(file, batadv_nc_nodes_seq_print_text, net_dev);
}
#endif
@@ -165,6 +200,8 @@ static int batadv_mcast_flags_open(struct inode *inode, struct file *file)
{
struct net_device *net_dev = (struct net_device *)inode->i_private;
+ batadv_debugfs_deprecated(file,
+ "Use genl command BATADV_CMD_GET_MCAST_FLAGS instead\n");
return single_open(file, batadv_mcast_flags_seq_print_text, net_dev);
}
#endif
diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h
index 08a592ffbee5..8de018e5c577 100644
--- a/net/batman-adv/debugfs.h
+++ b/net/batman-adv/debugfs.h
@@ -21,12 +21,14 @@
#include "main.h"
+struct file;
struct net_device;
#define BATADV_DEBUGFS_SUBDIR "batman_adv"
#if IS_ENABLED(CONFIG_BATMAN_ADV_DEBUGFS)
+void batadv_debugfs_deprecated(struct file *file, const char *alt);
void batadv_debugfs_init(void);
void batadv_debugfs_destroy(void);
int batadv_debugfs_add_meshif(struct net_device *dev);
@@ -38,6 +40,10 @@ void batadv_debugfs_del_hardif(struct batadv_hard_iface *hard_iface);
#else
+static inline void batadv_debugfs_deprecated(struct file *file, const char *alt)
+{
+}
+
static inline void batadv_debugfs_init(void)
{
}
diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c
index 8b198ee798c9..140c61a3f1ec 100644
--- a/net/batman-adv/gateway_client.c
+++ b/net/batman-adv/gateway_client.c
@@ -32,6 +32,7 @@
#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/list.h>
+#include <linux/lockdep.h>
#include <linux/netdevice.h>
#include <linux/netlink.h>
#include <linux/rculist.h>
@@ -348,6 +349,9 @@ out:
* @bat_priv: the bat priv with all the soft interface information
* @orig_node: originator announcing gateway capabilities
* @gateway: announced bandwidth information
+ *
+ * Has to be called with the appropriate locks being acquired
+ * (gw.list_lock).
*/
static void batadv_gw_node_add(struct batadv_priv *bat_priv,
struct batadv_orig_node *orig_node,
@@ -355,6 +359,8 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv,
{
struct batadv_gw_node *gw_node;
+ lockdep_assert_held(&bat_priv->gw.list_lock);
+
if (gateway->bandwidth_down == 0)
return;
@@ -369,10 +375,8 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv,
gw_node->bandwidth_down = ntohl(gateway->bandwidth_down);
gw_node->bandwidth_up = ntohl(gateway->bandwidth_up);
- spin_lock_bh(&bat_priv->gw.list_lock);
kref_get(&gw_node->refcount);
hlist_add_head_rcu(&gw_node->list, &bat_priv->gw.gateway_list);
- spin_unlock_bh(&bat_priv->gw.list_lock);
batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
"Found new gateway %pM -> gw bandwidth: %u.%u/%u.%u MBit\n",
@@ -428,11 +432,14 @@ void batadv_gw_node_update(struct batadv_priv *bat_priv,
{
struct batadv_gw_node *gw_node, *curr_gw = NULL;
+ spin_lock_bh(&bat_priv->gw.list_lock);
gw_node = batadv_gw_node_get(bat_priv, orig_node);
if (!gw_node) {
batadv_gw_node_add(bat_priv, orig_node, gateway);
+ spin_unlock_bh(&bat_priv->gw.list_lock);
goto out;
}
+ spin_unlock_bh(&bat_priv->gw.list_lock);
if (gw_node->bandwidth_down == ntohl(gateway->bandwidth_down) &&
gw_node->bandwidth_up == ntohl(gateway->bandwidth_up))
diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c
index 2f0d42f2f913..781c5b6e6e8e 100644
--- a/net/batman-adv/hard-interface.c
+++ b/net/batman-adv/hard-interface.c
@@ -763,11 +763,6 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface,
hard_iface->soft_iface = soft_iface;
bat_priv = netdev_priv(hard_iface->soft_iface);
- if (bat_priv->num_ifaces >= UINT_MAX) {
- ret = -ENOSPC;
- goto err_dev;
- }
-
ret = netdev_master_upper_dev_link(hard_iface->net_dev,
soft_iface, NULL, NULL, NULL);
if (ret)
@@ -777,16 +772,7 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface,
if (ret < 0)
goto err_upper;
- hard_iface->if_num = bat_priv->num_ifaces;
- bat_priv->num_ifaces++;
hard_iface->if_status = BATADV_IF_INACTIVE;
- ret = batadv_orig_hash_add_if(hard_iface, bat_priv->num_ifaces);
- if (ret < 0) {
- bat_priv->algo_ops->iface.disable(hard_iface);
- bat_priv->num_ifaces--;
- hard_iface->if_status = BATADV_IF_NOT_IN_USE;
- goto err_upper;
- }
kref_get(&hard_iface->refcount);
hard_iface->batman_adv_ptype.type = ethertype;
@@ -834,6 +820,33 @@ err:
}
/**
+ * batadv_hardif_cnt() - get number of interfaces enslaved to soft interface
+ * @soft_iface: soft interface to check
+ *
+ * This function is only using RCU for locking - the result can therefore be
+ * off when another functions is modifying the list at the same time. The
+ * caller can use the rtnl_lock to make sure that the count is accurate.
+ *
+ * Return: number of connected/enslaved hard interfaces
+ */
+static size_t batadv_hardif_cnt(const struct net_device *soft_iface)
+{
+ struct batadv_hard_iface *hard_iface;
+ size_t count = 0;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
+ if (hard_iface->soft_iface != soft_iface)
+ continue;
+
+ count++;
+ }
+ rcu_read_unlock();
+
+ return count;
+}
+
+/**
* batadv_hardif_disable_interface() - Remove hard interface from soft interface
* @hard_iface: hard interface to be removed
* @autodel: whether to delete soft interface when it doesn't contain any other
@@ -855,9 +868,6 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface,
dev_remove_pack(&hard_iface->batman_adv_ptype);
batadv_hardif_put(hard_iface);
- bat_priv->num_ifaces--;
- batadv_orig_hash_del_if(hard_iface, bat_priv->num_ifaces);
-
primary_if = batadv_primary_if_get_selected(bat_priv);
if (hard_iface == primary_if) {
struct batadv_hard_iface *new_if;
@@ -881,7 +891,7 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface,
batadv_hardif_recalc_extra_skbroom(hard_iface->soft_iface);
/* nobody uses this interface anymore */
- if (bat_priv->num_ifaces == 0) {
+ if (batadv_hardif_cnt(hard_iface->soft_iface) <= 1) {
batadv_gw_check_client_stop(bat_priv);
if (autodel == BATADV_IF_CLEANUP_AUTO)
@@ -917,7 +927,6 @@ batadv_hardif_add_interface(struct net_device *net_dev)
if (ret)
goto free_if;
- hard_iface->if_num = 0;
hard_iface->net_dev = net_dev;
hard_iface->soft_iface = NULL;
hard_iface->if_status = BATADV_IF_NOT_IN_USE;
diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c
index 55c358ad3331..d70f363c52ae 100644
--- a/net/batman-adv/icmp_socket.c
+++ b/net/batman-adv/icmp_socket.c
@@ -47,6 +47,7 @@
#include <linux/wait.h>
#include <uapi/linux/batadv_packet.h>
+#include "debugfs.h"
#include "hard-interface.h"
#include "log.h"
#include "originator.h"
@@ -74,6 +75,8 @@ static int batadv_socket_open(struct inode *inode, struct file *file)
if (!try_module_get(THIS_MODULE))
return -EBUSY;
+ batadv_debugfs_deprecated(file, "");
+
nonseekable_open(inode, file);
socket_client = kmalloc(sizeof(*socket_client), GFP_KERNEL);
diff --git a/net/batman-adv/log.c b/net/batman-adv/log.c
index 853773e45f79..6beb5f067810 100644
--- a/net/batman-adv/log.c
+++ b/net/batman-adv/log.c
@@ -40,6 +40,9 @@
#include <linux/wait.h>
#include <stdarg.h>
+#include "debugfs.h"
+#include "trace.h"
+
#define BATADV_LOG_BUFF_MASK (batadv_log_buff_len - 1)
static const int batadv_log_buff_len = BATADV_LOG_BUF_LEN;
@@ -98,13 +101,19 @@ static int batadv_fdebug_log(struct batadv_priv_debug_log *debug_log,
*/
int batadv_debug_log(struct batadv_priv *bat_priv, const char *fmt, ...)
{
+ struct va_format vaf;
va_list args;
- char tmp_log_buf[256];
va_start(args, fmt);
- vscnprintf(tmp_log_buf, sizeof(tmp_log_buf), fmt, args);
- batadv_fdebug_log(bat_priv->debug_log, "[%10u] %s",
- jiffies_to_msecs(jiffies), tmp_log_buf);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ batadv_fdebug_log(bat_priv->debug_log, "[%10u] %pV",
+ jiffies_to_msecs(jiffies), &vaf);
+
+ trace_batadv_dbg(bat_priv, &vaf);
+
va_end(args);
return 0;
@@ -115,6 +124,9 @@ static int batadv_log_open(struct inode *inode, struct file *file)
if (!try_module_get(THIS_MODULE))
return -EBUSY;
+ batadv_debugfs_deprecated(file,
+ "Use tracepoint batadv:batadv_dbg instead\n");
+
nonseekable_open(inode, file);
file->private_data = inode->i_private;
return 0;
diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index 8da3c9336111..2002b70e18db 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -25,7 +25,7 @@
#define BATADV_DRIVER_DEVICE "batman-adv"
#ifndef BATADV_SOURCE_VERSION
-#define BATADV_SOURCE_VERSION "2018.2"
+#define BATADV_SOURCE_VERSION "2018.4"
#endif
/* B.A.T.M.A.N. parameters */
diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c
index c3578444f3cb..34caf129a9bf 100644
--- a/net/batman-adv/network-coding.c
+++ b/net/batman-adv/network-coding.c
@@ -854,16 +854,27 @@ batadv_nc_get_nc_node(struct batadv_priv *bat_priv,
spinlock_t *lock; /* Used to lock list selected by "int in_coding" */
struct list_head *list;
+ /* Select ingoing or outgoing coding node */
+ if (in_coding) {
+ lock = &orig_neigh_node->in_coding_list_lock;
+ list = &orig_neigh_node->in_coding_list;
+ } else {
+ lock = &orig_neigh_node->out_coding_list_lock;
+ list = &orig_neigh_node->out_coding_list;
+ }
+
+ spin_lock_bh(lock);
+
/* Check if nc_node is already added */
nc_node = batadv_nc_find_nc_node(orig_node, orig_neigh_node, in_coding);
/* Node found */
if (nc_node)
- return nc_node;
+ goto unlock;
nc_node = kzalloc(sizeof(*nc_node), GFP_ATOMIC);
if (!nc_node)
- return NULL;
+ goto unlock;
/* Initialize nc_node */
INIT_LIST_HEAD(&nc_node->list);
@@ -872,22 +883,14 @@ batadv_nc_get_nc_node(struct batadv_priv *bat_priv,
kref_get(&orig_neigh_node->refcount);
nc_node->orig_node = orig_neigh_node;
- /* Select ingoing or outgoing coding node */
- if (in_coding) {
- lock = &orig_neigh_node->in_coding_list_lock;
- list = &orig_neigh_node->in_coding_list;
- } else {
- lock = &orig_neigh_node->out_coding_list_lock;
- list = &orig_neigh_node->out_coding_list;
- }
-
batadv_dbg(BATADV_DBG_NC, bat_priv, "Adding nc_node %pM -> %pM\n",
nc_node->addr, nc_node->orig_node->orig);
/* Add nc_node to orig_node */
- spin_lock_bh(lock);
kref_get(&nc_node->refcount);
list_add_tail_rcu(&nc_node->list, list);
+
+unlock:
spin_unlock_bh(lock);
return nc_node;
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index 716e5b43acfa..56a981af5c92 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -904,9 +904,6 @@ static void batadv_orig_node_free_rcu(struct rcu_head *rcu)
batadv_frag_purge_orig(orig_node, NULL);
- if (orig_node->bat_priv->algo_ops->orig.free)
- orig_node->bat_priv->algo_ops->orig.free(orig_node);
-
kfree(orig_node->tt_buff);
kfree(orig_node);
}
@@ -1339,7 +1336,11 @@ static bool batadv_purge_orig_node(struct batadv_priv *bat_priv,
return false;
}
-static void _batadv_purge_orig(struct batadv_priv *bat_priv)
+/**
+ * batadv_purge_orig_ref() - Purge all outdated originators
+ * @bat_priv: the bat priv with all the soft interface information
+ */
+void batadv_purge_orig_ref(struct batadv_priv *bat_priv)
{
struct batadv_hashtable *hash = bat_priv->orig_hash;
struct hlist_node *node_tmp;
@@ -1385,21 +1386,12 @@ static void batadv_purge_orig(struct work_struct *work)
delayed_work = to_delayed_work(work);
bat_priv = container_of(delayed_work, struct batadv_priv, orig_work);
- _batadv_purge_orig(bat_priv);
+ batadv_purge_orig_ref(bat_priv);
queue_delayed_work(batadv_event_workqueue,
&bat_priv->orig_work,
msecs_to_jiffies(BATADV_ORIG_WORK_PERIOD));
}
-/**
- * batadv_purge_orig_ref() - Purge all outdated originators
- * @bat_priv: the bat priv with all the soft interface information
- */
-void batadv_purge_orig_ref(struct batadv_priv *bat_priv)
-{
- _batadv_purge_orig(bat_priv);
-}
-
#ifdef CONFIG_BATMAN_ADV_DEBUGFS
/**
@@ -1560,107 +1552,3 @@ int batadv_orig_dump(struct sk_buff *msg, struct netlink_callback *cb)
return ret;
}
-
-/**
- * batadv_orig_hash_add_if() - Add interface to originators in orig_hash
- * @hard_iface: hard interface to add (already slave of the soft interface)
- * @max_if_num: new number of interfaces
- *
- * Return: 0 on success or negative error number in case of failure
- */
-int batadv_orig_hash_add_if(struct batadv_hard_iface *hard_iface,
- unsigned int max_if_num)
-{
- struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
- struct batadv_algo_ops *bao = bat_priv->algo_ops;
- struct batadv_hashtable *hash = bat_priv->orig_hash;
- struct hlist_head *head;
- struct batadv_orig_node *orig_node;
- u32 i;
- int ret;
-
- /* resize all orig nodes because orig_node->bcast_own(_sum) depend on
- * if_num
- */
- for (i = 0; i < hash->size; i++) {
- head = &hash->table[i];
-
- rcu_read_lock();
- hlist_for_each_entry_rcu(orig_node, head, hash_entry) {
- ret = 0;
- if (bao->orig.add_if)
- ret = bao->orig.add_if(orig_node, max_if_num);
- if (ret == -ENOMEM)
- goto err;
- }
- rcu_read_unlock();
- }
-
- return 0;
-
-err:
- rcu_read_unlock();
- return -ENOMEM;
-}
-
-/**
- * batadv_orig_hash_del_if() - Remove interface from originators in orig_hash
- * @hard_iface: hard interface to remove (still slave of the soft interface)
- * @max_if_num: new number of interfaces
- *
- * Return: 0 on success or negative error number in case of failure
- */
-int batadv_orig_hash_del_if(struct batadv_hard_iface *hard_iface,
- unsigned int max_if_num)
-{
- struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
- struct batadv_hashtable *hash = bat_priv->orig_hash;
- struct hlist_head *head;
- struct batadv_hard_iface *hard_iface_tmp;
- struct batadv_orig_node *orig_node;
- struct batadv_algo_ops *bao = bat_priv->algo_ops;
- u32 i;
- int ret;
-
- /* resize all orig nodes because orig_node->bcast_own(_sum) depend on
- * if_num
- */
- for (i = 0; i < hash->size; i++) {
- head = &hash->table[i];
-
- rcu_read_lock();
- hlist_for_each_entry_rcu(orig_node, head, hash_entry) {
- ret = 0;
- if (bao->orig.del_if)
- ret = bao->orig.del_if(orig_node, max_if_num,
- hard_iface->if_num);
- if (ret == -ENOMEM)
- goto err;
- }
- rcu_read_unlock();
- }
-
- /* renumber remaining batman interfaces _inside_ of orig_hash_lock */
- rcu_read_lock();
- list_for_each_entry_rcu(hard_iface_tmp, &batadv_hardif_list, list) {
- if (hard_iface_tmp->if_status == BATADV_IF_NOT_IN_USE)
- continue;
-
- if (hard_iface == hard_iface_tmp)
- continue;
-
- if (hard_iface->soft_iface != hard_iface_tmp->soft_iface)
- continue;
-
- if (hard_iface_tmp->if_num > hard_iface->if_num)
- hard_iface_tmp->if_num--;
- }
- rcu_read_unlock();
-
- hard_iface->if_num = -1;
- return 0;
-
-err:
- rcu_read_unlock();
- return -ENOMEM;
-}
diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h
index 3b3f59b881e1..a8b4c7b667ec 100644
--- a/net/batman-adv/originator.h
+++ b/net/batman-adv/originator.h
@@ -72,10 +72,6 @@ void batadv_orig_ifinfo_put(struct batadv_orig_ifinfo *orig_ifinfo);
int batadv_orig_seq_print_text(struct seq_file *seq, void *offset);
int batadv_orig_dump(struct sk_buff *msg, struct netlink_callback *cb);
int batadv_orig_hardif_seq_print_text(struct seq_file *seq, void *offset);
-int batadv_orig_hash_add_if(struct batadv_hard_iface *hard_iface,
- unsigned int max_if_num);
-int batadv_orig_hash_del_if(struct batadv_hard_iface *hard_iface,
- unsigned int max_if_num);
struct batadv_orig_node_vlan *
batadv_orig_node_vlan_new(struct batadv_orig_node *orig_node,
unsigned short vid);
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index 1485263a348b..5db5a0a4c959 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -574,15 +574,20 @@ int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid)
struct batadv_softif_vlan *vlan;
int err;
+ spin_lock_bh(&bat_priv->softif_vlan_list_lock);
+
vlan = batadv_softif_vlan_get(bat_priv, vid);
if (vlan) {
batadv_softif_vlan_put(vlan);
+ spin_unlock_bh(&bat_priv->softif_vlan_list_lock);
return -EEXIST;
}
vlan = kzalloc(sizeof(*vlan), GFP_ATOMIC);
- if (!vlan)
+ if (!vlan) {
+ spin_unlock_bh(&bat_priv->softif_vlan_list_lock);
return -ENOMEM;
+ }
vlan->bat_priv = bat_priv;
vlan->vid = vid;
@@ -590,17 +595,23 @@ int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid)
atomic_set(&vlan->ap_isolation, 0);
+ kref_get(&vlan->refcount);
+ hlist_add_head_rcu(&vlan->list, &bat_priv->softif_vlan_list);
+ spin_unlock_bh(&bat_priv->softif_vlan_list_lock);
+
+ /* batadv_sysfs_add_vlan cannot be in the spinlock section due to the
+ * sleeping behavior of the sysfs functions and the fs_reclaim lock
+ */
err = batadv_sysfs_add_vlan(bat_priv->soft_iface, vlan);
if (err) {
- kfree(vlan);
+ /* ref for the function */
+ batadv_softif_vlan_put(vlan);
+
+ /* ref for the list */
+ batadv_softif_vlan_put(vlan);
return err;
}
- spin_lock_bh(&bat_priv->softif_vlan_list_lock);
- kref_get(&vlan->refcount);
- hlist_add_head_rcu(&vlan->list, &bat_priv->softif_vlan_list);
- spin_unlock_bh(&bat_priv->softif_vlan_list_lock);
-
/* add a new TT local entry. This one will be marked with the NOPURGE
* flag
*/
@@ -833,7 +844,6 @@ static int batadv_softif_init_late(struct net_device *dev)
atomic_set(&bat_priv->frag_seqno, random_seqno);
bat_priv->primary_if = NULL;
- bat_priv->num_ifaces = 0;
batadv_nc_init_bat_priv(bat_priv);
@@ -1051,6 +1061,7 @@ static void batadv_softif_init_early(struct net_device *dev)
dev->needs_free_netdev = true;
dev->priv_destructor = batadv_softif_free;
dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_NETNS_LOCAL;
+ dev->features |= NETIF_F_LLTX;
dev->priv_flags |= IFF_NO_QUEUE;
/* can't call min_mtu, because the needed variables
diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c
index f2eef43bd2ec..09427fc6494a 100644
--- a/net/batman-adv/sysfs.c
+++ b/net/batman-adv/sysfs.c
@@ -188,7 +188,8 @@ ssize_t batadv_store_##_name(struct kobject *kobj, \
\
return __batadv_store_uint_attr(buff, count, _min, _max, \
_post_func, attr, \
- &bat_priv->_var, net_dev); \
+ &bat_priv->_var, net_dev, \
+ NULL); \
}
#define BATADV_ATTR_SIF_SHOW_UINT(_name, _var) \
@@ -262,7 +263,9 @@ ssize_t batadv_store_##_name(struct kobject *kobj, \
\
length = __batadv_store_uint_attr(buff, count, _min, _max, \
_post_func, attr, \
- &hard_iface->_var, net_dev); \
+ &hard_iface->_var, \
+ hard_iface->soft_iface, \
+ net_dev); \
\
batadv_hardif_put(hard_iface); \
return length; \
@@ -356,10 +359,12 @@ __batadv_store_bool_attr(char *buff, size_t count,
static int batadv_store_uint_attr(const char *buff, size_t count,
struct net_device *net_dev,
+ struct net_device *slave_dev,
const char *attr_name,
unsigned int min, unsigned int max,
atomic_t *attr)
{
+ char ifname[IFNAMSIZ + 3] = "";
unsigned long uint_val;
int ret;
@@ -385,8 +390,11 @@ static int batadv_store_uint_attr(const char *buff, size_t count,
if (atomic_read(attr) == uint_val)
return count;
- batadv_info(net_dev, "%s: Changing from: %i to: %lu\n",
- attr_name, atomic_read(attr), uint_val);
+ if (slave_dev)
+ snprintf(ifname, sizeof(ifname), "%s: ", slave_dev->name);
+
+ batadv_info(net_dev, "%s: %sChanging from: %i to: %lu\n",
+ attr_name, ifname, atomic_read(attr), uint_val);
atomic_set(attr, uint_val);
return count;
@@ -397,12 +405,13 @@ static ssize_t __batadv_store_uint_attr(const char *buff, size_t count,
void (*post_func)(struct net_device *),
const struct attribute *attr,
atomic_t *attr_store,
- struct net_device *net_dev)
+ struct net_device *net_dev,
+ struct net_device *slave_dev)
{
int ret;
- ret = batadv_store_uint_attr(buff, count, net_dev, attr->name, min, max,
- attr_store);
+ ret = batadv_store_uint_attr(buff, count, net_dev, slave_dev,
+ attr->name, min, max, attr_store);
if (post_func && ret)
post_func(net_dev);
@@ -571,7 +580,7 @@ static ssize_t batadv_store_gw_sel_class(struct kobject *kobj,
return __batadv_store_uint_attr(buff, count, 1, BATADV_TQ_MAX_VALUE,
batadv_post_gw_reselect, attr,
&bat_priv->gw.sel_class,
- bat_priv->soft_iface);
+ bat_priv->soft_iface, NULL);
}
static ssize_t batadv_show_gw_bwidth(struct kobject *kobj,
@@ -1090,8 +1099,9 @@ static ssize_t batadv_store_throughput_override(struct kobject *kobj,
if (old_tp_override == tp_override)
goto out;
- batadv_info(net_dev, "%s: Changing from: %u.%u MBit to: %u.%u MBit\n",
- "throughput_override",
+ batadv_info(hard_iface->soft_iface,
+ "%s: %s: Changing from: %u.%u MBit to: %u.%u MBit\n",
+ "throughput_override", net_dev->name,
old_tp_override / 10, old_tp_override % 10,
tp_override / 10, tp_override % 10);
diff --git a/net/batman-adv/trace.c b/net/batman-adv/trace.c
new file mode 100644
index 000000000000..3d57f9981f25
--- /dev/null
+++ b/net/batman-adv/trace.c
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2010-2018 B.A.T.M.A.N. contributors:
+ *
+ * Sven Eckelmann
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/module.h>
+
+#define CREATE_TRACE_POINTS
+#include "trace.h"
diff --git a/net/batman-adv/trace.h b/net/batman-adv/trace.h
new file mode 100644
index 000000000000..3acda26a30ca
--- /dev/null
+++ b/net/batman-adv/trace.h
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2010-2018 B.A.T.M.A.N. contributors:
+ *
+ * Sven Eckelmann
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#if !defined(_NET_BATMAN_ADV_TRACE_H_) || defined(TRACE_HEADER_MULTI_READ)
+#define _NET_BATMAN_ADV_TRACE_H_
+
+#include "main.h"
+
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM batadv
+
+/* provide dummy function when tracing is disabled */
+#if !defined(CONFIG_BATMAN_ADV_TRACING)
+
+#undef TRACE_EVENT
+#define TRACE_EVENT(name, proto, ...) \
+ static inline void trace_ ## name(proto) {}
+
+#endif /* CONFIG_BATMAN_ADV_TRACING */
+
+#define BATADV_MAX_MSG_LEN 256
+
+TRACE_EVENT(batadv_dbg,
+
+ TP_PROTO(struct batadv_priv *bat_priv,
+ struct va_format *vaf),
+
+ TP_ARGS(bat_priv, vaf),
+
+ TP_STRUCT__entry(
+ __string(device, bat_priv->soft_iface->name)
+ __string(driver, KBUILD_MODNAME)
+ __dynamic_array(char, msg, BATADV_MAX_MSG_LEN)
+ ),
+
+ TP_fast_assign(
+ __assign_str(device, bat_priv->soft_iface->name);
+ __assign_str(driver, KBUILD_MODNAME);
+ WARN_ON_ONCE(vsnprintf(__get_dynamic_array(msg),
+ BATADV_MAX_MSG_LEN,
+ vaf->fmt,
+ *vaf->va) >= BATADV_MAX_MSG_LEN);
+ ),
+
+ TP_printk(
+ "%s %s %s",
+ __get_str(driver),
+ __get_str(device),
+ __get_str(msg)
+ )
+);
+
+#endif /* _NET_BATMAN_ADV_TRACE_H_ || TRACE_HEADER_MULTI_READ */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index 12a2b7d21376..d21624c44665 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -1613,6 +1613,8 @@ batadv_tt_global_orig_entry_add(struct batadv_tt_global_entry *tt_global,
{
struct batadv_tt_orig_list_entry *orig_entry;
+ spin_lock_bh(&tt_global->list_lock);
+
orig_entry = batadv_tt_global_orig_entry_find(tt_global, orig_node);
if (orig_entry) {
/* refresh the ttvn: the current value could be a bogus one that
@@ -1635,11 +1637,9 @@ batadv_tt_global_orig_entry_add(struct batadv_tt_global_entry *tt_global,
orig_entry->flags = flags;
kref_init(&orig_entry->refcount);
- spin_lock_bh(&tt_global->list_lock);
kref_get(&orig_entry->refcount);
hlist_add_head_rcu(&orig_entry->list,
&tt_global->orig_list);
- spin_unlock_bh(&tt_global->list_lock);
atomic_inc(&tt_global->orig_list_count);
sync_flags:
@@ -1647,6 +1647,8 @@ sync_flags:
out:
if (orig_entry)
batadv_tt_orig_list_entry_put(orig_entry);
+
+ spin_unlock_bh(&tt_global->list_lock);
}
/**
diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c
index a637458205d1..40e69c9346d2 100644
--- a/net/batman-adv/tvlv.c
+++ b/net/batman-adv/tvlv.c
@@ -529,15 +529,20 @@ void batadv_tvlv_handler_register(struct batadv_priv *bat_priv,
{
struct batadv_tvlv_handler *tvlv_handler;
+ spin_lock_bh(&bat_priv->tvlv.handler_list_lock);
+
tvlv_handler = batadv_tvlv_handler_get(bat_priv, type, version);
if (tvlv_handler) {
+ spin_unlock_bh(&bat_priv->tvlv.handler_list_lock);
batadv_tvlv_handler_put(tvlv_handler);
return;
}
tvlv_handler = kzalloc(sizeof(*tvlv_handler), GFP_ATOMIC);
- if (!tvlv_handler)
+ if (!tvlv_handler) {
+ spin_unlock_bh(&bat_priv->tvlv.handler_list_lock);
return;
+ }
tvlv_handler->ogm_handler = optr;
tvlv_handler->unicast_handler = uptr;
@@ -547,7 +552,6 @@ void batadv_tvlv_handler_register(struct batadv_priv *bat_priv,
kref_init(&tvlv_handler->refcount);
INIT_HLIST_NODE(&tvlv_handler->list);
- spin_lock_bh(&bat_priv->tvlv.handler_list_lock);
kref_get(&tvlv_handler->refcount);
hlist_add_head_rcu(&tvlv_handler->list, &bat_priv->tvlv.handler_list);
spin_unlock_bh(&bat_priv->tvlv.handler_list_lock);
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 360357f83f20..45b5592de816 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -43,12 +43,13 @@ struct seq_file;
#ifdef CONFIG_BATMAN_ADV_DAT
/**
- * batadv_dat_addr_t - it is the type used for all DHT addresses. If it is
- * changed, BATADV_DAT_ADDR_MAX is changed as well.
+ * typedef batadv_dat_addr_t - type used for all DHT addresses
+ *
+ * If it is changed, BATADV_DAT_ADDR_MAX is changed as well.
*
* *Please be careful: batadv_dat_addr_t must be UNSIGNED*
*/
-#define batadv_dat_addr_t u16
+typedef u16 batadv_dat_addr_t;
#endif /* CONFIG_BATMAN_ADV_DAT */
@@ -166,9 +167,6 @@ struct batadv_hard_iface {
/** @list: list node for batadv_hardif_list */
struct list_head list;
- /** @if_num: identificator of the interface */
- unsigned int if_num;
-
/** @if_status: status of the interface for batman-adv */
char if_status;
@@ -232,6 +230,20 @@ struct batadv_hard_iface {
};
/**
+ * struct batadv_orig_ifinfo - B.A.T.M.A.N. IV private orig_ifinfo members
+ */
+struct batadv_orig_ifinfo_bat_iv {
+ /**
+ * @bcast_own: bitfield which counts the number of our OGMs this
+ * orig_node rebroadcasted "back" to us (relative to last_real_seqno)
+ */
+ DECLARE_BITMAP(bcast_own, BATADV_TQ_LOCAL_WINDOW_SIZE);
+
+ /** @bcast_own_sum: sum of bcast_own */
+ u8 bcast_own_sum;
+};
+
+/**
* struct batadv_orig_ifinfo - originator info per outgoing interface
*/
struct batadv_orig_ifinfo {
@@ -256,6 +268,9 @@ struct batadv_orig_ifinfo {
/** @batman_seqno_reset: time when the batman seqno window was reset */
unsigned long batman_seqno_reset;
+ /** @bat_iv: B.A.T.M.A.N. IV private structure */
+ struct batadv_orig_ifinfo_bat_iv bat_iv;
+
/** @refcount: number of contexts the object is used */
struct kref refcount;
@@ -338,19 +353,10 @@ struct batadv_orig_node_vlan {
*/
struct batadv_orig_bat_iv {
/**
- * @bcast_own: set of bitfields (one per hard-interface) where each one
- * counts the number of our OGMs this orig_node rebroadcasted "back" to
- * us (relative to last_real_seqno). Every bitfield is
- * BATADV_TQ_LOCAL_WINDOW_SIZE bits long.
- */
- unsigned long *bcast_own;
-
- /** @bcast_own_sum: sum of bcast_own */
- u8 *bcast_own_sum;
-
- /**
- * @ogm_cnt_lock: lock protecting bcast_own, bcast_own_sum,
- * neigh_node->bat_iv.real_bits & neigh_node->bat_iv.real_packet_count
+ * @ogm_cnt_lock: lock protecting &batadv_orig_ifinfo_bat_iv.bcast_own,
+ * &batadv_orig_ifinfo_bat_iv.bcast_own_sum,
+ * &batadv_neigh_ifinfo_bat_iv.bat_iv.real_bits and
+ * &batadv_neigh_ifinfo_bat_iv.real_packet_count
*/
spinlock_t ogm_cnt_lock;
};
@@ -1596,9 +1602,6 @@ struct batadv_priv {
/** @batman_queue_left: number of remaining OGM packet slots */
atomic_t batman_queue_left;
- /** @num_ifaces: number of interfaces assigned to this mesh interface */
- unsigned int num_ifaces;
-
/** @mesh_obj: kobject for sysfs mesh subdirectory */
struct kobject *mesh_obj;
@@ -2178,28 +2181,6 @@ struct batadv_algo_neigh_ops {
* struct batadv_algo_orig_ops - mesh algorithm callbacks (originator specific)
*/
struct batadv_algo_orig_ops {
- /**
- * @free: free the resources allocated by the routing algorithm for an
- * orig_node object (optional)
- */
- void (*free)(struct batadv_orig_node *orig_node);
-
- /**
- * @add_if: ask the routing algorithm to apply the needed changes to the
- * orig_node due to a new hard-interface being added into the mesh
- * (optional)
- */
- int (*add_if)(struct batadv_orig_node *orig_node,
- unsigned int max_if_num);
-
- /**
- * @del_if: ask the routing algorithm to apply the needed changes to the
- * orig_node due to an hard-interface being removed from the mesh
- * (optional)
- */
- int (*del_if)(struct batadv_orig_node *orig_node,
- unsigned int max_if_num, unsigned int del_if_num);
-
#ifdef CONFIG_BATMAN_ADV_DEBUGFS
/** @print: print the originator table (optional) */
void (*print)(struct batadv_priv *priv, struct seq_file *seq,
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index 3264e1873219..deacc52d7ff1 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -159,7 +159,7 @@ void bt_accept_enqueue(struct sock *parent, struct sock *sk)
BT_DBG("parent %p, sk %p", parent, sk);
sock_hold(sk);
- lock_sock(sk);
+ lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
list_add_tail(&bt_sk(sk)->accept_q, &bt_sk(parent)->accept_q);
bt_sk(sk)->parent = parent;
release_sock(sk);
diff --git a/net/bluetooth/bnep/core.c b/net/bluetooth/bnep/core.c
index 7b3965861013..43c284158f63 100644
--- a/net/bluetooth/bnep/core.c
+++ b/net/bluetooth/bnep/core.c
@@ -489,9 +489,6 @@ static int bnep_session(void *arg)
add_wait_queue(sk_sleep(sk), &wait);
while (1) {
- /* Ensure session->terminate is updated */
- smp_mb__before_atomic();
-
if (atomic_read(&s->terminate))
break;
/* RX */
@@ -512,6 +509,10 @@ static int bnep_session(void *arg)
break;
netif_wake_queue(dev);
+ /*
+ * wait_woken() performs the necessary memory barriers
+ * for us; see the header comment for this primitive.
+ */
wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}
remove_wait_queue(sk_sleep(sk), &wait);
diff --git a/net/bluetooth/cmtp/core.c b/net/bluetooth/cmtp/core.c
index 7f26a5a19ff6..07cfa3249f83 100644
--- a/net/bluetooth/cmtp/core.c
+++ b/net/bluetooth/cmtp/core.c
@@ -288,9 +288,6 @@ static int cmtp_session(void *arg)
add_wait_queue(sk_sleep(sk), &wait);
while (1) {
- /* Ensure session->terminate is updated */
- smp_mb__before_atomic();
-
if (atomic_read(&session->terminate))
break;
if (sk->sk_state != BT_CONNECTED)
@@ -306,6 +303,10 @@ static int cmtp_session(void *arg)
cmtp_process_transmit(session);
+ /*
+ * wait_woken() performs the necessary memory barriers
+ * for us; see the header comment for this primitive.
+ */
wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}
remove_wait_queue(sk_sleep(sk), &wait);
@@ -431,9 +432,10 @@ int cmtp_del_connection(struct cmtp_conndel_req *req)
/* Stop session thread */
atomic_inc(&session->terminate);
- /* Ensure session->terminate is updated */
- smp_mb__after_atomic();
-
+ /*
+ * See the comment preceding the call to wait_woken()
+ * in cmtp_session().
+ */
wake_up_interruptible(sk_sleep(session->sock->sk));
} else
err = -ENOENT;
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index 45ff5dc124cc..bd4978ce8c45 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -748,11 +748,30 @@ static bool conn_use_rpa(struct hci_conn *conn)
return hci_dev_test_flag(hdev, HCI_PRIVACY);
}
+static void set_ext_conn_params(struct hci_conn *conn,
+ struct hci_cp_le_ext_conn_param *p)
+{
+ struct hci_dev *hdev = conn->hdev;
+
+ memset(p, 0, sizeof(*p));
+
+ /* Set window to be the same value as the interval to
+ * enable continuous scanning.
+ */
+ p->scan_interval = cpu_to_le16(hdev->le_scan_interval);
+ p->scan_window = p->scan_interval;
+ p->conn_interval_min = cpu_to_le16(conn->le_conn_min_interval);
+ p->conn_interval_max = cpu_to_le16(conn->le_conn_max_interval);
+ p->conn_latency = cpu_to_le16(conn->le_conn_latency);
+ p->supervision_timeout = cpu_to_le16(conn->le_supv_timeout);
+ p->min_ce_len = cpu_to_le16(0x0000);
+ p->max_ce_len = cpu_to_le16(0x0000);
+}
+
static void hci_req_add_le_create_conn(struct hci_request *req,
struct hci_conn *conn,
bdaddr_t *direct_rpa)
{
- struct hci_cp_le_create_conn cp;
struct hci_dev *hdev = conn->hdev;
u8 own_addr_type;
@@ -775,25 +794,71 @@ static void hci_req_add_le_create_conn(struct hci_request *req,
return;
}
- memset(&cp, 0, sizeof(cp));
+ if (use_ext_conn(hdev)) {
+ struct hci_cp_le_ext_create_conn *cp;
+ struct hci_cp_le_ext_conn_param *p;
+ u8 data[sizeof(*cp) + sizeof(*p) * 3];
+ u32 plen;
- /* Set window to be the same value as the interval to enable
- * continuous scanning.
- */
- cp.scan_interval = cpu_to_le16(hdev->le_scan_interval);
- cp.scan_window = cp.scan_interval;
+ cp = (void *) data;
+ p = (void *) cp->data;
+
+ memset(cp, 0, sizeof(*cp));
- bacpy(&cp.peer_addr, &conn->dst);
- cp.peer_addr_type = conn->dst_type;
- cp.own_address_type = own_addr_type;
- cp.conn_interval_min = cpu_to_le16(conn->le_conn_min_interval);
- cp.conn_interval_max = cpu_to_le16(conn->le_conn_max_interval);
- cp.conn_latency = cpu_to_le16(conn->le_conn_latency);
- cp.supervision_timeout = cpu_to_le16(conn->le_supv_timeout);
- cp.min_ce_len = cpu_to_le16(0x0000);
- cp.max_ce_len = cpu_to_le16(0x0000);
+ bacpy(&cp->peer_addr, &conn->dst);
+ cp->peer_addr_type = conn->dst_type;
+ cp->own_addr_type = own_addr_type;
- hci_req_add(req, HCI_OP_LE_CREATE_CONN, sizeof(cp), &cp);
+ plen = sizeof(*cp);
+
+ if (scan_1m(hdev)) {
+ cp->phys |= LE_SCAN_PHY_1M;
+ set_ext_conn_params(conn, p);
+
+ p++;
+ plen += sizeof(*p);
+ }
+
+ if (scan_2m(hdev)) {
+ cp->phys |= LE_SCAN_PHY_2M;
+ set_ext_conn_params(conn, p);
+
+ p++;
+ plen += sizeof(*p);
+ }
+
+ if (scan_coded(hdev)) {
+ cp->phys |= LE_SCAN_PHY_CODED;
+ set_ext_conn_params(conn, p);
+
+ plen += sizeof(*p);
+ }
+
+ hci_req_add(req, HCI_OP_LE_EXT_CREATE_CONN, plen, data);
+
+ } else {
+ struct hci_cp_le_create_conn cp;
+
+ memset(&cp, 0, sizeof(cp));
+
+ /* Set window to be the same value as the interval to enable
+ * continuous scanning.
+ */
+ cp.scan_interval = cpu_to_le16(hdev->le_scan_interval);
+ cp.scan_window = cp.scan_interval;
+
+ bacpy(&cp.peer_addr, &conn->dst);
+ cp.peer_addr_type = conn->dst_type;
+ cp.own_address_type = own_addr_type;
+ cp.conn_interval_min = cpu_to_le16(conn->le_conn_min_interval);
+ cp.conn_interval_max = cpu_to_le16(conn->le_conn_max_interval);
+ cp.conn_latency = cpu_to_le16(conn->le_conn_latency);
+ cp.supervision_timeout = cpu_to_le16(conn->le_supv_timeout);
+ cp.min_ce_len = cpu_to_le16(0x0000);
+ cp.max_ce_len = cpu_to_le16(0x0000);
+
+ hci_req_add(req, HCI_OP_LE_CREATE_CONN, sizeof(cp), &cp);
+ }
conn->state = BT_CONNECT;
clear_bit(HCI_CONN_SCANNING, &conn->flags);
@@ -803,35 +868,81 @@ static void hci_req_directed_advertising(struct hci_request *req,
struct hci_conn *conn)
{
struct hci_dev *hdev = req->hdev;
- struct hci_cp_le_set_adv_param cp;
u8 own_addr_type;
u8 enable;
- /* Clear the HCI_LE_ADV bit temporarily so that the
- * hci_update_random_address knows that it's safe to go ahead
- * and write a new random address. The flag will be set back on
- * as soon as the SET_ADV_ENABLE HCI command completes.
- */
- hci_dev_clear_flag(hdev, HCI_LE_ADV);
+ if (ext_adv_capable(hdev)) {
+ struct hci_cp_le_set_ext_adv_params cp;
+ bdaddr_t random_addr;
- /* Set require_privacy to false so that the remote device has a
- * chance of identifying us.
- */
- if (hci_update_random_address(req, false, conn_use_rpa(conn),
- &own_addr_type) < 0)
- return;
+ /* Set require_privacy to false so that the remote device has a
+ * chance of identifying us.
+ */
+ if (hci_get_random_address(hdev, false, conn_use_rpa(conn), NULL,
+ &own_addr_type, &random_addr) < 0)
+ return;
- memset(&cp, 0, sizeof(cp));
- cp.type = LE_ADV_DIRECT_IND;
- cp.own_address_type = own_addr_type;
- cp.direct_addr_type = conn->dst_type;
- bacpy(&cp.direct_addr, &conn->dst);
- cp.channel_map = hdev->le_adv_channel_map;
+ memset(&cp, 0, sizeof(cp));
- hci_req_add(req, HCI_OP_LE_SET_ADV_PARAM, sizeof(cp), &cp);
+ cp.evt_properties = cpu_to_le16(LE_LEGACY_ADV_DIRECT_IND);
+ cp.own_addr_type = own_addr_type;
+ cp.channel_map = hdev->le_adv_channel_map;
+ cp.tx_power = HCI_TX_POWER_INVALID;
+ cp.primary_phy = HCI_ADV_PHY_1M;
+ cp.secondary_phy = HCI_ADV_PHY_1M;
+ cp.handle = 0; /* Use instance 0 for directed adv */
+ cp.own_addr_type = own_addr_type;
+ cp.peer_addr_type = conn->dst_type;
+ bacpy(&cp.peer_addr, &conn->dst);
+
+ hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_PARAMS, sizeof(cp), &cp);
+
+ if (own_addr_type == ADDR_LE_DEV_RANDOM &&
+ bacmp(&random_addr, BDADDR_ANY) &&
+ bacmp(&random_addr, &hdev->random_addr)) {
+ struct hci_cp_le_set_adv_set_rand_addr cp;
+
+ memset(&cp, 0, sizeof(cp));
+
+ cp.handle = 0;
+ bacpy(&cp.bdaddr, &random_addr);
+
+ hci_req_add(req,
+ HCI_OP_LE_SET_ADV_SET_RAND_ADDR,
+ sizeof(cp), &cp);
+ }
- enable = 0x01;
- hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable);
+ __hci_req_enable_ext_advertising(req);
+ } else {
+ struct hci_cp_le_set_adv_param cp;
+
+ /* Clear the HCI_LE_ADV bit temporarily so that the
+ * hci_update_random_address knows that it's safe to go ahead
+ * and write a new random address. The flag will be set back on
+ * as soon as the SET_ADV_ENABLE HCI command completes.
+ */
+ hci_dev_clear_flag(hdev, HCI_LE_ADV);
+
+ /* Set require_privacy to false so that the remote device has a
+ * chance of identifying us.
+ */
+ if (hci_update_random_address(req, false, conn_use_rpa(conn),
+ &own_addr_type) < 0)
+ return;
+
+ memset(&cp, 0, sizeof(cp));
+ cp.type = LE_ADV_DIRECT_IND;
+ cp.own_address_type = own_addr_type;
+ cp.direct_addr_type = conn->dst_type;
+ bacpy(&cp.direct_addr, &conn->dst);
+ cp.channel_map = hdev->le_adv_channel_map;
+
+ hci_req_add(req, HCI_OP_LE_SET_ADV_PARAM, sizeof(cp), &cp);
+
+ enable = 0x01;
+ hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable),
+ &enable);
+ }
conn->state = BT_CONNECT;
}
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index ee8ef1228263..7352fe85674b 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -695,11 +695,42 @@ static int hci_init3_req(struct hci_request *req, unsigned long opt)
if (hdev->commands[35] & (0x20 | 0x40))
events[1] |= 0x08; /* LE PHY Update Complete */
+ /* If the controller supports LE Set Extended Scan Parameters
+ * and LE Set Extended Scan Enable commands, enable the
+ * corresponding event.
+ */
+ if (use_ext_scan(hdev))
+ events[1] |= 0x10; /* LE Extended Advertising
+ * Report
+ */
+
+ /* If the controller supports the LE Extended Create Connection
+ * command, enable the corresponding event.
+ */
+ if (use_ext_conn(hdev))
+ events[1] |= 0x02; /* LE Enhanced Connection
+ * Complete
+ */
+
+ /* If the controller supports the LE Extended Advertising
+ * command, enable the corresponding event.
+ */
+ if (ext_adv_capable(hdev))
+ events[2] |= 0x02; /* LE Advertising Set
+ * Terminated
+ */
+
hci_req_add(req, HCI_OP_LE_SET_EVENT_MASK, sizeof(events),
events);
- if (hdev->commands[25] & 0x40) {
- /* Read LE Advertising Channel TX Power */
+ /* Read LE Advertising Channel TX Power */
+ if ((hdev->commands[25] & 0x40) && !ext_adv_capable(hdev)) {
+ /* HCI TS spec forbids mixing of legacy and extended
+ * advertising commands wherein READ_ADV_TX_POWER is
+ * also included. So do not call it if extended adv
+ * is supported otherwise controller will return
+ * COMMAND_DISALLOWED for extended commands.
+ */
hci_req_add(req, HCI_OP_LE_READ_ADV_TX_POWER, 0, NULL);
}
@@ -714,6 +745,17 @@ static int hci_init3_req(struct hci_request *req, unsigned long opt)
hci_req_add(req, HCI_OP_LE_CLEAR_WHITE_LIST, 0, NULL);
}
+ if (hdev->commands[34] & 0x40) {
+ /* Read LE Resolving List Size */
+ hci_req_add(req, HCI_OP_LE_READ_RESOLV_LIST_SIZE,
+ 0, NULL);
+ }
+
+ if (hdev->commands[34] & 0x20) {
+ /* Clear LE Resolving List */
+ hci_req_add(req, HCI_OP_LE_CLEAR_RESOLV_LIST, 0, NULL);
+ }
+
if (hdev->le_features[0] & HCI_LE_DATA_LEN_EXT) {
/* Read LE Maximum Data Length */
hci_req_add(req, HCI_OP_LE_READ_MAX_DATA_LEN, 0, NULL);
@@ -722,6 +764,12 @@ static int hci_init3_req(struct hci_request *req, unsigned long opt)
hci_req_add(req, HCI_OP_LE_READ_DEF_DATA_LEN, 0, NULL);
}
+ if (ext_adv_capable(hdev)) {
+ /* Read LE Number of Supported Advertising Sets */
+ hci_req_add(req, HCI_OP_LE_READ_NUM_SUPPORTED_ADV_SETS,
+ 0, NULL);
+ }
+
hci_set_le_support(req);
}
@@ -802,10 +850,9 @@ static int hci_init4_req(struct hci_request *req, unsigned long opt)
if (hdev->commands[35] & 0x20) {
struct hci_cp_le_set_default_phy cp;
- /* No transmitter PHY or receiver PHY preferences */
- cp.all_phys = 0x03;
- cp.tx_phys = 0;
- cp.rx_phys = 0;
+ cp.all_phys = 0x00;
+ cp.tx_phys = hdev->le_tx_def_phys;
+ cp.rx_phys = hdev->le_rx_def_phys;
hci_req_add(req, HCI_OP_LE_SET_DEFAULT_PHY, sizeof(cp), &cp);
}
@@ -1368,7 +1415,8 @@ static int hci_dev_do_open(struct hci_dev *hdev)
atomic_set(&hdev->cmd_cnt, 1);
set_bit(HCI_INIT, &hdev->flags);
- if (hci_dev_test_flag(hdev, HCI_SETUP)) {
+ if (hci_dev_test_flag(hdev, HCI_SETUP) ||
+ test_bit(HCI_QUIRK_NON_PERSISTENT_SETUP, &hdev->quirks)) {
hci_sock_dev_event(hdev, HCI_DEV_SETUP);
if (hdev->setup)
@@ -1432,6 +1480,7 @@ static int hci_dev_do_open(struct hci_dev *hdev)
if (!ret) {
hci_dev_hold(hdev);
hci_dev_set_flag(hdev, HCI_RPA_EXPIRED);
+ hci_adv_instances_set_rpa_expired(hdev, true);
set_bit(HCI_UP, &hdev->flags);
hci_sock_dev_event(hdev, HCI_DEV_UP);
hci_leds_update_powered(hdev, true);
@@ -1587,9 +1636,15 @@ int hci_dev_do_close(struct hci_dev *hdev)
if (hci_dev_test_and_clear_flag(hdev, HCI_SERVICE_CACHE))
cancel_delayed_work(&hdev->service_cache);
- if (hci_dev_test_flag(hdev, HCI_MGMT))
+ if (hci_dev_test_flag(hdev, HCI_MGMT)) {
+ struct adv_info *adv_instance;
+
cancel_delayed_work_sync(&hdev->rpa_expired);
+ list_for_each_entry(adv_instance, &hdev->adv_instances, list)
+ cancel_delayed_work_sync(&adv_instance->rpa_expired_cb);
+ }
+
/* Avoid potential lockdep warnings from the *_flush() calls by
* ensuring the workqueue is empty up front.
*/
@@ -1897,7 +1952,11 @@ int hci_dev_cmd(unsigned int cmd, void __user *arg)
break;
case HCISETPTYPE:
+ if (hdev->pkt_type == (__u16) dr.dev_opt)
+ break;
+
hdev->pkt_type = (__u16) dr.dev_opt;
+ mgmt_phy_configuration_changed(hdev, NULL);
break;
case HCISETACLMTU:
@@ -2661,6 +2720,8 @@ int hci_remove_adv_instance(struct hci_dev *hdev, u8 instance)
hdev->cur_adv_instance = 0x00;
}
+ cancel_delayed_work_sync(&adv_instance->rpa_expired_cb);
+
list_del(&adv_instance->list);
kfree(adv_instance);
@@ -2669,6 +2730,14 @@ int hci_remove_adv_instance(struct hci_dev *hdev, u8 instance)
return 0;
}
+void hci_adv_instances_set_rpa_expired(struct hci_dev *hdev, bool rpa_expired)
+{
+ struct adv_info *adv_instance, *n;
+
+ list_for_each_entry_safe(adv_instance, n, &hdev->adv_instances, list)
+ adv_instance->rpa_expired = rpa_expired;
+}
+
/* This function requires the caller holds hdev->lock */
void hci_adv_instances_clear(struct hci_dev *hdev)
{
@@ -2680,6 +2749,7 @@ void hci_adv_instances_clear(struct hci_dev *hdev)
}
list_for_each_entry_safe(adv_instance, n, &hdev->adv_instances, list) {
+ cancel_delayed_work_sync(&adv_instance->rpa_expired_cb);
list_del(&adv_instance->list);
kfree(adv_instance);
}
@@ -2688,6 +2758,16 @@ void hci_adv_instances_clear(struct hci_dev *hdev)
hdev->cur_adv_instance = 0x00;
}
+static void adv_instance_rpa_expired(struct work_struct *work)
+{
+ struct adv_info *adv_instance = container_of(work, struct adv_info,
+ rpa_expired_cb.work);
+
+ BT_DBG("");
+
+ adv_instance->rpa_expired = true;
+}
+
/* This function requires the caller holds hdev->lock */
int hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags,
u16 adv_data_len, u8 *adv_data,
@@ -2736,6 +2816,11 @@ int hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags,
else
adv_instance->duration = duration;
+ adv_instance->tx_power = HCI_TX_POWER_INVALID;
+
+ INIT_DELAYED_WORK(&adv_instance->rpa_expired_cb,
+ adv_instance_rpa_expired);
+
BT_DBG("%s for %dMR", hdev->name, instance);
return 0;
@@ -2754,6 +2839,20 @@ struct bdaddr_list *hci_bdaddr_list_lookup(struct list_head *bdaddr_list,
return NULL;
}
+struct bdaddr_list_with_irk *hci_bdaddr_list_lookup_with_irk(
+ struct list_head *bdaddr_list, bdaddr_t *bdaddr,
+ u8 type)
+{
+ struct bdaddr_list_with_irk *b;
+
+ list_for_each_entry(b, bdaddr_list, list) {
+ if (!bacmp(&b->bdaddr, bdaddr) && b->bdaddr_type == type)
+ return b;
+ }
+
+ return NULL;
+}
+
void hci_bdaddr_list_clear(struct list_head *bdaddr_list)
{
struct bdaddr_list *b, *n;
@@ -2786,6 +2885,35 @@ int hci_bdaddr_list_add(struct list_head *list, bdaddr_t *bdaddr, u8 type)
return 0;
}
+int hci_bdaddr_list_add_with_irk(struct list_head *list, bdaddr_t *bdaddr,
+ u8 type, u8 *peer_irk, u8 *local_irk)
+{
+ struct bdaddr_list_with_irk *entry;
+
+ if (!bacmp(bdaddr, BDADDR_ANY))
+ return -EBADF;
+
+ if (hci_bdaddr_list_lookup(list, bdaddr, type))
+ return -EEXIST;
+
+ entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry)
+ return -ENOMEM;
+
+ bacpy(&entry->bdaddr, bdaddr);
+ entry->bdaddr_type = type;
+
+ if (peer_irk)
+ memcpy(entry->peer_irk, peer_irk, 16);
+
+ if (local_irk)
+ memcpy(entry->local_irk, local_irk, 16);
+
+ list_add(&entry->list, list);
+
+ return 0;
+}
+
int hci_bdaddr_list_del(struct list_head *list, bdaddr_t *bdaddr, u8 type)
{
struct bdaddr_list *entry;
@@ -2805,6 +2933,26 @@ int hci_bdaddr_list_del(struct list_head *list, bdaddr_t *bdaddr, u8 type)
return 0;
}
+int hci_bdaddr_list_del_with_irk(struct list_head *list, bdaddr_t *bdaddr,
+ u8 type)
+{
+ struct bdaddr_list_with_irk *entry;
+
+ if (!bacmp(bdaddr, BDADDR_ANY)) {
+ hci_bdaddr_list_clear(list);
+ return 0;
+ }
+
+ entry = hci_bdaddr_list_lookup_with_irk(list, bdaddr, type);
+ if (!entry)
+ return -ENOENT;
+
+ list_del(&entry->list);
+ kfree(entry);
+
+ return 0;
+}
+
/* This function requires the caller holds hdev->lock */
struct hci_conn_params *hci_conn_params_lookup(struct hci_dev *hdev,
bdaddr_t *addr, u8 addr_type)
@@ -2999,6 +3147,10 @@ struct hci_dev *hci_alloc_dev(void)
hdev->le_max_tx_time = 0x0148;
hdev->le_max_rx_len = 0x001b;
hdev->le_max_rx_time = 0x0148;
+ hdev->le_max_key_size = SMP_MAX_ENC_KEY_SIZE;
+ hdev->le_min_key_size = SMP_MIN_ENC_KEY_SIZE;
+ hdev->le_tx_def_phys = HCI_LE_SET_PHY_1M;
+ hdev->le_rx_def_phys = HCI_LE_SET_PHY_1M;
hdev->rpa_timeout = HCI_DEFAULT_RPA_TIMEOUT;
hdev->discov_interleaved_timeout = DISCOV_INTERLEAVED_TIMEOUT;
@@ -3017,6 +3169,7 @@ struct hci_dev *hci_alloc_dev(void)
INIT_LIST_HEAD(&hdev->identity_resolving_keys);
INIT_LIST_HEAD(&hdev->remote_oob_data);
INIT_LIST_HEAD(&hdev->le_white_list);
+ INIT_LIST_HEAD(&hdev->le_resolv_list);
INIT_LIST_HEAD(&hdev->le_conn_params);
INIT_LIST_HEAD(&hdev->pend_le_conns);
INIT_LIST_HEAD(&hdev->pend_le_reports);
@@ -3218,6 +3371,7 @@ void hci_unregister_dev(struct hci_dev *hdev)
hci_remote_oob_data_clear(hdev);
hci_adv_instances_clear(hdev);
hci_bdaddr_list_clear(&hdev->le_white_list);
+ hci_bdaddr_list_clear(&hdev->le_resolv_list);
hci_conn_params_clear_all(hdev);
hci_discovery_filter_clear(hdev);
hci_dev_unlock(hdev);
diff --git a/net/bluetooth/hci_debugfs.c b/net/bluetooth/hci_debugfs.c
index 0d8ab5b3c177..51f5b1efc3a5 100644
--- a/net/bluetooth/hci_debugfs.c
+++ b/net/bluetooth/hci_debugfs.c
@@ -694,6 +694,21 @@ static int white_list_show(struct seq_file *f, void *ptr)
DEFINE_SHOW_ATTRIBUTE(white_list);
+static int resolv_list_show(struct seq_file *f, void *ptr)
+{
+ struct hci_dev *hdev = f->private;
+ struct bdaddr_list *b;
+
+ hci_dev_lock(hdev);
+ list_for_each_entry(b, &hdev->le_resolv_list, list)
+ seq_printf(f, "%pMR (type %u)\n", &b->bdaddr, b->bdaddr_type);
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(resolv_list);
+
static int identity_resolving_keys_show(struct seq_file *f, void *ptr)
{
struct hci_dev *hdev = f->private;
@@ -955,6 +970,10 @@ void hci_debugfs_create_le(struct hci_dev *hdev)
&hdev->le_white_list_size);
debugfs_create_file("white_list", 0444, hdev->debugfs, hdev,
&white_list_fops);
+ debugfs_create_u8("resolv_list_size", 0444, hdev->debugfs,
+ &hdev->le_resolv_list_size);
+ debugfs_create_file("resolv_list", 0444, hdev->debugfs, hdev,
+ &resolv_list_fops);
debugfs_create_file("identity_resolving_keys", 0400, hdev->debugfs,
hdev, &identity_resolving_keys_fops);
debugfs_create_file("long_term_keys", 0400, hdev->debugfs, hdev,
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 235b5aaab23d..ef9928d7b4fb 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -221,6 +221,7 @@ static void hci_cc_reset(struct hci_dev *hdev, struct sk_buff *skb)
hdev->ssp_debug_mode = 0;
hci_bdaddr_list_clear(&hdev->le_white_list);
+ hci_bdaddr_list_clear(&hdev->le_resolv_list);
}
static void hci_cc_read_stored_link_key(struct hci_dev *hdev,
@@ -1041,6 +1042,57 @@ static void hci_cc_le_set_random_addr(struct hci_dev *hdev, struct sk_buff *skb)
hci_dev_unlock(hdev);
}
+static void hci_cc_le_set_default_phy(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ __u8 status = *((__u8 *) skb->data);
+ struct hci_cp_le_set_default_phy *cp;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ if (status)
+ return;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_DEFAULT_PHY);
+ if (!cp)
+ return;
+
+ hci_dev_lock(hdev);
+
+ hdev->le_tx_def_phys = cp->tx_phys;
+ hdev->le_rx_def_phys = cp->rx_phys;
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_cc_le_set_adv_set_random_addr(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ __u8 status = *((__u8 *) skb->data);
+ struct hci_cp_le_set_adv_set_rand_addr *cp;
+ struct adv_info *adv_instance;
+
+ if (status)
+ return;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_ADV_SET_RAND_ADDR);
+ if (!cp)
+ return;
+
+ hci_dev_lock(hdev);
+
+ if (!hdev->cur_adv_instance) {
+ /* Store in hdev for instance 0 (Set adv and Directed advs) */
+ bacpy(&hdev->random_addr, &cp->bdaddr);
+ } else {
+ adv_instance = hci_find_adv_instance(hdev,
+ hdev->cur_adv_instance);
+ if (adv_instance)
+ bacpy(&adv_instance->random_addr, &cp->bdaddr);
+ }
+
+ hci_dev_unlock(hdev);
+}
+
static void hci_cc_le_set_adv_enable(struct hci_dev *hdev, struct sk_buff *skb)
{
__u8 *sent, status = *((__u8 *) skb->data);
@@ -1076,6 +1128,40 @@ static void hci_cc_le_set_adv_enable(struct hci_dev *hdev, struct sk_buff *skb)
hci_dev_unlock(hdev);
}
+static void hci_cc_le_set_ext_adv_enable(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_cp_le_set_ext_adv_enable *cp;
+ __u8 status = *((__u8 *) skb->data);
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ if (status)
+ return;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_EXT_ADV_ENABLE);
+ if (!cp)
+ return;
+
+ hci_dev_lock(hdev);
+
+ if (cp->enable) {
+ struct hci_conn *conn;
+
+ hci_dev_set_flag(hdev, HCI_LE_ADV);
+
+ conn = hci_lookup_le_connect(hdev);
+ if (conn)
+ queue_delayed_work(hdev->workqueue,
+ &conn->le_conn_timeout,
+ conn->conn_timeout);
+ } else {
+ hci_dev_clear_flag(hdev, HCI_LE_ADV);
+ }
+
+ hci_dev_unlock(hdev);
+}
+
static void hci_cc_le_set_scan_param(struct hci_dev *hdev, struct sk_buff *skb)
{
struct hci_cp_le_set_scan_param *cp;
@@ -1097,6 +1183,31 @@ static void hci_cc_le_set_scan_param(struct hci_dev *hdev, struct sk_buff *skb)
hci_dev_unlock(hdev);
}
+static void hci_cc_le_set_ext_scan_param(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_cp_le_set_ext_scan_params *cp;
+ __u8 status = *((__u8 *) skb->data);
+ struct hci_cp_le_scan_phy_params *phy_param;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ if (status)
+ return;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_EXT_SCAN_PARAMS);
+ if (!cp)
+ return;
+
+ phy_param = (void *)cp->data;
+
+ hci_dev_lock(hdev);
+
+ hdev->le_scan_type = phy_param->type;
+
+ hci_dev_unlock(hdev);
+}
+
static bool has_pending_adv_report(struct hci_dev *hdev)
{
struct discovery_state *d = &hdev->discovery;
@@ -1126,24 +1237,11 @@ static void store_pending_adv_report(struct hci_dev *hdev, bdaddr_t *bdaddr,
d->last_adv_data_len = len;
}
-static void hci_cc_le_set_scan_enable(struct hci_dev *hdev,
- struct sk_buff *skb)
+static void le_set_scan_enable_complete(struct hci_dev *hdev, u8 enable)
{
- struct hci_cp_le_set_scan_enable *cp;
- __u8 status = *((__u8 *) skb->data);
-
- BT_DBG("%s status 0x%2.2x", hdev->name, status);
-
- if (status)
- return;
-
- cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_SCAN_ENABLE);
- if (!cp)
- return;
-
hci_dev_lock(hdev);
- switch (cp->enable) {
+ switch (enable) {
case LE_SCAN_ENABLE:
hci_dev_set_flag(hdev, HCI_LE_SCAN);
if (hdev->le_scan_type == LE_SCAN_ACTIVE)
@@ -1189,13 +1287,63 @@ static void hci_cc_le_set_scan_enable(struct hci_dev *hdev,
default:
bt_dev_err(hdev, "use of reserved LE_Scan_Enable param %d",
- cp->enable);
+ enable);
break;
}
hci_dev_unlock(hdev);
}
+static void hci_cc_le_set_scan_enable(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_cp_le_set_scan_enable *cp;
+ __u8 status = *((__u8 *) skb->data);
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ if (status)
+ return;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_SCAN_ENABLE);
+ if (!cp)
+ return;
+
+ le_set_scan_enable_complete(hdev, cp->enable);
+}
+
+static void hci_cc_le_set_ext_scan_enable(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_cp_le_set_ext_scan_enable *cp;
+ __u8 status = *((__u8 *) skb->data);
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ if (status)
+ return;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_EXT_SCAN_ENABLE);
+ if (!cp)
+ return;
+
+ le_set_scan_enable_complete(hdev, cp->enable);
+}
+
+static void hci_cc_le_read_num_adv_sets(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_rp_le_read_num_supported_adv_sets *rp = (void *) skb->data;
+
+ BT_DBG("%s status 0x%2.2x No of Adv sets %u", hdev->name, rp->status,
+ rp->num_of_sets);
+
+ if (rp->status)
+ return;
+
+ hdev->le_num_of_adv_sets = rp->num_of_sets;
+}
+
static void hci_cc_le_read_white_list_size(struct hci_dev *hdev,
struct sk_buff *skb)
{
@@ -1306,6 +1454,95 @@ static void hci_cc_le_write_def_data_len(struct hci_dev *hdev,
hdev->le_def_tx_time = le16_to_cpu(sent->tx_time);
}
+static void hci_cc_le_add_to_resolv_list(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_cp_le_add_to_resolv_list *sent;
+ __u8 status = *((__u8 *) skb->data);
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ if (status)
+ return;
+
+ sent = hci_sent_cmd_data(hdev, HCI_OP_LE_ADD_TO_RESOLV_LIST);
+ if (!sent)
+ return;
+
+ hci_bdaddr_list_add_with_irk(&hdev->le_resolv_list, &sent->bdaddr,
+ sent->bdaddr_type, sent->peer_irk,
+ sent->local_irk);
+}
+
+static void hci_cc_le_del_from_resolv_list(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_cp_le_del_from_resolv_list *sent;
+ __u8 status = *((__u8 *) skb->data);
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ if (status)
+ return;
+
+ sent = hci_sent_cmd_data(hdev, HCI_OP_LE_DEL_FROM_RESOLV_LIST);
+ if (!sent)
+ return;
+
+ hci_bdaddr_list_del_with_irk(&hdev->le_resolv_list, &sent->bdaddr,
+ sent->bdaddr_type);
+}
+
+static void hci_cc_le_clear_resolv_list(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ __u8 status = *((__u8 *) skb->data);
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ if (status)
+ return;
+
+ hci_bdaddr_list_clear(&hdev->le_resolv_list);
+}
+
+static void hci_cc_le_read_resolv_list_size(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_rp_le_read_resolv_list_size *rp = (void *) skb->data;
+
+ BT_DBG("%s status 0x%2.2x size %u", hdev->name, rp->status, rp->size);
+
+ if (rp->status)
+ return;
+
+ hdev->le_resolv_list_size = rp->size;
+}
+
+static void hci_cc_le_set_addr_resolution_enable(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ __u8 *sent, status = *((__u8 *) skb->data);
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ if (status)
+ return;
+
+ sent = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_ADDR_RESOLV_ENABLE);
+ if (!sent)
+ return;
+
+ hci_dev_lock(hdev);
+
+ if (*sent)
+ hci_dev_set_flag(hdev, HCI_LL_RPA_RESOLUTION);
+ else
+ hci_dev_clear_flag(hdev, HCI_LL_RPA_RESOLUTION);
+
+ hci_dev_unlock(hdev);
+}
+
static void hci_cc_le_read_max_data_len(struct hci_dev *hdev,
struct sk_buff *skb)
{
@@ -1375,6 +1612,37 @@ static void hci_cc_set_adv_param(struct hci_dev *hdev, struct sk_buff *skb)
hci_dev_unlock(hdev);
}
+static void hci_cc_set_ext_adv_param(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_rp_le_set_ext_adv_params *rp = (void *) skb->data;
+ struct hci_cp_le_set_ext_adv_params *cp;
+ struct adv_info *adv_instance;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+
+ if (rp->status)
+ return;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_EXT_ADV_PARAMS);
+ if (!cp)
+ return;
+
+ hci_dev_lock(hdev);
+ hdev->adv_addr_type = cp->own_addr_type;
+ if (!hdev->cur_adv_instance) {
+ /* Store in hdev for instance 0 */
+ hdev->adv_tx_power = rp->tx_power;
+ } else {
+ adv_instance = hci_find_adv_instance(hdev,
+ hdev->cur_adv_instance);
+ if (adv_instance)
+ adv_instance->tx_power = rp->tx_power;
+ }
+ /* Update adv data as tx power is known now */
+ hci_req_update_adv_data(hdev, hdev->cur_adv_instance);
+ hci_dev_unlock(hdev);
+}
+
static void hci_cc_read_rssi(struct hci_dev *hdev, struct sk_buff *skb)
{
struct hci_rp_read_rssi *rp = (void *) skb->data;
@@ -1896,10 +2164,44 @@ static void hci_cs_disconnect(struct hci_dev *hdev, u8 status)
hci_dev_unlock(hdev);
}
+static void cs_le_create_conn(struct hci_dev *hdev, bdaddr_t *peer_addr,
+ u8 peer_addr_type, u8 own_address_type,
+ u8 filter_policy)
+{
+ struct hci_conn *conn;
+
+ conn = hci_conn_hash_lookup_le(hdev, peer_addr,
+ peer_addr_type);
+ if (!conn)
+ return;
+
+ /* Store the initiator and responder address information which
+ * is needed for SMP. These values will not change during the
+ * lifetime of the connection.
+ */
+ conn->init_addr_type = own_address_type;
+ if (own_address_type == ADDR_LE_DEV_RANDOM)
+ bacpy(&conn->init_addr, &hdev->random_addr);
+ else
+ bacpy(&conn->init_addr, &hdev->bdaddr);
+
+ conn->resp_addr_type = peer_addr_type;
+ bacpy(&conn->resp_addr, peer_addr);
+
+ /* We don't want the connection attempt to stick around
+ * indefinitely since LE doesn't have a page timeout concept
+ * like BR/EDR. Set a timer for any connection that doesn't use
+ * the white list for connecting.
+ */
+ if (filter_policy == HCI_LE_USE_PEER_ADDR)
+ queue_delayed_work(conn->hdev->workqueue,
+ &conn->le_conn_timeout,
+ conn->conn_timeout);
+}
+
static void hci_cs_le_create_conn(struct hci_dev *hdev, u8 status)
{
struct hci_cp_le_create_conn *cp;
- struct hci_conn *conn;
BT_DBG("%s status 0x%2.2x", hdev->name, status);
@@ -1916,35 +2218,34 @@ static void hci_cs_le_create_conn(struct hci_dev *hdev, u8 status)
hci_dev_lock(hdev);
- conn = hci_conn_hash_lookup_le(hdev, &cp->peer_addr,
- cp->peer_addr_type);
- if (!conn)
- goto unlock;
+ cs_le_create_conn(hdev, &cp->peer_addr, cp->peer_addr_type,
+ cp->own_address_type, cp->filter_policy);
- /* Store the initiator and responder address information which
- * is needed for SMP. These values will not change during the
- * lifetime of the connection.
- */
- conn->init_addr_type = cp->own_address_type;
- if (cp->own_address_type == ADDR_LE_DEV_RANDOM)
- bacpy(&conn->init_addr, &hdev->random_addr);
- else
- bacpy(&conn->init_addr, &hdev->bdaddr);
+ hci_dev_unlock(hdev);
+}
- conn->resp_addr_type = cp->peer_addr_type;
- bacpy(&conn->resp_addr, &cp->peer_addr);
+static void hci_cs_le_ext_create_conn(struct hci_dev *hdev, u8 status)
+{
+ struct hci_cp_le_ext_create_conn *cp;
- /* We don't want the connection attempt to stick around
- * indefinitely since LE doesn't have a page timeout concept
- * like BR/EDR. Set a timer for any connection that doesn't use
- * the white list for connecting.
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ /* All connection failure handling is taken care of by the
+ * hci_le_conn_failed function which is triggered by the HCI
+ * request completion callbacks used for connecting.
*/
- if (cp->filter_policy == HCI_LE_USE_PEER_ADDR)
- queue_delayed_work(conn->hdev->workqueue,
- &conn->le_conn_timeout,
- conn->conn_timeout);
+ if (status)
+ return;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_LE_EXT_CREATE_CONN);
+ if (!cp)
+ return;
+
+ hci_dev_lock(hdev);
+
+ cs_le_create_conn(hdev, &cp->peer_addr, cp->peer_addr_type,
+ cp->own_addr_type, cp->filter_policy);
-unlock:
hci_dev_unlock(hdev);
}
@@ -2618,8 +2919,10 @@ static void hci_encrypt_change_evt(struct hci_dev *hdev, struct sk_buff *skb)
/* We should disregard the current RPA and generate a new one
* whenever the encryption procedure fails.
*/
- if (ev->status && conn->type == LE_LINK)
+ if (ev->status && conn->type == LE_LINK) {
hci_dev_set_flag(hdev, HCI_RPA_EXPIRED);
+ hci_adv_instances_set_rpa_expired(hdev, true);
+ }
clear_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags);
@@ -3015,6 +3318,26 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb,
hci_cc_le_write_def_data_len(hdev, skb);
break;
+ case HCI_OP_LE_ADD_TO_RESOLV_LIST:
+ hci_cc_le_add_to_resolv_list(hdev, skb);
+ break;
+
+ case HCI_OP_LE_DEL_FROM_RESOLV_LIST:
+ hci_cc_le_del_from_resolv_list(hdev, skb);
+ break;
+
+ case HCI_OP_LE_CLEAR_RESOLV_LIST:
+ hci_cc_le_clear_resolv_list(hdev, skb);
+ break;
+
+ case HCI_OP_LE_READ_RESOLV_LIST_SIZE:
+ hci_cc_le_read_resolv_list_size(hdev, skb);
+ break;
+
+ case HCI_OP_LE_SET_ADDR_RESOLV_ENABLE:
+ hci_cc_le_set_addr_resolution_enable(hdev, skb);
+ break;
+
case HCI_OP_LE_READ_MAX_DATA_LEN:
hci_cc_le_read_max_data_len(hdev, skb);
break;
@@ -3039,6 +3362,34 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb,
hci_cc_write_ssp_debug_mode(hdev, skb);
break;
+ case HCI_OP_LE_SET_EXT_SCAN_PARAMS:
+ hci_cc_le_set_ext_scan_param(hdev, skb);
+ break;
+
+ case HCI_OP_LE_SET_EXT_SCAN_ENABLE:
+ hci_cc_le_set_ext_scan_enable(hdev, skb);
+ break;
+
+ case HCI_OP_LE_SET_DEFAULT_PHY:
+ hci_cc_le_set_default_phy(hdev, skb);
+ break;
+
+ case HCI_OP_LE_READ_NUM_SUPPORTED_ADV_SETS:
+ hci_cc_le_read_num_adv_sets(hdev, skb);
+ break;
+
+ case HCI_OP_LE_SET_EXT_ADV_PARAMS:
+ hci_cc_set_ext_adv_param(hdev, skb);
+ break;
+
+ case HCI_OP_LE_SET_EXT_ADV_ENABLE:
+ hci_cc_le_set_ext_adv_enable(hdev, skb);
+ break;
+
+ case HCI_OP_LE_SET_ADV_SET_RAND_ADDR:
+ hci_cc_le_set_adv_set_random_addr(hdev, skb);
+ break;
+
default:
BT_DBG("%s opcode 0x%4.4x", hdev->name, *opcode);
break;
@@ -3134,6 +3485,10 @@ static void hci_cmd_status_evt(struct hci_dev *hdev, struct sk_buff *skb,
hci_cs_le_start_enc(hdev, ev->status);
break;
+ case HCI_OP_LE_EXT_CREATE_CONN:
+ hci_cs_le_ext_create_conn(hdev, ev->status);
+ break;
+
default:
BT_DBG("%s opcode 0x%4.4x", hdev->name, *opcode);
break;
@@ -4460,16 +4815,15 @@ static void hci_disconn_phylink_complete_evt(struct hci_dev *hdev,
}
#endif
-static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
+static void le_conn_complete_evt(struct hci_dev *hdev, u8 status,
+ bdaddr_t *bdaddr, u8 bdaddr_type, u8 role, u16 handle,
+ u16 interval, u16 latency, u16 supervision_timeout)
{
- struct hci_ev_le_conn_complete *ev = (void *) skb->data;
struct hci_conn_params *params;
struct hci_conn *conn;
struct smp_irk *irk;
u8 addr_type;
- BT_DBG("%s status 0x%2.2x", hdev->name, ev->status);
-
hci_dev_lock(hdev);
/* All controllers implicitly stop advertising in the event of a
@@ -4479,13 +4833,13 @@ static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
conn = hci_lookup_le_connect(hdev);
if (!conn) {
- conn = hci_conn_add(hdev, LE_LINK, &ev->bdaddr, ev->role);
+ conn = hci_conn_add(hdev, LE_LINK, bdaddr, role);
if (!conn) {
bt_dev_err(hdev, "no memory for new connection");
goto unlock;
}
- conn->dst_type = ev->bdaddr_type;
+ conn->dst_type = bdaddr_type;
/* If we didn't have a hci_conn object previously
* but we're in master role this must be something
@@ -4496,8 +4850,8 @@ static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
* initiator address based on the HCI_PRIVACY flag.
*/
if (conn->out) {
- conn->resp_addr_type = ev->bdaddr_type;
- bacpy(&conn->resp_addr, &ev->bdaddr);
+ conn->resp_addr_type = bdaddr_type;
+ bacpy(&conn->resp_addr, bdaddr);
if (hci_dev_test_flag(hdev, HCI_PRIVACY)) {
conn->init_addr_type = ADDR_LE_DEV_RANDOM;
bacpy(&conn->init_addr, &hdev->rpa);
@@ -4516,13 +4870,18 @@ static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
* the advertising address type.
*/
conn->resp_addr_type = hdev->adv_addr_type;
- if (hdev->adv_addr_type == ADDR_LE_DEV_RANDOM)
- bacpy(&conn->resp_addr, &hdev->random_addr);
- else
+ if (hdev->adv_addr_type == ADDR_LE_DEV_RANDOM) {
+ /* In case of ext adv, resp_addr will be updated in
+ * Adv Terminated event.
+ */
+ if (!ext_adv_capable(hdev))
+ bacpy(&conn->resp_addr, &hdev->random_addr);
+ } else {
bacpy(&conn->resp_addr, &hdev->bdaddr);
+ }
- conn->init_addr_type = ev->bdaddr_type;
- bacpy(&conn->init_addr, &ev->bdaddr);
+ conn->init_addr_type = bdaddr_type;
+ bacpy(&conn->init_addr, bdaddr);
/* For incoming connections, set the default minimum
* and maximum connection interval. They will be used
@@ -4548,8 +4907,8 @@ static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
conn->dst_type = irk->addr_type;
}
- if (ev->status) {
- hci_le_conn_failed(conn, ev->status);
+ if (status) {
+ hci_le_conn_failed(conn, status);
goto unlock;
}
@@ -4568,42 +4927,38 @@ static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
mgmt_device_connected(hdev, conn, 0, NULL, 0);
conn->sec_level = BT_SECURITY_LOW;
- conn->handle = __le16_to_cpu(ev->handle);
+ conn->handle = handle;
conn->state = BT_CONFIG;
- conn->le_conn_interval = le16_to_cpu(ev->interval);
- conn->le_conn_latency = le16_to_cpu(ev->latency);
- conn->le_supv_timeout = le16_to_cpu(ev->supervision_timeout);
+ conn->le_conn_interval = interval;
+ conn->le_conn_latency = latency;
+ conn->le_supv_timeout = supervision_timeout;
hci_debugfs_create_conn(conn);
hci_conn_add_sysfs(conn);
- if (!ev->status) {
- /* The remote features procedure is defined for master
- * role only. So only in case of an initiated connection
- * request the remote features.
- *
- * If the local controller supports slave-initiated features
- * exchange, then requesting the remote features in slave
- * role is possible. Otherwise just transition into the
- * connected state without requesting the remote features.
- */
- if (conn->out ||
- (hdev->le_features[0] & HCI_LE_SLAVE_FEATURES)) {
- struct hci_cp_le_read_remote_features cp;
+ /* The remote features procedure is defined for master
+ * role only. So only in case of an initiated connection
+ * request the remote features.
+ *
+ * If the local controller supports slave-initiated features
+ * exchange, then requesting the remote features in slave
+ * role is possible. Otherwise just transition into the
+ * connected state without requesting the remote features.
+ */
+ if (conn->out ||
+ (hdev->le_features[0] & HCI_LE_SLAVE_FEATURES)) {
+ struct hci_cp_le_read_remote_features cp;
- cp.handle = __cpu_to_le16(conn->handle);
+ cp.handle = __cpu_to_le16(conn->handle);
- hci_send_cmd(hdev, HCI_OP_LE_READ_REMOTE_FEATURES,
- sizeof(cp), &cp);
+ hci_send_cmd(hdev, HCI_OP_LE_READ_REMOTE_FEATURES,
+ sizeof(cp), &cp);
- hci_conn_hold(conn);
- } else {
- conn->state = BT_CONNECTED;
- hci_connect_cfm(conn, ev->status);
- }
+ hci_conn_hold(conn);
} else {
- hci_connect_cfm(conn, ev->status);
+ conn->state = BT_CONNECTED;
+ hci_connect_cfm(conn, status);
}
params = hci_pend_le_action_lookup(&hdev->pend_le_conns, &conn->dst,
@@ -4622,6 +4977,61 @@ unlock:
hci_dev_unlock(hdev);
}
+static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_ev_le_conn_complete *ev = (void *) skb->data;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, ev->status);
+
+ le_conn_complete_evt(hdev, ev->status, &ev->bdaddr, ev->bdaddr_type,
+ ev->role, le16_to_cpu(ev->handle),
+ le16_to_cpu(ev->interval),
+ le16_to_cpu(ev->latency),
+ le16_to_cpu(ev->supervision_timeout));
+}
+
+static void hci_le_enh_conn_complete_evt(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_ev_le_enh_conn_complete *ev = (void *) skb->data;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, ev->status);
+
+ le_conn_complete_evt(hdev, ev->status, &ev->bdaddr, ev->bdaddr_type,
+ ev->role, le16_to_cpu(ev->handle),
+ le16_to_cpu(ev->interval),
+ le16_to_cpu(ev->latency),
+ le16_to_cpu(ev->supervision_timeout));
+}
+
+static void hci_le_ext_adv_term_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_evt_le_ext_adv_set_term *ev = (void *) skb->data;
+ struct hci_conn *conn;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, ev->status);
+
+ if (ev->status)
+ return;
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->conn_handle));
+ if (conn) {
+ struct adv_info *adv_instance;
+
+ if (hdev->adv_addr_type != ADDR_LE_DEV_RANDOM)
+ return;
+
+ if (!hdev->cur_adv_instance) {
+ bacpy(&conn->resp_addr, &hdev->random_addr);
+ return;
+ }
+
+ adv_instance = hci_find_adv_instance(hdev, hdev->cur_adv_instance);
+ if (adv_instance)
+ bacpy(&conn->resp_addr, &adv_instance->random_addr);
+ }
+}
+
static void hci_le_conn_update_complete_evt(struct hci_dev *hdev,
struct sk_buff *skb)
{
@@ -4957,6 +5367,78 @@ static void hci_le_adv_report_evt(struct hci_dev *hdev, struct sk_buff *skb)
hci_dev_unlock(hdev);
}
+static u8 ext_evt_type_to_legacy(u16 evt_type)
+{
+ if (evt_type & LE_EXT_ADV_LEGACY_PDU) {
+ switch (evt_type) {
+ case LE_LEGACY_ADV_IND:
+ return LE_ADV_IND;
+ case LE_LEGACY_ADV_DIRECT_IND:
+ return LE_ADV_DIRECT_IND;
+ case LE_LEGACY_ADV_SCAN_IND:
+ return LE_ADV_SCAN_IND;
+ case LE_LEGACY_NONCONN_IND:
+ return LE_ADV_NONCONN_IND;
+ case LE_LEGACY_SCAN_RSP_ADV:
+ case LE_LEGACY_SCAN_RSP_ADV_SCAN:
+ return LE_ADV_SCAN_RSP;
+ }
+
+ BT_ERR_RATELIMITED("Unknown advertising packet type: 0x%02x",
+ evt_type);
+
+ return LE_ADV_INVALID;
+ }
+
+ if (evt_type & LE_EXT_ADV_CONN_IND) {
+ if (evt_type & LE_EXT_ADV_DIRECT_IND)
+ return LE_ADV_DIRECT_IND;
+
+ return LE_ADV_IND;
+ }
+
+ if (evt_type & LE_EXT_ADV_SCAN_RSP)
+ return LE_ADV_SCAN_RSP;
+
+ if (evt_type & LE_EXT_ADV_SCAN_IND)
+ return LE_ADV_SCAN_IND;
+
+ if (evt_type == LE_EXT_ADV_NON_CONN_IND ||
+ evt_type & LE_EXT_ADV_DIRECT_IND)
+ return LE_ADV_NONCONN_IND;
+
+ BT_ERR_RATELIMITED("Unknown advertising packet type: 0x%02x",
+ evt_type);
+
+ return LE_ADV_INVALID;
+}
+
+static void hci_le_ext_adv_report_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ u8 num_reports = skb->data[0];
+ void *ptr = &skb->data[1];
+
+ hci_dev_lock(hdev);
+
+ while (num_reports--) {
+ struct hci_ev_le_ext_adv_report *ev = ptr;
+ u8 legacy_evt_type;
+ u16 evt_type;
+
+ evt_type = __le16_to_cpu(ev->evt_type);
+ legacy_evt_type = ext_evt_type_to_legacy(evt_type);
+ if (legacy_evt_type != LE_ADV_INVALID) {
+ process_adv_report(hdev, legacy_evt_type, &ev->bdaddr,
+ ev->bdaddr_type, NULL, 0, ev->rssi,
+ ev->data, ev->length);
+ }
+
+ ptr += sizeof(*ev) + ev->length + 1;
+ }
+
+ hci_dev_unlock(hdev);
+}
+
static void hci_le_remote_feat_complete_evt(struct hci_dev *hdev,
struct sk_buff *skb)
{
@@ -5189,6 +5671,18 @@ static void hci_le_meta_evt(struct hci_dev *hdev, struct sk_buff *skb)
hci_le_direct_adv_report_evt(hdev, skb);
break;
+ case HCI_EV_LE_EXT_ADV_REPORT:
+ hci_le_ext_adv_report_evt(hdev, skb);
+ break;
+
+ case HCI_EV_LE_ENHANCED_CONN_COMPLETE:
+ hci_le_enh_conn_complete_evt(hdev, skb);
+ break;
+
+ case HCI_EV_LE_EXT_ADV_SET_TERM:
+ hci_le_ext_adv_term_evt(hdev, skb);
+ break;
+
default:
break;
}
diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c
index e44d34734834..e8c9ef1e1922 100644
--- a/net/bluetooth/hci_request.c
+++ b/net/bluetooth/hci_request.c
@@ -647,11 +647,22 @@ void __hci_req_update_eir(struct hci_request *req)
void hci_req_add_le_scan_disable(struct hci_request *req)
{
- struct hci_cp_le_set_scan_enable cp;
+ struct hci_dev *hdev = req->hdev;
- memset(&cp, 0, sizeof(cp));
- cp.enable = LE_SCAN_DISABLE;
- hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(cp), &cp);
+ if (use_ext_scan(hdev)) {
+ struct hci_cp_le_set_ext_scan_enable cp;
+
+ memset(&cp, 0, sizeof(cp));
+ cp.enable = LE_SCAN_DISABLE;
+ hci_req_add(req, HCI_OP_LE_SET_EXT_SCAN_ENABLE, sizeof(cp),
+ &cp);
+ } else {
+ struct hci_cp_le_set_scan_enable cp;
+
+ memset(&cp, 0, sizeof(cp));
+ cp.enable = LE_SCAN_DISABLE;
+ hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(cp), &cp);
+ }
}
static void add_to_white_list(struct hci_request *req,
@@ -767,10 +778,86 @@ static bool scan_use_rpa(struct hci_dev *hdev)
return hci_dev_test_flag(hdev, HCI_PRIVACY);
}
+static void hci_req_start_scan(struct hci_request *req, u8 type, u16 interval,
+ u16 window, u8 own_addr_type, u8 filter_policy)
+{
+ struct hci_dev *hdev = req->hdev;
+
+ /* Use ext scanning if set ext scan param and ext scan enable is
+ * supported
+ */
+ if (use_ext_scan(hdev)) {
+ struct hci_cp_le_set_ext_scan_params *ext_param_cp;
+ struct hci_cp_le_set_ext_scan_enable ext_enable_cp;
+ struct hci_cp_le_scan_phy_params *phy_params;
+ u8 data[sizeof(*ext_param_cp) + sizeof(*phy_params) * 2];
+ u32 plen;
+
+ ext_param_cp = (void *)data;
+ phy_params = (void *)ext_param_cp->data;
+
+ memset(ext_param_cp, 0, sizeof(*ext_param_cp));
+ ext_param_cp->own_addr_type = own_addr_type;
+ ext_param_cp->filter_policy = filter_policy;
+
+ plen = sizeof(*ext_param_cp);
+
+ if (scan_1m(hdev) || scan_2m(hdev)) {
+ ext_param_cp->scanning_phys |= LE_SCAN_PHY_1M;
+
+ memset(phy_params, 0, sizeof(*phy_params));
+ phy_params->type = type;
+ phy_params->interval = cpu_to_le16(interval);
+ phy_params->window = cpu_to_le16(window);
+
+ plen += sizeof(*phy_params);
+ phy_params++;
+ }
+
+ if (scan_coded(hdev)) {
+ ext_param_cp->scanning_phys |= LE_SCAN_PHY_CODED;
+
+ memset(phy_params, 0, sizeof(*phy_params));
+ phy_params->type = type;
+ phy_params->interval = cpu_to_le16(interval);
+ phy_params->window = cpu_to_le16(window);
+
+ plen += sizeof(*phy_params);
+ phy_params++;
+ }
+
+ hci_req_add(req, HCI_OP_LE_SET_EXT_SCAN_PARAMS,
+ plen, ext_param_cp);
+
+ memset(&ext_enable_cp, 0, sizeof(ext_enable_cp));
+ ext_enable_cp.enable = LE_SCAN_ENABLE;
+ ext_enable_cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE;
+
+ hci_req_add(req, HCI_OP_LE_SET_EXT_SCAN_ENABLE,
+ sizeof(ext_enable_cp), &ext_enable_cp);
+ } else {
+ struct hci_cp_le_set_scan_param param_cp;
+ struct hci_cp_le_set_scan_enable enable_cp;
+
+ memset(&param_cp, 0, sizeof(param_cp));
+ param_cp.type = type;
+ param_cp.interval = cpu_to_le16(interval);
+ param_cp.window = cpu_to_le16(window);
+ param_cp.own_address_type = own_addr_type;
+ param_cp.filter_policy = filter_policy;
+ hci_req_add(req, HCI_OP_LE_SET_SCAN_PARAM, sizeof(param_cp),
+ &param_cp);
+
+ memset(&enable_cp, 0, sizeof(enable_cp));
+ enable_cp.enable = LE_SCAN_ENABLE;
+ enable_cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE;
+ hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(enable_cp),
+ &enable_cp);
+ }
+}
+
void hci_req_add_le_passive_scan(struct hci_request *req)
{
- struct hci_cp_le_set_scan_param param_cp;
- struct hci_cp_le_set_scan_enable enable_cp;
struct hci_dev *hdev = req->hdev;
u8 own_addr_type;
u8 filter_policy;
@@ -804,20 +891,26 @@ void hci_req_add_le_passive_scan(struct hci_request *req)
(hdev->le_features[0] & HCI_LE_EXT_SCAN_POLICY))
filter_policy |= 0x02;
- memset(&param_cp, 0, sizeof(param_cp));
- param_cp.type = LE_SCAN_PASSIVE;
- param_cp.interval = cpu_to_le16(hdev->le_scan_interval);
- param_cp.window = cpu_to_le16(hdev->le_scan_window);
- param_cp.own_address_type = own_addr_type;
- param_cp.filter_policy = filter_policy;
- hci_req_add(req, HCI_OP_LE_SET_SCAN_PARAM, sizeof(param_cp),
- &param_cp);
+ hci_req_start_scan(req, LE_SCAN_PASSIVE, hdev->le_scan_interval,
+ hdev->le_scan_window, own_addr_type, filter_policy);
+}
+
+static u8 get_adv_instance_scan_rsp_len(struct hci_dev *hdev, u8 instance)
+{
+ struct adv_info *adv_instance;
+
+ /* Ignore instance 0 */
+ if (instance == 0x00)
+ return 0;
+
+ adv_instance = hci_find_adv_instance(hdev, instance);
+ if (!adv_instance)
+ return 0;
- memset(&enable_cp, 0, sizeof(enable_cp));
- enable_cp.enable = LE_SCAN_ENABLE;
- enable_cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE;
- hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(enable_cp),
- &enable_cp);
+ /* TODO: Take into account the "appearance" and "local-name" flags here.
+ * These are currently being ignored as they are not supported.
+ */
+ return adv_instance->scan_rsp_len;
}
static u8 get_cur_adv_instance_scan_rsp_len(struct hci_dev *hdev)
@@ -841,9 +934,19 @@ static u8 get_cur_adv_instance_scan_rsp_len(struct hci_dev *hdev)
void __hci_req_disable_advertising(struct hci_request *req)
{
- u8 enable = 0x00;
+ if (ext_adv_capable(req->hdev)) {
+ struct hci_cp_le_set_ext_adv_enable cp;
- hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable);
+ cp.enable = 0x00;
+ /* Disable all sets since we only support one set at the moment */
+ cp.num_of_sets = 0x00;
+
+ hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_ENABLE, sizeof(cp), &cp);
+ } else {
+ u8 enable = 0x00;
+
+ hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable);
+ }
}
static u32 get_adv_instance_flags(struct hci_dev *hdev, u8 instance)
@@ -1081,29 +1184,58 @@ static u8 create_instance_scan_rsp_data(struct hci_dev *hdev, u8 instance,
void __hci_req_update_scan_rsp_data(struct hci_request *req, u8 instance)
{
struct hci_dev *hdev = req->hdev;
- struct hci_cp_le_set_scan_rsp_data cp;
u8 len;
if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED))
return;
- memset(&cp, 0, sizeof(cp));
+ if (ext_adv_capable(hdev)) {
+ struct hci_cp_le_set_ext_scan_rsp_data cp;
- if (instance)
- len = create_instance_scan_rsp_data(hdev, instance, cp.data);
- else
- len = create_default_scan_rsp_data(hdev, cp.data);
+ memset(&cp, 0, sizeof(cp));
- if (hdev->scan_rsp_data_len == len &&
- !memcmp(cp.data, hdev->scan_rsp_data, len))
- return;
+ if (instance)
+ len = create_instance_scan_rsp_data(hdev, instance,
+ cp.data);
+ else
+ len = create_default_scan_rsp_data(hdev, cp.data);
+
+ if (hdev->scan_rsp_data_len == len &&
+ !memcmp(cp.data, hdev->scan_rsp_data, len))
+ return;
- memcpy(hdev->scan_rsp_data, cp.data, sizeof(cp.data));
- hdev->scan_rsp_data_len = len;
+ memcpy(hdev->scan_rsp_data, cp.data, sizeof(cp.data));
+ hdev->scan_rsp_data_len = len;
- cp.length = len;
+ cp.handle = 0;
+ cp.length = len;
+ cp.operation = LE_SET_ADV_DATA_OP_COMPLETE;
+ cp.frag_pref = LE_SET_ADV_DATA_NO_FRAG;
+
+ hci_req_add(req, HCI_OP_LE_SET_EXT_SCAN_RSP_DATA, sizeof(cp),
+ &cp);
+ } else {
+ struct hci_cp_le_set_scan_rsp_data cp;
+
+ memset(&cp, 0, sizeof(cp));
+
+ if (instance)
+ len = create_instance_scan_rsp_data(hdev, instance,
+ cp.data);
+ else
+ len = create_default_scan_rsp_data(hdev, cp.data);
+
+ if (hdev->scan_rsp_data_len == len &&
+ !memcmp(cp.data, hdev->scan_rsp_data, len))
+ return;
- hci_req_add(req, HCI_OP_LE_SET_SCAN_RSP_DATA, sizeof(cp), &cp);
+ memcpy(hdev->scan_rsp_data, cp.data, sizeof(cp.data));
+ hdev->scan_rsp_data_len = len;
+
+ cp.length = len;
+
+ hci_req_add(req, HCI_OP_LE_SET_SCAN_RSP_DATA, sizeof(cp), &cp);
+ }
}
static u8 create_instance_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr)
@@ -1160,15 +1292,27 @@ static u8 create_instance_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr)
ptr += adv_instance->adv_data_len;
}
- /* Provide Tx Power only if we can provide a valid value for it */
- if (hdev->adv_tx_power != HCI_TX_POWER_INVALID &&
- (instance_flags & MGMT_ADV_FLAG_TX_POWER)) {
- ptr[0] = 0x02;
- ptr[1] = EIR_TX_POWER;
- ptr[2] = (u8)hdev->adv_tx_power;
+ if (instance_flags & MGMT_ADV_FLAG_TX_POWER) {
+ s8 adv_tx_power;
- ad_len += 3;
- ptr += 3;
+ if (ext_adv_capable(hdev)) {
+ if (adv_instance)
+ adv_tx_power = adv_instance->tx_power;
+ else
+ adv_tx_power = hdev->adv_tx_power;
+ } else {
+ adv_tx_power = hdev->adv_tx_power;
+ }
+
+ /* Provide Tx Power only if we can provide a valid value for it */
+ if (adv_tx_power != HCI_TX_POWER_INVALID) {
+ ptr[0] = 0x02;
+ ptr[1] = EIR_TX_POWER;
+ ptr[2] = (u8)adv_tx_power;
+
+ ad_len += 3;
+ ptr += 3;
+ }
}
return ad_len;
@@ -1177,27 +1321,51 @@ static u8 create_instance_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr)
void __hci_req_update_adv_data(struct hci_request *req, u8 instance)
{
struct hci_dev *hdev = req->hdev;
- struct hci_cp_le_set_adv_data cp;
u8 len;
if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED))
return;
- memset(&cp, 0, sizeof(cp));
+ if (ext_adv_capable(hdev)) {
+ struct hci_cp_le_set_ext_adv_data cp;
- len = create_instance_adv_data(hdev, instance, cp.data);
+ memset(&cp, 0, sizeof(cp));
- /* There's nothing to do if the data hasn't changed */
- if (hdev->adv_data_len == len &&
- memcmp(cp.data, hdev->adv_data, len) == 0)
- return;
+ len = create_instance_adv_data(hdev, instance, cp.data);
+
+ /* There's nothing to do if the data hasn't changed */
+ if (hdev->adv_data_len == len &&
+ memcmp(cp.data, hdev->adv_data, len) == 0)
+ return;
+
+ memcpy(hdev->adv_data, cp.data, sizeof(cp.data));
+ hdev->adv_data_len = len;
+
+ cp.length = len;
+ cp.handle = 0;
+ cp.operation = LE_SET_ADV_DATA_OP_COMPLETE;
+ cp.frag_pref = LE_SET_ADV_DATA_NO_FRAG;
+
+ hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_DATA, sizeof(cp), &cp);
+ } else {
+ struct hci_cp_le_set_adv_data cp;
+
+ memset(&cp, 0, sizeof(cp));
+
+ len = create_instance_adv_data(hdev, instance, cp.data);
+
+ /* There's nothing to do if the data hasn't changed */
+ if (hdev->adv_data_len == len &&
+ memcmp(cp.data, hdev->adv_data, len) == 0)
+ return;
- memcpy(hdev->adv_data, cp.data, sizeof(cp.data));
- hdev->adv_data_len = len;
+ memcpy(hdev->adv_data, cp.data, sizeof(cp.data));
+ hdev->adv_data_len = len;
- cp.length = len;
+ cp.length = len;
- hci_req_add(req, HCI_OP_LE_SET_ADV_DATA, sizeof(cp), &cp);
+ hci_req_add(req, HCI_OP_LE_SET_ADV_DATA, sizeof(cp), &cp);
+ }
}
int hci_req_update_adv_data(struct hci_dev *hdev, u8 instance)
@@ -1229,9 +1397,13 @@ void hci_req_reenable_advertising(struct hci_dev *hdev)
__hci_req_schedule_adv_instance(&req, hdev->cur_adv_instance,
true);
} else {
- __hci_req_update_adv_data(&req, 0x00);
- __hci_req_update_scan_rsp_data(&req, 0x00);
- __hci_req_enable_advertising(&req);
+ if (ext_adv_capable(hdev)) {
+ __hci_req_start_ext_adv(&req, 0x00);
+ } else {
+ __hci_req_update_adv_data(&req, 0x00);
+ __hci_req_update_scan_rsp_data(&req, 0x00);
+ __hci_req_enable_advertising(&req);
+ }
}
hci_req_run(&req, adv_enable_complete);
@@ -1268,6 +1440,245 @@ unlock:
hci_dev_unlock(hdev);
}
+int hci_get_random_address(struct hci_dev *hdev, bool require_privacy,
+ bool use_rpa, struct adv_info *adv_instance,
+ u8 *own_addr_type, bdaddr_t *rand_addr)
+{
+ int err;
+
+ bacpy(rand_addr, BDADDR_ANY);
+
+ /* If privacy is enabled use a resolvable private address. If
+ * current RPA has expired then generate a new one.
+ */
+ if (use_rpa) {
+ int to;
+
+ *own_addr_type = ADDR_LE_DEV_RANDOM;
+
+ if (adv_instance) {
+ if (!adv_instance->rpa_expired &&
+ !bacmp(&adv_instance->random_addr, &hdev->rpa))
+ return 0;
+
+ adv_instance->rpa_expired = false;
+ } else {
+ if (!hci_dev_test_and_clear_flag(hdev, HCI_RPA_EXPIRED) &&
+ !bacmp(&hdev->random_addr, &hdev->rpa))
+ return 0;
+ }
+
+ err = smp_generate_rpa(hdev, hdev->irk, &hdev->rpa);
+ if (err < 0) {
+ BT_ERR("%s failed to generate new RPA", hdev->name);
+ return err;
+ }
+
+ bacpy(rand_addr, &hdev->rpa);
+
+ to = msecs_to_jiffies(hdev->rpa_timeout * 1000);
+ if (adv_instance)
+ queue_delayed_work(hdev->workqueue,
+ &adv_instance->rpa_expired_cb, to);
+ else
+ queue_delayed_work(hdev->workqueue,
+ &hdev->rpa_expired, to);
+
+ return 0;
+ }
+
+ /* In case of required privacy without resolvable private address,
+ * use an non-resolvable private address. This is useful for
+ * non-connectable advertising.
+ */
+ if (require_privacy) {
+ bdaddr_t nrpa;
+
+ while (true) {
+ /* The non-resolvable private address is generated
+ * from random six bytes with the two most significant
+ * bits cleared.
+ */
+ get_random_bytes(&nrpa, 6);
+ nrpa.b[5] &= 0x3f;
+
+ /* The non-resolvable private address shall not be
+ * equal to the public address.
+ */
+ if (bacmp(&hdev->bdaddr, &nrpa))
+ break;
+ }
+
+ *own_addr_type = ADDR_LE_DEV_RANDOM;
+ bacpy(rand_addr, &nrpa);
+
+ return 0;
+ }
+
+ /* No privacy so use a public address. */
+ *own_addr_type = ADDR_LE_DEV_PUBLIC;
+
+ return 0;
+}
+
+void __hci_req_clear_ext_adv_sets(struct hci_request *req)
+{
+ hci_req_add(req, HCI_OP_LE_CLEAR_ADV_SETS, 0, NULL);
+}
+
+int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance)
+{
+ struct hci_cp_le_set_ext_adv_params cp;
+ struct hci_dev *hdev = req->hdev;
+ bool connectable;
+ u32 flags;
+ bdaddr_t random_addr;
+ u8 own_addr_type;
+ int err;
+ struct adv_info *adv_instance;
+ bool secondary_adv;
+ /* In ext adv set param interval is 3 octets */
+ const u8 adv_interval[3] = { 0x00, 0x08, 0x00 };
+
+ if (instance > 0) {
+ adv_instance = hci_find_adv_instance(hdev, instance);
+ if (!adv_instance)
+ return -EINVAL;
+ } else {
+ adv_instance = NULL;
+ }
+
+ flags = get_adv_instance_flags(hdev, instance);
+
+ /* If the "connectable" instance flag was not set, then choose between
+ * ADV_IND and ADV_NONCONN_IND based on the global connectable setting.
+ */
+ connectable = (flags & MGMT_ADV_FLAG_CONNECTABLE) ||
+ mgmt_get_connectable(hdev);
+
+ if (!is_advertising_allowed(hdev, connectable))
+ return -EPERM;
+
+ /* Set require_privacy to true only when non-connectable
+ * advertising is used. In that case it is fine to use a
+ * non-resolvable private address.
+ */
+ err = hci_get_random_address(hdev, !connectable,
+ adv_use_rpa(hdev, flags), adv_instance,
+ &own_addr_type, &random_addr);
+ if (err < 0)
+ return err;
+
+ memset(&cp, 0, sizeof(cp));
+
+ memcpy(cp.min_interval, adv_interval, sizeof(cp.min_interval));
+ memcpy(cp.max_interval, adv_interval, sizeof(cp.max_interval));
+
+ secondary_adv = (flags & MGMT_ADV_FLAG_SEC_MASK);
+
+ if (connectable) {
+ if (secondary_adv)
+ cp.evt_properties = cpu_to_le16(LE_EXT_ADV_CONN_IND);
+ else
+ cp.evt_properties = cpu_to_le16(LE_LEGACY_ADV_IND);
+ } else if (get_adv_instance_scan_rsp_len(hdev, instance)) {
+ if (secondary_adv)
+ cp.evt_properties = cpu_to_le16(LE_EXT_ADV_SCAN_IND);
+ else
+ cp.evt_properties = cpu_to_le16(LE_LEGACY_ADV_SCAN_IND);
+ } else {
+ if (secondary_adv)
+ cp.evt_properties = cpu_to_le16(LE_EXT_ADV_NON_CONN_IND);
+ else
+ cp.evt_properties = cpu_to_le16(LE_LEGACY_NONCONN_IND);
+ }
+
+ cp.own_addr_type = own_addr_type;
+ cp.channel_map = hdev->le_adv_channel_map;
+ cp.tx_power = 127;
+ cp.handle = 0;
+
+ if (flags & MGMT_ADV_FLAG_SEC_2M) {
+ cp.primary_phy = HCI_ADV_PHY_1M;
+ cp.secondary_phy = HCI_ADV_PHY_2M;
+ } else if (flags & MGMT_ADV_FLAG_SEC_CODED) {
+ cp.primary_phy = HCI_ADV_PHY_CODED;
+ cp.secondary_phy = HCI_ADV_PHY_CODED;
+ } else {
+ /* In all other cases use 1M */
+ cp.primary_phy = HCI_ADV_PHY_1M;
+ cp.secondary_phy = HCI_ADV_PHY_1M;
+ }
+
+ hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_PARAMS, sizeof(cp), &cp);
+
+ if (own_addr_type == ADDR_LE_DEV_RANDOM &&
+ bacmp(&random_addr, BDADDR_ANY)) {
+ struct hci_cp_le_set_adv_set_rand_addr cp;
+
+ /* Check if random address need to be updated */
+ if (adv_instance) {
+ if (!bacmp(&random_addr, &adv_instance->random_addr))
+ return 0;
+ } else {
+ if (!bacmp(&random_addr, &hdev->random_addr))
+ return 0;
+ }
+
+ memset(&cp, 0, sizeof(cp));
+
+ cp.handle = 0;
+ bacpy(&cp.bdaddr, &random_addr);
+
+ hci_req_add(req,
+ HCI_OP_LE_SET_ADV_SET_RAND_ADDR,
+ sizeof(cp), &cp);
+ }
+
+ return 0;
+}
+
+void __hci_req_enable_ext_advertising(struct hci_request *req)
+{
+ struct hci_cp_le_set_ext_adv_enable *cp;
+ struct hci_cp_ext_adv_set *adv_set;
+ u8 data[sizeof(*cp) + sizeof(*adv_set) * 1];
+
+ cp = (void *) data;
+ adv_set = (void *) cp->data;
+
+ memset(cp, 0, sizeof(*cp));
+
+ cp->enable = 0x01;
+ cp->num_of_sets = 0x01;
+
+ memset(adv_set, 0, sizeof(*adv_set));
+
+ adv_set->handle = 0;
+
+ hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_ENABLE,
+ sizeof(*cp) + sizeof(*adv_set) * cp->num_of_sets,
+ data);
+}
+
+int __hci_req_start_ext_adv(struct hci_request *req, u8 instance)
+{
+ struct hci_dev *hdev = req->hdev;
+ int err;
+
+ if (hci_dev_test_flag(hdev, HCI_LE_ADV))
+ __hci_req_disable_advertising(req);
+
+ err = __hci_req_setup_ext_adv_instance(req, instance);
+ if (err < 0)
+ return err;
+
+ __hci_req_update_scan_rsp_data(req, instance);
+ __hci_req_enable_ext_advertising(req);
+
+ return 0;
+}
+
int __hci_req_schedule_adv_instance(struct hci_request *req, u8 instance,
bool force)
{
@@ -1321,9 +1732,13 @@ int __hci_req_schedule_adv_instance(struct hci_request *req, u8 instance,
return 0;
hdev->cur_adv_instance = instance;
- __hci_req_update_adv_data(req, instance);
- __hci_req_update_scan_rsp_data(req, instance);
- __hci_req_enable_advertising(req);
+ if (ext_adv_capable(hdev)) {
+ __hci_req_start_ext_adv(req, instance);
+ } else {
+ __hci_req_update_adv_data(req, instance);
+ __hci_req_update_scan_rsp_data(req, instance);
+ __hci_req_enable_advertising(req);
+ }
return 0;
}
@@ -1594,8 +2009,12 @@ static int connectable_update(struct hci_request *req, unsigned long opt)
/* Update the advertising parameters if necessary */
if (hci_dev_test_flag(hdev, HCI_ADVERTISING) ||
- !list_empty(&hdev->adv_instances))
- __hci_req_enable_advertising(req);
+ !list_empty(&hdev->adv_instances)) {
+ if (ext_adv_capable(hdev))
+ __hci_req_start_ext_adv(req, hdev->cur_adv_instance);
+ else
+ __hci_req_enable_advertising(req);
+ }
__hci_update_background_scan(req);
@@ -1704,8 +2123,12 @@ static int discoverable_update(struct hci_request *req, unsigned long opt)
/* Discoverable mode affects the local advertising
* address in limited privacy mode.
*/
- if (hci_dev_test_flag(hdev, HCI_LIMITED_PRIVACY))
- __hci_req_enable_advertising(req);
+ if (hci_dev_test_flag(hdev, HCI_LIMITED_PRIVACY)) {
+ if (ext_adv_capable(hdev))
+ __hci_req_start_ext_adv(req, 0x00);
+ else
+ __hci_req_enable_advertising(req);
+ }
}
hci_dev_unlock(hdev);
@@ -1940,7 +2363,6 @@ discov_stopped:
static int le_scan_restart(struct hci_request *req, unsigned long opt)
{
struct hci_dev *hdev = req->hdev;
- struct hci_cp_le_set_scan_enable cp;
/* If controller is not scanning we are done. */
if (!hci_dev_test_flag(hdev, HCI_LE_SCAN))
@@ -1948,10 +2370,23 @@ static int le_scan_restart(struct hci_request *req, unsigned long opt)
hci_req_add_le_scan_disable(req);
- memset(&cp, 0, sizeof(cp));
- cp.enable = LE_SCAN_ENABLE;
- cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE;
- hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(cp), &cp);
+ if (use_ext_scan(hdev)) {
+ struct hci_cp_le_set_ext_scan_enable ext_enable_cp;
+
+ memset(&ext_enable_cp, 0, sizeof(ext_enable_cp));
+ ext_enable_cp.enable = LE_SCAN_ENABLE;
+ ext_enable_cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE;
+
+ hci_req_add(req, HCI_OP_LE_SET_EXT_SCAN_ENABLE,
+ sizeof(ext_enable_cp), &ext_enable_cp);
+ } else {
+ struct hci_cp_le_set_scan_enable cp;
+
+ memset(&cp, 0, sizeof(cp));
+ cp.enable = LE_SCAN_ENABLE;
+ cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE;
+ hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(cp), &cp);
+ }
return 0;
}
@@ -2010,8 +2445,6 @@ static int active_scan(struct hci_request *req, unsigned long opt)
{
uint16_t interval = opt;
struct hci_dev *hdev = req->hdev;
- struct hci_cp_le_set_scan_param param_cp;
- struct hci_cp_le_set_scan_enable enable_cp;
u8 own_addr_type;
int err;
@@ -2050,22 +2483,8 @@ static int active_scan(struct hci_request *req, unsigned long opt)
if (err < 0)
own_addr_type = ADDR_LE_DEV_PUBLIC;
- memset(&param_cp, 0, sizeof(param_cp));
- param_cp.type = LE_SCAN_ACTIVE;
- param_cp.interval = cpu_to_le16(interval);
- param_cp.window = cpu_to_le16(DISCOV_LE_SCAN_WIN);
- param_cp.own_address_type = own_addr_type;
-
- hci_req_add(req, HCI_OP_LE_SET_SCAN_PARAM, sizeof(param_cp),
- &param_cp);
-
- memset(&enable_cp, 0, sizeof(enable_cp));
- enable_cp.enable = LE_SCAN_ENABLE;
- enable_cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE;
-
- hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(enable_cp),
- &enable_cp);
-
+ hci_req_start_scan(req, LE_SCAN_ACTIVE, interval, DISCOV_LE_SCAN_WIN,
+ own_addr_type, 0);
return 0;
}
@@ -2302,11 +2721,26 @@ static int powered_update_hci(struct hci_request *req, unsigned long opt)
*/
if (hci_dev_test_flag(hdev, HCI_ADVERTISING) ||
list_empty(&hdev->adv_instances)) {
- __hci_req_update_adv_data(req, 0x00);
- __hci_req_update_scan_rsp_data(req, 0x00);
-
- if (hci_dev_test_flag(hdev, HCI_ADVERTISING))
- __hci_req_enable_advertising(req);
+ int err;
+
+ if (ext_adv_capable(hdev)) {
+ err = __hci_req_setup_ext_adv_instance(req,
+ 0x00);
+ if (!err)
+ __hci_req_update_scan_rsp_data(req,
+ 0x00);
+ } else {
+ err = 0;
+ __hci_req_update_adv_data(req, 0x00);
+ __hci_req_update_scan_rsp_data(req, 0x00);
+ }
+
+ if (hci_dev_test_flag(hdev, HCI_ADVERTISING)) {
+ if (!ext_adv_capable(hdev))
+ __hci_req_enable_advertising(req);
+ else if (!err)
+ __hci_req_enable_ext_advertising(req);
+ }
} else if (!list_empty(&hdev->adv_instances)) {
struct adv_info *adv_instance;
diff --git a/net/bluetooth/hci_request.h b/net/bluetooth/hci_request.h
index 702beb140d9f..692cc8b13368 100644
--- a/net/bluetooth/hci_request.h
+++ b/net/bluetooth/hci_request.h
@@ -80,6 +80,14 @@ void hci_req_clear_adv_instance(struct hci_dev *hdev, struct sock *sk,
struct hci_request *req, u8 instance,
bool force);
+int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance);
+int __hci_req_start_ext_adv(struct hci_request *req, u8 instance);
+void __hci_req_enable_ext_advertising(struct hci_request *req);
+void __hci_req_clear_ext_adv_sets(struct hci_request *req);
+int hci_get_random_address(struct hci_dev *hdev, bool require_privacy,
+ bool use_rpa, struct adv_info *adv_instance,
+ u8 *own_addr_type, bdaddr_t *rand_addr);
+
void __hci_req_update_class(struct hci_request *req);
/* Returns true if HCI commands were queued */
diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c
index 7962111da216..a442e21f3894 100644
--- a/net/bluetooth/hidp/core.c
+++ b/net/bluetooth/hidp/core.c
@@ -431,8 +431,8 @@ static void hidp_del_timer(struct hidp_session *session)
del_timer(&session->timer);
}
-static void hidp_process_report(struct hidp_session *session,
- int type, const u8 *data, int len, int intr)
+static void hidp_process_report(struct hidp_session *session, int type,
+ const u8 *data, unsigned int len, int intr)
{
if (len > HID_MAX_BUFFER_SIZE)
len = HID_MAX_BUFFER_SIZE;
@@ -775,7 +775,7 @@ static int hidp_setup_hid(struct hidp_session *session,
hid->version = req->version;
hid->country = req->country;
- strncpy(hid->name, req->name, sizeof(req->name) - 1);
+ strncpy(hid->name, req->name, sizeof(hid->name));
snprintf(hid->phys, sizeof(hid->phys), "%pMR",
&l2cap_pi(session->ctrl_sock->sk)->chan->src);
@@ -1074,6 +1074,10 @@ static int hidp_session_start_sync(struct hidp_session *session)
static void hidp_session_terminate(struct hidp_session *session)
{
atomic_inc(&session->terminate);
+ /*
+ * See the comment preceding the call to wait_woken()
+ * in hidp_session_run().
+ */
wake_up_interruptible(&hidp_session_wq);
}
@@ -1193,8 +1197,6 @@ static void hidp_session_run(struct hidp_session *session)
* thread is woken up by ->sk_state_changed().
*/
- /* Ensure session->terminate is updated */
- smp_mb__before_atomic();
if (atomic_read(&session->terminate))
break;
@@ -1228,14 +1230,15 @@ static void hidp_session_run(struct hidp_session *session)
hidp_process_transmit(session, &session->ctrl_transmit,
session->ctrl_sock);
+ /*
+ * wait_woken() performs the necessary memory barriers
+ * for us; see the header comment for this primitive.
+ */
wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}
remove_wait_queue(&hidp_session_wq, &wait);
atomic_inc(&session->terminate);
-
- /* Ensure session->terminate is updated */
- smp_mb__after_atomic();
}
static int hidp_session_wake_function(wait_queue_entry_t *wait,
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index d17a4736e47c..2146e0f3b6f8 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -51,9 +51,6 @@ static u32 l2cap_feat_mask = L2CAP_FEAT_FIXED_CHAN | L2CAP_FEAT_UCD;
static LIST_HEAD(chan_list);
static DEFINE_RWLOCK(chan_list_lock);
-static u16 le_max_credits = L2CAP_LE_MAX_CREDITS;
-static u16 le_default_mps = L2CAP_LE_DEFAULT_MPS;
-
static struct sk_buff *l2cap_build_cmd(struct l2cap_conn *conn,
u8 code, u8 ident, u16 dlen, void *data);
static void l2cap_send_cmd(struct l2cap_conn *conn, u8 ident, u8 code, u16 len,
@@ -519,8 +516,10 @@ static void l2cap_le_flowctl_init(struct l2cap_chan *chan)
chan->sdu_last_frag = NULL;
chan->sdu_len = 0;
chan->tx_credits = 0;
- chan->rx_credits = le_max_credits;
- chan->mps = min_t(u16, chan->imtu, le_default_mps);
+ /* Derive MPS from connection MTU to stop HCI fragmentation */
+ chan->mps = min_t(u16, chan->imtu, chan->conn->mtu - L2CAP_HDR_SIZE);
+ /* Give enough credits for a full packet */
+ chan->rx_credits = (chan->imtu / chan->mps) + 1;
skb_queue_head_init(&chan->tx_q);
}
@@ -681,9 +680,9 @@ static void l2cap_chan_le_connect_reject(struct l2cap_chan *chan)
u16 result;
if (test_bit(FLAG_DEFER_SETUP, &chan->flags))
- result = L2CAP_CR_AUTHORIZATION;
+ result = L2CAP_CR_LE_AUTHORIZATION;
else
- result = L2CAP_CR_BAD_PSM;
+ result = L2CAP_CR_LE_BAD_PSM;
l2cap_state_change(chan, BT_DISCONN);
@@ -1282,6 +1281,8 @@ static void l2cap_le_connect(struct l2cap_chan *chan)
if (test_and_set_bit(FLAG_LE_CONN_REQ_SENT, &chan->flags))
return;
+ l2cap_le_flowctl_init(chan);
+
req.psm = chan->psm;
req.scid = cpu_to_le16(chan->scid);
req.mtu = cpu_to_le16(chan->imtu);
@@ -3669,7 +3670,7 @@ void __l2cap_le_connect_rsp_defer(struct l2cap_chan *chan)
rsp.mtu = cpu_to_le16(chan->imtu);
rsp.mps = cpu_to_le16(chan->mps);
rsp.credits = cpu_to_le16(chan->rx_credits);
- rsp.result = cpu_to_le16(L2CAP_CR_SUCCESS);
+ rsp.result = cpu_to_le16(L2CAP_CR_LE_SUCCESS);
l2cap_send_cmd(conn, chan->ident, L2CAP_LE_CONN_RSP, sizeof(rsp),
&rsp);
@@ -3815,9 +3816,17 @@ static struct l2cap_chan *l2cap_connect(struct l2cap_conn *conn,
result = L2CAP_CR_NO_MEM;
+ /* Check for valid dynamic CID range (as per Erratum 3253) */
+ if (scid < L2CAP_CID_DYN_START || scid > L2CAP_CID_DYN_END) {
+ result = L2CAP_CR_INVALID_SCID;
+ goto response;
+ }
+
/* Check if we already have channel with that dcid */
- if (__l2cap_get_chan_by_dcid(conn, scid))
+ if (__l2cap_get_chan_by_dcid(conn, scid)) {
+ result = L2CAP_CR_SCID_IN_USE;
goto response;
+ }
chan = pchan->ops->new_connection(pchan);
if (!chan)
@@ -5279,7 +5288,7 @@ static int l2cap_le_connect_rsp(struct l2cap_conn *conn,
credits = __le16_to_cpu(rsp->credits);
result = __le16_to_cpu(rsp->result);
- if (result == L2CAP_CR_SUCCESS && (mtu < 23 || mps < 23 ||
+ if (result == L2CAP_CR_LE_SUCCESS && (mtu < 23 || mps < 23 ||
dcid < L2CAP_CID_DYN_START ||
dcid > L2CAP_CID_LE_DYN_END))
return -EPROTO;
@@ -5300,7 +5309,7 @@ static int l2cap_le_connect_rsp(struct l2cap_conn *conn,
l2cap_chan_lock(chan);
switch (result) {
- case L2CAP_CR_SUCCESS:
+ case L2CAP_CR_LE_SUCCESS:
if (__l2cap_get_chan_by_dcid(conn, dcid)) {
err = -EBADSLT;
break;
@@ -5314,8 +5323,8 @@ static int l2cap_le_connect_rsp(struct l2cap_conn *conn,
l2cap_chan_ready(chan);
break;
- case L2CAP_CR_AUTHENTICATION:
- case L2CAP_CR_ENCRYPTION:
+ case L2CAP_CR_LE_AUTHENTICATION:
+ case L2CAP_CR_LE_ENCRYPTION:
/* If we already have MITM protection we can't do
* anything.
*/
@@ -5458,7 +5467,7 @@ static int l2cap_le_connect_req(struct l2cap_conn *conn,
pchan = l2cap_global_chan_by_psm(BT_LISTEN, psm, &conn->hcon->src,
&conn->hcon->dst, LE_LINK);
if (!pchan) {
- result = L2CAP_CR_BAD_PSM;
+ result = L2CAP_CR_LE_BAD_PSM;
chan = NULL;
goto response;
}
@@ -5468,33 +5477,31 @@ static int l2cap_le_connect_req(struct l2cap_conn *conn,
if (!smp_sufficient_security(conn->hcon, pchan->sec_level,
SMP_ALLOW_STK)) {
- result = L2CAP_CR_AUTHENTICATION;
+ result = L2CAP_CR_LE_AUTHENTICATION;
chan = NULL;
goto response_unlock;
}
/* Check for valid dynamic CID range */
if (scid < L2CAP_CID_DYN_START || scid > L2CAP_CID_LE_DYN_END) {
- result = L2CAP_CR_INVALID_SCID;
+ result = L2CAP_CR_LE_INVALID_SCID;
chan = NULL;
goto response_unlock;
}
/* Check if we already have channel with that dcid */
if (__l2cap_get_chan_by_dcid(conn, scid)) {
- result = L2CAP_CR_SCID_IN_USE;
+ result = L2CAP_CR_LE_SCID_IN_USE;
chan = NULL;
goto response_unlock;
}
chan = pchan->ops->new_connection(pchan);
if (!chan) {
- result = L2CAP_CR_NO_MEM;
+ result = L2CAP_CR_LE_NO_MEM;
goto response_unlock;
}
- l2cap_le_flowctl_init(chan);
-
bacpy(&chan->src, &conn->hcon->src);
bacpy(&chan->dst, &conn->hcon->dst);
chan->src_type = bdaddr_src_type(conn->hcon);
@@ -5506,6 +5513,9 @@ static int l2cap_le_connect_req(struct l2cap_conn *conn,
chan->tx_credits = __le16_to_cpu(req->credits);
__l2cap_chan_add(conn, chan);
+
+ l2cap_le_flowctl_init(chan);
+
dcid = chan->scid;
credits = chan->rx_credits;
@@ -5524,7 +5534,7 @@ static int l2cap_le_connect_req(struct l2cap_conn *conn,
chan->ops->defer(chan);
} else {
l2cap_chan_ready(chan);
- result = L2CAP_CR_SUCCESS;
+ result = L2CAP_CR_LE_SUCCESS;
}
response_unlock:
@@ -6699,13 +6709,10 @@ static void l2cap_chan_le_send_credits(struct l2cap_chan *chan)
struct l2cap_le_credits pkt;
u16 return_credits;
- /* We return more credits to the sender only after the amount of
- * credits falls below half of the initial amount.
- */
- if (chan->rx_credits >= (le_max_credits + 1) / 2)
- return;
+ return_credits = ((chan->imtu / chan->mps) + 1) - chan->rx_credits;
- return_credits = le_max_credits - chan->rx_credits;
+ if (!return_credits)
+ return;
BT_DBG("chan %p returning %u credits to sender", chan, return_credits);
@@ -6719,6 +6726,21 @@ static void l2cap_chan_le_send_credits(struct l2cap_chan *chan)
l2cap_send_cmd(conn, chan->ident, L2CAP_LE_CREDITS, sizeof(pkt), &pkt);
}
+static int l2cap_le_recv(struct l2cap_chan *chan, struct sk_buff *skb)
+{
+ int err;
+
+ BT_DBG("SDU reassemble complete: chan %p skb->len %u", chan, skb->len);
+
+ /* Wait recv to confirm reception before updating the credits */
+ err = chan->ops->recv(chan, skb);
+
+ /* Update credits whenever an SDU is received */
+ l2cap_chan_le_send_credits(chan);
+
+ return err;
+}
+
static int l2cap_le_data_rcv(struct l2cap_chan *chan, struct sk_buff *skb)
{
int err;
@@ -6737,7 +6759,11 @@ static int l2cap_le_data_rcv(struct l2cap_chan *chan, struct sk_buff *skb)
chan->rx_credits--;
BT_DBG("rx_credits %u -> %u", chan->rx_credits + 1, chan->rx_credits);
- l2cap_chan_le_send_credits(chan);
+ /* Update if remote had run out of credits, this should only happens
+ * if the remote is not using the entire MPS.
+ */
+ if (!chan->rx_credits)
+ l2cap_chan_le_send_credits(chan);
err = 0;
@@ -6763,12 +6789,22 @@ static int l2cap_le_data_rcv(struct l2cap_chan *chan, struct sk_buff *skb)
}
if (skb->len == sdu_len)
- return chan->ops->recv(chan, skb);
+ return l2cap_le_recv(chan, skb);
chan->sdu = skb;
chan->sdu_len = sdu_len;
chan->sdu_last_frag = skb;
+ /* Detect if remote is not able to use the selected MPS */
+ if (skb->len + L2CAP_SDULEN_SIZE < chan->mps) {
+ u16 mps_len = skb->len + L2CAP_SDULEN_SIZE;
+
+ /* Adjust the number of credits */
+ BT_DBG("chan->mps %u -> %u", chan->mps, mps_len);
+ chan->mps = mps_len;
+ l2cap_chan_le_send_credits(chan);
+ }
+
return 0;
}
@@ -6785,7 +6821,7 @@ static int l2cap_le_data_rcv(struct l2cap_chan *chan, struct sk_buff *skb)
skb = NULL;
if (chan->sdu->len == chan->sdu_len) {
- err = chan->ops->recv(chan, chan->sdu);
+ err = l2cap_le_recv(chan, chan->sdu);
if (!err) {
chan->sdu = NULL;
chan->sdu_last_frag = NULL;
@@ -7102,7 +7138,6 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid,
case L2CAP_MODE_BASIC:
break;
case L2CAP_MODE_LE_FLOWCTL:
- l2cap_le_flowctl_init(chan);
break;
case L2CAP_MODE_ERTM:
case L2CAP_MODE_STREAMING:
@@ -7645,11 +7680,6 @@ int __init l2cap_init(void)
l2cap_debugfs = debugfs_create_file("l2cap", 0444, bt_debugfs,
NULL, &l2cap_debugfs_fops);
- debugfs_create_u16("l2cap_le_max_credits", 0644, bt_debugfs,
- &le_max_credits);
- debugfs_create_u16("l2cap_le_default_mps", 0644, bt_debugfs,
- &le_default_mps);
-
return 0;
}
diff --git a/net/bluetooth/leds.c b/net/bluetooth/leds.c
index cb670b5594eb..6d59a5023231 100644
--- a/net/bluetooth/leds.c
+++ b/net/bluetooth/leds.c
@@ -43,7 +43,7 @@ void hci_leds_update_powered(struct hci_dev *hdev, bool enabled)
led_trigger_event(bt_power_led_trigger, enabled ? LED_FULL : LED_OFF);
}
-static void power_activate(struct led_classdev *led_cdev)
+static int power_activate(struct led_classdev *led_cdev)
{
struct hci_basic_led_trigger *htrig;
bool powered;
@@ -52,10 +52,12 @@ static void power_activate(struct led_classdev *led_cdev)
powered = test_bit(HCI_UP, &htrig->hdev->flags);
led_trigger_event(led_cdev->trigger, powered ? LED_FULL : LED_OFF);
+
+ return 0;
}
static struct led_trigger *led_allocate_basic(struct hci_dev *hdev,
- void (*activate)(struct led_classdev *led_cdev),
+ int (*activate)(struct led_classdev *led_cdev),
const char *name)
{
struct hci_basic_led_trigger *htrig;
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index 8a80d48d89c4..ccce954f8146 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -617,6 +617,127 @@ static int read_config_info(struct sock *sk, struct hci_dev *hdev,
&rp, sizeof(rp));
}
+static u32 get_supported_phys(struct hci_dev *hdev)
+{
+ u32 supported_phys = 0;
+
+ if (lmp_bredr_capable(hdev)) {
+ supported_phys |= MGMT_PHY_BR_1M_1SLOT;
+
+ if (hdev->features[0][0] & LMP_3SLOT)
+ supported_phys |= MGMT_PHY_BR_1M_3SLOT;
+
+ if (hdev->features[0][0] & LMP_5SLOT)
+ supported_phys |= MGMT_PHY_BR_1M_5SLOT;
+
+ if (lmp_edr_2m_capable(hdev)) {
+ supported_phys |= MGMT_PHY_EDR_2M_1SLOT;
+
+ if (lmp_edr_3slot_capable(hdev))
+ supported_phys |= MGMT_PHY_EDR_2M_3SLOT;
+
+ if (lmp_edr_5slot_capable(hdev))
+ supported_phys |= MGMT_PHY_EDR_2M_5SLOT;
+
+ if (lmp_edr_3m_capable(hdev)) {
+ supported_phys |= MGMT_PHY_EDR_3M_1SLOT;
+
+ if (lmp_edr_3slot_capable(hdev))
+ supported_phys |= MGMT_PHY_EDR_3M_3SLOT;
+
+ if (lmp_edr_5slot_capable(hdev))
+ supported_phys |= MGMT_PHY_EDR_3M_5SLOT;
+ }
+ }
+ }
+
+ if (lmp_le_capable(hdev)) {
+ supported_phys |= MGMT_PHY_LE_1M_TX;
+ supported_phys |= MGMT_PHY_LE_1M_RX;
+
+ if (hdev->le_features[1] & HCI_LE_PHY_2M) {
+ supported_phys |= MGMT_PHY_LE_2M_TX;
+ supported_phys |= MGMT_PHY_LE_2M_RX;
+ }
+
+ if (hdev->le_features[1] & HCI_LE_PHY_CODED) {
+ supported_phys |= MGMT_PHY_LE_CODED_TX;
+ supported_phys |= MGMT_PHY_LE_CODED_RX;
+ }
+ }
+
+ return supported_phys;
+}
+
+static u32 get_selected_phys(struct hci_dev *hdev)
+{
+ u32 selected_phys = 0;
+
+ if (lmp_bredr_capable(hdev)) {
+ selected_phys |= MGMT_PHY_BR_1M_1SLOT;
+
+ if (hdev->pkt_type & (HCI_DM3 | HCI_DH3))
+ selected_phys |= MGMT_PHY_BR_1M_3SLOT;
+
+ if (hdev->pkt_type & (HCI_DM5 | HCI_DH5))
+ selected_phys |= MGMT_PHY_BR_1M_5SLOT;
+
+ if (lmp_edr_2m_capable(hdev)) {
+ if (!(hdev->pkt_type & HCI_2DH1))
+ selected_phys |= MGMT_PHY_EDR_2M_1SLOT;
+
+ if (lmp_edr_3slot_capable(hdev) &&
+ !(hdev->pkt_type & HCI_2DH3))
+ selected_phys |= MGMT_PHY_EDR_2M_3SLOT;
+
+ if (lmp_edr_5slot_capable(hdev) &&
+ !(hdev->pkt_type & HCI_2DH5))
+ selected_phys |= MGMT_PHY_EDR_2M_5SLOT;
+
+ if (lmp_edr_3m_capable(hdev)) {
+ if (!(hdev->pkt_type & HCI_3DH1))
+ selected_phys |= MGMT_PHY_EDR_3M_1SLOT;
+
+ if (lmp_edr_3slot_capable(hdev) &&
+ !(hdev->pkt_type & HCI_3DH3))
+ selected_phys |= MGMT_PHY_EDR_3M_3SLOT;
+
+ if (lmp_edr_5slot_capable(hdev) &&
+ !(hdev->pkt_type & HCI_3DH5))
+ selected_phys |= MGMT_PHY_EDR_3M_5SLOT;
+ }
+ }
+ }
+
+ if (lmp_le_capable(hdev)) {
+ if (hdev->le_tx_def_phys & HCI_LE_SET_PHY_1M)
+ selected_phys |= MGMT_PHY_LE_1M_TX;
+
+ if (hdev->le_rx_def_phys & HCI_LE_SET_PHY_1M)
+ selected_phys |= MGMT_PHY_LE_1M_RX;
+
+ if (hdev->le_tx_def_phys & HCI_LE_SET_PHY_2M)
+ selected_phys |= MGMT_PHY_LE_2M_TX;
+
+ if (hdev->le_rx_def_phys & HCI_LE_SET_PHY_2M)
+ selected_phys |= MGMT_PHY_LE_2M_RX;
+
+ if (hdev->le_tx_def_phys & HCI_LE_SET_PHY_CODED)
+ selected_phys |= MGMT_PHY_LE_CODED_TX;
+
+ if (hdev->le_rx_def_phys & HCI_LE_SET_PHY_CODED)
+ selected_phys |= MGMT_PHY_LE_CODED_RX;
+ }
+
+ return selected_phys;
+}
+
+static u32 get_configurable_phys(struct hci_dev *hdev)
+{
+ return (get_supported_phys(hdev) & ~MGMT_PHY_BR_1M_1SLOT &
+ ~MGMT_PHY_LE_1M_TX & ~MGMT_PHY_LE_1M_RX);
+}
+
static u32 get_supported_settings(struct hci_dev *hdev)
{
u32 settings = 0;
@@ -654,6 +775,8 @@ static u32 get_supported_settings(struct hci_dev *hdev)
hdev->set_bdaddr)
settings |= MGMT_SETTING_CONFIGURATION;
+ settings |= MGMT_SETTING_PHY_CONFIGURATION;
+
return settings;
}
@@ -817,7 +940,10 @@ static void rpa_expired(struct work_struct *work)
* function.
*/
hci_req_init(&req, hdev);
- __hci_req_enable_advertising(&req);
+ if (ext_adv_capable(hdev))
+ __hci_req_start_ext_adv(&req, hdev->cur_adv_instance);
+ else
+ __hci_req_enable_advertising(&req);
hci_req_run(&req, NULL);
}
@@ -1721,10 +1847,17 @@ static void le_enable_complete(struct hci_dev *hdev, u8 status, u16 opcode)
*/
if (hci_dev_test_flag(hdev, HCI_LE_ENABLED)) {
struct hci_request req;
-
hci_req_init(&req, hdev);
- __hci_req_update_adv_data(&req, 0x00);
- __hci_req_update_scan_rsp_data(&req, 0x00);
+ if (ext_adv_capable(hdev)) {
+ int err;
+
+ err = __hci_req_setup_ext_adv_instance(&req, 0x00);
+ if (!err)
+ __hci_req_update_scan_rsp_data(&req, 0x00);
+ } else {
+ __hci_req_update_adv_data(&req, 0x00);
+ __hci_req_update_scan_rsp_data(&req, 0x00);
+ }
hci_req_run(&req, NULL);
hci_update_background_scan(hdev);
}
@@ -1823,6 +1956,9 @@ static int set_le(struct sock *sk, struct hci_dev *hdev, void *data, u16 len)
} else {
if (hci_dev_test_flag(hdev, HCI_LE_ADV))
__hci_req_disable_advertising(&req);
+
+ if (ext_adv_capable(hdev))
+ __hci_req_clear_ext_adv_sets(&req);
}
hci_req_add(&req, HCI_OP_WRITE_LE_HOST_SUPPORTED, sizeof(hci_cp),
@@ -2298,9 +2434,8 @@ static int unpair_device(struct sock *sk, struct hci_dev *hdev, void *data,
/* LE address type */
addr_type = le_addr_type(cp->addr.type);
- hci_remove_irk(hdev, &cp->addr.bdaddr, addr_type);
-
- err = hci_remove_ltk(hdev, &cp->addr.bdaddr, addr_type);
+ /* Abort any ongoing SMP pairing. Removes ltk and irk if they exist. */
+ err = smp_cancel_and_remove_pairing(hdev, &cp->addr.bdaddr, addr_type);
if (err < 0) {
err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_UNPAIR_DEVICE,
MGMT_STATUS_NOT_PAIRED, &rp,
@@ -2314,8 +2449,6 @@ static int unpair_device(struct sock *sk, struct hci_dev *hdev, void *data,
goto done;
}
- /* Abort any ongoing SMP pairing */
- smp_cancel_pairing(conn);
/* Defer clearing up the connection parameters until closing to
* give a chance of keeping them if a repairing happens.
@@ -3184,6 +3317,225 @@ static int set_appearance(struct sock *sk, struct hci_dev *hdev, void *data,
return err;
}
+static int get_phy_configuration(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ struct mgmt_rp_get_phy_confguration rp;
+
+ BT_DBG("sock %p %s", sk, hdev->name);
+
+ hci_dev_lock(hdev);
+
+ memset(&rp, 0, sizeof(rp));
+
+ rp.supported_phys = cpu_to_le32(get_supported_phys(hdev));
+ rp.selected_phys = cpu_to_le32(get_selected_phys(hdev));
+ rp.configurable_phys = cpu_to_le32(get_configurable_phys(hdev));
+
+ hci_dev_unlock(hdev);
+
+ return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_PHY_CONFIGURATION, 0,
+ &rp, sizeof(rp));
+}
+
+int mgmt_phy_configuration_changed(struct hci_dev *hdev, struct sock *skip)
+{
+ struct mgmt_ev_phy_configuration_changed ev;
+
+ memset(&ev, 0, sizeof(ev));
+
+ ev.selected_phys = cpu_to_le32(get_selected_phys(hdev));
+
+ return mgmt_event(MGMT_EV_PHY_CONFIGURATION_CHANGED, hdev, &ev,
+ sizeof(ev), skip);
+}
+
+static void set_default_phy_complete(struct hci_dev *hdev, u8 status,
+ u16 opcode, struct sk_buff *skb)
+{
+ struct mgmt_pending_cmd *cmd;
+
+ BT_DBG("status 0x%02x", status);
+
+ hci_dev_lock(hdev);
+
+ cmd = pending_find(MGMT_OP_SET_PHY_CONFIGURATION, hdev);
+ if (!cmd)
+ goto unlock;
+
+ if (status) {
+ mgmt_cmd_status(cmd->sk, hdev->id,
+ MGMT_OP_SET_PHY_CONFIGURATION,
+ mgmt_status(status));
+ } else {
+ mgmt_cmd_complete(cmd->sk, hdev->id,
+ MGMT_OP_SET_PHY_CONFIGURATION, 0,
+ NULL, 0);
+
+ mgmt_phy_configuration_changed(hdev, cmd->sk);
+ }
+
+ mgmt_pending_remove(cmd);
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static int set_phy_configuration(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ struct mgmt_cp_set_phy_confguration *cp = data;
+ struct hci_cp_le_set_default_phy cp_phy;
+ struct mgmt_pending_cmd *cmd;
+ struct hci_request req;
+ u32 selected_phys, configurable_phys, supported_phys, unconfigure_phys;
+ u16 pkt_type = (HCI_DH1 | HCI_DM1);
+ bool changed = false;
+ int err;
+
+ BT_DBG("sock %p %s", sk, hdev->name);
+
+ configurable_phys = get_configurable_phys(hdev);
+ supported_phys = get_supported_phys(hdev);
+ selected_phys = __le32_to_cpu(cp->selected_phys);
+
+ if (selected_phys & ~supported_phys)
+ return mgmt_cmd_status(sk, hdev->id,
+ MGMT_OP_SET_PHY_CONFIGURATION,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ unconfigure_phys = supported_phys & ~configurable_phys;
+
+ if ((selected_phys & unconfigure_phys) != unconfigure_phys)
+ return mgmt_cmd_status(sk, hdev->id,
+ MGMT_OP_SET_PHY_CONFIGURATION,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ if (selected_phys == get_selected_phys(hdev))
+ return mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_SET_PHY_CONFIGURATION,
+ 0, NULL, 0);
+
+ hci_dev_lock(hdev);
+
+ if (!hdev_is_powered(hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id,
+ MGMT_OP_SET_PHY_CONFIGURATION,
+ MGMT_STATUS_REJECTED);
+ goto unlock;
+ }
+
+ if (pending_find(MGMT_OP_SET_PHY_CONFIGURATION, hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id,
+ MGMT_OP_SET_PHY_CONFIGURATION,
+ MGMT_STATUS_BUSY);
+ goto unlock;
+ }
+
+ if (selected_phys & MGMT_PHY_BR_1M_3SLOT)
+ pkt_type |= (HCI_DH3 | HCI_DM3);
+ else
+ pkt_type &= ~(HCI_DH3 | HCI_DM3);
+
+ if (selected_phys & MGMT_PHY_BR_1M_5SLOT)
+ pkt_type |= (HCI_DH5 | HCI_DM5);
+ else
+ pkt_type &= ~(HCI_DH5 | HCI_DM5);
+
+ if (selected_phys & MGMT_PHY_EDR_2M_1SLOT)
+ pkt_type &= ~HCI_2DH1;
+ else
+ pkt_type |= HCI_2DH1;
+
+ if (selected_phys & MGMT_PHY_EDR_2M_3SLOT)
+ pkt_type &= ~HCI_2DH3;
+ else
+ pkt_type |= HCI_2DH3;
+
+ if (selected_phys & MGMT_PHY_EDR_2M_5SLOT)
+ pkt_type &= ~HCI_2DH5;
+ else
+ pkt_type |= HCI_2DH5;
+
+ if (selected_phys & MGMT_PHY_EDR_3M_1SLOT)
+ pkt_type &= ~HCI_3DH1;
+ else
+ pkt_type |= HCI_3DH1;
+
+ if (selected_phys & MGMT_PHY_EDR_3M_3SLOT)
+ pkt_type &= ~HCI_3DH3;
+ else
+ pkt_type |= HCI_3DH3;
+
+ if (selected_phys & MGMT_PHY_EDR_3M_5SLOT)
+ pkt_type &= ~HCI_3DH5;
+ else
+ pkt_type |= HCI_3DH5;
+
+ if (pkt_type != hdev->pkt_type) {
+ hdev->pkt_type = pkt_type;
+ changed = true;
+ }
+
+ if ((selected_phys & MGMT_PHY_LE_MASK) ==
+ (get_selected_phys(hdev) & MGMT_PHY_LE_MASK)) {
+ if (changed)
+ mgmt_phy_configuration_changed(hdev, sk);
+
+ err = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_SET_PHY_CONFIGURATION,
+ 0, NULL, 0);
+
+ goto unlock;
+ }
+
+ cmd = mgmt_pending_add(sk, MGMT_OP_SET_PHY_CONFIGURATION, hdev, data,
+ len);
+ if (!cmd) {
+ err = -ENOMEM;
+ goto unlock;
+ }
+
+ hci_req_init(&req, hdev);
+
+ memset(&cp_phy, 0, sizeof(cp_phy));
+
+ if (!(selected_phys & MGMT_PHY_LE_TX_MASK))
+ cp_phy.all_phys |= 0x01;
+
+ if (!(selected_phys & MGMT_PHY_LE_RX_MASK))
+ cp_phy.all_phys |= 0x02;
+
+ if (selected_phys & MGMT_PHY_LE_1M_TX)
+ cp_phy.tx_phys |= HCI_LE_SET_PHY_1M;
+
+ if (selected_phys & MGMT_PHY_LE_2M_TX)
+ cp_phy.tx_phys |= HCI_LE_SET_PHY_2M;
+
+ if (selected_phys & MGMT_PHY_LE_CODED_TX)
+ cp_phy.tx_phys |= HCI_LE_SET_PHY_CODED;
+
+ if (selected_phys & MGMT_PHY_LE_1M_RX)
+ cp_phy.rx_phys |= HCI_LE_SET_PHY_1M;
+
+ if (selected_phys & MGMT_PHY_LE_2M_RX)
+ cp_phy.rx_phys |= HCI_LE_SET_PHY_2M;
+
+ if (selected_phys & MGMT_PHY_LE_CODED_RX)
+ cp_phy.rx_phys |= HCI_LE_SET_PHY_CODED;
+
+ hci_req_add(&req, HCI_OP_LE_SET_DEFAULT_PHY, sizeof(cp_phy), &cp_phy);
+
+ err = hci_req_run_skb(&req, set_default_phy_complete);
+ if (err < 0)
+ mgmt_pending_remove(cmd);
+
+unlock:
+ hci_dev_unlock(hdev);
+
+ return err;
+}
+
static void read_local_oob_data_complete(struct hci_dev *hdev, u8 status,
u16 opcode, struct sk_buff *skb)
{
@@ -4037,9 +4389,14 @@ static int set_advertising(struct sock *sk, struct hci_dev *hdev, void *data,
* HCI_ADVERTISING flag is not yet set.
*/
hdev->cur_adv_instance = 0x00;
- __hci_req_update_adv_data(&req, 0x00);
- __hci_req_update_scan_rsp_data(&req, 0x00);
- __hci_req_enable_advertising(&req);
+
+ if (ext_adv_capable(hdev)) {
+ __hci_req_start_ext_adv(&req, 0x00);
+ } else {
+ __hci_req_update_adv_data(&req, 0x00);
+ __hci_req_update_scan_rsp_data(&req, 0x00);
+ __hci_req_enable_advertising(&req);
+ }
} else {
__hci_req_disable_advertising(&req);
}
@@ -4609,6 +4966,7 @@ static int set_privacy(struct sock *sk, struct hci_dev *hdev, void *cp_data,
changed = !hci_dev_test_and_set_flag(hdev, HCI_PRIVACY);
memcpy(hdev->irk, cp->irk, sizeof(hdev->irk));
hci_dev_set_flag(hdev, HCI_RPA_EXPIRED);
+ hci_adv_instances_set_rpa_expired(hdev, true);
if (cp->privacy == 0x02)
hci_dev_set_flag(hdev, HCI_LIMITED_PRIVACY);
else
@@ -4617,6 +4975,7 @@ static int set_privacy(struct sock *sk, struct hci_dev *hdev, void *cp_data,
changed = hci_dev_test_and_clear_flag(hdev, HCI_PRIVACY);
memset(hdev->irk, 0, sizeof(hdev->irk));
hci_dev_clear_flag(hdev, HCI_RPA_EXPIRED);
+ hci_adv_instances_set_rpa_expired(hdev, false);
hci_dev_clear_flag(hdev, HCI_LIMITED_PRIVACY);
}
@@ -5967,9 +6326,23 @@ static u32 get_supported_adv_flags(struct hci_dev *hdev)
flags |= MGMT_ADV_FLAG_APPEARANCE;
flags |= MGMT_ADV_FLAG_LOCAL_NAME;
- if (hdev->adv_tx_power != HCI_TX_POWER_INVALID)
+ /* In extended adv TX_POWER returned from Set Adv Param
+ * will be always valid.
+ */
+ if ((hdev->adv_tx_power != HCI_TX_POWER_INVALID) ||
+ ext_adv_capable(hdev))
flags |= MGMT_ADV_FLAG_TX_POWER;
+ if (ext_adv_capable(hdev)) {
+ flags |= MGMT_ADV_FLAG_SEC_1M;
+
+ if (hdev->le_features[1] & HCI_LE_PHY_2M)
+ flags |= MGMT_ADV_FLAG_SEC_2M;
+
+ if (hdev->le_features[1] & HCI_LE_PHY_CODED)
+ flags |= MGMT_ADV_FLAG_SEC_CODED;
+ }
+
return flags;
}
@@ -6175,7 +6548,7 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev,
struct mgmt_cp_add_advertising *cp = data;
struct mgmt_rp_add_advertising rp;
u32 flags;
- u32 supported_flags;
+ u32 supported_flags, phy_flags;
u8 status;
u16 timeout, duration;
unsigned int prev_instance_cnt = hdev->adv_instance_cnt;
@@ -6205,10 +6578,12 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev,
duration = __le16_to_cpu(cp->duration);
/* The current implementation only supports a subset of the specified
- * flags.
+ * flags. Also need to check mutual exclusiveness of sec flags.
*/
supported_flags = get_supported_adv_flags(hdev);
- if (flags & ~supported_flags)
+ phy_flags = flags & MGMT_ADV_FLAG_SEC_MASK;
+ if (flags & ~supported_flags ||
+ ((phy_flags && (phy_flags ^ (phy_flags & -phy_flags)))))
return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING,
MGMT_STATUS_INVALID_PARAMS);
@@ -6544,6 +6919,8 @@ static const struct hci_mgmt_handler mgmt_handlers[] = {
{ read_ext_controller_info,MGMT_READ_EXT_INFO_SIZE,
HCI_MGMT_UNTRUSTED },
{ set_appearance, MGMT_SET_APPEARANCE_SIZE },
+ { get_phy_configuration, MGMT_GET_PHY_CONFIGURATION_SIZE },
+ { set_phy_configuration, MGMT_SET_PHY_CONFIGURATION_SIZE },
};
void mgmt_index_added(struct hci_dev *hdev)
diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c
index 5e44d842cc5d..0c7d31c6c18c 100644
--- a/net/bluetooth/rfcomm/tty.c
+++ b/net/bluetooth/rfcomm/tty.c
@@ -839,18 +839,6 @@ static int rfcomm_tty_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned l
BT_DBG("TIOCMIWAIT");
break;
- case TIOCGSERIAL:
- BT_ERR("TIOCGSERIAL is not supported");
- return -ENOIOCTLCMD;
-
- case TIOCSSERIAL:
- BT_ERR("TIOCSSERIAL is not supported");
- return -ENOIOCTLCMD;
-
- case TIOCSERGSTRUCT:
- BT_ERR("TIOCSERGSTRUCT is not supported");
- return -ENOIOCTLCMD;
-
case TIOCSERGETLSR:
BT_ERR("TIOCSERGETLSR is not supported");
return -ENOIOCTLCMD;
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index 413b8ee49fec..8f0f9279eac9 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -393,7 +393,8 @@ static void sco_sock_cleanup_listen(struct sock *parent)
*/
static void sco_sock_kill(struct sock *sk)
{
- if (!sock_flag(sk, SOCK_ZAPPED) || sk->sk_socket)
+ if (!sock_flag(sk, SOCK_ZAPPED) || sk->sk_socket ||
+ sock_flag(sk, SOCK_DEAD))
return;
BT_DBG("sk %p state %d", sk, sk->sk_state);
diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c
index ae91e2d40056..a1c1b7e8a45c 100644
--- a/net/bluetooth/smp.c
+++ b/net/bluetooth/smp.c
@@ -83,13 +83,11 @@ enum {
struct smp_dev {
/* Secure Connections OOB data */
+ bool local_oob;
u8 local_pk[64];
u8 local_rand[16];
bool debug_key;
- u8 min_key_size;
- u8 max_key_size;
-
struct crypto_cipher *tfm_aes;
struct crypto_shash *tfm_cmac;
struct crypto_kpp *tfm_ecdh;
@@ -599,6 +597,8 @@ int smp_generate_oob(struct hci_dev *hdev, u8 hash[16], u8 rand[16])
memcpy(rand, smp->local_rand, 16);
+ smp->local_oob = true;
+
return 0;
}
@@ -717,7 +717,7 @@ static void build_pairing_cmd(struct l2cap_conn *conn,
if (rsp == NULL) {
req->io_capability = conn->hcon->io_capability;
req->oob_flag = oob_flag;
- req->max_key_size = SMP_DEV(hdev)->max_key_size;
+ req->max_key_size = hdev->le_max_key_size;
req->init_key_dist = local_dist;
req->resp_key_dist = remote_dist;
req->auth_req = (authreq & AUTH_REQ_MASK(hdev));
@@ -728,7 +728,7 @@ static void build_pairing_cmd(struct l2cap_conn *conn,
rsp->io_capability = conn->hcon->io_capability;
rsp->oob_flag = oob_flag;
- rsp->max_key_size = SMP_DEV(hdev)->max_key_size;
+ rsp->max_key_size = hdev->le_max_key_size;
rsp->init_key_dist = req->init_key_dist & remote_dist;
rsp->resp_key_dist = req->resp_key_dist & local_dist;
rsp->auth_req = (authreq & AUTH_REQ_MASK(hdev));
@@ -742,7 +742,7 @@ static u8 check_enc_key_size(struct l2cap_conn *conn, __u8 max_key_size)
struct hci_dev *hdev = conn->hcon->hdev;
struct smp_chan *smp = chan->data;
- if (max_key_size > SMP_DEV(hdev)->max_key_size ||
+ if (max_key_size > hdev->le_max_key_size ||
max_key_size < SMP_MIN_ENC_KEY_SIZE)
return SMP_ENC_KEY_SIZE;
@@ -1785,7 +1785,7 @@ static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb)
* successfully received our local OOB data - therefore set the
* flag to indicate that local OOB is in use.
*/
- if (req->oob_flag == SMP_OOB_PRESENT)
+ if (req->oob_flag == SMP_OOB_PRESENT && SMP_DEV(hdev)->local_oob)
set_bit(SMP_FLAG_LOCAL_OOB, &smp->flags);
/* SMP over BR/EDR requires special treatment */
@@ -1967,7 +1967,7 @@ static u8 smp_cmd_pairing_rsp(struct l2cap_conn *conn, struct sk_buff *skb)
* successfully received our local OOB data - therefore set the
* flag to indicate that local OOB is in use.
*/
- if (rsp->oob_flag == SMP_OOB_PRESENT)
+ if (rsp->oob_flag == SMP_OOB_PRESENT && SMP_DEV(hdev)->local_oob)
set_bit(SMP_FLAG_LOCAL_OOB, &smp->flags);
smp->prsp[0] = SMP_CMD_PAIRING_RSP;
@@ -2419,30 +2419,51 @@ unlock:
return ret;
}
-void smp_cancel_pairing(struct hci_conn *hcon)
+int smp_cancel_and_remove_pairing(struct hci_dev *hdev, bdaddr_t *bdaddr,
+ u8 addr_type)
{
- struct l2cap_conn *conn = hcon->l2cap_data;
+ struct hci_conn *hcon;
+ struct l2cap_conn *conn;
struct l2cap_chan *chan;
struct smp_chan *smp;
+ int err;
+
+ err = hci_remove_ltk(hdev, bdaddr, addr_type);
+ hci_remove_irk(hdev, bdaddr, addr_type);
+ hcon = hci_conn_hash_lookup_le(hdev, bdaddr, addr_type);
+ if (!hcon)
+ goto done;
+
+ conn = hcon->l2cap_data;
if (!conn)
- return;
+ goto done;
chan = conn->smp;
if (!chan)
- return;
+ goto done;
l2cap_chan_lock(chan);
smp = chan->data;
if (smp) {
+ /* Set keys to NULL to make sure smp_failure() does not try to
+ * remove and free already invalidated rcu list entries. */
+ smp->ltk = NULL;
+ smp->slave_ltk = NULL;
+ smp->remote_irk = NULL;
+
if (test_bit(SMP_FLAG_COMPLETE, &smp->flags))
smp_failure(conn, 0);
else
smp_failure(conn, SMP_UNSPECIFIED);
+ err = 0;
}
l2cap_chan_unlock(chan);
+
+done:
+ return err;
}
static int smp_cmd_encrypt_info(struct l2cap_conn *conn, struct sk_buff *skb)
@@ -2697,7 +2718,13 @@ static int smp_cmd_public_key(struct l2cap_conn *conn, struct sk_buff *skb)
* key was set/generated.
*/
if (test_bit(SMP_FLAG_LOCAL_OOB, &smp->flags)) {
- struct smp_dev *smp_dev = chan->data;
+ struct l2cap_chan *hchan = hdev->smp_data;
+ struct smp_dev *smp_dev;
+
+ if (!hchan || !hchan->data)
+ return SMP_UNSPECIFIED;
+
+ smp_dev = hchan->data;
tfm_ecdh = smp_dev->tfm_ecdh;
} else {
@@ -3230,11 +3257,10 @@ static struct l2cap_chan *smp_add_cid(struct hci_dev *hdev, u16 cid)
return ERR_CAST(tfm_ecdh);
}
+ smp->local_oob = false;
smp->tfm_aes = tfm_aes;
smp->tfm_cmac = tfm_cmac;
smp->tfm_ecdh = tfm_ecdh;
- smp->min_key_size = SMP_MIN_ENC_KEY_SIZE;
- smp->max_key_size = SMP_MAX_ENC_KEY_SIZE;
create_chan:
chan = l2cap_chan_create();
@@ -3360,7 +3386,7 @@ static ssize_t le_min_key_size_read(struct file *file,
struct hci_dev *hdev = file->private_data;
char buf[4];
- snprintf(buf, sizeof(buf), "%2u\n", SMP_DEV(hdev)->min_key_size);
+ snprintf(buf, sizeof(buf), "%2u\n", hdev->le_min_key_size);
return simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf));
}
@@ -3381,11 +3407,11 @@ static ssize_t le_min_key_size_write(struct file *file,
sscanf(buf, "%hhu", &key_size);
- if (key_size > SMP_DEV(hdev)->max_key_size ||
+ if (key_size > hdev->le_max_key_size ||
key_size < SMP_MIN_ENC_KEY_SIZE)
return -EINVAL;
- SMP_DEV(hdev)->min_key_size = key_size;
+ hdev->le_min_key_size = key_size;
return count;
}
@@ -3404,7 +3430,7 @@ static ssize_t le_max_key_size_read(struct file *file,
struct hci_dev *hdev = file->private_data;
char buf[4];
- snprintf(buf, sizeof(buf), "%2u\n", SMP_DEV(hdev)->max_key_size);
+ snprintf(buf, sizeof(buf), "%2u\n", hdev->le_max_key_size);
return simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf));
}
@@ -3426,10 +3452,10 @@ static ssize_t le_max_key_size_write(struct file *file,
sscanf(buf, "%hhu", &key_size);
if (key_size > SMP_MAX_ENC_KEY_SIZE ||
- key_size < SMP_DEV(hdev)->min_key_size)
+ key_size < hdev->le_min_key_size)
return -EINVAL;
- SMP_DEV(hdev)->max_key_size = key_size;
+ hdev->le_max_key_size = key_size;
return count;
}
diff --git a/net/bluetooth/smp.h b/net/bluetooth/smp.h
index 0ff6247eaa6c..121edadd5f8d 100644
--- a/net/bluetooth/smp.h
+++ b/net/bluetooth/smp.h
@@ -181,7 +181,8 @@ enum smp_key_pref {
};
/* SMP Commands */
-void smp_cancel_pairing(struct hci_conn *hcon);
+int smp_cancel_and_remove_pairing(struct hci_dev *hdev, bdaddr_t *bdaddr,
+ u8 addr_type);
bool smp_sufficient_security(struct hci_conn *hcon, u8 sec_level,
enum smp_key_pref key_pref);
int smp_conn_security(struct hci_conn *hcon, __u8 sec_level);
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 22a78eedf4b1..c89c22c49015 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -10,13 +10,17 @@
#include <linux/etherdevice.h>
#include <linux/filter.h>
#include <linux/sched/signal.h>
+#include <net/sock.h>
+#include <net/tcp.h>
-static __always_inline u32 bpf_test_run_one(struct bpf_prog *prog, void *ctx)
+static __always_inline u32 bpf_test_run_one(struct bpf_prog *prog, void *ctx,
+ struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE])
{
u32 ret;
preempt_disable();
rcu_read_lock();
+ bpf_cgroup_storage_set(storage);
ret = BPF_PROG_RUN(prog, ctx);
rcu_read_unlock();
preempt_enable();
@@ -26,14 +30,26 @@ static __always_inline u32 bpf_test_run_one(struct bpf_prog *prog, void *ctx)
static u32 bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *time)
{
+ struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = { 0 };
+ enum bpf_cgroup_storage_type stype;
u64 time_start, time_spent = 0;
u32 ret = 0, i;
+ for_each_cgroup_storage_type(stype) {
+ storage[stype] = bpf_cgroup_storage_alloc(prog, stype);
+ if (IS_ERR(storage[stype])) {
+ storage[stype] = NULL;
+ for_each_cgroup_storage_type(stype)
+ bpf_cgroup_storage_free(storage[stype]);
+ return -ENOMEM;
+ }
+ }
+
if (!repeat)
repeat = 1;
time_start = ktime_get_ns();
for (i = 0; i < repeat; i++) {
- ret = bpf_test_run_one(prog, ctx);
+ ret = bpf_test_run_one(prog, ctx, storage);
if (need_resched()) {
if (signal_pending(current))
break;
@@ -46,6 +62,9 @@ static u32 bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *time)
do_div(time_spent, repeat);
*time = time_spent > U32_MAX ? U32_MAX : (u32)time_spent;
+ for_each_cgroup_storage_type(stype)
+ bpf_cgroup_storage_free(storage[stype]);
+
return ret;
}
@@ -98,6 +117,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
u32 retval, duration;
int hh_len = ETH_HLEN;
struct sk_buff *skb;
+ struct sock *sk;
void *data;
int ret;
@@ -120,11 +140,21 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
break;
}
+ sk = kzalloc(sizeof(struct sock), GFP_USER);
+ if (!sk) {
+ kfree(data);
+ return -ENOMEM;
+ }
+ sock_net_set(sk, current->nsproxy->net_ns);
+ sock_init_data(NULL, sk);
+
skb = build_skb(data, 0);
if (!skb) {
kfree(data);
+ kfree(sk);
return -ENOMEM;
}
+ skb->sk = sk;
skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
__skb_put(skb, size);
@@ -142,6 +172,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
if (pskb_expand_head(skb, nhead, 0, GFP_USER)) {
kfree_skb(skb);
+ kfree(sk);
return -ENOMEM;
}
}
@@ -154,6 +185,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
size = skb_headlen(skb);
ret = bpf_test_finish(kattr, uattr, skb->data, size, retval, duration);
kfree_skb(skb);
+ kfree(sk);
return ret;
}
diff --git a/net/bpfilter/Kconfig b/net/bpfilter/Kconfig
index 76deb6615883..e558b46596c4 100644
--- a/net/bpfilter/Kconfig
+++ b/net/bpfilter/Kconfig
@@ -13,4 +13,3 @@ config BPFILTER_UMH
help
This builds bpfilter kernel module with embedded user mode helper
endif
-
diff --git a/net/bpfilter/Makefile b/net/bpfilter/Makefile
index 39c6980b5d99..0947ee7f70d5 100644
--- a/net/bpfilter/Makefile
+++ b/net/bpfilter/Makefile
@@ -5,14 +5,14 @@
hostprogs-y := bpfilter_umh
bpfilter_umh-objs := main.o
-HOSTCFLAGS += -I. -Itools/include/ -Itools/include/uapi
+KBUILD_HOSTCFLAGS += -I. -Itools/include/ -Itools/include/uapi
HOSTCC := $(CC)
ifeq ($(CONFIG_BPFILTER_UMH), y)
# builtin bpfilter_umh should be compiled with -static
# since rootfs isn't mounted at the time of __init
# function is called and do_execv won't find elf interpreter
-HOSTLDFLAGS += -static
+KBUILD_HOSTLDFLAGS += -static
endif
$(obj)/bpfilter_umh_blob.o: $(obj)/bpfilter_umh
diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c
index f0fc182d3db7..7acfc83087d5 100644
--- a/net/bpfilter/bpfilter_kern.c
+++ b/net/bpfilter/bpfilter_kern.c
@@ -23,9 +23,11 @@ static void shutdown_umh(struct umh_info *info)
if (!info->pid)
return;
- tsk = pid_task(find_vpid(info->pid), PIDTYPE_PID);
- if (tsk)
+ tsk = get_pid_task(find_vpid(info->pid), PIDTYPE_PID);
+ if (tsk) {
force_sig(SIGKILL, tsk);
+ put_task_struct(tsk);
+ }
fput(info->pipe_to_umh);
fput(info->pipe_from_umh);
info->pid = 0;
@@ -59,7 +61,7 @@ static int __bpfilter_process_sockopt(struct sock *sk, int optname,
req.is_set = is_set;
req.pid = current->pid;
req.cmd = optname;
- req.addr = (long)optval;
+ req.addr = (long __force __user)optval;
req.len = optlen;
mutex_lock(&bpfilter_lock);
if (!info.pid)
@@ -90,6 +92,7 @@ static int __init load_umh(void)
int err;
/* fork usermode process */
+ info.cmdline = "bpfilter_umh";
err = fork_usermode_blob(&bpfilter_umh_start,
&bpfilter_umh_end - &bpfilter_umh_start,
&info);
@@ -98,7 +101,7 @@ static int __init load_umh(void)
pr_info("Loaded bpfilter_umh pid %d\n", info.pid);
/* health check that usermode process started correctly */
- if (__bpfilter_process_sockopt(NULL, 0, 0, 0, 0) != 0) {
+ if (__bpfilter_process_sockopt(NULL, 0, NULL, 0, 0) != 0) {
stop_umh();
return -EFAULT;
}
diff --git a/net/bridge/Kconfig b/net/bridge/Kconfig
index aa0d3b2f1bb7..3625d6ade45c 100644
--- a/net/bridge/Kconfig
+++ b/net/bridge/Kconfig
@@ -17,7 +17,7 @@ config BRIDGE
other third party bridge products.
In order to use the Ethernet bridge, you'll need the bridge
- configuration tools; see <file:Documentation/networking/bridge.txt>
+ configuration tools; see <file:Documentation/networking/bridge.rst>
for location. Please read the Bridge mini-HOWTO for more
information.
diff --git a/net/bridge/br.c b/net/bridge/br.c
index b0a0b82e2d91..360ad66c21e9 100644
--- a/net/bridge/br.c
+++ b/net/bridge/br.c
@@ -151,7 +151,7 @@ static int br_switchdev_event(struct notifier_block *unused,
break;
}
br_fdb_offloaded_set(br, p, fdb_info->addr,
- fdb_info->vid);
+ fdb_info->vid, true);
break;
case SWITCHDEV_FDB_DEL_TO_BRIDGE:
fdb_info = ptr;
@@ -163,7 +163,7 @@ static int br_switchdev_event(struct notifier_block *unused,
case SWITCHDEV_FDB_OFFLOADED:
fdb_info = ptr;
br_fdb_offloaded_set(br, p, fdb_info->addr,
- fdb_info->vid);
+ fdb_info->vid, fdb_info->offloaded);
break;
}
@@ -175,6 +175,22 @@ static struct notifier_block br_switchdev_notifier = {
.notifier_call = br_switchdev_event,
};
+void br_opt_toggle(struct net_bridge *br, enum net_bridge_opts opt, bool on)
+{
+ bool cur = !!br_opt_get(br, opt);
+
+ br_debug(br, "toggle option: %d state: %d -> %d\n",
+ opt, cur, on);
+
+ if (cur == on)
+ return;
+
+ if (on)
+ set_bit(opt, &br->options);
+ else
+ clear_bit(opt, &br->options);
+}
+
static void __net_exit br_net_exit(struct net *net)
{
struct net_device *dev;
diff --git a/net/bridge/br_arp_nd_proxy.c b/net/bridge/br_arp_nd_proxy.c
index 2cf7716254be..6b78e6351719 100644
--- a/net/bridge/br_arp_nd_proxy.c
+++ b/net/bridge/br_arp_nd_proxy.c
@@ -39,7 +39,7 @@ void br_recalculate_neigh_suppress_enabled(struct net_bridge *br)
}
}
- br->neigh_suppress_enabled = neigh_suppress;
+ br_opt_toggle(br, BROPT_NEIGH_SUPPRESS_ENABLED, neigh_suppress);
}
#if IS_ENABLED(CONFIG_INET)
@@ -155,7 +155,7 @@ void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br,
ipv4_is_multicast(tip))
return;
- if (br->neigh_suppress_enabled) {
+ if (br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED)) {
if (p && (p->flags & BR_NEIGH_SUPPRESS))
return;
if (ipv4_is_zeronet(sip) || sip == tip) {
@@ -175,7 +175,8 @@ void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br,
return;
}
- if (br->neigh_suppress_enabled && br_is_local_ip(vlandev, tip)) {
+ if (br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED) &&
+ br_is_local_ip(vlandev, tip)) {
/* its our local ip, so don't proxy reply
* and don't forward to neigh suppress ports
*/
@@ -213,7 +214,8 @@ void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br,
/* If we have replied or as long as we know the
* mac, indicate to arp replied
*/
- if (replied || br->neigh_suppress_enabled)
+ if (replied ||
+ br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED))
BR_INPUT_SKB_CB(skb)->proxyarp_replied = true;
}
@@ -311,7 +313,7 @@ static void br_nd_send(struct net_bridge *br, struct net_bridge_port *p,
/* Neighbor Advertisement */
memset(na, 0, sizeof(*na) + na_olen);
na->icmph.icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT;
- na->icmph.icmp6_router = 0; /* XXX: should be 1 ? */
+ na->icmph.icmp6_router = (n->flags & NTF_ROUTER) ? 1 : 0;
na->icmph.icmp6_override = 1;
na->icmph.icmp6_solicited = 1;
na->target = ns->target;
@@ -460,7 +462,8 @@ void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br,
* mac, indicate to NEIGH_SUPPRESS ports that we
* have replied
*/
- if (replied || br->neigh_suppress_enabled)
+ if (replied ||
+ br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED))
BR_INPUT_SKB_CB(skb)->proxyarp_replied = true;
}
neigh_release(n);
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index e682a668ce57..c6abf927f0c9 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -67,11 +67,11 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
if (IS_ENABLED(CONFIG_INET) &&
(eth->h_proto == htons(ETH_P_ARP) ||
eth->h_proto == htons(ETH_P_RARP)) &&
- br->neigh_suppress_enabled) {
+ br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED)) {
br_do_proxy_suppress_arp(skb, br, vid, NULL);
} else if (IS_ENABLED(CONFIG_IPV6) &&
skb->protocol == htons(ETH_P_IPV6) &&
- br->neigh_suppress_enabled &&
+ br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED) &&
pskb_may_pull(skb, sizeof(struct ipv6hdr) +
sizeof(struct nd_msg)) &&
ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) {
@@ -228,7 +228,7 @@ static int br_change_mtu(struct net_device *dev, int new_mtu)
dev->mtu = new_mtu;
/* this flag will be cleared if the MTU was automatically adjusted */
- br->mtu_set_by_user = true;
+ br_opt_toggle(br, BROPT_MTU_SET_BY_USER, true);
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
/* remember the MTU in the rtable for PMTU */
dst_metric_set(&br->fake_rtable.dst, RTAX_MTU, new_mtu);
@@ -344,7 +344,7 @@ void br_netpoll_disable(struct net_bridge_port *p)
p->np = NULL;
- __netpoll_free_async(np);
+ __netpoll_free(np);
}
#endif
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index 502f66349530..e56ba3912a90 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -504,6 +504,7 @@ static struct net_bridge_fdb_entry *fdb_create(struct net_bridge *br,
fdb->added_by_user = 0;
fdb->added_by_external_learn = 0;
fdb->offloaded = 0;
+ fdb->is_sticky = 0;
fdb->updated = fdb->used = jiffies;
if (rhashtable_lookup_insert_fast(&br->fdb_hash_tbl,
&fdb->rhnode,
@@ -584,7 +585,7 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
unsigned long now = jiffies;
/* fastpath: update of existing entry */
- if (unlikely(source != fdb->dst)) {
+ if (unlikely(source != fdb->dst && !fdb->is_sticky)) {
fdb->dst = source;
fdb_modified = true;
/* Take over HW learned entry */
@@ -656,6 +657,8 @@ static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br,
ndm->ndm_flags |= NTF_OFFLOADED;
if (fdb->added_by_external_learn)
ndm->ndm_flags |= NTF_EXT_LEARNED;
+ if (fdb->is_sticky)
+ ndm->ndm_flags |= NTF_STICKY;
if (nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->key.addr))
goto nla_put_failure;
@@ -772,8 +775,10 @@ skip:
/* Update (create or replace) forwarding database entry */
static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source,
- const __u8 *addr, __u16 state, __u16 flags, __u16 vid)
+ const u8 *addr, u16 state, u16 flags, u16 vid,
+ u8 ndm_flags)
{
+ u8 is_sticky = !!(ndm_flags & NTF_STICKY);
struct net_bridge_fdb_entry *fdb;
bool modified = false;
@@ -789,6 +794,9 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source,
return -EINVAL;
}
+ if (is_sticky && (state & NUD_PERMANENT))
+ return -EINVAL;
+
fdb = br_fdb_find(br, addr, vid);
if (fdb == NULL) {
if (!(flags & NLM_F_CREATE))
@@ -832,6 +840,12 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source,
modified = true;
}
+
+ if (is_sticky != fdb->is_sticky) {
+ fdb->is_sticky = is_sticky;
+ modified = true;
+ }
+
fdb->added_by_user = 1;
fdb->used = jiffies;
@@ -865,7 +879,7 @@ static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br,
} else {
spin_lock_bh(&br->hash_lock);
err = fdb_add_entry(br, p, addr, ndm->ndm_state,
- nlh_flags, vid);
+ nlh_flags, vid, ndm->ndm_flags);
spin_unlock_bh(&br->hash_lock);
}
@@ -1138,7 +1152,7 @@ int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p,
}
void br_fdb_offloaded_set(struct net_bridge *br, struct net_bridge_port *p,
- const unsigned char *addr, u16 vid)
+ const unsigned char *addr, u16 vid, bool offloaded)
{
struct net_bridge_fdb_entry *fdb;
@@ -1146,7 +1160,7 @@ void br_fdb_offloaded_set(struct net_bridge *br, struct net_bridge_port *p,
fdb = br_fdb_find(br, addr, vid);
if (fdb)
- fdb->offloaded = 1;
+ fdb->offloaded = offloaded;
spin_unlock_bh(&br->hash_lock);
}
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index 9019f326fe81..5372e2042adf 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -142,7 +142,20 @@ static int deliver_clone(const struct net_bridge_port *prev,
void br_forward(const struct net_bridge_port *to,
struct sk_buff *skb, bool local_rcv, bool local_orig)
{
- if (to && should_deliver(to, skb)) {
+ if (unlikely(!to))
+ goto out;
+
+ /* redirect to backup link if the destination port is down */
+ if (rcu_access_pointer(to->backup_port) && !netif_carrier_ok(to->dev)) {
+ struct net_bridge_port *backup_port;
+
+ backup_port = rcu_dereference(to->backup_port);
+ if (unlikely(!backup_port))
+ goto out;
+ to = backup_port;
+ }
+
+ if (should_deliver(to, skb)) {
if (local_rcv)
deliver_clone(to, skb, local_orig);
else
@@ -150,6 +163,7 @@ void br_forward(const struct net_bridge_port *to,
return;
}
+out:
if (!local_rcv)
kfree_skb(skb);
}
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 05e42d86882d..9b46d2dc4c22 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -26,6 +26,7 @@
#include <net/sock.h>
#include <linux/if_vlan.h>
#include <net/switchdev.h>
+#include <net/net_namespace.h>
#include "br_private.h"
@@ -169,6 +170,58 @@ void br_manage_promisc(struct net_bridge *br)
}
}
+int nbp_backup_change(struct net_bridge_port *p,
+ struct net_device *backup_dev)
+{
+ struct net_bridge_port *old_backup = rtnl_dereference(p->backup_port);
+ struct net_bridge_port *backup_p = NULL;
+
+ ASSERT_RTNL();
+
+ if (backup_dev) {
+ if (!br_port_exists(backup_dev))
+ return -ENOENT;
+
+ backup_p = br_port_get_rtnl(backup_dev);
+ if (backup_p->br != p->br)
+ return -EINVAL;
+ }
+
+ if (p == backup_p)
+ return -EINVAL;
+
+ if (old_backup == backup_p)
+ return 0;
+
+ /* if the backup link is already set, clear it */
+ if (old_backup)
+ old_backup->backup_redirected_cnt--;
+
+ if (backup_p)
+ backup_p->backup_redirected_cnt++;
+ rcu_assign_pointer(p->backup_port, backup_p);
+
+ return 0;
+}
+
+static void nbp_backup_clear(struct net_bridge_port *p)
+{
+ nbp_backup_change(p, NULL);
+ if (p->backup_redirected_cnt) {
+ struct net_bridge_port *cur_p;
+
+ list_for_each_entry(cur_p, &p->br->port_list, list) {
+ struct net_bridge_port *backup_p;
+
+ backup_p = rtnl_dereference(cur_p->backup_port);
+ if (backup_p == p)
+ nbp_backup_change(cur_p, NULL);
+ }
+ }
+
+ WARN_ON(rcu_access_pointer(p->backup_port) || p->backup_redirected_cnt);
+}
+
static void nbp_update_port_count(struct net_bridge *br)
{
struct net_bridge_port *p;
@@ -204,11 +257,19 @@ static void release_nbp(struct kobject *kobj)
kfree(p);
}
+static void brport_get_ownership(struct kobject *kobj, kuid_t *uid, kgid_t *gid)
+{
+ struct net_bridge_port *p = kobj_to_brport(kobj);
+
+ net_ns_get_ownership(dev_net(p->dev), uid, gid);
+}
+
static struct kobj_type brport_ktype = {
#ifdef CONFIG_SYSFS
.sysfs_ops = &brport_sysfs_ops,
#endif
.release = release_nbp,
+ .get_ownership = brport_get_ownership,
};
static void destroy_nbp(struct net_bridge_port *p)
@@ -286,6 +347,7 @@ static void del_nbp(struct net_bridge_port *p)
nbp_vlan_flush(p);
br_fdb_delete_by_port(br, p, 0, 1);
switchdev_deferred_process();
+ nbp_backup_clear(p);
nbp_update_port_count(br);
@@ -332,8 +394,7 @@ static int find_portno(struct net_bridge *br)
struct net_bridge_port *p;
unsigned long *inuse;
- inuse = kcalloc(BITS_TO_LONGS(BR_MAX_PORTS), sizeof(unsigned long),
- GFP_KERNEL);
+ inuse = bitmap_zalloc(BR_MAX_PORTS, GFP_KERNEL);
if (!inuse)
return -ENOMEM;
@@ -342,7 +403,7 @@ static int find_portno(struct net_bridge *br)
set_bit(p->port_no, inuse);
}
index = find_first_zero_bit(inuse, BR_MAX_PORTS);
- kfree(inuse);
+ bitmap_free(inuse);
return (index >= BR_MAX_PORTS) ? -EXFULL : index;
}
@@ -447,14 +508,14 @@ void br_mtu_auto_adjust(struct net_bridge *br)
ASSERT_RTNL();
/* if the bridge MTU was manually configured don't mess with it */
- if (br->mtu_set_by_user)
+ if (br_opt_get(br, BROPT_MTU_SET_BY_USER))
return;
/* change to the minimum MTU and clear the flag which was set by
* the bridge ndo_change_mtu callback
*/
dev_set_mtu(br->dev, br_mtu_min(br));
- br->mtu_set_by_user = false;
+ br_opt_toggle(br, BROPT_MTU_SET_BY_USER, false);
}
static void br_set_gso_limits(struct net_bridge *br)
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 72074276c088..3ddca11f44c2 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -122,7 +122,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
br_do_proxy_suppress_arp(skb, br, vid, p);
} else if (IS_ENABLED(CONFIG_IPV6) &&
skb->protocol == htons(ETH_P_IPV6) &&
- br->neigh_suppress_enabled &&
+ br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED) &&
pskb_may_pull(skb, sizeof(struct ipv6hdr) +
sizeof(struct nd_msg)) &&
ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) {
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index 6d9f48bd374a..a7ea2d431714 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -84,7 +84,7 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
int i, err = 0;
int idx = 0, s_idx = cb->args[1];
- if (br->multicast_disabled)
+ if (!br_opt_get(br, BROPT_MULTICAST_ENABLED))
return 0;
mdb = rcu_dereference(br->mdb);
@@ -162,6 +162,29 @@ out:
return err;
}
+static int br_mdb_valid_dump_req(const struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct br_port_msg *bpm;
+
+ if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*bpm))) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid header for mdb dump request");
+ return -EINVAL;
+ }
+
+ bpm = nlmsg_data(nlh);
+ if (bpm->ifindex) {
+ NL_SET_ERR_MSG_MOD(extack, "Filtering by device index is not supported for mdb dump request");
+ return -EINVAL;
+ }
+ if (nlmsg_attrlen(nlh, sizeof(*bpm))) {
+ NL_SET_ERR_MSG(extack, "Invalid data after header in mdb dump request");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int br_mdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
struct net_device *dev;
@@ -169,6 +192,13 @@ static int br_mdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
struct nlmsghdr *nlh = NULL;
int idx = 0, s_idx;
+ if (cb->strict_check) {
+ int err = br_mdb_valid_dump_req(cb->nlh, cb->extack);
+
+ if (err < 0)
+ return err;
+ }
+
s_idx = cb->args[0];
rcu_read_lock();
@@ -598,7 +628,7 @@ static int __br_mdb_add(struct net *net, struct net_bridge *br,
struct net_bridge_port *p;
int ret;
- if (!netif_running(br->dev) || br->multicast_disabled)
+ if (!netif_running(br->dev) || !br_opt_get(br, BROPT_MULTICAST_ENABLED))
return -EINVAL;
dev = __dev_get_by_index(net, entry->ifindex);
@@ -673,7 +703,7 @@ static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry)
struct br_ip ip;
int err = -EINVAL;
- if (!netif_running(br->dev) || br->multicast_disabled)
+ if (!netif_running(br->dev) || !br_opt_get(br, BROPT_MULTICAST_ENABLED))
return -EINVAL;
__mdb_entry_to_br_ip(entry, &ip);
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 920665dd92db..024139b51d3a 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -158,7 +158,7 @@ struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br,
struct net_bridge_mdb_htable *mdb = rcu_dereference(br->mdb);
struct br_ip ip;
- if (br->multicast_disabled)
+ if (!br_opt_get(br, BROPT_MULTICAST_ENABLED))
return NULL;
if (BR_INPUT_SKB_CB(skb)->igmp)
@@ -411,7 +411,7 @@ static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br,
iph->frag_off = htons(IP_DF);
iph->ttl = 1;
iph->protocol = IPPROTO_IGMP;
- iph->saddr = br->multicast_query_use_ifaddr ?
+ iph->saddr = br_opt_get(br, BROPT_MULTICAST_QUERY_USE_IFADDR) ?
inet_select_addr(br->dev, 0, RT_SCOPE_LINK) : 0;
iph->daddr = htonl(INADDR_ALLHOSTS_GROUP);
((u8 *)&iph[1])[0] = IPOPT_RA;
@@ -503,11 +503,11 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br,
if (ipv6_dev_get_saddr(dev_net(br->dev), br->dev, &ip6h->daddr, 0,
&ip6h->saddr)) {
kfree_skb(skb);
- br->has_ipv6_addr = 0;
+ br_opt_toggle(br, BROPT_HAS_IPV6_ADDR, false);
return NULL;
}
- br->has_ipv6_addr = 1;
+ br_opt_toggle(br, BROPT_HAS_IPV6_ADDR, true);
ipv6_eth_mc_map(&ip6h->daddr, eth->h_dest);
hopopt = (u8 *)(ip6h + 1);
@@ -628,7 +628,7 @@ static struct net_bridge_mdb_entry *br_multicast_get_group(
port ? port->dev->name : br->dev->name);
err = -E2BIG;
disable:
- br->multicast_disabled = 1;
+ br_opt_toggle(br, BROPT_MULTICAST_ENABLED, false);
goto err;
}
}
@@ -894,7 +894,7 @@ static void br_multicast_querier_expired(struct net_bridge *br,
struct bridge_mcast_own_query *query)
{
spin_lock(&br->multicast_lock);
- if (!netif_running(br->dev) || br->multicast_disabled)
+ if (!netif_running(br->dev) || !br_opt_get(br, BROPT_MULTICAST_ENABLED))
goto out;
br_multicast_start_querier(br, query);
@@ -965,8 +965,9 @@ static void br_multicast_send_query(struct net_bridge *br,
struct br_ip br_group;
unsigned long time;
- if (!netif_running(br->dev) || br->multicast_disabled ||
- !br->multicast_querier)
+ if (!netif_running(br->dev) ||
+ !br_opt_get(br, BROPT_MULTICAST_ENABLED) ||
+ !br_opt_get(br, BROPT_MULTICAST_QUERIER))
return;
memset(&br_group.u, 0, sizeof(br_group.u));
@@ -1036,7 +1037,7 @@ static void br_mc_disabled_update(struct net_device *dev, bool value)
.orig_dev = dev,
.id = SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED,
.flags = SWITCHDEV_F_DEFER,
- .u.mc_disabled = value,
+ .u.mc_disabled = !value,
};
switchdev_port_attr_set(dev, &attr);
@@ -1054,7 +1055,8 @@ int br_multicast_add_port(struct net_bridge_port *port)
timer_setup(&port->ip6_own_query.timer,
br_ip6_multicast_port_query_expired, 0);
#endif
- br_mc_disabled_update(port->dev, port->br->multicast_disabled);
+ br_mc_disabled_update(port->dev,
+ br_opt_get(port->br, BROPT_MULTICAST_ENABLED));
port->mcast_stats = netdev_alloc_pcpu_stats(struct bridge_mcast_stats);
if (!port->mcast_stats)
@@ -1091,7 +1093,7 @@ static void __br_multicast_enable_port(struct net_bridge_port *port)
{
struct net_bridge *br = port->br;
- if (br->multicast_disabled || !netif_running(br->dev))
+ if (!br_opt_get(br, BROPT_MULTICAST_ENABLED) || !netif_running(br->dev))
return;
br_multicast_enable(&port->ip4_own_query);
@@ -1423,10 +1425,10 @@ static void br_multicast_query_received(struct net_bridge *br,
br_multicast_mark_router(br, port);
}
-static int br_ip4_multicast_query(struct net_bridge *br,
- struct net_bridge_port *port,
- struct sk_buff *skb,
- u16 vid)
+static void br_ip4_multicast_query(struct net_bridge *br,
+ struct net_bridge_port *port,
+ struct sk_buff *skb,
+ u16 vid)
{
const struct iphdr *iph = ip_hdr(skb);
struct igmphdr *ih = igmp_hdr(skb);
@@ -1439,7 +1441,6 @@ static int br_ip4_multicast_query(struct net_bridge *br,
unsigned long now = jiffies;
unsigned int offset = skb_transport_offset(skb);
__be32 group;
- int err = 0;
spin_lock(&br->multicast_lock);
if (!netif_running(br->dev) ||
@@ -1498,7 +1499,6 @@ static int br_ip4_multicast_query(struct net_bridge *br,
out:
spin_unlock(&br->multicast_lock);
- return err;
}
#if IS_ENABLED(CONFIG_IPV6)
@@ -1636,7 +1636,7 @@ br_multicast_leave_group(struct net_bridge *br,
if (timer_pending(&other_query->timer))
goto out;
- if (br->multicast_querier) {
+ if (br_opt_get(br, BROPT_MULTICAST_QUERIER)) {
__br_multicast_send_query(br, port, &mp->addr);
time = jiffies + br->multicast_last_member_count *
@@ -1748,7 +1748,7 @@ static void br_multicast_err_count(const struct net_bridge *br,
struct bridge_mcast_stats __percpu *stats;
struct bridge_mcast_stats *pstats;
- if (!br->multicast_stats_enabled)
+ if (!br_opt_get(br, BROPT_MULTICAST_STATS_ENABLED))
return;
if (p)
@@ -1828,7 +1828,7 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br,
err = br_ip4_multicast_igmp3_report(br, port, skb_trimmed, vid);
break;
case IGMP_HOST_MEMBERSHIP_QUERY:
- err = br_ip4_multicast_query(br, port, skb_trimmed, vid);
+ br_ip4_multicast_query(br, port, skb_trimmed, vid);
break;
case IGMP_HOST_LEAVE_MESSAGE:
br_ip4_multicast_leave_group(br, port, ih->group, vid, src);
@@ -1906,7 +1906,7 @@ int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port,
BR_INPUT_SKB_CB(skb)->igmp = 0;
BR_INPUT_SKB_CB(skb)->mrouters_only = 0;
- if (br->multicast_disabled)
+ if (!br_opt_get(br, BROPT_MULTICAST_ENABLED))
return 0;
switch (skb->protocol) {
@@ -1958,8 +1958,6 @@ void br_multicast_init(struct net_bridge *br)
br->hash_max = 512;
br->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
- br->multicast_querier = 0;
- br->multicast_query_use_ifaddr = 0;
br->multicast_last_member_count = 2;
br->multicast_startup_query_count = 2;
@@ -1978,7 +1976,8 @@ void br_multicast_init(struct net_bridge *br)
br->ip6_other_query.delay_time = 0;
br->ip6_querier.port = NULL;
#endif
- br->has_ipv6_addr = 1;
+ br_opt_toggle(br, BROPT_MULTICAST_ENABLED, true);
+ br_opt_toggle(br, BROPT_HAS_IPV6_ADDR, true);
spin_lock_init(&br->multicast_lock);
timer_setup(&br->multicast_router_timer,
@@ -2000,7 +1999,7 @@ static void __br_multicast_open(struct net_bridge *br,
{
query->startup_sent = 0;
- if (br->multicast_disabled)
+ if (!br_opt_get(br, BROPT_MULTICAST_ENABLED))
return;
mod_timer(&query->timer, jiffies);
@@ -2175,12 +2174,12 @@ int br_multicast_toggle(struct net_bridge *br, unsigned long val)
int err = 0;
spin_lock_bh(&br->multicast_lock);
- if (br->multicast_disabled == !val)
+ if (!!br_opt_get(br, BROPT_MULTICAST_ENABLED) == !!val)
goto unlock;
- br_mc_disabled_update(br->dev, !val);
- br->multicast_disabled = !val;
- if (br->multicast_disabled)
+ br_mc_disabled_update(br->dev, val);
+ br_opt_toggle(br, BROPT_MULTICAST_ENABLED, !!val);
+ if (!br_opt_get(br, BROPT_MULTICAST_ENABLED))
goto unlock;
if (!netif_running(br->dev))
@@ -2191,7 +2190,7 @@ int br_multicast_toggle(struct net_bridge *br, unsigned long val)
if (mdb->old) {
err = -EEXIST;
rollback:
- br->multicast_disabled = !!val;
+ br_opt_toggle(br, BROPT_MULTICAST_ENABLED, false);
goto unlock;
}
@@ -2215,7 +2214,7 @@ bool br_multicast_enabled(const struct net_device *dev)
{
struct net_bridge *br = netdev_priv(dev);
- return !br->multicast_disabled;
+ return !!br_opt_get(br, BROPT_MULTICAST_ENABLED);
}
EXPORT_SYMBOL_GPL(br_multicast_enabled);
@@ -2238,10 +2237,10 @@ int br_multicast_set_querier(struct net_bridge *br, unsigned long val)
val = !!val;
spin_lock_bh(&br->multicast_lock);
- if (br->multicast_querier == val)
+ if (br_opt_get(br, BROPT_MULTICAST_QUERIER) == val)
goto unlock;
- br->multicast_querier = val;
+ br_opt_toggle(br, BROPT_MULTICAST_QUERIER, !!val);
if (!val)
goto unlock;
@@ -2562,7 +2561,7 @@ void br_multicast_count(struct net_bridge *br, const struct net_bridge_port *p,
struct bridge_mcast_stats __percpu *stats;
/* if multicast_disabled is true then igmp type can't be set */
- if (!type || !br->multicast_stats_enabled)
+ if (!type || !br_opt_get(br, BROPT_MULTICAST_STATS_ENABLED))
return;
if (p)
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index 9b16eaf33819..b1b5e8516724 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -26,6 +26,7 @@
#include <linux/if_pppox.h>
#include <linux/ppp_defs.h>
#include <linux/netfilter_bridge.h>
+#include <uapi/linux/netfilter_bridge.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_ipv6.h>
#include <linux/netfilter_arp.h>
@@ -486,14 +487,15 @@ static unsigned int br_nf_pre_routing(void *priv,
br = p->br;
if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb)) {
- if (!brnf_call_ip6tables && !br->nf_call_ip6tables)
+ if (!brnf_call_ip6tables &&
+ !br_opt_get(br, BROPT_NF_CALL_IP6TABLES))
return NF_ACCEPT;
nf_bridge_pull_encap_header_rcsum(skb);
return br_nf_pre_routing_ipv6(priv, skb, state);
}
- if (!brnf_call_iptables && !br->nf_call_iptables)
+ if (!brnf_call_iptables && !br_opt_get(br, BROPT_NF_CALL_IPTABLES))
return NF_ACCEPT;
if (!IS_IP(skb) && !IS_VLAN_IP(skb) && !IS_PPPOE_IP(skb))
@@ -635,7 +637,7 @@ static unsigned int br_nf_forward_arp(void *priv,
return NF_ACCEPT;
br = p->br;
- if (!brnf_call_arptables && !br->nf_call_arptables)
+ if (!brnf_call_arptables && !br_opt_get(br, BROPT_NF_CALL_ARPTABLES))
return NF_ACCEPT;
if (!IS_ARP(skb)) {
@@ -834,7 +836,8 @@ static unsigned int ip_sabotage_in(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
- if (skb->nf_bridge && !skb->nf_bridge->in_prerouting) {
+ if (skb->nf_bridge && !skb->nf_bridge->in_prerouting &&
+ !netif_is_l3_master(skb->dev)) {
state->okfn(state->net, state->sk, skb);
return NF_STOLEN;
}
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 9f5eb05b0373..3345f1984542 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -169,13 +169,15 @@ static inline size_t br_nlmsg_size(struct net_device *dev, u32 filter_mask)
+ nla_total_size(1) /* IFLA_OPERSTATE */
+ nla_total_size(br_port_info_size()) /* IFLA_PROTINFO */
+ nla_total_size(br_get_link_af_size_filtered(dev,
- filter_mask)); /* IFLA_AF_SPEC */
+ filter_mask)) /* IFLA_AF_SPEC */
+ + nla_total_size(4); /* IFLA_BRPORT_BACKUP_PORT */
}
static int br_port_fill_attrs(struct sk_buff *skb,
const struct net_bridge_port *p)
{
u8 mode = !!(p->flags & BR_HAIRPIN_MODE);
+ struct net_bridge_port *backup_p;
u64 timerval;
if (nla_put_u8(skb, IFLA_BRPORT_STATE, p->state) ||
@@ -237,6 +239,14 @@ static int br_port_fill_attrs(struct sk_buff *skb,
return -EMSGSIZE;
#endif
+ /* we might be called only with br->lock */
+ rcu_read_lock();
+ backup_p = rcu_dereference(p->backup_port);
+ if (backup_p)
+ nla_put_u32(skb, IFLA_BRPORT_BACKUP_PORT,
+ backup_p->dev->ifindex);
+ rcu_read_unlock();
+
return 0;
}
@@ -663,6 +673,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = {
[IFLA_BRPORT_GROUP_FWD_MASK] = { .type = NLA_U16 },
[IFLA_BRPORT_NEIGH_SUPPRESS] = { .type = NLA_U8 },
[IFLA_BRPORT_ISOLATED] = { .type = NLA_U8 },
+ [IFLA_BRPORT_BACKUP_PORT] = { .type = NLA_U32 },
};
/* Change the state of the port and notify spanning tree */
@@ -817,6 +828,23 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[])
if (err)
return err;
+ if (tb[IFLA_BRPORT_BACKUP_PORT]) {
+ struct net_device *backup_dev = NULL;
+ u32 backup_ifindex;
+
+ backup_ifindex = nla_get_u32(tb[IFLA_BRPORT_BACKUP_PORT]);
+ if (backup_ifindex) {
+ backup_dev = __dev_get_by_index(dev_net(p->dev),
+ backup_ifindex);
+ if (!backup_dev)
+ return -ENOENT;
+ }
+
+ err = nbp_backup_change(p, backup_dev);
+ if (err)
+ return err;
+ }
+
br_port_flags_change(p, old_flags ^ p->flags);
return 0;
}
@@ -1006,6 +1034,7 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = {
[IFLA_BR_MCAST_STATS_ENABLED] = { .type = NLA_U8 },
[IFLA_BR_MCAST_IGMP_VERSION] = { .type = NLA_U8 },
[IFLA_BR_MCAST_MLD_VERSION] = { .type = NLA_U8 },
+ [IFLA_BR_VLAN_STATS_PER_PORT] = { .type = NLA_U8 },
};
static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
@@ -1086,6 +1115,14 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
if (err)
return err;
}
+
+ if (data[IFLA_BR_VLAN_STATS_PER_PORT]) {
+ __u8 per_port = nla_get_u8(data[IFLA_BR_VLAN_STATS_PER_PORT]);
+
+ err = br_vlan_set_stats_per_port(br, per_port);
+ if (err)
+ return err;
+ }
#endif
if (data[IFLA_BR_GROUP_FWD_MASK]) {
@@ -1111,7 +1148,7 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
spin_lock_bh(&br->lock);
memcpy(br->group_addr, new_addr, sizeof(br->group_addr));
spin_unlock_bh(&br->lock);
- br->group_addr_set = true;
+ br_opt_toggle(br, BROPT_GROUP_ADDR_SET, true);
br_recalculate_fwd_mask(br);
}
@@ -1139,7 +1176,7 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
u8 val;
val = nla_get_u8(data[IFLA_BR_MCAST_QUERY_USE_IFADDR]);
- br->multicast_query_use_ifaddr = !!val;
+ br_opt_toggle(br, BROPT_MULTICAST_QUERY_USE_IFADDR, !!val);
}
if (data[IFLA_BR_MCAST_QUERIER]) {
@@ -1216,7 +1253,7 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
__u8 mcast_stats;
mcast_stats = nla_get_u8(data[IFLA_BR_MCAST_STATS_ENABLED]);
- br->multicast_stats_enabled = !!mcast_stats;
+ br_opt_toggle(br, BROPT_MULTICAST_STATS_ENABLED, !!mcast_stats);
}
if (data[IFLA_BR_MCAST_IGMP_VERSION]) {
@@ -1243,19 +1280,19 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
if (data[IFLA_BR_NF_CALL_IPTABLES]) {
u8 val = nla_get_u8(data[IFLA_BR_NF_CALL_IPTABLES]);
- br->nf_call_iptables = val ? true : false;
+ br_opt_toggle(br, BROPT_NF_CALL_IPTABLES, !!val);
}
if (data[IFLA_BR_NF_CALL_IP6TABLES]) {
u8 val = nla_get_u8(data[IFLA_BR_NF_CALL_IP6TABLES]);
- br->nf_call_ip6tables = val ? true : false;
+ br_opt_toggle(br, BROPT_NF_CALL_IP6TABLES, !!val);
}
if (data[IFLA_BR_NF_CALL_ARPTABLES]) {
u8 val = nla_get_u8(data[IFLA_BR_NF_CALL_ARPTABLES]);
- br->nf_call_arptables = val ? true : false;
+ br_opt_toggle(br, BROPT_NF_CALL_ARPTABLES, !!val);
}
#endif
@@ -1299,6 +1336,7 @@ static size_t br_get_size(const struct net_device *brdev)
nla_total_size(sizeof(__be16)) + /* IFLA_BR_VLAN_PROTOCOL */
nla_total_size(sizeof(u16)) + /* IFLA_BR_VLAN_DEFAULT_PVID */
nla_total_size(sizeof(u8)) + /* IFLA_BR_VLAN_STATS_ENABLED */
+ nla_total_size(sizeof(u8)) + /* IFLA_BR_VLAN_STATS_PER_PORT */
#endif
nla_total_size(sizeof(u16)) + /* IFLA_BR_GROUP_FWD_MASK */
nla_total_size(sizeof(struct ifla_bridge_id)) + /* IFLA_BR_ROOT_ID */
@@ -1388,17 +1426,22 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev)
#ifdef CONFIG_BRIDGE_VLAN_FILTERING
if (nla_put_be16(skb, IFLA_BR_VLAN_PROTOCOL, br->vlan_proto) ||
nla_put_u16(skb, IFLA_BR_VLAN_DEFAULT_PVID, br->default_pvid) ||
- nla_put_u8(skb, IFLA_BR_VLAN_STATS_ENABLED, br->vlan_stats_enabled))
+ nla_put_u8(skb, IFLA_BR_VLAN_STATS_ENABLED,
+ br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) ||
+ nla_put_u8(skb, IFLA_BR_VLAN_STATS_PER_PORT,
+ br_opt_get(br, IFLA_BR_VLAN_STATS_PER_PORT)))
return -EMSGSIZE;
#endif
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
if (nla_put_u8(skb, IFLA_BR_MCAST_ROUTER, br->multicast_router) ||
- nla_put_u8(skb, IFLA_BR_MCAST_SNOOPING, !br->multicast_disabled) ||
+ nla_put_u8(skb, IFLA_BR_MCAST_SNOOPING,
+ br_opt_get(br, BROPT_MULTICAST_ENABLED)) ||
nla_put_u8(skb, IFLA_BR_MCAST_QUERY_USE_IFADDR,
- br->multicast_query_use_ifaddr) ||
- nla_put_u8(skb, IFLA_BR_MCAST_QUERIER, br->multicast_querier) ||
+ br_opt_get(br, BROPT_MULTICAST_QUERY_USE_IFADDR)) ||
+ nla_put_u8(skb, IFLA_BR_MCAST_QUERIER,
+ br_opt_get(br, BROPT_MULTICAST_QUERIER)) ||
nla_put_u8(skb, IFLA_BR_MCAST_STATS_ENABLED,
- br->multicast_stats_enabled) ||
+ br_opt_get(br, BROPT_MULTICAST_STATS_ENABLED)) ||
nla_put_u32(skb, IFLA_BR_MCAST_HASH_ELASTICITY,
br->hash_elasticity) ||
nla_put_u32(skb, IFLA_BR_MCAST_HASH_MAX, br->hash_max) ||
@@ -1441,11 +1484,11 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev)
#endif
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
if (nla_put_u8(skb, IFLA_BR_NF_CALL_IPTABLES,
- br->nf_call_iptables ? 1 : 0) ||
+ br_opt_get(br, BROPT_NF_CALL_IPTABLES) ? 1 : 0) ||
nla_put_u8(skb, IFLA_BR_NF_CALL_IP6TABLES,
- br->nf_call_ip6tables ? 1 : 0) ||
+ br_opt_get(br, BROPT_NF_CALL_IP6TABLES) ? 1 : 0) ||
nla_put_u8(skb, IFLA_BR_NF_CALL_ARPTABLES,
- br->nf_call_arptables ? 1 : 0))
+ br_opt_get(br, BROPT_NF_CALL_ARPTABLES) ? 1 : 0))
return -EMSGSIZE;
#endif
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 5216a524b537..2920e06a5403 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -54,14 +54,12 @@ typedef struct bridge_id bridge_id;
typedef struct mac_addr mac_addr;
typedef __u16 port_id;
-struct bridge_id
-{
+struct bridge_id {
unsigned char prio[2];
unsigned char addr[ETH_ALEN];
};
-struct mac_addr
-{
+struct mac_addr {
unsigned char addr[ETH_ALEN];
};
@@ -181,6 +179,7 @@ struct net_bridge_fdb_entry {
struct hlist_node fdb_node;
unsigned char is_local:1,
is_static:1,
+ is_sticky:1,
added_by_user:1,
added_by_external_learn:1,
offloaded:1;
@@ -206,8 +205,7 @@ struct net_bridge_port_group {
unsigned char eth_addr[ETH_ALEN];
};
-struct net_bridge_mdb_entry
-{
+struct net_bridge_mdb_entry {
struct hlist_node hlist[2];
struct net_bridge *br;
struct net_bridge_port_group __rcu *ports;
@@ -217,8 +215,7 @@ struct net_bridge_mdb_entry
bool host_joined;
};
-struct net_bridge_mdb_htable
-{
+struct net_bridge_mdb_htable {
struct hlist_head *mhash;
struct rcu_head rcu;
struct net_bridge_mdb_htable *old;
@@ -237,6 +234,7 @@ struct net_bridge_port {
#ifdef CONFIG_BRIDGE_VLAN_FILTERING
struct net_bridge_vlan_group __rcu *vlgrp;
#endif
+ struct net_bridge_port __rcu *backup_port;
/* STP */
u8 priority;
@@ -281,8 +279,11 @@ struct net_bridge_port {
int offload_fwd_mark;
#endif
u16 group_fwd_mask;
+ u16 backup_redirected_cnt;
};
+#define kobj_to_brport(obj) container_of(obj, struct net_bridge_port, kobj)
+
#define br_auto_port(p) ((p)->flags & BR_AUTO_MASK)
#define br_promisc_port(p) ((p)->flags & BR_PROMISC)
@@ -305,16 +306,32 @@ static inline struct net_bridge_port *br_port_get_rtnl_rcu(const struct net_devi
rcu_dereference_rtnl(dev->rx_handler_data) : NULL;
}
+enum net_bridge_opts {
+ BROPT_VLAN_ENABLED,
+ BROPT_VLAN_STATS_ENABLED,
+ BROPT_NF_CALL_IPTABLES,
+ BROPT_NF_CALL_IP6TABLES,
+ BROPT_NF_CALL_ARPTABLES,
+ BROPT_GROUP_ADDR_SET,
+ BROPT_MULTICAST_ENABLED,
+ BROPT_MULTICAST_QUERIER,
+ BROPT_MULTICAST_QUERY_USE_IFADDR,
+ BROPT_MULTICAST_STATS_ENABLED,
+ BROPT_HAS_IPV6_ADDR,
+ BROPT_NEIGH_SUPPRESS_ENABLED,
+ BROPT_MTU_SET_BY_USER,
+ BROPT_VLAN_STATS_PER_PORT,
+};
+
struct net_bridge {
spinlock_t lock;
spinlock_t hash_lock;
struct list_head port_list;
struct net_device *dev;
struct pcpu_sw_netstats __percpu *stats;
+ unsigned long options;
/* These fields are accessed on each packet */
#ifdef CONFIG_BRIDGE_VLAN_FILTERING
- u8 vlan_enabled;
- u8 vlan_stats_enabled;
__be16 vlan_proto;
u16 default_pvid;
struct net_bridge_vlan_group __rcu *vlgrp;
@@ -326,9 +343,6 @@ struct net_bridge {
struct rtable fake_rtable;
struct rt6_info fake_rt6_info;
};
- bool nf_call_iptables;
- bool nf_call_ip6tables;
- bool nf_call_arptables;
#endif
u16 group_fwd_mask;
u16 group_fwd_mask_required;
@@ -336,7 +350,6 @@ struct net_bridge {
/* STP */
bridge_id designated_root;
bridge_id bridge_id;
- u32 root_path_cost;
unsigned char topology_change;
unsigned char topology_change_detected;
u16 root_port;
@@ -348,9 +361,9 @@ struct net_bridge {
unsigned long bridge_hello_time;
unsigned long bridge_forward_delay;
unsigned long bridge_ageing_time;
+ u32 root_path_cost;
u8 group_addr[ETH_ALEN];
- bool group_addr_set;
enum {
BR_NO_STP, /* no spanning tree */
@@ -359,13 +372,6 @@ struct net_bridge {
} stp_enabled;
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
- unsigned char multicast_router;
-
- u8 multicast_disabled:1;
- u8 multicast_querier:1;
- u8 multicast_query_use_ifaddr:1;
- u8 has_ipv6_addr:1;
- u8 multicast_stats_enabled:1;
u32 hash_elasticity;
u32 hash_max;
@@ -374,7 +380,11 @@ struct net_bridge {
u32 multicast_startup_query_count;
u8 multicast_igmp_version;
-
+ u8 multicast_router;
+#if IS_ENABLED(CONFIG_IPV6)
+ u8 multicast_mld_version;
+#endif
+ spinlock_t multicast_lock;
unsigned long multicast_last_member_interval;
unsigned long multicast_membership_interval;
unsigned long multicast_querier_interval;
@@ -382,7 +392,6 @@ struct net_bridge {
unsigned long multicast_query_response_interval;
unsigned long multicast_startup_query_interval;
- spinlock_t multicast_lock;
struct net_bridge_mdb_htable __rcu *mdb;
struct hlist_head router_list;
@@ -395,7 +404,6 @@ struct net_bridge {
struct bridge_mcast_other_query ip6_other_query;
struct bridge_mcast_own_query ip6_own_query;
struct bridge_mcast_querier ip6_querier;
- u8 multicast_mld_version;
#endif /* IS_ENABLED(CONFIG_IPV6) */
#endif
@@ -409,8 +417,6 @@ struct net_bridge {
#ifdef CONFIG_NET_SWITCHDEV
int offload_fwd_mark;
#endif
- bool neigh_suppress_enabled;
- bool mtu_set_by_user;
struct hlist_head fdb_list;
};
@@ -488,6 +494,14 @@ static inline bool br_vlan_should_use(const struct net_bridge_vlan *v)
return true;
}
+static inline int br_opt_get(const struct net_bridge *br,
+ enum net_bridge_opts opt)
+{
+ return test_bit(opt, &br->options);
+}
+
+void br_opt_toggle(struct net_bridge *br, enum net_bridge_opts opt, bool on);
+
/* br_device.c */
void br_dev_setup(struct net_device *dev);
void br_dev_delete(struct net_device *dev, struct list_head *list);
@@ -560,7 +574,7 @@ int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p,
const unsigned char *addr, u16 vid,
bool swdev_notify);
void br_fdb_offloaded_set(struct net_bridge *br, struct net_bridge_port *p,
- const unsigned char *addr, u16 vid);
+ const unsigned char *addr, u16 vid, bool offloaded);
/* br_forward.c */
enum br_pkt_type {
@@ -595,6 +609,7 @@ netdev_features_t br_features_recompute(struct net_bridge *br,
netdev_features_t features);
void br_port_flags_change(struct net_bridge_port *port, unsigned long mask);
void br_manage_promisc(struct net_bridge *br);
+int nbp_backup_change(struct net_bridge_port *p, struct net_device *backup_dev);
/* br_input.c */
int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb);
@@ -693,8 +708,8 @@ __br_multicast_querier_exists(struct net_bridge *br,
{
bool own_querier_enabled;
- if (br->multicast_querier) {
- if (is_ipv6 && !br->has_ipv6_addr)
+ if (br_opt_get(br, BROPT_MULTICAST_QUERIER)) {
+ if (is_ipv6 && !br_opt_get(br, BROPT_HAS_IPV6_ADDR))
own_querier_enabled = false;
else
own_querier_enabled = true;
@@ -845,6 +860,7 @@ int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val);
int __br_vlan_set_proto(struct net_bridge *br, __be16 proto);
int br_vlan_set_proto(struct net_bridge *br, unsigned long val);
int br_vlan_set_stats(struct net_bridge *br, unsigned long val);
+int br_vlan_set_stats_per_port(struct net_bridge *br, unsigned long val);
int br_vlan_init(struct net_bridge *br);
int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val);
int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid);
diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c
index d77f807420c4..b993df770675 100644
--- a/net/bridge/br_switchdev.c
+++ b/net/bridge/br_switchdev.c
@@ -103,7 +103,7 @@ int br_switchdev_set_port_flag(struct net_bridge_port *p,
static void
br_switchdev_fdb_call_notifiers(bool adding, const unsigned char *mac,
u16 vid, struct net_device *dev,
- bool added_by_user)
+ bool added_by_user, bool offloaded)
{
struct switchdev_notifier_fdb_info info;
unsigned long notifier_type;
@@ -111,6 +111,7 @@ br_switchdev_fdb_call_notifiers(bool adding, const unsigned char *mac,
info.addr = mac;
info.vid = vid;
info.added_by_user = added_by_user;
+ info.offloaded = offloaded;
notifier_type = adding ? SWITCHDEV_FDB_ADD_TO_DEVICE : SWITCHDEV_FDB_DEL_TO_DEVICE;
call_switchdev_notifiers(notifier_type, dev, &info.info);
}
@@ -126,13 +127,15 @@ br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type)
br_switchdev_fdb_call_notifiers(false, fdb->key.addr.addr,
fdb->key.vlan_id,
fdb->dst->dev,
- fdb->added_by_user);
+ fdb->added_by_user,
+ fdb->offloaded);
break;
case RTM_NEWNEIGH:
br_switchdev_fdb_call_notifiers(true, fdb->key.addr.addr,
fdb->key.vlan_id,
fdb->dst->dev,
- fdb->added_by_user);
+ fdb->added_by_user,
+ fdb->offloaded);
break;
}
}
diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c
index 0318a69888d4..60182bef6341 100644
--- a/net/bridge/br_sysfs_br.c
+++ b/net/bridge/br_sysfs_br.c
@@ -303,7 +303,7 @@ static ssize_t group_addr_store(struct device *d,
ether_addr_copy(br->group_addr, new_addr);
spin_unlock_bh(&br->lock);
- br->group_addr_set = true;
+ br_opt_toggle(br, BROPT_GROUP_ADDR_SET, true);
br_recalculate_fwd_mask(br);
netdev_state_change(br->dev);
@@ -349,7 +349,7 @@ static ssize_t multicast_snooping_show(struct device *d,
char *buf)
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%d\n", !br->multicast_disabled);
+ return sprintf(buf, "%d\n", br_opt_get(br, BROPT_MULTICAST_ENABLED));
}
static ssize_t multicast_snooping_store(struct device *d,
@@ -365,12 +365,13 @@ static ssize_t multicast_query_use_ifaddr_show(struct device *d,
char *buf)
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%d\n", br->multicast_query_use_ifaddr);
+ return sprintf(buf, "%d\n",
+ br_opt_get(br, BROPT_MULTICAST_QUERY_USE_IFADDR));
}
static int set_query_use_ifaddr(struct net_bridge *br, unsigned long val)
{
- br->multicast_query_use_ifaddr = !!val;
+ br_opt_toggle(br, BROPT_MULTICAST_QUERY_USE_IFADDR, !!val);
return 0;
}
@@ -388,7 +389,7 @@ static ssize_t multicast_querier_show(struct device *d,
char *buf)
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%d\n", br->multicast_querier);
+ return sprintf(buf, "%d\n", br_opt_get(br, BROPT_MULTICAST_QUERIER));
}
static ssize_t multicast_querier_store(struct device *d,
@@ -636,12 +637,13 @@ static ssize_t multicast_stats_enabled_show(struct device *d,
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%u\n", br->multicast_stats_enabled);
+ return sprintf(buf, "%d\n",
+ br_opt_get(br, BROPT_MULTICAST_STATS_ENABLED));
}
static int set_stats_enabled(struct net_bridge *br, unsigned long val)
{
- br->multicast_stats_enabled = !!val;
+ br_opt_toggle(br, BROPT_MULTICAST_STATS_ENABLED, !!val);
return 0;
}
@@ -678,12 +680,12 @@ static ssize_t nf_call_iptables_show(
struct device *d, struct device_attribute *attr, char *buf)
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%u\n", br->nf_call_iptables);
+ return sprintf(buf, "%u\n", br_opt_get(br, BROPT_NF_CALL_IPTABLES));
}
static int set_nf_call_iptables(struct net_bridge *br, unsigned long val)
{
- br->nf_call_iptables = val ? true : false;
+ br_opt_toggle(br, BROPT_NF_CALL_IPTABLES, !!val);
return 0;
}
@@ -699,12 +701,12 @@ static ssize_t nf_call_ip6tables_show(
struct device *d, struct device_attribute *attr, char *buf)
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%u\n", br->nf_call_ip6tables);
+ return sprintf(buf, "%u\n", br_opt_get(br, BROPT_NF_CALL_IP6TABLES));
}
static int set_nf_call_ip6tables(struct net_bridge *br, unsigned long val)
{
- br->nf_call_ip6tables = val ? true : false;
+ br_opt_toggle(br, BROPT_NF_CALL_IP6TABLES, !!val);
return 0;
}
@@ -720,12 +722,12 @@ static ssize_t nf_call_arptables_show(
struct device *d, struct device_attribute *attr, char *buf)
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%u\n", br->nf_call_arptables);
+ return sprintf(buf, "%u\n", br_opt_get(br, BROPT_NF_CALL_ARPTABLES));
}
static int set_nf_call_arptables(struct net_bridge *br, unsigned long val)
{
- br->nf_call_arptables = val ? true : false;
+ br_opt_toggle(br, BROPT_NF_CALL_ARPTABLES, !!val);
return 0;
}
@@ -743,7 +745,7 @@ static ssize_t vlan_filtering_show(struct device *d,
char *buf)
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%d\n", br->vlan_enabled);
+ return sprintf(buf, "%d\n", br_opt_get(br, BROPT_VLAN_ENABLED));
}
static ssize_t vlan_filtering_store(struct device *d,
@@ -791,7 +793,7 @@ static ssize_t vlan_stats_enabled_show(struct device *d,
char *buf)
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%u\n", br->vlan_stats_enabled);
+ return sprintf(buf, "%u\n", br_opt_get(br, BROPT_VLAN_STATS_ENABLED));
}
static ssize_t vlan_stats_enabled_store(struct device *d,
@@ -801,6 +803,22 @@ static ssize_t vlan_stats_enabled_store(struct device *d,
return store_bridge_parm(d, buf, len, br_vlan_set_stats);
}
static DEVICE_ATTR_RW(vlan_stats_enabled);
+
+static ssize_t vlan_stats_per_port_show(struct device *d,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%u\n", br_opt_get(br, BROPT_VLAN_STATS_PER_PORT));
+}
+
+static ssize_t vlan_stats_per_port_store(struct device *d,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return store_bridge_parm(d, buf, len, br_vlan_set_stats_per_port);
+}
+static DEVICE_ATTR_RW(vlan_stats_per_port);
#endif
static struct attribute *bridge_attrs[] = {
@@ -854,6 +872,7 @@ static struct attribute *bridge_attrs[] = {
&dev_attr_vlan_protocol.attr,
&dev_attr_default_pvid.attr,
&dev_attr_vlan_stats_enabled.attr,
+ &dev_attr_vlan_stats_per_port.attr,
#endif
NULL
};
diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c
index f99c5bf5c906..7c87a2fe5248 100644
--- a/net/bridge/br_sysfs_if.c
+++ b/net/bridge/br_sysfs_if.c
@@ -25,6 +25,15 @@ struct brport_attribute {
struct attribute attr;
ssize_t (*show)(struct net_bridge_port *, char *);
int (*store)(struct net_bridge_port *, unsigned long);
+ int (*store_raw)(struct net_bridge_port *, char *);
+};
+
+#define BRPORT_ATTR_RAW(_name, _mode, _show, _store) \
+const struct brport_attribute brport_attr_##_name = { \
+ .attr = {.name = __stringify(_name), \
+ .mode = _mode }, \
+ .show = _show, \
+ .store_raw = _store, \
};
#define BRPORT_ATTR(_name, _mode, _show, _store) \
@@ -182,6 +191,38 @@ static int store_group_fwd_mask(struct net_bridge_port *p,
static BRPORT_ATTR(group_fwd_mask, 0644, show_group_fwd_mask,
store_group_fwd_mask);
+static ssize_t show_backup_port(struct net_bridge_port *p, char *buf)
+{
+ struct net_bridge_port *backup_p;
+ int ret = 0;
+
+ rcu_read_lock();
+ backup_p = rcu_dereference(p->backup_port);
+ if (backup_p)
+ ret = sprintf(buf, "%s\n", backup_p->dev->name);
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static int store_backup_port(struct net_bridge_port *p, char *buf)
+{
+ struct net_device *backup_dev = NULL;
+ char *nl = strchr(buf, '\n');
+
+ if (nl)
+ *nl = '\0';
+
+ if (strlen(buf) > 0) {
+ backup_dev = __dev_get_by_name(dev_net(p->dev), buf);
+ if (!backup_dev)
+ return -ENOENT;
+ }
+
+ return nbp_backup_change(p, backup_dev);
+}
+static BRPORT_ATTR_RAW(backup_port, 0644, show_backup_port, store_backup_port);
+
BRPORT_ATTR_FLAG(hairpin_mode, BR_HAIRPIN_MODE);
BRPORT_ATTR_FLAG(bpdu_guard, BR_BPDU_GUARD);
BRPORT_ATTR_FLAG(root_block, BR_ROOT_BLOCK);
@@ -245,17 +286,17 @@ static const struct brport_attribute *brport_attrs[] = {
&brport_attr_group_fwd_mask,
&brport_attr_neigh_suppress,
&brport_attr_isolated,
+ &brport_attr_backup_port,
NULL
};
#define to_brport_attr(_at) container_of(_at, struct brport_attribute, attr)
-#define to_brport(obj) container_of(obj, struct net_bridge_port, kobj)
static ssize_t brport_show(struct kobject *kobj,
struct attribute *attr, char *buf)
{
struct brport_attribute *brport_attr = to_brport_attr(attr);
- struct net_bridge_port *p = to_brport(kobj);
+ struct net_bridge_port *p = kobj_to_brport(kobj);
if (!brport_attr->show)
return -EINVAL;
@@ -268,29 +309,48 @@ static ssize_t brport_store(struct kobject *kobj,
const char *buf, size_t count)
{
struct brport_attribute *brport_attr = to_brport_attr(attr);
- struct net_bridge_port *p = to_brport(kobj);
+ struct net_bridge_port *p = kobj_to_brport(kobj);
ssize_t ret = -EINVAL;
- char *endp;
unsigned long val;
+ char *endp;
if (!ns_capable(dev_net(p->dev)->user_ns, CAP_NET_ADMIN))
return -EPERM;
- val = simple_strtoul(buf, &endp, 0);
- if (endp != buf) {
- if (!rtnl_trylock())
- return restart_syscall();
- if (p->dev && p->br && brport_attr->store) {
- spin_lock_bh(&p->br->lock);
- ret = brport_attr->store(p, val);
- spin_unlock_bh(&p->br->lock);
- if (!ret) {
- br_ifinfo_notify(RTM_NEWLINK, NULL, p);
- ret = count;
- }
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ if (!p->dev || !p->br)
+ goto out_unlock;
+
+ if (brport_attr->store_raw) {
+ char *buf_copy;
+
+ buf_copy = kstrndup(buf, count, GFP_KERNEL);
+ if (!buf_copy) {
+ ret = -ENOMEM;
+ goto out_unlock;
}
- rtnl_unlock();
+ spin_lock_bh(&p->br->lock);
+ ret = brport_attr->store_raw(p, buf_copy);
+ spin_unlock_bh(&p->br->lock);
+ kfree(buf_copy);
+ } else if (brport_attr->store) {
+ val = simple_strtoul(buf, &endp, 0);
+ if (endp == buf)
+ goto out_unlock;
+ spin_lock_bh(&p->br->lock);
+ ret = brport_attr->store(p, val);
+ spin_unlock_bh(&p->br->lock);
}
+
+ if (!ret) {
+ br_ifinfo_notify(RTM_NEWLINK, NULL, p);
+ ret = count;
+ }
+out_unlock:
+ rtnl_unlock();
+
return ret;
}
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index 7df269092103..8c9297a01947 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -190,6 +190,19 @@ static void br_vlan_put_master(struct net_bridge_vlan *masterv)
}
}
+static void nbp_vlan_rcu_free(struct rcu_head *rcu)
+{
+ struct net_bridge_vlan *v;
+
+ v = container_of(rcu, struct net_bridge_vlan, rcu);
+ WARN_ON(br_vlan_is_master(v));
+ /* if we had per-port stats configured then free them here */
+ if (v->brvlan->stats != v->stats)
+ free_percpu(v->stats);
+ v->stats = NULL;
+ kfree(v);
+}
+
/* This is the shared VLAN add function which works for both ports and bridge
* devices. There are four possible calls to this function in terms of the
* vlan entry type:
@@ -245,7 +258,15 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags)
if (!masterv)
goto out_filt;
v->brvlan = masterv;
- v->stats = masterv->stats;
+ if (br_opt_get(br, BROPT_VLAN_STATS_PER_PORT)) {
+ v->stats = netdev_alloc_pcpu_stats(struct br_vlan_stats);
+ if (!v->stats) {
+ err = -ENOMEM;
+ goto out_filt;
+ }
+ } else {
+ v->stats = masterv->stats;
+ }
} else {
err = br_switchdev_port_vlan_add(dev, v->vid, flags);
if (err && err != -EOPNOTSUPP)
@@ -282,6 +303,10 @@ out_filt:
if (p) {
__vlan_vid_del(dev, br, v->vid);
if (masterv) {
+ if (v->stats && masterv->stats != v->stats)
+ free_percpu(v->stats);
+ v->stats = NULL;
+
br_vlan_put_master(masterv);
v->brvlan = NULL;
}
@@ -329,7 +354,7 @@ static int __vlan_del(struct net_bridge_vlan *v)
rhashtable_remove_fast(&vg->vlan_hash, &v->vnode,
br_vlan_rht_params);
__vlan_del_list(v);
- kfree_rcu(v, rcu);
+ call_rcu(&v->rcu, nbp_vlan_rcu_free);
}
br_vlan_put_master(masterv);
@@ -386,7 +411,7 @@ struct sk_buff *br_handle_vlan(struct net_bridge *br,
return NULL;
}
}
- if (br->vlan_stats_enabled) {
+ if (br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) {
stats = this_cpu_ptr(v->stats);
u64_stats_update_begin(&stats->syncp);
stats->tx_bytes += skb->len;
@@ -475,14 +500,14 @@ static bool __allowed_ingress(const struct net_bridge *br,
skb->vlan_tci |= pvid;
/* if stats are disabled we can avoid the lookup */
- if (!br->vlan_stats_enabled)
+ if (!br_opt_get(br, BROPT_VLAN_STATS_ENABLED))
return true;
}
v = br_vlan_find(vg, *vid);
if (!v || !br_vlan_should_use(v))
goto drop;
- if (br->vlan_stats_enabled) {
+ if (br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) {
stats = this_cpu_ptr(v->stats);
u64_stats_update_begin(&stats->syncp);
stats->rx_bytes += skb->len;
@@ -504,7 +529,7 @@ bool br_allowed_ingress(const struct net_bridge *br,
/* If VLAN filtering is disabled on the bridge, all packets are
* permitted.
*/
- if (!br->vlan_enabled) {
+ if (!br_opt_get(br, BROPT_VLAN_ENABLED)) {
BR_INPUT_SKB_CB(skb)->vlan_filtered = false;
return true;
}
@@ -538,7 +563,7 @@ bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid)
struct net_bridge *br = p->br;
/* If filtering was disabled at input, let it pass. */
- if (!br->vlan_enabled)
+ if (!br_opt_get(br, BROPT_VLAN_ENABLED))
return true;
vg = nbp_vlan_group_rcu(p);
@@ -695,11 +720,12 @@ struct net_bridge_vlan *br_vlan_find(struct net_bridge_vlan_group *vg, u16 vid)
/* Must be protected by RTNL. */
static void recalculate_group_addr(struct net_bridge *br)
{
- if (br->group_addr_set)
+ if (br_opt_get(br, BROPT_GROUP_ADDR_SET))
return;
spin_lock_bh(&br->lock);
- if (!br->vlan_enabled || br->vlan_proto == htons(ETH_P_8021Q)) {
+ if (!br_opt_get(br, BROPT_VLAN_ENABLED) ||
+ br->vlan_proto == htons(ETH_P_8021Q)) {
/* Bridge Group Address */
br->group_addr[5] = 0x00;
} else { /* vlan_enabled && ETH_P_8021AD */
@@ -712,7 +738,8 @@ static void recalculate_group_addr(struct net_bridge *br)
/* Must be protected by RTNL. */
void br_recalculate_fwd_mask(struct net_bridge *br)
{
- if (!br->vlan_enabled || br->vlan_proto == htons(ETH_P_8021Q))
+ if (!br_opt_get(br, BROPT_VLAN_ENABLED) ||
+ br->vlan_proto == htons(ETH_P_8021Q))
br->group_fwd_mask_required = BR_GROUPFWD_DEFAULT;
else /* vlan_enabled && ETH_P_8021AD */
br->group_fwd_mask_required = BR_GROUPFWD_8021AD &
@@ -729,14 +756,14 @@ int __br_vlan_filter_toggle(struct net_bridge *br, unsigned long val)
};
int err;
- if (br->vlan_enabled == val)
+ if (br_opt_get(br, BROPT_VLAN_ENABLED) == !!val)
return 0;
err = switchdev_port_attr_set(br->dev, &attr);
if (err && err != -EOPNOTSUPP)
return err;
- br->vlan_enabled = val;
+ br_opt_toggle(br, BROPT_VLAN_ENABLED, !!val);
br_manage_promisc(br);
recalculate_group_addr(br);
br_recalculate_fwd_mask(br);
@@ -753,7 +780,7 @@ bool br_vlan_enabled(const struct net_device *dev)
{
struct net_bridge *br = netdev_priv(dev);
- return !!br->vlan_enabled;
+ return br_opt_get(br, BROPT_VLAN_ENABLED);
}
EXPORT_SYMBOL_GPL(br_vlan_enabled);
@@ -819,7 +846,31 @@ int br_vlan_set_stats(struct net_bridge *br, unsigned long val)
switch (val) {
case 0:
case 1:
- br->vlan_stats_enabled = val;
+ br_opt_toggle(br, BROPT_VLAN_STATS_ENABLED, !!val);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int br_vlan_set_stats_per_port(struct net_bridge *br, unsigned long val)
+{
+ struct net_bridge_port *p;
+
+ /* allow to change the option if there are no port vlans configured */
+ list_for_each_entry(p, &br->port_list, list) {
+ struct net_bridge_vlan_group *vg = nbp_vlan_group(p);
+
+ if (vg->num_vlans)
+ return -EBUSY;
+ }
+
+ switch (val) {
+ case 0:
+ case 1:
+ br_opt_toggle(br, BROPT_VLAN_STATS_PER_PORT, !!val);
break;
default:
return -EINVAL;
@@ -877,8 +928,7 @@ int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid)
return 0;
}
- changed = kcalloc(BITS_TO_LONGS(BR_MAX_PORTS), sizeof(unsigned long),
- GFP_KERNEL);
+ changed = bitmap_zalloc(BR_MAX_PORTS, GFP_KERNEL);
if (!changed)
return -ENOMEM;
@@ -925,7 +975,7 @@ int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid)
br->default_pvid = pvid;
out:
- kfree(changed);
+ bitmap_free(changed);
return err;
err_port:
@@ -965,7 +1015,7 @@ int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val)
goto out;
/* Only allow default pvid change when filtering is disabled */
- if (br->vlan_enabled) {
+ if (br_opt_get(br, BROPT_VLAN_ENABLED)) {
pr_info_once("Please disable vlan filtering to change default_pvid\n");
err = -EPERM;
goto out;
@@ -1019,7 +1069,7 @@ int nbp_vlan_init(struct net_bridge_port *p)
.orig_dev = p->br->dev,
.id = SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING,
.flags = SWITCHDEV_F_SKIP_EOPNOTSUPP,
- .u.vlan_filtering = p->br->vlan_enabled,
+ .u.vlan_filtering = br_opt_get(p->br, BROPT_VLAN_ENABLED),
};
struct net_bridge_vlan_group *vg;
int ret = -ENOMEM;
diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c
index c41da5fac84f..550324c516ee 100644
--- a/net/bridge/netfilter/ebtable_filter.c
+++ b/net/bridge/netfilter/ebtable_filter.c
@@ -9,6 +9,7 @@
*/
#include <linux/netfilter_bridge/ebtables.h>
+#include <uapi/linux/netfilter_bridge.h>
#include <linux/module.h>
#define FILTER_VALID_HOOKS ((1 << NF_BR_LOCAL_IN) | (1 << NF_BR_FORWARD) | \
diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c
index 08df7406ecb3..c0fb3ca518af 100644
--- a/net/bridge/netfilter/ebtable_nat.c
+++ b/net/bridge/netfilter/ebtable_nat.c
@@ -9,6 +9,7 @@
*/
#include <linux/netfilter_bridge/ebtables.h>
+#include <uapi/linux/netfilter_bridge.h>
#include <linux/module.h>
#define NAT_VALID_HOOKS ((1 << NF_BR_PRE_ROUTING) | (1 << NF_BR_LOCAL_OUT) | \
diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c
index 6de981270566..08cbed7d940e 100644
--- a/net/bridge/netfilter/nft_reject_bridge.c
+++ b/net/bridge/netfilter/nft_reject_bridge.c
@@ -89,8 +89,7 @@ static void nft_reject_br_send_v4_tcp_reset(struct net *net,
niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP,
net->ipv4.sysctl_ip_default_ttl);
nf_reject_ip_tcphdr_put(nskb, oldskb, oth);
- niph->ttl = net->ipv4.sysctl_ip_default_ttl;
- niph->tot_len = htons(nskb->len);
+ niph->tot_len = htons(nskb->len);
ip_send_check(niph);
nft_reject_br_push_etherhdr(oldskb, nskb);
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index a6fb1b3bcad9..416717c57cd1 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -941,7 +941,7 @@ static __poll_t caif_poll(struct file *file,
__poll_t mask;
struct caifsock *cf_sk = container_of(sk, struct caifsock, sk);
- sock_poll_wait(file, sk_sleep(sk), wait);
+ sock_poll_wait(file, sock, wait);
mask = 0;
/* exceptional events? */
diff --git a/net/caif/cfrfml.c b/net/caif/cfrfml.c
index b82440e1fcb4..a931a71ef6df 100644
--- a/net/caif/cfrfml.c
+++ b/net/caif/cfrfml.c
@@ -264,9 +264,6 @@ static int cfrfml_transmit(struct cflayer *layr, struct cfpkt *pkt)
frontpkt = rearpkt;
rearpkt = NULL;
- err = -ENOMEM;
- if (frontpkt == NULL)
- goto out;
err = -EPROTO;
if (cfpkt_add_head(frontpkt, head, 6) < 0)
goto out;
diff --git a/net/ceph/Kconfig b/net/ceph/Kconfig
index f8cceb99e732..cd2d5b9301a1 100644
--- a/net/ceph/Kconfig
+++ b/net/ceph/Kconfig
@@ -41,4 +41,3 @@ config CEPH_LIB_USE_DNS_RESOLVER
Documentation/networking/dns_resolver.txt
If unsure, say N.
-
diff --git a/net/ceph/Makefile b/net/ceph/Makefile
index 12bf49772d24..db09defe27d0 100644
--- a/net/ceph/Makefile
+++ b/net/ceph/Makefile
@@ -15,4 +15,3 @@ libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \
auth_x.o \
ceph_fs.o ceph_strings.o ceph_hash.o \
pagevec.o snapshot.o string_table.o
-
diff --git a/net/ceph/auth.c b/net/ceph/auth.c
index dbde2b3c3c15..fbeee068ea14 100644
--- a/net/ceph/auth.c
+++ b/net/ceph/auth.c
@@ -315,6 +315,22 @@ int ceph_auth_update_authorizer(struct ceph_auth_client *ac,
}
EXPORT_SYMBOL(ceph_auth_update_authorizer);
+int ceph_auth_add_authorizer_challenge(struct ceph_auth_client *ac,
+ struct ceph_authorizer *a,
+ void *challenge_buf,
+ int challenge_buf_len)
+{
+ int ret = 0;
+
+ mutex_lock(&ac->mutex);
+ if (ac->ops && ac->ops->add_authorizer_challenge)
+ ret = ac->ops->add_authorizer_challenge(ac, a, challenge_buf,
+ challenge_buf_len);
+ mutex_unlock(&ac->mutex);
+ return ret;
+}
+EXPORT_SYMBOL(ceph_auth_add_authorizer_challenge);
+
int ceph_auth_verify_authorizer_reply(struct ceph_auth_client *ac,
struct ceph_authorizer *a)
{
diff --git a/net/ceph/auth_none.c b/net/ceph/auth_none.c
index 41d2a0c72236..edb7042479ed 100644
--- a/net/ceph/auth_none.c
+++ b/net/ceph/auth_none.c
@@ -142,4 +142,3 @@ int ceph_auth_none_init(struct ceph_auth_client *ac)
ac->ops = &ceph_auth_none_ops;
return 0;
}
-
diff --git a/net/ceph/auth_none.h b/net/ceph/auth_none.h
index 860ed9875791..4158f064302e 100644
--- a/net/ceph/auth_none.h
+++ b/net/ceph/auth_none.h
@@ -26,4 +26,3 @@ struct ceph_auth_none_info {
int ceph_auth_none_init(struct ceph_auth_client *ac);
#endif
-
diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c
index 2f4a1baf5f52..b52732337ca6 100644
--- a/net/ceph/auth_x.c
+++ b/net/ceph/auth_x.c
@@ -9,6 +9,7 @@
#include <linux/ceph/decode.h>
#include <linux/ceph/auth.h>
+#include <linux/ceph/ceph_features.h>
#include <linux/ceph/libceph.h>
#include <linux/ceph/messenger.h>
@@ -70,25 +71,40 @@ static int ceph_x_encrypt(struct ceph_crypto_key *secret, void *buf,
return sizeof(u32) + ciphertext_len;
}
+static int __ceph_x_decrypt(struct ceph_crypto_key *secret, void *p,
+ int ciphertext_len)
+{
+ struct ceph_x_encrypt_header *hdr = p;
+ int plaintext_len;
+ int ret;
+
+ ret = ceph_crypt(secret, false, p, ciphertext_len, ciphertext_len,
+ &plaintext_len);
+ if (ret)
+ return ret;
+
+ if (le64_to_cpu(hdr->magic) != CEPHX_ENC_MAGIC) {
+ pr_err("%s bad magic\n", __func__);
+ return -EINVAL;
+ }
+
+ return plaintext_len - sizeof(*hdr);
+}
+
static int ceph_x_decrypt(struct ceph_crypto_key *secret, void **p, void *end)
{
- struct ceph_x_encrypt_header *hdr = *p + sizeof(u32);
- int ciphertext_len, plaintext_len;
+ int ciphertext_len;
int ret;
ceph_decode_32_safe(p, end, ciphertext_len, e_inval);
ceph_decode_need(p, end, ciphertext_len, e_inval);
- ret = ceph_crypt(secret, false, *p, end - *p, ciphertext_len,
- &plaintext_len);
- if (ret)
+ ret = __ceph_x_decrypt(secret, *p, ciphertext_len);
+ if (ret < 0)
return ret;
- if (hdr->struct_v != 1 || le64_to_cpu(hdr->magic) != CEPHX_ENC_MAGIC)
- return -EPERM;
-
*p += ciphertext_len;
- return plaintext_len - sizeof(struct ceph_x_encrypt_header);
+ return ret;
e_inval:
return -EINVAL;
@@ -149,12 +165,12 @@ static int process_one_ticket(struct ceph_auth_client *ac,
void *dp, *dend;
int dlen;
char is_enc;
- struct timespec validity;
+ struct timespec64 validity;
void *tp, *tpend;
void **ptp;
struct ceph_crypto_key new_session_key = { 0 };
struct ceph_buffer *new_ticket_blob;
- unsigned long new_expires, new_renew_after;
+ time64_t new_expires, new_renew_after;
u64 new_secret_id;
int ret;
@@ -189,11 +205,11 @@ static int process_one_ticket(struct ceph_auth_client *ac,
if (ret)
goto out;
- ceph_decode_timespec(&validity, dp);
+ ceph_decode_timespec64(&validity, dp);
dp += sizeof(struct ceph_timespec);
- new_expires = get_seconds() + validity.tv_sec;
+ new_expires = ktime_get_real_seconds() + validity.tv_sec;
new_renew_after = new_expires - (validity.tv_sec / 4);
- dout(" expires=%lu renew_after=%lu\n", new_expires,
+ dout(" expires=%llu renew_after=%llu\n", new_expires,
new_renew_after);
/* ticket blob for service */
@@ -275,6 +291,51 @@ bad:
return -EINVAL;
}
+/*
+ * Encode and encrypt the second part (ceph_x_authorize_b) of the
+ * authorizer. The first part (ceph_x_authorize_a) should already be
+ * encoded.
+ */
+static int encrypt_authorizer(struct ceph_x_authorizer *au,
+ u64 *server_challenge)
+{
+ struct ceph_x_authorize_a *msg_a;
+ struct ceph_x_authorize_b *msg_b;
+ void *p, *end;
+ int ret;
+
+ msg_a = au->buf->vec.iov_base;
+ WARN_ON(msg_a->ticket_blob.secret_id != cpu_to_le64(au->secret_id));
+ p = (void *)(msg_a + 1) + le32_to_cpu(msg_a->ticket_blob.blob_len);
+ end = au->buf->vec.iov_base + au->buf->vec.iov_len;
+
+ msg_b = p + ceph_x_encrypt_offset();
+ msg_b->struct_v = 2;
+ msg_b->nonce = cpu_to_le64(au->nonce);
+ if (server_challenge) {
+ msg_b->have_challenge = 1;
+ msg_b->server_challenge_plus_one =
+ cpu_to_le64(*server_challenge + 1);
+ } else {
+ msg_b->have_challenge = 0;
+ msg_b->server_challenge_plus_one = 0;
+ }
+
+ ret = ceph_x_encrypt(&au->session_key, p, end - p, sizeof(*msg_b));
+ if (ret < 0)
+ return ret;
+
+ p += ret;
+ if (server_challenge) {
+ WARN_ON(p != end);
+ } else {
+ WARN_ON(p > end);
+ au->buf->vec.iov_len = p - au->buf->vec.iov_base;
+ }
+
+ return 0;
+}
+
static void ceph_x_authorizer_cleanup(struct ceph_x_authorizer *au)
{
ceph_crypto_key_destroy(&au->session_key);
@@ -291,7 +352,6 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
int maxlen;
struct ceph_x_authorize_a *msg_a;
struct ceph_x_authorize_b *msg_b;
- void *p, *end;
int ret;
int ticket_blob_len =
(th->ticket_blob ? th->ticket_blob->vec.iov_len : 0);
@@ -335,21 +395,13 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
dout(" th %p secret_id %lld %lld\n", th, th->secret_id,
le64_to_cpu(msg_a->ticket_blob.secret_id));
- p = msg_a + 1;
- p += ticket_blob_len;
- end = au->buf->vec.iov_base + au->buf->vec.iov_len;
-
- msg_b = p + ceph_x_encrypt_offset();
- msg_b->struct_v = 1;
get_random_bytes(&au->nonce, sizeof(au->nonce));
- msg_b->nonce = cpu_to_le64(au->nonce);
- ret = ceph_x_encrypt(&au->session_key, p, end - p, sizeof(*msg_b));
- if (ret < 0)
+ ret = encrypt_authorizer(au, NULL);
+ if (ret) {
+ pr_err("failed to encrypt authorizer: %d", ret);
goto out_au;
+ }
- p += ret;
- WARN_ON(p > end);
- au->buf->vec.iov_len = p - au->buf->vec.iov_base;
dout(" built authorizer nonce %llx len %d\n", au->nonce,
(int)au->buf->vec.iov_len);
return 0;
@@ -385,13 +437,13 @@ static bool need_key(struct ceph_x_ticket_handler *th)
if (!th->have_key)
return true;
- return get_seconds() >= th->renew_after;
+ return ktime_get_real_seconds() >= th->renew_after;
}
static bool have_key(struct ceph_x_ticket_handler *th)
{
if (th->have_key) {
- if (get_seconds() >= th->expires)
+ if (ktime_get_real_seconds() >= th->expires)
th->have_key = false;
}
@@ -626,6 +678,54 @@ static int ceph_x_update_authorizer(
return 0;
}
+static int decrypt_authorize_challenge(struct ceph_x_authorizer *au,
+ void *challenge_buf,
+ int challenge_buf_len,
+ u64 *server_challenge)
+{
+ struct ceph_x_authorize_challenge *ch =
+ challenge_buf + sizeof(struct ceph_x_encrypt_header);
+ int ret;
+
+ /* no leading len */
+ ret = __ceph_x_decrypt(&au->session_key, challenge_buf,
+ challenge_buf_len);
+ if (ret < 0)
+ return ret;
+ if (ret < sizeof(*ch)) {
+ pr_err("bad size %d for ceph_x_authorize_challenge\n", ret);
+ return -EINVAL;
+ }
+
+ *server_challenge = le64_to_cpu(ch->server_challenge);
+ return 0;
+}
+
+static int ceph_x_add_authorizer_challenge(struct ceph_auth_client *ac,
+ struct ceph_authorizer *a,
+ void *challenge_buf,
+ int challenge_buf_len)
+{
+ struct ceph_x_authorizer *au = (void *)a;
+ u64 server_challenge;
+ int ret;
+
+ ret = decrypt_authorize_challenge(au, challenge_buf, challenge_buf_len,
+ &server_challenge);
+ if (ret) {
+ pr_err("failed to decrypt authorize challenge: %d", ret);
+ return ret;
+ }
+
+ ret = encrypt_authorizer(au, &server_challenge);
+ if (ret) {
+ pr_err("failed to encrypt authorizer w/ challenge: %d", ret);
+ return ret;
+ }
+
+ return 0;
+}
+
static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
struct ceph_authorizer *a)
{
@@ -637,8 +737,10 @@ static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
ret = ceph_x_decrypt(&au->session_key, &p, p + CEPHX_AU_ENC_BUF_LEN);
if (ret < 0)
return ret;
- if (ret != sizeof(*reply))
- return -EPERM;
+ if (ret < sizeof(*reply)) {
+ pr_err("bad size %d for ceph_x_authorize_reply\n", ret);
+ return -EINVAL;
+ }
if (au->nonce + 1 != le64_to_cpu(reply->nonce_plus_one))
ret = -EPERM;
@@ -704,26 +806,64 @@ static int calc_signature(struct ceph_x_authorizer *au, struct ceph_msg *msg,
__le64 *psig)
{
void *enc_buf = au->enc_buf;
- struct {
- __le32 len;
- __le32 header_crc;
- __le32 front_crc;
- __le32 middle_crc;
- __le32 data_crc;
- } __packed *sigblock = enc_buf + ceph_x_encrypt_offset();
int ret;
- sigblock->len = cpu_to_le32(4*sizeof(u32));
- sigblock->header_crc = msg->hdr.crc;
- sigblock->front_crc = msg->footer.front_crc;
- sigblock->middle_crc = msg->footer.middle_crc;
- sigblock->data_crc = msg->footer.data_crc;
- ret = ceph_x_encrypt(&au->session_key, enc_buf, CEPHX_AU_ENC_BUF_LEN,
- sizeof(*sigblock));
- if (ret < 0)
- return ret;
+ if (!CEPH_HAVE_FEATURE(msg->con->peer_features, CEPHX_V2)) {
+ struct {
+ __le32 len;
+ __le32 header_crc;
+ __le32 front_crc;
+ __le32 middle_crc;
+ __le32 data_crc;
+ } __packed *sigblock = enc_buf + ceph_x_encrypt_offset();
+
+ sigblock->len = cpu_to_le32(4*sizeof(u32));
+ sigblock->header_crc = msg->hdr.crc;
+ sigblock->front_crc = msg->footer.front_crc;
+ sigblock->middle_crc = msg->footer.middle_crc;
+ sigblock->data_crc = msg->footer.data_crc;
+
+ ret = ceph_x_encrypt(&au->session_key, enc_buf,
+ CEPHX_AU_ENC_BUF_LEN, sizeof(*sigblock));
+ if (ret < 0)
+ return ret;
+
+ *psig = *(__le64 *)(enc_buf + sizeof(u32));
+ } else {
+ struct {
+ __le32 header_crc;
+ __le32 front_crc;
+ __le32 front_len;
+ __le32 middle_crc;
+ __le32 middle_len;
+ __le32 data_crc;
+ __le32 data_len;
+ __le32 seq_lower_word;
+ } __packed *sigblock = enc_buf;
+ struct {
+ __le64 a, b, c, d;
+ } __packed *penc = enc_buf;
+ int ciphertext_len;
+
+ sigblock->header_crc = msg->hdr.crc;
+ sigblock->front_crc = msg->footer.front_crc;
+ sigblock->front_len = msg->hdr.front_len;
+ sigblock->middle_crc = msg->footer.middle_crc;
+ sigblock->middle_len = msg->hdr.middle_len;
+ sigblock->data_crc = msg->footer.data_crc;
+ sigblock->data_len = msg->hdr.data_len;
+ sigblock->seq_lower_word = *(__le32 *)&msg->hdr.seq;
+
+ /* no leading len, no ceph_x_encrypt_header */
+ ret = ceph_crypt(&au->session_key, true, enc_buf,
+ CEPHX_AU_ENC_BUF_LEN, sizeof(*sigblock),
+ &ciphertext_len);
+ if (ret)
+ return ret;
+
+ *psig = penc->a ^ penc->b ^ penc->c ^ penc->d;
+ }
- *psig = *(__le64 *)(enc_buf + sizeof(u32));
return 0;
}
@@ -778,6 +918,7 @@ static const struct ceph_auth_client_ops ceph_x_ops = {
.handle_reply = ceph_x_handle_reply,
.create_authorizer = ceph_x_create_authorizer,
.update_authorizer = ceph_x_update_authorizer,
+ .add_authorizer_challenge = ceph_x_add_authorizer_challenge,
.verify_authorizer_reply = ceph_x_verify_authorizer_reply,
.invalidate_authorizer = ceph_x_invalidate_authorizer,
.reset = ceph_x_reset,
@@ -823,5 +964,3 @@ out_nomem:
out:
return ret;
}
-
-
diff --git a/net/ceph/auth_x.h b/net/ceph/auth_x.h
index 454cb54568af..c03735f96df9 100644
--- a/net/ceph/auth_x.h
+++ b/net/ceph/auth_x.h
@@ -22,7 +22,7 @@ struct ceph_x_ticket_handler {
u64 secret_id;
struct ceph_buffer *ticket_blob;
- unsigned long renew_after, expires;
+ time64_t renew_after, expires;
};
#define CEPHX_AU_ENC_BUF_LEN 128 /* big enough for encrypted blob */
@@ -52,4 +52,3 @@ struct ceph_x_info {
int ceph_x_init(struct ceph_auth_client *ac);
#endif
-
diff --git a/net/ceph/auth_x_protocol.h b/net/ceph/auth_x_protocol.h
index 32c13d763b9a..24b0b74564d0 100644
--- a/net/ceph/auth_x_protocol.h
+++ b/net/ceph/auth_x_protocol.h
@@ -70,6 +70,13 @@ struct ceph_x_authorize_a {
struct ceph_x_authorize_b {
__u8 struct_v;
__le64 nonce;
+ __u8 have_challenge;
+ __le64 server_challenge_plus_one;
+} __attribute__ ((packed));
+
+struct ceph_x_authorize_challenge {
+ __u8 struct_v;
+ __le64 server_challenge;
} __attribute__ ((packed));
struct ceph_x_authorize_reply {
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 584fdbef2088..87afb9ec4c68 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -304,7 +304,7 @@ static int get_secret(struct ceph_crypto_key *dst, const char *name) {
struct ceph_crypto_key *ckey;
ukey = request_key(&key_type_ceph, name, NULL);
- if (!ukey || IS_ERR(ukey)) {
+ if (IS_ERR(ukey)) {
/* request_key errors don't map nicely to mount(2)
errors; don't even try, but still printk */
key_err = PTR_ERR(ukey);
@@ -379,7 +379,7 @@ ceph_parse_options(char *options, const char *dev_name,
/* parse mount options */
while ((c = strsep(&options, ",")) != NULL) {
- int token, intval, ret;
+ int token, intval;
if (!*c)
continue;
err = -EINVAL;
@@ -394,11 +394,10 @@ ceph_parse_options(char *options, const char *dev_name,
continue;
}
if (token < Opt_last_int) {
- ret = match_int(&argstr[0], &intval);
- if (ret < 0) {
- pr_err("bad mount option arg (not int) "
- "at '%s'\n", c);
- continue;
+ err = match_int(&argstr[0], &intval);
+ if (err < 0) {
+ pr_err("bad option arg (not int) at '%s'\n", c);
+ goto out;
}
dout("got int token %d val %d\n", token, intval);
} else if (token > Opt_last_int && token < Opt_last_string) {
diff --git a/net/ceph/cls_lock_client.c b/net/ceph/cls_lock_client.c
index 8d2032b2f225..2105a6eaa66c 100644
--- a/net/ceph/cls_lock_client.c
+++ b/net/ceph/cls_lock_client.c
@@ -32,7 +32,7 @@ int ceph_cls_lock(struct ceph_osd_client *osdc,
int desc_len = strlen(desc);
void *p, *end;
struct page *lock_op_page;
- struct timespec mtime;
+ struct timespec64 mtime;
int ret;
lock_op_buf_size = name_len + sizeof(__le32) +
@@ -63,7 +63,7 @@ int ceph_cls_lock(struct ceph_osd_client *osdc,
ceph_encode_string(&p, end, desc, desc_len);
/* only support infinite duration */
memset(&mtime, 0, sizeof(mtime));
- ceph_encode_timespec(p, &mtime);
+ ceph_encode_timespec64(p, &mtime);
p += sizeof(struct ceph_timespec);
ceph_encode_8(&p, flags);
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index 417df675c71b..3f323ed9df52 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -514,7 +514,7 @@ static int crush_choose_firstn(const struct crush_map *map,
in, work->work[-1-in->id],
x, r,
(choose_args ?
- &choose_args[-1-in->id] : 0),
+ &choose_args[-1-in->id] : NULL),
outpos);
if (item >= map->max_devices) {
dprintk(" bad item %d\n", item);
@@ -725,7 +725,7 @@ static void crush_choose_indep(const struct crush_map *map,
in, work->work[-1-in->id],
x, r,
(choose_args ?
- &choose_args[-1-in->id] : 0),
+ &choose_args[-1-in->id] : NULL),
outpos);
if (item >= map->max_devices) {
dprintk(" bad item %d\n", item);
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index c6413c360771..0a187196aeed 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -1417,11 +1417,11 @@ static void prepare_write_keepalive(struct ceph_connection *con)
dout("prepare_write_keepalive %p\n", con);
con_out_kvec_reset(con);
if (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2) {
- struct timespec now;
+ struct timespec64 now;
- ktime_get_real_ts(&now);
+ ktime_get_real_ts64(&now);
con_out_kvec_add(con, sizeof(tag_keepalive2), &tag_keepalive2);
- ceph_encode_timespec(&con->out_temp_keepalive2, &now);
+ ceph_encode_timespec64(&con->out_temp_keepalive2, &now);
con_out_kvec_add(con, sizeof(con->out_temp_keepalive2),
&con->out_temp_keepalive2);
} else {
@@ -1434,24 +1434,26 @@ static void prepare_write_keepalive(struct ceph_connection *con)
* Connection negotiation.
*/
-static struct ceph_auth_handshake *get_connect_authorizer(struct ceph_connection *con,
- int *auth_proto)
+static int get_connect_authorizer(struct ceph_connection *con)
{
struct ceph_auth_handshake *auth;
+ int auth_proto;
if (!con->ops->get_authorizer) {
+ con->auth = NULL;
con->out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN;
con->out_connect.authorizer_len = 0;
- return NULL;
+ return 0;
}
- auth = con->ops->get_authorizer(con, auth_proto, con->auth_retry);
+ auth = con->ops->get_authorizer(con, &auth_proto, con->auth_retry);
if (IS_ERR(auth))
- return auth;
+ return PTR_ERR(auth);
- con->auth_reply_buf = auth->authorizer_reply_buf;
- con->auth_reply_buf_len = auth->authorizer_reply_buf_len;
- return auth;
+ con->auth = auth;
+ con->out_connect.authorizer_protocol = cpu_to_le32(auth_proto);
+ con->out_connect.authorizer_len = cpu_to_le32(auth->authorizer_buf_len);
+ return 0;
}
/*
@@ -1467,12 +1469,22 @@ static void prepare_write_banner(struct ceph_connection *con)
con_flag_set(con, CON_FLAG_WRITE_PENDING);
}
+static void __prepare_write_connect(struct ceph_connection *con)
+{
+ con_out_kvec_add(con, sizeof(con->out_connect), &con->out_connect);
+ if (con->auth)
+ con_out_kvec_add(con, con->auth->authorizer_buf_len,
+ con->auth->authorizer_buf);
+
+ con->out_more = 0;
+ con_flag_set(con, CON_FLAG_WRITE_PENDING);
+}
+
static int prepare_write_connect(struct ceph_connection *con)
{
unsigned int global_seq = get_global_seq(con->msgr, 0);
int proto;
- int auth_proto;
- struct ceph_auth_handshake *auth;
+ int ret;
switch (con->peer_name.type) {
case CEPH_ENTITY_TYPE_MON:
@@ -1499,24 +1511,11 @@ static int prepare_write_connect(struct ceph_connection *con)
con->out_connect.protocol_version = cpu_to_le32(proto);
con->out_connect.flags = 0;
- auth_proto = CEPH_AUTH_UNKNOWN;
- auth = get_connect_authorizer(con, &auth_proto);
- if (IS_ERR(auth))
- return PTR_ERR(auth);
-
- con->out_connect.authorizer_protocol = cpu_to_le32(auth_proto);
- con->out_connect.authorizer_len = auth ?
- cpu_to_le32(auth->authorizer_buf_len) : 0;
-
- con_out_kvec_add(con, sizeof (con->out_connect),
- &con->out_connect);
- if (auth && auth->authorizer_buf_len)
- con_out_kvec_add(con, auth->authorizer_buf_len,
- auth->authorizer_buf);
-
- con->out_more = 0;
- con_flag_set(con, CON_FLAG_WRITE_PENDING);
+ ret = get_connect_authorizer(con);
+ if (ret)
+ return ret;
+ __prepare_write_connect(con);
return 0;
}
@@ -1781,11 +1780,21 @@ static int read_partial_connect(struct ceph_connection *con)
if (ret <= 0)
goto out;
- size = le32_to_cpu(con->in_reply.authorizer_len);
- end += size;
- ret = read_partial(con, end, size, con->auth_reply_buf);
- if (ret <= 0)
- goto out;
+ if (con->auth) {
+ size = le32_to_cpu(con->in_reply.authorizer_len);
+ if (size > con->auth->authorizer_reply_buf_len) {
+ pr_err("authorizer reply too big: %d > %zu\n", size,
+ con->auth->authorizer_reply_buf_len);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ end += size;
+ ret = read_partial(con, end, size,
+ con->auth->authorizer_reply_buf);
+ if (ret <= 0)
+ goto out;
+ }
dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n",
con, (int)con->in_reply.tag,
@@ -1793,7 +1802,6 @@ static int read_partial_connect(struct ceph_connection *con)
le32_to_cpu(con->in_reply.global_seq));
out:
return ret;
-
}
/*
@@ -2076,12 +2084,27 @@ static int process_connect(struct ceph_connection *con)
dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
- if (con->auth_reply_buf) {
+ if (con->auth) {
/*
* Any connection that defines ->get_authorizer()
- * should also define ->verify_authorizer_reply().
+ * should also define ->add_authorizer_challenge() and
+ * ->verify_authorizer_reply().
+ *
* See get_connect_authorizer().
*/
+ if (con->in_reply.tag == CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER) {
+ ret = con->ops->add_authorizer_challenge(
+ con, con->auth->authorizer_reply_buf,
+ le32_to_cpu(con->in_reply.authorizer_len));
+ if (ret < 0)
+ return ret;
+
+ con_out_kvec_reset(con);
+ __prepare_write_connect(con);
+ prepare_read_connect(con);
+ return 0;
+ }
+
ret = con->ops->verify_authorizer_reply(con);
if (ret < 0) {
con->error_msg = "bad authorize reply";
@@ -2555,7 +2578,7 @@ static int read_keepalive_ack(struct ceph_connection *con)
int ret = read_partial(con, size, size, &ceph_ts);
if (ret <= 0)
return ret;
- ceph_decode_timespec(&con->last_keepalive_ack, &ceph_ts);
+ ceph_decode_timespec64(&con->last_keepalive_ack, &ceph_ts);
prepare_read_tag(con);
return 1;
}
@@ -3223,12 +3246,12 @@ bool ceph_con_keepalive_expired(struct ceph_connection *con,
{
if (interval > 0 &&
(con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2)) {
- struct timespec now;
- struct timespec ts;
- ktime_get_real_ts(&now);
- jiffies_to_timespec(interval, &ts);
- ts = timespec_add(con->last_keepalive_ack, ts);
- return timespec_compare(&now, &ts) >= 0;
+ struct timespec64 now;
+ struct timespec64 ts;
+ ktime_get_real_ts64(&now);
+ jiffies_to_timespec64(interval, &ts);
+ ts = timespec64_add(con->last_keepalive_ack, ts);
+ return timespec64_compare(&now, &ts) >= 0;
}
return false;
}
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index d7a7a2330ef7..18deb3d889c4 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -1249,7 +1249,7 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
if (monc->client->extra_mon_dispatch &&
monc->client->extra_mon_dispatch(monc->client, msg) == 0)
break;
-
+
pr_err("received unknown message type %d %s\n", type,
ceph_msg_type_name(type));
}
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index a00c74f1154e..60934bd8796c 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1978,7 +1978,7 @@ static void encode_request_partial(struct ceph_osd_request *req,
p += sizeof(struct ceph_blkin_trace_info);
ceph_encode_32(&p, 0); /* client_inc, always 0 */
- ceph_encode_timespec(p, &req->r_mtime);
+ ceph_encode_timespec64(p, &req->r_mtime);
p += sizeof(struct ceph_timespec);
encode_oloc(&p, end, &req->r_t.target_oloc);
@@ -4512,7 +4512,7 @@ ceph_osdc_watch(struct ceph_osd_client *osdc,
ceph_oid_copy(&lreq->t.base_oid, oid);
ceph_oloc_copy(&lreq->t.base_oloc, oloc);
lreq->t.flags = CEPH_OSD_FLAG_WRITE;
- ktime_get_real_ts(&lreq->mtime);
+ ktime_get_real_ts64(&lreq->mtime);
lreq->reg_req = alloc_linger_request(lreq);
if (!lreq->reg_req) {
@@ -4570,7 +4570,7 @@ int ceph_osdc_unwatch(struct ceph_osd_client *osdc,
ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
req->r_flags = CEPH_OSD_FLAG_WRITE;
- ktime_get_real_ts(&req->r_mtime);
+ ktime_get_real_ts64(&req->r_mtime);
osd_req_op_watch_init(req, 0, lreq->linger_id,
CEPH_OSD_WATCH_OP_UNWATCH);
@@ -4591,7 +4591,7 @@ EXPORT_SYMBOL(ceph_osdc_unwatch);
static int osd_req_op_notify_ack_init(struct ceph_osd_request *req, int which,
u64 notify_id, u64 cookie, void *payload,
- size_t payload_len)
+ u32 payload_len)
{
struct ceph_osd_req_op *op;
struct ceph_pagelist *pl;
@@ -4628,7 +4628,7 @@ int ceph_osdc_notify_ack(struct ceph_osd_client *osdc,
u64 notify_id,
u64 cookie,
void *payload,
- size_t payload_len)
+ u32 payload_len)
{
struct ceph_osd_request *req;
int ret;
@@ -4661,7 +4661,7 @@ EXPORT_SYMBOL(ceph_osdc_notify_ack);
static int osd_req_op_notify_init(struct ceph_osd_request *req, int which,
u64 cookie, u32 prot_ver, u32 timeout,
- void *payload, size_t payload_len)
+ void *payload, u32 payload_len)
{
struct ceph_osd_req_op *op;
struct ceph_pagelist *pl;
@@ -4701,7 +4701,7 @@ int ceph_osdc_notify(struct ceph_osd_client *osdc,
struct ceph_object_id *oid,
struct ceph_object_locator *oloc,
void *payload,
- size_t payload_len,
+ u32 payload_len,
u32 timeout,
struct page ***preply_pages,
size_t *preply_len)
@@ -5136,7 +5136,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
struct ceph_snap_context *snapc,
u64 off, u64 len,
u32 truncate_seq, u64 truncate_size,
- struct timespec *mtime,
+ struct timespec64 *mtime,
struct page **pages, int num_pages)
{
struct ceph_osd_request *req;
@@ -5393,6 +5393,16 @@ static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con,
return auth;
}
+static int add_authorizer_challenge(struct ceph_connection *con,
+ void *challenge_buf, int challenge_buf_len)
+{
+ struct ceph_osd *o = con->private;
+ struct ceph_osd_client *osdc = o->o_osdc;
+ struct ceph_auth_client *ac = osdc->client->monc.auth;
+
+ return ceph_auth_add_authorizer_challenge(ac, o->o_auth.authorizer,
+ challenge_buf, challenge_buf_len);
+}
static int verify_authorizer_reply(struct ceph_connection *con)
{
@@ -5442,6 +5452,7 @@ static const struct ceph_connection_operations osd_con_ops = {
.put = put_osd_con,
.dispatch = dispatch,
.get_authorizer = get_authorizer,
+ .add_authorizer_challenge = add_authorizer_challenge,
.verify_authorizer_reply = verify_authorizer_reply,
.invalidate_authorizer = invalidate_authorizer,
.alloc_msg = alloc_msg,
diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c
index e560d3975f41..d3736f5bffec 100644
--- a/net/ceph/pagevec.c
+++ b/net/ceph/pagevec.c
@@ -197,4 +197,3 @@ void ceph_zero_page_vector_range(int off, int len, struct page **pages)
}
}
EXPORT_SYMBOL(ceph_zero_page_vector_range);
-
diff --git a/net/compat.c b/net/compat.c
index 7242cce5631b..47a614b370cd 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -466,8 +466,7 @@ int compat_sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
ctv = (struct compat_timeval __user *) userstamp;
err = -ENOENT;
- if (!sock_flag(sk, SOCK_TIMESTAMP))
- sock_enable_timestamp(sk, SOCK_TIMESTAMP);
+ sock_enable_timestamp(sk, SOCK_TIMESTAMP);
tv = ktime_to_timeval(sk->sk_stamp);
if (tv.tv_sec == -1)
return err;
@@ -494,8 +493,7 @@ int compat_sock_get_timestampns(struct sock *sk, struct timespec __user *usersta
ctv = (struct compat_timespec __user *) userstamp;
err = -ENOENT;
- if (!sock_flag(sk, SOCK_TIMESTAMP))
- sock_enable_timestamp(sk, SOCK_TIMESTAMP);
+ sock_enable_timestamp(sk, SOCK_TIMESTAMP);
ts = ktime_to_timespec(sk->sk_stamp);
if (ts.tv_sec == -1)
return err;
@@ -814,21 +812,21 @@ COMPAT_SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, buf, compat_size_t, len
static int __compat_sys_recvmmsg(int fd, struct compat_mmsghdr __user *mmsg,
unsigned int vlen, unsigned int flags,
- struct compat_timespec __user *timeout)
+ struct old_timespec32 __user *timeout)
{
int datagrams;
- struct timespec ktspec;
+ struct timespec64 ktspec;
if (timeout == NULL)
return __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen,
flags | MSG_CMSG_COMPAT, NULL);
- if (compat_get_timespec(&ktspec, timeout))
+ if (compat_get_timespec64(&ktspec, timeout))
return -EFAULT;
datagrams = __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen,
flags | MSG_CMSG_COMPAT, &ktspec);
- if (datagrams > 0 && compat_put_timespec(&ktspec, timeout))
+ if (datagrams > 0 && compat_put_timespec64(&ktspec, timeout))
datagrams = -EFAULT;
return datagrams;
@@ -836,7 +834,7 @@ static int __compat_sys_recvmmsg(int fd, struct compat_mmsghdr __user *mmsg,
COMPAT_SYSCALL_DEFINE5(recvmmsg, int, fd, struct compat_mmsghdr __user *, mmsg,
unsigned int, vlen, unsigned int, flags,
- struct compat_timespec __user *, timeout)
+ struct old_timespec32 __user *, timeout)
{
return __compat_sys_recvmmsg(fd, mmsg, vlen, flags, timeout);
}
diff --git a/net/core/Makefile b/net/core/Makefile
index 80175e6a2eb8..fccd31e0e7f7 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -16,6 +16,7 @@ obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \
obj-y += net-sysfs.o
obj-$(CONFIG_PAGE_POOL) += page_pool.o
obj-$(CONFIG_PROC_FS) += net-procfs.o
+obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o
obj-$(CONFIG_NET_PKTGEN) += pktgen.o
obj-$(CONFIG_NETPOLL) += netpoll.o
obj-$(CONFIG_FIB_RULES) += fib_rules.o
@@ -27,6 +28,7 @@ obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
obj-$(CONFIG_LWTUNNEL) += lwtunnel.o
obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o
+obj-$(CONFIG_BPF_STREAM_PARSER) += sock_map.o
obj-$(CONFIG_DST_CACHE) += dst_cache.o
obj-$(CONFIG_HWBM) += hwbm.o
obj-$(CONFIG_NET_DEVLINK) += devlink.o
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 9938952c5c78..6a034eb538a1 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -837,7 +837,7 @@ __poll_t datagram_poll(struct file *file, struct socket *sock,
struct sock *sk = sock->sk;
__poll_t mask;
- sock_poll_wait(file, sk_sleep(sk), wait);
+ sock_poll_wait(file, sock, wait);
mask = 0;
/* exceptional events? */
diff --git a/net/core/dev.c b/net/core/dev.c
index 559a91271f82..022ad73d6253 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -93,7 +93,6 @@
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
-#include <linux/notifier.h>
#include <linux/skbuff.h>
#include <linux/bpf.h>
#include <linux/bpf_trace.h>
@@ -149,7 +148,6 @@
#include "net-sysfs.h"
-/* Instead of increasing this, you should create a hash table. */
#define MAX_GRO_SKBS 8
/* This should be increased if a protocol with a bigger head is added. */
@@ -1754,6 +1752,28 @@ int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
}
EXPORT_SYMBOL(call_netdevice_notifiers);
+/**
+ * call_netdevice_notifiers_mtu - call all network notifier blocks
+ * @val: value passed unmodified to notifier function
+ * @dev: net_device pointer passed unmodified to notifier function
+ * @arg: additional u32 argument passed to the notifier function
+ *
+ * Call all network notifier blocks. Parameters and return value
+ * are as for raw_notifier_call_chain().
+ */
+static int call_netdevice_notifiers_mtu(unsigned long val,
+ struct net_device *dev, u32 arg)
+{
+ struct netdev_notifier_info_ext info = {
+ .info.dev = dev,
+ .ext.mtu = arg,
+ };
+
+ BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0);
+
+ return call_netdevice_notifiers_info(val, &info.info);
+}
+
#ifdef CONFIG_NET_INGRESS
static DEFINE_STATIC_KEY_FALSE(ingress_needed_key);
@@ -1956,6 +1976,17 @@ static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
return false;
}
+/**
+ * dev_nit_active - return true if any network interface taps are in use
+ *
+ * @dev: network device to check for the presence of taps
+ */
+bool dev_nit_active(struct net_device *dev)
+{
+ return !list_empty(&ptype_all) || !list_empty(&dev->ptype_all);
+}
+EXPORT_SYMBOL_GPL(dev_nit_active);
+
/*
* Support routine. Sends outgoing frames to any network
* taps currently in use.
@@ -1971,6 +2002,9 @@ void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
rcu_read_lock();
again:
list_for_each_entry_rcu(ptype, ptype_list, list) {
+ if (ptype->ignore_outgoing)
+ continue;
+
/* Never send packets back to the socket
* they originated from - MvS (miquels@drinkel.ow.org)
*/
@@ -2068,11 +2102,13 @@ int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
int i;
+ /* walk through the TCs and see if it falls into any of them */
for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
if ((txq - tc->offset) < tc->count)
return i;
}
+ /* didn't find it, just return -1 to indicate no match */
return -1;
}
@@ -2081,6 +2117,10 @@ int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
EXPORT_SYMBOL(netdev_txq_to_tc);
#ifdef CONFIG_XPS
+struct static_key xps_needed __read_mostly;
+EXPORT_SYMBOL(xps_needed);
+struct static_key xps_rxqs_needed __read_mostly;
+EXPORT_SYMBOL(xps_rxqs_needed);
static DEFINE_MUTEX(xps_map_mutex);
#define xmap_dereference(P) \
rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
@@ -2092,7 +2132,7 @@ static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
int pos;
if (dev_maps)
- map = xmap_dereference(dev_maps->cpu_map[tci]);
+ map = xmap_dereference(dev_maps->attr_map[tci]);
if (!map)
return false;
@@ -2105,7 +2145,7 @@ static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
break;
}
- RCU_INIT_POINTER(dev_maps->cpu_map[tci], NULL);
+ RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
kfree_rcu(map, rcu);
return false;
}
@@ -2135,34 +2175,71 @@ static bool remove_xps_queue_cpu(struct net_device *dev,
return active;
}
+static void clean_xps_maps(struct net_device *dev, const unsigned long *mask,
+ struct xps_dev_maps *dev_maps, unsigned int nr_ids,
+ u16 offset, u16 count, bool is_rxqs_map)
+{
+ bool active = false;
+ int i, j;
+
+ for (j = -1; j = netif_attrmask_next(j, mask, nr_ids),
+ j < nr_ids;)
+ active |= remove_xps_queue_cpu(dev, dev_maps, j, offset,
+ count);
+ if (!active) {
+ if (is_rxqs_map) {
+ RCU_INIT_POINTER(dev->xps_rxqs_map, NULL);
+ } else {
+ RCU_INIT_POINTER(dev->xps_cpus_map, NULL);
+
+ for (i = offset + (count - 1); count--; i--)
+ netdev_queue_numa_node_write(
+ netdev_get_tx_queue(dev, i),
+ NUMA_NO_NODE);
+ }
+ kfree_rcu(dev_maps, rcu);
+ }
+}
+
static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
u16 count)
{
+ const unsigned long *possible_mask = NULL;
struct xps_dev_maps *dev_maps;
- int cpu, i;
- bool active = false;
+ unsigned int nr_ids;
+ if (!static_key_false(&xps_needed))
+ return;
+
+ cpus_read_lock();
mutex_lock(&xps_map_mutex);
- dev_maps = xmap_dereference(dev->xps_maps);
+ if (static_key_false(&xps_rxqs_needed)) {
+ dev_maps = xmap_dereference(dev->xps_rxqs_map);
+ if (dev_maps) {
+ nr_ids = dev->num_rx_queues;
+ clean_xps_maps(dev, possible_mask, dev_maps, nr_ids,
+ offset, count, true);
+ }
+ }
+
+ dev_maps = xmap_dereference(dev->xps_cpus_map);
if (!dev_maps)
goto out_no_maps;
- for_each_possible_cpu(cpu)
- active |= remove_xps_queue_cpu(dev, dev_maps, cpu,
- offset, count);
-
- if (!active) {
- RCU_INIT_POINTER(dev->xps_maps, NULL);
- kfree_rcu(dev_maps, rcu);
- }
-
- for (i = offset + (count - 1); count--; i--)
- netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
- NUMA_NO_NODE);
+ if (num_possible_cpus() > 1)
+ possible_mask = cpumask_bits(cpu_possible_mask);
+ nr_ids = nr_cpu_ids;
+ clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset, count,
+ false);
out_no_maps:
+ if (static_key_enabled(&xps_rxqs_needed))
+ static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
+
+ static_key_slow_dec_cpuslocked(&xps_needed);
mutex_unlock(&xps_map_mutex);
+ cpus_read_unlock();
}
static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
@@ -2170,8 +2247,8 @@ static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
}
-static struct xps_map *expand_xps_map(struct xps_map *map,
- int cpu, u16 index)
+static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
+ u16 index, bool is_rxqs_map)
{
struct xps_map *new_map;
int alloc_len = XPS_MIN_MAP_ALLOC;
@@ -2183,7 +2260,7 @@ static struct xps_map *expand_xps_map(struct xps_map *map,
return map;
}
- /* Need to add queue to this CPU's existing map */
+ /* Need to add tx-queue to this CPU's/rx-queue's existing map */
if (map) {
if (pos < map->alloc_len)
return map;
@@ -2191,9 +2268,14 @@ static struct xps_map *expand_xps_map(struct xps_map *map,
alloc_len = map->alloc_len * 2;
}
- /* Need to allocate new map to store queue on this CPU's map */
- new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
- cpu_to_node(cpu));
+ /* Need to allocate new map to store tx-queue on this CPU's/rx-queue's
+ * map
+ */
+ if (is_rxqs_map)
+ new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL);
+ else
+ new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
+ cpu_to_node(attr_index));
if (!new_map)
return NULL;
@@ -2205,32 +2287,53 @@ static struct xps_map *expand_xps_map(struct xps_map *map,
return new_map;
}
-int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
- u16 index)
+/* Must be called under cpus_read_lock */
+int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
+ u16 index, bool is_rxqs_map)
{
+ const unsigned long *online_mask = NULL, *possible_mask = NULL;
struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
- int i, cpu, tci, numa_node_id = -2;
+ int i, j, tci, numa_node_id = -2;
int maps_sz, num_tc = 1, tc = 0;
struct xps_map *map, *new_map;
bool active = false;
+ unsigned int nr_ids;
if (dev->num_tc) {
+ /* Do not allow XPS on subordinate device directly */
num_tc = dev->num_tc;
+ if (num_tc < 0)
+ return -EINVAL;
+
+ /* If queue belongs to subordinate dev use its map */
+ dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
+
tc = netdev_txq_to_tc(dev, index);
if (tc < 0)
return -EINVAL;
}
- maps_sz = XPS_DEV_MAPS_SIZE(num_tc);
- if (maps_sz < L1_CACHE_BYTES)
- maps_sz = L1_CACHE_BYTES;
-
mutex_lock(&xps_map_mutex);
+ if (is_rxqs_map) {
+ maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
+ dev_maps = xmap_dereference(dev->xps_rxqs_map);
+ nr_ids = dev->num_rx_queues;
+ } else {
+ maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
+ if (num_possible_cpus() > 1) {
+ online_mask = cpumask_bits(cpu_online_mask);
+ possible_mask = cpumask_bits(cpu_possible_mask);
+ }
+ dev_maps = xmap_dereference(dev->xps_cpus_map);
+ nr_ids = nr_cpu_ids;
+ }
- dev_maps = xmap_dereference(dev->xps_maps);
+ if (maps_sz < L1_CACHE_BYTES)
+ maps_sz = L1_CACHE_BYTES;
/* allocate memory for queue storage */
- for_each_cpu_and(cpu, cpu_online_mask, mask) {
+ for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
+ j < nr_ids;) {
if (!new_dev_maps)
new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
if (!new_dev_maps) {
@@ -2238,73 +2341,85 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
return -ENOMEM;
}
- tci = cpu * num_tc + tc;
- map = dev_maps ? xmap_dereference(dev_maps->cpu_map[tci]) :
+ tci = j * num_tc + tc;
+ map = dev_maps ? xmap_dereference(dev_maps->attr_map[tci]) :
NULL;
- map = expand_xps_map(map, cpu, index);
+ map = expand_xps_map(map, j, index, is_rxqs_map);
if (!map)
goto error;
- RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
+ RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
}
if (!new_dev_maps)
goto out_no_new_maps;
- for_each_possible_cpu(cpu) {
+ static_key_slow_inc_cpuslocked(&xps_needed);
+ if (is_rxqs_map)
+ static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
+
+ for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
+ j < nr_ids;) {
/* copy maps belonging to foreign traffic classes */
- for (i = tc, tci = cpu * num_tc; dev_maps && i--; tci++) {
+ for (i = tc, tci = j * num_tc; dev_maps && i--; tci++) {
/* fill in the new device map from the old device map */
- map = xmap_dereference(dev_maps->cpu_map[tci]);
- RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
+ map = xmap_dereference(dev_maps->attr_map[tci]);
+ RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
}
/* We need to explicitly update tci as prevous loop
* could break out early if dev_maps is NULL.
*/
- tci = cpu * num_tc + tc;
+ tci = j * num_tc + tc;
- if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
- /* add queue to CPU maps */
+ if (netif_attr_test_mask(j, mask, nr_ids) &&
+ netif_attr_test_online(j, online_mask, nr_ids)) {
+ /* add tx-queue to CPU/rx-queue maps */
int pos = 0;
- map = xmap_dereference(new_dev_maps->cpu_map[tci]);
+ map = xmap_dereference(new_dev_maps->attr_map[tci]);
while ((pos < map->len) && (map->queues[pos] != index))
pos++;
if (pos == map->len)
map->queues[map->len++] = index;
#ifdef CONFIG_NUMA
- if (numa_node_id == -2)
- numa_node_id = cpu_to_node(cpu);
- else if (numa_node_id != cpu_to_node(cpu))
- numa_node_id = -1;
+ if (!is_rxqs_map) {
+ if (numa_node_id == -2)
+ numa_node_id = cpu_to_node(j);
+ else if (numa_node_id != cpu_to_node(j))
+ numa_node_id = -1;
+ }
#endif
} else if (dev_maps) {
/* fill in the new device map from the old device map */
- map = xmap_dereference(dev_maps->cpu_map[tci]);
- RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
+ map = xmap_dereference(dev_maps->attr_map[tci]);
+ RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
}
/* copy maps belonging to foreign traffic classes */
for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
/* fill in the new device map from the old device map */
- map = xmap_dereference(dev_maps->cpu_map[tci]);
- RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
+ map = xmap_dereference(dev_maps->attr_map[tci]);
+ RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
}
}
- rcu_assign_pointer(dev->xps_maps, new_dev_maps);
+ if (is_rxqs_map)
+ rcu_assign_pointer(dev->xps_rxqs_map, new_dev_maps);
+ else
+ rcu_assign_pointer(dev->xps_cpus_map, new_dev_maps);
/* Cleanup old maps */
if (!dev_maps)
goto out_no_old_maps;
- for_each_possible_cpu(cpu) {
- for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
- new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
- map = xmap_dereference(dev_maps->cpu_map[tci]);
+ for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
+ j < nr_ids;) {
+ for (i = num_tc, tci = j * num_tc; i--; tci++) {
+ new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
+ map = xmap_dereference(dev_maps->attr_map[tci]);
if (map && map != new_map)
kfree_rcu(map, rcu);
}
@@ -2317,19 +2432,23 @@ out_no_old_maps:
active = true;
out_no_new_maps:
- /* update Tx queue numa node */
- netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
- (numa_node_id >= 0) ? numa_node_id :
- NUMA_NO_NODE);
+ if (!is_rxqs_map) {
+ /* update Tx queue numa node */
+ netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
+ (numa_node_id >= 0) ?
+ numa_node_id : NUMA_NO_NODE);
+ }
if (!dev_maps)
goto out_no_maps;
- /* removes queue from unused CPUs */
- for_each_possible_cpu(cpu) {
- for (i = tc, tci = cpu * num_tc; i--; tci++)
+ /* removes tx-queue from unused CPUs/rx-queues */
+ for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
+ j < nr_ids;) {
+ for (i = tc, tci = j * num_tc; i--; tci++)
active |= remove_xps_queue(dev_maps, tci, index);
- if (!cpumask_test_cpu(cpu, mask) || !cpu_online(cpu))
+ if (!netif_attr_test_mask(j, mask, nr_ids) ||
+ !netif_attr_test_online(j, online_mask, nr_ids))
active |= remove_xps_queue(dev_maps, tci, index);
for (i = num_tc - tc, tci++; --i; tci++)
active |= remove_xps_queue(dev_maps, tci, index);
@@ -2337,7 +2456,10 @@ out_no_new_maps:
/* free map if not active */
if (!active) {
- RCU_INIT_POINTER(dev->xps_maps, NULL);
+ if (is_rxqs_map)
+ RCU_INIT_POINTER(dev->xps_rxqs_map, NULL);
+ else
+ RCU_INIT_POINTER(dev->xps_cpus_map, NULL);
kfree_rcu(dev_maps, rcu);
}
@@ -2347,11 +2469,12 @@ out_no_maps:
return 0;
error:
/* remove any maps that we added */
- for_each_possible_cpu(cpu) {
- for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
- new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
+ for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
+ j < nr_ids;) {
+ for (i = num_tc, tci = j * num_tc; i--; tci++) {
+ new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
map = dev_maps ?
- xmap_dereference(dev_maps->cpu_map[tci]) :
+ xmap_dereference(dev_maps->attr_map[tci]) :
NULL;
if (new_map && new_map != map)
kfree(new_map);
@@ -2363,14 +2486,41 @@ error:
kfree(new_dev_maps);
return -ENOMEM;
}
+EXPORT_SYMBOL_GPL(__netif_set_xps_queue);
+
+int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
+ u16 index)
+{
+ int ret;
+
+ cpus_read_lock();
+ ret = __netif_set_xps_queue(dev, cpumask_bits(mask), index, false);
+ cpus_read_unlock();
+
+ return ret;
+}
EXPORT_SYMBOL(netif_set_xps_queue);
#endif
+static void netdev_unbind_all_sb_channels(struct net_device *dev)
+{
+ struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
+
+ /* Unbind any subordinate channels */
+ while (txq-- != &dev->_tx[0]) {
+ if (txq->sb_dev)
+ netdev_unbind_sb_channel(dev, txq->sb_dev);
+ }
+}
+
void netdev_reset_tc(struct net_device *dev)
{
#ifdef CONFIG_XPS
netif_reset_xps_queues_gt(dev, 0);
#endif
+ netdev_unbind_all_sb_channels(dev);
+
+ /* Reset TC configuration of device */
dev->num_tc = 0;
memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
@@ -2399,11 +2549,77 @@ int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
#ifdef CONFIG_XPS
netif_reset_xps_queues_gt(dev, 0);
#endif
+ netdev_unbind_all_sb_channels(dev);
+
dev->num_tc = num_tc;
return 0;
}
EXPORT_SYMBOL(netdev_set_num_tc);
+void netdev_unbind_sb_channel(struct net_device *dev,
+ struct net_device *sb_dev)
+{
+ struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
+
+#ifdef CONFIG_XPS
+ netif_reset_xps_queues_gt(sb_dev, 0);
+#endif
+ memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq));
+ memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map));
+
+ while (txq-- != &dev->_tx[0]) {
+ if (txq->sb_dev == sb_dev)
+ txq->sb_dev = NULL;
+ }
+}
+EXPORT_SYMBOL(netdev_unbind_sb_channel);
+
+int netdev_bind_sb_channel_queue(struct net_device *dev,
+ struct net_device *sb_dev,
+ u8 tc, u16 count, u16 offset)
+{
+ /* Make certain the sb_dev and dev are already configured */
+ if (sb_dev->num_tc >= 0 || tc >= dev->num_tc)
+ return -EINVAL;
+
+ /* We cannot hand out queues we don't have */
+ if ((offset + count) > dev->real_num_tx_queues)
+ return -EINVAL;
+
+ /* Record the mapping */
+ sb_dev->tc_to_txq[tc].count = count;
+ sb_dev->tc_to_txq[tc].offset = offset;
+
+ /* Provide a way for Tx queue to find the tc_to_txq map or
+ * XPS map for itself.
+ */
+ while (count--)
+ netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev;
+
+ return 0;
+}
+EXPORT_SYMBOL(netdev_bind_sb_channel_queue);
+
+int netdev_set_sb_channel(struct net_device *dev, u16 channel)
+{
+ /* Do not use a multiqueue device to represent a subordinate channel */
+ if (netif_is_multiqueue(dev))
+ return -ENODEV;
+
+ /* We allow channels 1 - 32767 to be used for subordinate channels.
+ * Channel 0 is meant to be "native" mode and used only to represent
+ * the main root device. We allow writing 0 to reset the device back
+ * to normal mode after being used as a subordinate channel.
+ */
+ if (channel > S16_MAX)
+ return -EINVAL;
+
+ dev->num_tc = -channel;
+
+ return 0;
+}
+EXPORT_SYMBOL(netdev_set_sb_channel);
+
/*
* Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
* greater than real_num_tx_queues stale skbs on the qdisc must be flushed.
@@ -2615,24 +2831,26 @@ EXPORT_SYMBOL(netif_device_attach);
* Returns a Tx hash based on the given packet descriptor a Tx queues' number
* to be used as a distribution range.
*/
-static u16 skb_tx_hash(const struct net_device *dev, struct sk_buff *skb)
+static u16 skb_tx_hash(const struct net_device *dev,
+ const struct net_device *sb_dev,
+ struct sk_buff *skb)
{
u32 hash;
u16 qoffset = 0;
u16 qcount = dev->real_num_tx_queues;
+ if (dev->num_tc) {
+ u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
+
+ qoffset = sb_dev->tc_to_txq[tc].offset;
+ qcount = sb_dev->tc_to_txq[tc].count;
+ }
+
if (skb_rx_queue_recorded(skb)) {
hash = skb_get_rx_queue(skb);
while (unlikely(hash >= qcount))
hash -= qcount;
- return hash;
- }
-
- if (dev->num_tc) {
- u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
-
- qoffset = dev->tc_to_txq[tc].offset;
- qcount = dev->tc_to_txq[tc].count;
+ return hash + qoffset;
}
return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
@@ -3026,7 +3244,7 @@ static int xmit_one(struct sk_buff *skb, struct net_device *dev,
unsigned int len;
int rc;
- if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
+ if (dev_nit_active(dev))
dev_queue_xmit_nit(skb, dev);
len = skb->len;
@@ -3046,7 +3264,7 @@ struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *de
while (skb) {
struct sk_buff *next = skb->next;
- skb->next = NULL;
+ skb_mark_not_on_list(skb);
rc = xmit_one(skb, dev, txq, next != NULL);
if (unlikely(!dev_xmit_complete(rc))) {
skb->next = next;
@@ -3146,7 +3364,7 @@ struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *d
for (; skb != NULL; skb = next) {
next = skb->next;
- skb->next = NULL;
+ skb_mark_not_on_list(skb);
/* in case skb wont be segmented, point to itself */
skb->prev = skb;
@@ -3376,32 +3594,64 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
}
#endif /* CONFIG_NET_EGRESS */
-static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
+#ifdef CONFIG_XPS
+static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
+ struct xps_dev_maps *dev_maps, unsigned int tci)
+{
+ struct xps_map *map;
+ int queue_index = -1;
+
+ if (dev->num_tc) {
+ tci *= dev->num_tc;
+ tci += netdev_get_prio_tc_map(dev, skb->priority);
+ }
+
+ map = rcu_dereference(dev_maps->attr_map[tci]);
+ if (map) {
+ if (map->len == 1)
+ queue_index = map->queues[0];
+ else
+ queue_index = map->queues[reciprocal_scale(
+ skb_get_hash(skb), map->len)];
+ if (unlikely(queue_index >= dev->real_num_tx_queues))
+ queue_index = -1;
+ }
+ return queue_index;
+}
+#endif
+
+static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev,
+ struct sk_buff *skb)
{
#ifdef CONFIG_XPS
struct xps_dev_maps *dev_maps;
- struct xps_map *map;
+ struct sock *sk = skb->sk;
int queue_index = -1;
+ if (!static_key_false(&xps_needed))
+ return -1;
+
rcu_read_lock();
- dev_maps = rcu_dereference(dev->xps_maps);
+ if (!static_key_false(&xps_rxqs_needed))
+ goto get_cpus_map;
+
+ dev_maps = rcu_dereference(sb_dev->xps_rxqs_map);
if (dev_maps) {
- unsigned int tci = skb->sender_cpu - 1;
+ int tci = sk_rx_queue_get(sk);
- if (dev->num_tc) {
- tci *= dev->num_tc;
- tci += netdev_get_prio_tc_map(dev, skb->priority);
- }
+ if (tci >= 0 && tci < dev->num_rx_queues)
+ queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
+ tci);
+ }
- map = rcu_dereference(dev_maps->cpu_map[tci]);
- if (map) {
- if (map->len == 1)
- queue_index = map->queues[0];
- else
- queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
- map->len)];
- if (unlikely(queue_index >= dev->real_num_tx_queues))
- queue_index = -1;
+get_cpus_map:
+ if (queue_index < 0) {
+ dev_maps = rcu_dereference(sb_dev->xps_cpus_map);
+ if (dev_maps) {
+ unsigned int tci = skb->sender_cpu - 1;
+
+ queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
+ tci);
}
}
rcu_read_unlock();
@@ -3412,17 +3662,36 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
#endif
}
-static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
+u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
+ struct net_device *sb_dev,
+ select_queue_fallback_t fallback)
+{
+ return 0;
+}
+EXPORT_SYMBOL(dev_pick_tx_zero);
+
+u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
+ struct net_device *sb_dev,
+ select_queue_fallback_t fallback)
+{
+ return (u16)raw_smp_processor_id() % dev->real_num_tx_queues;
+}
+EXPORT_SYMBOL(dev_pick_tx_cpu_id);
+
+static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
+ struct net_device *sb_dev)
{
struct sock *sk = skb->sk;
int queue_index = sk_tx_queue_get(sk);
+ sb_dev = sb_dev ? : dev;
+
if (queue_index < 0 || skb->ooo_okay ||
queue_index >= dev->real_num_tx_queues) {
- int new_index = get_xps_queue(dev, skb);
+ int new_index = get_xps_queue(dev, sb_dev, skb);
if (new_index < 0)
- new_index = skb_tx_hash(dev, skb);
+ new_index = skb_tx_hash(dev, sb_dev, skb);
if (queue_index != new_index && sk &&
sk_fullsock(sk) &&
@@ -3437,7 +3706,7 @@ static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
struct netdev_queue *netdev_pick_tx(struct net_device *dev,
struct sk_buff *skb,
- void *accel_priv)
+ struct net_device *sb_dev)
{
int queue_index = 0;
@@ -3452,10 +3721,10 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev,
const struct net_device_ops *ops = dev->netdev_ops;
if (ops->ndo_select_queue)
- queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
+ queue_index = ops->ndo_select_queue(dev, skb, sb_dev,
__netdev_pick_tx);
else
- queue_index = __netdev_pick_tx(dev, skb);
+ queue_index = __netdev_pick_tx(dev, skb, sb_dev);
queue_index = netdev_cap_txqueue(dev, queue_index);
}
@@ -3467,7 +3736,7 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev,
/**
* __dev_queue_xmit - transmit a buffer
* @skb: buffer to transmit
- * @accel_priv: private data used for L2 forwarding offload
+ * @sb_dev: suboordinate device used for L2 forwarding offload
*
* Queue a buffer for transmission to a network device. The caller must
* have set the device and priority and built the buffer before calling
@@ -3490,7 +3759,7 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev,
* the BH enable code must have IRQs enabled so that it will not deadlock.
* --BLG
*/
-static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
+static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
{
struct net_device *dev = skb->dev;
struct netdev_queue *txq;
@@ -3529,7 +3798,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
else
skb_dst_force(skb);
- txq = netdev_pick_tx(dev, skb, accel_priv);
+ txq = netdev_pick_tx(dev, skb, sb_dev);
q = rcu_dereference_bh(txq->qdisc);
trace_net_dev_queue(skb);
@@ -3603,9 +3872,9 @@ int dev_queue_xmit(struct sk_buff *skb)
}
EXPORT_SYMBOL(dev_queue_xmit);
-int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
+int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev)
{
- return __dev_queue_xmit(skb, accel_priv);
+ return __dev_queue_xmit(skb, sb_dev);
}
EXPORT_SYMBOL(dev_queue_xmit_accel);
@@ -4022,13 +4291,16 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
struct netdev_rx_queue *rxqueue;
void *orig_data, *orig_data_end;
u32 metalen, act = XDP_DROP;
+ __be16 orig_eth_type;
+ struct ethhdr *eth;
+ bool orig_bcast;
int hlen, off;
u32 mac_len;
/* Reinjected packets coming from act_mirred or similar should
* not get XDP generic processing.
*/
- if (skb_cloned(skb))
+ if (skb_cloned(skb) || skb_is_tc_redirected(skb))
return XDP_PASS;
/* XDP packets must be linear and must have sufficient headroom
@@ -4062,6 +4334,9 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
xdp->data_hard_start = skb->data - skb_headroom(skb);
orig_data_end = xdp->data_end;
orig_data = xdp->data;
+ eth = (struct ethhdr *)xdp->data;
+ orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest);
+ orig_eth_type = eth->h_proto;
rxqueue = netif_get_rxqueue(skb);
xdp->rxq = &rxqueue->xdp_rxq;
@@ -4085,6 +4360,14 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
}
+ /* check if XDP changed eth hdr such SKB needs update */
+ eth = (struct ethhdr *)xdp->data;
+ if ((orig_eth_type != eth->h_proto) ||
+ (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) {
+ __skb_push(skb, ETH_HLEN);
+ skb->protocol = eth_type_trans(skb, skb->dev);
+ }
+
switch (act) {
case XDP_REDIRECT:
case XDP_TX:
@@ -4378,6 +4661,10 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
__skb_push(skb, skb->mac_len);
skb_do_redirect(skb);
return NULL;
+ case TC_ACT_REINSERT:
+ /* this does not scrub the packet, and updates stats on error */
+ skb_tc_reinsert(skb, &cl_res);
+ return NULL;
default:
break;
}
@@ -4494,7 +4781,8 @@ static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
return 0;
}
-static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
+static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc,
+ struct packet_type **ppt_prev)
{
struct packet_type *ptype, *pt_prev;
rx_handler_func_t *rx_handler;
@@ -4624,8 +4912,7 @@ skip_classify:
if (pt_prev) {
if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
goto drop;
- else
- ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
+ *ppt_prev = pt_prev;
} else {
drop:
if (!deliver_exact)
@@ -4643,6 +4930,18 @@ out:
return ret;
}
+static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
+{
+ struct net_device *orig_dev = skb->dev;
+ struct packet_type *pt_prev = NULL;
+ int ret;
+
+ ret = __netif_receive_skb_core(skb, pfmemalloc, &pt_prev);
+ if (pt_prev)
+ ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
+ return ret;
+}
+
/**
* netif_receive_skb_core - special purpose version of netif_receive_skb
* @skb: buffer to process
@@ -4663,13 +4962,72 @@ int netif_receive_skb_core(struct sk_buff *skb)
int ret;
rcu_read_lock();
- ret = __netif_receive_skb_core(skb, false);
+ ret = __netif_receive_skb_one_core(skb, false);
rcu_read_unlock();
return ret;
}
EXPORT_SYMBOL(netif_receive_skb_core);
+static inline void __netif_receive_skb_list_ptype(struct list_head *head,
+ struct packet_type *pt_prev,
+ struct net_device *orig_dev)
+{
+ struct sk_buff *skb, *next;
+
+ if (!pt_prev)
+ return;
+ if (list_empty(head))
+ return;
+ if (pt_prev->list_func != NULL)
+ pt_prev->list_func(head, pt_prev, orig_dev);
+ else
+ list_for_each_entry_safe(skb, next, head, list)
+ pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
+}
+
+static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
+{
+ /* Fast-path assumptions:
+ * - There is no RX handler.
+ * - Only one packet_type matches.
+ * If either of these fails, we will end up doing some per-packet
+ * processing in-line, then handling the 'last ptype' for the whole
+ * sublist. This can't cause out-of-order delivery to any single ptype,
+ * because the 'last ptype' must be constant across the sublist, and all
+ * other ptypes are handled per-packet.
+ */
+ /* Current (common) ptype of sublist */
+ struct packet_type *pt_curr = NULL;
+ /* Current (common) orig_dev of sublist */
+ struct net_device *od_curr = NULL;
+ struct list_head sublist;
+ struct sk_buff *skb, *next;
+
+ INIT_LIST_HEAD(&sublist);
+ list_for_each_entry_safe(skb, next, head, list) {
+ struct net_device *orig_dev = skb->dev;
+ struct packet_type *pt_prev = NULL;
+
+ list_del(&skb->list);
+ __netif_receive_skb_core(skb, pfmemalloc, &pt_prev);
+ if (!pt_prev)
+ continue;
+ if (pt_curr != pt_prev || od_curr != orig_dev) {
+ /* dispatch old sublist */
+ __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
+ /* start new sublist */
+ INIT_LIST_HEAD(&sublist);
+ pt_curr = pt_prev;
+ od_curr = orig_dev;
+ }
+ list_add_tail(&skb->list, &sublist);
+ }
+
+ /* dispatch final sublist */
+ __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
+}
+
static int __netif_receive_skb(struct sk_buff *skb)
{
int ret;
@@ -4687,14 +5045,44 @@ static int __netif_receive_skb(struct sk_buff *skb)
* context down to all allocation sites.
*/
noreclaim_flag = memalloc_noreclaim_save();
- ret = __netif_receive_skb_core(skb, true);
+ ret = __netif_receive_skb_one_core(skb, true);
memalloc_noreclaim_restore(noreclaim_flag);
} else
- ret = __netif_receive_skb_core(skb, false);
+ ret = __netif_receive_skb_one_core(skb, false);
return ret;
}
+static void __netif_receive_skb_list(struct list_head *head)
+{
+ unsigned long noreclaim_flag = 0;
+ struct sk_buff *skb, *next;
+ bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */
+
+ list_for_each_entry_safe(skb, next, head, list) {
+ if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) {
+ struct list_head sublist;
+
+ /* Handle the previous sublist */
+ list_cut_before(&sublist, head, &skb->list);
+ if (!list_empty(&sublist))
+ __netif_receive_skb_list_core(&sublist, pfmemalloc);
+ pfmemalloc = !pfmemalloc;
+ /* See comments in __netif_receive_skb */
+ if (pfmemalloc)
+ noreclaim_flag = memalloc_noreclaim_save();
+ else
+ memalloc_noreclaim_restore(noreclaim_flag);
+ }
+ }
+ /* Handle the remaining sublist */
+ if (!list_empty(head))
+ __netif_receive_skb_list_core(head, pfmemalloc);
+ /* Restore pflags */
+ if (pfmemalloc)
+ memalloc_noreclaim_restore(noreclaim_flag);
+}
+
static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
{
struct bpf_prog *old = rtnl_dereference(dev->xdp_prog);
@@ -4717,7 +5105,6 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
break;
case XDP_QUERY_PROG:
- xdp->prog_attached = !!old;
xdp->prog_id = old ? old->aux->id : 0;
break;
@@ -4769,6 +5156,55 @@ static int netif_receive_skb_internal(struct sk_buff *skb)
return ret;
}
+static void netif_receive_skb_list_internal(struct list_head *head)
+{
+ struct bpf_prog *xdp_prog = NULL;
+ struct sk_buff *skb, *next;
+ struct list_head sublist;
+
+ INIT_LIST_HEAD(&sublist);
+ list_for_each_entry_safe(skb, next, head, list) {
+ net_timestamp_check(netdev_tstamp_prequeue, skb);
+ list_del(&skb->list);
+ if (!skb_defer_rx_timestamp(skb))
+ list_add_tail(&skb->list, &sublist);
+ }
+ list_splice_init(&sublist, head);
+
+ if (static_branch_unlikely(&generic_xdp_needed_key)) {
+ preempt_disable();
+ rcu_read_lock();
+ list_for_each_entry_safe(skb, next, head, list) {
+ xdp_prog = rcu_dereference(skb->dev->xdp_prog);
+ list_del(&skb->list);
+ if (do_xdp_generic(xdp_prog, skb) == XDP_PASS)
+ list_add_tail(&skb->list, &sublist);
+ }
+ rcu_read_unlock();
+ preempt_enable();
+ /* Put passed packets back on main list */
+ list_splice_init(&sublist, head);
+ }
+
+ rcu_read_lock();
+#ifdef CONFIG_RPS
+ if (static_key_false(&rps_needed)) {
+ list_for_each_entry_safe(skb, next, head, list) {
+ struct rps_dev_flow voidflow, *rflow = &voidflow;
+ int cpu = get_rps_cpu(skb->dev, skb, &rflow);
+
+ if (cpu >= 0) {
+ /* Will be handled, remove from list */
+ list_del(&skb->list);
+ enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
+ }
+ }
+ }
+#endif
+ __netif_receive_skb_list(head);
+ rcu_read_unlock();
+}
+
/**
* netif_receive_skb - process receive buffer from network
* @skb: buffer to process
@@ -4792,6 +5228,28 @@ int netif_receive_skb(struct sk_buff *skb)
}
EXPORT_SYMBOL(netif_receive_skb);
+/**
+ * netif_receive_skb_list - process many receive buffers from network
+ * @head: list of skbs to process.
+ *
+ * Since return value of netif_receive_skb() is normally ignored, and
+ * wouldn't be meaningful for a list, this function returns void.
+ *
+ * This function may only be called from softirq context and interrupts
+ * should be enabled.
+ */
+void netif_receive_skb_list(struct list_head *head)
+{
+ struct sk_buff *skb;
+
+ if (list_empty(head))
+ return;
+ list_for_each_entry(skb, head, list)
+ trace_netif_receive_skb_list_entry(skb);
+ netif_receive_skb_list_internal(head);
+}
+EXPORT_SYMBOL(netif_receive_skb_list);
+
DEFINE_PER_CPU(struct work_struct, flush_works);
/* Network device is going away, flush any packets still pending */
@@ -4875,42 +5333,49 @@ out:
return netif_receive_skb_internal(skb);
}
-/* napi->gro_list contains packets ordered by age.
- * youngest packets at the head of it.
- * Complete skbs in reverse order to reduce latencies.
- */
-void napi_gro_flush(struct napi_struct *napi, bool flush_old)
+static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index,
+ bool flush_old)
{
- struct sk_buff *skb, *prev = NULL;
-
- /* scan list and build reverse chain */
- for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
- skb->prev = prev;
- prev = skb;
- }
-
- for (skb = prev; skb; skb = prev) {
- skb->next = NULL;
+ struct list_head *head = &napi->gro_hash[index].list;
+ struct sk_buff *skb, *p;
+ list_for_each_entry_safe_reverse(skb, p, head, list) {
if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
return;
-
- prev = skb->prev;
+ skb_list_del_init(skb);
napi_gro_complete(skb);
- napi->gro_count--;
+ napi->gro_hash[index].count--;
}
- napi->gro_list = NULL;
+ if (!napi->gro_hash[index].count)
+ __clear_bit(index, &napi->gro_bitmask);
+}
+
+/* napi->gro_hash[].list contains packets ordered by age.
+ * youngest packets at the head of it.
+ * Complete skbs in reverse order to reduce latencies.
+ */
+void napi_gro_flush(struct napi_struct *napi, bool flush_old)
+{
+ u32 i;
+
+ for (i = 0; i < GRO_HASH_BUCKETS; i++) {
+ if (test_bit(i, &napi->gro_bitmask))
+ __napi_gro_flush_chain(napi, i, flush_old);
+ }
}
EXPORT_SYMBOL(napi_gro_flush);
-static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
+static struct list_head *gro_list_prepare(struct napi_struct *napi,
+ struct sk_buff *skb)
{
- struct sk_buff *p;
unsigned int maclen = skb->dev->hard_header_len;
u32 hash = skb_get_hash_raw(skb);
+ struct list_head *head;
+ struct sk_buff *p;
- for (p = napi->gro_list; p; p = p->next) {
+ head = &napi->gro_hash[hash & (GRO_HASH_BUCKETS - 1)].list;
+ list_for_each_entry(p, head, list) {
unsigned long diffs;
NAPI_GRO_CB(p)->flush = 0;
@@ -4933,6 +5398,8 @@ static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
maclen);
NAPI_GRO_CB(p)->same_flow = !diffs;
}
+
+ return head;
}
static void skb_gro_reset_offset(struct sk_buff *skb)
@@ -4975,20 +5442,41 @@ static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
}
}
+static void gro_flush_oldest(struct list_head *head)
+{
+ struct sk_buff *oldest;
+
+ oldest = list_last_entry(head, struct sk_buff, list);
+
+ /* We are called with head length >= MAX_GRO_SKBS, so this is
+ * impossible.
+ */
+ if (WARN_ON_ONCE(!oldest))
+ return;
+
+ /* Do not adjust napi->gro_hash[].count, caller is adding a new
+ * SKB to the chain.
+ */
+ list_del(&oldest->list);
+ napi_gro_complete(oldest);
+}
+
static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
- struct sk_buff **pp = NULL;
+ u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
+ struct list_head *head = &offload_base;
struct packet_offload *ptype;
__be16 type = skb->protocol;
- struct list_head *head = &offload_base;
- int same_flow;
+ struct list_head *gro_head;
+ struct sk_buff *pp = NULL;
enum gro_result ret;
+ int same_flow;
int grow;
if (netif_elide_gro(skb->dev))
goto normal;
- gro_list_prepare(napi, skb);
+ gro_head = gro_list_prepare(napi, skb);
rcu_read_lock();
list_for_each_entry_rcu(ptype, head, list) {
@@ -5022,7 +5510,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
NAPI_GRO_CB(skb)->csum_valid = 0;
}
- pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
+ pp = ptype->callbacks.gro_receive(gro_head, skb);
break;
}
rcu_read_unlock();
@@ -5039,12 +5527,9 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
if (pp) {
- struct sk_buff *nskb = *pp;
-
- *pp = nskb->next;
- nskb->next = NULL;
- napi_gro_complete(nskb);
- napi->gro_count--;
+ skb_list_del_init(pp);
+ napi_gro_complete(pp);
+ napi->gro_hash[hash].count--;
}
if (same_flow)
@@ -5053,26 +5538,16 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
if (NAPI_GRO_CB(skb)->flush)
goto normal;
- if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
- struct sk_buff *nskb = napi->gro_list;
-
- /* locate the end of the list to select the 'oldest' flow */
- while (nskb->next) {
- pp = &nskb->next;
- nskb = *pp;
- }
- *pp = NULL;
- nskb->next = NULL;
- napi_gro_complete(nskb);
+ if (unlikely(napi->gro_hash[hash].count >= MAX_GRO_SKBS)) {
+ gro_flush_oldest(gro_head);
} else {
- napi->gro_count++;
+ napi->gro_hash[hash].count++;
}
NAPI_GRO_CB(skb)->count = 1;
NAPI_GRO_CB(skb)->age = jiffies;
NAPI_GRO_CB(skb)->last = skb;
skb_shinfo(skb)->gso_size = skb_gro_len(skb);
- skb->next = napi->gro_list;
- napi->gro_list = skb;
+ list_add(&skb->list, gro_head);
ret = GRO_HELD;
pull:
@@ -5080,6 +5555,13 @@ pull:
if (grow > 0)
gro_pull_from_frag0(skb, grow);
ok:
+ if (napi->gro_hash[hash].count) {
+ if (!test_bit(hash, &napi->gro_bitmask))
+ __set_bit(hash, &napi->gro_bitmask);
+ } else if (test_bit(hash, &napi->gro_bitmask)) {
+ __clear_bit(hash, &napi->gro_bitmask);
+ }
+
return ret;
normal:
@@ -5478,7 +5960,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
NAPIF_STATE_IN_BUSY_POLL)))
return false;
- if (n->gro_list) {
+ if (n->gro_bitmask) {
unsigned long timeout = 0;
if (work_done)
@@ -5687,21 +6169,31 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
/* Note : we use a relaxed variant of napi_schedule_prep() not setting
* NAPI_STATE_MISSED, since we do not react to a device IRQ.
*/
- if (napi->gro_list && !napi_disable_pending(napi) &&
+ if (napi->gro_bitmask && !napi_disable_pending(napi) &&
!test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
__napi_schedule_irqoff(napi);
return HRTIMER_NORESTART;
}
+static void init_gro_hash(struct napi_struct *napi)
+{
+ int i;
+
+ for (i = 0; i < GRO_HASH_BUCKETS; i++) {
+ INIT_LIST_HEAD(&napi->gro_hash[i].list);
+ napi->gro_hash[i].count = 0;
+ }
+ napi->gro_bitmask = 0;
+}
+
void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
int (*poll)(struct napi_struct *, int), int weight)
{
INIT_LIST_HEAD(&napi->poll_list);
hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
napi->timer.function = napi_watchdog;
- napi->gro_count = 0;
- napi->gro_list = NULL;
+ init_gro_hash(napi);
napi->skb = NULL;
napi->poll = poll;
if (weight > NAPI_POLL_WEIGHT)
@@ -5734,6 +6226,19 @@ void napi_disable(struct napi_struct *n)
}
EXPORT_SYMBOL(napi_disable);
+static void flush_gro_hash(struct napi_struct *napi)
+{
+ int i;
+
+ for (i = 0; i < GRO_HASH_BUCKETS; i++) {
+ struct sk_buff *skb, *n;
+
+ list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list)
+ kfree_skb(skb);
+ napi->gro_hash[i].count = 0;
+ }
+}
+
/* Must be called in process context */
void netif_napi_del(struct napi_struct *napi)
{
@@ -5743,9 +6248,8 @@ void netif_napi_del(struct napi_struct *napi)
list_del_init(&napi->dev_list);
napi_free_frags(napi);
- kfree_skb_list(napi->gro_list);
- napi->gro_list = NULL;
- napi->gro_count = 0;
+ flush_gro_hash(napi);
+ napi->gro_bitmask = 0;
}
EXPORT_SYMBOL(netif_napi_del);
@@ -5787,7 +6291,7 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
goto out_unlock;
}
- if (n->gro_list) {
+ if (n->gro_bitmask) {
/* flush too old packets
* If HZ < 1000, flush all packets.
*/
@@ -7080,13 +7584,15 @@ int __dev_set_mtu(struct net_device *dev, int new_mtu)
EXPORT_SYMBOL(__dev_set_mtu);
/**
- * dev_set_mtu - Change maximum transfer unit
+ * dev_set_mtu_ext - Change maximum transfer unit
* @dev: device
* @new_mtu: new transfer unit
+ * @extack: netlink extended ack
*
* Change the maximum transfer size of the network device.
*/
-int dev_set_mtu(struct net_device *dev, int new_mtu)
+int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
+ struct netlink_ext_ack *extack)
{
int err, orig_mtu;
@@ -7095,14 +7601,12 @@ int dev_set_mtu(struct net_device *dev, int new_mtu)
/* MTU must be positive, and in range */
if (new_mtu < 0 || new_mtu < dev->min_mtu) {
- net_err_ratelimited("%s: Invalid MTU %d requested, hw min %d\n",
- dev->name, new_mtu, dev->min_mtu);
+ NL_SET_ERR_MSG(extack, "mtu less than device minimum");
return -EINVAL;
}
if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
- net_err_ratelimited("%s: Invalid MTU %d requested, hw max %d\n",
- dev->name, new_mtu, dev->max_mtu);
+ NL_SET_ERR_MSG(extack, "mtu greater than device maximum");
return -EINVAL;
}
@@ -7118,18 +7622,32 @@ int dev_set_mtu(struct net_device *dev, int new_mtu)
err = __dev_set_mtu(dev, new_mtu);
if (!err) {
- err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
+ err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
+ orig_mtu);
err = notifier_to_errno(err);
if (err) {
/* setting mtu back and notifying everyone again,
* so that they have a chance to revert changes.
*/
__dev_set_mtu(dev, orig_mtu);
- call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
+ call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
+ new_mtu);
}
}
return err;
}
+
+int dev_set_mtu(struct net_device *dev, int new_mtu)
+{
+ struct netlink_ext_ack extack;
+ int err;
+
+ memset(&extack, 0, sizeof(extack));
+ err = dev_set_mtu_ext(dev, new_mtu, &extack);
+ if (err && extack._msg)
+ net_err_ratelimited("%s: %s\n", dev->name, extack._msg);
+ return err;
+}
EXPORT_SYMBOL(dev_set_mtu);
/**
@@ -7279,23 +7797,21 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down)
}
EXPORT_SYMBOL(dev_change_proto_down);
-void __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op,
- struct netdev_bpf *xdp)
+u32 __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op,
+ enum bpf_netdev_command cmd)
{
- memset(xdp, 0, sizeof(*xdp));
- xdp->command = XDP_QUERY_PROG;
+ struct netdev_bpf xdp;
- /* Query must always succeed. */
- WARN_ON(bpf_op(dev, xdp) < 0);
-}
+ if (!bpf_op)
+ return 0;
-static u8 __dev_xdp_attached(struct net_device *dev, bpf_op_t bpf_op)
-{
- struct netdev_bpf xdp;
+ memset(&xdp, 0, sizeof(xdp));
+ xdp.command = cmd;
- __dev_xdp_query(dev, bpf_op, &xdp);
+ /* Query must always succeed. */
+ WARN_ON(bpf_op(dev, &xdp) < 0 && cmd == XDP_QUERY_PROG);
- return xdp.prog_attached;
+ return xdp.prog_id;
}
static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op,
@@ -7329,12 +7845,19 @@ static void dev_xdp_uninstall(struct net_device *dev)
if (!ndo_bpf)
return;
- __dev_xdp_query(dev, ndo_bpf, &xdp);
- if (xdp.prog_attached == XDP_ATTACHED_NONE)
- return;
+ memset(&xdp, 0, sizeof(xdp));
+ xdp.command = XDP_QUERY_PROG;
+ WARN_ON(ndo_bpf(dev, &xdp));
+ if (xdp.prog_id)
+ WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags,
+ NULL));
- /* Program removal should always succeed */
- WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags, NULL));
+ /* Remove HW offload */
+ memset(&xdp, 0, sizeof(xdp));
+ xdp.command = XDP_QUERY_PROG_HW;
+ if (!ndo_bpf(dev, &xdp) && xdp.prog_id)
+ WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags,
+ NULL));
}
/**
@@ -7350,12 +7873,15 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
int fd, u32 flags)
{
const struct net_device_ops *ops = dev->netdev_ops;
+ enum bpf_netdev_command query;
struct bpf_prog *prog = NULL;
bpf_op_t bpf_op, bpf_chk;
int err;
ASSERT_RTNL();
+ query = flags & XDP_FLAGS_HW_MODE ? XDP_QUERY_PROG_HW : XDP_QUERY_PROG;
+
bpf_op = bpf_chk = ops->ndo_bpf;
if (!bpf_op && (flags & (XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE)))
return -EOPNOTSUPP;
@@ -7365,10 +7891,11 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
bpf_chk = generic_xdp_install;
if (fd >= 0) {
- if (bpf_chk && __dev_xdp_attached(dev, bpf_chk))
+ if (__dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG) ||
+ __dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG_HW))
return -EEXIST;
if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) &&
- __dev_xdp_attached(dev, bpf_op))
+ __dev_xdp_query(dev, bpf_op, query))
return -EBUSY;
prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
@@ -8837,6 +9364,9 @@ static struct hlist_head * __net_init netdev_create_hash(void)
/* Initialize per network namespace state */
static int __net_init netdev_init(struct net *net)
{
+ BUILD_BUG_ON(GRO_HASH_BUCKETS >
+ 8 * FIELD_SIZEOF(struct napi_struct, gro_bitmask));
+
if (net != &init_net)
INIT_LIST_HEAD(&net->dev_base_head);
@@ -9107,6 +9637,7 @@ static int __init net_dev_init(void)
sd->cpu = i;
#endif
+ init_gro_hash(&sd->backlog);
sd->backlog.poll = process_backlog;
sd->backlog.weight = weight_p;
}
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index 50537ff961a7..90e8aa36881e 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -284,12 +284,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
case SIOCSIFTXQLEN:
if (ifr->ifr_qlen < 0)
return -EINVAL;
- if (dev->tx_queue_len ^ ifr->ifr_qlen) {
- err = dev_change_tx_queue_len(dev, ifr->ifr_qlen);
- if (err)
- return err;
- }
- return 0;
+ return dev_change_tx_queue_len(dev, ifr->ifr_qlen);
case SIOCSIFNAME:
ifr->ifr_newname[IFNAMSIZ-1] = '\0';
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 22099705cc41..3a4b29a13d31 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -326,6 +326,57 @@ devlink_sb_tc_index_get_from_info(struct devlink_sb *devlink_sb,
pool_type, p_tc_index);
}
+struct devlink_region {
+ struct devlink *devlink;
+ struct list_head list;
+ const char *name;
+ struct list_head snapshot_list;
+ u32 max_snapshots;
+ u32 cur_snapshots;
+ u64 size;
+};
+
+struct devlink_snapshot {
+ struct list_head list;
+ struct devlink_region *region;
+ devlink_snapshot_data_dest_t *data_destructor;
+ u64 data_len;
+ u8 *data;
+ u32 id;
+};
+
+static struct devlink_region *
+devlink_region_get_by_name(struct devlink *devlink, const char *region_name)
+{
+ struct devlink_region *region;
+
+ list_for_each_entry(region, &devlink->region_list, list)
+ if (!strcmp(region->name, region_name))
+ return region;
+
+ return NULL;
+}
+
+static struct devlink_snapshot *
+devlink_region_snapshot_get_by_id(struct devlink_region *region, u32 id)
+{
+ struct devlink_snapshot *snapshot;
+
+ list_for_each_entry(snapshot, &region->snapshot_list, list)
+ if (snapshot->id == id)
+ return snapshot;
+
+ return NULL;
+}
+
+static void devlink_region_snapshot_del(struct devlink_snapshot *snapshot)
+{
+ snapshot->region->cur_snapshots--;
+ list_del(&snapshot->list);
+ (*snapshot->data_destructor)(snapshot->data);
+ kfree(snapshot);
+}
+
#define DEVLINK_NL_FLAG_NEED_DEVLINK BIT(0)
#define DEVLINK_NL_FLAG_NEED_PORT BIT(1)
#define DEVLINK_NL_FLAG_NEED_SB BIT(2)
@@ -1575,7 +1626,7 @@ static int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb,
if (!ops->eswitch_mode_set)
return -EOPNOTSUPP;
mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]);
- err = ops->eswitch_mode_set(devlink, mode);
+ err = ops->eswitch_mode_set(devlink, mode, info->extack);
if (err)
return err;
}
@@ -1585,7 +1636,8 @@ static int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb,
return -EOPNOTSUPP;
inline_mode = nla_get_u8(
info->attrs[DEVLINK_ATTR_ESWITCH_INLINE_MODE]);
- err = ops->eswitch_inline_mode_set(devlink, inline_mode);
+ err = ops->eswitch_inline_mode_set(devlink, inline_mode,
+ info->extack);
if (err)
return err;
}
@@ -1594,7 +1646,8 @@ static int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb,
if (!ops->eswitch_encap_mode_set)
return -EOPNOTSUPP;
encap_mode = nla_get_u8(info->attrs[DEVLINK_ATTR_ESWITCH_ENCAP_MODE]);
- err = ops->eswitch_encap_mode_set(devlink, encap_mode);
+ err = ops->eswitch_encap_mode_set(devlink, encap_mode,
+ info->extack);
if (err)
return err;
}
@@ -2541,7 +2594,7 @@ send_done:
if (!nlh) {
err = devlink_dpipe_send_and_alloc_skb(&skb, info);
if (err)
- goto err_skb_send_alloc;
+ return err;
goto send_done;
}
return genlmsg_reply(skb, info);
@@ -2549,7 +2602,6 @@ send_done:
nla_put_failure:
err = -EMSGSIZE;
err_resource_put:
-err_skb_send_alloc:
nlmsg_free(skb);
return err;
}
@@ -2604,6 +2656,942 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info)
return devlink->ops->reload(devlink, info->extack);
}
+static const struct devlink_param devlink_param_generic[] = {
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_INT_ERR_RESET,
+ .name = DEVLINK_PARAM_GENERIC_INT_ERR_RESET_NAME,
+ .type = DEVLINK_PARAM_GENERIC_INT_ERR_RESET_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_MAX_MACS,
+ .name = DEVLINK_PARAM_GENERIC_MAX_MACS_NAME,
+ .type = DEVLINK_PARAM_GENERIC_MAX_MACS_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_SRIOV,
+ .name = DEVLINK_PARAM_GENERIC_ENABLE_SRIOV_NAME,
+ .type = DEVLINK_PARAM_GENERIC_ENABLE_SRIOV_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_REGION_SNAPSHOT,
+ .name = DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_NAME,
+ .type = DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_IGNORE_ARI,
+ .name = DEVLINK_PARAM_GENERIC_IGNORE_ARI_NAME,
+ .type = DEVLINK_PARAM_GENERIC_IGNORE_ARI_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MAX,
+ .name = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MAX_NAME,
+ .type = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MAX_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MIN,
+ .name = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_NAME,
+ .type = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_TYPE,
+ },
+};
+
+static int devlink_param_generic_verify(const struct devlink_param *param)
+{
+ /* verify it match generic parameter by id and name */
+ if (param->id > DEVLINK_PARAM_GENERIC_ID_MAX)
+ return -EINVAL;
+ if (strcmp(param->name, devlink_param_generic[param->id].name))
+ return -ENOENT;
+
+ WARN_ON(param->type != devlink_param_generic[param->id].type);
+
+ return 0;
+}
+
+static int devlink_param_driver_verify(const struct devlink_param *param)
+{
+ int i;
+
+ if (param->id <= DEVLINK_PARAM_GENERIC_ID_MAX)
+ return -EINVAL;
+ /* verify no such name in generic params */
+ for (i = 0; i <= DEVLINK_PARAM_GENERIC_ID_MAX; i++)
+ if (!strcmp(param->name, devlink_param_generic[i].name))
+ return -EEXIST;
+
+ return 0;
+}
+
+static struct devlink_param_item *
+devlink_param_find_by_name(struct list_head *param_list,
+ const char *param_name)
+{
+ struct devlink_param_item *param_item;
+
+ list_for_each_entry(param_item, param_list, list)
+ if (!strcmp(param_item->param->name, param_name))
+ return param_item;
+ return NULL;
+}
+
+static struct devlink_param_item *
+devlink_param_find_by_id(struct list_head *param_list, u32 param_id)
+{
+ struct devlink_param_item *param_item;
+
+ list_for_each_entry(param_item, param_list, list)
+ if (param_item->param->id == param_id)
+ return param_item;
+ return NULL;
+}
+
+static bool
+devlink_param_cmode_is_supported(const struct devlink_param *param,
+ enum devlink_param_cmode cmode)
+{
+ return test_bit(cmode, &param->supported_cmodes);
+}
+
+static int devlink_param_get(struct devlink *devlink,
+ const struct devlink_param *param,
+ struct devlink_param_gset_ctx *ctx)
+{
+ if (!param->get)
+ return -EOPNOTSUPP;
+ return param->get(devlink, param->id, ctx);
+}
+
+static int devlink_param_set(struct devlink *devlink,
+ const struct devlink_param *param,
+ struct devlink_param_gset_ctx *ctx)
+{
+ if (!param->set)
+ return -EOPNOTSUPP;
+ return param->set(devlink, param->id, ctx);
+}
+
+static int
+devlink_param_type_to_nla_type(enum devlink_param_type param_type)
+{
+ switch (param_type) {
+ case DEVLINK_PARAM_TYPE_U8:
+ return NLA_U8;
+ case DEVLINK_PARAM_TYPE_U16:
+ return NLA_U16;
+ case DEVLINK_PARAM_TYPE_U32:
+ return NLA_U32;
+ case DEVLINK_PARAM_TYPE_STRING:
+ return NLA_STRING;
+ case DEVLINK_PARAM_TYPE_BOOL:
+ return NLA_FLAG;
+ default:
+ return -EINVAL;
+ }
+}
+
+static int
+devlink_nl_param_value_fill_one(struct sk_buff *msg,
+ enum devlink_param_type type,
+ enum devlink_param_cmode cmode,
+ union devlink_param_value val)
+{
+ struct nlattr *param_value_attr;
+
+ param_value_attr = nla_nest_start(msg, DEVLINK_ATTR_PARAM_VALUE);
+ if (!param_value_attr)
+ goto nla_put_failure;
+
+ if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_VALUE_CMODE, cmode))
+ goto value_nest_cancel;
+
+ switch (type) {
+ case DEVLINK_PARAM_TYPE_U8:
+ if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu8))
+ goto value_nest_cancel;
+ break;
+ case DEVLINK_PARAM_TYPE_U16:
+ if (nla_put_u16(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu16))
+ goto value_nest_cancel;
+ break;
+ case DEVLINK_PARAM_TYPE_U32:
+ if (nla_put_u32(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu32))
+ goto value_nest_cancel;
+ break;
+ case DEVLINK_PARAM_TYPE_STRING:
+ if (nla_put_string(msg, DEVLINK_ATTR_PARAM_VALUE_DATA,
+ val.vstr))
+ goto value_nest_cancel;
+ break;
+ case DEVLINK_PARAM_TYPE_BOOL:
+ if (val.vbool &&
+ nla_put_flag(msg, DEVLINK_ATTR_PARAM_VALUE_DATA))
+ goto value_nest_cancel;
+ break;
+ }
+
+ nla_nest_end(msg, param_value_attr);
+ return 0;
+
+value_nest_cancel:
+ nla_nest_cancel(msg, param_value_attr);
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink *devlink,
+ struct devlink_param_item *param_item,
+ enum devlink_command cmd,
+ u32 portid, u32 seq, int flags)
+{
+ union devlink_param_value param_value[DEVLINK_PARAM_CMODE_MAX + 1];
+ const struct devlink_param *param = param_item->param;
+ struct devlink_param_gset_ctx ctx;
+ struct nlattr *param_values_list;
+ struct nlattr *param_attr;
+ int nla_type;
+ void *hdr;
+ int err;
+ int i;
+
+ /* Get value from driver part to driverinit configuration mode */
+ for (i = 0; i <= DEVLINK_PARAM_CMODE_MAX; i++) {
+ if (!devlink_param_cmode_is_supported(param, i))
+ continue;
+ if (i == DEVLINK_PARAM_CMODE_DRIVERINIT) {
+ if (!param_item->driverinit_value_valid)
+ return -EOPNOTSUPP;
+ param_value[i] = param_item->driverinit_value;
+ } else {
+ ctx.cmode = i;
+ err = devlink_param_get(devlink, param, &ctx);
+ if (err)
+ return err;
+ param_value[i] = ctx.val;
+ }
+ }
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto genlmsg_cancel;
+ param_attr = nla_nest_start(msg, DEVLINK_ATTR_PARAM);
+ if (!param_attr)
+ goto genlmsg_cancel;
+ if (nla_put_string(msg, DEVLINK_ATTR_PARAM_NAME, param->name))
+ goto param_nest_cancel;
+ if (param->generic && nla_put_flag(msg, DEVLINK_ATTR_PARAM_GENERIC))
+ goto param_nest_cancel;
+
+ nla_type = devlink_param_type_to_nla_type(param->type);
+ if (nla_type < 0)
+ goto param_nest_cancel;
+ if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_TYPE, nla_type))
+ goto param_nest_cancel;
+
+ param_values_list = nla_nest_start(msg, DEVLINK_ATTR_PARAM_VALUES_LIST);
+ if (!param_values_list)
+ goto param_nest_cancel;
+
+ for (i = 0; i <= DEVLINK_PARAM_CMODE_MAX; i++) {
+ if (!devlink_param_cmode_is_supported(param, i))
+ continue;
+ err = devlink_nl_param_value_fill_one(msg, param->type,
+ i, param_value[i]);
+ if (err)
+ goto values_list_nest_cancel;
+ }
+
+ nla_nest_end(msg, param_values_list);
+ nla_nest_end(msg, param_attr);
+ genlmsg_end(msg, hdr);
+ return 0;
+
+values_list_nest_cancel:
+ nla_nest_end(msg, param_values_list);
+param_nest_cancel:
+ nla_nest_cancel(msg, param_attr);
+genlmsg_cancel:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static void devlink_param_notify(struct devlink *devlink,
+ struct devlink_param_item *param_item,
+ enum devlink_command cmd)
+{
+ struct sk_buff *msg;
+ int err;
+
+ WARN_ON(cmd != DEVLINK_CMD_PARAM_NEW && cmd != DEVLINK_CMD_PARAM_DEL);
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return;
+ err = devlink_nl_param_fill(msg, devlink, param_item, cmd, 0, 0, 0);
+ if (err) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
+ msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
+}
+
+static int devlink_nl_cmd_param_get_dumpit(struct sk_buff *msg,
+ struct netlink_callback *cb)
+{
+ struct devlink_param_item *param_item;
+ struct devlink *devlink;
+ int start = cb->args[0];
+ int idx = 0;
+ int err;
+
+ mutex_lock(&devlink_mutex);
+ list_for_each_entry(devlink, &devlink_list, list) {
+ if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+ continue;
+ mutex_lock(&devlink->lock);
+ list_for_each_entry(param_item, &devlink->param_list, list) {
+ if (idx < start) {
+ idx++;
+ continue;
+ }
+ err = devlink_nl_param_fill(msg, devlink, param_item,
+ DEVLINK_CMD_PARAM_GET,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NLM_F_MULTI);
+ if (err) {
+ mutex_unlock(&devlink->lock);
+ goto out;
+ }
+ idx++;
+ }
+ mutex_unlock(&devlink->lock);
+ }
+out:
+ mutex_unlock(&devlink_mutex);
+
+ cb->args[0] = idx;
+ return msg->len;
+}
+
+static int
+devlink_param_type_get_from_info(struct genl_info *info,
+ enum devlink_param_type *param_type)
+{
+ if (!info->attrs[DEVLINK_ATTR_PARAM_TYPE])
+ return -EINVAL;
+
+ switch (nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_TYPE])) {
+ case NLA_U8:
+ *param_type = DEVLINK_PARAM_TYPE_U8;
+ break;
+ case NLA_U16:
+ *param_type = DEVLINK_PARAM_TYPE_U16;
+ break;
+ case NLA_U32:
+ *param_type = DEVLINK_PARAM_TYPE_U32;
+ break;
+ case NLA_STRING:
+ *param_type = DEVLINK_PARAM_TYPE_STRING;
+ break;
+ case NLA_FLAG:
+ *param_type = DEVLINK_PARAM_TYPE_BOOL;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int
+devlink_param_value_get_from_info(const struct devlink_param *param,
+ struct genl_info *info,
+ union devlink_param_value *value)
+{
+ int len;
+
+ if (param->type != DEVLINK_PARAM_TYPE_BOOL &&
+ !info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA])
+ return -EINVAL;
+
+ switch (param->type) {
+ case DEVLINK_PARAM_TYPE_U8:
+ value->vu8 = nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]);
+ break;
+ case DEVLINK_PARAM_TYPE_U16:
+ value->vu16 = nla_get_u16(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]);
+ break;
+ case DEVLINK_PARAM_TYPE_U32:
+ value->vu32 = nla_get_u32(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]);
+ break;
+ case DEVLINK_PARAM_TYPE_STRING:
+ len = strnlen(nla_data(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]),
+ nla_len(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]));
+ if (len == nla_len(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]) ||
+ len >= __DEVLINK_PARAM_MAX_STRING_VALUE)
+ return -EINVAL;
+ strcpy(value->vstr,
+ nla_data(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]));
+ break;
+ case DEVLINK_PARAM_TYPE_BOOL:
+ value->vbool = info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA] ?
+ true : false;
+ break;
+ }
+ return 0;
+}
+
+static struct devlink_param_item *
+devlink_param_get_from_info(struct devlink *devlink,
+ struct genl_info *info)
+{
+ char *param_name;
+
+ if (!info->attrs[DEVLINK_ATTR_PARAM_NAME])
+ return NULL;
+
+ param_name = nla_data(info->attrs[DEVLINK_ATTR_PARAM_NAME]);
+ return devlink_param_find_by_name(&devlink->param_list, param_name);
+}
+
+static int devlink_nl_cmd_param_get_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_param_item *param_item;
+ struct sk_buff *msg;
+ int err;
+
+ param_item = devlink_param_get_from_info(devlink, info);
+ if (!param_item)
+ return -EINVAL;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_param_fill(msg, devlink, param_item,
+ DEVLINK_CMD_PARAM_GET,
+ info->snd_portid, info->snd_seq, 0);
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static int devlink_nl_cmd_param_set_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ enum devlink_param_type param_type;
+ struct devlink_param_gset_ctx ctx;
+ enum devlink_param_cmode cmode;
+ struct devlink_param_item *param_item;
+ const struct devlink_param *param;
+ union devlink_param_value value;
+ int err = 0;
+
+ param_item = devlink_param_get_from_info(devlink, info);
+ if (!param_item)
+ return -EINVAL;
+ param = param_item->param;
+ err = devlink_param_type_get_from_info(info, &param_type);
+ if (err)
+ return err;
+ if (param_type != param->type)
+ return -EINVAL;
+ err = devlink_param_value_get_from_info(param, info, &value);
+ if (err)
+ return err;
+ if (param->validate) {
+ err = param->validate(devlink, param->id, value, info->extack);
+ if (err)
+ return err;
+ }
+
+ if (!info->attrs[DEVLINK_ATTR_PARAM_VALUE_CMODE])
+ return -EINVAL;
+ cmode = nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_VALUE_CMODE]);
+ if (!devlink_param_cmode_is_supported(param, cmode))
+ return -EOPNOTSUPP;
+
+ if (cmode == DEVLINK_PARAM_CMODE_DRIVERINIT) {
+ if (param->type == DEVLINK_PARAM_TYPE_STRING)
+ strcpy(param_item->driverinit_value.vstr, value.vstr);
+ else
+ param_item->driverinit_value = value;
+ param_item->driverinit_value_valid = true;
+ } else {
+ if (!param->set)
+ return -EOPNOTSUPP;
+ ctx.val = value;
+ ctx.cmode = cmode;
+ err = devlink_param_set(devlink, param, &ctx);
+ if (err)
+ return err;
+ }
+
+ devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_NEW);
+ return 0;
+}
+
+static int devlink_param_register_one(struct devlink *devlink,
+ const struct devlink_param *param)
+{
+ struct devlink_param_item *param_item;
+
+ if (devlink_param_find_by_name(&devlink->param_list,
+ param->name))
+ return -EEXIST;
+
+ if (param->supported_cmodes == BIT(DEVLINK_PARAM_CMODE_DRIVERINIT))
+ WARN_ON(param->get || param->set);
+ else
+ WARN_ON(!param->get || !param->set);
+
+ param_item = kzalloc(sizeof(*param_item), GFP_KERNEL);
+ if (!param_item)
+ return -ENOMEM;
+ param_item->param = param;
+
+ list_add_tail(&param_item->list, &devlink->param_list);
+ devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_NEW);
+ return 0;
+}
+
+static void devlink_param_unregister_one(struct devlink *devlink,
+ const struct devlink_param *param)
+{
+ struct devlink_param_item *param_item;
+
+ param_item = devlink_param_find_by_name(&devlink->param_list,
+ param->name);
+ WARN_ON(!param_item);
+ devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_DEL);
+ list_del(&param_item->list);
+ kfree(param_item);
+}
+
+static int devlink_nl_region_snapshot_id_put(struct sk_buff *msg,
+ struct devlink *devlink,
+ struct devlink_snapshot *snapshot)
+{
+ struct nlattr *snap_attr;
+ int err;
+
+ snap_attr = nla_nest_start(msg, DEVLINK_ATTR_REGION_SNAPSHOT);
+ if (!snap_attr)
+ return -EINVAL;
+
+ err = nla_put_u32(msg, DEVLINK_ATTR_REGION_SNAPSHOT_ID, snapshot->id);
+ if (err)
+ goto nla_put_failure;
+
+ nla_nest_end(msg, snap_attr);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(msg, snap_attr);
+ return err;
+}
+
+static int devlink_nl_region_snapshots_id_put(struct sk_buff *msg,
+ struct devlink *devlink,
+ struct devlink_region *region)
+{
+ struct devlink_snapshot *snapshot;
+ struct nlattr *snapshots_attr;
+ int err;
+
+ snapshots_attr = nla_nest_start(msg, DEVLINK_ATTR_REGION_SNAPSHOTS);
+ if (!snapshots_attr)
+ return -EINVAL;
+
+ list_for_each_entry(snapshot, &region->snapshot_list, list) {
+ err = devlink_nl_region_snapshot_id_put(msg, devlink, snapshot);
+ if (err)
+ goto nla_put_failure;
+ }
+
+ nla_nest_end(msg, snapshots_attr);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(msg, snapshots_attr);
+ return err;
+}
+
+static int devlink_nl_region_fill(struct sk_buff *msg, struct devlink *devlink,
+ enum devlink_command cmd, u32 portid,
+ u32 seq, int flags,
+ struct devlink_region *region)
+{
+ void *hdr;
+ int err;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ err = devlink_nl_put_handle(msg, devlink);
+ if (err)
+ goto nla_put_failure;
+
+ err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME, region->name);
+ if (err)
+ goto nla_put_failure;
+
+ err = nla_put_u64_64bit(msg, DEVLINK_ATTR_REGION_SIZE,
+ region->size,
+ DEVLINK_ATTR_PAD);
+ if (err)
+ goto nla_put_failure;
+
+ err = devlink_nl_region_snapshots_id_put(msg, devlink, region);
+ if (err)
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return err;
+}
+
+static void devlink_nl_region_notify(struct devlink_region *region,
+ struct devlink_snapshot *snapshot,
+ enum devlink_command cmd)
+{
+ struct devlink *devlink = region->devlink;
+ struct sk_buff *msg;
+ void *hdr;
+ int err;
+
+ WARN_ON(cmd != DEVLINK_CMD_REGION_NEW && cmd != DEVLINK_CMD_REGION_DEL);
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return;
+
+ hdr = genlmsg_put(msg, 0, 0, &devlink_nl_family, 0, cmd);
+ if (!hdr)
+ goto out_free_msg;
+
+ err = devlink_nl_put_handle(msg, devlink);
+ if (err)
+ goto out_cancel_msg;
+
+ err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME,
+ region->name);
+ if (err)
+ goto out_cancel_msg;
+
+ if (snapshot) {
+ err = nla_put_u32(msg, DEVLINK_ATTR_REGION_SNAPSHOT_ID,
+ snapshot->id);
+ if (err)
+ goto out_cancel_msg;
+ } else {
+ err = nla_put_u64_64bit(msg, DEVLINK_ATTR_REGION_SIZE,
+ region->size, DEVLINK_ATTR_PAD);
+ if (err)
+ goto out_cancel_msg;
+ }
+ genlmsg_end(msg, hdr);
+
+ genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
+ msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
+
+ return;
+
+out_cancel_msg:
+ genlmsg_cancel(msg, hdr);
+out_free_msg:
+ nlmsg_free(msg);
+}
+
+static int devlink_nl_cmd_region_get_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_region *region;
+ const char *region_name;
+ struct sk_buff *msg;
+ int err;
+
+ if (!info->attrs[DEVLINK_ATTR_REGION_NAME])
+ return -EINVAL;
+
+ region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]);
+ region = devlink_region_get_by_name(devlink, region_name);
+ if (!region)
+ return -EINVAL;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_region_fill(msg, devlink, DEVLINK_CMD_REGION_GET,
+ info->snd_portid, info->snd_seq, 0,
+ region);
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static int devlink_nl_cmd_region_get_dumpit(struct sk_buff *msg,
+ struct netlink_callback *cb)
+{
+ struct devlink_region *region;
+ struct devlink *devlink;
+ int start = cb->args[0];
+ int idx = 0;
+ int err;
+
+ mutex_lock(&devlink_mutex);
+ list_for_each_entry(devlink, &devlink_list, list) {
+ if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+ continue;
+
+ mutex_lock(&devlink->lock);
+ list_for_each_entry(region, &devlink->region_list, list) {
+ if (idx < start) {
+ idx++;
+ continue;
+ }
+ err = devlink_nl_region_fill(msg, devlink,
+ DEVLINK_CMD_REGION_GET,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NLM_F_MULTI, region);
+ if (err) {
+ mutex_unlock(&devlink->lock);
+ goto out;
+ }
+ idx++;
+ }
+ mutex_unlock(&devlink->lock);
+ }
+out:
+ mutex_unlock(&devlink_mutex);
+ cb->args[0] = idx;
+ return msg->len;
+}
+
+static int devlink_nl_cmd_region_del(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_snapshot *snapshot;
+ struct devlink_region *region;
+ const char *region_name;
+ u32 snapshot_id;
+
+ if (!info->attrs[DEVLINK_ATTR_REGION_NAME] ||
+ !info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID])
+ return -EINVAL;
+
+ region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]);
+ snapshot_id = nla_get_u32(info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]);
+
+ region = devlink_region_get_by_name(devlink, region_name);
+ if (!region)
+ return -EINVAL;
+
+ snapshot = devlink_region_snapshot_get_by_id(region, snapshot_id);
+ if (!snapshot)
+ return -EINVAL;
+
+ devlink_nl_region_notify(region, snapshot, DEVLINK_CMD_REGION_DEL);
+ devlink_region_snapshot_del(snapshot);
+ return 0;
+}
+
+static int devlink_nl_cmd_region_read_chunk_fill(struct sk_buff *msg,
+ struct devlink *devlink,
+ u8 *chunk, u32 chunk_size,
+ u64 addr)
+{
+ struct nlattr *chunk_attr;
+ int err;
+
+ chunk_attr = nla_nest_start(msg, DEVLINK_ATTR_REGION_CHUNK);
+ if (!chunk_attr)
+ return -EINVAL;
+
+ err = nla_put(msg, DEVLINK_ATTR_REGION_CHUNK_DATA, chunk_size, chunk);
+ if (err)
+ goto nla_put_failure;
+
+ err = nla_put_u64_64bit(msg, DEVLINK_ATTR_REGION_CHUNK_ADDR, addr,
+ DEVLINK_ATTR_PAD);
+ if (err)
+ goto nla_put_failure;
+
+ nla_nest_end(msg, chunk_attr);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(msg, chunk_attr);
+ return err;
+}
+
+#define DEVLINK_REGION_READ_CHUNK_SIZE 256
+
+static int devlink_nl_region_read_snapshot_fill(struct sk_buff *skb,
+ struct devlink *devlink,
+ struct devlink_region *region,
+ struct nlattr **attrs,
+ u64 start_offset,
+ u64 end_offset,
+ bool dump,
+ u64 *new_offset)
+{
+ struct devlink_snapshot *snapshot;
+ u64 curr_offset = start_offset;
+ u32 snapshot_id;
+ int err = 0;
+
+ *new_offset = start_offset;
+
+ snapshot_id = nla_get_u32(attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]);
+ snapshot = devlink_region_snapshot_get_by_id(region, snapshot_id);
+ if (!snapshot)
+ return -EINVAL;
+
+ if (end_offset > snapshot->data_len || dump)
+ end_offset = snapshot->data_len;
+
+ while (curr_offset < end_offset) {
+ u32 data_size;
+ u8 *data;
+
+ if (end_offset - curr_offset < DEVLINK_REGION_READ_CHUNK_SIZE)
+ data_size = end_offset - curr_offset;
+ else
+ data_size = DEVLINK_REGION_READ_CHUNK_SIZE;
+
+ data = &snapshot->data[curr_offset];
+ err = devlink_nl_cmd_region_read_chunk_fill(skb, devlink,
+ data, data_size,
+ curr_offset);
+ if (err)
+ break;
+
+ curr_offset += data_size;
+ }
+ *new_offset = curr_offset;
+
+ return err;
+}
+
+static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ u64 ret_offset, start_offset, end_offset = 0;
+ struct nlattr *attrs[DEVLINK_ATTR_MAX + 1];
+ const struct genl_ops *ops = cb->data;
+ struct devlink_region *region;
+ struct nlattr *chunks_attr;
+ const char *region_name;
+ struct devlink *devlink;
+ bool dump = true;
+ void *hdr;
+ int err;
+
+ start_offset = *((u64 *)&cb->args[0]);
+
+ err = nlmsg_parse(cb->nlh, GENL_HDRLEN + devlink_nl_family.hdrsize,
+ attrs, DEVLINK_ATTR_MAX, ops->policy, cb->extack);
+ if (err)
+ goto out;
+
+ devlink = devlink_get_from_attrs(sock_net(cb->skb->sk), attrs);
+ if (IS_ERR(devlink))
+ goto out;
+
+ mutex_lock(&devlink_mutex);
+ mutex_lock(&devlink->lock);
+
+ if (!attrs[DEVLINK_ATTR_REGION_NAME] ||
+ !attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID])
+ goto out_unlock;
+
+ region_name = nla_data(attrs[DEVLINK_ATTR_REGION_NAME]);
+ region = devlink_region_get_by_name(devlink, region_name);
+ if (!region)
+ goto out_unlock;
+
+ hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ &devlink_nl_family, NLM_F_ACK | NLM_F_MULTI,
+ DEVLINK_CMD_REGION_READ);
+ if (!hdr)
+ goto out_unlock;
+
+ err = devlink_nl_put_handle(skb, devlink);
+ if (err)
+ goto nla_put_failure;
+
+ err = nla_put_string(skb, DEVLINK_ATTR_REGION_NAME, region_name);
+ if (err)
+ goto nla_put_failure;
+
+ chunks_attr = nla_nest_start(skb, DEVLINK_ATTR_REGION_CHUNKS);
+ if (!chunks_attr)
+ goto nla_put_failure;
+
+ if (attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR] &&
+ attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]) {
+ if (!start_offset)
+ start_offset =
+ nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]);
+
+ end_offset = nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]);
+ end_offset += nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]);
+ dump = false;
+ }
+
+ err = devlink_nl_region_read_snapshot_fill(skb, devlink,
+ region, attrs,
+ start_offset,
+ end_offset, dump,
+ &ret_offset);
+
+ if (err && err != -EMSGSIZE)
+ goto nla_put_failure;
+
+ /* Check if there was any progress done to prevent infinite loop */
+ if (ret_offset == start_offset)
+ goto nla_put_failure;
+
+ *((u64 *)&cb->args[0]) = ret_offset;
+
+ nla_nest_end(skb, chunks_attr);
+ genlmsg_end(skb, hdr);
+ mutex_unlock(&devlink->lock);
+ mutex_unlock(&devlink_mutex);
+
+ return skb->len;
+
+nla_put_failure:
+ genlmsg_cancel(skb, hdr);
+out_unlock:
+ mutex_unlock(&devlink->lock);
+ mutex_unlock(&devlink_mutex);
+out:
+ return 0;
+}
+
static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING },
[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },
@@ -2624,6 +3612,11 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
[DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED] = { .type = NLA_U8 },
[DEVLINK_ATTR_RESOURCE_ID] = { .type = NLA_U64},
[DEVLINK_ATTR_RESOURCE_SIZE] = { .type = NLA_U64},
+ [DEVLINK_ATTR_PARAM_NAME] = { .type = NLA_NUL_STRING },
+ [DEVLINK_ATTR_PARAM_TYPE] = { .type = NLA_U8 },
+ [DEVLINK_ATTR_PARAM_VALUE_CMODE] = { .type = NLA_U8 },
+ [DEVLINK_ATTR_REGION_NAME] = { .type = NLA_NUL_STRING },
+ [DEVLINK_ATTR_REGION_SNAPSHOT_ID] = { .type = NLA_U32 },
};
static const struct genl_ops devlink_nl_ops[] = {
@@ -2807,6 +3800,43 @@ static const struct genl_ops devlink_nl_ops[] = {
.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
DEVLINK_NL_FLAG_NO_LOCK,
},
+ {
+ .cmd = DEVLINK_CMD_PARAM_GET,
+ .doit = devlink_nl_cmd_param_get_doit,
+ .dumpit = devlink_nl_cmd_param_get_dumpit,
+ .policy = devlink_nl_policy,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ /* can be retrieved by unprivileged users */
+ },
+ {
+ .cmd = DEVLINK_CMD_PARAM_SET,
+ .doit = devlink_nl_cmd_param_set_doit,
+ .policy = devlink_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ },
+ {
+ .cmd = DEVLINK_CMD_REGION_GET,
+ .doit = devlink_nl_cmd_region_get_doit,
+ .dumpit = devlink_nl_cmd_region_get_dumpit,
+ .policy = devlink_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ },
+ {
+ .cmd = DEVLINK_CMD_REGION_DEL,
+ .doit = devlink_nl_cmd_region_del,
+ .policy = devlink_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ },
+ {
+ .cmd = DEVLINK_CMD_REGION_READ,
+ .dumpit = devlink_nl_cmd_region_read_dumpit,
+ .policy = devlink_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ },
};
static struct genl_family devlink_nl_family __ro_after_init = {
@@ -2845,6 +3875,8 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size)
INIT_LIST_HEAD(&devlink->sb_list);
INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list);
INIT_LIST_HEAD(&devlink->resource_list);
+ INIT_LIST_HEAD(&devlink->param_list);
+ INIT_LIST_HEAD(&devlink->region_list);
mutex_init(&devlink->lock);
return devlink;
}
@@ -3434,6 +4466,343 @@ out:
}
EXPORT_SYMBOL_GPL(devlink_resource_occ_get_unregister);
+/**
+ * devlink_params_register - register configuration parameters
+ *
+ * @devlink: devlink
+ * @params: configuration parameters array
+ * @params_count: number of parameters provided
+ *
+ * Register the configuration parameters supported by the driver.
+ */
+int devlink_params_register(struct devlink *devlink,
+ const struct devlink_param *params,
+ size_t params_count)
+{
+ const struct devlink_param *param = params;
+ int i;
+ int err;
+
+ mutex_lock(&devlink->lock);
+ for (i = 0; i < params_count; i++, param++) {
+ if (!param || !param->name || !param->supported_cmodes) {
+ err = -EINVAL;
+ goto rollback;
+ }
+ if (param->generic) {
+ err = devlink_param_generic_verify(param);
+ if (err)
+ goto rollback;
+ } else {
+ err = devlink_param_driver_verify(param);
+ if (err)
+ goto rollback;
+ }
+ err = devlink_param_register_one(devlink, param);
+ if (err)
+ goto rollback;
+ }
+
+ mutex_unlock(&devlink->lock);
+ return 0;
+
+rollback:
+ if (!i)
+ goto unlock;
+ for (param--; i > 0; i--, param--)
+ devlink_param_unregister_one(devlink, param);
+unlock:
+ mutex_unlock(&devlink->lock);
+ return err;
+}
+EXPORT_SYMBOL_GPL(devlink_params_register);
+
+/**
+ * devlink_params_unregister - unregister configuration parameters
+ * @devlink: devlink
+ * @params: configuration parameters to unregister
+ * @params_count: number of parameters provided
+ */
+void devlink_params_unregister(struct devlink *devlink,
+ const struct devlink_param *params,
+ size_t params_count)
+{
+ const struct devlink_param *param = params;
+ int i;
+
+ mutex_lock(&devlink->lock);
+ for (i = 0; i < params_count; i++, param++)
+ devlink_param_unregister_one(devlink, param);
+ mutex_unlock(&devlink->lock);
+}
+EXPORT_SYMBOL_GPL(devlink_params_unregister);
+
+/**
+ * devlink_param_driverinit_value_get - get configuration parameter
+ * value for driver initializing
+ *
+ * @devlink: devlink
+ * @param_id: parameter ID
+ * @init_val: value of parameter in driverinit configuration mode
+ *
+ * This function should be used by the driver to get driverinit
+ * configuration for initialization after reload command.
+ */
+int devlink_param_driverinit_value_get(struct devlink *devlink, u32 param_id,
+ union devlink_param_value *init_val)
+{
+ struct devlink_param_item *param_item;
+
+ if (!devlink->ops || !devlink->ops->reload)
+ return -EOPNOTSUPP;
+
+ param_item = devlink_param_find_by_id(&devlink->param_list, param_id);
+ if (!param_item)
+ return -EINVAL;
+
+ if (!param_item->driverinit_value_valid ||
+ !devlink_param_cmode_is_supported(param_item->param,
+ DEVLINK_PARAM_CMODE_DRIVERINIT))
+ return -EOPNOTSUPP;
+
+ if (param_item->param->type == DEVLINK_PARAM_TYPE_STRING)
+ strcpy(init_val->vstr, param_item->driverinit_value.vstr);
+ else
+ *init_val = param_item->driverinit_value;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(devlink_param_driverinit_value_get);
+
+/**
+ * devlink_param_driverinit_value_set - set value of configuration
+ * parameter for driverinit
+ * configuration mode
+ *
+ * @devlink: devlink
+ * @param_id: parameter ID
+ * @init_val: value of parameter to set for driverinit configuration mode
+ *
+ * This function should be used by the driver to set driverinit
+ * configuration mode default value.
+ */
+int devlink_param_driverinit_value_set(struct devlink *devlink, u32 param_id,
+ union devlink_param_value init_val)
+{
+ struct devlink_param_item *param_item;
+
+ param_item = devlink_param_find_by_id(&devlink->param_list, param_id);
+ if (!param_item)
+ return -EINVAL;
+
+ if (!devlink_param_cmode_is_supported(param_item->param,
+ DEVLINK_PARAM_CMODE_DRIVERINIT))
+ return -EOPNOTSUPP;
+
+ if (param_item->param->type == DEVLINK_PARAM_TYPE_STRING)
+ strcpy(param_item->driverinit_value.vstr, init_val.vstr);
+ else
+ param_item->driverinit_value = init_val;
+ param_item->driverinit_value_valid = true;
+
+ devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_NEW);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(devlink_param_driverinit_value_set);
+
+/**
+ * devlink_param_value_changed - notify devlink on a parameter's value
+ * change. Should be called by the driver
+ * right after the change.
+ *
+ * @devlink: devlink
+ * @param_id: parameter ID
+ *
+ * This function should be used by the driver to notify devlink on value
+ * change, excluding driverinit configuration mode.
+ * For driverinit configuration mode driver should use the function
+ * devlink_param_driverinit_value_set() instead.
+ */
+void devlink_param_value_changed(struct devlink *devlink, u32 param_id)
+{
+ struct devlink_param_item *param_item;
+
+ param_item = devlink_param_find_by_id(&devlink->param_list, param_id);
+ WARN_ON(!param_item);
+
+ devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_NEW);
+}
+EXPORT_SYMBOL_GPL(devlink_param_value_changed);
+
+/**
+ * devlink_param_value_str_fill - Safely fill-up the string preventing
+ * from overflow of the preallocated buffer
+ *
+ * @dst_val: destination devlink_param_value
+ * @src: source buffer
+ */
+void devlink_param_value_str_fill(union devlink_param_value *dst_val,
+ const char *src)
+{
+ size_t len;
+
+ len = strlcpy(dst_val->vstr, src, __DEVLINK_PARAM_MAX_STRING_VALUE);
+ WARN_ON(len >= __DEVLINK_PARAM_MAX_STRING_VALUE);
+}
+EXPORT_SYMBOL_GPL(devlink_param_value_str_fill);
+
+/**
+ * devlink_region_create - create a new address region
+ *
+ * @devlink: devlink
+ * @region_name: region name
+ * @region_max_snapshots: Maximum supported number of snapshots for region
+ * @region_size: size of region
+ */
+struct devlink_region *devlink_region_create(struct devlink *devlink,
+ const char *region_name,
+ u32 region_max_snapshots,
+ u64 region_size)
+{
+ struct devlink_region *region;
+ int err = 0;
+
+ mutex_lock(&devlink->lock);
+
+ if (devlink_region_get_by_name(devlink, region_name)) {
+ err = -EEXIST;
+ goto unlock;
+ }
+
+ region = kzalloc(sizeof(*region), GFP_KERNEL);
+ if (!region) {
+ err = -ENOMEM;
+ goto unlock;
+ }
+
+ region->devlink = devlink;
+ region->max_snapshots = region_max_snapshots;
+ region->name = region_name;
+ region->size = region_size;
+ INIT_LIST_HEAD(&region->snapshot_list);
+ list_add_tail(&region->list, &devlink->region_list);
+ devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_NEW);
+
+ mutex_unlock(&devlink->lock);
+ return region;
+
+unlock:
+ mutex_unlock(&devlink->lock);
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(devlink_region_create);
+
+/**
+ * devlink_region_destroy - destroy address region
+ *
+ * @region: devlink region to destroy
+ */
+void devlink_region_destroy(struct devlink_region *region)
+{
+ struct devlink *devlink = region->devlink;
+ struct devlink_snapshot *snapshot, *ts;
+
+ mutex_lock(&devlink->lock);
+
+ /* Free all snapshots of region */
+ list_for_each_entry_safe(snapshot, ts, &region->snapshot_list, list)
+ devlink_region_snapshot_del(snapshot);
+
+ list_del(&region->list);
+
+ devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_DEL);
+ mutex_unlock(&devlink->lock);
+ kfree(region);
+}
+EXPORT_SYMBOL_GPL(devlink_region_destroy);
+
+/**
+ * devlink_region_shapshot_id_get - get snapshot ID
+ *
+ * This callback should be called when adding a new snapshot,
+ * Driver should use the same id for multiple snapshots taken
+ * on multiple regions at the same time/by the same trigger.
+ *
+ * @devlink: devlink
+ */
+u32 devlink_region_shapshot_id_get(struct devlink *devlink)
+{
+ u32 id;
+
+ mutex_lock(&devlink->lock);
+ id = ++devlink->snapshot_id;
+ mutex_unlock(&devlink->lock);
+
+ return id;
+}
+EXPORT_SYMBOL_GPL(devlink_region_shapshot_id_get);
+
+/**
+ * devlink_region_snapshot_create - create a new snapshot
+ * This will add a new snapshot of a region. The snapshot
+ * will be stored on the region struct and can be accessed
+ * from devlink. This is useful for future analyses of snapshots.
+ * Multiple snapshots can be created on a region.
+ * The @snapshot_id should be obtained using the getter function.
+ *
+ * @devlink_region: devlink region of the snapshot
+ * @data_len: size of snapshot data
+ * @data: snapshot data
+ * @snapshot_id: snapshot id to be created
+ * @data_destructor: pointer to destructor function to free data
+ */
+int devlink_region_snapshot_create(struct devlink_region *region, u64 data_len,
+ u8 *data, u32 snapshot_id,
+ devlink_snapshot_data_dest_t *data_destructor)
+{
+ struct devlink *devlink = region->devlink;
+ struct devlink_snapshot *snapshot;
+ int err;
+
+ mutex_lock(&devlink->lock);
+
+ /* check if region can hold one more snapshot */
+ if (region->cur_snapshots == region->max_snapshots) {
+ err = -ENOMEM;
+ goto unlock;
+ }
+
+ if (devlink_region_snapshot_get_by_id(region, snapshot_id)) {
+ err = -EEXIST;
+ goto unlock;
+ }
+
+ snapshot = kzalloc(sizeof(*snapshot), GFP_KERNEL);
+ if (!snapshot) {
+ err = -ENOMEM;
+ goto unlock;
+ }
+
+ snapshot->id = snapshot_id;
+ snapshot->region = region;
+ snapshot->data = data;
+ snapshot->data_len = data_len;
+ snapshot->data_destructor = data_destructor;
+
+ list_add_tail(&snapshot->list, &region->snapshot_list);
+
+ region->cur_snapshots++;
+
+ devlink_nl_region_notify(region, snapshot, DEVLINK_CMD_REGION_NEW);
+ mutex_unlock(&devlink->lock);
+ return 0;
+
+unlock:
+ mutex_unlock(&devlink->lock);
+ return err;
+}
+EXPORT_SYMBOL_GPL(devlink_region_snapshot_create);
+
static int __init devlink_module_init(void)
{
return genl_register_family(&devlink_nl_family);
diff --git a/net/core/dst.c b/net/core/dst.c
index 2d9b37f8944a..81ccf20e2826 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -307,6 +307,7 @@ void metadata_dst_free(struct metadata_dst *md_dst)
#endif
kfree(md_dst);
}
+EXPORT_SYMBOL_GPL(metadata_dst_free);
struct metadata_dst __percpu *
metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags)
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index e677a20180cf..d05402868575 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -27,6 +27,7 @@
#include <linux/rtnetlink.h>
#include <linux/sched/signal.h>
#include <linux/net.h>
+#include <net/xdp_sock.h>
/*
* Some useful ethtool_ops methods that're device independent.
@@ -111,6 +112,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
[NETIF_F_RX_UDP_TUNNEL_PORT_BIT] = "rx-udp_tunnel-port-offload",
[NETIF_F_HW_TLS_RECORD_BIT] = "tls-hw-record",
[NETIF_F_HW_TLS_TX_BIT] = "tls-hw-tx-offload",
+ [NETIF_F_HW_TLS_RX_BIT] = "tls-hw-rx-offload",
};
static const char
@@ -538,47 +540,17 @@ struct ethtool_link_usettings {
} link_modes;
};
-/* Internal kernel helper to query a device ethtool_link_settings.
- *
- * Backward compatibility note: for compatibility with legacy drivers
- * that implement only the ethtool_cmd API, this has to work with both
- * drivers implementing get_link_ksettings API and drivers
- * implementing get_settings API. When drivers implement get_settings
- * and report ethtool_cmd deprecated fields
- * (transceiver/maxrxpkt/maxtxpkt), these fields are silently ignored
- * because the resulting struct ethtool_link_settings does not report them.
- */
+/* Internal kernel helper to query a device ethtool_link_settings. */
int __ethtool_get_link_ksettings(struct net_device *dev,
struct ethtool_link_ksettings *link_ksettings)
{
- int err;
- struct ethtool_cmd cmd;
-
ASSERT_RTNL();
- if (dev->ethtool_ops->get_link_ksettings) {
- memset(link_ksettings, 0, sizeof(*link_ksettings));
- return dev->ethtool_ops->get_link_ksettings(dev,
- link_ksettings);
- }
-
- /* driver doesn't support %ethtool_link_ksettings API. revert to
- * legacy %ethtool_cmd API, unless it's not supported either.
- * TODO: remove when ethtool_ops::get_settings disappears internally
- */
- if (!dev->ethtool_ops->get_settings)
+ if (!dev->ethtool_ops->get_link_ksettings)
return -EOPNOTSUPP;
- memset(&cmd, 0, sizeof(cmd));
- cmd.cmd = ETHTOOL_GSET;
- err = dev->ethtool_ops->get_settings(dev, &cmd);
- if (err < 0)
- return err;
-
- /* we ignore deprecated fields transceiver/maxrxpkt/maxtxpkt
- */
- convert_legacy_settings_to_link_ksettings(link_ksettings, &cmd);
- return err;
+ memset(link_ksettings, 0, sizeof(*link_ksettings));
+ return dev->ethtool_ops->get_link_ksettings(dev, link_ksettings);
}
EXPORT_SYMBOL(__ethtool_get_link_ksettings);
@@ -634,16 +606,7 @@ store_link_ksettings_for_user(void __user *to,
return 0;
}
-/* Query device for its ethtool_link_settings.
- *
- * Backward compatibility note: this function must fail when driver
- * does not implement ethtool::get_link_ksettings, even if legacy
- * ethtool_ops::get_settings is implemented. This tells new versions
- * of ethtool that they should use the legacy API %ETHTOOL_GSET for
- * this driver, so that they can correctly access the ethtool_cmd
- * deprecated fields (transceiver/maxrxpkt/maxtxpkt), until no driver
- * implements ethtool_ops::get_settings anymore.
- */
+/* Query device for its ethtool_link_settings. */
static int ethtool_get_link_ksettings(struct net_device *dev,
void __user *useraddr)
{
@@ -651,7 +614,6 @@ static int ethtool_get_link_ksettings(struct net_device *dev,
struct ethtool_link_ksettings link_ksettings;
ASSERT_RTNL();
-
if (!dev->ethtool_ops->get_link_ksettings)
return -EOPNOTSUPP;
@@ -698,16 +660,7 @@ static int ethtool_get_link_ksettings(struct net_device *dev,
return store_link_ksettings_for_user(useraddr, &link_ksettings);
}
-/* Update device ethtool_link_settings.
- *
- * Backward compatibility note: this function must fail when driver
- * does not implement ethtool::set_link_ksettings, even if legacy
- * ethtool_ops::set_settings is implemented. This tells new versions
- * of ethtool that they should use the legacy API %ETHTOOL_SSET for
- * this driver, so that they can correctly update the ethtool_cmd
- * deprecated fields (transceiver/maxrxpkt/maxtxpkt), until no driver
- * implements ethtool_ops::get_settings anymore.
- */
+/* Update device ethtool_link_settings. */
static int ethtool_set_link_ksettings(struct net_device *dev,
void __user *useraddr)
{
@@ -745,51 +698,31 @@ static int ethtool_set_link_ksettings(struct net_device *dev,
/* Query device for its ethtool_cmd settings.
*
- * Backward compatibility note: for compatibility with legacy ethtool,
- * this has to work with both drivers implementing get_link_ksettings
- * API and drivers implementing get_settings API. When drivers
- * implement get_link_ksettings and report higher link mode bits, a
- * kernel warning is logged once (with name of 1st driver/device) to
- * recommend user to upgrade ethtool, but the command is successful
- * (only the lower link mode bits reported back to user).
+ * Backward compatibility note: for compatibility with legacy ethtool, this is
+ * now implemented via get_link_ksettings. When driver reports higher link mode
+ * bits, a kernel warning is logged once (with name of 1st driver/device) to
+ * recommend user to upgrade ethtool, but the command is successful (only the
+ * lower link mode bits reported back to user). Deprecated fields from
+ * ethtool_cmd (transceiver/maxrxpkt/maxtxpkt) are always set to zero.
*/
static int ethtool_get_settings(struct net_device *dev, void __user *useraddr)
{
+ struct ethtool_link_ksettings link_ksettings;
struct ethtool_cmd cmd;
+ int err;
ASSERT_RTNL();
+ if (!dev->ethtool_ops->get_link_ksettings)
+ return -EOPNOTSUPP;
- if (dev->ethtool_ops->get_link_ksettings) {
- /* First, use link_ksettings API if it is supported */
- int err;
- struct ethtool_link_ksettings link_ksettings;
-
- memset(&link_ksettings, 0, sizeof(link_ksettings));
- err = dev->ethtool_ops->get_link_ksettings(dev,
- &link_ksettings);
- if (err < 0)
- return err;
- convert_link_ksettings_to_legacy_settings(&cmd,
- &link_ksettings);
-
- /* send a sensible cmd tag back to user */
- cmd.cmd = ETHTOOL_GSET;
- } else {
- /* driver doesn't support %ethtool_link_ksettings
- * API. revert to legacy %ethtool_cmd API, unless it's
- * not supported either.
- */
- int err;
-
- if (!dev->ethtool_ops->get_settings)
- return -EOPNOTSUPP;
+ memset(&link_ksettings, 0, sizeof(link_ksettings));
+ err = dev->ethtool_ops->get_link_ksettings(dev, &link_ksettings);
+ if (err < 0)
+ return err;
+ convert_link_ksettings_to_legacy_settings(&cmd, &link_ksettings);
- memset(&cmd, 0, sizeof(cmd));
- cmd.cmd = ETHTOOL_GSET;
- err = dev->ethtool_ops->get_settings(dev, &cmd);
- if (err < 0)
- return err;
- }
+ /* send a sensible cmd tag back to user */
+ cmd.cmd = ETHTOOL_GSET;
if (copy_to_user(useraddr, &cmd, sizeof(cmd)))
return -EFAULT;
@@ -799,48 +732,29 @@ static int ethtool_get_settings(struct net_device *dev, void __user *useraddr)
/* Update device link settings with given ethtool_cmd.
*
- * Backward compatibility note: for compatibility with legacy ethtool,
- * this has to work with both drivers implementing set_link_ksettings
- * API and drivers implementing set_settings API. When drivers
- * implement set_link_ksettings and user's request updates deprecated
- * ethtool_cmd fields (transceiver/maxrxpkt/maxtxpkt), a kernel
- * warning is logged once (with name of 1st driver/device) to
- * recommend user to upgrade ethtool, and the request is rejected.
+ * Backward compatibility note: for compatibility with legacy ethtool, this is
+ * now always implemented via set_link_settings. When user's request updates
+ * deprecated ethtool_cmd fields (transceiver/maxrxpkt/maxtxpkt), a kernel
+ * warning is logged once (with name of 1st driver/device) to recommend user to
+ * upgrade ethtool, and the request is rejected.
*/
static int ethtool_set_settings(struct net_device *dev, void __user *useraddr)
{
+ struct ethtool_link_ksettings link_ksettings;
struct ethtool_cmd cmd;
ASSERT_RTNL();
if (copy_from_user(&cmd, useraddr, sizeof(cmd)))
return -EFAULT;
-
- /* first, try new %ethtool_link_ksettings API. */
- if (dev->ethtool_ops->set_link_ksettings) {
- struct ethtool_link_ksettings link_ksettings;
-
- if (!convert_legacy_settings_to_link_ksettings(&link_ksettings,
- &cmd))
- return -EINVAL;
-
- link_ksettings.base.cmd = ETHTOOL_SLINKSETTINGS;
- link_ksettings.base.link_mode_masks_nwords
- = __ETHTOOL_LINK_MODE_MASK_NU32;
- return dev->ethtool_ops->set_link_ksettings(dev,
- &link_ksettings);
- }
-
- /* legacy %ethtool_cmd API */
-
- /* TODO: return -EOPNOTSUPP when ethtool_ops::get_settings
- * disappears internally
- */
-
- if (!dev->ethtool_ops->set_settings)
+ if (!dev->ethtool_ops->set_link_ksettings)
return -EOPNOTSUPP;
- return dev->ethtool_ops->set_settings(dev, &cmd);
+ if (!convert_legacy_settings_to_link_ksettings(&link_ksettings, &cmd))
+ return -EINVAL;
+ link_ksettings.base.link_mode_masks_nwords =
+ __ETHTOOL_LINK_MODE_MASK_NU32;
+ return dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings);
}
static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev,
@@ -1014,6 +928,9 @@ static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev,
return -EINVAL;
}
+ if (info.cmd != cmd)
+ return -EINVAL;
+
if (info.cmd == ETHTOOL_GRXCLSRLALL) {
if (info.rule_cnt > 0) {
if (info.rule_cnt <= KMALLOC_MAX_SIZE / sizeof(u32))
@@ -1482,6 +1399,7 @@ static int ethtool_get_wol(struct net_device *dev, char __user *useraddr)
static int ethtool_set_wol(struct net_device *dev, char __user *useraddr)
{
struct ethtool_wolinfo wol;
+ int ret;
if (!dev->ethtool_ops->set_wol)
return -EOPNOTSUPP;
@@ -1489,7 +1407,13 @@ static int ethtool_set_wol(struct net_device *dev, char __user *useraddr)
if (copy_from_user(&wol, useraddr, sizeof(wol)))
return -EFAULT;
- return dev->ethtool_ops->set_wol(dev, &wol);
+ ret = dev->ethtool_ops->set_wol(dev, &wol);
+ if (ret)
+ return ret;
+
+ dev->wol_enabled = !!wol.wolopts;
+
+ return 0;
}
static int ethtool_get_eee(struct net_device *dev, char __user *useraddr)
@@ -1742,8 +1666,10 @@ static noinline_for_stack int ethtool_get_channels(struct net_device *dev,
static noinline_for_stack int ethtool_set_channels(struct net_device *dev,
void __user *useraddr)
{
- struct ethtool_channels channels, max = { .cmd = ETHTOOL_GCHANNELS };
+ struct ethtool_channels channels, curr = { .cmd = ETHTOOL_GCHANNELS };
+ u16 from_channel, to_channel;
u32 max_rx_in_use = 0;
+ unsigned int i;
if (!dev->ethtool_ops->set_channels || !dev->ethtool_ops->get_channels)
return -EOPNOTSUPP;
@@ -1751,13 +1677,13 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev,
if (copy_from_user(&channels, useraddr, sizeof(channels)))
return -EFAULT;
- dev->ethtool_ops->get_channels(dev, &max);
+ dev->ethtool_ops->get_channels(dev, &curr);
/* ensure new counts are within the maximums */
- if ((channels.rx_count > max.max_rx) ||
- (channels.tx_count > max.max_tx) ||
- (channels.combined_count > max.max_combined) ||
- (channels.other_count > max.max_other))
+ if (channels.rx_count > curr.max_rx ||
+ channels.tx_count > curr.max_tx ||
+ channels.combined_count > curr.max_combined ||
+ channels.other_count > curr.max_other)
return -EINVAL;
/* ensure the new Rx count fits within the configured Rx flow
@@ -1767,6 +1693,14 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev,
(channels.combined_count + channels.rx_count) <= max_rx_in_use)
return -EINVAL;
+ /* Disabling channels, query zero-copy AF_XDP sockets */
+ from_channel = channels.combined_count +
+ min(channels.rx_count, channels.tx_count);
+ to_channel = curr.combined_count + max(curr.rx_count, curr.tx_count);
+ for (i = from_channel; i < to_channel; i++)
+ if (xdp_get_umem_from_qid(dev, i))
+ return -EINVAL;
+
return dev->ethtool_ops->set_channels(dev, &channels);
}
@@ -2461,13 +2395,17 @@ roll_back:
return ret;
}
-static int ethtool_set_per_queue(struct net_device *dev, void __user *useraddr)
+static int ethtool_set_per_queue(struct net_device *dev,
+ void __user *useraddr, u32 sub_cmd)
{
struct ethtool_per_queue_op per_queue_opt;
if (copy_from_user(&per_queue_opt, useraddr, sizeof(per_queue_opt)))
return -EFAULT;
+ if (per_queue_opt.sub_command != sub_cmd)
+ return -EINVAL;
+
switch (per_queue_opt.sub_command) {
case ETHTOOL_GCOALESCE:
return ethtool_get_per_queue_coalesce(dev, useraddr, &per_queue_opt);
@@ -2623,6 +2561,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
case ETHTOOL_GPHYSTATS:
case ETHTOOL_GTSO:
case ETHTOOL_GPERMADDR:
+ case ETHTOOL_GUFO:
case ETHTOOL_GGSO:
case ETHTOOL_GGRO:
case ETHTOOL_GFLAGS:
@@ -2837,7 +2776,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
rc = ethtool_get_phy_stats(dev, useraddr);
break;
case ETHTOOL_PERQUEUE:
- rc = ethtool_set_per_queue(dev, useraddr);
+ rc = ethtool_set_per_queue(dev, useraddr, sub_cmd);
break;
case ETHTOOL_GLINKSETTINGS:
rc = ethtool_get_link_ksettings(dev, useraddr);
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index f64aa13811ea..ffbb827723a2 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -924,8 +924,7 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,
return 0;
errout:
- if (nlrule)
- kfree(nlrule);
+ kfree(nlrule);
rules_ops_put(ops);
return err;
}
@@ -1064,13 +1063,47 @@ skip:
return err;
}
+static int fib_valid_dumprule_req(const struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct fib_rule_hdr *frh;
+
+ if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) {
+ NL_SET_ERR_MSG(extack, "Invalid header for fib rule dump request");
+ return -EINVAL;
+ }
+
+ frh = nlmsg_data(nlh);
+ if (frh->dst_len || frh->src_len || frh->tos || frh->table ||
+ frh->res1 || frh->res2 || frh->action || frh->flags) {
+ NL_SET_ERR_MSG(extack,
+ "Invalid values in header for fib rule dump request");
+ return -EINVAL;
+ }
+
+ if (nlmsg_attrlen(nlh, sizeof(*frh))) {
+ NL_SET_ERR_MSG(extack, "Invalid data after header in fib rule dump request");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb)
{
+ const struct nlmsghdr *nlh = cb->nlh;
struct net *net = sock_net(skb->sk);
struct fib_rules_ops *ops;
int idx = 0, family;
- family = rtnl_msg_family(cb->nlh);
+ if (cb->strict_check) {
+ int err = fib_valid_dumprule_req(nlh, cb->extack);
+
+ if (err < 0)
+ return err;
+ }
+
+ family = rtnl_msg_family(nlh);
if (family != AF_UNSPEC) {
/* Protocol specific dump request */
ops = lookup_rules_ops(net, family);
diff --git a/net/core/filter.c b/net/core/filter.c
index 9dfd145eedcc..35c6933c2622 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -38,6 +38,7 @@
#include <net/protocol.h>
#include <net/netlink.h>
#include <linux/skbuff.h>
+#include <linux/skmsg.h>
#include <net/sock.h>
#include <net/flow_dissector.h>
#include <linux/errno.h>
@@ -58,13 +59,17 @@
#include <net/busy_poll.h>
#include <net/tcp.h>
#include <net/xfrm.h>
+#include <net/udp.h>
#include <linux/bpf_trace.h>
#include <net/xdp_sock.h>
#include <linux/inetdevice.h>
+#include <net/inet_hashtables.h>
+#include <net/inet6_hashtables.h>
#include <net/ip_fib.h>
#include <net/flow.h>
#include <net/arp.h>
#include <net/ipv6.h>
+#include <net/net_namespace.h>
#include <linux/seg6_local.h>
#include <net/seg6.h>
#include <net/seg6_local.h>
@@ -1453,30 +1458,6 @@ static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk)
return 0;
}
-static int __reuseport_attach_prog(struct bpf_prog *prog, struct sock *sk)
-{
- struct bpf_prog *old_prog;
- int err;
-
- if (bpf_prog_size(prog->len) > sysctl_optmem_max)
- return -ENOMEM;
-
- if (sk_unhashed(sk) && sk->sk_reuseport) {
- err = reuseport_alloc(sk);
- if (err)
- return err;
- } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) {
- /* The socket wasn't bound with SO_REUSEPORT */
- return -EINVAL;
- }
-
- old_prog = reuseport_attach_prog(sk, prog);
- if (old_prog)
- bpf_prog_destroy(old_prog);
-
- return 0;
-}
-
static
struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk)
{
@@ -1550,13 +1531,15 @@ int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk)
if (IS_ERR(prog))
return PTR_ERR(prog);
- err = __reuseport_attach_prog(prog, sk);
- if (err < 0) {
+ if (bpf_prog_size(prog->len) > sysctl_optmem_max)
+ err = -ENOMEM;
+ else
+ err = reuseport_attach_prog(sk, prog);
+
+ if (err)
__bpf_prog_release(prog);
- return err;
- }
- return 0;
+ return err;
}
static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk)
@@ -1586,19 +1569,58 @@ int sk_attach_bpf(u32 ufd, struct sock *sk)
int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk)
{
- struct bpf_prog *prog = __get_bpf(ufd, sk);
+ struct bpf_prog *prog;
int err;
+ if (sock_flag(sk, SOCK_FILTER_LOCKED))
+ return -EPERM;
+
+ prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER);
+ if (IS_ERR(prog) && PTR_ERR(prog) == -EINVAL)
+ prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SK_REUSEPORT);
if (IS_ERR(prog))
return PTR_ERR(prog);
- err = __reuseport_attach_prog(prog, sk);
- if (err < 0) {
- bpf_prog_put(prog);
- return err;
+ if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) {
+ /* Like other non BPF_PROG_TYPE_SOCKET_FILTER
+ * bpf prog (e.g. sockmap). It depends on the
+ * limitation imposed by bpf_prog_load().
+ * Hence, sysctl_optmem_max is not checked.
+ */
+ if ((sk->sk_type != SOCK_STREAM &&
+ sk->sk_type != SOCK_DGRAM) ||
+ (sk->sk_protocol != IPPROTO_UDP &&
+ sk->sk_protocol != IPPROTO_TCP) ||
+ (sk->sk_family != AF_INET &&
+ sk->sk_family != AF_INET6)) {
+ err = -ENOTSUPP;
+ goto err_prog_put;
+ }
+ } else {
+ /* BPF_PROG_TYPE_SOCKET_FILTER */
+ if (bpf_prog_size(prog->len) > sysctl_optmem_max) {
+ err = -ENOMEM;
+ goto err_prog_put;
+ }
}
- return 0;
+ err = reuseport_attach_prog(sk, prog);
+err_prog_put:
+ if (err)
+ bpf_prog_put(prog);
+
+ return err;
+}
+
+void sk_reuseport_prog_free(struct bpf_prog *prog)
+{
+ if (!prog)
+ return;
+
+ if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT)
+ bpf_prog_put(prog);
+ else
+ bpf_prog_destroy(prog);
}
struct bpf_scratchpad {
@@ -2082,19 +2104,12 @@ static const struct bpf_func_proto bpf_clone_redirect_proto = {
.arg3_type = ARG_ANYTHING,
};
-struct redirect_info {
- u32 ifindex;
- u32 flags;
- struct bpf_map *map;
- struct bpf_map *map_to_flush;
- unsigned long map_owner;
-};
-
-static DEFINE_PER_CPU(struct redirect_info, redirect_info);
+DEFINE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info);
+EXPORT_PER_CPU_SYMBOL_GPL(bpf_redirect_info);
BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
{
- struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+ struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
if (unlikely(flags & ~(BPF_F_INGRESS)))
return TC_ACT_SHOT;
@@ -2107,7 +2122,7 @@ BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
int skb_do_redirect(struct sk_buff *skb)
{
- struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+ struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
struct net_device *dev;
dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->ifindex);
@@ -2128,123 +2143,7 @@ static const struct bpf_func_proto bpf_redirect_proto = {
.arg2_type = ARG_ANYTHING,
};
-BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb,
- struct bpf_map *, map, void *, key, u64, flags)
-{
- struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
-
- /* If user passes invalid input drop the packet. */
- if (unlikely(flags & ~(BPF_F_INGRESS)))
- return SK_DROP;
-
- tcb->bpf.flags = flags;
- tcb->bpf.sk_redir = __sock_hash_lookup_elem(map, key);
- if (!tcb->bpf.sk_redir)
- return SK_DROP;
-
- return SK_PASS;
-}
-
-static const struct bpf_func_proto bpf_sk_redirect_hash_proto = {
- .func = bpf_sk_redirect_hash,
- .gpl_only = false,
- .ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_CONST_MAP_PTR,
- .arg3_type = ARG_PTR_TO_MAP_KEY,
- .arg4_type = ARG_ANYTHING,
-};
-
-BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
- struct bpf_map *, map, u32, key, u64, flags)
-{
- struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
-
- /* If user passes invalid input drop the packet. */
- if (unlikely(flags & ~(BPF_F_INGRESS)))
- return SK_DROP;
-
- tcb->bpf.flags = flags;
- tcb->bpf.sk_redir = __sock_map_lookup_elem(map, key);
- if (!tcb->bpf.sk_redir)
- return SK_DROP;
-
- return SK_PASS;
-}
-
-struct sock *do_sk_redirect_map(struct sk_buff *skb)
-{
- struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
-
- return tcb->bpf.sk_redir;
-}
-
-static const struct bpf_func_proto bpf_sk_redirect_map_proto = {
- .func = bpf_sk_redirect_map,
- .gpl_only = false,
- .ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_CONST_MAP_PTR,
- .arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_ANYTHING,
-};
-
-BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg_buff *, msg,
- struct bpf_map *, map, void *, key, u64, flags)
-{
- /* If user passes invalid input drop the packet. */
- if (unlikely(flags & ~(BPF_F_INGRESS)))
- return SK_DROP;
-
- msg->flags = flags;
- msg->sk_redir = __sock_hash_lookup_elem(map, key);
- if (!msg->sk_redir)
- return SK_DROP;
-
- return SK_PASS;
-}
-
-static const struct bpf_func_proto bpf_msg_redirect_hash_proto = {
- .func = bpf_msg_redirect_hash,
- .gpl_only = false,
- .ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_CONST_MAP_PTR,
- .arg3_type = ARG_PTR_TO_MAP_KEY,
- .arg4_type = ARG_ANYTHING,
-};
-
-BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg,
- struct bpf_map *, map, u32, key, u64, flags)
-{
- /* If user passes invalid input drop the packet. */
- if (unlikely(flags & ~(BPF_F_INGRESS)))
- return SK_DROP;
-
- msg->flags = flags;
- msg->sk_redir = __sock_map_lookup_elem(map, key);
- if (!msg->sk_redir)
- return SK_DROP;
-
- return SK_PASS;
-}
-
-struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
-{
- return msg->sk_redir;
-}
-
-static const struct bpf_func_proto bpf_msg_redirect_map_proto = {
- .func = bpf_msg_redirect_map,
- .gpl_only = false,
- .ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_CONST_MAP_PTR,
- .arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_ANYTHING,
-};
-
-BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg_buff *, msg, u32, bytes)
+BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg *, msg, u32, bytes)
{
msg->apply_bytes = bytes;
return 0;
@@ -2258,7 +2157,7 @@ static const struct bpf_func_proto bpf_msg_apply_bytes_proto = {
.arg2_type = ARG_ANYTHING,
};
-BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg_buff *, msg, u32, bytes)
+BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg *, msg, u32, bytes)
{
msg->cork_bytes = bytes;
return 0;
@@ -2272,39 +2171,39 @@ static const struct bpf_func_proto bpf_msg_cork_bytes_proto = {
.arg2_type = ARG_ANYTHING,
};
-BPF_CALL_4(bpf_msg_pull_data,
- struct sk_msg_buff *, msg, u32, start, u32, end, u64, flags)
+BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
+ u32, end, u64, flags)
{
- unsigned int len = 0, offset = 0, copy = 0;
- struct scatterlist *sg = msg->sg_data;
- int first_sg, last_sg, i, shift;
- unsigned char *p, *to, *from;
- int bytes = end - start;
+ u32 len = 0, offset = 0, copy = 0, poffset = 0, bytes = end - start;
+ u32 first_sge, last_sge, i, shift, bytes_sg_total;
+ struct scatterlist *sge;
+ u8 *raw, *to, *from;
struct page *page;
if (unlikely(flags || end <= start))
return -EINVAL;
/* First find the starting scatterlist element */
- i = msg->sg_start;
+ i = msg->sg.start;
do {
- len = sg[i].length;
- offset += len;
+ len = sk_msg_elem(msg, i)->length;
if (start < offset + len)
break;
- i++;
- if (i == MAX_SKB_FRAGS)
- i = 0;
- } while (i != msg->sg_end);
+ offset += len;
+ sk_msg_iter_var_next(i);
+ } while (i != msg->sg.end);
if (unlikely(start >= offset + len))
return -EINVAL;
- if (!msg->sg_copy[i] && bytes <= len)
+ first_sge = i;
+ /* The start may point into the sg element so we need to also
+ * account for the headroom.
+ */
+ bytes_sg_total = start - offset + bytes;
+ if (!msg->sg.copy[i] && bytes_sg_total <= len)
goto out;
- first_sg = i;
-
/* At this point we need to linearize multiple scatterlist
* elements or a single shared page. Either way we need to
* copy into a linear buffer exclusively owned by BPF. Then
@@ -2316,79 +2215,75 @@ BPF_CALL_4(bpf_msg_pull_data,
* will copy the entire sg entry.
*/
do {
- copy += sg[i].length;
- i++;
- if (i == MAX_SKB_FRAGS)
- i = 0;
- if (bytes < copy)
+ copy += sk_msg_elem(msg, i)->length;
+ sk_msg_iter_var_next(i);
+ if (bytes_sg_total <= copy)
break;
- } while (i != msg->sg_end);
- last_sg = i;
+ } while (i != msg->sg.end);
+ last_sge = i;
- if (unlikely(copy < end - start))
+ if (unlikely(bytes_sg_total > copy))
return -EINVAL;
- page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC, get_order(copy));
+ page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP,
+ get_order(copy));
if (unlikely(!page))
return -ENOMEM;
- p = page_address(page);
- offset = 0;
- i = first_sg;
+ raw = page_address(page);
+ i = first_sge;
do {
- from = sg_virt(&sg[i]);
- len = sg[i].length;
- to = p + offset;
+ sge = sk_msg_elem(msg, i);
+ from = sg_virt(sge);
+ len = sge->length;
+ to = raw + poffset;
memcpy(to, from, len);
- offset += len;
- sg[i].length = 0;
- put_page(sg_page(&sg[i]));
+ poffset += len;
+ sge->length = 0;
+ put_page(sg_page(sge));
- i++;
- if (i == MAX_SKB_FRAGS)
- i = 0;
- } while (i != last_sg);
+ sk_msg_iter_var_next(i);
+ } while (i != last_sge);
- sg[first_sg].length = copy;
- sg_set_page(&sg[first_sg], page, copy, 0);
+ sg_set_page(&msg->sg.data[first_sge], page, copy, 0);
/* To repair sg ring we need to shift entries. If we only
* had a single entry though we can just replace it and
* be done. Otherwise walk the ring and shift the entries.
*/
- shift = last_sg - first_sg - 1;
+ WARN_ON_ONCE(last_sge == first_sge);
+ shift = last_sge > first_sge ?
+ last_sge - first_sge - 1 :
+ MAX_SKB_FRAGS - first_sge + last_sge - 1;
if (!shift)
goto out;
- i = first_sg + 1;
+ i = first_sge;
+ sk_msg_iter_var_next(i);
do {
- int move_from;
+ u32 move_from;
- if (i + shift >= MAX_SKB_FRAGS)
- move_from = i + shift - MAX_SKB_FRAGS;
+ if (i + shift >= MAX_MSG_FRAGS)
+ move_from = i + shift - MAX_MSG_FRAGS;
else
move_from = i + shift;
-
- if (move_from == msg->sg_end)
+ if (move_from == msg->sg.end)
break;
- sg[i] = sg[move_from];
- sg[move_from].length = 0;
- sg[move_from].page_link = 0;
- sg[move_from].offset = 0;
-
- i++;
- if (i == MAX_SKB_FRAGS)
- i = 0;
+ msg->sg.data[i] = msg->sg.data[move_from];
+ msg->sg.data[move_from].length = 0;
+ msg->sg.data[move_from].page_link = 0;
+ msg->sg.data[move_from].offset = 0;
+ sk_msg_iter_var_next(i);
} while (1);
- msg->sg_end -= shift;
- if (msg->sg_end < 0)
- msg->sg_end += MAX_SKB_FRAGS;
+
+ msg->sg.end = msg->sg.end - shift > msg->sg.end ?
+ msg->sg.end - shift + MAX_MSG_FRAGS :
+ msg->sg.end - shift;
out:
- msg->data = sg_virt(&sg[i]) + start - offset;
+ msg->data = sg_virt(&msg->sg.data[first_sge]) + start - offset;
msg->data_end = msg->data + bytes;
-
return 0;
}
@@ -2402,6 +2297,137 @@ static const struct bpf_func_proto bpf_msg_pull_data_proto = {
.arg4_type = ARG_ANYTHING,
};
+BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
+ u32, len, u64, flags)
+{
+ struct scatterlist sge, nsge, nnsge, rsge = {0}, *psge;
+ u32 new, i = 0, l, space, copy = 0, offset = 0;
+ u8 *raw, *to, *from;
+ struct page *page;
+
+ if (unlikely(flags))
+ return -EINVAL;
+
+ /* First find the starting scatterlist element */
+ i = msg->sg.start;
+ do {
+ l = sk_msg_elem(msg, i)->length;
+
+ if (start < offset + l)
+ break;
+ offset += l;
+ sk_msg_iter_var_next(i);
+ } while (i != msg->sg.end);
+
+ if (start >= offset + l)
+ return -EINVAL;
+
+ space = MAX_MSG_FRAGS - sk_msg_elem_used(msg);
+
+ /* If no space available will fallback to copy, we need at
+ * least one scatterlist elem available to push data into
+ * when start aligns to the beginning of an element or two
+ * when it falls inside an element. We handle the start equals
+ * offset case because its the common case for inserting a
+ * header.
+ */
+ if (!space || (space == 1 && start != offset))
+ copy = msg->sg.data[i].length;
+
+ page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP,
+ get_order(copy + len));
+ if (unlikely(!page))
+ return -ENOMEM;
+
+ if (copy) {
+ int front, back;
+
+ raw = page_address(page);
+
+ psge = sk_msg_elem(msg, i);
+ front = start - offset;
+ back = psge->length - front;
+ from = sg_virt(psge);
+
+ if (front)
+ memcpy(raw, from, front);
+
+ if (back) {
+ from += front;
+ to = raw + front + len;
+
+ memcpy(to, from, back);
+ }
+
+ put_page(sg_page(psge));
+ } else if (start - offset) {
+ psge = sk_msg_elem(msg, i);
+ rsge = sk_msg_elem_cpy(msg, i);
+
+ psge->length = start - offset;
+ rsge.length -= psge->length;
+ rsge.offset += start;
+
+ sk_msg_iter_var_next(i);
+ sg_unmark_end(psge);
+ sk_msg_iter_next(msg, end);
+ }
+
+ /* Slot(s) to place newly allocated data */
+ new = i;
+
+ /* Shift one or two slots as needed */
+ if (!copy) {
+ sge = sk_msg_elem_cpy(msg, i);
+
+ sk_msg_iter_var_next(i);
+ sg_unmark_end(&sge);
+ sk_msg_iter_next(msg, end);
+
+ nsge = sk_msg_elem_cpy(msg, i);
+ if (rsge.length) {
+ sk_msg_iter_var_next(i);
+ nnsge = sk_msg_elem_cpy(msg, i);
+ }
+
+ while (i != msg->sg.end) {
+ msg->sg.data[i] = sge;
+ sge = nsge;
+ sk_msg_iter_var_next(i);
+ if (rsge.length) {
+ nsge = nnsge;
+ nnsge = sk_msg_elem_cpy(msg, i);
+ } else {
+ nsge = sk_msg_elem_cpy(msg, i);
+ }
+ }
+ }
+
+ /* Place newly allocated data buffer */
+ sk_mem_charge(msg->sk, len);
+ msg->sg.size += len;
+ msg->sg.copy[new] = false;
+ sg_set_page(&msg->sg.data[new], page, len + copy, 0);
+ if (rsge.length) {
+ get_page(sg_page(&rsge));
+ sk_msg_iter_var_next(new);
+ msg->sg.data[new] = rsge;
+ }
+
+ sk_msg_compute_data_pointers(msg);
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_msg_push_data_proto = {
+ .func = bpf_msg_push_data,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_ANYTHING,
+};
+
BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
{
return task_get_classid(skb);
@@ -3160,6 +3186,32 @@ static int __bpf_tx_xdp(struct net_device *dev,
return 0;
}
+static noinline int
+xdp_do_redirect_slow(struct net_device *dev, struct xdp_buff *xdp,
+ struct bpf_prog *xdp_prog, struct bpf_redirect_info *ri)
+{
+ struct net_device *fwd;
+ u32 index = ri->ifindex;
+ int err;
+
+ fwd = dev_get_by_index_rcu(dev_net(dev), index);
+ ri->ifindex = 0;
+ if (unlikely(!fwd)) {
+ err = -EINVAL;
+ goto err;
+ }
+
+ err = __bpf_tx_xdp(fwd, NULL, xdp, 0);
+ if (unlikely(err))
+ goto err;
+
+ _trace_xdp_redirect(dev, xdp_prog, index);
+ return 0;
+err:
+ _trace_xdp_redirect_err(dev, xdp_prog, index, err);
+ return err;
+}
+
static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
struct bpf_map *map,
struct xdp_buff *xdp,
@@ -3172,7 +3224,7 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
struct bpf_dtab_netdev *dst = fwd;
err = dev_map_enqueue(dst, xdp, dev_rx);
- if (err)
+ if (unlikely(err))
return err;
__dev_map_insert_ctx(map, index);
break;
@@ -3181,7 +3233,7 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
struct bpf_cpu_map_entry *rcpu = fwd;
err = cpu_map_enqueue(rcpu, xdp, dev_rx);
- if (err)
+ if (unlikely(err))
return err;
__cpu_map_insert_ctx(map, index);
break;
@@ -3200,7 +3252,7 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
void xdp_do_flush_map(void)
{
- struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+ struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
struct bpf_map *map = ri->map_to_flush;
ri->map_to_flush = NULL;
@@ -3222,7 +3274,7 @@ void xdp_do_flush_map(void)
}
EXPORT_SYMBOL_GPL(xdp_do_flush_map);
-static void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index)
+static inline void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index)
{
switch (map->map_type) {
case BPF_MAP_TYPE_DEVMAP:
@@ -3236,38 +3288,40 @@ static void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index)
}
}
-static inline bool xdp_map_invalid(const struct bpf_prog *xdp_prog,
- unsigned long aux)
+void bpf_clear_redirect_map(struct bpf_map *map)
{
- return (unsigned long)xdp_prog->aux != aux;
+ struct bpf_redirect_info *ri;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ ri = per_cpu_ptr(&bpf_redirect_info, cpu);
+ /* Avoid polluting remote cacheline due to writes if
+ * not needed. Once we pass this test, we need the
+ * cmpxchg() to make sure it hasn't been changed in
+ * the meantime by remote CPU.
+ */
+ if (unlikely(READ_ONCE(ri->map) == map))
+ cmpxchg(&ri->map, map, NULL);
+ }
}
static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
- struct bpf_prog *xdp_prog)
+ struct bpf_prog *xdp_prog, struct bpf_map *map,
+ struct bpf_redirect_info *ri)
{
- struct redirect_info *ri = this_cpu_ptr(&redirect_info);
- unsigned long map_owner = ri->map_owner;
- struct bpf_map *map = ri->map;
u32 index = ri->ifindex;
void *fwd = NULL;
int err;
ri->ifindex = 0;
- ri->map = NULL;
- ri->map_owner = 0;
-
- if (unlikely(xdp_map_invalid(xdp_prog, map_owner))) {
- err = -EFAULT;
- map = NULL;
- goto err;
- }
+ WRITE_ONCE(ri->map, NULL);
fwd = __xdp_map_lookup_elem(map, index);
- if (!fwd) {
+ if (unlikely(!fwd)) {
err = -EINVAL;
goto err;
}
- if (ri->map_to_flush && ri->map_to_flush != map)
+ if (ri->map_to_flush && unlikely(ri->map_to_flush != map))
xdp_do_flush_map();
err = __bpf_tx_xdp_map(dev, fwd, map, xdp, index);
@@ -3285,54 +3339,30 @@ err:
int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
struct bpf_prog *xdp_prog)
{
- struct redirect_info *ri = this_cpu_ptr(&redirect_info);
- struct net_device *fwd;
- u32 index = ri->ifindex;
- int err;
-
- if (ri->map)
- return xdp_do_redirect_map(dev, xdp, xdp_prog);
+ struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+ struct bpf_map *map = READ_ONCE(ri->map);
- fwd = dev_get_by_index_rcu(dev_net(dev), index);
- ri->ifindex = 0;
- if (unlikely(!fwd)) {
- err = -EINVAL;
- goto err;
- }
-
- err = __bpf_tx_xdp(fwd, NULL, xdp, 0);
- if (unlikely(err))
- goto err;
+ if (likely(map))
+ return xdp_do_redirect_map(dev, xdp, xdp_prog, map, ri);
- _trace_xdp_redirect(dev, xdp_prog, index);
- return 0;
-err:
- _trace_xdp_redirect_err(dev, xdp_prog, index, err);
- return err;
+ return xdp_do_redirect_slow(dev, xdp, xdp_prog, ri);
}
EXPORT_SYMBOL_GPL(xdp_do_redirect);
static int xdp_do_generic_redirect_map(struct net_device *dev,
struct sk_buff *skb,
struct xdp_buff *xdp,
- struct bpf_prog *xdp_prog)
+ struct bpf_prog *xdp_prog,
+ struct bpf_map *map)
{
- struct redirect_info *ri = this_cpu_ptr(&redirect_info);
- unsigned long map_owner = ri->map_owner;
- struct bpf_map *map = ri->map;
+ struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
u32 index = ri->ifindex;
void *fwd = NULL;
int err = 0;
ri->ifindex = 0;
- ri->map = NULL;
- ri->map_owner = 0;
+ WRITE_ONCE(ri->map, NULL);
- if (unlikely(xdp_map_invalid(xdp_prog, map_owner))) {
- err = -EFAULT;
- map = NULL;
- goto err;
- }
fwd = __xdp_map_lookup_elem(map, index);
if (unlikely(!fwd)) {
err = -EINVAL;
@@ -3368,14 +3398,15 @@ err:
int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
struct xdp_buff *xdp, struct bpf_prog *xdp_prog)
{
- struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+ struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+ struct bpf_map *map = READ_ONCE(ri->map);
u32 index = ri->ifindex;
struct net_device *fwd;
int err = 0;
- if (ri->map)
- return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog);
-
+ if (map)
+ return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog,
+ map);
ri->ifindex = 0;
fwd = dev_get_by_index_rcu(dev_net(dev), index);
if (unlikely(!fwd)) {
@@ -3399,15 +3430,14 @@ EXPORT_SYMBOL_GPL(xdp_do_generic_redirect);
BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags)
{
- struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+ struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
if (unlikely(flags))
return XDP_ABORTED;
ri->ifindex = ifindex;
ri->flags = flags;
- ri->map = NULL;
- ri->map_owner = 0;
+ WRITE_ONCE(ri->map, NULL);
return XDP_REDIRECT;
}
@@ -3420,25 +3450,21 @@ static const struct bpf_func_proto bpf_xdp_redirect_proto = {
.arg2_type = ARG_ANYTHING,
};
-BPF_CALL_4(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags,
- unsigned long, map_owner)
+BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex,
+ u64, flags)
{
- struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+ struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
if (unlikely(flags))
return XDP_ABORTED;
ri->ifindex = ifindex;
ri->flags = flags;
- ri->map = map;
- ri->map_owner = map_owner;
+ WRITE_ONCE(ri->map, map);
return XDP_REDIRECT;
}
-/* Note, arg4 is hidden from users and populated by the verifier
- * with the right pointer.
- */
static const struct bpf_func_proto bpf_xdp_redirect_map_proto = {
.func = bpf_xdp_redirect_map,
.gpl_only = false,
@@ -3681,7 +3707,7 @@ BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb,
if (unlikely(size > IP_TUNNEL_OPTS_MAX))
return -ENOMEM;
- ip_tunnel_info_opts_set(info, from, size);
+ ip_tunnel_info_opts_set(info, from, size, TUNNEL_OPTIONS_PRESENT);
return 0;
}
@@ -3768,6 +3794,32 @@ static const struct bpf_func_proto bpf_skb_cgroup_id_proto = {
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
};
+
+BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int,
+ ancestor_level)
+{
+ struct sock *sk = skb_to_full_sk(skb);
+ struct cgroup *ancestor;
+ struct cgroup *cgrp;
+
+ if (!sk || !sk_fullsock(sk))
+ return 0;
+
+ cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+ ancestor = cgroup_ancestor(cgrp, ancestor_level);
+ if (!ancestor)
+ return 0;
+
+ return ancestor->kn->id.id;
+}
+
+static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = {
+ .func = bpf_skb_ancestor_cgroup_id,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+};
#endif
static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff,
@@ -3814,6 +3866,30 @@ static const struct bpf_func_proto bpf_get_socket_cookie_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
+BPF_CALL_1(bpf_get_socket_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx)
+{
+ return sock_gen_cookie(ctx->sk);
+}
+
+static const struct bpf_func_proto bpf_get_socket_cookie_sock_addr_proto = {
+ .func = bpf_get_socket_cookie_sock_addr,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+BPF_CALL_1(bpf_get_socket_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx)
+{
+ return sock_gen_cookie(ctx->sk);
+}
+
+static const struct bpf_func_proto bpf_get_socket_cookie_sock_ops_proto = {
+ .func = bpf_get_socket_cookie_sock_ops,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
BPF_CALL_1(bpf_get_socket_uid, struct sk_buff *, skb)
{
struct sock *sk = sk_to_full_sk(skb->sk);
@@ -3857,8 +3933,8 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
break;
- case SO_MAX_PACING_RATE:
- sk->sk_max_pacing_rate = val;
+ case SO_MAX_PACING_RATE: /* 32bit version */
+ sk->sk_max_pacing_rate = (val == ~0U) ? ~0UL : val;
sk->sk_pacing_rate = min(sk->sk_pacing_rate,
sk->sk_max_pacing_rate);
break;
@@ -3955,6 +4031,12 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
tp->snd_ssthresh = val;
}
break;
+ case TCP_SAVE_SYN:
+ if (val < 0 || val > 1)
+ ret = -EINVAL;
+ else
+ tp->save_syn = val;
+ break;
default:
ret = -EINVAL;
}
@@ -3984,17 +4066,29 @@ BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock,
if (!sk_fullsock(sk))
goto err_clear;
-
#ifdef CONFIG_INET
if (level == SOL_TCP && sk->sk_prot->getsockopt == tcp_getsockopt) {
- if (optname == TCP_CONGESTION) {
- struct inet_connection_sock *icsk = inet_csk(sk);
+ struct inet_connection_sock *icsk;
+ struct tcp_sock *tp;
+
+ switch (optname) {
+ case TCP_CONGESTION:
+ icsk = inet_csk(sk);
if (!icsk->icsk_ca_ops || optlen <= 1)
goto err_clear;
strncpy(optval, icsk->icsk_ca_ops->name, optlen);
optval[optlen - 1] = 0;
- } else {
+ break;
+ case TCP_SAVED_SYN:
+ tp = tcp_sk(sk);
+
+ if (optlen <= 0 || !tp->saved_syn ||
+ optlen > tp->saved_syn[0])
+ goto err_clear;
+ memcpy(optval, tp->saved_syn + 1, optlen);
+ break;
+ default:
goto err_clear;
}
} else if (level == SOL_IP) {
@@ -4544,26 +4638,28 @@ BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset,
{
struct seg6_bpf_srh_state *srh_state =
this_cpu_ptr(&seg6_bpf_srh_states);
+ struct ipv6_sr_hdr *srh = srh_state->srh;
void *srh_tlvs, *srh_end, *ptr;
- struct ipv6_sr_hdr *srh;
int srhoff = 0;
- if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
+ if (srh == NULL)
return -EINVAL;
- srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
srh_tlvs = (void *)((char *)srh + ((srh->first_segment + 1) << 4));
srh_end = (void *)((char *)srh + sizeof(*srh) + srh_state->hdrlen);
ptr = skb->data + offset;
if (ptr >= srh_tlvs && ptr + len <= srh_end)
- srh_state->valid = 0;
+ srh_state->valid = false;
else if (ptr < (void *)&srh->flags ||
ptr + len > (void *)&srh->segments)
return -EFAULT;
if (unlikely(bpf_try_make_writable(skb, offset + len)))
return -EFAULT;
+ if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
+ return -EINVAL;
+ srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
memcpy(skb->data + offset, from, len);
return 0;
@@ -4579,52 +4675,78 @@ static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = {
.arg4_type = ARG_CONST_SIZE
};
-BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb,
- u32, action, void *, param, u32, param_len)
+static void bpf_update_srh_state(struct sk_buff *skb)
{
struct seg6_bpf_srh_state *srh_state =
this_cpu_ptr(&seg6_bpf_srh_states);
- struct ipv6_sr_hdr *srh;
int srhoff = 0;
- int err;
-
- if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
- return -EINVAL;
- srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
- if (!srh_state->valid) {
- if (unlikely((srh_state->hdrlen & 7) != 0))
- return -EBADMSG;
-
- srh->hdrlen = (u8)(srh_state->hdrlen >> 3);
- if (unlikely(!seg6_validate_srh(srh, (srh->hdrlen + 1) << 3)))
- return -EBADMSG;
-
- srh_state->valid = 1;
+ if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) {
+ srh_state->srh = NULL;
+ } else {
+ srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
+ srh_state->hdrlen = srh_state->srh->hdrlen << 3;
+ srh_state->valid = true;
}
+}
+
+BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb,
+ u32, action, void *, param, u32, param_len)
+{
+ struct seg6_bpf_srh_state *srh_state =
+ this_cpu_ptr(&seg6_bpf_srh_states);
+ int hdroff = 0;
+ int err;
switch (action) {
case SEG6_LOCAL_ACTION_END_X:
+ if (!seg6_bpf_has_valid_srh(skb))
+ return -EBADMSG;
if (param_len != sizeof(struct in6_addr))
return -EINVAL;
return seg6_lookup_nexthop(skb, (struct in6_addr *)param, 0);
case SEG6_LOCAL_ACTION_END_T:
+ if (!seg6_bpf_has_valid_srh(skb))
+ return -EBADMSG;
if (param_len != sizeof(int))
return -EINVAL;
return seg6_lookup_nexthop(skb, NULL, *(int *)param);
+ case SEG6_LOCAL_ACTION_END_DT6:
+ if (!seg6_bpf_has_valid_srh(skb))
+ return -EBADMSG;
+ if (param_len != sizeof(int))
+ return -EINVAL;
+
+ if (ipv6_find_hdr(skb, &hdroff, IPPROTO_IPV6, NULL, NULL) < 0)
+ return -EBADMSG;
+ if (!pskb_pull(skb, hdroff))
+ return -EBADMSG;
+
+ skb_postpull_rcsum(skb, skb_network_header(skb), hdroff);
+ skb_reset_network_header(skb);
+ skb_reset_transport_header(skb);
+ skb->encapsulation = 0;
+
+ bpf_compute_data_pointers(skb);
+ bpf_update_srh_state(skb);
+ return seg6_lookup_nexthop(skb, NULL, *(int *)param);
case SEG6_LOCAL_ACTION_END_B6:
+ if (srh_state->srh && !seg6_bpf_has_valid_srh(skb))
+ return -EBADMSG;
err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6_INLINE,
param, param_len);
if (!err)
- srh_state->hdrlen =
- ((struct ipv6_sr_hdr *)param)->hdrlen << 3;
+ bpf_update_srh_state(skb);
+
return err;
case SEG6_LOCAL_ACTION_END_B6_ENCAP:
+ if (srh_state->srh && !seg6_bpf_has_valid_srh(skb))
+ return -EBADMSG;
err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6,
param, param_len);
if (!err)
- srh_state->hdrlen =
- ((struct ipv6_sr_hdr *)param)->hdrlen << 3;
+ bpf_update_srh_state(skb);
+
return err;
default:
return -EINVAL;
@@ -4646,15 +4768,14 @@ BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset,
{
struct seg6_bpf_srh_state *srh_state =
this_cpu_ptr(&seg6_bpf_srh_states);
+ struct ipv6_sr_hdr *srh = srh_state->srh;
void *srh_end, *srh_tlvs, *ptr;
- struct ipv6_sr_hdr *srh;
struct ipv6hdr *hdr;
int srhoff = 0;
int ret;
- if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
+ if (unlikely(srh == NULL))
return -EINVAL;
- srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
srh_tlvs = (void *)((unsigned char *)srh + sizeof(*srh) +
((srh->first_segment + 1) << 4));
@@ -4684,8 +4805,11 @@ BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset,
hdr = (struct ipv6hdr *)skb->data;
hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
+ if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
+ return -EINVAL;
+ srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
srh_state->hdrlen += len;
- srh_state->valid = 0;
+ srh_state->valid = false;
return 0;
}
@@ -4699,6 +4823,149 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = {
};
#endif /* CONFIG_IPV6_SEG6_BPF */
+#ifdef CONFIG_INET
+static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
+ struct sk_buff *skb, u8 family, u8 proto)
+{
+ bool refcounted = false;
+ struct sock *sk = NULL;
+ int dif = 0;
+
+ if (skb->dev)
+ dif = skb->dev->ifindex;
+
+ if (family == AF_INET) {
+ __be32 src4 = tuple->ipv4.saddr;
+ __be32 dst4 = tuple->ipv4.daddr;
+ int sdif = inet_sdif(skb);
+
+ if (proto == IPPROTO_TCP)
+ sk = __inet_lookup(net, &tcp_hashinfo, skb, 0,
+ src4, tuple->ipv4.sport,
+ dst4, tuple->ipv4.dport,
+ dif, sdif, &refcounted);
+ else
+ sk = __udp4_lib_lookup(net, src4, tuple->ipv4.sport,
+ dst4, tuple->ipv4.dport,
+ dif, sdif, &udp_table, skb);
+#if IS_ENABLED(CONFIG_IPV6)
+ } else {
+ struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr;
+ struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr;
+ u16 hnum = ntohs(tuple->ipv6.dport);
+ int sdif = inet6_sdif(skb);
+
+ if (proto == IPPROTO_TCP)
+ sk = __inet6_lookup(net, &tcp_hashinfo, skb, 0,
+ src6, tuple->ipv6.sport,
+ dst6, hnum,
+ dif, sdif, &refcounted);
+ else if (likely(ipv6_bpf_stub))
+ sk = ipv6_bpf_stub->udp6_lib_lookup(net,
+ src6, tuple->ipv6.sport,
+ dst6, hnum,
+ dif, sdif,
+ &udp_table, skb);
+#endif
+ }
+
+ if (unlikely(sk && !refcounted && !sock_flag(sk, SOCK_RCU_FREE))) {
+ WARN_ONCE(1, "Found non-RCU, unreferenced socket!");
+ sk = NULL;
+ }
+ return sk;
+}
+
+/* bpf_sk_lookup performs the core lookup for different types of sockets,
+ * taking a reference on the socket if it doesn't have the flag SOCK_RCU_FREE.
+ * Returns the socket as an 'unsigned long' to simplify the casting in the
+ * callers to satisfy BPF_CALL declarations.
+ */
+static unsigned long
+bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
+ u8 proto, u64 netns_id, u64 flags)
+{
+ struct net *caller_net;
+ struct sock *sk = NULL;
+ u8 family = AF_UNSPEC;
+ struct net *net;
+
+ family = len == sizeof(tuple->ipv4) ? AF_INET : AF_INET6;
+ if (unlikely(family == AF_UNSPEC || netns_id > U32_MAX || flags))
+ goto out;
+
+ if (skb->dev)
+ caller_net = dev_net(skb->dev);
+ else
+ caller_net = sock_net(skb->sk);
+ if (netns_id) {
+ net = get_net_ns_by_id(caller_net, netns_id);
+ if (unlikely(!net))
+ goto out;
+ sk = sk_lookup(net, tuple, skb, family, proto);
+ put_net(net);
+ } else {
+ net = caller_net;
+ sk = sk_lookup(net, tuple, skb, family, proto);
+ }
+
+ if (sk)
+ sk = sk_to_full_sk(sk);
+out:
+ return (unsigned long) sk;
+}
+
+BPF_CALL_5(bpf_sk_lookup_tcp, struct sk_buff *, skb,
+ struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
+{
+ return bpf_sk_lookup(skb, tuple, len, IPPROTO_TCP, netns_id, flags);
+}
+
+static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = {
+ .func = bpf_sk_lookup_tcp,
+ .gpl_only = false,
+ .pkt_access = true,
+ .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+};
+
+BPF_CALL_5(bpf_sk_lookup_udp, struct sk_buff *, skb,
+ struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
+{
+ return bpf_sk_lookup(skb, tuple, len, IPPROTO_UDP, netns_id, flags);
+}
+
+static const struct bpf_func_proto bpf_sk_lookup_udp_proto = {
+ .func = bpf_sk_lookup_udp,
+ .gpl_only = false,
+ .pkt_access = true,
+ .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+};
+
+BPF_CALL_1(bpf_sk_release, struct sock *, sk)
+{
+ if (!sock_flag(sk, SOCK_RCU_FREE))
+ sock_gen_put(sk);
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_sk_release_proto = {
+ .func = bpf_sk_release,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_SOCKET,
+};
+#endif /* CONFIG_INET */
+
bool bpf_helper_changes_pkt_data(void *func)
{
if (func == bpf_skb_vlan_push ||
@@ -4718,6 +4985,7 @@ bool bpf_helper_changes_pkt_data(void *func)
func == bpf_xdp_adjust_head ||
func == bpf_xdp_adjust_meta ||
func == bpf_msg_pull_data ||
+ func == bpf_msg_push_data ||
func == bpf_xdp_adjust_tail ||
#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
func == bpf_lwt_seg6_store_bytes ||
@@ -4740,6 +5008,12 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_map_update_elem_proto;
case BPF_FUNC_map_delete_elem:
return &bpf_map_delete_elem_proto;
+ case BPF_FUNC_map_push_elem:
+ return &bpf_map_push_elem_proto;
+ case BPF_FUNC_map_pop_elem:
+ return &bpf_map_pop_elem_proto;
+ case BPF_FUNC_map_peek_elem:
+ return &bpf_map_peek_elem_proto;
case BPF_FUNC_get_prandom_u32:
return &bpf_get_prandom_u32_proto;
case BPF_FUNC_get_smp_processor_id:
@@ -4753,6 +5027,7 @@ bpf_base_func_proto(enum bpf_func_id func_id)
case BPF_FUNC_trace_printk:
if (capable(CAP_SYS_ADMIN))
return bpf_get_trace_printk_proto();
+ /* else: fall through */
default:
return NULL;
}
@@ -4767,6 +5042,8 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
*/
case BPF_FUNC_get_current_uid_gid:
return &bpf_get_current_uid_gid_proto;
+ case BPF_FUNC_get_local_storage:
+ return &bpf_get_local_storage_proto;
default:
return bpf_base_func_proto(func_id);
}
@@ -4789,6 +5066,10 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
default:
return NULL;
}
+ case BPF_FUNC_get_socket_cookie:
+ return &bpf_get_socket_cookie_sock_addr_proto;
+ case BPF_FUNC_get_local_storage:
+ return &bpf_get_local_storage_proto;
default:
return bpf_base_func_proto(func_id);
}
@@ -4812,6 +5093,17 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
}
static const struct bpf_func_proto *
+cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_get_local_storage:
+ return &bpf_get_local_storage_proto;
+ default:
+ return sk_filter_func_proto(func_id, prog);
+ }
+}
+
+static const struct bpf_func_proto *
tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
@@ -4884,6 +5176,16 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
#ifdef CONFIG_SOCK_CGROUP_DATA
case BPF_FUNC_skb_cgroup_id:
return &bpf_skb_cgroup_id_proto;
+ case BPF_FUNC_skb_ancestor_cgroup_id:
+ return &bpf_skb_ancestor_cgroup_id_proto;
+#endif
+#ifdef CONFIG_INET
+ case BPF_FUNC_sk_lookup_tcp:
+ return &bpf_sk_lookup_tcp_proto;
+ case BPF_FUNC_sk_lookup_udp:
+ return &bpf_sk_lookup_udp_proto;
+ case BPF_FUNC_sk_release:
+ return &bpf_sk_release_proto;
#endif
default:
return bpf_base_func_proto(func_id);
@@ -4917,6 +5219,9 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
}
}
+const struct bpf_func_proto bpf_sock_map_update_proto __weak;
+const struct bpf_func_proto bpf_sock_hash_update_proto __weak;
+
static const struct bpf_func_proto *
sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
@@ -4931,11 +5236,18 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_sock_map_update_proto;
case BPF_FUNC_sock_hash_update:
return &bpf_sock_hash_update_proto;
+ case BPF_FUNC_get_socket_cookie:
+ return &bpf_get_socket_cookie_sock_ops_proto;
+ case BPF_FUNC_get_local_storage:
+ return &bpf_get_local_storage_proto;
default:
return bpf_base_func_proto(func_id);
}
}
+const struct bpf_func_proto bpf_msg_redirect_map_proto __weak;
+const struct bpf_func_proto bpf_msg_redirect_hash_proto __weak;
+
static const struct bpf_func_proto *
sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
@@ -4950,11 +5262,18 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_msg_cork_bytes_proto;
case BPF_FUNC_msg_pull_data:
return &bpf_msg_pull_data_proto;
+ case BPF_FUNC_msg_push_data:
+ return &bpf_msg_push_data_proto;
+ case BPF_FUNC_get_local_storage:
+ return &bpf_get_local_storage_proto;
default:
return bpf_base_func_proto(func_id);
}
}
+const struct bpf_func_proto bpf_sk_redirect_map_proto __weak;
+const struct bpf_func_proto bpf_sk_redirect_hash_proto __weak;
+
static const struct bpf_func_proto *
sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
@@ -4977,6 +5296,27 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_sk_redirect_map_proto;
case BPF_FUNC_sk_redirect_hash:
return &bpf_sk_redirect_hash_proto;
+ case BPF_FUNC_get_local_storage:
+ return &bpf_get_local_storage_proto;
+#ifdef CONFIG_INET
+ case BPF_FUNC_sk_lookup_tcp:
+ return &bpf_sk_lookup_tcp_proto;
+ case BPF_FUNC_sk_lookup_udp:
+ return &bpf_sk_lookup_udp_proto;
+ case BPF_FUNC_sk_release:
+ return &bpf_sk_release_proto;
+#endif
+ default:
+ return bpf_base_func_proto(func_id);
+ }
+}
+
+static const struct bpf_func_proto *
+flow_dissector_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_skb_load_bytes:
+ return &bpf_skb_load_bytes_proto;
default:
return bpf_base_func_proto(func_id);
}
@@ -5100,6 +5440,10 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type
if (size != size_default)
return false;
break;
+ case bpf_ctx_range(struct __sk_buff, flow_keys):
+ if (size != sizeof(struct bpf_flow_keys *))
+ return false;
+ break;
default:
/* Only narrow read access allowed for now. */
if (type == BPF_WRITE) {
@@ -5125,6 +5469,7 @@ static bool sk_filter_is_valid_access(int off, int size,
case bpf_ctx_range(struct __sk_buff, data):
case bpf_ctx_range(struct __sk_buff, data_meta):
case bpf_ctx_range(struct __sk_buff, data_end):
+ case bpf_ctx_range(struct __sk_buff, flow_keys):
case bpf_ctx_range_till(struct __sk_buff, family, local_port):
return false;
}
@@ -5141,6 +5486,40 @@ static bool sk_filter_is_valid_access(int off, int size,
return bpf_skb_is_valid_access(off, size, type, prog, info);
}
+static bool cg_skb_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ switch (off) {
+ case bpf_ctx_range(struct __sk_buff, tc_classid):
+ case bpf_ctx_range(struct __sk_buff, data_meta):
+ case bpf_ctx_range(struct __sk_buff, flow_keys):
+ return false;
+ }
+ if (type == BPF_WRITE) {
+ switch (off) {
+ case bpf_ctx_range(struct __sk_buff, mark):
+ case bpf_ctx_range(struct __sk_buff, priority):
+ case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
+ break;
+ default:
+ return false;
+ }
+ }
+
+ switch (off) {
+ case bpf_ctx_range(struct __sk_buff, data):
+ info->reg_type = PTR_TO_PACKET;
+ break;
+ case bpf_ctx_range(struct __sk_buff, data_end):
+ info->reg_type = PTR_TO_PACKET_END;
+ break;
+ }
+
+ return bpf_skb_is_valid_access(off, size, type, prog, info);
+}
+
static bool lwt_is_valid_access(int off, int size,
enum bpf_access_type type,
const struct bpf_prog *prog,
@@ -5150,6 +5529,7 @@ static bool lwt_is_valid_access(int off, int size,
case bpf_ctx_range(struct __sk_buff, tc_classid):
case bpf_ctx_range_till(struct __sk_buff, family, local_port):
case bpf_ctx_range(struct __sk_buff, data_meta):
+ case bpf_ctx_range(struct __sk_buff, flow_keys):
return false;
}
@@ -5235,23 +5615,29 @@ static bool __sock_filter_check_size(int off, int size,
return size == size_default;
}
-static bool sock_filter_is_valid_access(int off, int size,
- enum bpf_access_type type,
- const struct bpf_prog *prog,
- struct bpf_insn_access_aux *info)
+bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type,
+ struct bpf_insn_access_aux *info)
{
if (off < 0 || off >= sizeof(struct bpf_sock))
return false;
if (off % size != 0)
return false;
- if (!__sock_filter_check_attach_type(off, type,
- prog->expected_attach_type))
- return false;
if (!__sock_filter_check_size(off, size, info))
return false;
return true;
}
+static bool sock_filter_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ if (!bpf_sock_is_valid_access(off, size, type, info))
+ return false;
+ return __sock_filter_check_attach_type(off, type,
+ prog->expected_attach_type);
+}
+
static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write,
const struct bpf_prog *prog, int drop_verdict)
{
@@ -5360,6 +5746,7 @@ static bool tc_cls_act_is_valid_access(int off, int size,
case bpf_ctx_range(struct __sk_buff, data_end):
info->reg_type = PTR_TO_PACKET_END;
break;
+ case bpf_ctx_range(struct __sk_buff, flow_keys):
case bpf_ctx_range_till(struct __sk_buff, family, local_port):
return false;
}
@@ -5561,6 +5948,7 @@ static bool sk_skb_is_valid_access(int off, int size,
switch (off) {
case bpf_ctx_range(struct __sk_buff, tc_classid):
case bpf_ctx_range(struct __sk_buff, data_meta):
+ case bpf_ctx_range(struct __sk_buff, flow_keys):
return false;
}
@@ -5620,6 +6008,39 @@ static bool sk_msg_is_valid_access(int off, int size,
return true;
}
+static bool flow_dissector_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ if (type == BPF_WRITE) {
+ switch (off) {
+ case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
+ break;
+ default:
+ return false;
+ }
+ }
+
+ switch (off) {
+ case bpf_ctx_range(struct __sk_buff, data):
+ info->reg_type = PTR_TO_PACKET;
+ break;
+ case bpf_ctx_range(struct __sk_buff, data_end):
+ info->reg_type = PTR_TO_PACKET_END;
+ break;
+ case bpf_ctx_range(struct __sk_buff, flow_keys):
+ info->reg_type = PTR_TO_FLOW_KEYS;
+ break;
+ case bpf_ctx_range(struct __sk_buff, tc_classid):
+ case bpf_ctx_range(struct __sk_buff, data_meta):
+ case bpf_ctx_range_till(struct __sk_buff, family, local_port):
+ return false;
+ }
+
+ return bpf_skb_is_valid_access(off, size, type, prog, info);
+}
+
static u32 bpf_convert_ctx_access(enum bpf_access_type type,
const struct bpf_insn *si,
struct bpf_insn *insn_buf,
@@ -5914,15 +6335,24 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
bpf_target_off(struct sock_common,
skc_num, 2, target_size));
break;
+
+ case offsetof(struct __sk_buff, flow_keys):
+ off = si->off;
+ off -= offsetof(struct __sk_buff, flow_keys);
+ off += offsetof(struct sk_buff, cb);
+ off += offsetof(struct qdisc_skb_cb, flow_keys);
+ *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
+ si->src_reg, off);
+ break;
}
return insn - insn_buf;
}
-static u32 sock_filter_convert_ctx_access(enum bpf_access_type type,
- const struct bpf_insn *si,
- struct bpf_insn *insn_buf,
- struct bpf_prog *prog, u32 *target_size)
+u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog, u32 *target_size)
{
struct bpf_insn *insn = insn_buf;
int off;
@@ -6632,22 +7062,22 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
switch (si->off) {
case offsetof(struct sk_msg_md, data):
- *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_buff, data),
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data),
si->dst_reg, si->src_reg,
- offsetof(struct sk_msg_buff, data));
+ offsetof(struct sk_msg, data));
break;
case offsetof(struct sk_msg_md, data_end):
- *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_buff, data_end),
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data_end),
si->dst_reg, si->src_reg,
- offsetof(struct sk_msg_buff, data_end));
+ offsetof(struct sk_msg, data_end));
break;
case offsetof(struct sk_msg_md, family):
BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
- struct sk_msg_buff, sk),
+ struct sk_msg, sk),
si->dst_reg, si->src_reg,
- offsetof(struct sk_msg_buff, sk));
+ offsetof(struct sk_msg, sk));
*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
offsetof(struct sock_common, skc_family));
break;
@@ -6656,9 +7086,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
- struct sk_msg_buff, sk),
+ struct sk_msg, sk),
si->dst_reg, si->src_reg,
- offsetof(struct sk_msg_buff, sk));
+ offsetof(struct sk_msg, sk));
*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
offsetof(struct sock_common, skc_daddr));
break;
@@ -6668,9 +7098,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
skc_rcv_saddr) != 4);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
- struct sk_msg_buff, sk),
+ struct sk_msg, sk),
si->dst_reg, si->src_reg,
- offsetof(struct sk_msg_buff, sk));
+ offsetof(struct sk_msg, sk));
*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
offsetof(struct sock_common,
skc_rcv_saddr));
@@ -6685,9 +7115,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
off = si->off;
off -= offsetof(struct sk_msg_md, remote_ip6[0]);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
- struct sk_msg_buff, sk),
+ struct sk_msg, sk),
si->dst_reg, si->src_reg,
- offsetof(struct sk_msg_buff, sk));
+ offsetof(struct sk_msg, sk));
*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
offsetof(struct sock_common,
skc_v6_daddr.s6_addr32[0]) +
@@ -6706,9 +7136,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
off = si->off;
off -= offsetof(struct sk_msg_md, local_ip6[0]);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
- struct sk_msg_buff, sk),
+ struct sk_msg, sk),
si->dst_reg, si->src_reg,
- offsetof(struct sk_msg_buff, sk));
+ offsetof(struct sk_msg, sk));
*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
offsetof(struct sock_common,
skc_v6_rcv_saddr.s6_addr32[0]) +
@@ -6722,9 +7152,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
- struct sk_msg_buff, sk),
+ struct sk_msg, sk),
si->dst_reg, si->src_reg,
- offsetof(struct sk_msg_buff, sk));
+ offsetof(struct sk_msg, sk));
*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
offsetof(struct sock_common, skc_dport));
#ifndef __BIG_ENDIAN_BITFIELD
@@ -6736,9 +7166,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
- struct sk_msg_buff, sk),
+ struct sk_msg, sk),
si->dst_reg, si->src_reg,
- offsetof(struct sk_msg_buff, sk));
+ offsetof(struct sk_msg, sk));
*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
offsetof(struct sock_common, skc_num));
break;
@@ -6781,8 +7211,8 @@ const struct bpf_prog_ops xdp_prog_ops = {
};
const struct bpf_verifier_ops cg_skb_verifier_ops = {
- .get_func_proto = sk_filter_func_proto,
- .is_valid_access = sk_filter_is_valid_access,
+ .get_func_proto = cg_skb_func_proto,
+ .is_valid_access = cg_skb_is_valid_access,
.convert_ctx_access = bpf_convert_ctx_access,
};
@@ -6834,7 +7264,7 @@ const struct bpf_prog_ops lwt_seg6local_prog_ops = {
const struct bpf_verifier_ops cg_sock_verifier_ops = {
.get_func_proto = sock_filter_func_proto,
.is_valid_access = sock_filter_is_valid_access,
- .convert_ctx_access = sock_filter_convert_ctx_access,
+ .convert_ctx_access = bpf_sock_convert_ctx_access,
};
const struct bpf_prog_ops cg_sock_prog_ops = {
@@ -6877,6 +7307,15 @@ const struct bpf_verifier_ops sk_msg_verifier_ops = {
const struct bpf_prog_ops sk_msg_prog_ops = {
};
+const struct bpf_verifier_ops flow_dissector_verifier_ops = {
+ .get_func_proto = flow_dissector_func_proto,
+ .is_valid_access = flow_dissector_is_valid_access,
+ .convert_ctx_access = bpf_convert_ctx_access,
+};
+
+const struct bpf_prog_ops flow_dissector_prog_ops = {
+};
+
int sk_detach_filter(struct sock *sk)
{
int ret = -ENOENT;
@@ -6940,3 +7379,271 @@ out:
release_sock(sk);
return ret;
}
+
+#ifdef CONFIG_INET
+struct sk_reuseport_kern {
+ struct sk_buff *skb;
+ struct sock *sk;
+ struct sock *selected_sk;
+ void *data_end;
+ u32 hash;
+ u32 reuseport_id;
+ bool bind_inany;
+};
+
+static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern,
+ struct sock_reuseport *reuse,
+ struct sock *sk, struct sk_buff *skb,
+ u32 hash)
+{
+ reuse_kern->skb = skb;
+ reuse_kern->sk = sk;
+ reuse_kern->selected_sk = NULL;
+ reuse_kern->data_end = skb->data + skb_headlen(skb);
+ reuse_kern->hash = hash;
+ reuse_kern->reuseport_id = reuse->reuseport_id;
+ reuse_kern->bind_inany = reuse->bind_inany;
+}
+
+struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
+ struct bpf_prog *prog, struct sk_buff *skb,
+ u32 hash)
+{
+ struct sk_reuseport_kern reuse_kern;
+ enum sk_action action;
+
+ bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, hash);
+ action = BPF_PROG_RUN(prog, &reuse_kern);
+
+ if (action == SK_PASS)
+ return reuse_kern.selected_sk;
+ else
+ return ERR_PTR(-ECONNREFUSED);
+}
+
+BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern,
+ struct bpf_map *, map, void *, key, u32, flags)
+{
+ struct sock_reuseport *reuse;
+ struct sock *selected_sk;
+
+ selected_sk = map->ops->map_lookup_elem(map, key);
+ if (!selected_sk)
+ return -ENOENT;
+
+ reuse = rcu_dereference(selected_sk->sk_reuseport_cb);
+ if (!reuse)
+ /* selected_sk is unhashed (e.g. by close()) after the
+ * above map_lookup_elem(). Treat selected_sk has already
+ * been removed from the map.
+ */
+ return -ENOENT;
+
+ if (unlikely(reuse->reuseport_id != reuse_kern->reuseport_id)) {
+ struct sock *sk;
+
+ if (unlikely(!reuse_kern->reuseport_id))
+ /* There is a small race between adding the
+ * sk to the map and setting the
+ * reuse_kern->reuseport_id.
+ * Treat it as the sk has not been added to
+ * the bpf map yet.
+ */
+ return -ENOENT;
+
+ sk = reuse_kern->sk;
+ if (sk->sk_protocol != selected_sk->sk_protocol)
+ return -EPROTOTYPE;
+ else if (sk->sk_family != selected_sk->sk_family)
+ return -EAFNOSUPPORT;
+
+ /* Catch all. Likely bound to a different sockaddr. */
+ return -EBADFD;
+ }
+
+ reuse_kern->selected_sk = selected_sk;
+
+ return 0;
+}
+
+static const struct bpf_func_proto sk_select_reuseport_proto = {
+ .func = sk_select_reuseport,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_PTR_TO_MAP_KEY,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_4(sk_reuseport_load_bytes,
+ const struct sk_reuseport_kern *, reuse_kern, u32, offset,
+ void *, to, u32, len)
+{
+ return ____bpf_skb_load_bytes(reuse_kern->skb, offset, to, len);
+}
+
+static const struct bpf_func_proto sk_reuseport_load_bytes_proto = {
+ .func = sk_reuseport_load_bytes,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg4_type = ARG_CONST_SIZE,
+};
+
+BPF_CALL_5(sk_reuseport_load_bytes_relative,
+ const struct sk_reuseport_kern *, reuse_kern, u32, offset,
+ void *, to, u32, len, u32, start_header)
+{
+ return ____bpf_skb_load_bytes_relative(reuse_kern->skb, offset, to,
+ len, start_header);
+}
+
+static const struct bpf_func_proto sk_reuseport_load_bytes_relative_proto = {
+ .func = sk_reuseport_load_bytes_relative,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg4_type = ARG_CONST_SIZE,
+ .arg5_type = ARG_ANYTHING,
+};
+
+static const struct bpf_func_proto *
+sk_reuseport_func_proto(enum bpf_func_id func_id,
+ const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_sk_select_reuseport:
+ return &sk_select_reuseport_proto;
+ case BPF_FUNC_skb_load_bytes:
+ return &sk_reuseport_load_bytes_proto;
+ case BPF_FUNC_skb_load_bytes_relative:
+ return &sk_reuseport_load_bytes_relative_proto;
+ default:
+ return bpf_base_func_proto(func_id);
+ }
+}
+
+static bool
+sk_reuseport_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ const u32 size_default = sizeof(__u32);
+
+ if (off < 0 || off >= sizeof(struct sk_reuseport_md) ||
+ off % size || type != BPF_READ)
+ return false;
+
+ switch (off) {
+ case offsetof(struct sk_reuseport_md, data):
+ info->reg_type = PTR_TO_PACKET;
+ return size == sizeof(__u64);
+
+ case offsetof(struct sk_reuseport_md, data_end):
+ info->reg_type = PTR_TO_PACKET_END;
+ return size == sizeof(__u64);
+
+ case offsetof(struct sk_reuseport_md, hash):
+ return size == size_default;
+
+ /* Fields that allow narrowing */
+ case offsetof(struct sk_reuseport_md, eth_protocol):
+ if (size < FIELD_SIZEOF(struct sk_buff, protocol))
+ return false;
+ /* fall through */
+ case offsetof(struct sk_reuseport_md, ip_protocol):
+ case offsetof(struct sk_reuseport_md, bind_inany):
+ case offsetof(struct sk_reuseport_md, len):
+ bpf_ctx_record_field_size(info, size_default);
+ return bpf_ctx_narrow_access_ok(off, size, size_default);
+
+ default:
+ return false;
+ }
+}
+
+#define SK_REUSEPORT_LOAD_FIELD(F) ({ \
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_reuseport_kern, F), \
+ si->dst_reg, si->src_reg, \
+ bpf_target_off(struct sk_reuseport_kern, F, \
+ FIELD_SIZEOF(struct sk_reuseport_kern, F), \
+ target_size)); \
+ })
+
+#define SK_REUSEPORT_LOAD_SKB_FIELD(SKB_FIELD) \
+ SOCK_ADDR_LOAD_NESTED_FIELD(struct sk_reuseport_kern, \
+ struct sk_buff, \
+ skb, \
+ SKB_FIELD)
+
+#define SK_REUSEPORT_LOAD_SK_FIELD_SIZE_OFF(SK_FIELD, BPF_SIZE, EXTRA_OFF) \
+ SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(struct sk_reuseport_kern, \
+ struct sock, \
+ sk, \
+ SK_FIELD, BPF_SIZE, EXTRA_OFF)
+
+static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog,
+ u32 *target_size)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ switch (si->off) {
+ case offsetof(struct sk_reuseport_md, data):
+ SK_REUSEPORT_LOAD_SKB_FIELD(data);
+ break;
+
+ case offsetof(struct sk_reuseport_md, len):
+ SK_REUSEPORT_LOAD_SKB_FIELD(len);
+ break;
+
+ case offsetof(struct sk_reuseport_md, eth_protocol):
+ SK_REUSEPORT_LOAD_SKB_FIELD(protocol);
+ break;
+
+ case offsetof(struct sk_reuseport_md, ip_protocol):
+ BUILD_BUG_ON(HWEIGHT32(SK_FL_PROTO_MASK) != BITS_PER_BYTE);
+ SK_REUSEPORT_LOAD_SK_FIELD_SIZE_OFF(__sk_flags_offset,
+ BPF_W, 0);
+ *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK);
+ *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg,
+ SK_FL_PROTO_SHIFT);
+ /* SK_FL_PROTO_MASK and SK_FL_PROTO_SHIFT are endian
+ * aware. No further narrowing or masking is needed.
+ */
+ *target_size = 1;
+ break;
+
+ case offsetof(struct sk_reuseport_md, data_end):
+ SK_REUSEPORT_LOAD_FIELD(data_end);
+ break;
+
+ case offsetof(struct sk_reuseport_md, hash):
+ SK_REUSEPORT_LOAD_FIELD(hash);
+ break;
+
+ case offsetof(struct sk_reuseport_md, bind_inany):
+ SK_REUSEPORT_LOAD_FIELD(bind_inany);
+ break;
+ }
+
+ return insn - insn_buf;
+}
+
+const struct bpf_verifier_ops sk_reuseport_verifier_ops = {
+ .get_func_proto = sk_reuseport_func_proto,
+ .is_valid_access = sk_reuseport_is_valid_access,
+ .convert_ctx_access = sk_reuseport_convert_ctx_access,
+};
+
+const struct bpf_prog_ops sk_reuseport_prog_ops = {
+};
+#endif /* CONFIG_INET */
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 53f96e4f7bf5..676f3ad629f9 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -25,6 +25,9 @@
#include <net/flow_dissector.h>
#include <scsi/fc/fc_fcoe.h>
#include <uapi/linux/batadv_packet.h>
+#include <linux/bpf.h>
+
+static DEFINE_MUTEX(flow_dissector_mutex);
static void dissector_set_key(struct flow_dissector *flow_dissector,
enum flow_dissector_key_id key_id)
@@ -62,6 +65,44 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
}
EXPORT_SYMBOL(skb_flow_dissector_init);
+int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr,
+ struct bpf_prog *prog)
+{
+ struct bpf_prog *attached;
+ struct net *net;
+
+ net = current->nsproxy->net_ns;
+ mutex_lock(&flow_dissector_mutex);
+ attached = rcu_dereference_protected(net->flow_dissector_prog,
+ lockdep_is_held(&flow_dissector_mutex));
+ if (attached) {
+ /* Only one BPF program can be attached at a time */
+ mutex_unlock(&flow_dissector_mutex);
+ return -EEXIST;
+ }
+ rcu_assign_pointer(net->flow_dissector_prog, prog);
+ mutex_unlock(&flow_dissector_mutex);
+ return 0;
+}
+
+int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr)
+{
+ struct bpf_prog *attached;
+ struct net *net;
+
+ net = current->nsproxy->net_ns;
+ mutex_lock(&flow_dissector_mutex);
+ attached = rcu_dereference_protected(net->flow_dissector_prog,
+ lockdep_is_held(&flow_dissector_mutex));
+ if (!attached) {
+ mutex_unlock(&flow_dissector_mutex);
+ return -ENOENT;
+ }
+ bpf_prog_put(attached);
+ RCU_INIT_POINTER(net->flow_dissector_prog, NULL);
+ mutex_unlock(&flow_dissector_mutex);
+ return 0;
+}
/**
* skb_flow_get_be16 - extract be16 entity
* @skb: sk_buff to extract from
@@ -152,7 +193,11 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
!dissector_uses_key(flow_dissector,
FLOW_DISSECTOR_KEY_ENC_CONTROL) &&
!dissector_uses_key(flow_dissector,
- FLOW_DISSECTOR_KEY_ENC_PORTS))
+ FLOW_DISSECTOR_KEY_ENC_PORTS) &&
+ !dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_ENC_IP) &&
+ !dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_ENC_OPTS))
return;
info = skb_tunnel_info(skb);
@@ -212,6 +257,31 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
tp->src = key->tp_src;
tp->dst = key->tp_dst;
}
+
+ if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IP)) {
+ struct flow_dissector_key_ip *ip;
+
+ ip = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_ENC_IP,
+ target_container);
+ ip->tos = key->tos;
+ ip->ttl = key->ttl;
+ }
+
+ if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_OPTS)) {
+ struct flow_dissector_key_enc_opts *enc_opt;
+
+ enc_opt = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_ENC_OPTS,
+ target_container);
+
+ if (info->options_len) {
+ enc_opt->len = info->options_len;
+ ip_tunnel_info_opts_get(enc_opt->data, info);
+ enc_opt->dst_opt_type = info->key.tun_flags &
+ TUNNEL_OPTIONS_PRESENT;
+ }
+ }
}
EXPORT_SYMBOL(skb_flow_dissect_tunnel_info);
@@ -353,8 +423,8 @@ __skb_flow_dissect_gre(const struct sk_buff *skb,
offset += sizeof(struct gre_base_hdr);
if (hdr->flags & GRE_CSUM)
- offset += sizeof(((struct gre_full_hdr *) 0)->csum) +
- sizeof(((struct gre_full_hdr *) 0)->reserved1);
+ offset += FIELD_SIZEOF(struct gre_full_hdr, csum) +
+ FIELD_SIZEOF(struct gre_full_hdr, reserved1);
if (hdr->flags & GRE_KEY) {
const __be32 *keyid;
@@ -376,11 +446,11 @@ __skb_flow_dissect_gre(const struct sk_buff *skb,
else
key_keyid->keyid = *keyid & GRE_PPTP_KEY_MASK;
}
- offset += sizeof(((struct gre_full_hdr *) 0)->key);
+ offset += FIELD_SIZEOF(struct gre_full_hdr, key);
}
if (hdr->flags & GRE_SEQ)
- offset += sizeof(((struct pptp_gre_header *) 0)->seq);
+ offset += FIELD_SIZEOF(struct pptp_gre_header, seq);
if (gre_ver == 0) {
if (*p_proto == htons(ETH_P_TEB)) {
@@ -407,7 +477,7 @@ __skb_flow_dissect_gre(const struct sk_buff *skb,
u8 *ppp_hdr;
if (hdr->flags & GRE_ACK)
- offset += sizeof(((struct pptp_gre_header *) 0)->ack);
+ offset += FIELD_SIZEOF(struct pptp_gre_header, ack);
ppp_hdr = __skb_header_pointer(skb, *p_nhoff + offset,
sizeof(_ppp_hdr),
@@ -559,6 +629,60 @@ static bool skb_flow_dissect_allowed(int *num_hdrs)
return (*num_hdrs <= MAX_FLOW_DISSECT_HDRS);
}
+static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys,
+ struct flow_dissector *flow_dissector,
+ void *target_container)
+{
+ struct flow_dissector_key_control *key_control;
+ struct flow_dissector_key_basic *key_basic;
+ struct flow_dissector_key_addrs *key_addrs;
+ struct flow_dissector_key_ports *key_ports;
+
+ key_control = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_CONTROL,
+ target_container);
+ key_control->thoff = flow_keys->thoff;
+ if (flow_keys->is_frag)
+ key_control->flags |= FLOW_DIS_IS_FRAGMENT;
+ if (flow_keys->is_first_frag)
+ key_control->flags |= FLOW_DIS_FIRST_FRAG;
+ if (flow_keys->is_encap)
+ key_control->flags |= FLOW_DIS_ENCAPSULATION;
+
+ key_basic = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_BASIC,
+ target_container);
+ key_basic->n_proto = flow_keys->n_proto;
+ key_basic->ip_proto = flow_keys->ip_proto;
+
+ if (flow_keys->addr_proto == ETH_P_IP &&
+ dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV4_ADDRS)) {
+ key_addrs = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_IPV4_ADDRS,
+ target_container);
+ key_addrs->v4addrs.src = flow_keys->ipv4_src;
+ key_addrs->v4addrs.dst = flow_keys->ipv4_dst;
+ key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ } else if (flow_keys->addr_proto == ETH_P_IPV6 &&
+ dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_IPV6_ADDRS)) {
+ key_addrs = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_IPV6_ADDRS,
+ target_container);
+ memcpy(&key_addrs->v6addrs, &flow_keys->ipv6_src,
+ sizeof(key_addrs->v6addrs));
+ key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ }
+
+ if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) {
+ key_ports = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_PORTS,
+ target_container);
+ key_ports->src = flow_keys->sport;
+ key_ports->dst = flow_keys->dport;
+ }
+}
+
/**
* __skb_flow_dissect - extract the flow_keys struct and return it
* @skb: sk_buff to extract the flow from, can be NULL if the rest are specified
@@ -589,7 +713,8 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
struct flow_dissector_key_tags *key_tags;
struct flow_dissector_key_vlan *key_vlan;
enum flow_dissect_ret fdret;
- bool skip_vlan = false;
+ enum flow_dissector_key_id dissector_vlan = FLOW_DISSECTOR_KEY_MAX;
+ struct bpf_prog *attached = NULL;
int num_hdrs = 0;
u8 ip_proto = 0;
bool ret;
@@ -629,6 +754,50 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
FLOW_DISSECTOR_KEY_BASIC,
target_container);
+ rcu_read_lock();
+ if (skb) {
+ if (skb->dev)
+ attached = rcu_dereference(dev_net(skb->dev)->flow_dissector_prog);
+ else if (skb->sk)
+ attached = rcu_dereference(sock_net(skb->sk)->flow_dissector_prog);
+ else
+ WARN_ON_ONCE(1);
+ }
+ if (attached) {
+ /* Note that even though the const qualifier is discarded
+ * throughout the execution of the BPF program, all changes(the
+ * control block) are reverted after the BPF program returns.
+ * Therefore, __skb_flow_dissect does not alter the skb.
+ */
+ struct bpf_flow_keys flow_keys = {};
+ struct bpf_skb_data_end cb_saved;
+ struct bpf_skb_data_end *cb;
+ u32 result;
+
+ cb = (struct bpf_skb_data_end *)skb->cb;
+
+ /* Save Control Block */
+ memcpy(&cb_saved, cb, sizeof(cb_saved));
+ memset(cb, 0, sizeof(cb_saved));
+
+ /* Pass parameters to the BPF program */
+ cb->qdisc_cb.flow_keys = &flow_keys;
+ flow_keys.nhoff = nhoff;
+
+ bpf_compute_data_pointers((struct sk_buff *)skb);
+ result = BPF_PROG_RUN(attached, skb);
+
+ /* Restore state */
+ memcpy(cb, &cb_saved, sizeof(cb_saved));
+
+ __skb_flow_bpf_to_target(&flow_keys, flow_dissector,
+ target_container);
+ key_control->thoff = min_t(u16, key_control->thoff, skb->len);
+ rcu_read_unlock();
+ return result == BPF_OK;
+ }
+ rcu_read_unlock();
+
if (dissector_uses_key(flow_dissector,
FLOW_DISSECTOR_KEY_ETH_ADDRS)) {
struct ethhdr *eth = eth_hdr(skb);
@@ -748,14 +917,14 @@ proto_again:
}
case htons(ETH_P_8021AD):
case htons(ETH_P_8021Q): {
- const struct vlan_hdr *vlan;
+ const struct vlan_hdr *vlan = NULL;
struct vlan_hdr _vlan;
- bool vlan_tag_present = skb && skb_vlan_tag_present(skb);
+ __be16 saved_vlan_tpid = proto;
- if (vlan_tag_present)
+ if (dissector_vlan == FLOW_DISSECTOR_KEY_MAX &&
+ skb && skb_vlan_tag_present(skb)) {
proto = skb->protocol;
-
- if (!vlan_tag_present || eth_type_vlan(skb->protocol)) {
+ } else {
vlan = __skb_header_pointer(skb, nhoff, sizeof(_vlan),
data, hlen, &_vlan);
if (!vlan) {
@@ -765,20 +934,23 @@ proto_again:
proto = vlan->h_vlan_encapsulated_proto;
nhoff += sizeof(*vlan);
- if (skip_vlan) {
- fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
- break;
- }
}
- skip_vlan = true;
- if (dissector_uses_key(flow_dissector,
- FLOW_DISSECTOR_KEY_VLAN)) {
+ if (dissector_vlan == FLOW_DISSECTOR_KEY_MAX) {
+ dissector_vlan = FLOW_DISSECTOR_KEY_VLAN;
+ } else if (dissector_vlan == FLOW_DISSECTOR_KEY_VLAN) {
+ dissector_vlan = FLOW_DISSECTOR_KEY_CVLAN;
+ } else {
+ fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
+ break;
+ }
+
+ if (dissector_uses_key(flow_dissector, dissector_vlan)) {
key_vlan = skb_flow_dissector_target(flow_dissector,
- FLOW_DISSECTOR_KEY_VLAN,
+ dissector_vlan,
target_container);
- if (vlan_tag_present) {
+ if (!vlan) {
key_vlan->vlan_id = skb_vlan_tag_get_id(skb);
key_vlan->vlan_priority =
(skb_vlan_tag_get_prio(skb) >> VLAN_PRIO_SHIFT);
@@ -789,6 +961,7 @@ proto_again:
(ntohs(vlan->h_vlan_TCI) &
VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT;
}
+ key_vlan->vlan_tpid = saved_vlan_tpid;
}
fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index 98fd12721221..e4e442d70c2d 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -112,7 +112,7 @@ static void est_timer(struct timer_list *t)
* @bstats: basic statistics
* @cpu_bstats: bstats per cpu
* @rate_est: rate estimator statistics
- * @stats_lock: statistics lock
+ * @lock: lock for statistics and control path
* @running: qdisc running seqcount
* @opt: rate estimator configuration TLV
*
@@ -128,7 +128,7 @@ static void est_timer(struct timer_list *t)
int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
struct gnet_stats_basic_cpu __percpu *cpu_bstats,
struct net_rate_estimator __rcu **rate_est,
- spinlock_t *stats_lock,
+ spinlock_t *lock,
seqcount_t *running,
struct nlattr *opt)
{
@@ -154,19 +154,22 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
seqcount_init(&est->seq);
intvl_log = parm->interval + 2;
est->bstats = bstats;
- est->stats_lock = stats_lock;
+ est->stats_lock = lock;
est->running = running;
est->ewma_log = parm->ewma_log;
est->intvl_log = intvl_log;
est->cpu_bstats = cpu_bstats;
- if (stats_lock)
+ if (lock)
local_bh_disable();
est_fetch_counters(est, &b);
- if (stats_lock)
+ if (lock)
local_bh_enable();
est->last_bytes = b.bytes;
est->last_packets = b.packets;
+
+ if (lock)
+ spin_lock_bh(lock);
old = rcu_dereference_protected(*rate_est, 1);
if (old) {
del_timer_sync(&old->timer);
@@ -179,6 +182,8 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
mod_timer(&est->timer, est->next_jiffies);
rcu_assign_pointer(*rate_est, est);
+ if (lock)
+ spin_unlock_bh(lock);
if (old)
kfree_rcu(old, rcu);
return 0;
@@ -209,7 +214,7 @@ EXPORT_SYMBOL(gen_kill_estimator);
* @bstats: basic statistics
* @cpu_bstats: bstats per cpu
* @rate_est: rate estimator statistics
- * @stats_lock: statistics lock
+ * @lock: lock for statistics and control path
* @running: qdisc running seqcount (might be NULL)
* @opt: rate estimator configuration TLV
*
@@ -221,11 +226,11 @@ EXPORT_SYMBOL(gen_kill_estimator);
int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
struct gnet_stats_basic_cpu __percpu *cpu_bstats,
struct net_rate_estimator __rcu **rate_est,
- spinlock_t *stats_lock,
+ spinlock_t *lock,
seqcount_t *running, struct nlattr *opt)
{
return gen_new_estimator(bstats, cpu_bstats, rate_est,
- stats_lock, running, opt);
+ lock, running, opt);
}
EXPORT_SYMBOL(gen_replace_estimator);
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index 188d693cb251..9bf1b9ad1780 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -162,6 +162,34 @@ __gnet_stats_copy_basic(const seqcount_t *running,
}
EXPORT_SYMBOL(__gnet_stats_copy_basic);
+static int
+___gnet_stats_copy_basic(const seqcount_t *running,
+ struct gnet_dump *d,
+ struct gnet_stats_basic_cpu __percpu *cpu,
+ struct gnet_stats_basic_packed *b,
+ int type)
+{
+ struct gnet_stats_basic_packed bstats = {0};
+
+ __gnet_stats_copy_basic(running, &bstats, cpu, b);
+
+ if (d->compat_tc_stats && type == TCA_STATS_BASIC) {
+ d->tc_stats.bytes = bstats.bytes;
+ d->tc_stats.packets = bstats.packets;
+ }
+
+ if (d->tail) {
+ struct gnet_stats_basic sb;
+
+ memset(&sb, 0, sizeof(sb));
+ sb.bytes = bstats.bytes;
+ sb.packets = bstats.packets;
+ return gnet_stats_copy(d, type, &sb, sizeof(sb),
+ TCA_STATS_PAD);
+ }
+ return 0;
+}
+
/**
* gnet_stats_copy_basic - copy basic statistics into statistic TLV
* @running: seqcount_t pointer
@@ -181,29 +209,36 @@ gnet_stats_copy_basic(const seqcount_t *running,
struct gnet_stats_basic_cpu __percpu *cpu,
struct gnet_stats_basic_packed *b)
{
- struct gnet_stats_basic_packed bstats = {0};
-
- __gnet_stats_copy_basic(running, &bstats, cpu, b);
-
- if (d->compat_tc_stats) {
- d->tc_stats.bytes = bstats.bytes;
- d->tc_stats.packets = bstats.packets;
- }
-
- if (d->tail) {
- struct gnet_stats_basic sb;
-
- memset(&sb, 0, sizeof(sb));
- sb.bytes = bstats.bytes;
- sb.packets = bstats.packets;
- return gnet_stats_copy(d, TCA_STATS_BASIC, &sb, sizeof(sb),
- TCA_STATS_PAD);
- }
- return 0;
+ return ___gnet_stats_copy_basic(running, d, cpu, b,
+ TCA_STATS_BASIC);
}
EXPORT_SYMBOL(gnet_stats_copy_basic);
/**
+ * gnet_stats_copy_basic_hw - copy basic hw statistics into statistic TLV
+ * @running: seqcount_t pointer
+ * @d: dumping handle
+ * @cpu: copy statistic per cpu
+ * @b: basic statistics
+ *
+ * Appends the basic statistics to the top level TLV created by
+ * gnet_stats_start_copy().
+ *
+ * Returns 0 on success or -1 with the statistic lock released
+ * if the room in the socket buffer was not sufficient.
+ */
+int
+gnet_stats_copy_basic_hw(const seqcount_t *running,
+ struct gnet_dump *d,
+ struct gnet_stats_basic_cpu __percpu *cpu,
+ struct gnet_stats_basic_packed *b)
+{
+ return ___gnet_stats_copy_basic(running, d, cpu, b,
+ TCA_STATS_BASIC_HW);
+}
+EXPORT_SYMBOL(gnet_stats_copy_basic_hw);
+
+/**
* gnet_stats_copy_rate_est - copy rate estimator statistics into statistics TLV
* @d: dumping handle
* @rate_est: rate estimator
diff --git a/net/core/link_watch.c b/net/core/link_watch.c
index e38e641e98d5..7f51efb2b3ab 100644
--- a/net/core/link_watch.c
+++ b/net/core/link_watch.c
@@ -155,7 +155,7 @@ static void linkwatch_do_dev(struct net_device *dev)
clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state);
rfc2863_policy(dev);
- if (dev->flags & IFF_UP) {
+ if (dev->flags & IFF_UP && netif_device_present(dev)) {
if (netif_carrier_ok(dev))
dev_activate(dev);
else
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
index e45098593dc0..3e85437f7106 100644
--- a/net/core/lwt_bpf.c
+++ b/net/core/lwt_bpf.c
@@ -50,10 +50,8 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
* mixing with BH RCU lock doesn't work.
*/
preempt_disable();
- rcu_read_lock();
bpf_compute_data_pointers(skb);
ret = bpf_prog_run_save_cb(lwt->prog, skb);
- rcu_read_unlock();
switch (ret) {
case BPF_OK:
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 8e3fda9e725c..ee605d9d8bd4 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -232,7 +232,8 @@ static void pneigh_queue_purge(struct sk_buff_head *list)
}
}
-static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev)
+static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev,
+ bool skip_perm)
{
int i;
struct neigh_hash_table *nht;
@@ -250,6 +251,10 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev)
np = &n->next;
continue;
}
+ if (skip_perm && n->nud_state & NUD_PERMANENT) {
+ np = &n->next;
+ continue;
+ }
rcu_assign_pointer(*np,
rcu_dereference_protected(n->next,
lockdep_is_held(&tbl->lock)));
@@ -285,21 +290,35 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev)
void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev)
{
write_lock_bh(&tbl->lock);
- neigh_flush_dev(tbl, dev);
+ neigh_flush_dev(tbl, dev, false);
write_unlock_bh(&tbl->lock);
}
EXPORT_SYMBOL(neigh_changeaddr);
-int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
+static int __neigh_ifdown(struct neigh_table *tbl, struct net_device *dev,
+ bool skip_perm)
{
write_lock_bh(&tbl->lock);
- neigh_flush_dev(tbl, dev);
+ neigh_flush_dev(tbl, dev, skip_perm);
pneigh_ifdown_and_unlock(tbl, dev);
del_timer_sync(&tbl->proxy_timer);
pneigh_queue_purge(&tbl->proxy_queue);
return 0;
}
+
+int neigh_carrier_down(struct neigh_table *tbl, struct net_device *dev)
+{
+ __neigh_ifdown(tbl, dev, true);
+ return 0;
+}
+EXPORT_SYMBOL(neigh_carrier_down);
+
+int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
+{
+ __neigh_ifdown(tbl, dev, false);
+ return 0;
+}
EXPORT_SYMBOL(neigh_ifdown);
static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev)
@@ -1179,6 +1198,12 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
lladdr = neigh->ha;
}
+ /* Update confirmed timestamp for neighbour entry after we
+ * received ARP packet even if it doesn't change IP to MAC binding.
+ */
+ if (new & NUD_CONNECTED)
+ neigh->confirmed = jiffies;
+
/* If entry was valid and address is not changed,
do not change entry state, if new one is STALE.
*/
@@ -1200,15 +1225,12 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
}
}
- /* Update timestamps only once we know we will make a change to the
+ /* Update timestamp only once we know we will make a change to the
* neighbour entry. Otherwise we risk to move the locktime window with
* noop updates and ignore relevant ARP updates.
*/
- if (new != old || lladdr != neigh->ha) {
- if (new & NUD_CONNECTED)
- neigh->confirmed = jiffies;
+ if (new != old || lladdr != neigh->ha)
neigh->updated = jiffies;
- }
if (new != old) {
neigh_del_timer(neigh);
@@ -1276,11 +1298,8 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
neigh->arp_queue_len_bytes = 0;
}
out:
- if (update_isrouter) {
- neigh->flags = (flags & NEIGH_UPDATE_F_ISROUTER) ?
- (neigh->flags | NTF_ROUTER) :
- (neigh->flags & ~NTF_ROUTER);
- }
+ if (update_isrouter)
+ neigh_update_is_router(neigh, flags, &notify);
write_unlock_bh(&neigh->lock);
if (notify)
@@ -1708,7 +1727,8 @@ out:
static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
- int flags = NEIGH_UPDATE_F_ADMIN | NEIGH_UPDATE_F_OVERRIDE;
+ int flags = NEIGH_UPDATE_F_ADMIN | NEIGH_UPDATE_F_OVERRIDE |
+ NEIGH_UPDATE_F_OVERRIDE_ISROUTER;
struct net *net = sock_net(skb->sk);
struct ndmsg *ndm;
struct nlattr *tb[NDA_MAX+1];
@@ -1783,12 +1803,16 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh,
}
if (!(nlh->nlmsg_flags & NLM_F_REPLACE))
- flags &= ~NEIGH_UPDATE_F_OVERRIDE;
+ flags &= ~(NEIGH_UPDATE_F_OVERRIDE |
+ NEIGH_UPDATE_F_OVERRIDE_ISROUTER);
}
if (ndm->ndm_flags & NTF_EXT_LEARNED)
flags |= NEIGH_UPDATE_F_EXT_LEARNED;
+ if (ndm->ndm_flags & NTF_ROUTER)
+ flags |= NEIGH_UPDATE_F_ISROUTER;
+
if (ndm->ndm_flags & NTF_USE) {
neigh_event_send(neigh, NULL);
err = 0;
@@ -2158,15 +2182,47 @@ errout:
return err;
}
+static int neightbl_valid_dump_info(const struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct ndtmsg *ndtm;
+
+ if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndtm))) {
+ NL_SET_ERR_MSG(extack, "Invalid header for neighbor table dump request");
+ return -EINVAL;
+ }
+
+ ndtm = nlmsg_data(nlh);
+ if (ndtm->ndtm_pad1 || ndtm->ndtm_pad2) {
+ NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor table dump request");
+ return -EINVAL;
+ }
+
+ if (nlmsg_attrlen(nlh, sizeof(*ndtm))) {
+ NL_SET_ERR_MSG(extack, "Invalid data after header in neighbor table dump request");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
{
+ const struct nlmsghdr *nlh = cb->nlh;
struct net *net = sock_net(skb->sk);
int family, tidx, nidx = 0;
int tbl_skip = cb->args[0];
int neigh_skip = cb->args[1];
struct neigh_table *tbl;
- family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family;
+ if (cb->strict_check) {
+ int err = neightbl_valid_dump_info(nlh, cb->extack);
+
+ if (err < 0)
+ return err;
+ }
+
+ family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family;
for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) {
struct neigh_parms *p;
@@ -2179,7 +2235,7 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
continue;
if (neightbl_fill_info(skb, tbl, NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, RTM_NEWNEIGHTBL,
+ nlh->nlmsg_seq, RTM_NEWNEIGHTBL,
NLM_F_MULTI) < 0)
break;
@@ -2194,7 +2250,7 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
if (neightbl_fill_param_info(skb, tbl, p,
NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
+ nlh->nlmsg_seq,
RTM_NEWNEIGHTBL,
NLM_F_MULTI) < 0)
goto out;
@@ -2323,35 +2379,24 @@ static bool neigh_ifindex_filtered(struct net_device *dev, int filter_idx)
return false;
}
+struct neigh_dump_filter {
+ int master_idx;
+ int dev_idx;
+};
+
static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
- struct netlink_callback *cb)
+ struct netlink_callback *cb,
+ struct neigh_dump_filter *filter)
{
struct net *net = sock_net(skb->sk);
- const struct nlmsghdr *nlh = cb->nlh;
- struct nlattr *tb[NDA_MAX + 1];
struct neighbour *n;
int rc, h, s_h = cb->args[1];
int idx, s_idx = idx = cb->args[2];
struct neigh_hash_table *nht;
- int filter_master_idx = 0, filter_idx = 0;
unsigned int flags = NLM_F_MULTI;
- int err;
- err = nlmsg_parse(nlh, sizeof(struct ndmsg), tb, NDA_MAX, NULL, NULL);
- if (!err) {
- if (tb[NDA_IFINDEX]) {
- if (nla_len(tb[NDA_IFINDEX]) != sizeof(u32))
- return -EINVAL;
- filter_idx = nla_get_u32(tb[NDA_IFINDEX]);
- }
- if (tb[NDA_MASTER]) {
- if (nla_len(tb[NDA_MASTER]) != sizeof(u32))
- return -EINVAL;
- filter_master_idx = nla_get_u32(tb[NDA_MASTER]);
- }
- if (filter_idx || filter_master_idx)
- flags |= NLM_F_DUMP_FILTERED;
- }
+ if (filter->dev_idx || filter->master_idx)
+ flags |= NLM_F_DUMP_FILTERED;
rcu_read_lock_bh();
nht = rcu_dereference_bh(tbl->nht);
@@ -2364,8 +2409,8 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
n = rcu_dereference_bh(n->next)) {
if (idx < s_idx || !net_eq(dev_net(n->dev), net))
goto next;
- if (neigh_ifindex_filtered(n->dev, filter_idx) ||
- neigh_master_filtered(n->dev, filter_master_idx))
+ if (neigh_ifindex_filtered(n->dev, filter->dev_idx) ||
+ neigh_master_filtered(n->dev, filter->master_idx))
goto next;
if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
@@ -2387,12 +2432,17 @@ out:
}
static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
- struct netlink_callback *cb)
+ struct netlink_callback *cb,
+ struct neigh_dump_filter *filter)
{
struct pneigh_entry *n;
struct net *net = sock_net(skb->sk);
int rc, h, s_h = cb->args[3];
int idx, s_idx = idx = cb->args[4];
+ unsigned int flags = NLM_F_MULTI;
+
+ if (filter->dev_idx || filter->master_idx)
+ flags |= NLM_F_DUMP_FILTERED;
read_lock_bh(&tbl->lock);
@@ -2402,10 +2452,12 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
for (n = tbl->phash_buckets[h], idx = 0; n; n = n->next) {
if (idx < s_idx || pneigh_net(n) != net)
goto next;
+ if (neigh_ifindex_filtered(n->dev, filter->dev_idx) ||
+ neigh_master_filtered(n->dev, filter->master_idx))
+ goto next;
if (pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
- RTM_NEWNEIGH,
- NLM_F_MULTI, tbl) < 0) {
+ RTM_NEWNEIGH, flags, tbl) < 0) {
read_unlock_bh(&tbl->lock);
rc = -1;
goto out;
@@ -2424,22 +2476,91 @@ out:
}
+static int neigh_valid_dump_req(const struct nlmsghdr *nlh,
+ bool strict_check,
+ struct neigh_dump_filter *filter,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[NDA_MAX + 1];
+ int err, i;
+
+ if (strict_check) {
+ struct ndmsg *ndm;
+
+ if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) {
+ NL_SET_ERR_MSG(extack, "Invalid header for neighbor dump request");
+ return -EINVAL;
+ }
+
+ ndm = nlmsg_data(nlh);
+ if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_ifindex ||
+ ndm->ndm_state || ndm->ndm_flags || ndm->ndm_type) {
+ NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor dump request");
+ return -EINVAL;
+ }
+
+ err = nlmsg_parse_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX,
+ NULL, extack);
+ } else {
+ err = nlmsg_parse(nlh, sizeof(struct ndmsg), tb, NDA_MAX,
+ NULL, extack);
+ }
+ if (err < 0)
+ return err;
+
+ for (i = 0; i <= NDA_MAX; ++i) {
+ if (!tb[i])
+ continue;
+
+ /* all new attributes should require strict_check */
+ switch (i) {
+ case NDA_IFINDEX:
+ if (nla_len(tb[i]) != sizeof(u32)) {
+ NL_SET_ERR_MSG(extack, "Invalid IFINDEX attribute in neighbor dump request");
+ return -EINVAL;
+ }
+ filter->dev_idx = nla_get_u32(tb[i]);
+ break;
+ case NDA_MASTER:
+ if (nla_len(tb[i]) != sizeof(u32)) {
+ NL_SET_ERR_MSG(extack, "Invalid MASTER attribute in neighbor dump request");
+ return -EINVAL;
+ }
+ filter->master_idx = nla_get_u32(tb[i]);
+ break;
+ default:
+ if (strict_check) {
+ NL_SET_ERR_MSG(extack, "Unsupported attribute in neighbor dump request");
+ return -EINVAL;
+ }
+ }
+ }
+
+ return 0;
+}
+
static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
{
+ const struct nlmsghdr *nlh = cb->nlh;
+ struct neigh_dump_filter filter = {};
struct neigh_table *tbl;
int t, family, s_t;
int proxy = 0;
int err;
- family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family;
+ family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family;
/* check for full ndmsg structure presence, family member is
* the same for both structures
*/
- if (nlmsg_len(cb->nlh) >= sizeof(struct ndmsg) &&
- ((struct ndmsg *) nlmsg_data(cb->nlh))->ndm_flags == NTF_PROXY)
+ if (nlmsg_len(nlh) >= sizeof(struct ndmsg) &&
+ ((struct ndmsg *)nlmsg_data(nlh))->ndm_flags == NTF_PROXY)
proxy = 1;
+ err = neigh_valid_dump_req(nlh, cb->strict_check, &filter, cb->extack);
+ if (err < 0 && cb->strict_check)
+ return err;
+
s_t = cb->args[0];
for (t = 0; t < NEIGH_NR_TABLES; t++) {
@@ -2453,9 +2574,9 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
memset(&cb->args[1], 0, sizeof(cb->args) -
sizeof(cb->args[0]));
if (proxy)
- err = pneigh_dump_table(tbl, skb, cb);
+ err = pneigh_dump_table(tbl, skb, cb, &filter);
else
- err = neigh_dump_table(tbl, skb, cb);
+ err = neigh_dump_table(tbl, skb, cb, &filter);
if (err < 0)
break;
}
@@ -3273,4 +3394,3 @@ static int __init neigh_init(void)
}
subsys_initcall(neigh_init);
-
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index bb7e80f4ced3..bd67c4d0fcfd 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -26,6 +26,7 @@
#include <linux/pm_runtime.h>
#include <linux/of.h>
#include <linux/of_net.h>
+#include <linux/cpu.h>
#include "net-sysfs.h"
@@ -905,11 +906,20 @@ static const void *rx_queue_namespace(struct kobject *kobj)
return ns;
}
+static void rx_queue_get_ownership(struct kobject *kobj,
+ kuid_t *uid, kgid_t *gid)
+{
+ const struct net *net = rx_queue_namespace(kobj);
+
+ net_ns_get_ownership(net, uid, gid);
+}
+
static struct kobj_type rx_queue_ktype __ro_after_init = {
.sysfs_ops = &rx_queue_sysfs_ops,
.release = rx_queue_release,
.default_attrs = rx_queue_default_attrs,
- .namespace = rx_queue_namespace
+ .namespace = rx_queue_namespace,
+ .get_ownership = rx_queue_get_ownership,
};
static int rx_queue_add_kobject(struct net_device *dev, int index)
@@ -1047,13 +1057,30 @@ static ssize_t traffic_class_show(struct netdev_queue *queue,
char *buf)
{
struct net_device *dev = queue->dev;
- int index = get_netdev_queue_index(queue);
- int tc = netdev_txq_to_tc(dev, index);
+ int index;
+ int tc;
+
+ if (!netif_is_multiqueue(dev))
+ return -ENOENT;
+ index = get_netdev_queue_index(queue);
+
+ /* If queue belongs to subordinate dev use its TC mapping */
+ dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
+
+ tc = netdev_txq_to_tc(dev, index);
if (tc < 0)
return -EINVAL;
- return sprintf(buf, "%u\n", tc);
+ /* We can report the traffic class one of two ways:
+ * Subordinate device traffic classes are reported with the traffic
+ * class first, and then the subordinate class so for example TC0 on
+ * subordinate device 2 will be reported as "0-2". If the queue
+ * belongs to the root device it will be reported with just the
+ * traffic class, so just "0" for TC 0 for example.
+ */
+ return dev->num_tc < 0 ? sprintf(buf, "%u%d\n", tc, dev->num_tc) :
+ sprintf(buf, "%u\n", tc);
}
#ifdef CONFIG_XPS
@@ -1070,6 +1097,9 @@ static ssize_t tx_maxrate_store(struct netdev_queue *queue,
int err, index = get_netdev_queue_index(queue);
u32 rate = 0;
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
err = kstrtou32(buf, 10, &rate);
if (err < 0)
return err;
@@ -1214,10 +1244,20 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue,
cpumask_var_t mask;
unsigned long index;
+ if (!netif_is_multiqueue(dev))
+ return -ENOENT;
+
index = get_netdev_queue_index(queue);
if (dev->num_tc) {
+ /* Do not allow XPS on subordinate device directly */
num_tc = dev->num_tc;
+ if (num_tc < 0)
+ return -EINVAL;
+
+ /* If queue belongs to subordinate dev use its map */
+ dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
+
tc = netdev_txq_to_tc(dev, index);
if (tc < 0)
return -EINVAL;
@@ -1227,13 +1267,13 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue,
return -ENOMEM;
rcu_read_lock();
- dev_maps = rcu_dereference(dev->xps_maps);
+ dev_maps = rcu_dereference(dev->xps_cpus_map);
if (dev_maps) {
for_each_possible_cpu(cpu) {
int i, tci = cpu * num_tc + tc;
struct xps_map *map;
- map = rcu_dereference(dev_maps->cpu_map[tci]);
+ map = rcu_dereference(dev_maps->attr_map[tci]);
if (!map)
continue;
@@ -1260,6 +1300,9 @@ static ssize_t xps_cpus_store(struct netdev_queue *queue,
cpumask_var_t mask;
int err;
+ if (!netif_is_multiqueue(dev))
+ return -ENOENT;
+
if (!capable(CAP_NET_ADMIN))
return -EPERM;
@@ -1283,6 +1326,91 @@ static ssize_t xps_cpus_store(struct netdev_queue *queue,
static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init
= __ATTR_RW(xps_cpus);
+
+static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf)
+{
+ struct net_device *dev = queue->dev;
+ struct xps_dev_maps *dev_maps;
+ unsigned long *mask, index;
+ int j, len, num_tc = 1, tc = 0;
+
+ index = get_netdev_queue_index(queue);
+
+ if (dev->num_tc) {
+ num_tc = dev->num_tc;
+ tc = netdev_txq_to_tc(dev, index);
+ if (tc < 0)
+ return -EINVAL;
+ }
+ mask = kcalloc(BITS_TO_LONGS(dev->num_rx_queues), sizeof(long),
+ GFP_KERNEL);
+ if (!mask)
+ return -ENOMEM;
+
+ rcu_read_lock();
+ dev_maps = rcu_dereference(dev->xps_rxqs_map);
+ if (!dev_maps)
+ goto out_no_maps;
+
+ for (j = -1; j = netif_attrmask_next(j, NULL, dev->num_rx_queues),
+ j < dev->num_rx_queues;) {
+ int i, tci = j * num_tc + tc;
+ struct xps_map *map;
+
+ map = rcu_dereference(dev_maps->attr_map[tci]);
+ if (!map)
+ continue;
+
+ for (i = map->len; i--;) {
+ if (map->queues[i] == index) {
+ set_bit(j, mask);
+ break;
+ }
+ }
+ }
+out_no_maps:
+ rcu_read_unlock();
+
+ len = bitmap_print_to_pagebuf(false, buf, mask, dev->num_rx_queues);
+ kfree(mask);
+
+ return len < PAGE_SIZE ? len : -EINVAL;
+}
+
+static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf,
+ size_t len)
+{
+ struct net_device *dev = queue->dev;
+ struct net *net = dev_net(dev);
+ unsigned long *mask, index;
+ int err;
+
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ mask = kcalloc(BITS_TO_LONGS(dev->num_rx_queues), sizeof(long),
+ GFP_KERNEL);
+ if (!mask)
+ return -ENOMEM;
+
+ index = get_netdev_queue_index(queue);
+
+ err = bitmap_parse(buf, len, mask, dev->num_rx_queues);
+ if (err) {
+ kfree(mask);
+ return err;
+ }
+
+ cpus_read_lock();
+ err = __netif_set_xps_queue(dev, mask, index, true);
+ cpus_read_unlock();
+
+ kfree(mask);
+ return err ? : len;
+}
+
+static struct netdev_queue_attribute xps_rxqs_attribute __ro_after_init
+ = __ATTR_RW(xps_rxqs);
#endif /* CONFIG_XPS */
static struct attribute *netdev_queue_default_attrs[] __ro_after_init = {
@@ -1290,6 +1418,7 @@ static struct attribute *netdev_queue_default_attrs[] __ro_after_init = {
&queue_traffic_class.attr,
#ifdef CONFIG_XPS
&xps_cpus_attribute.attr,
+ &xps_rxqs_attribute.attr,
&queue_tx_maxrate.attr,
#endif
NULL
@@ -1315,11 +1444,20 @@ static const void *netdev_queue_namespace(struct kobject *kobj)
return ns;
}
+static void netdev_queue_get_ownership(struct kobject *kobj,
+ kuid_t *uid, kgid_t *gid)
+{
+ const struct net *net = netdev_queue_namespace(kobj);
+
+ net_ns_get_ownership(net, uid, gid);
+}
+
static struct kobj_type netdev_queue_ktype __ro_after_init = {
.sysfs_ops = &netdev_queue_sysfs_ops,
.release = netdev_queue_release,
.default_attrs = netdev_queue_default_attrs,
.namespace = netdev_queue_namespace,
+ .get_ownership = netdev_queue_get_ownership,
};
static int netdev_queue_add_kobject(struct net_device *dev, int index)
@@ -1509,6 +1647,14 @@ static const void *net_namespace(struct device *d)
return dev_net(dev);
}
+static void net_get_ownership(struct device *d, kuid_t *uid, kgid_t *gid)
+{
+ struct net_device *dev = to_net_dev(d);
+ const struct net *net = dev_net(dev);
+
+ net_ns_get_ownership(net, uid, gid);
+}
+
static struct class net_class __ro_after_init = {
.name = "net",
.dev_release = netdev_release,
@@ -1516,6 +1662,7 @@ static struct class net_class __ro_after_init = {
.dev_uevent = netdev_uevent,
.ns_type = &net_ns_type_operations,
.namespace = net_namespace,
+ .get_ownership = net_get_ownership,
};
#ifdef CONFIG_OF_NET
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index a11e03f920d3..fefe72774aeb 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -17,6 +17,7 @@
#include <linux/user_namespace.h>
#include <linux/net_namespace.h>
#include <linux/sched/task.h>
+#include <linux/uidgid.h>
#include <net/sock.h>
#include <net/netlink.h>
@@ -448,6 +449,33 @@ dec_ucounts:
return net;
}
+/**
+ * net_ns_get_ownership - get sysfs ownership data for @net
+ * @net: network namespace in question (can be NULL)
+ * @uid: kernel user ID for sysfs objects
+ * @gid: kernel group ID for sysfs objects
+ *
+ * Returns the uid/gid pair of root in the user namespace associated with the
+ * given network namespace.
+ */
+void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid)
+{
+ if (net) {
+ kuid_t ns_root_uid = make_kuid(net->user_ns, 0);
+ kgid_t ns_root_gid = make_kgid(net->user_ns, 0);
+
+ if (uid_valid(ns_root_uid))
+ *uid = ns_root_uid;
+
+ if (gid_valid(ns_root_gid))
+ *gid = ns_root_gid;
+ } else {
+ *uid = GLOBAL_ROOT_UID;
+ *gid = GLOBAL_ROOT_GID;
+ }
+}
+EXPORT_SYMBOL_GPL(net_ns_get_ownership);
+
static void unhash_nsid(struct net *net, struct net *last)
{
struct net *tmp;
@@ -825,6 +853,12 @@ static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb)
.s_idx = cb->args[0],
};
+ if (cb->strict_check &&
+ nlmsg_attrlen(cb->nlh, sizeof(struct rtgenmsg))) {
+ NL_SET_ERR_MSG(cb->extack, "Unknown data in network namespace id dump request");
+ return -EINVAL;
+ }
+
spin_lock_bh(&net->nsid_lock);
idr_for_each(&net->netns_ids, rtnl_net_dumpid_one, &net_cb);
spin_unlock_bh(&net->nsid_lock);
@@ -973,22 +1007,18 @@ static int register_pernet_operations(struct list_head *list,
int error;
if (ops->id) {
-again:
- error = ida_get_new_above(&net_generic_ids, MIN_PERNET_OPS_ID, ops->id);
- if (error < 0) {
- if (error == -EAGAIN) {
- ida_pre_get(&net_generic_ids, GFP_KERNEL);
- goto again;
- }
+ error = ida_alloc_min(&net_generic_ids, MIN_PERNET_OPS_ID,
+ GFP_KERNEL);
+ if (error < 0)
return error;
- }
+ *ops->id = error;
max_gen_ptrs = max(max_gen_ptrs, *ops->id + 1);
}
error = __register_pernet_operations(list, ops);
if (error) {
rcu_barrier();
if (ops->id)
- ida_remove(&net_generic_ids, *ops->id);
+ ida_free(&net_generic_ids, *ops->id);
}
return error;
@@ -999,7 +1029,7 @@ static void unregister_pernet_operations(struct pernet_operations *ops)
__unregister_pernet_operations(ops);
rcu_barrier();
if (ops->id)
- ida_remove(&net_generic_ids, *ops->id);
+ ida_free(&net_generic_ids, *ops->id);
}
/**
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 57557a6a950c..5da9552b186b 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -57,7 +57,6 @@ DEFINE_STATIC_SRCU(netpoll_srcu);
MAX_UDP_CHUNK)
static void zap_completion_queue(void);
-static void netpoll_async_cleanup(struct work_struct *work);
static unsigned int carrier_timeout = 4;
module_param(carrier_timeout, uint, 0644);
@@ -135,27 +134,9 @@ static void queue_process(struct work_struct *work)
}
}
-/*
- * Check whether delayed processing was scheduled for our NIC. If so,
- * we attempt to grab the poll lock and use ->poll() to pump the card.
- * If this fails, either we've recursed in ->poll() or it's already
- * running on another CPU.
- *
- * Note: we don't mask interrupts with this lock because we're using
- * trylock here and interrupts are already disabled in the softirq
- * case. Further, we test the poll_owner to avoid recursion on UP
- * systems where the lock doesn't exist.
- */
static void poll_one_napi(struct napi_struct *napi)
{
- int work = 0;
-
- /* net_rx_action's ->poll() invocations and our's are
- * synchronized by this test which is only made while
- * holding the napi->poll_lock.
- */
- if (!test_bit(NAPI_STATE_SCHED, &napi->state))
- return;
+ int work;
/* If we set this bit but see that it has already been set,
* that indicates that napi has been disabled and we need
@@ -187,16 +168,16 @@ static void poll_napi(struct net_device *dev)
}
}
-static void netpoll_poll_dev(struct net_device *dev)
+void netpoll_poll_dev(struct net_device *dev)
{
- const struct net_device_ops *ops;
struct netpoll_info *ni = rcu_dereference_bh(dev->npinfo);
+ const struct net_device_ops *ops;
/* Don't do any rx activity if the dev_lock mutex is held
* the dev_open/close paths use this to block netpoll activity
* while changing device state
*/
- if (down_trylock(&ni->dev_lock))
+ if (!ni || down_trylock(&ni->dev_lock))
return;
if (!netif_running(dev)) {
@@ -205,13 +186,8 @@ static void netpoll_poll_dev(struct net_device *dev)
}
ops = dev->netdev_ops;
- if (!ops->ndo_poll_controller) {
- up(&ni->dev_lock);
- return;
- }
-
- /* Process pending work on NIC */
- ops->ndo_poll_controller(dev);
+ if (ops->ndo_poll_controller)
+ ops->ndo_poll_controller(dev);
poll_napi(dev);
@@ -219,6 +195,7 @@ static void netpoll_poll_dev(struct net_device *dev)
zap_completion_queue();
}
+EXPORT_SYMBOL(netpoll_poll_dev);
void netpoll_poll_disable(struct net_device *dev)
{
@@ -611,10 +588,8 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev)
np->dev = ndev;
strlcpy(np->dev_name, ndev->name, IFNAMSIZ);
- INIT_WORK(&np->cleanup_work, netpoll_async_cleanup);
- if ((ndev->priv_flags & IFF_DISABLE_NETPOLL) ||
- !ndev->netdev_ops->ndo_poll_controller) {
+ if (ndev->priv_flags & IFF_DISABLE_NETPOLL) {
np_err(np, "%s doesn't support polling, aborting\n",
np->dev_name);
err = -ENOTSUPP;
@@ -811,10 +786,6 @@ void __netpoll_cleanup(struct netpoll *np)
{
struct netpoll_info *npinfo;
- /* rtnl_dereference would be preferable here but
- * rcu_cleanup_netpoll path can put us in here safely without
- * holding the rtnl, so plain rcu_dereference it is
- */
npinfo = rtnl_dereference(np->dev->npinfo);
if (!npinfo)
return;
@@ -835,21 +806,16 @@ void __netpoll_cleanup(struct netpoll *np)
}
EXPORT_SYMBOL_GPL(__netpoll_cleanup);
-static void netpoll_async_cleanup(struct work_struct *work)
+void __netpoll_free(struct netpoll *np)
{
- struct netpoll *np = container_of(work, struct netpoll, cleanup_work);
+ ASSERT_RTNL();
- rtnl_lock();
+ /* Wait for transmitting packets to finish before freeing. */
+ synchronize_rcu_bh();
__netpoll_cleanup(np);
- rtnl_unlock();
kfree(np);
}
-
-void __netpoll_free_async(struct netpoll *np)
-{
- schedule_work(&np->cleanup_work);
-}
-EXPORT_SYMBOL_GPL(__netpoll_free_async);
+EXPORT_SYMBOL_GPL(__netpoll_free);
void netpoll_cleanup(struct netpoll *np)
{
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 49368e21d228..6ac919847ce6 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -1265,7 +1265,7 @@ static ssize_t pktgen_if_write(struct file *file,
buf[len] = 0;
if (strcmp(buf, pkt_dev->dst_min) != 0) {
memset(pkt_dev->dst_min, 0, sizeof(pkt_dev->dst_min));
- strncpy(pkt_dev->dst_min, buf, len);
+ strcpy(pkt_dev->dst_min, buf);
pkt_dev->daddr_min = in_aton(pkt_dev->dst_min);
pkt_dev->cur_daddr = pkt_dev->daddr_min;
}
@@ -1280,14 +1280,12 @@ static ssize_t pktgen_if_write(struct file *file,
if (len < 0)
return len;
-
if (copy_from_user(buf, &user_buffer[i], len))
return -EFAULT;
-
buf[len] = 0;
if (strcmp(buf, pkt_dev->dst_max) != 0) {
memset(pkt_dev->dst_max, 0, sizeof(pkt_dev->dst_max));
- strncpy(pkt_dev->dst_max, buf, len);
+ strcpy(pkt_dev->dst_max, buf);
pkt_dev->daddr_max = in_aton(pkt_dev->dst_max);
pkt_dev->cur_daddr = pkt_dev->daddr_max;
}
@@ -1396,7 +1394,7 @@ static ssize_t pktgen_if_write(struct file *file,
buf[len] = 0;
if (strcmp(buf, pkt_dev->src_min) != 0) {
memset(pkt_dev->src_min, 0, sizeof(pkt_dev->src_min));
- strncpy(pkt_dev->src_min, buf, len);
+ strcpy(pkt_dev->src_min, buf);
pkt_dev->saddr_min = in_aton(pkt_dev->src_min);
pkt_dev->cur_saddr = pkt_dev->saddr_min;
}
@@ -1416,7 +1414,7 @@ static ssize_t pktgen_if_write(struct file *file,
buf[len] = 0;
if (strcmp(buf, pkt_dev->src_max) != 0) {
memset(pkt_dev->src_max, 0, sizeof(pkt_dev->src_max));
- strncpy(pkt_dev->src_max, buf, len);
+ strcpy(pkt_dev->src_max, buf);
pkt_dev->saddr_max = in_aton(pkt_dev->src_max);
pkt_dev->cur_saddr = pkt_dev->saddr_max;
}
@@ -2255,7 +2253,7 @@ static void get_ipsec_sa(struct pktgen_dev *pkt_dev, int flow)
x = xfrm_state_lookup_byspi(pn->net, htonl(pkt_dev->spi), AF_INET);
} else {
/* slow path: we dont already have xfrm_state */
- x = xfrm_stateonly_find(pn->net, DUMMY_MARK,
+ x = xfrm_stateonly_find(pn->net, DUMMY_MARK, 0,
(xfrm_address_t *)&pkt_dev->cur_daddr,
(xfrm_address_t *)&pkt_dev->cur_saddr,
AF_INET,
@@ -3428,7 +3426,7 @@ xmit_more:
net_info_ratelimited("%s xmit error: %d\n",
pkt_dev->odevname, ret);
pkt_dev->errors++;
- /* fallthru */
+ /* fall through */
case NETDEV_TX_BUSY:
/* Retry it next time */
refcount_dec(&(pkt_dev->skb->users));
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index e3f743c141b3..0958c7be2c22 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -59,7 +59,7 @@
#include <net/rtnetlink.h>
#include <net/net_namespace.h>
-#define RTNL_MAX_TYPE 48
+#define RTNL_MAX_TYPE 49
#define RTNL_SLAVE_MAX_TYPE 36
struct rtnl_link {
@@ -130,6 +130,12 @@ int rtnl_is_locked(void)
}
EXPORT_SYMBOL(rtnl_is_locked);
+bool refcount_dec_and_rtnl_lock(refcount_t *r)
+{
+ return refcount_dec_and_mutex_lock(r, &rtnl_mutex);
+}
+EXPORT_SYMBOL(refcount_dec_and_rtnl_lock);
+
#ifdef CONFIG_PROVE_LOCKING
bool lockdep_rtnl_is_held(void)
{
@@ -324,6 +330,10 @@ void rtnl_unregister_all(int protocol)
rtnl_lock();
tab = rtnl_msg_handlers[protocol];
+ if (!tab) {
+ rtnl_unlock();
+ return;
+ }
RCU_INIT_POINTER(rtnl_msg_handlers[protocol], NULL);
for (msgindex = 0; msgindex < RTM_NR_MSGTYPES; msgindex++) {
link = tab[msgindex];
@@ -964,7 +974,8 @@ static size_t rtnl_xdp_size(void)
{
size_t xdp_size = nla_total_size(0) + /* nest IFLA_XDP */
nla_total_size(1) + /* XDP_ATTACHED */
- nla_total_size(4); /* XDP_PROG_ID */
+ nla_total_size(4) + /* XDP_PROG_ID (or 1st mode) */
+ nla_total_size(4); /* XDP_<mode>_PROG_ID */
return xdp_size;
}
@@ -1011,9 +1022,11 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
+ nla_total_size(4) /* IFLA_NEW_NETNSID */
+ nla_total_size(4) /* IFLA_NEW_IFINDEX */
+ nla_total_size(1) /* IFLA_PROTO_DOWN */
- + nla_total_size(4) /* IFLA_IF_NETNSID */
+ + nla_total_size(4) /* IFLA_TARGET_NETNSID */
+ nla_total_size(4) /* IFLA_CARRIER_UP_COUNT */
+ nla_total_size(4) /* IFLA_CARRIER_DOWN_COUNT */
+ + nla_total_size(4) /* IFLA_MIN_MTU */
+ + nla_total_size(4) /* IFLA_MAX_MTU */
+ 0;
}
@@ -1353,27 +1366,51 @@ static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev)
return 0;
}
-static u8 rtnl_xdp_attached_mode(struct net_device *dev, u32 *prog_id)
+static u32 rtnl_xdp_prog_skb(struct net_device *dev)
{
- const struct net_device_ops *ops = dev->netdev_ops;
const struct bpf_prog *generic_xdp_prog;
- struct netdev_bpf xdp;
ASSERT_RTNL();
- *prog_id = 0;
generic_xdp_prog = rtnl_dereference(dev->xdp_prog);
- if (generic_xdp_prog) {
- *prog_id = generic_xdp_prog->aux->id;
- return XDP_ATTACHED_SKB;
- }
- if (!ops->ndo_bpf)
- return XDP_ATTACHED_NONE;
+ if (!generic_xdp_prog)
+ return 0;
+ return generic_xdp_prog->aux->id;
+}
- __dev_xdp_query(dev, ops->ndo_bpf, &xdp);
- *prog_id = xdp.prog_id;
+static u32 rtnl_xdp_prog_drv(struct net_device *dev)
+{
+ return __dev_xdp_query(dev, dev->netdev_ops->ndo_bpf, XDP_QUERY_PROG);
+}
- return xdp.prog_attached;
+static u32 rtnl_xdp_prog_hw(struct net_device *dev)
+{
+ return __dev_xdp_query(dev, dev->netdev_ops->ndo_bpf,
+ XDP_QUERY_PROG_HW);
+}
+
+static int rtnl_xdp_report_one(struct sk_buff *skb, struct net_device *dev,
+ u32 *prog_id, u8 *mode, u8 tgt_mode, u32 attr,
+ u32 (*get_prog_id)(struct net_device *dev))
+{
+ u32 curr_id;
+ int err;
+
+ curr_id = get_prog_id(dev);
+ if (!curr_id)
+ return 0;
+
+ *prog_id = curr_id;
+ err = nla_put_u32(skb, attr, curr_id);
+ if (err)
+ return err;
+
+ if (*mode != XDP_ATTACHED_NONE)
+ *mode = XDP_ATTACHED_MULTI;
+ else
+ *mode = tgt_mode;
+
+ return 0;
}
static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev)
@@ -1381,17 +1418,32 @@ static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev)
struct nlattr *xdp;
u32 prog_id;
int err;
+ u8 mode;
xdp = nla_nest_start(skb, IFLA_XDP);
if (!xdp)
return -EMSGSIZE;
- err = nla_put_u8(skb, IFLA_XDP_ATTACHED,
- rtnl_xdp_attached_mode(dev, &prog_id));
+ prog_id = 0;
+ mode = XDP_ATTACHED_NONE;
+ err = rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_SKB,
+ IFLA_XDP_SKB_PROG_ID, rtnl_xdp_prog_skb);
+ if (err)
+ goto err_cancel;
+ err = rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_DRV,
+ IFLA_XDP_DRV_PROG_ID, rtnl_xdp_prog_drv);
+ if (err)
+ goto err_cancel;
+ err = rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_HW,
+ IFLA_XDP_HW_PROG_ID, rtnl_xdp_prog_hw);
+ if (err)
+ goto err_cancel;
+
+ err = nla_put_u8(skb, IFLA_XDP_ATTACHED, mode);
if (err)
goto err_cancel;
- if (prog_id) {
+ if (prog_id && mode != XDP_ATTACHED_MULTI) {
err = nla_put_u32(skb, IFLA_XDP_PROG_ID, prog_id);
if (err)
goto err_cancel;
@@ -1552,7 +1604,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
ifm->ifi_flags = dev_get_flags(dev);
ifm->ifi_change = change;
- if (tgt_netnsid >= 0 && nla_put_s32(skb, IFLA_IF_NETNSID, tgt_netnsid))
+ if (tgt_netnsid >= 0 && nla_put_s32(skb, IFLA_TARGET_NETNSID, tgt_netnsid))
goto nla_put_failure;
if (nla_put_string(skb, IFLA_IFNAME, dev->name) ||
@@ -1561,6 +1613,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
netif_running(dev) ? dev->operstate : IF_OPER_DOWN) ||
nla_put_u8(skb, IFLA_LINKMODE, dev->link_mode) ||
nla_put_u32(skb, IFLA_MTU, dev->mtu) ||
+ nla_put_u32(skb, IFLA_MIN_MTU, dev->min_mtu) ||
+ nla_put_u32(skb, IFLA_MAX_MTU, dev->max_mtu) ||
nla_put_u32(skb, IFLA_GROUP, dev->group) ||
nla_put_u32(skb, IFLA_PROMISCUITY, dev->promiscuity) ||
nla_put_u32(skb, IFLA_NUM_TX_QUEUES, dev->num_tx_queues) ||
@@ -1689,9 +1743,11 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
[IFLA_XDP] = { .type = NLA_NESTED },
[IFLA_EVENT] = { .type = NLA_U32 },
[IFLA_GROUP] = { .type = NLA_U32 },
- [IFLA_IF_NETNSID] = { .type = NLA_S32 },
+ [IFLA_TARGET_NETNSID] = { .type = NLA_S32 },
[IFLA_CARRIER_UP_COUNT] = { .type = NLA_U32 },
[IFLA_CARRIER_DOWN_COUNT] = { .type = NLA_U32 },
+ [IFLA_MIN_MTU] = { .type = NLA_U32 },
+ [IFLA_MAX_MTU] = { .type = NLA_U32 },
};
static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
@@ -1795,7 +1851,15 @@ static bool link_dump_filtered(struct net_device *dev,
return false;
}
-static struct net *get_target_net(struct sock *sk, int netnsid)
+/**
+ * rtnl_get_net_ns_capable - Get netns if sufficiently privileged.
+ * @sk: netlink socket
+ * @netnsid: network namespace identifier
+ *
+ * Returns the network namespace identified by netnsid on success or an error
+ * pointer on failure.
+ */
+struct net *rtnl_get_net_ns_capable(struct sock *sk, int netnsid)
{
struct net *net;
@@ -1812,9 +1876,54 @@ static struct net *get_target_net(struct sock *sk, int netnsid)
}
return net;
}
+EXPORT_SYMBOL_GPL(rtnl_get_net_ns_capable);
+
+static int rtnl_valid_dump_ifinfo_req(const struct nlmsghdr *nlh,
+ bool strict_check, struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{
+ int hdrlen;
+
+ if (strict_check) {
+ struct ifinfomsg *ifm;
+
+ if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) {
+ NL_SET_ERR_MSG(extack, "Invalid header for link dump");
+ return -EINVAL;
+ }
+
+ ifm = nlmsg_data(nlh);
+ if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags ||
+ ifm->ifi_change) {
+ NL_SET_ERR_MSG(extack, "Invalid values in header for link dump request");
+ return -EINVAL;
+ }
+ if (ifm->ifi_index) {
+ NL_SET_ERR_MSG(extack, "Filter by device index not supported for link dumps");
+ return -EINVAL;
+ }
+
+ return nlmsg_parse_strict(nlh, sizeof(*ifm), tb, IFLA_MAX,
+ ifla_policy, extack);
+ }
+
+ /* A hack to preserve kernel<->userspace interface.
+ * The correct header is ifinfomsg. It is consistent with rtnl_getlink.
+ * However, before Linux v3.9 the code here assumed rtgenmsg and that's
+ * what iproute2 < v3.9.0 used.
+ * We can detect the old iproute2. Even including the IFLA_EXT_MASK
+ * attribute, its netlink message is shorter than struct ifinfomsg.
+ */
+ hdrlen = nlmsg_len(nlh) < sizeof(struct ifinfomsg) ?
+ sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg);
+
+ return nlmsg_parse(nlh, hdrlen, tb, IFLA_MAX, ifla_policy, extack);
+}
static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
{
+ struct netlink_ext_ack *extack = cb->extack;
+ const struct nlmsghdr *nlh = cb->nlh;
struct net *net = sock_net(skb->sk);
struct net *tgt_net = net;
int h, s_h;
@@ -1827,46 +1936,54 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
unsigned int flags = NLM_F_MULTI;
int master_idx = 0;
int netnsid = -1;
- int err;
- int hdrlen;
+ int err, i;
s_h = cb->args[0];
s_idx = cb->args[1];
- /* A hack to preserve kernel<->userspace interface.
- * The correct header is ifinfomsg. It is consistent with rtnl_getlink.
- * However, before Linux v3.9 the code here assumed rtgenmsg and that's
- * what iproute2 < v3.9.0 used.
- * We can detect the old iproute2. Even including the IFLA_EXT_MASK
- * attribute, its netlink message is shorter than struct ifinfomsg.
- */
- hdrlen = nlmsg_len(cb->nlh) < sizeof(struct ifinfomsg) ?
- sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg);
+ err = rtnl_valid_dump_ifinfo_req(nlh, cb->strict_check, tb, extack);
+ if (err < 0) {
+ if (cb->strict_check)
+ return err;
+
+ goto walk_entries;
+ }
- if (nlmsg_parse(cb->nlh, hdrlen, tb, IFLA_MAX,
- ifla_policy, NULL) >= 0) {
- if (tb[IFLA_IF_NETNSID]) {
- netnsid = nla_get_s32(tb[IFLA_IF_NETNSID]);
- tgt_net = get_target_net(skb->sk, netnsid);
+ for (i = 0; i <= IFLA_MAX; ++i) {
+ if (!tb[i])
+ continue;
+
+ /* new attributes should only be added with strict checking */
+ switch (i) {
+ case IFLA_TARGET_NETNSID:
+ netnsid = nla_get_s32(tb[i]);
+ tgt_net = rtnl_get_net_ns_capable(skb->sk, netnsid);
if (IS_ERR(tgt_net)) {
- tgt_net = net;
- netnsid = -1;
+ NL_SET_ERR_MSG(extack, "Invalid target network namespace id");
+ return PTR_ERR(tgt_net);
+ }
+ break;
+ case IFLA_EXT_MASK:
+ ext_filter_mask = nla_get_u32(tb[i]);
+ break;
+ case IFLA_MASTER:
+ master_idx = nla_get_u32(tb[i]);
+ break;
+ case IFLA_LINKINFO:
+ kind_ops = linkinfo_to_kind_ops(tb[i]);
+ break;
+ default:
+ if (cb->strict_check) {
+ NL_SET_ERR_MSG(extack, "Unsupported attribute in link dump request");
+ return -EINVAL;
}
}
-
- if (tb[IFLA_EXT_MASK])
- ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]);
-
- if (tb[IFLA_MASTER])
- master_idx = nla_get_u32(tb[IFLA_MASTER]);
-
- if (tb[IFLA_LINKINFO])
- kind_ops = linkinfo_to_kind_ops(tb[IFLA_LINKINFO]);
-
- if (master_idx || kind_ops)
- flags |= NLM_F_DUMP_FILTERED;
}
+ if (master_idx || kind_ops)
+ flags |= NLM_F_DUMP_FILTERED;
+
+walk_entries:
for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
idx = 0;
head = &tgt_net->dev_index_head[h];
@@ -1878,8 +1995,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
err = rtnl_fill_ifinfo(skb, dev, net,
RTM_NEWLINK,
NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, 0,
- flags,
+ nlh->nlmsg_seq, 0, flags,
ext_filter_mask, 0, NULL, 0,
netnsid);
@@ -1934,7 +2050,7 @@ EXPORT_SYMBOL(rtnl_link_get_net);
*
* 1. IFLA_NET_NS_PID
* 2. IFLA_NET_NS_FD
- * 3. IFLA_IF_NETNSID
+ * 3. IFLA_TARGET_NETNSID
*/
static struct net *rtnl_link_get_net_by_nlattr(struct net *src_net,
struct nlattr *tb[])
@@ -1944,10 +2060,10 @@ static struct net *rtnl_link_get_net_by_nlattr(struct net *src_net,
if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD])
return rtnl_link_get_net(src_net, tb);
- if (!tb[IFLA_IF_NETNSID])
+ if (!tb[IFLA_TARGET_NETNSID])
return get_net(src_net);
- net = get_net_ns_by_id(src_net, nla_get_u32(tb[IFLA_IF_NETNSID]));
+ net = get_net_ns_by_id(src_net, nla_get_u32(tb[IFLA_TARGET_NETNSID]));
if (!net)
return ERR_PTR(-EINVAL);
@@ -1988,13 +2104,13 @@ static int rtnl_ensure_unique_netns(struct nlattr *tb[],
return -EOPNOTSUPP;
}
- if (tb[IFLA_IF_NETNSID] && (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD]))
+ if (tb[IFLA_TARGET_NETNSID] && (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD]))
goto invalid_attr;
- if (tb[IFLA_NET_NS_PID] && (tb[IFLA_IF_NETNSID] || tb[IFLA_NET_NS_FD]))
+ if (tb[IFLA_NET_NS_PID] && (tb[IFLA_TARGET_NETNSID] || tb[IFLA_NET_NS_FD]))
goto invalid_attr;
- if (tb[IFLA_NET_NS_FD] && (tb[IFLA_IF_NETNSID] || tb[IFLA_NET_NS_PID]))
+ if (tb[IFLA_NET_NS_FD] && (tb[IFLA_TARGET_NETNSID] || tb[IFLA_NET_NS_PID]))
goto invalid_attr;
return 0;
@@ -2270,7 +2386,7 @@ static int do_setlink(const struct sk_buff *skb,
if (err < 0)
return err;
- if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD] || tb[IFLA_IF_NETNSID]) {
+ if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD] || tb[IFLA_TARGET_NETNSID]) {
struct net *net = rtnl_link_get_net_capable(skb, dev_net(dev),
tb, CAP_NET_ADMIN);
if (IS_ERR(net)) {
@@ -2336,7 +2452,7 @@ static int do_setlink(const struct sk_buff *skb,
}
if (tb[IFLA_MTU]) {
- err = dev_set_mtu(dev, nla_get_u32(tb[IFLA_MTU]));
+ err = dev_set_mtu_ext(dev, nla_get_u32(tb[IFLA_MTU]), extack);
if (err < 0)
goto errout;
status |= DO_SETLINK_MODIFIED;
@@ -2713,9 +2829,9 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
if (tb[IFLA_IFNAME])
nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
- if (tb[IFLA_IF_NETNSID]) {
- netnsid = nla_get_s32(tb[IFLA_IF_NETNSID]);
- tgt_net = get_target_net(NETLINK_CB(skb).sk, netnsid);
+ if (tb[IFLA_TARGET_NETNSID]) {
+ netnsid = nla_get_s32(tb[IFLA_TARGET_NETNSID]);
+ tgt_net = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, netnsid);
if (IS_ERR(tgt_net))
return PTR_ERR(tgt_net);
}
@@ -2760,7 +2876,7 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm)
}
if (dev->rtnl_link_state == RTNL_LINK_INITIALIZED) {
- __dev_notify_flags(dev, old_flags, 0U);
+ __dev_notify_flags(dev, old_flags, (old_flags ^ dev->flags));
} else {
dev->rtnl_link_state = RTNL_LINK_INITIALIZED;
__dev_notify_flags(dev, old_flags, ~0U);
@@ -2787,6 +2903,12 @@ struct net_device *rtnl_create_link(struct net *net,
else if (ops->get_num_rx_queues)
num_rx_queues = ops->get_num_rx_queues();
+ if (num_tx_queues < 1 || num_tx_queues > 4096)
+ return ERR_PTR(-EINVAL);
+
+ if (num_rx_queues < 1 || num_rx_queues > 4096)
+ return ERR_PTR(-EINVAL);
+
dev = alloc_netdev_mqs(ops->priv_size, ifname, name_assign_type,
ops->setup, num_tx_queues, num_rx_queues);
if (!dev)
@@ -3123,9 +3245,9 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh,
if (err < 0)
return err;
- if (tb[IFLA_IF_NETNSID]) {
- netnsid = nla_get_s32(tb[IFLA_IF_NETNSID]);
- tgt_net = get_target_net(NETLINK_CB(skb).sk, netnsid);
+ if (tb[IFLA_TARGET_NETNSID]) {
+ netnsid = nla_get_s32(tb[IFLA_TARGET_NETNSID]);
+ tgt_net = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, netnsid);
if (IS_ERR(tgt_net))
return PTR_ERR(tgt_net);
}
@@ -3210,13 +3332,13 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
{
int idx;
int s_idx = cb->family;
+ int type = cb->nlh->nlmsg_type - RTM_BASE;
if (s_idx == 0)
s_idx = 1;
for (idx = 1; idx <= RTNL_FAMILY_MAX; idx++) {
struct rtnl_link **tab;
- int type = cb->nlh->nlmsg_type-RTM_BASE;
struct rtnl_link *link;
rtnl_dumpit_func dumpit;
@@ -3677,14 +3799,100 @@ out:
}
EXPORT_SYMBOL(ndo_dflt_fdb_dump);
+static int valid_fdb_dump_strict(const struct nlmsghdr *nlh,
+ int *br_idx, int *brport_idx,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[NDA_MAX + 1];
+ struct ndmsg *ndm;
+ int err, i;
+
+ if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) {
+ NL_SET_ERR_MSG(extack, "Invalid header for fdb dump request");
+ return -EINVAL;
+ }
+
+ ndm = nlmsg_data(nlh);
+ if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state ||
+ ndm->ndm_flags || ndm->ndm_type) {
+ NL_SET_ERR_MSG(extack, "Invalid values in header for fbd dump request");
+ return -EINVAL;
+ }
+
+ err = nlmsg_parse_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX,
+ NULL, extack);
+ if (err < 0)
+ return err;
+
+ *brport_idx = ndm->ndm_ifindex;
+ for (i = 0; i <= NDA_MAX; ++i) {
+ if (!tb[i])
+ continue;
+
+ switch (i) {
+ case NDA_IFINDEX:
+ if (nla_len(tb[i]) != sizeof(u32)) {
+ NL_SET_ERR_MSG(extack, "Invalid IFINDEX attribute in fdb dump request");
+ return -EINVAL;
+ }
+ *brport_idx = nla_get_u32(tb[NDA_IFINDEX]);
+ break;
+ case NDA_MASTER:
+ if (nla_len(tb[i]) != sizeof(u32)) {
+ NL_SET_ERR_MSG(extack, "Invalid MASTER attribute in fdb dump request");
+ return -EINVAL;
+ }
+ *br_idx = nla_get_u32(tb[NDA_MASTER]);
+ break;
+ default:
+ NL_SET_ERR_MSG(extack, "Unsupported attribute in fdb dump request");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int valid_fdb_dump_legacy(const struct nlmsghdr *nlh,
+ int *br_idx, int *brport_idx,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[IFLA_MAX+1];
+ int err;
+
+ /* A hack to preserve kernel<->userspace interface.
+ * Before Linux v4.12 this code accepted ndmsg since iproute2 v3.3.0.
+ * However, ndmsg is shorter than ifinfomsg thus nlmsg_parse() bails.
+ * So, check for ndmsg with an optional u32 attribute (not used here).
+ * Fortunately these sizes don't conflict with the size of ifinfomsg
+ * with an optional attribute.
+ */
+ if (nlmsg_len(nlh) != sizeof(struct ndmsg) &&
+ (nlmsg_len(nlh) != sizeof(struct ndmsg) +
+ nla_attr_size(sizeof(u32)))) {
+ struct ifinfomsg *ifm;
+
+ err = nlmsg_parse(nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX,
+ ifla_policy, extack);
+ if (err < 0) {
+ return -EINVAL;
+ } else if (err == 0) {
+ if (tb[IFLA_MASTER])
+ *br_idx = nla_get_u32(tb[IFLA_MASTER]);
+ }
+
+ ifm = nlmsg_data(nlh);
+ *brport_idx = ifm->ifi_index;
+ }
+ return 0;
+}
+
static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
struct net_device *dev;
- struct nlattr *tb[IFLA_MAX+1];
struct net_device *br_dev = NULL;
const struct net_device_ops *ops = NULL;
const struct net_device_ops *cops = NULL;
- struct ifinfomsg *ifm = nlmsg_data(cb->nlh);
struct net *net = sock_net(skb->sk);
struct hlist_head *head;
int brport_idx = 0;
@@ -3694,16 +3902,14 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
int err = 0;
int fidx = 0;
- err = nlmsg_parse(cb->nlh, sizeof(struct ifinfomsg), tb,
- IFLA_MAX, ifla_policy, NULL);
- if (err < 0) {
- return -EINVAL;
- } else if (err == 0) {
- if (tb[IFLA_MASTER])
- br_idx = nla_get_u32(tb[IFLA_MASTER]);
- }
-
- brport_idx = ifm->ifi_index;
+ if (cb->strict_check)
+ err = valid_fdb_dump_strict(cb->nlh, &br_idx, &brport_idx,
+ cb->extack);
+ else
+ err = valid_fdb_dump_legacy(cb->nlh, &br_idx, &brport_idx,
+ cb->extack);
+ if (err < 0)
+ return err;
if (br_idx) {
br_dev = __dev_get_by_index(net, br_idx);
@@ -3888,28 +4094,72 @@ nla_put_failure:
}
EXPORT_SYMBOL_GPL(ndo_dflt_bridge_getlink);
+static int valid_bridge_getlink_req(const struct nlmsghdr *nlh,
+ bool strict_check, u32 *filter_mask,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[IFLA_MAX+1];
+ int err, i;
+
+ if (strict_check) {
+ struct ifinfomsg *ifm;
+
+ if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) {
+ NL_SET_ERR_MSG(extack, "Invalid header for bridge link dump");
+ return -EINVAL;
+ }
+
+ ifm = nlmsg_data(nlh);
+ if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags ||
+ ifm->ifi_change || ifm->ifi_index) {
+ NL_SET_ERR_MSG(extack, "Invalid values in header for bridge link dump request");
+ return -EINVAL;
+ }
+
+ err = nlmsg_parse_strict(nlh, sizeof(struct ifinfomsg), tb,
+ IFLA_MAX, ifla_policy, extack);
+ } else {
+ err = nlmsg_parse(nlh, sizeof(struct ifinfomsg), tb,
+ IFLA_MAX, ifla_policy, extack);
+ }
+ if (err < 0)
+ return err;
+
+ /* new attributes should only be added with strict checking */
+ for (i = 0; i <= IFLA_MAX; ++i) {
+ if (!tb[i])
+ continue;
+
+ switch (i) {
+ case IFLA_EXT_MASK:
+ *filter_mask = nla_get_u32(tb[i]);
+ break;
+ default:
+ if (strict_check) {
+ NL_SET_ERR_MSG(extack, "Unsupported attribute in bridge link dump request");
+ return -EINVAL;
+ }
+ }
+ }
+
+ return 0;
+}
+
static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb)
{
+ const struct nlmsghdr *nlh = cb->nlh;
struct net *net = sock_net(skb->sk);
struct net_device *dev;
int idx = 0;
u32 portid = NETLINK_CB(cb->skb).portid;
- u32 seq = cb->nlh->nlmsg_seq;
+ u32 seq = nlh->nlmsg_seq;
u32 filter_mask = 0;
int err;
- if (nlmsg_len(cb->nlh) > sizeof(struct ifinfomsg)) {
- struct nlattr *extfilt;
-
- extfilt = nlmsg_find_attr(cb->nlh, sizeof(struct ifinfomsg),
- IFLA_EXT_MASK);
- if (extfilt) {
- if (nla_len(extfilt) < sizeof(filter_mask))
- return -EINVAL;
-
- filter_mask = nla_get_u32(extfilt);
- }
- }
+ err = valid_bridge_getlink_req(nlh, cb->strict_check, &filter_mask,
+ cb->extack);
+ if (err < 0 && cb->strict_check)
+ return err;
rcu_read_lock();
for_each_netdev_rcu(net, dev) {
@@ -4503,6 +4753,7 @@ static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh,
static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
+ struct netlink_ext_ack *extack = cb->extack;
int h, s_h, err, s_idx, s_idxattr, s_prividx;
struct net *net = sock_net(skb->sk);
unsigned int flags = NLM_F_MULTI;
@@ -4519,13 +4770,32 @@ static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb)
cb->seq = net->dev_base_seq;
- if (nlmsg_len(cb->nlh) < sizeof(*ifsm))
+ if (nlmsg_len(cb->nlh) < sizeof(*ifsm)) {
+ NL_SET_ERR_MSG(extack, "Invalid header for stats dump");
return -EINVAL;
+ }
ifsm = nlmsg_data(cb->nlh);
+
+ /* only requests using strict checks can pass data to influence
+ * the dump. The legacy exception is filter_mask.
+ */
+ if (cb->strict_check) {
+ if (ifsm->pad1 || ifsm->pad2 || ifsm->ifindex) {
+ NL_SET_ERR_MSG(extack, "Invalid values in header for stats dump request");
+ return -EINVAL;
+ }
+ if (nlmsg_attrlen(cb->nlh, sizeof(*ifsm))) {
+ NL_SET_ERR_MSG(extack, "Invalid attributes after stats header");
+ return -EINVAL;
+ }
+ }
+
filter_mask = ifsm->filter_mask;
- if (!filter_mask)
+ if (!filter_mask) {
+ NL_SET_ERR_MSG(extack, "Filter mask must be set for stats dump");
return -EINVAL;
+ }
for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
idx = 0;
diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
index 7232274de334..af6ad467ed61 100644
--- a/net/core/secure_seq.c
+++ b/net/core/secure_seq.c
@@ -140,6 +140,7 @@ u32 secure_tcp_seq(__be32 saddr, __be32 daddr,
&net_secret);
return seq_scale(hash);
}
+EXPORT_SYMBOL_GPL(secure_tcp_seq);
u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport)
{
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index fb35b62af272..946de0e24c87 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -939,9 +939,6 @@ struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size)
WARN_ON_ONCE(!in_task());
- if (!sock_flag(sk, SOCK_ZEROCOPY))
- return NULL;
-
skb = sock_omalloc(sk, 0, GFP_KERNEL);
if (!skb)
return NULL;
@@ -1291,7 +1288,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
}
EXPORT_SYMBOL(skb_clone);
-static void skb_headers_offset_update(struct sk_buff *skb, int off)
+void skb_headers_offset_update(struct sk_buff *skb, int off)
{
/* Only adjust this if it actually is csum_start rather than csum */
if (skb->ip_summed == CHECKSUM_PARTIAL)
@@ -1305,6 +1302,7 @@ static void skb_headers_offset_update(struct sk_buff *skb, int off)
skb->inner_network_header += off;
skb->inner_mac_header += off;
}
+EXPORT_SYMBOL(skb_headers_offset_update);
void skb_copy_header(struct sk_buff *new, const struct sk_buff *old)
{
@@ -1715,7 +1713,7 @@ void *skb_push(struct sk_buff *skb, unsigned int len)
{
skb->data -= len;
skb->len += len;
- if (unlikely(skb->data<skb->head))
+ if (unlikely(skb->data < skb->head))
skb_under_panic(skb, len, __builtin_return_address(0));
return skb->data;
}
@@ -1848,8 +1846,9 @@ int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len)
if (skb->ip_summed == CHECKSUM_COMPLETE) {
int delta = skb->len - len;
- skb->csum = csum_sub(skb->csum,
- skb_checksum(skb, len, delta, 0));
+ skb->csum = csum_block_sub(skb->csum,
+ skb_checksum(skb, len, delta, 0),
+ len);
}
return __pskb_trim(skb, len);
}
@@ -2858,23 +2857,27 @@ EXPORT_SYMBOL(skb_queue_purge);
/**
* skb_rbtree_purge - empty a skb rbtree
* @root: root of the rbtree to empty
+ * Return value: the sum of truesizes of all purged skbs.
*
* Delete all buffers on an &sk_buff rbtree. Each buffer is removed from
* the list and one reference dropped. This function does not take
* any lock. Synchronization should be handled by the caller (e.g., TCP
* out-of-order queue is protected by the socket lock).
*/
-void skb_rbtree_purge(struct rb_root *root)
+unsigned int skb_rbtree_purge(struct rb_root *root)
{
struct rb_node *p = rb_first(root);
+ unsigned int sum = 0;
while (p) {
struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);
p = rb_next(p);
rb_erase(&skb->rbnode, root);
+ sum += skb->truesize;
kfree_skb(skb);
}
+ return sum;
}
/**
@@ -3379,64 +3382,6 @@ unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
}
EXPORT_SYMBOL(skb_find_text);
-/**
- * skb_append_datato_frags - append the user data to a skb
- * @sk: sock structure
- * @skb: skb structure to be appended with user data.
- * @getfrag: call back function to be used for getting the user data
- * @from: pointer to user message iov
- * @length: length of the iov message
- *
- * Description: This procedure append the user data in the fragment part
- * of the skb if any page alloc fails user this procedure returns -ENOMEM
- */
-int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,
- int (*getfrag)(void *from, char *to, int offset,
- int len, int odd, struct sk_buff *skb),
- void *from, int length)
-{
- int frg_cnt = skb_shinfo(skb)->nr_frags;
- int copy;
- int offset = 0;
- int ret;
- struct page_frag *pfrag = &current->task_frag;
-
- do {
- /* Return error if we don't have space for new frag */
- if (frg_cnt >= MAX_SKB_FRAGS)
- return -EMSGSIZE;
-
- if (!sk_page_frag_refill(sk, pfrag))
- return -ENOMEM;
-
- /* copy the user data to page */
- copy = min_t(int, length, pfrag->size - pfrag->offset);
-
- ret = getfrag(from, page_address(pfrag->page) + pfrag->offset,
- offset, copy, 0, skb);
- if (ret < 0)
- return -EFAULT;
-
- /* copy was successful so update the size parameters */
- skb_fill_page_desc(skb, frg_cnt, pfrag->page, pfrag->offset,
- copy);
- frg_cnt++;
- pfrag->offset += copy;
- get_page(pfrag->page);
-
- skb->truesize += copy;
- refcount_add(copy, &sk->sk_wmem_alloc);
- skb->len += copy;
- skb->data_len += copy;
- offset += copy;
- length -= copy;
-
- } while (length > 0);
-
- return 0;
-}
-EXPORT_SYMBOL(skb_append_datato_frags);
-
int skb_append_pagefrags(struct sk_buff *skb, struct page *page,
int offset, size_t size)
{
@@ -3816,14 +3761,14 @@ err:
}
EXPORT_SYMBOL_GPL(skb_segment);
-int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
+int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
{
struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb);
unsigned int offset = skb_gro_offset(skb);
unsigned int headlen = skb_headlen(skb);
unsigned int len = skb_gro_len(skb);
- struct sk_buff *lp, *p = *head;
unsigned int delta_truesize;
+ struct sk_buff *lp;
if (unlikely(p->len + len >= 65536))
return -E2BIG;
@@ -4450,14 +4395,16 @@ EXPORT_SYMBOL_GPL(skb_complete_wifi_ack);
*/
bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off)
{
- if (unlikely(start > skb_headlen(skb)) ||
- unlikely((int)start + off > skb_headlen(skb) - 2)) {
- net_warn_ratelimited("bad partial csum: csum=%u/%u len=%u\n",
- start, off, skb_headlen(skb));
+ u32 csum_end = (u32)start + (u32)off + sizeof(__sum16);
+ u32 csum_start = skb_headroom(skb) + (u32)start;
+
+ if (unlikely(csum_start > U16_MAX || csum_end > skb_headlen(skb))) {
+ net_warn_ratelimited("bad partial csum: csum=%u/%u headroom=%u headlen=%u\n",
+ start, off, skb_headroom(skb), skb_headlen(skb));
return false;
}
skb->ip_summed = CHECKSUM_PARTIAL;
- skb->csum_start = skb_headroom(skb) + start;
+ skb->csum_start = csum_start;
skb->csum_offset = off;
skb_set_transport_header(skb, start);
return true;
@@ -4899,7 +4846,6 @@ EXPORT_SYMBOL(skb_try_coalesce);
*/
void skb_scrub_packet(struct sk_buff *skb, bool xnet)
{
- skb->tstamp = 0;
skb->pkt_type = PACKET_HOST;
skb->skb_iif = 0;
skb->ignore_df = 0;
@@ -4912,8 +4858,8 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet)
return;
ipvs_reset(skb);
- skb_orphan(skb);
skb->mark = 0;
+ skb->tstamp = 0;
}
EXPORT_SYMBOL_GPL(skb_scrub_packet);
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
new file mode 100644
index 000000000000..56a99d0c9aa0
--- /dev/null
+++ b/net/core/skmsg.c
@@ -0,0 +1,802 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */
+
+#include <linux/skmsg.h>
+#include <linux/skbuff.h>
+#include <linux/scatterlist.h>
+
+#include <net/sock.h>
+#include <net/tcp.h>
+
+static bool sk_msg_try_coalesce_ok(struct sk_msg *msg, int elem_first_coalesce)
+{
+ if (msg->sg.end > msg->sg.start &&
+ elem_first_coalesce < msg->sg.end)
+ return true;
+
+ if (msg->sg.end < msg->sg.start &&
+ (elem_first_coalesce > msg->sg.start ||
+ elem_first_coalesce < msg->sg.end))
+ return true;
+
+ return false;
+}
+
+int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len,
+ int elem_first_coalesce)
+{
+ struct page_frag *pfrag = sk_page_frag(sk);
+ int ret = 0;
+
+ len -= msg->sg.size;
+ while (len > 0) {
+ struct scatterlist *sge;
+ u32 orig_offset;
+ int use, i;
+
+ if (!sk_page_frag_refill(sk, pfrag))
+ return -ENOMEM;
+
+ orig_offset = pfrag->offset;
+ use = min_t(int, len, pfrag->size - orig_offset);
+ if (!sk_wmem_schedule(sk, use))
+ return -ENOMEM;
+
+ i = msg->sg.end;
+ sk_msg_iter_var_prev(i);
+ sge = &msg->sg.data[i];
+
+ if (sk_msg_try_coalesce_ok(msg, elem_first_coalesce) &&
+ sg_page(sge) == pfrag->page &&
+ sge->offset + sge->length == orig_offset) {
+ sge->length += use;
+ } else {
+ if (sk_msg_full(msg)) {
+ ret = -ENOSPC;
+ break;
+ }
+
+ sge = &msg->sg.data[msg->sg.end];
+ sg_unmark_end(sge);
+ sg_set_page(sge, pfrag->page, use, orig_offset);
+ get_page(pfrag->page);
+ sk_msg_iter_next(msg, end);
+ }
+
+ sk_mem_charge(sk, use);
+ msg->sg.size += use;
+ pfrag->offset += use;
+ len -= use;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(sk_msg_alloc);
+
+int sk_msg_clone(struct sock *sk, struct sk_msg *dst, struct sk_msg *src,
+ u32 off, u32 len)
+{
+ int i = src->sg.start;
+ struct scatterlist *sge = sk_msg_elem(src, i);
+ u32 sge_len, sge_off;
+
+ if (sk_msg_full(dst))
+ return -ENOSPC;
+
+ while (off) {
+ if (sge->length > off)
+ break;
+ off -= sge->length;
+ sk_msg_iter_var_next(i);
+ if (i == src->sg.end && off)
+ return -ENOSPC;
+ sge = sk_msg_elem(src, i);
+ }
+
+ while (len) {
+ sge_len = sge->length - off;
+ sge_off = sge->offset + off;
+ if (sge_len > len)
+ sge_len = len;
+ off = 0;
+ len -= sge_len;
+ sk_msg_page_add(dst, sg_page(sge), sge_len, sge_off);
+ sk_mem_charge(sk, sge_len);
+ sk_msg_iter_var_next(i);
+ if (i == src->sg.end && len)
+ return -ENOSPC;
+ sge = sk_msg_elem(src, i);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(sk_msg_clone);
+
+void sk_msg_return_zero(struct sock *sk, struct sk_msg *msg, int bytes)
+{
+ int i = msg->sg.start;
+
+ do {
+ struct scatterlist *sge = sk_msg_elem(msg, i);
+
+ if (bytes < sge->length) {
+ sge->length -= bytes;
+ sge->offset += bytes;
+ sk_mem_uncharge(sk, bytes);
+ break;
+ }
+
+ sk_mem_uncharge(sk, sge->length);
+ bytes -= sge->length;
+ sge->length = 0;
+ sge->offset = 0;
+ sk_msg_iter_var_next(i);
+ } while (bytes && i != msg->sg.end);
+ msg->sg.start = i;
+}
+EXPORT_SYMBOL_GPL(sk_msg_return_zero);
+
+void sk_msg_return(struct sock *sk, struct sk_msg *msg, int bytes)
+{
+ int i = msg->sg.start;
+
+ do {
+ struct scatterlist *sge = &msg->sg.data[i];
+ int uncharge = (bytes < sge->length) ? bytes : sge->length;
+
+ sk_mem_uncharge(sk, uncharge);
+ bytes -= uncharge;
+ sk_msg_iter_var_next(i);
+ } while (i != msg->sg.end);
+}
+EXPORT_SYMBOL_GPL(sk_msg_return);
+
+static int sk_msg_free_elem(struct sock *sk, struct sk_msg *msg, u32 i,
+ bool charge)
+{
+ struct scatterlist *sge = sk_msg_elem(msg, i);
+ u32 len = sge->length;
+
+ if (charge)
+ sk_mem_uncharge(sk, len);
+ if (!msg->skb)
+ put_page(sg_page(sge));
+ memset(sge, 0, sizeof(*sge));
+ return len;
+}
+
+static int __sk_msg_free(struct sock *sk, struct sk_msg *msg, u32 i,
+ bool charge)
+{
+ struct scatterlist *sge = sk_msg_elem(msg, i);
+ int freed = 0;
+
+ while (msg->sg.size) {
+ msg->sg.size -= sge->length;
+ freed += sk_msg_free_elem(sk, msg, i, charge);
+ sk_msg_iter_var_next(i);
+ sk_msg_check_to_free(msg, i, msg->sg.size);
+ sge = sk_msg_elem(msg, i);
+ }
+ if (msg->skb)
+ consume_skb(msg->skb);
+ sk_msg_init(msg);
+ return freed;
+}
+
+int sk_msg_free_nocharge(struct sock *sk, struct sk_msg *msg)
+{
+ return __sk_msg_free(sk, msg, msg->sg.start, false);
+}
+EXPORT_SYMBOL_GPL(sk_msg_free_nocharge);
+
+int sk_msg_free(struct sock *sk, struct sk_msg *msg)
+{
+ return __sk_msg_free(sk, msg, msg->sg.start, true);
+}
+EXPORT_SYMBOL_GPL(sk_msg_free);
+
+static void __sk_msg_free_partial(struct sock *sk, struct sk_msg *msg,
+ u32 bytes, bool charge)
+{
+ struct scatterlist *sge;
+ u32 i = msg->sg.start;
+
+ while (bytes) {
+ sge = sk_msg_elem(msg, i);
+ if (!sge->length)
+ break;
+ if (bytes < sge->length) {
+ if (charge)
+ sk_mem_uncharge(sk, bytes);
+ sge->length -= bytes;
+ sge->offset += bytes;
+ msg->sg.size -= bytes;
+ break;
+ }
+
+ msg->sg.size -= sge->length;
+ bytes -= sge->length;
+ sk_msg_free_elem(sk, msg, i, charge);
+ sk_msg_iter_var_next(i);
+ sk_msg_check_to_free(msg, i, bytes);
+ }
+ msg->sg.start = i;
+}
+
+void sk_msg_free_partial(struct sock *sk, struct sk_msg *msg, u32 bytes)
+{
+ __sk_msg_free_partial(sk, msg, bytes, true);
+}
+EXPORT_SYMBOL_GPL(sk_msg_free_partial);
+
+void sk_msg_free_partial_nocharge(struct sock *sk, struct sk_msg *msg,
+ u32 bytes)
+{
+ __sk_msg_free_partial(sk, msg, bytes, false);
+}
+
+void sk_msg_trim(struct sock *sk, struct sk_msg *msg, int len)
+{
+ int trim = msg->sg.size - len;
+ u32 i = msg->sg.end;
+
+ if (trim <= 0) {
+ WARN_ON(trim < 0);
+ return;
+ }
+
+ sk_msg_iter_var_prev(i);
+ msg->sg.size = len;
+ while (msg->sg.data[i].length &&
+ trim >= msg->sg.data[i].length) {
+ trim -= msg->sg.data[i].length;
+ sk_msg_free_elem(sk, msg, i, true);
+ sk_msg_iter_var_prev(i);
+ if (!trim)
+ goto out;
+ }
+
+ msg->sg.data[i].length -= trim;
+ sk_mem_uncharge(sk, trim);
+out:
+ /* If we trim data before curr pointer update copybreak and current
+ * so that any future copy operations start at new copy location.
+ * However trimed data that has not yet been used in a copy op
+ * does not require an update.
+ */
+ if (msg->sg.curr >= i) {
+ msg->sg.curr = i;
+ msg->sg.copybreak = msg->sg.data[i].length;
+ }
+ sk_msg_iter_var_next(i);
+ msg->sg.end = i;
+}
+EXPORT_SYMBOL_GPL(sk_msg_trim);
+
+int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from,
+ struct sk_msg *msg, u32 bytes)
+{
+ int i, maxpages, ret = 0, num_elems = sk_msg_elem_used(msg);
+ const int to_max_pages = MAX_MSG_FRAGS;
+ struct page *pages[MAX_MSG_FRAGS];
+ ssize_t orig, copied, use, offset;
+
+ orig = msg->sg.size;
+ while (bytes > 0) {
+ i = 0;
+ maxpages = to_max_pages - num_elems;
+ if (maxpages == 0) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ copied = iov_iter_get_pages(from, pages, bytes, maxpages,
+ &offset);
+ if (copied <= 0) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ iov_iter_advance(from, copied);
+ bytes -= copied;
+ msg->sg.size += copied;
+
+ while (copied) {
+ use = min_t(int, copied, PAGE_SIZE - offset);
+ sg_set_page(&msg->sg.data[msg->sg.end],
+ pages[i], use, offset);
+ sg_unmark_end(&msg->sg.data[msg->sg.end]);
+ sk_mem_charge(sk, use);
+
+ offset = 0;
+ copied -= use;
+ sk_msg_iter_next(msg, end);
+ num_elems++;
+ i++;
+ }
+ /* When zerocopy is mixed with sk_msg_*copy* operations we
+ * may have a copybreak set in this case clear and prefer
+ * zerocopy remainder when possible.
+ */
+ msg->sg.copybreak = 0;
+ msg->sg.curr = msg->sg.end;
+ }
+out:
+ /* Revert iov_iter updates, msg will need to use 'trim' later if it
+ * also needs to be cleared.
+ */
+ if (ret)
+ iov_iter_revert(from, msg->sg.size - orig);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(sk_msg_zerocopy_from_iter);
+
+int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from,
+ struct sk_msg *msg, u32 bytes)
+{
+ int ret = -ENOSPC, i = msg->sg.curr;
+ struct scatterlist *sge;
+ u32 copy, buf_size;
+ void *to;
+
+ do {
+ sge = sk_msg_elem(msg, i);
+ /* This is possible if a trim operation shrunk the buffer */
+ if (msg->sg.copybreak >= sge->length) {
+ msg->sg.copybreak = 0;
+ sk_msg_iter_var_next(i);
+ if (i == msg->sg.end)
+ break;
+ sge = sk_msg_elem(msg, i);
+ }
+
+ buf_size = sge->length - msg->sg.copybreak;
+ copy = (buf_size > bytes) ? bytes : buf_size;
+ to = sg_virt(sge) + msg->sg.copybreak;
+ msg->sg.copybreak += copy;
+ if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY)
+ ret = copy_from_iter_nocache(to, copy, from);
+ else
+ ret = copy_from_iter(to, copy, from);
+ if (ret != copy) {
+ ret = -EFAULT;
+ goto out;
+ }
+ bytes -= copy;
+ if (!bytes)
+ break;
+ msg->sg.copybreak = 0;
+ sk_msg_iter_var_next(i);
+ } while (i != msg->sg.end);
+out:
+ msg->sg.curr = i;
+ return ret;
+}
+EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter);
+
+static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb)
+{
+ struct sock *sk = psock->sk;
+ int copied = 0, num_sge;
+ struct sk_msg *msg;
+
+ msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_ATOMIC);
+ if (unlikely(!msg))
+ return -EAGAIN;
+ if (!sk_rmem_schedule(sk, skb, skb->len)) {
+ kfree(msg);
+ return -EAGAIN;
+ }
+
+ sk_msg_init(msg);
+ num_sge = skb_to_sgvec(skb, msg->sg.data, 0, skb->len);
+ if (unlikely(num_sge < 0)) {
+ kfree(msg);
+ return num_sge;
+ }
+
+ sk_mem_charge(sk, skb->len);
+ copied = skb->len;
+ msg->sg.start = 0;
+ msg->sg.end = num_sge == MAX_MSG_FRAGS ? 0 : num_sge;
+ msg->skb = skb;
+
+ sk_psock_queue_msg(psock, msg);
+ sk->sk_data_ready(sk);
+ return copied;
+}
+
+static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
+ u32 off, u32 len, bool ingress)
+{
+ if (ingress)
+ return sk_psock_skb_ingress(psock, skb);
+ else
+ return skb_send_sock_locked(psock->sk, skb, off, len);
+}
+
+static void sk_psock_backlog(struct work_struct *work)
+{
+ struct sk_psock *psock = container_of(work, struct sk_psock, work);
+ struct sk_psock_work_state *state = &psock->work_state;
+ struct sk_buff *skb;
+ bool ingress;
+ u32 len, off;
+ int ret;
+
+ /* Lock sock to avoid losing sk_socket during loop. */
+ lock_sock(psock->sk);
+ if (state->skb) {
+ skb = state->skb;
+ len = state->len;
+ off = state->off;
+ state->skb = NULL;
+ goto start;
+ }
+
+ while ((skb = skb_dequeue(&psock->ingress_skb))) {
+ len = skb->len;
+ off = 0;
+start:
+ ingress = tcp_skb_bpf_ingress(skb);
+ do {
+ ret = -EIO;
+ if (likely(psock->sk->sk_socket))
+ ret = sk_psock_handle_skb(psock, skb, off,
+ len, ingress);
+ if (ret <= 0) {
+ if (ret == -EAGAIN) {
+ state->skb = skb;
+ state->len = len;
+ state->off = off;
+ goto end;
+ }
+ /* Hard errors break pipe and stop xmit. */
+ sk_psock_report_error(psock, ret ? -ret : EPIPE);
+ sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
+ kfree_skb(skb);
+ goto end;
+ }
+ off += ret;
+ len -= ret;
+ } while (len);
+
+ if (!ingress)
+ kfree_skb(skb);
+ }
+end:
+ release_sock(psock->sk);
+}
+
+struct sk_psock *sk_psock_init(struct sock *sk, int node)
+{
+ struct sk_psock *psock = kzalloc_node(sizeof(*psock),
+ GFP_ATOMIC | __GFP_NOWARN,
+ node);
+ if (!psock)
+ return NULL;
+
+ psock->sk = sk;
+ psock->eval = __SK_NONE;
+
+ INIT_LIST_HEAD(&psock->link);
+ spin_lock_init(&psock->link_lock);
+
+ INIT_WORK(&psock->work, sk_psock_backlog);
+ INIT_LIST_HEAD(&psock->ingress_msg);
+ skb_queue_head_init(&psock->ingress_skb);
+
+ sk_psock_set_state(psock, SK_PSOCK_TX_ENABLED);
+ refcount_set(&psock->refcnt, 1);
+
+ rcu_assign_sk_user_data(sk, psock);
+ sock_hold(sk);
+
+ return psock;
+}
+EXPORT_SYMBOL_GPL(sk_psock_init);
+
+struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock)
+{
+ struct sk_psock_link *link;
+
+ spin_lock_bh(&psock->link_lock);
+ link = list_first_entry_or_null(&psock->link, struct sk_psock_link,
+ list);
+ if (link)
+ list_del(&link->list);
+ spin_unlock_bh(&psock->link_lock);
+ return link;
+}
+
+void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
+{
+ struct sk_msg *msg, *tmp;
+
+ list_for_each_entry_safe(msg, tmp, &psock->ingress_msg, list) {
+ list_del(&msg->list);
+ sk_msg_free(psock->sk, msg);
+ kfree(msg);
+ }
+}
+
+static void sk_psock_zap_ingress(struct sk_psock *psock)
+{
+ __skb_queue_purge(&psock->ingress_skb);
+ __sk_psock_purge_ingress_msg(psock);
+}
+
+static void sk_psock_link_destroy(struct sk_psock *psock)
+{
+ struct sk_psock_link *link, *tmp;
+
+ list_for_each_entry_safe(link, tmp, &psock->link, list) {
+ list_del(&link->list);
+ sk_psock_free_link(link);
+ }
+}
+
+static void sk_psock_destroy_deferred(struct work_struct *gc)
+{
+ struct sk_psock *psock = container_of(gc, struct sk_psock, gc);
+
+ /* No sk_callback_lock since already detached. */
+ if (psock->parser.enabled)
+ strp_done(&psock->parser.strp);
+
+ cancel_work_sync(&psock->work);
+
+ psock_progs_drop(&psock->progs);
+
+ sk_psock_link_destroy(psock);
+ sk_psock_cork_free(psock);
+ sk_psock_zap_ingress(psock);
+
+ if (psock->sk_redir)
+ sock_put(psock->sk_redir);
+ sock_put(psock->sk);
+ kfree(psock);
+}
+
+void sk_psock_destroy(struct rcu_head *rcu)
+{
+ struct sk_psock *psock = container_of(rcu, struct sk_psock, rcu);
+
+ INIT_WORK(&psock->gc, sk_psock_destroy_deferred);
+ schedule_work(&psock->gc);
+}
+EXPORT_SYMBOL_GPL(sk_psock_destroy);
+
+void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
+{
+ rcu_assign_sk_user_data(sk, NULL);
+ sk_psock_cork_free(psock);
+ sk_psock_restore_proto(sk, psock);
+
+ write_lock_bh(&sk->sk_callback_lock);
+ if (psock->progs.skb_parser)
+ sk_psock_stop_strp(sk, psock);
+ write_unlock_bh(&sk->sk_callback_lock);
+ sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
+
+ call_rcu_sched(&psock->rcu, sk_psock_destroy);
+}
+EXPORT_SYMBOL_GPL(sk_psock_drop);
+
+static int sk_psock_map_verd(int verdict, bool redir)
+{
+ switch (verdict) {
+ case SK_PASS:
+ return redir ? __SK_REDIRECT : __SK_PASS;
+ case SK_DROP:
+ default:
+ break;
+ }
+
+ return __SK_DROP;
+}
+
+int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock,
+ struct sk_msg *msg)
+{
+ struct bpf_prog *prog;
+ int ret;
+
+ preempt_disable();
+ rcu_read_lock();
+ prog = READ_ONCE(psock->progs.msg_parser);
+ if (unlikely(!prog)) {
+ ret = __SK_PASS;
+ goto out;
+ }
+
+ sk_msg_compute_data_pointers(msg);
+ msg->sk = sk;
+ ret = BPF_PROG_RUN(prog, msg);
+ ret = sk_psock_map_verd(ret, msg->sk_redir);
+ psock->apply_bytes = msg->apply_bytes;
+ if (ret == __SK_REDIRECT) {
+ if (psock->sk_redir)
+ sock_put(psock->sk_redir);
+ psock->sk_redir = msg->sk_redir;
+ if (!psock->sk_redir) {
+ ret = __SK_DROP;
+ goto out;
+ }
+ sock_hold(psock->sk_redir);
+ }
+out:
+ rcu_read_unlock();
+ preempt_enable();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(sk_psock_msg_verdict);
+
+static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog,
+ struct sk_buff *skb)
+{
+ int ret;
+
+ skb->sk = psock->sk;
+ bpf_compute_data_end_sk_skb(skb);
+ preempt_disable();
+ ret = BPF_PROG_RUN(prog, skb);
+ preempt_enable();
+ /* strparser clones the skb before handing it to a upper layer,
+ * meaning skb_orphan has been called. We NULL sk on the way out
+ * to ensure we don't trigger a BUG_ON() in skb/sk operations
+ * later and because we are not charging the memory of this skb
+ * to any socket yet.
+ */
+ skb->sk = NULL;
+ return ret;
+}
+
+static struct sk_psock *sk_psock_from_strp(struct strparser *strp)
+{
+ struct sk_psock_parser *parser;
+
+ parser = container_of(strp, struct sk_psock_parser, strp);
+ return container_of(parser, struct sk_psock, parser);
+}
+
+static void sk_psock_verdict_apply(struct sk_psock *psock,
+ struct sk_buff *skb, int verdict)
+{
+ struct sk_psock *psock_other;
+ struct sock *sk_other;
+ bool ingress;
+
+ switch (verdict) {
+ case __SK_REDIRECT:
+ sk_other = tcp_skb_bpf_redirect_fetch(skb);
+ if (unlikely(!sk_other))
+ goto out_free;
+ psock_other = sk_psock(sk_other);
+ if (!psock_other || sock_flag(sk_other, SOCK_DEAD) ||
+ !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED))
+ goto out_free;
+ ingress = tcp_skb_bpf_ingress(skb);
+ if ((!ingress && sock_writeable(sk_other)) ||
+ (ingress &&
+ atomic_read(&sk_other->sk_rmem_alloc) <=
+ sk_other->sk_rcvbuf)) {
+ if (!ingress)
+ skb_set_owner_w(skb, sk_other);
+ skb_queue_tail(&psock_other->ingress_skb, skb);
+ schedule_work(&psock_other->work);
+ break;
+ }
+ /* fall-through */
+ case __SK_DROP:
+ /* fall-through */
+ default:
+out_free:
+ kfree_skb(skb);
+ }
+}
+
+static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb)
+{
+ struct sk_psock *psock = sk_psock_from_strp(strp);
+ struct bpf_prog *prog;
+ int ret = __SK_DROP;
+
+ rcu_read_lock();
+ prog = READ_ONCE(psock->progs.skb_verdict);
+ if (likely(prog)) {
+ skb_orphan(skb);
+ tcp_skb_bpf_redirect_clear(skb);
+ ret = sk_psock_bpf_run(psock, prog, skb);
+ ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
+ }
+ rcu_read_unlock();
+ sk_psock_verdict_apply(psock, skb, ret);
+}
+
+static int sk_psock_strp_read_done(struct strparser *strp, int err)
+{
+ return err;
+}
+
+static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb)
+{
+ struct sk_psock *psock = sk_psock_from_strp(strp);
+ struct bpf_prog *prog;
+ int ret = skb->len;
+
+ rcu_read_lock();
+ prog = READ_ONCE(psock->progs.skb_parser);
+ if (likely(prog))
+ ret = sk_psock_bpf_run(psock, prog, skb);
+ rcu_read_unlock();
+ return ret;
+}
+
+/* Called with socket lock held. */
+static void sk_psock_data_ready(struct sock *sk)
+{
+ struct sk_psock *psock;
+
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (likely(psock)) {
+ write_lock_bh(&sk->sk_callback_lock);
+ strp_data_ready(&psock->parser.strp);
+ write_unlock_bh(&sk->sk_callback_lock);
+ }
+ rcu_read_unlock();
+}
+
+static void sk_psock_write_space(struct sock *sk)
+{
+ struct sk_psock *psock;
+ void (*write_space)(struct sock *sk);
+
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (likely(psock && sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)))
+ schedule_work(&psock->work);
+ write_space = psock->saved_write_space;
+ rcu_read_unlock();
+ write_space(sk);
+}
+
+int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock)
+{
+ static const struct strp_callbacks cb = {
+ .rcv_msg = sk_psock_strp_read,
+ .read_sock_done = sk_psock_strp_read_done,
+ .parse_msg = sk_psock_strp_parse,
+ };
+
+ psock->parser.enabled = false;
+ return strp_init(&psock->parser.strp, sk, &cb);
+}
+
+void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock)
+{
+ struct sk_psock_parser *parser = &psock->parser;
+
+ if (parser->enabled)
+ return;
+
+ parser->saved_data_ready = sk->sk_data_ready;
+ sk->sk_data_ready = sk_psock_data_ready;
+ sk->sk_write_space = sk_psock_write_space;
+ parser->enabled = true;
+}
+
+void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock)
+{
+ struct sk_psock_parser *parser = &psock->parser;
+
+ if (!parser->enabled)
+ return;
+
+ sk->sk_data_ready = parser->saved_data_ready;
+ parser->saved_data_ready = NULL;
+ strp_stop(&parser->strp);
+ parser->enabled = false;
+}
diff --git a/net/core/sock.c b/net/core/sock.c
index bc2d7a37297f..6fcc4bc07d19 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -91,6 +91,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <asm/unaligned.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/errqueue.h>
@@ -249,58 +250,13 @@ static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
_sock_locks("k-clock-")
};
static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
- "rlock-AF_UNSPEC", "rlock-AF_UNIX" , "rlock-AF_INET" ,
- "rlock-AF_AX25" , "rlock-AF_IPX" , "rlock-AF_APPLETALK",
- "rlock-AF_NETROM", "rlock-AF_BRIDGE" , "rlock-AF_ATMPVC" ,
- "rlock-AF_X25" , "rlock-AF_INET6" , "rlock-AF_ROSE" ,
- "rlock-AF_DECnet", "rlock-AF_NETBEUI" , "rlock-AF_SECURITY" ,
- "rlock-AF_KEY" , "rlock-AF_NETLINK" , "rlock-AF_PACKET" ,
- "rlock-AF_ASH" , "rlock-AF_ECONET" , "rlock-AF_ATMSVC" ,
- "rlock-AF_RDS" , "rlock-AF_SNA" , "rlock-AF_IRDA" ,
- "rlock-AF_PPPOX" , "rlock-AF_WANPIPE" , "rlock-AF_LLC" ,
- "rlock-27" , "rlock-28" , "rlock-AF_CAN" ,
- "rlock-AF_TIPC" , "rlock-AF_BLUETOOTH", "rlock-AF_IUCV" ,
- "rlock-AF_RXRPC" , "rlock-AF_ISDN" , "rlock-AF_PHONET" ,
- "rlock-AF_IEEE802154", "rlock-AF_CAIF" , "rlock-AF_ALG" ,
- "rlock-AF_NFC" , "rlock-AF_VSOCK" , "rlock-AF_KCM" ,
- "rlock-AF_QIPCRTR", "rlock-AF_SMC" , "rlock-AF_XDP" ,
- "rlock-AF_MAX"
+ _sock_locks("rlock-")
};
static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
- "wlock-AF_UNSPEC", "wlock-AF_UNIX" , "wlock-AF_INET" ,
- "wlock-AF_AX25" , "wlock-AF_IPX" , "wlock-AF_APPLETALK",
- "wlock-AF_NETROM", "wlock-AF_BRIDGE" , "wlock-AF_ATMPVC" ,
- "wlock-AF_X25" , "wlock-AF_INET6" , "wlock-AF_ROSE" ,
- "wlock-AF_DECnet", "wlock-AF_NETBEUI" , "wlock-AF_SECURITY" ,
- "wlock-AF_KEY" , "wlock-AF_NETLINK" , "wlock-AF_PACKET" ,
- "wlock-AF_ASH" , "wlock-AF_ECONET" , "wlock-AF_ATMSVC" ,
- "wlock-AF_RDS" , "wlock-AF_SNA" , "wlock-AF_IRDA" ,
- "wlock-AF_PPPOX" , "wlock-AF_WANPIPE" , "wlock-AF_LLC" ,
- "wlock-27" , "wlock-28" , "wlock-AF_CAN" ,
- "wlock-AF_TIPC" , "wlock-AF_BLUETOOTH", "wlock-AF_IUCV" ,
- "wlock-AF_RXRPC" , "wlock-AF_ISDN" , "wlock-AF_PHONET" ,
- "wlock-AF_IEEE802154", "wlock-AF_CAIF" , "wlock-AF_ALG" ,
- "wlock-AF_NFC" , "wlock-AF_VSOCK" , "wlock-AF_KCM" ,
- "wlock-AF_QIPCRTR", "wlock-AF_SMC" , "wlock-AF_XDP" ,
- "wlock-AF_MAX"
+ _sock_locks("wlock-")
};
static const char *const af_family_elock_key_strings[AF_MAX+1] = {
- "elock-AF_UNSPEC", "elock-AF_UNIX" , "elock-AF_INET" ,
- "elock-AF_AX25" , "elock-AF_IPX" , "elock-AF_APPLETALK",
- "elock-AF_NETROM", "elock-AF_BRIDGE" , "elock-AF_ATMPVC" ,
- "elock-AF_X25" , "elock-AF_INET6" , "elock-AF_ROSE" ,
- "elock-AF_DECnet", "elock-AF_NETBEUI" , "elock-AF_SECURITY" ,
- "elock-AF_KEY" , "elock-AF_NETLINK" , "elock-AF_PACKET" ,
- "elock-AF_ASH" , "elock-AF_ECONET" , "elock-AF_ATMSVC" ,
- "elock-AF_RDS" , "elock-AF_SNA" , "elock-AF_IRDA" ,
- "elock-AF_PPPOX" , "elock-AF_WANPIPE" , "elock-AF_LLC" ,
- "elock-27" , "elock-28" , "elock-AF_CAN" ,
- "elock-AF_TIPC" , "elock-AF_BLUETOOTH", "elock-AF_IUCV" ,
- "elock-AF_RXRPC" , "elock-AF_ISDN" , "elock-AF_PHONET" ,
- "elock-AF_IEEE802154", "elock-AF_CAIF" , "elock-AF_ALG" ,
- "elock-AF_NFC" , "elock-AF_VSOCK" , "elock-AF_KCM" ,
- "elock-AF_QIPCRTR", "elock-AF_SMC" , "elock-AF_XDP" ,
- "elock-AF_MAX"
+ _sock_locks("elock-")
};
/*
@@ -697,6 +653,7 @@ EXPORT_SYMBOL(sk_mc_loop);
int sock_setsockopt(struct socket *sock, int level, int optname,
char __user *optval, unsigned int optlen)
{
+ struct sock_txtime sk_txtime;
struct sock *sk = sock->sk;
int val;
int valbool;
@@ -1041,7 +998,7 @@ set_rcvbuf:
cmpxchg(&sk->sk_pacing_status,
SK_PACING_NONE,
SK_PACING_NEEDED);
- sk->sk_max_pacing_rate = val;
+ sk->sk_max_pacing_rate = (val == ~0U) ? ~0UL : val;
sk->sk_pacing_rate = min(sk->sk_pacing_rate,
sk->sk_max_pacing_rate);
break;
@@ -1070,6 +1027,26 @@ set_rcvbuf:
}
break;
+ case SO_TXTIME:
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
+ ret = -EPERM;
+ } else if (optlen != sizeof(struct sock_txtime)) {
+ ret = -EINVAL;
+ } else if (copy_from_user(&sk_txtime, optval,
+ sizeof(struct sock_txtime))) {
+ ret = -EFAULT;
+ } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
+ ret = -EINVAL;
+ } else {
+ sock_valbool_flag(sk, SOCK_TXTIME, true);
+ sk->sk_clockid = sk_txtime.clockid;
+ sk->sk_txtime_deadline_mode =
+ !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
+ sk->sk_txtime_report_errors =
+ !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
+ }
+ break;
+
default:
ret = -ENOPROTOOPT;
break;
@@ -1115,6 +1092,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
u64 val64;
struct linger ling;
struct timeval tm;
+ struct sock_txtime txtime;
} v;
int lv = sizeof(int);
@@ -1358,7 +1336,8 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
#endif
case SO_MAX_PACING_RATE:
- v.val = sk->sk_max_pacing_rate;
+ /* 32bit version */
+ v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
break;
case SO_INCOMING_CPU:
@@ -1403,6 +1382,15 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
v.val = sock_flag(sk, SOCK_ZEROCOPY);
break;
+ case SO_TXTIME:
+ lv = sizeof(v.txtime);
+ v.txtime.clockid = sk->sk_clockid;
+ v.txtime.flags |= sk->sk_txtime_deadline_mode ?
+ SOF_TXTIME_DEADLINE_MODE : 0;
+ v.txtime.flags |= sk->sk_txtime_report_errors ?
+ SOF_TXTIME_REPORT_ERRORS : 0;
+ break;
+
default:
/* We implement the SO_SNDLOWAT etc to not be settable
* (1003.1g 7).
@@ -2137,6 +2125,13 @@ int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
sockc->tsflags |= tsflags;
break;
+ case SCM_TXTIME:
+ if (!sock_flag(sk, SOCK_TXTIME))
+ return -EINVAL;
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
+ return -EINVAL;
+ sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
+ break;
/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
case SCM_RIGHTS:
case SCM_CREDENTIALS:
@@ -2244,67 +2239,6 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
}
EXPORT_SYMBOL(sk_page_frag_refill);
-int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg,
- int sg_start, int *sg_curr_index, unsigned int *sg_curr_size,
- int first_coalesce)
-{
- int sg_curr = *sg_curr_index, use = 0, rc = 0;
- unsigned int size = *sg_curr_size;
- struct page_frag *pfrag;
- struct scatterlist *sge;
-
- len -= size;
- pfrag = sk_page_frag(sk);
-
- while (len > 0) {
- unsigned int orig_offset;
-
- if (!sk_page_frag_refill(sk, pfrag)) {
- rc = -ENOMEM;
- goto out;
- }
-
- use = min_t(int, len, pfrag->size - pfrag->offset);
-
- if (!sk_wmem_schedule(sk, use)) {
- rc = -ENOMEM;
- goto out;
- }
-
- sk_mem_charge(sk, use);
- size += use;
- orig_offset = pfrag->offset;
- pfrag->offset += use;
-
- sge = sg + sg_curr - 1;
- if (sg_curr > first_coalesce && sg_page(sge) == pfrag->page &&
- sge->offset + sge->length == orig_offset) {
- sge->length += use;
- } else {
- sge = sg + sg_curr;
- sg_unmark_end(sge);
- sg_set_page(sge, pfrag->page, use, orig_offset);
- get_page(pfrag->page);
- sg_curr++;
-
- if (sg_curr == MAX_SKB_FRAGS)
- sg_curr = 0;
-
- if (sg_curr == sg_start) {
- rc = -ENOSPC;
- break;
- }
- }
-
- len -= use;
- }
-out:
- *sg_curr_size = size;
- *sg_curr_index = sg_curr;
- return rc;
-}
-EXPORT_SYMBOL(sk_alloc_sg);
-
static void __lock_sock(struct sock *sk)
__releases(&sk->sk_lock.slock)
__acquires(&sk->sk_lock.slock)
@@ -2323,7 +2257,7 @@ static void __lock_sock(struct sock *sk)
finish_wait(&sk->sk_lock.wq, &wait);
}
-static void __release_sock(struct sock *sk)
+void __release_sock(struct sock *sk)
__releases(&sk->sk_lock.slock)
__acquires(&sk->sk_lock.slock)
{
@@ -2338,7 +2272,7 @@ static void __release_sock(struct sock *sk)
next = skb->next;
prefetch(next);
WARN_ON_ONCE(skb_dst_is_noref(skb));
- skb->next = NULL;
+ skb_mark_not_on_list(skb);
sk_backlog_rcv(sk, skb);
cond_resched();
@@ -2401,9 +2335,10 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
{
struct proto *prot = sk->sk_prot;
long allocated = sk_memory_allocated_add(sk, amt);
+ bool charged = true;
if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
- !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
+ !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
goto suppress_allocation;
/* Under limit. */
@@ -2461,7 +2396,8 @@ suppress_allocation:
return 1;
}
- trace_sock_exceed_buf_limit(sk, prot, allocated);
+ if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
+ trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
sk_memory_allocated_sub(sk, amt);
@@ -2814,10 +2750,12 @@ void sock_init_data(struct socket *sock, struct sock *sk)
sk->sk_ll_usec = sysctl_net_busy_read;
#endif
- sk->sk_max_pacing_rate = ~0U;
- sk->sk_pacing_rate = ~0U;
+ sk->sk_max_pacing_rate = ~0UL;
+ sk->sk_pacing_rate = ~0UL;
sk->sk_pacing_shift = 10;
sk->sk_incoming_cpu = -1;
+
+ sk_rx_queue_clear(sk);
/*
* Before updating sk_refcnt, we must commit prior changes to memory
* (Documentation/RCU/rculist_nulls.txt for details)
@@ -2902,8 +2840,8 @@ EXPORT_SYMBOL(lock_sock_fast);
int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
{
struct timeval tv;
- if (!sock_flag(sk, SOCK_TIMESTAMP))
- sock_enable_timestamp(sk, SOCK_TIMESTAMP);
+
+ sock_enable_timestamp(sk, SOCK_TIMESTAMP);
tv = ktime_to_timeval(sk->sk_stamp);
if (tv.tv_sec == -1)
return -ENOENT;
@@ -2918,8 +2856,8 @@ EXPORT_SYMBOL(sock_get_timestamp);
int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
{
struct timespec ts;
- if (!sock_flag(sk, SOCK_TIMESTAMP))
- sock_enable_timestamp(sk, SOCK_TIMESTAMP);
+
+ sock_enable_timestamp(sk, SOCK_TIMESTAMP);
ts = ktime_to_timespec(sk->sk_stamp);
if (ts.tv_sec == -1)
return -ENOENT;
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index c37b5be7c5e4..3312a5849a97 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -10,6 +10,7 @@
#include <linux/kernel.h>
#include <linux/tcp.h>
#include <linux/workqueue.h>
+#include <linux/nospec.h>
#include <linux/inet_diag.h>
#include <linux/sock_diag.h>
@@ -218,6 +219,7 @@ static int __sock_diag_cmd(struct sk_buff *skb, struct nlmsghdr *nlh)
if (req->sdiag_family >= AF_MAX)
return -EINVAL;
+ req->sdiag_family = array_index_nospec(req->sdiag_family, AF_MAX);
if (sock_diag_handlers[req->sdiag_family] == NULL)
sock_load_diag_module(req->sdiag_family, 0);
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
new file mode 100644
index 000000000000..be6092ac69f8
--- /dev/null
+++ b/net/core/sock_map.c
@@ -0,0 +1,1003 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */
+
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/net.h>
+#include <linux/workqueue.h>
+#include <linux/skmsg.h>
+#include <linux/list.h>
+#include <linux/jhash.h>
+
+struct bpf_stab {
+ struct bpf_map map;
+ struct sock **sks;
+ struct sk_psock_progs progs;
+ raw_spinlock_t lock;
+};
+
+#define SOCK_CREATE_FLAG_MASK \
+ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
+static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_stab *stab;
+ u64 cost;
+ int err;
+
+ if (!capable(CAP_NET_ADMIN))
+ return ERR_PTR(-EPERM);
+ if (attr->max_entries == 0 ||
+ attr->key_size != 4 ||
+ attr->value_size != 4 ||
+ attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
+ return ERR_PTR(-EINVAL);
+
+ stab = kzalloc(sizeof(*stab), GFP_USER);
+ if (!stab)
+ return ERR_PTR(-ENOMEM);
+
+ bpf_map_init_from_attr(&stab->map, attr);
+ raw_spin_lock_init(&stab->lock);
+
+ /* Make sure page count doesn't overflow. */
+ cost = (u64) stab->map.max_entries * sizeof(struct sock *);
+ if (cost >= U32_MAX - PAGE_SIZE) {
+ err = -EINVAL;
+ goto free_stab;
+ }
+
+ stab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+ err = bpf_map_precharge_memlock(stab->map.pages);
+ if (err)
+ goto free_stab;
+
+ stab->sks = bpf_map_area_alloc(stab->map.max_entries *
+ sizeof(struct sock *),
+ stab->map.numa_node);
+ if (stab->sks)
+ return &stab->map;
+ err = -ENOMEM;
+free_stab:
+ kfree(stab);
+ return ERR_PTR(err);
+}
+
+int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ u32 ufd = attr->target_fd;
+ struct bpf_map *map;
+ struct fd f;
+ int ret;
+
+ f = fdget(ufd);
+ map = __bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+ ret = sock_map_prog_update(map, prog, attr->attach_type);
+ fdput(f);
+ return ret;
+}
+
+static void sock_map_sk_acquire(struct sock *sk)
+ __acquires(&sk->sk_lock.slock)
+{
+ lock_sock(sk);
+ preempt_disable();
+ rcu_read_lock();
+}
+
+static void sock_map_sk_release(struct sock *sk)
+ __releases(&sk->sk_lock.slock)
+{
+ rcu_read_unlock();
+ preempt_enable();
+ release_sock(sk);
+}
+
+static void sock_map_add_link(struct sk_psock *psock,
+ struct sk_psock_link *link,
+ struct bpf_map *map, void *link_raw)
+{
+ link->link_raw = link_raw;
+ link->map = map;
+ spin_lock_bh(&psock->link_lock);
+ list_add_tail(&link->list, &psock->link);
+ spin_unlock_bh(&psock->link_lock);
+}
+
+static void sock_map_del_link(struct sock *sk,
+ struct sk_psock *psock, void *link_raw)
+{
+ struct sk_psock_link *link, *tmp;
+ bool strp_stop = false;
+
+ spin_lock_bh(&psock->link_lock);
+ list_for_each_entry_safe(link, tmp, &psock->link, list) {
+ if (link->link_raw == link_raw) {
+ struct bpf_map *map = link->map;
+ struct bpf_stab *stab = container_of(map, struct bpf_stab,
+ map);
+ if (psock->parser.enabled && stab->progs.skb_parser)
+ strp_stop = true;
+ list_del(&link->list);
+ sk_psock_free_link(link);
+ }
+ }
+ spin_unlock_bh(&psock->link_lock);
+ if (strp_stop) {
+ write_lock_bh(&sk->sk_callback_lock);
+ sk_psock_stop_strp(sk, psock);
+ write_unlock_bh(&sk->sk_callback_lock);
+ }
+}
+
+static void sock_map_unref(struct sock *sk, void *link_raw)
+{
+ struct sk_psock *psock = sk_psock(sk);
+
+ if (likely(psock)) {
+ sock_map_del_link(sk, psock, link_raw);
+ sk_psock_put(sk, psock);
+ }
+}
+
+static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
+ struct sock *sk)
+{
+ struct bpf_prog *msg_parser, *skb_parser, *skb_verdict;
+ bool skb_progs, sk_psock_is_new = false;
+ struct sk_psock *psock;
+ int ret;
+
+ skb_verdict = READ_ONCE(progs->skb_verdict);
+ skb_parser = READ_ONCE(progs->skb_parser);
+ skb_progs = skb_parser && skb_verdict;
+ if (skb_progs) {
+ skb_verdict = bpf_prog_inc_not_zero(skb_verdict);
+ if (IS_ERR(skb_verdict))
+ return PTR_ERR(skb_verdict);
+ skb_parser = bpf_prog_inc_not_zero(skb_parser);
+ if (IS_ERR(skb_parser)) {
+ bpf_prog_put(skb_verdict);
+ return PTR_ERR(skb_parser);
+ }
+ }
+
+ msg_parser = READ_ONCE(progs->msg_parser);
+ if (msg_parser) {
+ msg_parser = bpf_prog_inc_not_zero(msg_parser);
+ if (IS_ERR(msg_parser)) {
+ ret = PTR_ERR(msg_parser);
+ goto out;
+ }
+ }
+
+ psock = sk_psock_get_checked(sk);
+ if (IS_ERR(psock)) {
+ ret = PTR_ERR(psock);
+ goto out_progs;
+ }
+
+ if (psock) {
+ if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) ||
+ (skb_progs && READ_ONCE(psock->progs.skb_parser))) {
+ sk_psock_put(sk, psock);
+ ret = -EBUSY;
+ goto out_progs;
+ }
+ } else {
+ psock = sk_psock_init(sk, map->numa_node);
+ if (!psock) {
+ ret = -ENOMEM;
+ goto out_progs;
+ }
+ sk_psock_is_new = true;
+ }
+
+ if (msg_parser)
+ psock_set_prog(&psock->progs.msg_parser, msg_parser);
+ if (sk_psock_is_new) {
+ ret = tcp_bpf_init(sk);
+ if (ret < 0)
+ goto out_drop;
+ } else {
+ tcp_bpf_reinit(sk);
+ }
+
+ write_lock_bh(&sk->sk_callback_lock);
+ if (skb_progs && !psock->parser.enabled) {
+ ret = sk_psock_init_strp(sk, psock);
+ if (ret) {
+ write_unlock_bh(&sk->sk_callback_lock);
+ goto out_drop;
+ }
+ psock_set_prog(&psock->progs.skb_verdict, skb_verdict);
+ psock_set_prog(&psock->progs.skb_parser, skb_parser);
+ sk_psock_start_strp(sk, psock);
+ }
+ write_unlock_bh(&sk->sk_callback_lock);
+ return 0;
+out_drop:
+ sk_psock_put(sk, psock);
+out_progs:
+ if (msg_parser)
+ bpf_prog_put(msg_parser);
+out:
+ if (skb_progs) {
+ bpf_prog_put(skb_verdict);
+ bpf_prog_put(skb_parser);
+ }
+ return ret;
+}
+
+static void sock_map_free(struct bpf_map *map)
+{
+ struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+ int i;
+
+ synchronize_rcu();
+ rcu_read_lock();
+ raw_spin_lock_bh(&stab->lock);
+ for (i = 0; i < stab->map.max_entries; i++) {
+ struct sock **psk = &stab->sks[i];
+ struct sock *sk;
+
+ sk = xchg(psk, NULL);
+ if (sk)
+ sock_map_unref(sk, psk);
+ }
+ raw_spin_unlock_bh(&stab->lock);
+ rcu_read_unlock();
+
+ bpf_map_area_free(stab->sks);
+ kfree(stab);
+}
+
+static void sock_map_release_progs(struct bpf_map *map)
+{
+ psock_progs_drop(&container_of(map, struct bpf_stab, map)->progs);
+}
+
+static struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key)
+{
+ struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ if (unlikely(key >= map->max_entries))
+ return NULL;
+ return READ_ONCE(stab->sks[key]);
+}
+
+static void *sock_map_lookup(struct bpf_map *map, void *key)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+static int __sock_map_delete(struct bpf_stab *stab, struct sock *sk_test,
+ struct sock **psk)
+{
+ struct sock *sk;
+
+ raw_spin_lock_bh(&stab->lock);
+ sk = *psk;
+ if (!sk_test || sk_test == sk)
+ *psk = NULL;
+ raw_spin_unlock_bh(&stab->lock);
+ if (unlikely(!sk))
+ return -EINVAL;
+ sock_map_unref(sk, psk);
+ return 0;
+}
+
+static void sock_map_delete_from_link(struct bpf_map *map, struct sock *sk,
+ void *link_raw)
+{
+ struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+
+ __sock_map_delete(stab, sk, link_raw);
+}
+
+static int sock_map_delete_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+ u32 i = *(u32 *)key;
+ struct sock **psk;
+
+ if (unlikely(i >= map->max_entries))
+ return -EINVAL;
+
+ psk = &stab->sks[i];
+ return __sock_map_delete(stab, NULL, psk);
+}
+
+static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next)
+{
+ struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+ u32 i = key ? *(u32 *)key : U32_MAX;
+ u32 *key_next = next;
+
+ if (i == stab->map.max_entries - 1)
+ return -ENOENT;
+ if (i >= stab->map.max_entries)
+ *key_next = 0;
+ else
+ *key_next = i + 1;
+ return 0;
+}
+
+static int sock_map_update_common(struct bpf_map *map, u32 idx,
+ struct sock *sk, u64 flags)
+{
+ struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+ struct sk_psock_link *link;
+ struct sk_psock *psock;
+ struct sock *osk;
+ int ret;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ if (unlikely(flags > BPF_EXIST))
+ return -EINVAL;
+ if (unlikely(idx >= map->max_entries))
+ return -E2BIG;
+
+ link = sk_psock_init_link();
+ if (!link)
+ return -ENOMEM;
+
+ ret = sock_map_link(map, &stab->progs, sk);
+ if (ret < 0)
+ goto out_free;
+
+ psock = sk_psock(sk);
+ WARN_ON_ONCE(!psock);
+
+ raw_spin_lock_bh(&stab->lock);
+ osk = stab->sks[idx];
+ if (osk && flags == BPF_NOEXIST) {
+ ret = -EEXIST;
+ goto out_unlock;
+ } else if (!osk && flags == BPF_EXIST) {
+ ret = -ENOENT;
+ goto out_unlock;
+ }
+
+ sock_map_add_link(psock, link, map, &stab->sks[idx]);
+ stab->sks[idx] = sk;
+ if (osk)
+ sock_map_unref(osk, &stab->sks[idx]);
+ raw_spin_unlock_bh(&stab->lock);
+ return 0;
+out_unlock:
+ raw_spin_unlock_bh(&stab->lock);
+ if (psock)
+ sk_psock_put(sk, psock);
+out_free:
+ sk_psock_free_link(link);
+ return ret;
+}
+
+static bool sock_map_op_okay(const struct bpf_sock_ops_kern *ops)
+{
+ return ops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB ||
+ ops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB;
+}
+
+static bool sock_map_sk_is_suitable(const struct sock *sk)
+{
+ return sk->sk_type == SOCK_STREAM &&
+ sk->sk_protocol == IPPROTO_TCP;
+}
+
+static int sock_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
+{
+ u32 ufd = *(u32 *)value;
+ u32 idx = *(u32 *)key;
+ struct socket *sock;
+ struct sock *sk;
+ int ret;
+
+ sock = sockfd_lookup(ufd, &ret);
+ if (!sock)
+ return ret;
+ sk = sock->sk;
+ if (!sk) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!sock_map_sk_is_suitable(sk) ||
+ sk->sk_state != TCP_ESTABLISHED) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ sock_map_sk_acquire(sk);
+ ret = sock_map_update_common(map, idx, sk, flags);
+ sock_map_sk_release(sk);
+out:
+ fput(sock->file);
+ return ret;
+}
+
+BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, sops,
+ struct bpf_map *, map, void *, key, u64, flags)
+{
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ if (likely(sock_map_sk_is_suitable(sops->sk) &&
+ sock_map_op_okay(sops)))
+ return sock_map_update_common(map, *(u32 *)key, sops->sk,
+ flags);
+ return -EOPNOTSUPP;
+}
+
+const struct bpf_func_proto bpf_sock_map_update_proto = {
+ .func = bpf_sock_map_update,
+ .gpl_only = false,
+ .pkt_access = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_PTR_TO_MAP_KEY,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
+ struct bpf_map *, map, u32, key, u64, flags)
+{
+ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+
+ if (unlikely(flags & ~(BPF_F_INGRESS)))
+ return SK_DROP;
+ tcb->bpf.flags = flags;
+ tcb->bpf.sk_redir = __sock_map_lookup_elem(map, key);
+ if (!tcb->bpf.sk_redir)
+ return SK_DROP;
+ return SK_PASS;
+}
+
+const struct bpf_func_proto bpf_sk_redirect_map_proto = {
+ .func = bpf_sk_redirect_map,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg *, msg,
+ struct bpf_map *, map, u32, key, u64, flags)
+{
+ if (unlikely(flags & ~(BPF_F_INGRESS)))
+ return SK_DROP;
+ msg->flags = flags;
+ msg->sk_redir = __sock_map_lookup_elem(map, key);
+ if (!msg->sk_redir)
+ return SK_DROP;
+ return SK_PASS;
+}
+
+const struct bpf_func_proto bpf_msg_redirect_map_proto = {
+ .func = bpf_msg_redirect_map,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_ANYTHING,
+};
+
+const struct bpf_map_ops sock_map_ops = {
+ .map_alloc = sock_map_alloc,
+ .map_free = sock_map_free,
+ .map_get_next_key = sock_map_get_next_key,
+ .map_update_elem = sock_map_update_elem,
+ .map_delete_elem = sock_map_delete_elem,
+ .map_lookup_elem = sock_map_lookup,
+ .map_release_uref = sock_map_release_progs,
+ .map_check_btf = map_check_no_btf,
+};
+
+struct bpf_htab_elem {
+ struct rcu_head rcu;
+ u32 hash;
+ struct sock *sk;
+ struct hlist_node node;
+ u8 key[0];
+};
+
+struct bpf_htab_bucket {
+ struct hlist_head head;
+ raw_spinlock_t lock;
+};
+
+struct bpf_htab {
+ struct bpf_map map;
+ struct bpf_htab_bucket *buckets;
+ u32 buckets_num;
+ u32 elem_size;
+ struct sk_psock_progs progs;
+ atomic_t count;
+};
+
+static inline u32 sock_hash_bucket_hash(const void *key, u32 len)
+{
+ return jhash(key, len, 0);
+}
+
+static struct bpf_htab_bucket *sock_hash_select_bucket(struct bpf_htab *htab,
+ u32 hash)
+{
+ return &htab->buckets[hash & (htab->buckets_num - 1)];
+}
+
+static struct bpf_htab_elem *
+sock_hash_lookup_elem_raw(struct hlist_head *head, u32 hash, void *key,
+ u32 key_size)
+{
+ struct bpf_htab_elem *elem;
+
+ hlist_for_each_entry_rcu(elem, head, node) {
+ if (elem->hash == hash &&
+ !memcmp(&elem->key, key, key_size))
+ return elem;
+ }
+
+ return NULL;
+}
+
+static struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ u32 key_size = map->key_size, hash;
+ struct bpf_htab_bucket *bucket;
+ struct bpf_htab_elem *elem;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ hash = sock_hash_bucket_hash(key, key_size);
+ bucket = sock_hash_select_bucket(htab, hash);
+ elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size);
+
+ return elem ? elem->sk : NULL;
+}
+
+static void sock_hash_free_elem(struct bpf_htab *htab,
+ struct bpf_htab_elem *elem)
+{
+ atomic_dec(&htab->count);
+ kfree_rcu(elem, rcu);
+}
+
+static void sock_hash_delete_from_link(struct bpf_map *map, struct sock *sk,
+ void *link_raw)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct bpf_htab_elem *elem_probe, *elem = link_raw;
+ struct bpf_htab_bucket *bucket;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ bucket = sock_hash_select_bucket(htab, elem->hash);
+
+ /* elem may be deleted in parallel from the map, but access here
+ * is okay since it's going away only after RCU grace period.
+ * However, we need to check whether it's still present.
+ */
+ raw_spin_lock_bh(&bucket->lock);
+ elem_probe = sock_hash_lookup_elem_raw(&bucket->head, elem->hash,
+ elem->key, map->key_size);
+ if (elem_probe && elem_probe == elem) {
+ hlist_del_rcu(&elem->node);
+ sock_map_unref(elem->sk, elem);
+ sock_hash_free_elem(htab, elem);
+ }
+ raw_spin_unlock_bh(&bucket->lock);
+}
+
+static int sock_hash_delete_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ u32 hash, key_size = map->key_size;
+ struct bpf_htab_bucket *bucket;
+ struct bpf_htab_elem *elem;
+ int ret = -ENOENT;
+
+ hash = sock_hash_bucket_hash(key, key_size);
+ bucket = sock_hash_select_bucket(htab, hash);
+
+ raw_spin_lock_bh(&bucket->lock);
+ elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size);
+ if (elem) {
+ hlist_del_rcu(&elem->node);
+ sock_map_unref(elem->sk, elem);
+ sock_hash_free_elem(htab, elem);
+ ret = 0;
+ }
+ raw_spin_unlock_bh(&bucket->lock);
+ return ret;
+}
+
+static struct bpf_htab_elem *sock_hash_alloc_elem(struct bpf_htab *htab,
+ void *key, u32 key_size,
+ u32 hash, struct sock *sk,
+ struct bpf_htab_elem *old)
+{
+ struct bpf_htab_elem *new;
+
+ if (atomic_inc_return(&htab->count) > htab->map.max_entries) {
+ if (!old) {
+ atomic_dec(&htab->count);
+ return ERR_PTR(-E2BIG);
+ }
+ }
+
+ new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN,
+ htab->map.numa_node);
+ if (!new) {
+ atomic_dec(&htab->count);
+ return ERR_PTR(-ENOMEM);
+ }
+ memcpy(new->key, key, key_size);
+ new->sk = sk;
+ new->hash = hash;
+ return new;
+}
+
+static int sock_hash_update_common(struct bpf_map *map, void *key,
+ struct sock *sk, u64 flags)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ u32 key_size = map->key_size, hash;
+ struct bpf_htab_elem *elem, *elem_new;
+ struct bpf_htab_bucket *bucket;
+ struct sk_psock_link *link;
+ struct sk_psock *psock;
+ int ret;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ if (unlikely(flags > BPF_EXIST))
+ return -EINVAL;
+
+ link = sk_psock_init_link();
+ if (!link)
+ return -ENOMEM;
+
+ ret = sock_map_link(map, &htab->progs, sk);
+ if (ret < 0)
+ goto out_free;
+
+ psock = sk_psock(sk);
+ WARN_ON_ONCE(!psock);
+
+ hash = sock_hash_bucket_hash(key, key_size);
+ bucket = sock_hash_select_bucket(htab, hash);
+
+ raw_spin_lock_bh(&bucket->lock);
+ elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size);
+ if (elem && flags == BPF_NOEXIST) {
+ ret = -EEXIST;
+ goto out_unlock;
+ } else if (!elem && flags == BPF_EXIST) {
+ ret = -ENOENT;
+ goto out_unlock;
+ }
+
+ elem_new = sock_hash_alloc_elem(htab, key, key_size, hash, sk, elem);
+ if (IS_ERR(elem_new)) {
+ ret = PTR_ERR(elem_new);
+ goto out_unlock;
+ }
+
+ sock_map_add_link(psock, link, map, elem_new);
+ /* Add new element to the head of the list, so that
+ * concurrent search will find it before old elem.
+ */
+ hlist_add_head_rcu(&elem_new->node, &bucket->head);
+ if (elem) {
+ hlist_del_rcu(&elem->node);
+ sock_map_unref(elem->sk, elem);
+ sock_hash_free_elem(htab, elem);
+ }
+ raw_spin_unlock_bh(&bucket->lock);
+ return 0;
+out_unlock:
+ raw_spin_unlock_bh(&bucket->lock);
+ sk_psock_put(sk, psock);
+out_free:
+ sk_psock_free_link(link);
+ return ret;
+}
+
+static int sock_hash_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
+{
+ u32 ufd = *(u32 *)value;
+ struct socket *sock;
+ struct sock *sk;
+ int ret;
+
+ sock = sockfd_lookup(ufd, &ret);
+ if (!sock)
+ return ret;
+ sk = sock->sk;
+ if (!sk) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!sock_map_sk_is_suitable(sk) ||
+ sk->sk_state != TCP_ESTABLISHED) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ sock_map_sk_acquire(sk);
+ ret = sock_hash_update_common(map, key, sk, flags);
+ sock_map_sk_release(sk);
+out:
+ fput(sock->file);
+ return ret;
+}
+
+static int sock_hash_get_next_key(struct bpf_map *map, void *key,
+ void *key_next)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct bpf_htab_elem *elem, *elem_next;
+ u32 hash, key_size = map->key_size;
+ struct hlist_head *head;
+ int i = 0;
+
+ if (!key)
+ goto find_first_elem;
+ hash = sock_hash_bucket_hash(key, key_size);
+ head = &sock_hash_select_bucket(htab, hash)->head;
+ elem = sock_hash_lookup_elem_raw(head, hash, key, key_size);
+ if (!elem)
+ goto find_first_elem;
+
+ elem_next = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&elem->node)),
+ struct bpf_htab_elem, node);
+ if (elem_next) {
+ memcpy(key_next, elem_next->key, key_size);
+ return 0;
+ }
+
+ i = hash & (htab->buckets_num - 1);
+ i++;
+find_first_elem:
+ for (; i < htab->buckets_num; i++) {
+ head = &sock_hash_select_bucket(htab, i)->head;
+ elem_next = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
+ struct bpf_htab_elem, node);
+ if (elem_next) {
+ memcpy(key_next, elem_next->key, key_size);
+ return 0;
+ }
+ }
+
+ return -ENOENT;
+}
+
+static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
+{
+ struct bpf_htab *htab;
+ int i, err;
+ u64 cost;
+
+ if (!capable(CAP_NET_ADMIN))
+ return ERR_PTR(-EPERM);
+ if (attr->max_entries == 0 ||
+ attr->key_size == 0 ||
+ attr->value_size != 4 ||
+ attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
+ return ERR_PTR(-EINVAL);
+ if (attr->key_size > MAX_BPF_STACK)
+ return ERR_PTR(-E2BIG);
+
+ htab = kzalloc(sizeof(*htab), GFP_USER);
+ if (!htab)
+ return ERR_PTR(-ENOMEM);
+
+ bpf_map_init_from_attr(&htab->map, attr);
+
+ htab->buckets_num = roundup_pow_of_two(htab->map.max_entries);
+ htab->elem_size = sizeof(struct bpf_htab_elem) +
+ round_up(htab->map.key_size, 8);
+ if (htab->buckets_num == 0 ||
+ htab->buckets_num > U32_MAX / sizeof(struct bpf_htab_bucket)) {
+ err = -EINVAL;
+ goto free_htab;
+ }
+
+ cost = (u64) htab->buckets_num * sizeof(struct bpf_htab_bucket) +
+ (u64) htab->elem_size * htab->map.max_entries;
+ if (cost >= U32_MAX - PAGE_SIZE) {
+ err = -EINVAL;
+ goto free_htab;
+ }
+
+ htab->buckets = bpf_map_area_alloc(htab->buckets_num *
+ sizeof(struct bpf_htab_bucket),
+ htab->map.numa_node);
+ if (!htab->buckets) {
+ err = -ENOMEM;
+ goto free_htab;
+ }
+
+ for (i = 0; i < htab->buckets_num; i++) {
+ INIT_HLIST_HEAD(&htab->buckets[i].head);
+ raw_spin_lock_init(&htab->buckets[i].lock);
+ }
+
+ return &htab->map;
+free_htab:
+ kfree(htab);
+ return ERR_PTR(err);
+}
+
+static void sock_hash_free(struct bpf_map *map)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct bpf_htab_bucket *bucket;
+ struct bpf_htab_elem *elem;
+ struct hlist_node *node;
+ int i;
+
+ synchronize_rcu();
+ rcu_read_lock();
+ for (i = 0; i < htab->buckets_num; i++) {
+ bucket = sock_hash_select_bucket(htab, i);
+ raw_spin_lock_bh(&bucket->lock);
+ hlist_for_each_entry_safe(elem, node, &bucket->head, node) {
+ hlist_del_rcu(&elem->node);
+ sock_map_unref(elem->sk, elem);
+ }
+ raw_spin_unlock_bh(&bucket->lock);
+ }
+ rcu_read_unlock();
+
+ bpf_map_area_free(htab->buckets);
+ kfree(htab);
+}
+
+static void sock_hash_release_progs(struct bpf_map *map)
+{
+ psock_progs_drop(&container_of(map, struct bpf_htab, map)->progs);
+}
+
+BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, sops,
+ struct bpf_map *, map, void *, key, u64, flags)
+{
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ if (likely(sock_map_sk_is_suitable(sops->sk) &&
+ sock_map_op_okay(sops)))
+ return sock_hash_update_common(map, key, sops->sk, flags);
+ return -EOPNOTSUPP;
+}
+
+const struct bpf_func_proto bpf_sock_hash_update_proto = {
+ .func = bpf_sock_hash_update,
+ .gpl_only = false,
+ .pkt_access = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_PTR_TO_MAP_KEY,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb,
+ struct bpf_map *, map, void *, key, u64, flags)
+{
+ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+
+ if (unlikely(flags & ~(BPF_F_INGRESS)))
+ return SK_DROP;
+ tcb->bpf.flags = flags;
+ tcb->bpf.sk_redir = __sock_hash_lookup_elem(map, key);
+ if (!tcb->bpf.sk_redir)
+ return SK_DROP;
+ return SK_PASS;
+}
+
+const struct bpf_func_proto bpf_sk_redirect_hash_proto = {
+ .func = bpf_sk_redirect_hash,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_PTR_TO_MAP_KEY,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg *, msg,
+ struct bpf_map *, map, void *, key, u64, flags)
+{
+ if (unlikely(flags & ~(BPF_F_INGRESS)))
+ return SK_DROP;
+ msg->flags = flags;
+ msg->sk_redir = __sock_hash_lookup_elem(map, key);
+ if (!msg->sk_redir)
+ return SK_DROP;
+ return SK_PASS;
+}
+
+const struct bpf_func_proto bpf_msg_redirect_hash_proto = {
+ .func = bpf_msg_redirect_hash,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_PTR_TO_MAP_KEY,
+ .arg4_type = ARG_ANYTHING,
+};
+
+const struct bpf_map_ops sock_hash_ops = {
+ .map_alloc = sock_hash_alloc,
+ .map_free = sock_hash_free,
+ .map_get_next_key = sock_hash_get_next_key,
+ .map_update_elem = sock_hash_update_elem,
+ .map_delete_elem = sock_hash_delete_elem,
+ .map_lookup_elem = sock_map_lookup,
+ .map_release_uref = sock_hash_release_progs,
+ .map_check_btf = map_check_no_btf,
+};
+
+static struct sk_psock_progs *sock_map_progs(struct bpf_map *map)
+{
+ switch (map->map_type) {
+ case BPF_MAP_TYPE_SOCKMAP:
+ return &container_of(map, struct bpf_stab, map)->progs;
+ case BPF_MAP_TYPE_SOCKHASH:
+ return &container_of(map, struct bpf_htab, map)->progs;
+ default:
+ break;
+ }
+
+ return NULL;
+}
+
+int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
+ u32 which)
+{
+ struct sk_psock_progs *progs = sock_map_progs(map);
+
+ if (!progs)
+ return -EOPNOTSUPP;
+
+ switch (which) {
+ case BPF_SK_MSG_VERDICT:
+ psock_set_prog(&progs->msg_parser, prog);
+ break;
+ case BPF_SK_SKB_STREAM_PARSER:
+ psock_set_prog(&progs->skb_parser, prog);
+ break;
+ case BPF_SK_SKB_STREAM_VERDICT:
+ psock_set_prog(&progs->skb_verdict, prog);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+void sk_psock_unlink(struct sock *sk, struct sk_psock_link *link)
+{
+ switch (link->map->map_type) {
+ case BPF_MAP_TYPE_SOCKMAP:
+ return sock_map_delete_from_link(link->map, sk,
+ link->link_raw);
+ case BPF_MAP_TYPE_SOCKHASH:
+ return sock_hash_delete_from_link(link->map, sk,
+ link->link_raw);
+ default:
+ break;
+ }
+}
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index 064acb04be0f..ba5cba56f574 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -8,11 +8,34 @@
#include <net/sock_reuseport.h>
#include <linux/bpf.h>
+#include <linux/idr.h>
+#include <linux/filter.h>
#include <linux/rcupdate.h>
#define INIT_SOCKS 128
-static DEFINE_SPINLOCK(reuseport_lock);
+DEFINE_SPINLOCK(reuseport_lock);
+
+#define REUSEPORT_MIN_ID 1
+static DEFINE_IDA(reuseport_ida);
+
+int reuseport_get_id(struct sock_reuseport *reuse)
+{
+ int id;
+
+ if (reuse->reuseport_id)
+ return reuse->reuseport_id;
+
+ id = ida_simple_get(&reuseport_ida, REUSEPORT_MIN_ID, 0,
+ /* Called under reuseport_lock */
+ GFP_ATOMIC);
+ if (id < 0)
+ return id;
+
+ reuse->reuseport_id = id;
+
+ return reuse->reuseport_id;
+}
static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks)
{
@@ -29,7 +52,7 @@ static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks)
return reuse;
}
-int reuseport_alloc(struct sock *sk)
+int reuseport_alloc(struct sock *sk, bool bind_inany)
{
struct sock_reuseport *reuse;
@@ -41,9 +64,17 @@ int reuseport_alloc(struct sock *sk)
/* Allocation attempts can occur concurrently via the setsockopt path
* and the bind/hash path. Nothing to do when we lose the race.
*/
- if (rcu_dereference_protected(sk->sk_reuseport_cb,
- lockdep_is_held(&reuseport_lock)))
+ reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
+ lockdep_is_held(&reuseport_lock));
+ if (reuse) {
+ /* Only set reuse->bind_inany if the bind_inany is true.
+ * Otherwise, it will overwrite the reuse->bind_inany
+ * which was set by the bind/hash path.
+ */
+ if (bind_inany)
+ reuse->bind_inany = bind_inany;
goto out;
+ }
reuse = __reuseport_alloc(INIT_SOCKS);
if (!reuse) {
@@ -53,6 +84,7 @@ int reuseport_alloc(struct sock *sk)
reuse->socks[0] = sk;
reuse->num_socks = 1;
+ reuse->bind_inany = bind_inany;
rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
out:
@@ -78,9 +110,12 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse)
more_reuse->max_socks = more_socks_size;
more_reuse->num_socks = reuse->num_socks;
more_reuse->prog = reuse->prog;
+ more_reuse->reuseport_id = reuse->reuseport_id;
+ more_reuse->bind_inany = reuse->bind_inany;
memcpy(more_reuse->socks, reuse->socks,
reuse->num_socks * sizeof(struct sock *));
+ more_reuse->synq_overflow_ts = READ_ONCE(reuse->synq_overflow_ts);
for (i = 0; i < reuse->num_socks; ++i)
rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb,
@@ -99,8 +134,9 @@ static void reuseport_free_rcu(struct rcu_head *head)
struct sock_reuseport *reuse;
reuse = container_of(head, struct sock_reuseport, rcu);
- if (reuse->prog)
- bpf_prog_destroy(reuse->prog);
+ sk_reuseport_prog_free(rcu_dereference_protected(reuse->prog, 1));
+ if (reuse->reuseport_id)
+ ida_simple_remove(&reuseport_ida, reuse->reuseport_id);
kfree(reuse);
}
@@ -110,12 +146,12 @@ static void reuseport_free_rcu(struct rcu_head *head)
* @sk2: Socket belonging to the existing reuseport group.
* May return ENOMEM and not add socket to group under memory pressure.
*/
-int reuseport_add_sock(struct sock *sk, struct sock *sk2)
+int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany)
{
struct sock_reuseport *old_reuse, *reuse;
if (!rcu_access_pointer(sk2->sk_reuseport_cb)) {
- int err = reuseport_alloc(sk2);
+ int err = reuseport_alloc(sk2, bind_inany);
if (err)
return err;
@@ -160,6 +196,14 @@ void reuseport_detach_sock(struct sock *sk)
spin_lock_bh(&reuseport_lock);
reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
lockdep_is_held(&reuseport_lock));
+
+ /* At least one of the sk in this reuseport group is added to
+ * a bpf map. Notify the bpf side. The bpf map logic will
+ * remove the sk if it is indeed added to a bpf map.
+ */
+ if (reuse->reuseport_id)
+ bpf_sk_reuseport_detach(sk);
+
rcu_assign_pointer(sk->sk_reuseport_cb, NULL);
for (i = 0; i < reuse->num_socks; i++) {
@@ -175,9 +219,9 @@ void reuseport_detach_sock(struct sock *sk)
}
EXPORT_SYMBOL(reuseport_detach_sock);
-static struct sock *run_bpf(struct sock_reuseport *reuse, u16 socks,
- struct bpf_prog *prog, struct sk_buff *skb,
- int hdr_len)
+static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks,
+ struct bpf_prog *prog, struct sk_buff *skb,
+ int hdr_len)
{
struct sk_buff *nskb = NULL;
u32 index;
@@ -238,9 +282,15 @@ struct sock *reuseport_select_sock(struct sock *sk,
/* paired with smp_wmb() in reuseport_add_sock() */
smp_rmb();
- if (prog && skb)
- sk2 = run_bpf(reuse, socks, prog, skb, hdr_len);
+ if (!prog || !skb)
+ goto select_by_hash;
+
+ if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT)
+ sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, hash);
+ else
+ sk2 = run_bpf_filter(reuse, socks, prog, skb, hdr_len);
+select_by_hash:
/* no bpf or invalid bpf result: fall back to hash usage */
if (!sk2)
sk2 = reuse->socks[reciprocal_scale(hash, socks)];
@@ -252,12 +302,21 @@ out:
}
EXPORT_SYMBOL(reuseport_select_sock);
-struct bpf_prog *
-reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog)
+int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog)
{
struct sock_reuseport *reuse;
struct bpf_prog *old_prog;
+ if (sk_unhashed(sk) && sk->sk_reuseport) {
+ int err = reuseport_alloc(sk, false);
+
+ if (err)
+ return err;
+ } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) {
+ /* The socket wasn't bound with SO_REUSEPORT */
+ return -EINVAL;
+ }
+
spin_lock_bh(&reuseport_lock);
reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
lockdep_is_held(&reuseport_lock));
@@ -266,6 +325,7 @@ reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog)
rcu_assign_pointer(reuse->prog, prog);
spin_unlock_bh(&reuseport_lock);
- return old_prog;
+ sk_reuseport_prog_free(old_prog);
+ return 0;
}
EXPORT_SYMBOL(reuseport_attach_prog);
diff --git a/net/core/utils.c b/net/core/utils.c
index d47863b07a60..2a597ac7808e 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -397,7 +397,7 @@ int inet_pton_with_scope(struct net *net, __kernel_sa_family_t af,
break;
default:
pr_err("unexpected address family %d\n", af);
- };
+ }
return ret;
}
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 6771f1855b96..4b2b194f4f1f 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -3,8 +3,11 @@
* Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
* Released under terms in GPL version 2. See COPYING.
*/
+#include <linux/bpf.h>
+#include <linux/filter.h>
#include <linux/types.h>
#include <linux/mm.h>
+#include <linux/netdevice.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/rhashtable.h>
@@ -45,8 +48,8 @@ static u32 xdp_mem_id_hashfn(const void *data, u32 len, u32 seed)
BUILD_BUG_ON(FIELD_SIZEOF(struct xdp_mem_allocator, mem.id)
!= sizeof(u32));
- /* Use cyclic increasing ID as direct hash key, see rht_bucket_index */
- return key << RHT_HASH_RESERVED_SPACE;
+ /* Use cyclic increasing ID as direct hash key */
+ return key;
}
static int xdp_mem_id_cmp(struct rhashtable_compare_arg *arg,
@@ -91,30 +94,33 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu)
kfree(xa);
}
-static void __xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
+void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
{
struct xdp_mem_allocator *xa;
int id = xdp_rxq->mem.id;
- int err;
- if (id == 0)
+ if (xdp_rxq->reg_state != REG_STATE_REGISTERED) {
+ WARN(1, "Missing register, driver bug");
return;
+ }
- mutex_lock(&mem_id_lock);
-
- xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params);
- if (!xa) {
- mutex_unlock(&mem_id_lock);
+ if (xdp_rxq->mem.type != MEM_TYPE_PAGE_POOL &&
+ xdp_rxq->mem.type != MEM_TYPE_ZERO_COPY) {
return;
}
- err = rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params);
- WARN_ON(err);
+ if (id == 0)
+ return;
+
+ mutex_lock(&mem_id_lock);
- call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free);
+ xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params);
+ if (xa && !rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params))
+ call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free);
mutex_unlock(&mem_id_lock);
}
+EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg_mem_model);
void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq)
{
@@ -124,7 +130,7 @@ void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq)
WARN(!(xdp_rxq->reg_state == REG_STATE_REGISTERED), "Driver BUG");
- __xdp_rxq_info_unreg_mem_model(xdp_rxq);
+ xdp_rxq_info_unreg_mem_model(xdp_rxq);
xdp_rxq->reg_state = REG_STATE_UNREGISTERED;
xdp_rxq->dev = NULL;
@@ -327,10 +333,12 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
/* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */
xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
page = virt_to_head_page(data);
- if (xa)
+ if (xa) {
+ napi_direct &= !xdp_return_frame_no_direct();
page_pool_put_page(xa->page_pool, page, napi_direct);
- else
+ } else {
put_page(page);
+ }
rcu_read_unlock();
break;
case MEM_TYPE_PAGE_SHARED:
@@ -345,8 +353,7 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
rcu_read_lock();
/* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */
xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
- if (!WARN_ON_ONCE(!xa))
- xa->zc_alloc->free(xa->zc_alloc, handle);
+ xa->zc_alloc->free(xa->zc_alloc, handle);
rcu_read_unlock();
default:
/* Not possible, checked in xdp_rxq_info_reg_mem_model() */
@@ -371,3 +378,72 @@ void xdp_return_buff(struct xdp_buff *xdp)
__xdp_return(xdp->data, &xdp->rxq->mem, true, xdp->handle);
}
EXPORT_SYMBOL_GPL(xdp_return_buff);
+
+int xdp_attachment_query(struct xdp_attachment_info *info,
+ struct netdev_bpf *bpf)
+{
+ bpf->prog_id = info->prog ? info->prog->aux->id : 0;
+ bpf->prog_flags = info->prog ? info->flags : 0;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(xdp_attachment_query);
+
+bool xdp_attachment_flags_ok(struct xdp_attachment_info *info,
+ struct netdev_bpf *bpf)
+{
+ if (info->prog && (bpf->flags ^ info->flags) & XDP_FLAGS_MODES) {
+ NL_SET_ERR_MSG(bpf->extack,
+ "program loaded with different flags");
+ return false;
+ }
+ return true;
+}
+EXPORT_SYMBOL_GPL(xdp_attachment_flags_ok);
+
+void xdp_attachment_setup(struct xdp_attachment_info *info,
+ struct netdev_bpf *bpf)
+{
+ if (info->prog)
+ bpf_prog_put(info->prog);
+ info->prog = bpf->prog;
+ info->flags = bpf->flags;
+}
+EXPORT_SYMBOL_GPL(xdp_attachment_setup);
+
+struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp)
+{
+ unsigned int metasize, totsize;
+ void *addr, *data_to_copy;
+ struct xdp_frame *xdpf;
+ struct page *page;
+
+ /* Clone into a MEM_TYPE_PAGE_ORDER0 xdp_frame. */
+ metasize = xdp_data_meta_unsupported(xdp) ? 0 :
+ xdp->data - xdp->data_meta;
+ totsize = xdp->data_end - xdp->data + metasize;
+
+ if (sizeof(*xdpf) + totsize > PAGE_SIZE)
+ return NULL;
+
+ page = dev_alloc_page();
+ if (!page)
+ return NULL;
+
+ addr = page_to_virt(page);
+ xdpf = addr;
+ memset(xdpf, 0, sizeof(*xdpf));
+
+ addr += sizeof(*xdpf);
+ data_to_copy = metasize ? xdp->data_meta : xdp->data;
+ memcpy(addr, data_to_copy, totsize);
+
+ xdpf->data = addr + metasize;
+ xdpf->len = totsize - metasize;
+ xdpf->headroom = 0;
+ xdpf->metasize = metasize;
+ xdpf->mem.type = MEM_TYPE_PAGE_ORDER0;
+
+ xdp_return_buff(xdp);
+ return xdpf;
+}
+EXPORT_SYMBOL_GPL(xdp_convert_zc_to_xdp_frame);
diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c
index 2589a6b78aa1..a556cd708885 100644
--- a/net/dcb/dcbnl.c
+++ b/net/dcb/dcbnl.c
@@ -1786,7 +1786,7 @@ static struct dcb_app_type *dcb_app_lookup(const struct dcb_app *app,
if (itr->app.selector == app->selector &&
itr->app.protocol == app->protocol &&
itr->ifindex == ifindex &&
- (!prio || itr->app.priority == prio))
+ ((prio == -1) || itr->app.priority == prio))
return itr;
}
@@ -1821,7 +1821,8 @@ u8 dcb_getapp(struct net_device *dev, struct dcb_app *app)
u8 prio = 0;
spin_lock_bh(&dcb_lock);
- if ((itr = dcb_app_lookup(app, dev->ifindex, 0)))
+ itr = dcb_app_lookup(app, dev->ifindex, -1);
+ if (itr)
prio = itr->app.priority;
spin_unlock_bh(&dcb_lock);
@@ -1849,7 +1850,8 @@ int dcb_setapp(struct net_device *dev, struct dcb_app *new)
spin_lock_bh(&dcb_lock);
/* Search for existing match and replace */
- if ((itr = dcb_app_lookup(new, dev->ifindex, 0))) {
+ itr = dcb_app_lookup(new, dev->ifindex, -1);
+ if (itr) {
if (new->priority)
itr->app.priority = new->priority;
else {
@@ -1882,7 +1884,8 @@ u8 dcb_ieee_getapp_mask(struct net_device *dev, struct dcb_app *app)
u8 prio = 0;
spin_lock_bh(&dcb_lock);
- if ((itr = dcb_app_lookup(app, dev->ifindex, 0)))
+ itr = dcb_app_lookup(app, dev->ifindex, -1);
+ if (itr)
prio |= 1 << itr->app.priority;
spin_unlock_bh(&dcb_lock);
@@ -1955,6 +1958,92 @@ int dcb_ieee_delapp(struct net_device *dev, struct dcb_app *del)
}
EXPORT_SYMBOL(dcb_ieee_delapp);
+/**
+ * dcb_ieee_getapp_prio_dscp_mask_map - For a given device, find mapping from
+ * priorities to the DSCP values assigned to that priority. Initialize p_map
+ * such that each map element holds a bit mask of DSCP values configured for
+ * that priority by APP entries.
+ */
+void dcb_ieee_getapp_prio_dscp_mask_map(const struct net_device *dev,
+ struct dcb_ieee_app_prio_map *p_map)
+{
+ int ifindex = dev->ifindex;
+ struct dcb_app_type *itr;
+ u8 prio;
+
+ memset(p_map->map, 0, sizeof(p_map->map));
+
+ spin_lock_bh(&dcb_lock);
+ list_for_each_entry(itr, &dcb_app_list, list) {
+ if (itr->ifindex == ifindex &&
+ itr->app.selector == IEEE_8021QAZ_APP_SEL_DSCP &&
+ itr->app.protocol < 64 &&
+ itr->app.priority < IEEE_8021QAZ_MAX_TCS) {
+ prio = itr->app.priority;
+ p_map->map[prio] |= 1ULL << itr->app.protocol;
+ }
+ }
+ spin_unlock_bh(&dcb_lock);
+}
+EXPORT_SYMBOL(dcb_ieee_getapp_prio_dscp_mask_map);
+
+/**
+ * dcb_ieee_getapp_dscp_prio_mask_map - For a given device, find mapping from
+ * DSCP values to the priorities assigned to that DSCP value. Initialize p_map
+ * such that each map element holds a bit mask of priorities configured for a
+ * given DSCP value by APP entries.
+ */
+void
+dcb_ieee_getapp_dscp_prio_mask_map(const struct net_device *dev,
+ struct dcb_ieee_app_dscp_map *p_map)
+{
+ int ifindex = dev->ifindex;
+ struct dcb_app_type *itr;
+
+ memset(p_map->map, 0, sizeof(p_map->map));
+
+ spin_lock_bh(&dcb_lock);
+ list_for_each_entry(itr, &dcb_app_list, list) {
+ if (itr->ifindex == ifindex &&
+ itr->app.selector == IEEE_8021QAZ_APP_SEL_DSCP &&
+ itr->app.protocol < 64 &&
+ itr->app.priority < IEEE_8021QAZ_MAX_TCS)
+ p_map->map[itr->app.protocol] |= 1 << itr->app.priority;
+ }
+ spin_unlock_bh(&dcb_lock);
+}
+EXPORT_SYMBOL(dcb_ieee_getapp_dscp_prio_mask_map);
+
+/**
+ * Per 802.1Q-2014, the selector value of 1 is used for matching on Ethernet
+ * type, with valid PID values >= 1536. A special meaning is then assigned to
+ * protocol value of 0: "default priority. For use when priority is not
+ * otherwise specified".
+ *
+ * dcb_ieee_getapp_default_prio_mask - For a given device, find all APP entries
+ * of the form {$PRIO, ETHERTYPE, 0} and construct a bit mask of all default
+ * priorities set by these entries.
+ */
+u8 dcb_ieee_getapp_default_prio_mask(const struct net_device *dev)
+{
+ int ifindex = dev->ifindex;
+ struct dcb_app_type *itr;
+ u8 mask = 0;
+
+ spin_lock_bh(&dcb_lock);
+ list_for_each_entry(itr, &dcb_app_list, list) {
+ if (itr->ifindex == ifindex &&
+ itr->app.selector == IEEE_8021QAZ_APP_SEL_ETHERTYPE &&
+ itr->app.protocol == 0 &&
+ itr->app.priority < IEEE_8021QAZ_MAX_TCS)
+ mask |= 1 << itr->app.priority;
+ }
+ spin_unlock_bh(&dcb_lock);
+
+ return mask;
+}
+EXPORT_SYMBOL(dcb_ieee_getapp_default_prio_mask);
+
static int __init dcbnl_init(void)
{
INIT_LIST_HEAD(&dcb_app_list);
diff --git a/net/dccp/input.c b/net/dccp/input.c
index d28d46bff6ab..85d6c879383d 100644
--- a/net/dccp/input.c
+++ b/net/dccp/input.c
@@ -606,11 +606,13 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
if (sk->sk_state == DCCP_LISTEN) {
if (dh->dccph_type == DCCP_PKT_REQUEST) {
/* It is possible that we process SYN packets from backlog,
- * so we need to make sure to disable BH right there.
+ * so we need to make sure to disable BH and RCU right there.
*/
+ rcu_read_lock();
local_bh_disable();
acceptable = inet_csk(sk)->icsk_af_ops->conn_request(sk, skb) >= 0;
local_bh_enable();
+ rcu_read_unlock();
if (!acceptable)
return 1;
consume_skb(skb);
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index b08feb219b44..8e08cea6f178 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -493,9 +493,11 @@ static int dccp_v4_send_response(const struct sock *sk, struct request_sock *req
dh->dccph_checksum = dccp_v4_csum_finish(skb, ireq->ir_loc_addr,
ireq->ir_rmt_addr);
+ rcu_read_lock();
err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
ireq->ir_rmt_addr,
- ireq_opt_deref(ireq));
+ rcu_dereference(ireq->ireq_opt));
+ rcu_read_unlock();
err = net_xmit_eval(err);
}
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 0d56e36a6db7..43733accf58e 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -325,7 +325,7 @@ __poll_t dccp_poll(struct file *file, struct socket *sock,
__poll_t mask;
struct sock *sk = sock->sk;
- sock_poll_wait(file, sk_sleep(sk), wait);
+ sock_poll_wait(file, sock, wait);
if (sk->sk_state == DCCP_LISTEN)
return inet_csk_listen_poll(sk);
diff --git a/net/decnet/Kconfig b/net/decnet/Kconfig
index f3393e154f0f..dcc74956badd 100644
--- a/net/decnet/Kconfig
+++ b/net/decnet/Kconfig
@@ -40,4 +40,3 @@ config DECNET_ROUTER
to work.
See <file:Documentation/networking/decnet.txt> for more information.
-
diff --git a/net/decnet/Makefile b/net/decnet/Makefile
index 9e38122d942b..07b38e441b2d 100644
--- a/net/decnet/Makefile
+++ b/net/decnet/Makefile
@@ -8,4 +8,3 @@ decnet-$(CONFIG_DECNET_ROUTER) += dn_fib.o dn_rules.o dn_table.o
decnet-y += sysctl_net_decnet.o
obj-$(CONFIG_NETFILTER) += netfilter/
-
diff --git a/net/decnet/TODO b/net/decnet/TODO
index ebb5ac69d128..358e9eb49016 100644
--- a/net/decnet/TODO
+++ b/net/decnet/TODO
@@ -16,14 +16,14 @@ Steve's quick list of things that need finishing off:
o Verify errors etc. against POSIX 1003.1g (draft)
- o Using send/recvmsg() to get at connect/disconnect data (POSIX 1003.1g)
+ o Using send/recvmsg() to get at connect/disconnect data (POSIX 1003.1g)
[maybe this should be done at socket level... the control data in the
send/recvmsg() calls should simply be a vector of set/getsockopt()
calls]
o check MSG_CTRUNC is set where it should be.
- o Find all the commonality between DECnet and IPv4 routing code and extract
+ o Find all the commonality between DECnet and IPv4 routing code and extract
it into a small library of routines. [probably a project for 2.7.xx]
o Add perfect socket hashing - an idea suggested by Paul Koning. Currently
@@ -38,4 +38,3 @@ Steve's quick list of things that need finishing off:
o DECnet sendpages() function
o AIO for DECnet
-
diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c
index bfd43e8f2c06..d0b3e69c6b39 100644
--- a/net/decnet/dn_dev.c
+++ b/net/decnet/dn_dev.c
@@ -1363,7 +1363,7 @@ static int dn_dev_seq_show(struct seq_file *seq, void *v)
seq_printf(seq, "%-8s %1s %04u %04u %04lu %04lu"
" %04hu %03d %02x %-10s %-7s %-7s\n",
- dev->name ? dev->name : "???",
+ dev->name,
dn_type2asc(dn_db->parms.mode),
0, 0,
dn_db->t3, dn_db->parms.t3,
diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c
index fce94cbd4378..f78fe58eafc8 100644
--- a/net/decnet/dn_fib.c
+++ b/net/decnet/dn_fib.c
@@ -797,5 +797,3 @@ void __init dn_fib_init(void)
rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_DELROUTE,
dn_fib_rtm_delroute, NULL, 0);
}
-
-
diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c
index 1b2120645730..2fb5e055ba25 100644
--- a/net/decnet/dn_nsp_in.c
+++ b/net/decnet/dn_nsp_in.c
@@ -491,6 +491,7 @@ static void dn_nsp_disc_conf(struct sock *sk, struct sk_buff *skb)
break;
case DN_RUN:
sk->sk_shutdown |= SHUTDOWN_MASK;
+ /* fall through */
case DN_CC:
scp->state = DN_CN;
}
@@ -911,4 +912,3 @@ free_out:
return NET_RX_SUCCESS;
}
-
diff --git a/net/decnet/dn_nsp_out.c b/net/decnet/dn_nsp_out.c
index 56a52a004c56..a1779de6bd9c 100644
--- a/net/decnet/dn_nsp_out.c
+++ b/net/decnet/dn_nsp_out.c
@@ -701,4 +701,3 @@ void dn_nsp_send_conninit(struct sock *sk, unsigned char msgflg)
dn_nsp_send(skb);
}
-
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index e74765024d88..1c002c0fb712 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -404,7 +404,7 @@ void dn_rt_cache_flush(int delay)
if (delay <= 0) {
spin_unlock_bh(&dn_rt_flush_lock);
- dn_run_flush(0);
+ dn_run_flush(NULL);
return;
}
@@ -1920,9 +1920,8 @@ void __init dn_route_init(void)
void __exit dn_route_cleanup(void)
{
del_timer(&dn_route_timer);
- dn_run_flush(0);
+ dn_run_flush(NULL);
remove_proc_entry("decnet_cache", init_net.proc_net);
dst_entries_destroy(&dn_dst_ops);
}
-
diff --git a/net/decnet/dn_rules.c b/net/decnet/dn_rules.c
index 72236695db3d..4a4e3c17740c 100644
--- a/net/decnet/dn_rules.c
+++ b/net/decnet/dn_rules.c
@@ -256,5 +256,3 @@ void __exit dn_fib_rules_cleanup(void)
rtnl_unlock();
rcu_barrier();
}
-
-
diff --git a/net/decnet/netfilter/Makefile b/net/decnet/netfilter/Makefile
index 255c1ae9daeb..b579e52130aa 100644
--- a/net/decnet/netfilter/Makefile
+++ b/net/decnet/netfilter/Makefile
@@ -3,4 +3,3 @@
#
obj-$(CONFIG_DECNET_NF_GRABULATOR) += dn_rtmsg.o
-
diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c
index ab395e55cd78..a4faacadd8a8 100644
--- a/net/decnet/netfilter/dn_rtmsg.c
+++ b/net/decnet/netfilter/dn_rtmsg.c
@@ -158,4 +158,3 @@ MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_DNRTMSG);
module_init(dn_rtmsg_init);
module_exit(dn_rtmsg_fini);
-
diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c
index 0c9478b91fa5..a65d553e730d 100644
--- a/net/dns_resolver/dns_key.c
+++ b/net/dns_resolver/dns_key.c
@@ -29,6 +29,7 @@
#include <linux/keyctl.h>
#include <linux/err.h>
#include <linux/seq_file.h>
+#include <linux/dns_resolver.h>
#include <keys/dns_resolver-type.h>
#include <keys/user-type.h>
#include "internal.h"
@@ -48,27 +49,86 @@ const struct cred *dns_resolver_cache;
/*
* Preparse instantiation data for a dns_resolver key.
*
- * The data must be a NUL-terminated string, with the NUL char accounted in
- * datalen.
+ * For normal hostname lookups, the data must be a NUL-terminated string, with
+ * the NUL char accounted in datalen.
*
* If the data contains a '#' characters, then we take the clause after each
* one to be an option of the form 'key=value'. The actual data of interest is
* the string leading up to the first '#'. For instance:
*
* "ip1,ip2,...#foo=bar"
+ *
+ * For server list requests, the data must begin with a NUL char and be
+ * followed by a byte indicating the version of the data format. Version 1
+ * looks something like (note this is packed):
+ *
+ * u8 Non-string marker (ie. 0)
+ * u8 Content (DNS_PAYLOAD_IS_*)
+ * u8 Version (e.g. 1)
+ * u8 Source of server list
+ * u8 Lookup status of server list
+ * u8 Number of servers
+ * foreach-server {
+ * __le16 Name length
+ * __le16 Priority (as per SRV record, low first)
+ * __le16 Weight (as per SRV record, higher first)
+ * __le16 Port
+ * u8 Source of address list
+ * u8 Lookup status of address list
+ * u8 Protocol (DNS_SERVER_PROTOCOL_*)
+ * u8 Number of addresses
+ * char[] Name (not NUL-terminated)
+ * foreach-address {
+ * u8 Family (DNS_ADDRESS_IS_*)
+ * union {
+ * u8[4] ipv4_addr
+ * u8[16] ipv6_addr
+ * }
+ * }
+ * }
+ *
*/
static int
dns_resolver_preparse(struct key_preparsed_payload *prep)
{
+ const struct dns_payload_header *bin;
struct user_key_payload *upayload;
unsigned long derrno;
int ret;
int datalen = prep->datalen, result_len = 0;
const char *data = prep->data, *end, *opt;
+ if (datalen <= 1 || !data)
+ return -EINVAL;
+
+ if (data[0] == 0) {
+ /* It may be a server list. */
+ if (datalen <= sizeof(*bin))
+ return -EINVAL;
+
+ bin = (const struct dns_payload_header *)data;
+ kenter("[%u,%u],%u", bin->content, bin->version, datalen);
+ if (bin->content != DNS_PAYLOAD_IS_SERVER_LIST) {
+ pr_warn_ratelimited(
+ "dns_resolver: Unsupported content type (%u)\n",
+ bin->content);
+ return -EINVAL;
+ }
+
+ if (bin->version != 1) {
+ pr_warn_ratelimited(
+ "dns_resolver: Unsupported server list version (%u)\n",
+ bin->version);
+ return -EINVAL;
+ }
+
+ result_len = datalen;
+ goto store_result;
+ }
+
kenter("'%*.*s',%u", datalen, datalen, data, datalen);
- if (datalen <= 1 || !data || data[datalen - 1] != '\0')
+ if (!data || data[datalen - 1] != '\0')
return -EINVAL;
datalen--;
@@ -144,6 +204,7 @@ dns_resolver_preparse(struct key_preparsed_payload *prep)
return 0;
}
+store_result:
kdebug("store result");
prep->quotalen = result_len;
@@ -320,4 +381,3 @@ static void __exit exit_dns_resolver(void)
module_init(init_dns_resolver)
module_exit(exit_dns_resolver)
MODULE_LICENSE("GPL");
-
diff --git a/net/dns_resolver/dns_query.c b/net/dns_resolver/dns_query.c
index 49da67034f29..76338c38738a 100644
--- a/net/dns_resolver/dns_query.c
+++ b/net/dns_resolver/dns_query.c
@@ -148,12 +148,9 @@ int dns_query(const char *type, const char *name, size_t namelen,
if (_result) {
ret = -ENOMEM;
- *_result = kmalloc(len + 1, GFP_KERNEL);
+ *_result = kmemdup_nul(upayload->data, len, GFP_KERNEL);
if (!*_result)
goto put;
-
- memcpy(*_result, upayload->data, len);
- (*_result)[len] = '\0';
}
if (_expiry)
diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index 4183e4ba27a5..48c41918fb35 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -38,6 +38,9 @@ config NET_DSA_TAG_DSA
config NET_DSA_TAG_EDSA
bool
+config NET_DSA_TAG_GSWIP
+ bool
+
config NET_DSA_TAG_KSZ
bool
diff --git a/net/dsa/Makefile b/net/dsa/Makefile
index 9e4d3536f977..6e721f7a2947 100644
--- a/net/dsa/Makefile
+++ b/net/dsa/Makefile
@@ -9,6 +9,7 @@ dsa_core-$(CONFIG_NET_DSA_TAG_BRCM) += tag_brcm.o
dsa_core-$(CONFIG_NET_DSA_TAG_BRCM_PREPEND) += tag_brcm.o
dsa_core-$(CONFIG_NET_DSA_TAG_DSA) += tag_dsa.o
dsa_core-$(CONFIG_NET_DSA_TAG_EDSA) += tag_edsa.o
+dsa_core-$(CONFIG_NET_DSA_TAG_GSWIP) += tag_gswip.o
dsa_core-$(CONFIG_NET_DSA_TAG_KSZ) += tag_ksz.o
dsa_core-$(CONFIG_NET_DSA_TAG_LAN9303) += tag_lan9303.o
dsa_core-$(CONFIG_NET_DSA_TAG_MTK) += tag_mtk.o
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index e63c554e0623..a69c1790bbfc 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -19,12 +19,10 @@
#include <linux/of_mdio.h>
#include <linux/of_platform.h>
#include <linux/of_net.h>
-#include <linux/of_gpio.h>
#include <linux/netdevice.h>
#include <linux/sysfs.h>
#include <linux/phy_fixed.h>
#include <linux/ptp_classify.h>
-#include <linux/gpio/consumer.h>
#include <linux/etherdevice.h>
#include "dsa_priv.h"
@@ -54,6 +52,9 @@ const struct dsa_device_ops *dsa_device_ops[DSA_TAG_LAST] = {
#ifdef CONFIG_NET_DSA_TAG_EDSA
[DSA_TAG_PROTO_EDSA] = &edsa_netdev_ops,
#endif
+#ifdef CONFIG_NET_DSA_TAG_GSWIP
+ [DSA_TAG_PROTO_GSWIP] = &gswip_netdev_ops,
+#endif
#ifdef CONFIG_NET_DSA_TAG_KSZ
[DSA_TAG_PROTO_KSZ] = &ksz_netdev_ops,
#endif
@@ -72,6 +73,52 @@ const struct dsa_device_ops *dsa_device_ops[DSA_TAG_LAST] = {
[DSA_TAG_PROTO_NONE] = &none_ops,
};
+const char *dsa_tag_protocol_to_str(const struct dsa_device_ops *ops)
+{
+ const char *protocol_name[DSA_TAG_LAST] = {
+#ifdef CONFIG_NET_DSA_TAG_BRCM
+ [DSA_TAG_PROTO_BRCM] = "brcm",
+#endif
+#ifdef CONFIG_NET_DSA_TAG_BRCM_PREPEND
+ [DSA_TAG_PROTO_BRCM_PREPEND] = "brcm-prepend",
+#endif
+#ifdef CONFIG_NET_DSA_TAG_DSA
+ [DSA_TAG_PROTO_DSA] = "dsa",
+#endif
+#ifdef CONFIG_NET_DSA_TAG_EDSA
+ [DSA_TAG_PROTO_EDSA] = "edsa",
+#endif
+#ifdef CONFIG_NET_DSA_TAG_GSWIP
+ [DSA_TAG_PROTO_GSWIP] = "gswip",
+#endif
+#ifdef CONFIG_NET_DSA_TAG_KSZ
+ [DSA_TAG_PROTO_KSZ] = "ksz",
+#endif
+#ifdef CONFIG_NET_DSA_TAG_LAN9303
+ [DSA_TAG_PROTO_LAN9303] = "lan9303",
+#endif
+#ifdef CONFIG_NET_DSA_TAG_MTK
+ [DSA_TAG_PROTO_MTK] = "mtk",
+#endif
+#ifdef CONFIG_NET_DSA_TAG_QCA
+ [DSA_TAG_PROTO_QCA] = "qca",
+#endif
+#ifdef CONFIG_NET_DSA_TAG_TRAILER
+ [DSA_TAG_PROTO_TRAILER] = "trailer",
+#endif
+ [DSA_TAG_PROTO_NONE] = "none",
+ };
+ unsigned int i;
+
+ BUILD_BUG_ON(ARRAY_SIZE(protocol_name) != DSA_TAG_LAST);
+
+ for (i = 0; i < ARRAY_SIZE(dsa_device_ops); i++)
+ if (ops == dsa_device_ops[i])
+ return protocol_name[i];
+
+ return protocol_name[DSA_TAG_PROTO_NONE];
+};
+
const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol)
{
const struct dsa_device_ops *ops;
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index dc5d9af3dc80..a1917025e155 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -775,6 +775,20 @@ struct dsa_switch *dsa_switch_alloc(struct device *dev, size_t n)
if (!ds)
return NULL;
+ /* We avoid allocating memory outside dsa_switch
+ * if it is not needed.
+ */
+ if (n <= sizeof(ds->_bitmap) * 8) {
+ ds->bitmap = &ds->_bitmap;
+ } else {
+ ds->bitmap = devm_kcalloc(dev,
+ BITS_TO_LONGS(n),
+ sizeof(unsigned long),
+ GFP_KERNEL);
+ if (unlikely(!ds->bitmap))
+ return NULL;
+ }
+
ds->dev = dev;
ds->num_ports = n;
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 3964c6f7a7c0..9e4fd04ab53c 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -86,6 +86,7 @@ struct dsa_slave_priv {
/* dsa.c */
const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol);
bool dsa_schedule_work(struct work_struct *work);
+const char *dsa_tag_protocol_to_str(const struct dsa_device_ops *ops);
/* legacy.c */
#if IS_ENABLED(CONFIG_NET_DSA_LEGACY)
@@ -205,6 +206,9 @@ extern const struct dsa_device_ops dsa_netdev_ops;
/* tag_edsa.c */
extern const struct dsa_device_ops edsa_netdev_ops;
+/* tag_gswip.c */
+extern const struct dsa_device_ops gswip_netdev_ops;
+
/* tag_ksz.c */
extern const struct dsa_device_ops ksz_netdev_ops;
diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c
index 42a7b85b84e1..cb42939db776 100644
--- a/net/dsa/legacy.c
+++ b/net/dsa/legacy.c
@@ -392,8 +392,7 @@ static void dsa_of_free_platform_data(struct dsa_platform_data *pd)
}
/* Drop our reference to the MDIO bus device */
- if (pd->chip[i].host_dev)
- put_device(pd->chip[i].host_dev);
+ put_device(pd->chip[i].host_dev);
}
kfree(pd->chip);
}
@@ -687,8 +686,7 @@ static void dsa_shutdown(struct platform_device *pdev)
#ifdef CONFIG_PM_SLEEP
static int dsa_suspend(struct device *d)
{
- struct platform_device *pdev = to_platform_device(d);
- struct dsa_switch_tree *dst = platform_get_drvdata(pdev);
+ struct dsa_switch_tree *dst = dev_get_drvdata(d);
int i, ret = 0;
for (i = 0; i < dst->pd->nr_chips; i++) {
@@ -703,8 +701,7 @@ static int dsa_suspend(struct device *d)
static int dsa_resume(struct device *d)
{
- struct platform_device *pdev = to_platform_device(d);
- struct dsa_switch_tree *dst = platform_get_drvdata(pdev);
+ struct dsa_switch_tree *dst = dev_get_drvdata(d);
int i, ret = 0;
for (i = 0; i < dst->pd->nr_chips; i++) {
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 9864bcd3d317..7d0c19e7edcf 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -722,7 +722,7 @@ static void dsa_slave_netpoll_cleanup(struct net_device *dev)
p->netpoll = NULL;
- __netpoll_free_async(netpoll);
+ __netpoll_free(netpoll);
}
static void dsa_slave_poll_controller(struct net_device *dev)
@@ -767,7 +767,6 @@ static int dsa_slave_add_cls_matchall(struct net_device *dev,
const struct tc_action *a;
struct dsa_port *to_dp;
int err = -EOPNOTSUPP;
- LIST_HEAD(actions);
if (!ds->ops->port_mirror_add)
return err;
@@ -775,8 +774,7 @@ static int dsa_slave_add_cls_matchall(struct net_device *dev,
if (!tcf_exts_has_one_action(cls->exts))
return err;
- tcf_exts_to_list(cls->exts, &actions);
- a = list_first_entry(&actions, struct tc_action, list);
+ a = tcf_exts_first_action(cls->exts);
if (is_tcf_mirred_egress_mirror(a) && protocol == htons(ETH_P_ALL)) {
struct dsa_mall_mirror_tc_entry *mirror;
@@ -900,7 +898,7 @@ static int dsa_slave_setup_tc_block(struct net_device *dev,
switch (f->command) {
case TC_BLOCK_BIND:
- return tcf_block_cb_register(f->block, cb, dev, dev);
+ return tcf_block_cb_register(f->block, cb, dev, dev, f->extack);
case TC_BLOCK_UNBIND:
tcf_block_cb_unregister(f->block, cb, dev);
return 0;
@@ -1060,6 +1058,27 @@ static struct device_type dsa_type = {
.name = "dsa",
};
+static ssize_t tagging_show(struct device *d, struct device_attribute *attr,
+ char *buf)
+{
+ struct net_device *dev = to_net_dev(d);
+ struct dsa_port *dp = dsa_slave_to_port(dev);
+
+ return sprintf(buf, "%s\n",
+ dsa_tag_protocol_to_str(dp->cpu_dp->tag_ops));
+}
+static DEVICE_ATTR_RO(tagging);
+
+static struct attribute *dsa_slave_attrs[] = {
+ &dev_attr_tagging.attr,
+ NULL
+};
+
+static const struct attribute_group dsa_group = {
+ .name = "dsa",
+ .attrs = dsa_slave_attrs,
+};
+
static void dsa_slave_phylink_validate(struct net_device *dev,
unsigned long *supported,
struct phylink_link_state *state)
@@ -1355,8 +1374,14 @@ int dsa_slave_create(struct dsa_port *port)
goto out_phy;
}
+ ret = sysfs_create_group(&slave_dev->dev.kobj, &dsa_group);
+ if (ret)
+ goto out_unreg;
+
return 0;
+out_unreg:
+ unregister_netdev(slave_dev);
out_phy:
rtnl_lock();
phylink_disconnect_phy(p->dp->pl);
@@ -1380,6 +1405,7 @@ void dsa_slave_destroy(struct net_device *slave_dev)
rtnl_unlock();
dsa_slave_notify(slave_dev, DSA_PORT_UNREGISTER);
+ sysfs_remove_group(&slave_dev->dev.kobj, &dsa_group);
unregister_netdev(slave_dev);
phylink_destroy(dp->pl);
free_percpu(p->stats64);
@@ -1452,6 +1478,7 @@ static void dsa_slave_switchdev_event_work(struct work_struct *work)
netdev_dbg(dev, "fdb add failed err=%d\n", err);
break;
}
+ fdb_info->offloaded = true;
call_switchdev_notifiers(SWITCHDEV_FDB_OFFLOADED, dev,
&fdb_info->info);
break;
diff --git a/net/dsa/switch.c b/net/dsa/switch.c
index b93511726069..142b294d3446 100644
--- a/net/dsa/switch.c
+++ b/net/dsa/switch.c
@@ -136,21 +136,20 @@ static int dsa_switch_mdb_add(struct dsa_switch *ds,
{
const struct switchdev_obj_port_mdb *mdb = info->mdb;
struct switchdev_trans *trans = info->trans;
- DECLARE_BITMAP(group, ds->num_ports);
int port;
/* Build a mask of Multicast group members */
- bitmap_zero(group, ds->num_ports);
+ bitmap_zero(ds->bitmap, ds->num_ports);
if (ds->index == info->sw_index)
- set_bit(info->port, group);
+ set_bit(info->port, ds->bitmap);
for (port = 0; port < ds->num_ports; port++)
if (dsa_is_dsa_port(ds, port))
- set_bit(port, group);
+ set_bit(port, ds->bitmap);
if (switchdev_trans_ph_prepare(trans))
- return dsa_switch_mdb_prepare_bitmap(ds, mdb, group);
+ return dsa_switch_mdb_prepare_bitmap(ds, mdb, ds->bitmap);
- dsa_switch_mdb_add_bitmap(ds, mdb, group);
+ dsa_switch_mdb_add_bitmap(ds, mdb, ds->bitmap);
return 0;
}
@@ -204,21 +203,20 @@ static int dsa_switch_vlan_add(struct dsa_switch *ds,
{
const struct switchdev_obj_port_vlan *vlan = info->vlan;
struct switchdev_trans *trans = info->trans;
- DECLARE_BITMAP(members, ds->num_ports);
int port;
/* Build a mask of VLAN members */
- bitmap_zero(members, ds->num_ports);
+ bitmap_zero(ds->bitmap, ds->num_ports);
if (ds->index == info->sw_index)
- set_bit(info->port, members);
+ set_bit(info->port, ds->bitmap);
for (port = 0; port < ds->num_ports; port++)
if (dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port))
- set_bit(port, members);
+ set_bit(port, ds->bitmap);
if (switchdev_trans_ph_prepare(trans))
- return dsa_switch_vlan_prepare_bitmap(ds, vlan, members);
+ return dsa_switch_vlan_prepare_bitmap(ds, vlan, ds->bitmap);
- dsa_switch_vlan_add_bitmap(ds, vlan, members);
+ dsa_switch_vlan_add_bitmap(ds, vlan, ds->bitmap);
return 0;
}
diff --git a/net/dsa/tag_gswip.c b/net/dsa/tag_gswip.c
new file mode 100644
index 000000000000..49e9b73f1be3
--- /dev/null
+++ b/net/dsa/tag_gswip.c
@@ -0,0 +1,109 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel / Lantiq GSWIP V2.0 PMAC tag support
+ *
+ * Copyright (C) 2017 - 2018 Hauke Mehrtens <hauke@hauke-m.de>
+ */
+
+#include <linux/bitops.h>
+#include <linux/etherdevice.h>
+#include <linux/skbuff.h>
+#include <net/dsa.h>
+
+#include "dsa_priv.h"
+
+#define GSWIP_TX_HEADER_LEN 4
+
+/* special tag in TX path header */
+/* Byte 0 */
+#define GSWIP_TX_SLPID_SHIFT 0 /* source port ID */
+#define GSWIP_TX_SLPID_CPU 2
+#define GSWIP_TX_SLPID_APP1 3
+#define GSWIP_TX_SLPID_APP2 4
+#define GSWIP_TX_SLPID_APP3 5
+#define GSWIP_TX_SLPID_APP4 6
+#define GSWIP_TX_SLPID_APP5 7
+
+/* Byte 1 */
+#define GSWIP_TX_CRCGEN_DIS BIT(7)
+#define GSWIP_TX_DPID_SHIFT 0 /* destination group ID */
+#define GSWIP_TX_DPID_ELAN 0
+#define GSWIP_TX_DPID_EWAN 1
+#define GSWIP_TX_DPID_CPU 2
+#define GSWIP_TX_DPID_APP1 3
+#define GSWIP_TX_DPID_APP2 4
+#define GSWIP_TX_DPID_APP3 5
+#define GSWIP_TX_DPID_APP4 6
+#define GSWIP_TX_DPID_APP5 7
+
+/* Byte 2 */
+#define GSWIP_TX_PORT_MAP_EN BIT(7)
+#define GSWIP_TX_PORT_MAP_SEL BIT(6)
+#define GSWIP_TX_LRN_DIS BIT(5)
+#define GSWIP_TX_CLASS_EN BIT(4)
+#define GSWIP_TX_CLASS_SHIFT 0
+#define GSWIP_TX_CLASS_MASK GENMASK(3, 0)
+
+/* Byte 3 */
+#define GSWIP_TX_DPID_EN BIT(0)
+#define GSWIP_TX_PORT_MAP_SHIFT 1
+#define GSWIP_TX_PORT_MAP_MASK GENMASK(6, 1)
+
+#define GSWIP_RX_HEADER_LEN 8
+
+/* special tag in RX path header */
+/* Byte 7 */
+#define GSWIP_RX_SPPID_SHIFT 4
+#define GSWIP_RX_SPPID_MASK GENMASK(6, 4)
+
+static struct sk_buff *gswip_tag_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct dsa_port *dp = dsa_slave_to_port(dev);
+ int err;
+ u8 *gswip_tag;
+
+ err = skb_cow_head(skb, GSWIP_TX_HEADER_LEN);
+ if (err)
+ return NULL;
+
+ skb_push(skb, GSWIP_TX_HEADER_LEN);
+
+ gswip_tag = skb->data;
+ gswip_tag[0] = GSWIP_TX_SLPID_CPU;
+ gswip_tag[1] = GSWIP_TX_DPID_ELAN;
+ gswip_tag[2] = GSWIP_TX_PORT_MAP_EN | GSWIP_TX_PORT_MAP_SEL;
+ gswip_tag[3] = BIT(dp->index + GSWIP_TX_PORT_MAP_SHIFT) & GSWIP_TX_PORT_MAP_MASK;
+ gswip_tag[3] |= GSWIP_TX_DPID_EN;
+
+ return skb;
+}
+
+static struct sk_buff *gswip_tag_rcv(struct sk_buff *skb,
+ struct net_device *dev,
+ struct packet_type *pt)
+{
+ int port;
+ u8 *gswip_tag;
+
+ if (unlikely(!pskb_may_pull(skb, GSWIP_RX_HEADER_LEN)))
+ return NULL;
+
+ gswip_tag = skb->data - ETH_HLEN;
+
+ /* Get source port information */
+ port = (gswip_tag[7] & GSWIP_RX_SPPID_MASK) >> GSWIP_RX_SPPID_SHIFT;
+ skb->dev = dsa_master_find_slave(dev, 0, port);
+ if (!skb->dev)
+ return NULL;
+
+ /* remove GSWIP tag */
+ skb_pull_rcsum(skb, GSWIP_RX_HEADER_LEN);
+
+ return skb;
+}
+
+const struct dsa_device_ops gswip_netdev_ops = {
+ .xmit = gswip_tag_xmit,
+ .rcv = gswip_tag_rcv,
+};
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index ee28440f57c5..fd8faa0dfa61 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -427,13 +427,13 @@ ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len)
}
EXPORT_SYMBOL(sysfs_format_mac);
-struct sk_buff **eth_gro_receive(struct sk_buff **head,
- struct sk_buff *skb)
+struct sk_buff *eth_gro_receive(struct list_head *head, struct sk_buff *skb)
{
- struct sk_buff *p, **pp = NULL;
- struct ethhdr *eh, *eh2;
- unsigned int hlen, off_eth;
const struct packet_offload *ptype;
+ unsigned int hlen, off_eth;
+ struct sk_buff *pp = NULL;
+ struct ethhdr *eh, *eh2;
+ struct sk_buff *p;
__be16 type;
int flush = 1;
@@ -448,7 +448,7 @@ struct sk_buff **eth_gro_receive(struct sk_buff **head,
flush = 0;
- for (p = *head; p; p = p->next) {
+ list_for_each_entry(p, head, list) {
if (!NAPI_GRO_CB(p)->same_flow)
continue;
diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c
index 2cc224106b69..d14226ecfde4 100644
--- a/net/ieee802154/6lowpan/reassembly.c
+++ b/net/ieee802154/6lowpan/reassembly.c
@@ -25,7 +25,7 @@
#include <net/ieee802154_netdev.h>
#include <net/6lowpan.h>
-#include <net/ipv6.h>
+#include <net/ipv6_frag.h>
#include <net/inet_frag.h>
#include "6lowpan_i.h"
@@ -40,9 +40,6 @@ static int lowpan_frag_reasm(struct lowpan_frag_queue *fq,
static void lowpan_frag_init(struct inet_frag_queue *q, const void *a)
{
const struct frag_lowpan_compare_key *key = a;
- struct lowpan_frag_queue *fq;
-
- fq = container_of(q, struct lowpan_frag_queue, q);
BUILD_BUG_ON(sizeof(*key) > sizeof(q->key));
memcpy(&q->key, key, sizeof(*key));
@@ -52,10 +49,8 @@ static void lowpan_frag_expire(struct timer_list *t)
{
struct inet_frag_queue *frag = from_timer(frag, t, timer);
struct frag_queue *fq;
- struct net *net;
fq = container_of(frag, struct frag_queue, q);
- net = container_of(fq->q.net, struct net, ieee802154_lowpan.frags);
spin_lock(&fq->q.lock);
@@ -265,7 +260,7 @@ static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *prev,
}
sub_frag_mem_limit(fq->q.net, sum_truesize);
- head->next = NULL;
+ skb_mark_not_on_list(head);
head->dev = ldev;
head->tstamp = fq->q.stamp;
@@ -468,7 +463,6 @@ static int __net_init lowpan_frags_ns_sysctl_register(struct net *net)
table[0].data = &ieee802154_lowpan->frags.high_thresh;
table[0].extra1 = &ieee802154_lowpan->frags.low_thresh;
- table[0].extra2 = &init_net.ieee802154_lowpan.frags.high_thresh;
table[1].data = &ieee802154_lowpan->frags.low_thresh;
table[1].extra2 = &ieee802154_lowpan->frags.high_thresh;
table[2].data = &ieee802154_lowpan->frags.timeout;
diff --git a/net/ieee802154/6lowpan/tx.c b/net/ieee802154/6lowpan/tx.c
index e6ff5128e61a..ca53efa17be1 100644
--- a/net/ieee802154/6lowpan/tx.c
+++ b/net/ieee802154/6lowpan/tx.c
@@ -265,9 +265,24 @@ netdev_tx_t lowpan_xmit(struct sk_buff *skb, struct net_device *ldev)
/* We must take a copy of the skb before we modify/replace the ipv6
* header as the header could be used elsewhere
*/
- skb = skb_unshare(skb, GFP_ATOMIC);
- if (!skb)
- return NET_XMIT_DROP;
+ if (unlikely(skb_headroom(skb) < ldev->needed_headroom ||
+ skb_tailroom(skb) < ldev->needed_tailroom)) {
+ struct sk_buff *nskb;
+
+ nskb = skb_copy_expand(skb, ldev->needed_headroom,
+ ldev->needed_tailroom, GFP_ATOMIC);
+ if (likely(nskb)) {
+ consume_skb(skb);
+ skb = nskb;
+ } else {
+ kfree_skb(skb);
+ return NET_XMIT_DROP;
+ }
+ } else {
+ skb = skb_unshare(skb, GFP_ATOMIC);
+ if (!skb)
+ return NET_XMIT_DROP;
+ }
ret = lowpan_header(skb, ldev, &dgram_size, &dgram_offset);
if (ret < 0) {
diff --git a/net/ieee802154/core.c b/net/ieee802154/core.c
index cb7176cd4cd6..fe225d9a1877 100644
--- a/net/ieee802154/core.c
+++ b/net/ieee802154/core.c
@@ -400,4 +400,3 @@ module_exit(wpan_phy_class_exit);
MODULE_LICENSE("GPL v2");
MODULE_DESCRIPTION("IEEE 802.15.4 configuration interface");
MODULE_AUTHOR("Dmitry Eremin-Solenikov");
-
diff --git a/net/ieee802154/nl_policy.c b/net/ieee802154/nl_policy.c
index 35c432668454..78f6f1233194 100644
--- a/net/ieee802154/nl_policy.c
+++ b/net/ieee802154/nl_policy.c
@@ -75,4 +75,3 @@ const struct nla_policy ieee802154_policy[IEEE802154_ATTR_MAX + 1] = {
[IEEE802154_ATTR_LLSEC_DEV_OVERRIDE] = { .type = NLA_U8, },
[IEEE802154_ATTR_LLSEC_DEV_KEY_MODE] = { .type = NLA_U8, },
};
-
diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c
index a60658c85a9a..bc6b912603f1 100644
--- a/net/ieee802154/socket.c
+++ b/net/ieee802154/socket.c
@@ -25,6 +25,7 @@
#include <linux/termios.h> /* For TIOCOUTQ/INQ */
#include <linux/list.h>
#include <linux/slab.h>
+#include <linux/socket.h>
#include <net/datalink.h>
#include <net/psnap.h>
#include <net/sock.h>
@@ -452,6 +453,7 @@ struct dgram_sock {
unsigned int bound:1;
unsigned int connected:1;
unsigned int want_ack:1;
+ unsigned int want_lqi:1;
unsigned int secen:1;
unsigned int secen_override:1;
unsigned int seclevel:3;
@@ -486,6 +488,7 @@ static int dgram_init(struct sock *sk)
struct dgram_sock *ro = dgram_sk(sk);
ro->want_ack = 1;
+ ro->want_lqi = 0;
return 0;
}
@@ -713,6 +716,7 @@ static int dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
size_t copied = 0;
int err = -EOPNOTSUPP;
struct sk_buff *skb;
+ struct dgram_sock *ro = dgram_sk(sk);
DECLARE_SOCKADDR(struct sockaddr_ieee802154 *, saddr, msg->msg_name);
skb = skb_recv_datagram(sk, flags, noblock, &err);
@@ -744,6 +748,13 @@ static int dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
*addr_len = sizeof(*saddr);
}
+ if (ro->want_lqi) {
+ err = put_cmsg(msg, SOL_IEEE802154, WPAN_WANTLQI,
+ sizeof(uint8_t), &(mac_cb(skb)->lqi));
+ if (err)
+ goto done;
+ }
+
if (flags & MSG_TRUNC)
copied = skb->len;
done:
@@ -847,6 +858,9 @@ static int dgram_getsockopt(struct sock *sk, int level, int optname,
case WPAN_WANTACK:
val = ro->want_ack;
break;
+ case WPAN_WANTLQI:
+ val = ro->want_lqi;
+ break;
case WPAN_SECURITY:
if (!ro->secen_override)
val = WPAN_SECURITY_DEFAULT;
@@ -892,6 +906,9 @@ static int dgram_setsockopt(struct sock *sk, int level, int optname,
case WPAN_WANTACK:
ro->want_ack = !!val;
break;
+ case WPAN_WANTLQI:
+ ro->want_lqi = !!val;
+ break;
case WPAN_SECURITY:
if (!ns_capable(net->user_ns, CAP_NET_ADMIN) &&
!ns_capable(net->user_ns, CAP_NET_RAW)) {
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 80dad301361d..32cae39cdff6 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -430,7 +430,7 @@ config INET_DIAG
Support for INET (TCP, DCCP, etc) socket monitoring interface used by
native Linux tools such as ss. ss is included in iproute2, currently
downloadable at:
-
+
http://www.linuxfoundation.org/collaborate/workgroups/networking/iproute2
If unsure, say Y.
@@ -600,7 +600,7 @@ config TCP_CONG_VENO
distinguishing to circumvent the difficult judgment of the packet loss
type. TCP Veno cuts down less congestion window in response to random
loss packets.
- See <http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=1177186>
+ See <http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=1177186>
config TCP_CONG_YEAH
tristate "YeAH TCP"
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index eec9569ffa5c..58629314eae9 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -43,7 +43,7 @@ obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o
obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o
obj-$(CONFIG_IP_PNP) += ipconfig.o
obj-$(CONFIG_NETFILTER) += netfilter.o netfilter/
-obj-$(CONFIG_INET_DIAG) += inet_diag.o
+obj-$(CONFIG_INET_DIAG) += inet_diag.o
obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o
obj-$(CONFIG_INET_RAW_DIAG) += raw_diag.o
@@ -63,6 +63,7 @@ obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
+obj-$(CONFIG_NET_SOCK_MSG) += tcp_bpf.o
obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index b403499fdabe..1fbe2f815474 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -229,6 +229,7 @@ int inet_listen(struct socket *sock, int backlog)
err = inet_csk_listen_start(sk, backlog);
if (err)
goto out;
+ tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_LISTEN_CB, 0, NULL);
}
sk->sk_max_ack_backlog = backlog;
err = 0;
@@ -485,8 +486,7 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
* is temporarily down)
*/
err = -EADDRNOTAVAIL;
- if (!net->ipv4.sysctl_ip_nonlocal_bind &&
- !(inet->freebind || inet->transparent) &&
+ if (!inet_can_nonlocal_bind(net, inet) &&
addr->sin_addr.s_addr != htonl(INADDR_ANY) &&
chk_addr_ret != RTN_LOCAL &&
chk_addr_ret != RTN_MULTICAST &&
@@ -1377,6 +1377,7 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb,
if (encap)
skb_reset_inner_headers(skb);
skb->network_header = (u8 *)iph - skb->head;
+ skb_reset_mac_len(skb);
} while ((skb = skb->next));
out:
@@ -1384,12 +1385,12 @@ out:
}
EXPORT_SYMBOL(inet_gso_segment);
-struct sk_buff **inet_gro_receive(struct sk_buff **head, struct sk_buff *skb)
+struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb)
{
const struct net_offload *ops;
- struct sk_buff **pp = NULL;
- struct sk_buff *p;
+ struct sk_buff *pp = NULL;
const struct iphdr *iph;
+ struct sk_buff *p;
unsigned int hlen;
unsigned int off;
unsigned int id;
@@ -1425,7 +1426,7 @@ struct sk_buff **inet_gro_receive(struct sk_buff **head, struct sk_buff *skb)
flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF));
id >>= 16;
- for (p = *head; p; p = p->next) {
+ list_for_each_entry(p, head, list) {
struct iphdr *iph2;
u16 flush_id;
@@ -1505,8 +1506,8 @@ out:
}
EXPORT_SYMBOL(inet_gro_receive);
-static struct sk_buff **ipip_gro_receive(struct sk_buff **head,
- struct sk_buff *skb)
+static struct sk_buff *ipip_gro_receive(struct list_head *head,
+ struct sk_buff *skb)
{
if (NAPI_GRO_CB(skb)->encap_mark) {
NAPI_GRO_CB(skb)->flush = 1;
@@ -1801,6 +1802,7 @@ static __net_init int inet_init_net(struct net *net)
* We set them here, in case sysctl is not compiled.
*/
net->ipv4.sysctl_ip_default_ttl = IPDEFTTL;
+ net->ipv4.sysctl_ip_fwd_update_priority = 1;
net->ipv4.sysctl_ip_dynaddr = 0;
net->ipv4.sysctl_ip_early_demux = 1;
net->ipv4.sysctl_udp_early_demux = 1;
@@ -1882,6 +1884,7 @@ fs_initcall(ipv4_offload_init);
static struct packet_type ip_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_IP),
.func = ip_rcv,
+ .list_func = ip_list_rcv,
};
static int __init inet_init(void)
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 4dd95cdd8070..c01fa791260d 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -461,9 +461,9 @@ static int ah4_err(struct sk_buff *skb, u32 info)
return 0;
if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
- ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_AH, 0);
+ ipv4_update_pmtu(skb, net, info, 0, IPPROTO_AH);
else
- ipv4_redirect(skb, net, 0, 0, IPPROTO_AH, 0);
+ ipv4_redirect(skb, net, 0, IPPROTO_AH);
xfrm_state_put(x);
return 0;
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index e90c89ef8c08..850a6f13a082 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -1255,6 +1255,8 @@ static int arp_netdev_event(struct notifier_block *this, unsigned long event,
change_info = ptr;
if (change_info->flags_changed & IFF_NOARP)
neigh_changeaddr(&arp_tbl, dev);
+ if (!netif_carrier_ok(dev))
+ neigh_carrier_down(&arp_tbl, dev);
break;
default:
break;
diff --git a/net/ipv4/bpfilter/Makefile b/net/ipv4/bpfilter/Makefile
index ce262d76cc48..e9e42f99725e 100644
--- a/net/ipv4/bpfilter/Makefile
+++ b/net/ipv4/bpfilter/Makefile
@@ -1,2 +1 @@
obj-$(CONFIG_BPFILTER) += sockopt.o
-
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 82178cc69c96..777fa3b7fb13 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -1512,7 +1512,7 @@ static int cipso_v4_parsetag_loc(const struct cipso_v4_doi *doi_def,
*
* Description:
* Parse the packet's IP header looking for a CIPSO option. Returns a pointer
- * to the start of the CIPSO option on success, NULL if one if not found.
+ * to the start of the CIPSO option on success, NULL if one is not found.
*
*/
unsigned char *cipso_v4_optptr(const struct sk_buff *skb)
@@ -1522,10 +1522,8 @@ unsigned char *cipso_v4_optptr(const struct sk_buff *skb)
int optlen;
int taglen;
- for (optlen = iph->ihl*4 - sizeof(struct iphdr); optlen > 0; ) {
+ for (optlen = iph->ihl*4 - sizeof(struct iphdr); optlen > 1; ) {
switch (optptr[0]) {
- case IPOPT_CIPSO:
- return optptr;
case IPOPT_END:
return NULL;
case IPOPT_NOOP:
@@ -1534,6 +1532,11 @@ unsigned char *cipso_v4_optptr(const struct sk_buff *skb)
default:
taglen = optptr[1];
}
+ if (!taglen || taglen > optlen)
+ return NULL;
+ if (optptr[0] == IPOPT_CIPSO)
+ return optptr;
+
optlen -= taglen;
optptr += taglen;
}
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index f915abff1350..300921417f89 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -42,7 +42,7 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len
oif = sk->sk_bound_dev_if;
saddr = inet->inet_saddr;
if (ipv4_is_multicast(usin->sin_addr.s_addr)) {
- if (!oif)
+ if (!oif || netif_index_is_l3_master(sock_net(sk), oif))
oif = inet->mc_index;
if (!saddr)
saddr = inet->mc_addr;
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index d7585ab1a77a..63d5b58fbfdb 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -100,6 +100,16 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = {
[IFA_CACHEINFO] = { .len = sizeof(struct ifa_cacheinfo) },
[IFA_FLAGS] = { .type = NLA_U32 },
[IFA_RT_PRIORITY] = { .type = NLA_U32 },
+ [IFA_TARGET_NETNSID] = { .type = NLA_S32 },
+};
+
+struct inet_fill_args {
+ u32 portid;
+ u32 seq;
+ int event;
+ unsigned int flags;
+ int netnsid;
+ int ifindex;
};
#define IN4_ADDR_HSIZE_SHIFT 8
@@ -773,7 +783,8 @@ static void set_ifa_lifetime(struct in_ifaddr *ifa, __u32 valid_lft,
}
static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh,
- __u32 *pvalid_lft, __u32 *pprefered_lft)
+ __u32 *pvalid_lft, __u32 *pprefered_lft,
+ struct netlink_ext_ack *extack)
{
struct nlattr *tb[IFA_MAX+1];
struct in_ifaddr *ifa;
@@ -783,7 +794,7 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh,
int err;
err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy,
- NULL);
+ extack);
if (err < 0)
goto errout;
@@ -888,7 +899,7 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
ASSERT_RTNL();
- ifa = rtm_to_ifaddr(net, nlh, &valid_lft, &prefered_lft);
+ ifa = rtm_to_ifaddr(net, nlh, &valid_lft, &prefered_lft, extack);
if (IS_ERR(ifa))
return PTR_ERR(ifa);
@@ -1584,13 +1595,14 @@ static int put_cacheinfo(struct sk_buff *skb, unsigned long cstamp,
}
static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
- u32 portid, u32 seq, int event, unsigned int flags)
+ struct inet_fill_args *args)
{
struct ifaddrmsg *ifm;
struct nlmsghdr *nlh;
u32 preferred, valid;
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(*ifm), flags);
+ nlh = nlmsg_put(skb, args->portid, args->seq, args->event, sizeof(*ifm),
+ args->flags);
if (!nlh)
return -EMSGSIZE;
@@ -1601,6 +1613,10 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
ifm->ifa_scope = ifa->ifa_scope;
ifm->ifa_index = ifa->ifa_dev->dev->ifindex;
+ if (args->netnsid >= 0 &&
+ nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid))
+ goto nla_put_failure;
+
if (!(ifm->ifa_flags & IFA_F_PERMANENT)) {
preferred = ifa->ifa_preferred_lft;
valid = ifa->ifa_valid_lft;
@@ -1645,27 +1661,138 @@ nla_put_failure:
return -EMSGSIZE;
}
+static int inet_valid_dump_ifaddr_req(const struct nlmsghdr *nlh,
+ struct inet_fill_args *fillargs,
+ struct net **tgt_net, struct sock *sk,
+ struct netlink_callback *cb)
+{
+ struct netlink_ext_ack *extack = cb->extack;
+ struct nlattr *tb[IFA_MAX+1];
+ struct ifaddrmsg *ifm;
+ int err, i;
+
+ if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) {
+ NL_SET_ERR_MSG(extack, "ipv4: Invalid header for address dump request");
+ return -EINVAL;
+ }
+
+ ifm = nlmsg_data(nlh);
+ if (ifm->ifa_prefixlen || ifm->ifa_flags || ifm->ifa_scope) {
+ NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for address dump request");
+ return -EINVAL;
+ }
+
+ fillargs->ifindex = ifm->ifa_index;
+ if (fillargs->ifindex) {
+ cb->answer_flags |= NLM_F_DUMP_FILTERED;
+ fillargs->flags |= NLM_F_DUMP_FILTERED;
+ }
+
+ err = nlmsg_parse_strict(nlh, sizeof(*ifm), tb, IFA_MAX,
+ ifa_ipv4_policy, extack);
+ if (err < 0)
+ return err;
+
+ for (i = 0; i <= IFA_MAX; ++i) {
+ if (!tb[i])
+ continue;
+
+ if (i == IFA_TARGET_NETNSID) {
+ struct net *net;
+
+ fillargs->netnsid = nla_get_s32(tb[i]);
+
+ net = rtnl_get_net_ns_capable(sk, fillargs->netnsid);
+ if (IS_ERR(net)) {
+ NL_SET_ERR_MSG(extack, "ipv4: Invalid target network namespace id");
+ return PTR_ERR(net);
+ }
+ *tgt_net = net;
+ } else {
+ NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in dump request");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int in_dev_dump_addr(struct in_device *in_dev, struct sk_buff *skb,
+ struct netlink_callback *cb, int s_ip_idx,
+ struct inet_fill_args *fillargs)
+{
+ struct in_ifaddr *ifa;
+ int ip_idx = 0;
+ int err;
+
+ for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next, ip_idx++) {
+ if (ip_idx < s_ip_idx)
+ continue;
+
+ err = inet_fill_ifaddr(skb, ifa, fillargs);
+ if (err < 0)
+ goto done;
+
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+ }
+ err = 0;
+
+done:
+ cb->args[2] = ip_idx;
+
+ return err;
+}
+
static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
{
+ const struct nlmsghdr *nlh = cb->nlh;
+ struct inet_fill_args fillargs = {
+ .portid = NETLINK_CB(cb->skb).portid,
+ .seq = nlh->nlmsg_seq,
+ .event = RTM_NEWADDR,
+ .flags = NLM_F_MULTI,
+ .netnsid = -1,
+ };
struct net *net = sock_net(skb->sk);
+ struct net *tgt_net = net;
int h, s_h;
int idx, s_idx;
- int ip_idx, s_ip_idx;
+ int s_ip_idx;
struct net_device *dev;
struct in_device *in_dev;
- struct in_ifaddr *ifa;
struct hlist_head *head;
+ int err;
s_h = cb->args[0];
s_idx = idx = cb->args[1];
- s_ip_idx = ip_idx = cb->args[2];
+ s_ip_idx = cb->args[2];
+
+ if (cb->strict_check) {
+ err = inet_valid_dump_ifaddr_req(nlh, &fillargs, &tgt_net,
+ skb->sk, cb);
+ if (err < 0)
+ return err;
+
+ if (fillargs.ifindex) {
+ dev = __dev_get_by_index(tgt_net, fillargs.ifindex);
+ if (!dev)
+ return -ENODEV;
+
+ in_dev = __in_dev_get_rtnl(dev);
+ if (in_dev) {
+ err = in_dev_dump_addr(in_dev, skb, cb, s_ip_idx,
+ &fillargs);
+ }
+ goto put_tgt_net;
+ }
+ }
for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
idx = 0;
- head = &net->dev_index_head[h];
+ head = &tgt_net->dev_index_head[h];
rcu_read_lock();
- cb->seq = atomic_read(&net->ipv4.dev_addr_genid) ^
- net->dev_base_seq;
+ cb->seq = atomic_read(&tgt_net->ipv4.dev_addr_genid) ^
+ tgt_net->dev_base_seq;
hlist_for_each_entry_rcu(dev, head, index_hlist) {
if (idx < s_idx)
goto cont;
@@ -1675,18 +1802,11 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
if (!in_dev)
goto cont;
- for (ifa = in_dev->ifa_list, ip_idx = 0; ifa;
- ifa = ifa->ifa_next, ip_idx++) {
- if (ip_idx < s_ip_idx)
- continue;
- if (inet_fill_ifaddr(skb, ifa,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- RTM_NEWADDR, NLM_F_MULTI) < 0) {
- rcu_read_unlock();
- goto done;
- }
- nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+ err = in_dev_dump_addr(in_dev, skb, cb, s_ip_idx,
+ &fillargs);
+ if (err < 0) {
+ rcu_read_unlock();
+ goto done;
}
cont:
idx++;
@@ -1697,7 +1817,9 @@ cont:
done:
cb->args[0] = h;
cb->args[1] = idx;
- cb->args[2] = ip_idx;
+put_tgt_net:
+ if (fillargs.netnsid >= 0)
+ put_net(tgt_net);
return skb->len;
}
@@ -1705,8 +1827,14 @@ done:
static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh,
u32 portid)
{
+ struct inet_fill_args fillargs = {
+ .portid = portid,
+ .seq = nlh ? nlh->nlmsg_seq : 0,
+ .event = event,
+ .flags = 0,
+ .netnsid = -1,
+ };
struct sk_buff *skb;
- u32 seq = nlh ? nlh->nlmsg_seq : 0;
int err = -ENOBUFS;
struct net *net;
@@ -1715,7 +1843,7 @@ static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh,
if (!skb)
goto errout;
- err = inet_fill_ifaddr(skb, ifa, portid, seq, event, 0);
+ err = inet_fill_ifaddr(skb, ifa, &fillargs);
if (err < 0) {
/* -EMSGSIZE implies BUG in inet_nlmsg_size() */
WARN_ON(err == -EMSGSIZE);
@@ -1827,6 +1955,8 @@ static int inet_netconf_msgsize_devconf(int type)
size += nla_total_size(4);
if (all || type == NETCONFA_MC_FORWARDING)
size += nla_total_size(4);
+ if (all || type == NETCONFA_BC_FORWARDING)
+ size += nla_total_size(4);
if (all || type == NETCONFA_PROXY_NEIGH)
size += nla_total_size(4);
if (all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN)
@@ -1873,6 +2003,10 @@ static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
nla_put_s32(skb, NETCONFA_MC_FORWARDING,
IPV4_DEVCONF(*devconf, MC_FORWARDING)) < 0)
goto nla_put_failure;
+ if ((all || type == NETCONFA_BC_FORWARDING) &&
+ nla_put_s32(skb, NETCONFA_BC_FORWARDING,
+ IPV4_DEVCONF(*devconf, BC_FORWARDING)) < 0)
+ goto nla_put_failure;
if ((all || type == NETCONFA_PROXY_NEIGH) &&
nla_put_s32(skb, NETCONFA_PROXY_NEIGH,
IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0)
@@ -1989,6 +2123,7 @@ errout:
static int inet_netconf_dump_devconf(struct sk_buff *skb,
struct netlink_callback *cb)
{
+ const struct nlmsghdr *nlh = cb->nlh;
struct net *net = sock_net(skb->sk);
int h, s_h;
int idx, s_idx;
@@ -1996,6 +2131,21 @@ static int inet_netconf_dump_devconf(struct sk_buff *skb,
struct in_device *in_dev;
struct hlist_head *head;
+ if (cb->strict_check) {
+ struct netlink_ext_ack *extack = cb->extack;
+ struct netconfmsg *ncm;
+
+ if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ncm))) {
+ NL_SET_ERR_MSG(extack, "ipv4: Invalid header for netconf dump request");
+ return -EINVAL;
+ }
+
+ if (nlmsg_attrlen(nlh, sizeof(*ncm))) {
+ NL_SET_ERR_MSG(extack, "ipv4: Invalid data after header in netconf dump request");
+ return -EINVAL;
+ }
+ }
+
s_h = cb->args[0];
s_idx = idx = cb->args[1];
@@ -2015,7 +2165,7 @@ static int inet_netconf_dump_devconf(struct sk_buff *skb,
if (inet_netconf_fill_devconf(skb, dev->ifindex,
&in_dev->cnf,
NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
+ nlh->nlmsg_seq,
RTM_NEWNETCONF,
NLM_F_MULTI,
NETCONFA_ALL) < 0) {
@@ -2032,7 +2182,7 @@ cont:
if (inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_ALL,
net->ipv4.devconf_all,
NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
+ nlh->nlmsg_seq,
RTM_NEWNETCONF, NLM_F_MULTI,
NETCONFA_ALL) < 0)
goto done;
@@ -2043,7 +2193,7 @@ cont:
if (inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_DEFAULT,
net->ipv4.devconf_dflt,
NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
+ nlh->nlmsg_seq,
RTM_NEWNETCONF, NLM_F_MULTI,
NETCONFA_ALL) < 0)
goto done;
@@ -2143,6 +2293,10 @@ static int devinet_conf_proc(struct ctl_table *ctl, int write,
if ((new_value == 0) && (old_value != 0))
rt_cache_flush(net);
+ if (i == IPV4_DEVCONF_BC_FORWARDING - 1 &&
+ new_value != old_value)
+ rt_cache_flush(net);
+
if (i == IPV4_DEVCONF_RP_FILTER - 1 &&
new_value != old_value) {
ifindex = devinet_conf_ifindex(net, cnf);
@@ -2259,6 +2413,7 @@ static struct devinet_sysctl_table {
DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding",
devinet_sysctl_forward),
DEVINET_SYSCTL_RO_ENTRY(MC_FORWARDING, "mc_forwarding"),
+ DEVINET_SYSCTL_RW_ENTRY(BC_FORWARDING, "bc_forwarding"),
DEVINET_SYSCTL_RW_ENTRY(ACCEPT_REDIRECTS, "accept_redirects"),
DEVINET_SYSCTL_RW_ENTRY(SECURE_REDIRECTS, "secure_redirects"),
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 97689012b357..9e1c840596c5 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -683,12 +683,11 @@ static void esp_input_done_esn(struct crypto_async_request *base, int err)
*/
static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
{
- struct ip_esp_hdr *esph;
struct crypto_aead *aead = x->data;
struct aead_request *req;
struct sk_buff *trailer;
int ivlen = crypto_aead_ivsize(aead);
- int elen = skb->len - sizeof(*esph) - ivlen;
+ int elen = skb->len - sizeof(struct ip_esp_hdr) - ivlen;
int nfrags;
int assoclen;
int seqhilen;
@@ -698,13 +697,13 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
struct scatterlist *sg;
int err = -EINVAL;
- if (!pskb_may_pull(skb, sizeof(*esph) + ivlen))
+ if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr) + ivlen))
goto out;
if (elen <= 0)
goto out;
- assoclen = sizeof(*esph);
+ assoclen = sizeof(struct ip_esp_hdr);
seqhilen = 0;
if (x->props.flags & XFRM_STATE_ESN) {
@@ -820,9 +819,9 @@ static int esp4_err(struct sk_buff *skb, u32 info)
return 0;
if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
- ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ESP, 0);
+ ipv4_update_pmtu(skb, net, info, 0, IPPROTO_ESP);
else
- ipv4_redirect(skb, net, 0, 0, IPPROTO_ESP, 0);
+ ipv4_redirect(skb, net, 0, IPPROTO_ESP);
xfrm_state_put(x);
return 0;
diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c
index 7cf755ef9efb..58834a10c0be 100644
--- a/net/ipv4/esp4_offload.c
+++ b/net/ipv4/esp4_offload.c
@@ -28,8 +28,8 @@
#include <linux/spinlock.h>
#include <net/udp.h>
-static struct sk_buff **esp4_gro_receive(struct sk_buff **head,
- struct sk_buff *skb)
+static struct sk_buff *esp4_gro_receive(struct list_head *head,
+ struct sk_buff *skb)
{
int offset = skb_gro_offset(skb);
struct xfrm_offload *xo;
@@ -135,8 +135,7 @@ static struct sk_buff *esp4_gso_segment(struct sk_buff *skb,
skb->encap_hdr_csum = 1;
- if (!(features & NETIF_F_HW_ESP) || !x->xso.offload_handle ||
- (x->xso.dev != skb->dev))
+ if (!(features & NETIF_F_HW_ESP) || x->xso.dev != skb->dev)
esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK);
else if (!(features & NETIF_F_HW_ESP_TX_CSUM))
esp_features = features & ~NETIF_F_CSUM_MASK;
@@ -179,8 +178,7 @@ static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_
if (!xo)
return -EINVAL;
- if (!(features & NETIF_F_HW_ESP) || !x->xso.offload_handle ||
- (x->xso.dev != skb->dev)) {
+ if (!(features & NETIF_F_HW_ESP) || x->xso.dev != skb->dev) {
xo->flags |= CRYPTO_FALLBACK;
hw_offload = false;
}
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 2998b0e47d4b..5bf653f36911 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -315,6 +315,32 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
return inet_select_addr(dev, ip_hdr(skb)->saddr, scope);
}
+bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev)
+{
+ bool dev_match = false;
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ int ret;
+
+ for (ret = 0; ret < fi->fib_nhs; ret++) {
+ struct fib_nh *nh = &fi->fib_nh[ret];
+
+ if (nh->nh_dev == dev) {
+ dev_match = true;
+ break;
+ } else if (l3mdev_master_ifindex_rcu(nh->nh_dev) == dev->ifindex) {
+ dev_match = true;
+ break;
+ }
+ }
+#else
+ if (fi->fib_nh[0].nh_dev == dev)
+ dev_match = true;
+#endif
+
+ return dev_match;
+}
+EXPORT_SYMBOL_GPL(fib_info_nh_uses_dev);
+
/* Given (packet source, input interface) and optional (dst, oif, tos):
* - (main) check, that source is valid i.e. not broadcast or our local
* address.
@@ -361,24 +387,8 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
(res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev)))
goto e_inval;
fib_combine_itag(itag, &res);
- dev_match = false;
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
- for (ret = 0; ret < res.fi->fib_nhs; ret++) {
- struct fib_nh *nh = &res.fi->fib_nh[ret];
-
- if (nh->nh_dev == dev) {
- dev_match = true;
- break;
- } else if (l3mdev_master_ifindex_rcu(nh->nh_dev) == dev->ifindex) {
- dev_match = true;
- break;
- }
- }
-#else
- if (FIB_RES_DEV(res) == dev)
- dev_match = true;
-#endif
+ dev_match = fib_info_nh_uses_dev(res.fi, dev);
if (dev_match) {
ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
return ret;
@@ -792,19 +802,111 @@ errout:
return err;
}
+int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
+ struct fib_dump_filter *filter,
+ struct netlink_callback *cb)
+{
+ struct netlink_ext_ack *extack = cb->extack;
+ struct nlattr *tb[RTA_MAX + 1];
+ struct rtmsg *rtm;
+ int err, i;
+
+ ASSERT_RTNL();
+
+ if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
+ NL_SET_ERR_MSG(extack, "Invalid header for FIB dump request");
+ return -EINVAL;
+ }
+
+ rtm = nlmsg_data(nlh);
+ if (rtm->rtm_dst_len || rtm->rtm_src_len || rtm->rtm_tos ||
+ rtm->rtm_scope) {
+ NL_SET_ERR_MSG(extack, "Invalid values in header for FIB dump request");
+ return -EINVAL;
+ }
+ if (rtm->rtm_flags & ~(RTM_F_CLONED | RTM_F_PREFIX)) {
+ NL_SET_ERR_MSG(extack, "Invalid flags for FIB dump request");
+ return -EINVAL;
+ }
+
+ filter->flags = rtm->rtm_flags;
+ filter->protocol = rtm->rtm_protocol;
+ filter->rt_type = rtm->rtm_type;
+ filter->table_id = rtm->rtm_table;
+
+ err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
+ rtm_ipv4_policy, extack);
+ if (err < 0)
+ return err;
+
+ for (i = 0; i <= RTA_MAX; ++i) {
+ int ifindex;
+
+ if (!tb[i])
+ continue;
+
+ switch (i) {
+ case RTA_TABLE:
+ filter->table_id = nla_get_u32(tb[i]);
+ break;
+ case RTA_OIF:
+ ifindex = nla_get_u32(tb[i]);
+ filter->dev = __dev_get_by_index(net, ifindex);
+ if (!filter->dev)
+ return -ENODEV;
+ break;
+ default:
+ NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request");
+ return -EINVAL;
+ }
+ }
+
+ if (filter->flags || filter->protocol || filter->rt_type ||
+ filter->table_id || filter->dev) {
+ filter->filter_set = 1;
+ cb->answer_flags = NLM_F_DUMP_FILTERED;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ip_valid_fib_dump_req);
+
static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
{
+ const struct nlmsghdr *nlh = cb->nlh;
struct net *net = sock_net(skb->sk);
+ struct fib_dump_filter filter = {};
unsigned int h, s_h;
unsigned int e = 0, s_e;
struct fib_table *tb;
struct hlist_head *head;
int dumped = 0, err;
- if (nlmsg_len(cb->nlh) >= sizeof(struct rtmsg) &&
- ((struct rtmsg *) nlmsg_data(cb->nlh))->rtm_flags & RTM_F_CLONED)
+ if (cb->strict_check) {
+ err = ip_valid_fib_dump_req(net, nlh, &filter, cb);
+ if (err < 0)
+ return err;
+ } else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) {
+ struct rtmsg *rtm = nlmsg_data(nlh);
+
+ filter.flags = rtm->rtm_flags & (RTM_F_PREFIX | RTM_F_CLONED);
+ }
+
+ /* fib entries are never clones and ipv4 does not use prefix flag */
+ if (filter.flags & (RTM_F_PREFIX | RTM_F_CLONED))
return skb->len;
+ if (filter.table_id) {
+ tb = fib_get_table(net, filter.table_id);
+ if (!tb) {
+ NL_SET_ERR_MSG(cb->extack, "ipv4: FIB table does not exist");
+ return -ENOENT;
+ }
+
+ err = fib_table_dump(tb, skb, cb, &filter);
+ return skb->len ? : err;
+ }
+
s_h = cb->args[0];
s_e = cb->args[1];
@@ -819,7 +921,7 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
if (dumped)
memset(&cb->args[2], 0, sizeof(cb->args) -
2 * sizeof(cb->args[0]));
- err = fib_table_dump(tb, skb, cb);
+ err = fib_table_dump(tb, skb, cb, &filter);
if (err < 0) {
if (likely(skb->len))
goto out;
@@ -1243,7 +1345,8 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
- struct netdev_notifier_changeupper_info *info;
+ struct netdev_notifier_changeupper_info *upper_info = ptr;
+ struct netdev_notifier_info_ext *info_ext = ptr;
struct in_device *in_dev;
struct net *net = dev_net(dev);
unsigned int flags;
@@ -1278,16 +1381,19 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
fib_sync_up(dev, RTNH_F_LINKDOWN);
else
fib_sync_down_dev(dev, event, false);
- /* fall through */
+ rt_cache_flush(net);
+ break;
case NETDEV_CHANGEMTU:
+ fib_sync_mtu(dev, info_ext->ext.mtu);
rt_cache_flush(net);
break;
case NETDEV_CHANGEUPPER:
- info = ptr;
+ upper_info = ptr;
/* flush all routes if dev is linked to or unlinked from
* an L3 master device (e.g., VRF)
*/
- if (info->upper_dev && netif_is_l3_master(info->upper_dev))
+ if (upper_info->upper_dev &&
+ netif_is_l3_master(upper_info->upper_dev))
fib_disable_ip(dev, NETDEV_DOWN, true);
break;
}
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index f3c89ccf14c5..b5c3937ca6ec 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -208,7 +208,6 @@ static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp)
static void free_fib_info_rcu(struct rcu_head *head)
{
struct fib_info *fi = container_of(head, struct fib_info, rcu);
- struct dst_metrics *m;
change_nexthops(fi) {
if (nexthop_nh->nh_dev)
@@ -219,9 +218,8 @@ static void free_fib_info_rcu(struct rcu_head *head)
rt_fibinfo_free(&nexthop_nh->nh_rth_input);
} endfor_nexthops(fi);
- m = fi->fib_metrics;
- if (m != &dst_default_metrics && refcount_dec_and_test(&m->refcnt))
- kfree(m);
+ ip_fib_metrics_put(fi->fib_metrics);
+
kfree(fi);
}
@@ -797,8 +795,10 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_nh *nh,
return -EINVAL;
}
dev = __dev_get_by_index(net, nh->nh_oif);
- if (!dev)
+ if (!dev) {
+ NL_SET_ERR_MSG(extack, "Nexthop device required for onlink");
return -ENODEV;
+ }
if (!(dev->flags & IFF_UP)) {
NL_SET_ERR_MSG(extack,
"Nexthop device is not up");
@@ -1018,13 +1018,6 @@ static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc)
return true;
}
-static int
-fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg)
-{
- return ip_metrics_convert(fi->fib_net, cfg->fc_mx, cfg->fc_mx_len,
- fi->fib_metrics->metrics);
-}
-
struct fib_info *fib_create_info(struct fib_config *cfg,
struct netlink_ext_ack *extack)
{
@@ -1082,16 +1075,14 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
if (!fi)
goto failure;
- if (cfg->fc_mx) {
- fi->fib_metrics = kzalloc(sizeof(*fi->fib_metrics), GFP_KERNEL);
- if (unlikely(!fi->fib_metrics)) {
- kfree(fi);
- return ERR_PTR(err);
- }
- refcount_set(&fi->fib_metrics->refcnt, 1);
- } else {
- fi->fib_metrics = (struct dst_metrics *)&dst_default_metrics;
+ fi->fib_metrics = ip_fib_metrics_init(fi->fib_net, cfg->fc_mx,
+ cfg->fc_mx_len);
+ if (unlikely(IS_ERR(fi->fib_metrics))) {
+ err = PTR_ERR(fi->fib_metrics);
+ kfree(fi);
+ return ERR_PTR(err);
}
+
fib_info_cnt++;
fi->fib_net = net;
fi->fib_protocol = cfg->fc_protocol;
@@ -1110,10 +1101,6 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
goto failure;
} endfor_nexthops(fi)
- err = fib_convert_metrics(fi, cfg);
- if (err)
- goto failure;
-
if (cfg->fc_mp) {
#ifdef CONFIG_IP_ROUTE_MULTIPATH
err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg, extack);
@@ -1470,6 +1457,56 @@ static int call_fib_nh_notifiers(struct fib_nh *fib_nh,
return NOTIFY_DONE;
}
+/* Update the PMTU of exceptions when:
+ * - the new MTU of the first hop becomes smaller than the PMTU
+ * - the old MTU was the same as the PMTU, and it limited discovery of
+ * larger MTUs on the path. With that limit raised, we can now
+ * discover larger MTUs
+ * A special case is locked exceptions, for which the PMTU is smaller
+ * than the minimal accepted PMTU:
+ * - if the new MTU is greater than the PMTU, don't make any change
+ * - otherwise, unlock and set PMTU
+ */
+static void nh_update_mtu(struct fib_nh *nh, u32 new, u32 orig)
+{
+ struct fnhe_hash_bucket *bucket;
+ int i;
+
+ bucket = rcu_dereference_protected(nh->nh_exceptions, 1);
+ if (!bucket)
+ return;
+
+ for (i = 0; i < FNHE_HASH_SIZE; i++) {
+ struct fib_nh_exception *fnhe;
+
+ for (fnhe = rcu_dereference_protected(bucket[i].chain, 1);
+ fnhe;
+ fnhe = rcu_dereference_protected(fnhe->fnhe_next, 1)) {
+ if (fnhe->fnhe_mtu_locked) {
+ if (new <= fnhe->fnhe_pmtu) {
+ fnhe->fnhe_pmtu = new;
+ fnhe->fnhe_mtu_locked = false;
+ }
+ } else if (new < fnhe->fnhe_pmtu ||
+ orig == fnhe->fnhe_pmtu) {
+ fnhe->fnhe_pmtu = new;
+ }
+ }
+ }
+}
+
+void fib_sync_mtu(struct net_device *dev, u32 orig_mtu)
+{
+ unsigned int hash = fib_devindex_hashfn(dev->ifindex);
+ struct hlist_head *head = &fib_info_devhash[hash];
+ struct fib_nh *nh;
+
+ hlist_for_each_entry(nh, head, nh_hash) {
+ if (nh->nh_dev == dev)
+ nh_update_mtu(nh, dev->mtu, orig_mtu);
+ }
+}
+
/* Event force Flags Description
* NETDEV_CHANGE 0 LINKDOWN Carrier OFF, not for scope host
* NETDEV_DOWN 0 LINKDOWN|DEAD Link down, not for scope host
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 5bc0c89e81e4..237c9f72b265 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -2003,12 +2003,17 @@ void fib_free_table(struct fib_table *tb)
}
static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
- struct sk_buff *skb, struct netlink_callback *cb)
+ struct sk_buff *skb, struct netlink_callback *cb,
+ struct fib_dump_filter *filter)
{
+ unsigned int flags = NLM_F_MULTI;
__be32 xkey = htonl(l->key);
struct fib_alias *fa;
int i, s_i;
+ if (filter->filter_set)
+ flags |= NLM_F_DUMP_FILTERED;
+
s_i = cb->args[4];
i = 0;
@@ -2016,25 +2021,35 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
int err;
- if (i < s_i) {
- i++;
- continue;
- }
+ if (i < s_i)
+ goto next;
- if (tb->tb_id != fa->tb_id) {
- i++;
- continue;
+ if (tb->tb_id != fa->tb_id)
+ goto next;
+
+ if (filter->filter_set) {
+ if (filter->rt_type && fa->fa_type != filter->rt_type)
+ goto next;
+
+ if ((filter->protocol &&
+ fa->fa_info->fib_protocol != filter->protocol))
+ goto next;
+
+ if (filter->dev &&
+ !fib_info_nh_uses_dev(fa->fa_info, filter->dev))
+ goto next;
}
err = fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, RTM_NEWROUTE,
tb->tb_id, fa->fa_type,
xkey, KEYLENGTH - fa->fa_slen,
- fa->fa_tos, fa->fa_info, NLM_F_MULTI);
+ fa->fa_tos, fa->fa_info, flags);
if (err < 0) {
cb->args[4] = i;
return err;
}
+next:
i++;
}
@@ -2044,7 +2059,7 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
/* rcu_read_lock needs to be hold by caller from readside */
int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
- struct netlink_callback *cb)
+ struct netlink_callback *cb, struct fib_dump_filter *filter)
{
struct trie *t = (struct trie *)tb->tb_data;
struct key_vector *l, *tp = t->kv;
@@ -2057,7 +2072,7 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
int err;
- err = fn_trie_dump_leaf(l, tb, skb, cb);
+ err = fn_trie_dump_leaf(l, tb, skb, cb, filter);
if (err < 0) {
cb->args[3] = key;
cb->args[2] = count;
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index c9ec1603666b..500a59906b87 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -224,14 +224,14 @@ drop:
return 0;
}
-static struct sk_buff **fou_gro_receive(struct sock *sk,
- struct sk_buff **head,
- struct sk_buff *skb)
+static struct sk_buff *fou_gro_receive(struct sock *sk,
+ struct list_head *head,
+ struct sk_buff *skb)
{
- const struct net_offload *ops;
- struct sk_buff **pp = NULL;
u8 proto = fou_from_sock(sk)->protocol;
const struct net_offload **offloads;
+ const struct net_offload *ops;
+ struct sk_buff *pp = NULL;
/* We can clear the encap_mark for FOU as we are essentially doing
* one of two possible things. We are either adding an L4 tunnel
@@ -305,13 +305,13 @@ static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off,
return guehdr;
}
-static struct sk_buff **gue_gro_receive(struct sock *sk,
- struct sk_buff **head,
- struct sk_buff *skb)
+static struct sk_buff *gue_gro_receive(struct sock *sk,
+ struct list_head *head,
+ struct sk_buff *skb)
{
const struct net_offload **offloads;
const struct net_offload *ops;
- struct sk_buff **pp = NULL;
+ struct sk_buff *pp = NULL;
struct sk_buff *p;
struct guehdr *guehdr;
size_t len, optlen, hdrlen, off;
@@ -397,7 +397,7 @@ static struct sk_buff **gue_gro_receive(struct sock *sk,
skb_gro_pull(skb, hdrlen);
- for (p = *head; p; p = p->next) {
+ list_for_each_entry(p, head, list) {
const struct guehdr *guehdr2;
if (!NAPI_GRO_CB(p)->same_flow)
diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c
index b798862b6be5..7efe740c06eb 100644
--- a/net/ipv4/gre_demux.c
+++ b/net/ipv4/gre_demux.c
@@ -86,13 +86,14 @@ int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
options = (__be32 *)(greh + 1);
if (greh->flags & GRE_CSUM) {
- if (skb_checksum_simple_validate(skb)) {
+ if (!skb_checksum_simple_validate(skb)) {
+ skb_checksum_try_convert(skb, IPPROTO_GRE, 0,
+ null_compute_pseudo);
+ } else if (csum_err) {
*csum_err = true;
return -EINVAL;
}
- skb_checksum_try_convert(skb, IPPROTO_GRE, 0,
- null_compute_pseudo);
options++;
}
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index 6a7d980105f6..6c63524f598a 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -108,10 +108,10 @@ out:
return segs;
}
-static struct sk_buff **gre_gro_receive(struct sk_buff **head,
- struct sk_buff *skb)
+static struct sk_buff *gre_gro_receive(struct list_head *head,
+ struct sk_buff *skb)
{
- struct sk_buff **pp = NULL;
+ struct sk_buff *pp = NULL;
struct sk_buff *p;
const struct gre_base_hdr *greh;
unsigned int hlen, grehlen;
@@ -182,7 +182,7 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head,
null_compute_pseudo);
}
- for (p = *head; p; p = p->next) {
+ list_for_each_entry(p, head, list) {
const struct gre_base_hdr *greh2;
if (!NAPI_GRO_CB(p)->same_flow)
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 1617604c9284..d832beed6e3a 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -429,14 +429,11 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
icmp_param->data.icmph.checksum = 0;
+ ipcm_init(&ipc);
inet->tos = ip_hdr(skb)->tos;
sk->sk_mark = mark;
daddr = ipc.addr = ip_hdr(skb)->saddr;
saddr = fib_compute_spec_dst(skb);
- ipc.opt = NULL;
- ipc.tx_flags = 0;
- ipc.ttl = 0;
- ipc.tos = -1;
if (icmp_param->replyopts.opt.opt.optlen) {
ipc.opt = &icmp_param->replyopts.opt;
@@ -710,11 +707,9 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
icmp_param.offset = skb_network_offset(skb_in);
inet_sk(sk)->tos = tos;
sk->sk_mark = mark;
+ ipcm_init(&ipc);
ipc.addr = iph->saddr;
ipc.opt = &icmp_param.replyopts.opt;
- ipc.tx_flags = 0;
- ipc.ttl = 0;
- ipc.tos = -1;
rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark,
type, code, &icmp_param);
@@ -1103,9 +1098,9 @@ void icmp_err(struct sk_buff *skb, u32 info)
}
if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
- ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ICMP, 0);
+ ipv4_update_pmtu(skb, net, info, 0, IPPROTO_ICMP);
else if (type == ICMP_REDIRECT)
- ipv4_redirect(skb, net, 0, 0, IPPROTO_ICMP, 0);
+ ipv4_redirect(skb, net, 0, IPPROTO_ICMP);
}
/*
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 75151be21413..4da39446da2d 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -820,10 +820,9 @@ static void igmp_timer_expire(struct timer_list *t)
spin_lock(&im->lock);
im->tm_running = 0;
- if (im->unsolicit_count) {
- im->unsolicit_count--;
+ if (im->unsolicit_count && --im->unsolicit_count)
igmp_start_timer(im, unsolicited_report_interval(in_dev));
- }
+
im->reporter = 1;
spin_unlock(&im->lock);
@@ -1288,7 +1287,7 @@ static void igmp_group_dropped(struct ip_mc_list *im)
#endif
}
-static void igmp_group_added(struct ip_mc_list *im, unsigned int mode)
+static void igmp_group_added(struct ip_mc_list *im)
{
struct in_device *in_dev = im->interface;
#ifdef CONFIG_IP_MULTICAST
@@ -1308,6 +1307,8 @@ static void igmp_group_added(struct ip_mc_list *im, unsigned int mode)
if (in_dev->dead)
return;
+
+ im->unsolicit_count = net->ipv4.sysctl_igmp_qrv;
if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) {
spin_lock_bh(&im->lock);
igmp_start_timer(im, IGMP_INITIAL_REPORT_DELAY);
@@ -1320,7 +1321,7 @@ static void igmp_group_added(struct ip_mc_list *im, unsigned int mode)
* not send filter-mode change record as the mode should be from
* IN() to IN(A).
*/
- if (mode == MCAST_EXCLUDE)
+ if (im->sfmode == MCAST_EXCLUDE)
im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
igmp_ifc_event(in_dev);
@@ -1391,9 +1392,6 @@ static void __ip_mc_inc_group(struct in_device *in_dev, __be32 addr,
unsigned int mode)
{
struct ip_mc_list *im;
-#ifdef CONFIG_IP_MULTICAST
- struct net *net = dev_net(in_dev->dev);
-#endif
ASSERT_RTNL();
@@ -1420,7 +1418,6 @@ static void __ip_mc_inc_group(struct in_device *in_dev, __be32 addr,
spin_lock_init(&im->lock);
#ifdef CONFIG_IP_MULTICAST
timer_setup(&im->timer, igmp_timer_expire, 0);
- im->unsolicit_count = net->ipv4.sysctl_igmp_qrv;
#endif
im->next_rcu = in_dev->mc_list;
@@ -1432,7 +1429,7 @@ static void __ip_mc_inc_group(struct in_device *in_dev, __be32 addr,
#ifdef CONFIG_IP_MULTICAST
igmpv3_del_delrec(in_dev, im);
#endif
- igmp_group_added(im, mode);
+ igmp_group_added(im);
if (!in_dev->dead)
ip_rt_multicast_event(in_dev);
out:
@@ -1699,7 +1696,7 @@ void ip_mc_remap(struct in_device *in_dev)
#ifdef CONFIG_IP_MULTICAST
igmpv3_del_delrec(in_dev, pmc);
#endif
- igmp_group_added(pmc, pmc->sfmode);
+ igmp_group_added(pmc);
}
}
@@ -1762,7 +1759,7 @@ void ip_mc_up(struct in_device *in_dev)
#ifdef CONFIG_IP_MULTICAST
igmpv3_del_delrec(in_dev, pmc);
#endif
- igmp_group_added(pmc, pmc->sfmode);
+ igmp_group_added(pmc);
}
}
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 33a88e045efd..15e7f7915a21 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -107,6 +107,15 @@ bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
}
EXPORT_SYMBOL(inet_rcv_saddr_equal);
+bool inet_rcv_saddr_any(const struct sock *sk)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6)
+ return ipv6_addr_any(&sk->sk_v6_rcv_saddr);
+#endif
+ return !sk->sk_rcv_saddr;
+}
+
void inet_get_local_port_range(struct net *net, int *low, int *high)
{
unsigned int seq;
@@ -535,7 +544,8 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk,
struct ip_options_rcu *opt;
struct rtable *rt;
- opt = ireq_opt_deref(ireq);
+ rcu_read_lock();
+ opt = rcu_dereference(ireq->ireq_opt);
flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark,
RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
@@ -549,11 +559,13 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk,
goto no_route;
if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway)
goto route_err;
+ rcu_read_unlock();
return &rt->dst;
route_err:
ip_rt_put(rt);
no_route:
+ rcu_read_unlock();
__IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
return NULL;
}
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 0d70608cc2e1..bcb11f3a27c0 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -20,6 +20,7 @@
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/slab.h>
+#include <linux/rhashtable.h>
#include <net/sock.h>
#include <net/inet_frag.h>
@@ -136,12 +137,16 @@ void inet_frag_destroy(struct inet_frag_queue *q)
fp = q->fragments;
nf = q->net;
f = nf->f;
- while (fp) {
- struct sk_buff *xp = fp->next;
-
- sum_truesize += fp->truesize;
- kfree_skb(fp);
- fp = xp;
+ if (fp) {
+ do {
+ struct sk_buff *xp = fp->next;
+
+ sum_truesize += fp->truesize;
+ kfree_skb(fp);
+ fp = xp;
+ } while (fp);
+ } else {
+ sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments);
}
sum = sum_truesize + f->qsize;
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 3647167c8fa3..f5c9ef2586de 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -328,7 +328,7 @@ struct sock *__inet_lookup_listener(struct net *net,
saddr, sport, daddr, hnum,
dif, sdif);
if (result)
- return result;
+ goto done;
/* Lookup lhash2 with INADDR_ANY */
@@ -337,9 +337,10 @@ struct sock *__inet_lookup_listener(struct net *net,
if (ilb2->count > ilb->count)
goto port_lookup;
- return inet_lhash2_lookup(net, ilb2, skb, doff,
- saddr, sport, daddr, hnum,
- dif, sdif);
+ result = inet_lhash2_lookup(net, ilb2, skb, doff,
+ saddr, sport, daddr, hnum,
+ dif, sdif);
+ goto done;
port_lookup:
sk_for_each_rcu(sk, &ilb->head) {
@@ -352,12 +353,15 @@ port_lookup:
result = reuseport_select_sock(sk, phash,
skb, doff);
if (result)
- return result;
+ goto done;
}
result = sk;
hiscore = score;
}
}
+done:
+ if (unlikely(IS_ERR(result)))
+ return NULL;
return result;
}
EXPORT_SYMBOL_GPL(__inet_lookup_listener);
@@ -567,10 +571,11 @@ static int inet_reuseport_add_sock(struct sock *sk,
inet_csk(sk2)->icsk_bind_hash == tb &&
sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
inet_rcv_saddr_equal(sk, sk2, false))
- return reuseport_add_sock(sk, sk2);
+ return reuseport_add_sock(sk, sk2,
+ inet_rcv_saddr_any(sk));
}
- return reuseport_alloc(sk);
+ return reuseport_alloc(sk, inet_rcv_saddr_any(sk));
}
int __inet_hash(struct sock *sk, struct sock *osk)
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index b54b948b0596..32662e9e5d21 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -143,7 +143,8 @@ int ip_forward(struct sk_buff *skb)
!skb_sec_path(skb))
ip_rt_send_redirect(skb);
- skb->priority = rt_tos2priority(iph->tos);
+ if (net->ipv4.sysctl_ip_fwd_update_priority)
+ skb->priority = rt_tos2priority(iph->tos);
return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD,
net, NULL, skb, skb->dev, rt->dst.dev,
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index d14d741fb05e..9b0158fa431f 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -57,6 +57,57 @@
*/
static const char ip_frag_cache_name[] = "ip4-frags";
+/* Use skb->cb to track consecutive/adjacent fragments coming at
+ * the end of the queue. Nodes in the rb-tree queue will
+ * contain "runs" of one or more adjacent fragments.
+ *
+ * Invariants:
+ * - next_frag is NULL at the tail of a "run";
+ * - the head of a "run" has the sum of all fragment lengths in frag_run_len.
+ */
+struct ipfrag_skb_cb {
+ struct inet_skb_parm h;
+ struct sk_buff *next_frag;
+ int frag_run_len;
+};
+
+#define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb))
+
+static void ip4_frag_init_run(struct sk_buff *skb)
+{
+ BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb));
+
+ FRAG_CB(skb)->next_frag = NULL;
+ FRAG_CB(skb)->frag_run_len = skb->len;
+}
+
+/* Append skb to the last "run". */
+static void ip4_frag_append_to_last_run(struct inet_frag_queue *q,
+ struct sk_buff *skb)
+{
+ RB_CLEAR_NODE(&skb->rbnode);
+ FRAG_CB(skb)->next_frag = NULL;
+
+ FRAG_CB(q->last_run_head)->frag_run_len += skb->len;
+ FRAG_CB(q->fragments_tail)->next_frag = skb;
+ q->fragments_tail = skb;
+}
+
+/* Create a new "run" with the skb. */
+static void ip4_frag_create_run(struct inet_frag_queue *q, struct sk_buff *skb)
+{
+ if (q->last_run_head)
+ rb_link_node(&skb->rbnode, &q->last_run_head->rbnode,
+ &q->last_run_head->rbnode.rb_right);
+ else
+ rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node);
+ rb_insert_color(&skb->rbnode, &q->rb_fragments);
+
+ ip4_frag_init_run(skb);
+ q->fragments_tail = skb;
+ q->last_run_head = skb;
+}
+
/* Describe an entry in the "incomplete datagrams" queue. */
struct ipq {
struct inet_frag_queue q;
@@ -75,8 +126,8 @@ static u8 ip4_frag_ecn(u8 tos)
static struct inet_frags ip4_frags;
-static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
- struct net_device *dev);
+static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
+ struct sk_buff *prev_tail, struct net_device *dev);
static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
@@ -136,7 +187,7 @@ static void ip_expire(struct timer_list *t)
{
struct inet_frag_queue *frag = from_timer(frag, t, timer);
const struct iphdr *iph;
- struct sk_buff *head;
+ struct sk_buff *head = NULL;
struct net *net;
struct ipq *qp;
int err;
@@ -152,14 +203,36 @@ static void ip_expire(struct timer_list *t)
ipq_kill(qp);
__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
-
- head = qp->q.fragments;
-
__IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT);
- if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !head)
+ if (!(qp->q.flags & INET_FRAG_FIRST_IN))
goto out;
+ /* sk_buff::dev and sk_buff::rbnode are unionized. So we
+ * pull the head out of the tree in order to be able to
+ * deal with head->dev.
+ */
+ if (qp->q.fragments) {
+ head = qp->q.fragments;
+ qp->q.fragments = head->next;
+ } else {
+ head = skb_rb_first(&qp->q.rb_fragments);
+ if (!head)
+ goto out;
+ if (FRAG_CB(head)->next_frag)
+ rb_replace_node(&head->rbnode,
+ &FRAG_CB(head)->next_frag->rbnode,
+ &qp->q.rb_fragments);
+ else
+ rb_erase(&head->rbnode, &qp->q.rb_fragments);
+ memset(&head->rbnode, 0, sizeof(head->rbnode));
+ barrier();
+ }
+ if (head == qp->q.fragments_tail)
+ qp->q.fragments_tail = NULL;
+
+ sub_frag_mem_limit(qp->q.net, head->truesize);
+
head->dev = dev_get_by_index_rcu(net, qp->iif);
if (!head->dev)
goto out;
@@ -179,16 +252,15 @@ static void ip_expire(struct timer_list *t)
(skb_rtable(head)->rt_type != RTN_LOCAL))
goto out;
- skb_get(head);
spin_unlock(&qp->q.lock);
icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
- kfree_skb(head);
goto out_rcu_unlock;
out:
spin_unlock(&qp->q.lock);
out_rcu_unlock:
rcu_read_unlock();
+ kfree_skb(head);
ipq_put(qp);
}
@@ -231,7 +303,7 @@ static int ip_frag_too_far(struct ipq *qp)
end = atomic_inc_return(&peer->rid);
qp->rid = end;
- rc = qp->q.fragments && (end - start) > max;
+ rc = qp->q.fragments_tail && (end - start) > max;
if (rc) {
struct net *net;
@@ -245,7 +317,6 @@ static int ip_frag_too_far(struct ipq *qp)
static int ip_frag_reinit(struct ipq *qp)
{
- struct sk_buff *fp;
unsigned int sum_truesize = 0;
if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) {
@@ -253,21 +324,16 @@ static int ip_frag_reinit(struct ipq *qp)
return -ETIMEDOUT;
}
- fp = qp->q.fragments;
- do {
- struct sk_buff *xp = fp->next;
-
- sum_truesize += fp->truesize;
- kfree_skb(fp);
- fp = xp;
- } while (fp);
+ sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments);
sub_frag_mem_limit(qp->q.net, sum_truesize);
qp->q.flags = 0;
qp->q.len = 0;
qp->q.meat = 0;
qp->q.fragments = NULL;
+ qp->q.rb_fragments = RB_ROOT;
qp->q.fragments_tail = NULL;
+ qp->q.last_run_head = NULL;
qp->iif = 0;
qp->ecn = 0;
@@ -277,7 +343,9 @@ static int ip_frag_reinit(struct ipq *qp)
/* Add new segment to existing queue. */
static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
{
- struct sk_buff *prev, *next;
+ struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
+ struct rb_node **rbn, *parent;
+ struct sk_buff *skb1, *prev_tail;
struct net_device *dev;
unsigned int fragsize;
int flags, offset;
@@ -313,7 +381,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
*/
if (end < qp->q.len ||
((qp->q.flags & INET_FRAG_LAST_IN) && end != qp->q.len))
- goto err;
+ goto discard_qp;
qp->q.flags |= INET_FRAG_LAST_IN;
qp->q.len = end;
} else {
@@ -325,115 +393,77 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
if (end > qp->q.len) {
/* Some bits beyond end -> corruption. */
if (qp->q.flags & INET_FRAG_LAST_IN)
- goto err;
+ goto discard_qp;
qp->q.len = end;
}
}
if (end == offset)
- goto err;
+ goto discard_qp;
err = -ENOMEM;
if (!pskb_pull(skb, skb_network_offset(skb) + ihl))
- goto err;
+ goto discard_qp;
err = pskb_trim_rcsum(skb, end - offset);
if (err)
- goto err;
+ goto discard_qp;
- /* Find out which fragments are in front and at the back of us
- * in the chain of fragments so far. We must know where to put
- * this fragment, right?
- */
- prev = qp->q.fragments_tail;
- if (!prev || prev->ip_defrag_offset < offset) {
- next = NULL;
- goto found;
- }
- prev = NULL;
- for (next = qp->q.fragments; next != NULL; next = next->next) {
- if (next->ip_defrag_offset >= offset)
- break; /* bingo! */
- prev = next;
- }
+ /* Note : skb->rbnode and skb->dev share the same location. */
+ dev = skb->dev;
+ /* Makes sure compiler wont do silly aliasing games */
+ barrier();
-found:
- /* We found where to put this one. Check for overlap with
- * preceding fragment, and, if needed, align things so that
- * any overlaps are eliminated.
+ /* RFC5722, Section 4, amended by Errata ID : 3089
+ * When reassembling an IPv6 datagram, if
+ * one or more its constituent fragments is determined to be an
+ * overlapping fragment, the entire datagram (and any constituent
+ * fragments) MUST be silently discarded.
+ *
+ * We do the same here for IPv4 (and increment an snmp counter).
*/
- if (prev) {
- int i = (prev->ip_defrag_offset + prev->len) - offset;
-
- if (i > 0) {
- offset += i;
- err = -EINVAL;
- if (end <= offset)
- goto err;
- err = -ENOMEM;
- if (!pskb_pull(skb, i))
- goto err;
- if (skb->ip_summed != CHECKSUM_UNNECESSARY)
- skb->ip_summed = CHECKSUM_NONE;
- }
- }
- err = -ENOMEM;
-
- while (next && next->ip_defrag_offset < end) {
- int i = end - next->ip_defrag_offset; /* overlap is 'i' bytes */
-
- if (i < next->len) {
- int delta = -next->truesize;
-
- /* Eat head of the next overlapped fragment
- * and leave the loop. The next ones cannot overlap.
- */
- if (!pskb_pull(next, i))
- goto err;
- delta += next->truesize;
- if (delta)
- add_frag_mem_limit(qp->q.net, delta);
- next->ip_defrag_offset += i;
- qp->q.meat -= i;
- if (next->ip_summed != CHECKSUM_UNNECESSARY)
- next->ip_summed = CHECKSUM_NONE;
- break;
- } else {
- struct sk_buff *free_it = next;
-
- /* Old fragment is completely overridden with
- * new one drop it.
- */
- next = next->next;
-
- if (prev)
- prev->next = next;
- else
- qp->q.fragments = next;
-
- qp->q.meat -= free_it->len;
- sub_frag_mem_limit(qp->q.net, free_it->truesize);
- kfree_skb(free_it);
- }
+ err = -EINVAL;
+ /* Find out where to put this fragment. */
+ prev_tail = qp->q.fragments_tail;
+ if (!prev_tail)
+ ip4_frag_create_run(&qp->q, skb); /* First fragment. */
+ else if (prev_tail->ip_defrag_offset + prev_tail->len < end) {
+ /* This is the common case: skb goes to the end. */
+ /* Detect and discard overlaps. */
+ if (offset < prev_tail->ip_defrag_offset + prev_tail->len)
+ goto overlap;
+ if (offset == prev_tail->ip_defrag_offset + prev_tail->len)
+ ip4_frag_append_to_last_run(&qp->q, skb);
+ else
+ ip4_frag_create_run(&qp->q, skb);
+ } else {
+ /* Binary search. Note that skb can become the first fragment,
+ * but not the last (covered above).
+ */
+ rbn = &qp->q.rb_fragments.rb_node;
+ do {
+ parent = *rbn;
+ skb1 = rb_to_skb(parent);
+ if (end <= skb1->ip_defrag_offset)
+ rbn = &parent->rb_left;
+ else if (offset >= skb1->ip_defrag_offset +
+ FRAG_CB(skb1)->frag_run_len)
+ rbn = &parent->rb_right;
+ else /* Found an overlap with skb1. */
+ goto overlap;
+ } while (*rbn);
+ /* Here we have parent properly set, and rbn pointing to
+ * one of its NULL left/right children. Insert skb.
+ */
+ ip4_frag_init_run(skb);
+ rb_link_node(&skb->rbnode, parent, rbn);
+ rb_insert_color(&skb->rbnode, &qp->q.rb_fragments);
}
- /* Note : skb->ip_defrag_offset and skb->dev share the same location */
- dev = skb->dev;
if (dev)
qp->iif = dev->ifindex;
- /* Makes sure compiler wont do silly aliasing games */
- barrier();
skb->ip_defrag_offset = offset;
- /* Insert this fragment in the chain of fragments. */
- skb->next = next;
- if (!next)
- qp->q.fragments_tail = skb;
- if (prev)
- prev->next = skb;
- else
- qp->q.fragments = skb;
-
qp->q.stamp = skb->tstamp;
qp->q.meat += skb->len;
qp->ecn |= ecn;
@@ -455,28 +485,34 @@ found:
unsigned long orefdst = skb->_skb_refdst;
skb->_skb_refdst = 0UL;
- err = ip_frag_reasm(qp, prev, dev);
+ err = ip_frag_reasm(qp, skb, prev_tail, dev);
skb->_skb_refdst = orefdst;
+ if (err)
+ inet_frag_kill(&qp->q);
return err;
}
skb_dst_drop(skb);
return -EINPROGRESS;
+overlap:
+ __IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS);
+discard_qp:
+ inet_frag_kill(&qp->q);
err:
kfree_skb(skb);
return err;
}
-
/* Build a new IP datagram from all its fragments. */
-
-static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
- struct net_device *dev)
+static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
+ struct sk_buff *prev_tail, struct net_device *dev)
{
struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
struct iphdr *iph;
- struct sk_buff *fp, *head = qp->q.fragments;
+ struct sk_buff *fp, *head = skb_rb_first(&qp->q.rb_fragments);
+ struct sk_buff **nextp; /* To build frag_list. */
+ struct rb_node *rbn;
int len;
int ihlen;
int err;
@@ -490,25 +526,26 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
goto out_fail;
}
/* Make the one we just received the head. */
- if (prev) {
- head = prev->next;
- fp = skb_clone(head, GFP_ATOMIC);
+ if (head != skb) {
+ fp = skb_clone(skb, GFP_ATOMIC);
if (!fp)
goto out_nomem;
-
- fp->next = head->next;
- if (!fp->next)
+ FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag;
+ if (RB_EMPTY_NODE(&skb->rbnode))
+ FRAG_CB(prev_tail)->next_frag = fp;
+ else
+ rb_replace_node(&skb->rbnode, &fp->rbnode,
+ &qp->q.rb_fragments);
+ if (qp->q.fragments_tail == skb)
qp->q.fragments_tail = fp;
- prev->next = fp;
-
- skb_morph(head, qp->q.fragments);
- head->next = qp->q.fragments->next;
-
- consume_skb(qp->q.fragments);
- qp->q.fragments = head;
+ skb_morph(skb, head);
+ FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag;
+ rb_replace_node(&head->rbnode, &skb->rbnode,
+ &qp->q.rb_fragments);
+ consume_skb(head);
+ head = skb;
}
- WARN_ON(!head);
WARN_ON(head->ip_defrag_offset != 0);
/* Allocate a new buffer for the datagram. */
@@ -533,35 +570,61 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
clone = alloc_skb(0, GFP_ATOMIC);
if (!clone)
goto out_nomem;
- clone->next = head->next;
- head->next = clone;
skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
skb_frag_list_init(head);
for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
clone->len = clone->data_len = head->data_len - plen;
- head->data_len -= clone->len;
- head->len -= clone->len;
+ head->truesize += clone->truesize;
clone->csum = 0;
clone->ip_summed = head->ip_summed;
add_frag_mem_limit(qp->q.net, clone->truesize);
+ skb_shinfo(head)->frag_list = clone;
+ nextp = &clone->next;
+ } else {
+ nextp = &skb_shinfo(head)->frag_list;
}
- skb_shinfo(head)->frag_list = head->next;
skb_push(head, head->data - skb_network_header(head));
- for (fp=head->next; fp; fp = fp->next) {
- head->data_len += fp->len;
- head->len += fp->len;
- if (head->ip_summed != fp->ip_summed)
- head->ip_summed = CHECKSUM_NONE;
- else if (head->ip_summed == CHECKSUM_COMPLETE)
- head->csum = csum_add(head->csum, fp->csum);
- head->truesize += fp->truesize;
+ /* Traverse the tree in order, to build frag_list. */
+ fp = FRAG_CB(head)->next_frag;
+ rbn = rb_next(&head->rbnode);
+ rb_erase(&head->rbnode, &qp->q.rb_fragments);
+ while (rbn || fp) {
+ /* fp points to the next sk_buff in the current run;
+ * rbn points to the next run.
+ */
+ /* Go through the current run. */
+ while (fp) {
+ *nextp = fp;
+ nextp = &fp->next;
+ fp->prev = NULL;
+ memset(&fp->rbnode, 0, sizeof(fp->rbnode));
+ fp->sk = NULL;
+ head->data_len += fp->len;
+ head->len += fp->len;
+ if (head->ip_summed != fp->ip_summed)
+ head->ip_summed = CHECKSUM_NONE;
+ else if (head->ip_summed == CHECKSUM_COMPLETE)
+ head->csum = csum_add(head->csum, fp->csum);
+ head->truesize += fp->truesize;
+ fp = FRAG_CB(fp)->next_frag;
+ }
+ /* Move to the next run. */
+ if (rbn) {
+ struct rb_node *rbnext = rb_next(rbn);
+
+ fp = rb_to_skb(rbn);
+ rb_erase(rbn, &qp->q.rb_fragments);
+ rbn = rbnext;
+ }
}
sub_frag_mem_limit(qp->q.net, head->truesize);
- head->next = NULL;
+ *nextp = NULL;
+ skb_mark_not_on_list(head);
+ head->prev = NULL;
head->dev = dev;
head->tstamp = qp->q.stamp;
IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size);
@@ -589,7 +652,9 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
__IP_INC_STATS(net, IPSTATS_MIB_REASMOKS);
qp->q.fragments = NULL;
+ qp->q.rb_fragments = RB_ROOT;
qp->q.fragments_tail = NULL;
+ qp->q.last_run_head = NULL;
return 0;
out_nomem:
@@ -671,6 +736,28 @@ struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user)
}
EXPORT_SYMBOL(ip_check_defrag);
+unsigned int inet_frag_rbtree_purge(struct rb_root *root)
+{
+ struct rb_node *p = rb_first(root);
+ unsigned int sum = 0;
+
+ while (p) {
+ struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);
+
+ p = rb_next(p);
+ rb_erase(&skb->rbnode, root);
+ while (skb) {
+ struct sk_buff *next = FRAG_CB(skb)->next_frag;
+
+ sum += skb->truesize;
+ kfree_skb(skb);
+ skb = next;
+ }
+ }
+ return sum;
+}
+EXPORT_SYMBOL(inet_frag_rbtree_purge);
+
#ifdef CONFIG_SYSCTL
static int dist_min;
@@ -735,7 +822,6 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net)
table[0].data = &net->ipv4.frags.high_thresh;
table[0].extra1 = &net->ipv4.frags.low_thresh;
- table[0].extra2 = &init_net.ipv4.frags.high_thresh;
table[1].data = &net->ipv4.frags.low_thresh;
table[1].extra2 = &net->ipv4.frags.high_thresh;
table[2].data = &net->ipv4.frags.timeout;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 2d8efeecf619..38befe829caf 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -178,6 +178,9 @@ static void ipgre_err(struct sk_buff *skb, u32 info,
if (tpi->proto == htons(ETH_P_TEB))
itn = net_generic(net, gre_tap_net_id);
+ else if (tpi->proto == htons(ETH_P_ERSPAN) ||
+ tpi->proto == htons(ETH_P_ERSPAN2))
+ itn = net_generic(net, erspan_net_id);
else
itn = net_generic(net, ipgre_net_id);
@@ -229,22 +232,19 @@ static void gre_err(struct sk_buff *skb, u32 info)
const int type = icmp_hdr(skb)->type;
const int code = icmp_hdr(skb)->code;
struct tnl_ptk_info tpi;
- bool csum_err = false;
- if (gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IP),
- iph->ihl * 4) < 0) {
- if (!csum_err) /* ignore csum errors. */
- return;
- }
+ if (gre_parse_header(skb, &tpi, NULL, htons(ETH_P_IP),
+ iph->ihl * 4) < 0)
+ return;
if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
ipv4_update_pmtu(skb, dev_net(skb->dev), info,
- skb->dev->ifindex, 0, IPPROTO_GRE, 0);
+ skb->dev->ifindex, IPPROTO_GRE);
return;
}
if (type == ICMP_REDIRECT) {
- ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex, 0,
- IPPROTO_GRE, 0);
+ ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex,
+ IPPROTO_GRE);
return;
}
@@ -328,6 +328,8 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
return PACKET_RCVD;
}
+ return PACKET_REJECT;
+
drop:
kfree_skb(skb);
return PACKET_RCVD;
@@ -587,6 +589,8 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev,
goto err_free_skb;
key = &tun_info->key;
+ if (!(tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT))
+ goto err_free_rt;
md = ip_tunnel_info_opts(tun_info);
if (!md)
goto err_free_rt;
@@ -983,7 +987,6 @@ static void ipgre_tunnel_setup(struct net_device *dev)
static void __gre_tunnel_init(struct net_device *dev)
{
struct ip_tunnel *tunnel;
- int t_hlen;
tunnel = netdev_priv(dev);
tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags);
@@ -991,8 +994,6 @@ static void __gre_tunnel_init(struct net_device *dev)
tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;
- t_hlen = tunnel->hlen + sizeof(struct iphdr);
-
dev->features |= GRE_FEATURES;
dev->hw_features |= GRE_FEATURES;
@@ -1302,13 +1303,11 @@ static const struct net_device_ops gre_tap_netdev_ops = {
static int erspan_tunnel_init(struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
- int t_hlen;
tunnel->tun_hlen = 8;
tunnel->parms.iph.protocol = IPPROTO_GRE;
tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen +
erspan_hdr_len(tunnel->erspan_ver);
- t_hlen = tunnel->hlen + sizeof(struct iphdr);
dev->features |= GRE_FEATURES;
dev->hw_features |= GRE_FEATURES;
@@ -1511,11 +1510,14 @@ nla_put_failure:
static void erspan_setup(struct net_device *dev)
{
+ struct ip_tunnel *t = netdev_priv(dev);
+
ether_setup(dev);
dev->netdev_ops = &erspan_netdev_ops;
dev->priv_flags &= ~IFF_TX_SKB_SHARING;
dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
ip_tunnel_setup(dev, erspan_net_id);
+ t->erspan_ver = 1;
}
static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 7582713dd18f..35a786c0aaa0 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -307,7 +307,8 @@ drop:
return true;
}
-static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+static int ip_rcv_finish_core(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
{
const struct iphdr *iph = ip_hdr(skb);
int (*edemux)(struct sk_buff *skb);
@@ -315,13 +316,6 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
struct rtable *rt;
int err;
- /* if ingress device is enslaved to an L3 master device pass the
- * skb to its handler for processing
- */
- skb = l3mdev_ip_rcv(skb);
- if (!skb)
- return NET_RX_SUCCESS;
-
if (net->ipv4.sysctl_ip_early_demux &&
!skb_dst(skb) &&
!skb->sk &&
@@ -393,7 +387,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
goto drop;
}
- return dst_input(skb);
+ return NET_RX_SUCCESS;
drop:
kfree_skb(skb);
@@ -405,13 +399,29 @@ drop_error:
goto drop;
}
+static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ int ret;
+
+ /* if ingress device is enslaved to an L3 master device pass the
+ * skb to its handler for processing
+ */
+ skb = l3mdev_ip_rcv(skb);
+ if (!skb)
+ return NET_RX_SUCCESS;
+
+ ret = ip_rcv_finish_core(net, sk, skb);
+ if (ret != NET_RX_DROP)
+ ret = dst_input(skb);
+ return ret;
+}
+
/*
* Main IP Receive routine.
*/
-int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
+static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
{
const struct iphdr *iph;
- struct net *net;
u32 len;
/* When the interface is in promisc. mode, drop all the crap
@@ -421,7 +431,6 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
goto drop;
- net = dev_net(dev);
__IP_UPD_PO_STATS(net, IPSTATS_MIB_IN, skb->len);
skb = skb_share_check(skb, GFP_ATOMIC);
@@ -489,9 +498,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
/* Must drop socket now because of tproxy. */
skb_orphan(skb);
- return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
- net, NULL, skb, dev, NULL,
- ip_rcv_finish);
+ return skb;
csum_error:
__IP_INC_STATS(net, IPSTATS_MIB_CSUMERRORS);
@@ -500,5 +507,109 @@ inhdr_error:
drop:
kfree_skb(skb);
out:
- return NET_RX_DROP;
+ return NULL;
+}
+
+/*
+ * IP receive entry point
+ */
+int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
+ struct net_device *orig_dev)
+{
+ struct net *net = dev_net(dev);
+
+ skb = ip_rcv_core(skb, net);
+ if (skb == NULL)
+ return NET_RX_DROP;
+ return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
+ net, NULL, skb, dev, NULL,
+ ip_rcv_finish);
+}
+
+static void ip_sublist_rcv_finish(struct list_head *head)
+{
+ struct sk_buff *skb, *next;
+
+ list_for_each_entry_safe(skb, next, head, list) {
+ skb_list_del_init(skb);
+ dst_input(skb);
+ }
+}
+
+static void ip_list_rcv_finish(struct net *net, struct sock *sk,
+ struct list_head *head)
+{
+ struct dst_entry *curr_dst = NULL;
+ struct sk_buff *skb, *next;
+ struct list_head sublist;
+
+ INIT_LIST_HEAD(&sublist);
+ list_for_each_entry_safe(skb, next, head, list) {
+ struct dst_entry *dst;
+
+ list_del(&skb->list);
+ /* if ingress device is enslaved to an L3 master device pass the
+ * skb to its handler for processing
+ */
+ skb = l3mdev_ip_rcv(skb);
+ if (!skb)
+ continue;
+ if (ip_rcv_finish_core(net, sk, skb) == NET_RX_DROP)
+ continue;
+
+ dst = skb_dst(skb);
+ if (curr_dst != dst) {
+ /* dispatch old sublist */
+ if (!list_empty(&sublist))
+ ip_sublist_rcv_finish(&sublist);
+ /* start new sublist */
+ INIT_LIST_HEAD(&sublist);
+ curr_dst = dst;
+ }
+ list_add_tail(&skb->list, &sublist);
+ }
+ /* dispatch final sublist */
+ ip_sublist_rcv_finish(&sublist);
+}
+
+static void ip_sublist_rcv(struct list_head *head, struct net_device *dev,
+ struct net *net)
+{
+ NF_HOOK_LIST(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL,
+ head, dev, NULL, ip_rcv_finish);
+ ip_list_rcv_finish(net, NULL, head);
+}
+
+/* Receive a list of IP packets */
+void ip_list_rcv(struct list_head *head, struct packet_type *pt,
+ struct net_device *orig_dev)
+{
+ struct net_device *curr_dev = NULL;
+ struct net *curr_net = NULL;
+ struct sk_buff *skb, *next;
+ struct list_head sublist;
+
+ INIT_LIST_HEAD(&sublist);
+ list_for_each_entry_safe(skb, next, head, list) {
+ struct net_device *dev = skb->dev;
+ struct net *net = dev_net(dev);
+
+ list_del(&skb->list);
+ skb = ip_rcv_core(skb, net);
+ if (skb == NULL)
+ continue;
+
+ if (curr_dev != dev || curr_net != net) {
+ /* dispatch old sublist */
+ if (!list_empty(&sublist))
+ ip_sublist_rcv(&sublist, curr_dev, curr_net);
+ /* start new sublist */
+ INIT_LIST_HEAD(&sublist);
+ curr_dev = dev;
+ curr_net = net;
+ }
+ list_add_tail(&skb->list, &sublist);
+ }
+ /* dispatch final sublist */
+ ip_sublist_rcv(&sublist, curr_dev, curr_net);
}
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 0e3edd25f881..c09219e7f230 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -278,7 +278,7 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk,
struct sk_buff *nskb = segs->next;
int err;
- segs->next = NULL;
+ skb_mark_not_on_list(segs);
err = ip_fragment(net, sk, segs, mtu, ip_finish_output2);
if (err && ret == 0)
@@ -423,7 +423,8 @@ static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
}
/* Note: skb->sk can be different from sk, in case of tunnels */
-int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
+int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
+ __u8 tos)
{
struct inet_sock *inet = inet_sk(sk);
struct net *net = sock_net(sk);
@@ -462,7 +463,7 @@ int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
inet->inet_dport,
inet->inet_sport,
sk->sk_protocol,
- RT_CONN_FLAGS(sk),
+ RT_CONN_FLAGS_TOS(sk, tos),
sk->sk_bound_dev_if);
if (IS_ERR(rt))
goto no_route;
@@ -478,7 +479,7 @@ packet_routed:
skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
skb_reset_network_header(skb);
iph = ip_hdr(skb);
- *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
+ *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (tos & 0xff));
if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df)
iph->frag_off = htons(IP_DF);
else
@@ -511,7 +512,7 @@ no_route:
kfree_skb(skb);
return -EHOSTUNREACH;
}
-EXPORT_SYMBOL(ip_queue_xmit);
+EXPORT_SYMBOL(__ip_queue_xmit);
static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
@@ -683,7 +684,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
skb = frag;
frag = skb->next;
- skb->next = NULL;
+ skb_mark_not_on_list(skb);
}
if (err == 0) {
@@ -1147,14 +1148,15 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
cork->fragsize = ip_sk_use_pmtu(sk) ?
dst_mtu(&rt->dst) : rt->dst.dev->mtu;
- cork->gso_size = sk->sk_type == SOCK_DGRAM &&
- sk->sk_protocol == IPPROTO_UDP ? ipc->gso_size : 0;
+ cork->gso_size = ipc->gso_size;
cork->dst = &rt->dst;
cork->length = 0;
cork->ttl = ipc->ttl;
cork->tos = ipc->tos;
cork->priority = ipc->priority;
- cork->tx_flags = ipc->tx_flags;
+ cork->transmit_time = ipc->sockc.transmit_time;
+ cork->tx_flags = 0;
+ sock_tx_timestamp(sk, ipc->sockc.tsflags, &cork->tx_flags);
return 0;
}
@@ -1415,6 +1417,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
skb->priority = (cork->tos != -1) ? cork->priority: sk->sk_priority;
skb->mark = sk->sk_mark;
+ skb->tstamp = cork->transmit_time;
/*
* Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
* on dst refcount
@@ -1547,11 +1550,8 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
if (__ip_options_echo(net, &replyopts.opt.opt, skb, sopt))
return;
+ ipcm_init(&ipc);
ipc.addr = daddr;
- ipc.opt = NULL;
- ipc.tx_flags = 0;
- ipc.ttl = 0;
- ipc.tos = -1;
if (replyopts.opt.opt.optlen) {
ipc.opt = &replyopts.opt;
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index c0fe5ad996f2..26c36cccabdc 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -149,7 +149,6 @@ static void ip_cmsg_recv_security(struct msghdr *msg, struct sk_buff *skb)
static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)
{
struct sockaddr_in sin;
- const struct iphdr *iph = ip_hdr(skb);
__be16 *ports;
int end;
@@ -164,7 +163,7 @@ static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)
ports = (__be16 *)skb_transport_header(skb);
sin.sin_family = AF_INET;
- sin.sin_addr.s_addr = iph->daddr;
+ sin.sin_addr.s_addr = ip_hdr(skb)->daddr;
sin.sin_port = ports[1];
memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index c4f5602308ed..284a22154b4e 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -627,6 +627,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
const struct iphdr *tnl_params, u8 protocol)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
+ unsigned int inner_nhdr_len = 0;
const struct iphdr *inner_iph;
struct flowi4 fl4;
u8 tos, ttl;
@@ -636,6 +637,14 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
__be32 dst;
bool connected;
+ /* ensure we can access the inner net header, for several users below */
+ if (skb->protocol == htons(ETH_P_IP))
+ inner_nhdr_len = sizeof(struct iphdr);
+ else if (skb->protocol == htons(ETH_P_IPV6))
+ inner_nhdr_len = sizeof(struct ipv6hdr);
+ if (unlikely(!pskb_may_pull(skb, inner_nhdr_len)))
+ goto tx_error;
+
inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
connected = (tunnel->parms.iph.daddr != 0);
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 3f091ccad9af..de31b302d69c 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -318,9 +318,9 @@ static int vti4_err(struct sk_buff *skb, u32 info)
return 0;
if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
- ipv4_update_pmtu(skb, net, info, 0, 0, protocol, 0);
+ ipv4_update_pmtu(skb, net, info, 0, protocol);
else
- ipv4_redirect(skb, net, 0, 0, protocol, 0);
+ ipv4_redirect(skb, net, 0, protocol);
xfrm_state_put(x);
return 0;
@@ -438,7 +438,8 @@ static int __net_init vti_init_net(struct net *net)
if (err)
return err;
itn = net_generic(net, vti_net_id);
- vti_fb_tunnel_init(itn->fb_tunnel_dev);
+ if (itn->fb_tunnel_dev)
+ vti_fb_tunnel_init(itn->fb_tunnel_dev);
return 0;
}
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index d97f4f2787f5..9119d012ba46 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -48,9 +48,9 @@ static int ipcomp4_err(struct sk_buff *skb, u32 info)
return 0;
if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
- ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_COMP, 0);
+ ipv4_update_pmtu(skb, net, info, 0, IPPROTO_COMP);
else
- ipv4_redirect(skb, net, 0, 0, IPPROTO_COMP, 0);
+ ipv4_redirect(skb, net, 0, IPPROTO_COMP);
xfrm_state_put(x);
return 0;
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index c891235b4966..e65287c27e3d 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -175,13 +175,12 @@ static int ipip_err(struct sk_buff *skb, u32 info)
}
if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
- ipv4_update_pmtu(skb, net, info, t->parms.link, 0,
- iph->protocol, 0);
+ ipv4_update_pmtu(skb, net, info, t->parms.link, iph->protocol);
goto out;
}
if (type == ICMP_REDIRECT) {
- ipv4_redirect(skb, net, t->parms.link, 0, iph->protocol, 0);
+ ipv4_redirect(skb, net, t->parms.link, iph->protocol);
goto out;
}
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 9f79b9803a16..7a3e2acda94c 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -60,6 +60,7 @@
#include <linux/netfilter_ipv4.h>
#include <linux/compat.h>
#include <linux/export.h>
+#include <linux/rhashtable.h>
#include <net/ip_tunnels.h>
#include <net/checksum.h>
#include <net/netlink.h>
@@ -1051,7 +1052,7 @@ static int ipmr_cache_report(struct mr_table *mrt,
struct sk_buff *skb;
int ret;
- if (assert == IGMPMSG_WHOLEPKT)
+ if (assert == IGMPMSG_WHOLEPKT || assert == IGMPMSG_WRVIFWHOLE)
skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
else
skb = alloc_skb(128, GFP_ATOMIC);
@@ -1059,7 +1060,7 @@ static int ipmr_cache_report(struct mr_table *mrt,
if (!skb)
return -ENOBUFS;
- if (assert == IGMPMSG_WHOLEPKT) {
+ if (assert == IGMPMSG_WHOLEPKT || assert == IGMPMSG_WRVIFWHOLE) {
/* Ugly, but we have no choice with this interface.
* Duplicate old header, fix ihl, length etc.
* And all this only to mangle msg->im_msgtype and
@@ -1070,9 +1071,12 @@ static int ipmr_cache_report(struct mr_table *mrt,
skb_reset_transport_header(skb);
msg = (struct igmpmsg *)skb_network_header(skb);
memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
- msg->im_msgtype = IGMPMSG_WHOLEPKT;
+ msg->im_msgtype = assert;
msg->im_mbz = 0;
- msg->im_vif = mrt->mroute_reg_vif_num;
+ if (assert == IGMPMSG_WRVIFWHOLE)
+ msg->im_vif = vifi;
+ else
+ msg->im_vif = mrt->mroute_reg_vif_num;
ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
sizeof(struct iphdr));
@@ -1371,6 +1375,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
struct mr_table *mrt;
struct vifctl vif;
struct mfcctl mfc;
+ bool do_wrvifwhole;
u32 uval;
/* There's one exception to the lock - MRT_DONE which needs to unlock */
@@ -1501,10 +1506,12 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
break;
}
+ do_wrvifwhole = (val == IGMPMSG_WRVIFWHOLE);
val = !!val;
if (val != mrt->mroute_do_pim) {
mrt->mroute_do_pim = val;
mrt->mroute_do_assert = val;
+ mrt->mroute_do_wrvifwhole = do_wrvifwhole;
}
break;
case MRT_TABLE:
@@ -1982,6 +1989,9 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt,
MFC_ASSERT_THRESH)) {
c->_c.mfc_un.res.last_assert = jiffies;
ipmr_cache_report(mrt, skb, true_vifi, IGMPMSG_WRONGVIF);
+ if (mrt->mroute_do_wrvifwhole)
+ ipmr_cache_report(mrt, skb, true_vifi,
+ IGMPMSG_WRVIFWHOLE);
}
goto dont_forward;
}
@@ -2517,8 +2527,31 @@ errout_free:
static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
{
+ struct fib_dump_filter filter = {};
+ int err;
+
+ if (cb->strict_check) {
+ err = ip_valid_fib_dump_req(sock_net(skb->sk), cb->nlh,
+ &filter, cb);
+ if (err < 0)
+ return err;
+ }
+
+ if (filter.table_id) {
+ struct mr_table *mrt;
+
+ mrt = ipmr_get_table(sock_net(skb->sk), filter.table_id);
+ if (!mrt) {
+ NL_SET_ERR_MSG(cb->extack, "ipv4: MR table does not exist");
+ return -ENOENT;
+ }
+ err = mr_table_dump(mrt, skb, cb, _ipmr_fill_mroute,
+ &mfc_unres_lock, &filter);
+ return skb->len ? : err;
+ }
+
return mr_rtm_dumproute(skb, cb, ipmr_mr_table_iter,
- _ipmr_fill_mroute, &mfc_unres_lock);
+ _ipmr_fill_mroute, &mfc_unres_lock, &filter);
}
static const struct nla_policy rtm_ipmr_policy[RTA_MAX + 1] = {
@@ -2658,7 +2691,9 @@ static bool ipmr_fill_table(struct mr_table *mrt, struct sk_buff *skb)
mrt->mroute_reg_vif_num) ||
nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_ASSERT,
mrt->mroute_do_assert) ||
- nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_PIM, mrt->mroute_do_pim))
+ nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_PIM, mrt->mroute_do_pim) ||
+ nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_WRVIFWHOLE,
+ mrt->mroute_do_wrvifwhole))
return false;
return true;
@@ -2698,6 +2733,31 @@ static bool ipmr_fill_vif(struct mr_table *mrt, u32 vifid, struct sk_buff *skb)
return true;
}
+static int ipmr_valid_dumplink(const struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct ifinfomsg *ifm;
+
+ if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) {
+ NL_SET_ERR_MSG(extack, "ipv4: Invalid header for ipmr link dump");
+ return -EINVAL;
+ }
+
+ if (nlmsg_attrlen(nlh, sizeof(*ifm))) {
+ NL_SET_ERR_MSG(extack, "Invalid data after header in ipmr link dump");
+ return -EINVAL;
+ }
+
+ ifm = nlmsg_data(nlh);
+ if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags ||
+ ifm->ifi_change || ifm->ifi_index) {
+ NL_SET_ERR_MSG(extack, "Invalid values in header for ipmr link dump request");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int ipmr_rtm_dumplink(struct sk_buff *skb, struct netlink_callback *cb)
{
struct net *net = sock_net(skb->sk);
@@ -2706,6 +2766,13 @@ static int ipmr_rtm_dumplink(struct sk_buff *skb, struct netlink_callback *cb)
unsigned int e = 0, s_e;
struct mr_table *mrt;
+ if (cb->strict_check) {
+ int err = ipmr_valid_dumplink(cb->nlh, cb->extack);
+
+ if (err < 0)
+ return err;
+ }
+
s_t = cb->args[0];
s_e = cb->args[1];
diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c
index cafb0506c8c9..3e614cc824f7 100644
--- a/net/ipv4/ipmr_base.c
+++ b/net/ipv4/ipmr_base.c
@@ -2,6 +2,7 @@
* Common logic shared by IPv4 [ipmr] and IPv6 [ip6mr] implementation
*/
+#include <linux/rhashtable.h>
#include <linux/mroute_base.h>
/* Sets everything common except 'dev', since that is done under locking */
@@ -267,6 +268,81 @@ int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
}
EXPORT_SYMBOL(mr_fill_mroute);
+static bool mr_mfc_uses_dev(const struct mr_table *mrt,
+ const struct mr_mfc *c,
+ const struct net_device *dev)
+{
+ int ct;
+
+ for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
+ if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) {
+ const struct vif_device *vif;
+
+ vif = &mrt->vif_table[ct];
+ if (vif->dev == dev)
+ return true;
+ }
+ }
+ return false;
+}
+
+int mr_table_dump(struct mr_table *mrt, struct sk_buff *skb,
+ struct netlink_callback *cb,
+ int (*fill)(struct mr_table *mrt, struct sk_buff *skb,
+ u32 portid, u32 seq, struct mr_mfc *c,
+ int cmd, int flags),
+ spinlock_t *lock, struct fib_dump_filter *filter)
+{
+ unsigned int e = 0, s_e = cb->args[1];
+ unsigned int flags = NLM_F_MULTI;
+ struct mr_mfc *mfc;
+ int err;
+
+ if (filter->filter_set)
+ flags |= NLM_F_DUMP_FILTERED;
+
+ list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) {
+ if (e < s_e)
+ goto next_entry;
+ if (filter->dev &&
+ !mr_mfc_uses_dev(mrt, mfc, filter->dev))
+ goto next_entry;
+
+ err = fill(mrt, skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, mfc, RTM_NEWROUTE, flags);
+ if (err < 0)
+ goto out;
+next_entry:
+ e++;
+ }
+
+ spin_lock_bh(lock);
+ list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) {
+ if (e < s_e)
+ goto next_entry2;
+ if (filter->dev &&
+ !mr_mfc_uses_dev(mrt, mfc, filter->dev))
+ goto next_entry2;
+
+ err = fill(mrt, skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, mfc, RTM_NEWROUTE, flags);
+ if (err < 0) {
+ spin_unlock_bh(lock);
+ goto out;
+ }
+next_entry2:
+ e++;
+ }
+ spin_unlock_bh(lock);
+ err = 0;
+ e = 0;
+
+out:
+ cb->args[1] = e;
+ return err;
+}
+EXPORT_SYMBOL(mr_table_dump);
+
int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb,
struct mr_table *(*iter)(struct net *net,
struct mr_table *mrt),
@@ -274,53 +350,35 @@ int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb,
struct sk_buff *skb,
u32 portid, u32 seq, struct mr_mfc *c,
int cmd, int flags),
- spinlock_t *lock)
+ spinlock_t *lock, struct fib_dump_filter *filter)
{
- unsigned int t = 0, e = 0, s_t = cb->args[0], s_e = cb->args[1];
+ unsigned int t = 0, s_t = cb->args[0];
struct net *net = sock_net(skb->sk);
struct mr_table *mrt;
- struct mr_mfc *mfc;
+ int err;
+
+ /* multicast does not track protocol or have route type other
+ * than RTN_MULTICAST
+ */
+ if (filter->filter_set) {
+ if (filter->protocol || filter->flags ||
+ (filter->rt_type && filter->rt_type != RTN_MULTICAST))
+ return skb->len;
+ }
rcu_read_lock();
for (mrt = iter(net, NULL); mrt; mrt = iter(net, mrt)) {
if (t < s_t)
goto next_table;
- list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) {
- if (e < s_e)
- goto next_entry;
- if (fill(mrt, skb, NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, mfc,
- RTM_NEWROUTE, NLM_F_MULTI) < 0)
- goto done;
-next_entry:
- e++;
- }
- e = 0;
- s_e = 0;
-
- spin_lock_bh(lock);
- list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) {
- if (e < s_e)
- goto next_entry2;
- if (fill(mrt, skb, NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, mfc,
- RTM_NEWROUTE, NLM_F_MULTI) < 0) {
- spin_unlock_bh(lock);
- goto done;
- }
-next_entry2:
- e++;
- }
- spin_unlock_bh(lock);
- e = 0;
- s_e = 0;
+
+ err = mr_table_dump(mrt, skb, cb, fill, lock, filter);
+ if (err < 0)
+ break;
next_table:
t++;
}
-done:
rcu_read_unlock();
- cb->args[1] = e;
cb->args[0] = t;
return skb->len;
diff --git a/net/ipv4/metrics.c b/net/ipv4/metrics.c
index 04311f7067e2..6d218f5a2e71 100644
--- a/net/ipv4/metrics.c
+++ b/net/ipv4/metrics.c
@@ -5,8 +5,8 @@
#include <net/net_namespace.h>
#include <net/tcp.h>
-int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, int fc_mx_len,
- u32 *metrics)
+static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx,
+ int fc_mx_len, u32 *metrics)
{
bool ecn_ca = false;
struct nlattr *nla;
@@ -52,4 +52,28 @@ int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, int fc_mx_len,
return 0;
}
-EXPORT_SYMBOL_GPL(ip_metrics_convert);
+
+struct dst_metrics *ip_fib_metrics_init(struct net *net, struct nlattr *fc_mx,
+ int fc_mx_len)
+{
+ struct dst_metrics *fib_metrics;
+ int err;
+
+ if (!fc_mx)
+ return (struct dst_metrics *)&dst_default_metrics;
+
+ fib_metrics = kzalloc(sizeof(*fib_metrics), GFP_KERNEL);
+ if (unlikely(!fib_metrics))
+ return ERR_PTR(-ENOMEM);
+
+ err = ip_metrics_convert(net, fc_mx, fc_mx_len, fib_metrics->metrics);
+ if (!err) {
+ refcount_set(&fib_metrics->refcnt, 1);
+ } else {
+ kfree(fib_metrics);
+ fib_metrics = ERR_PTR(err);
+ }
+
+ return fib_metrics;
+}
+EXPORT_SYMBOL_GPL(ip_fib_metrics_init);
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index e6774ccb7731..8d2e5dc9a827 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -98,59 +98,6 @@ int nf_ip_reroute(struct sk_buff *skb, const struct nf_queue_entry *entry)
}
EXPORT_SYMBOL_GPL(nf_ip_reroute);
-__sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook,
- unsigned int dataoff, u_int8_t protocol)
-{
- const struct iphdr *iph = ip_hdr(skb);
- __sum16 csum = 0;
-
- switch (skb->ip_summed) {
- case CHECKSUM_COMPLETE:
- if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN)
- break;
- if ((protocol == 0 && !csum_fold(skb->csum)) ||
- !csum_tcpudp_magic(iph->saddr, iph->daddr,
- skb->len - dataoff, protocol,
- skb->csum)) {
- skb->ip_summed = CHECKSUM_UNNECESSARY;
- break;
- }
- /* fall through */
- case CHECKSUM_NONE:
- if (protocol == 0)
- skb->csum = 0;
- else
- skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
- skb->len - dataoff,
- protocol, 0);
- csum = __skb_checksum_complete(skb);
- }
- return csum;
-}
-EXPORT_SYMBOL(nf_ip_checksum);
-
-__sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook,
- unsigned int dataoff, unsigned int len,
- u_int8_t protocol)
-{
- const struct iphdr *iph = ip_hdr(skb);
- __sum16 csum = 0;
-
- switch (skb->ip_summed) {
- case CHECKSUM_COMPLETE:
- if (len == skb->len - dataoff)
- return nf_ip_checksum(skb, hook, dataoff, protocol);
- /* fall through */
- case CHECKSUM_NONE:
- skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, protocol,
- skb->len - dataoff, 0);
- skb->ip_summed = CHECKSUM_NONE;
- return __skb_checksum_complete_head(skb, dataoff + len);
- }
- return csum;
-}
-EXPORT_SYMBOL_GPL(nf_ip_checksum_partial);
-
int nf_ip_route(struct net *net, struct dst_entry **dst, struct flowi *fl,
bool strict __always_unused)
{
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index bbfc356cb1b5..184bf2e0a1ed 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -9,22 +9,6 @@ config NF_DEFRAG_IPV4
tristate
default n
-config NF_CONNTRACK_IPV4
- tristate "IPv4 connection tracking support (required for NAT)"
- depends on NF_CONNTRACK
- default m if NETFILTER_ADVANCED=n
- select NF_DEFRAG_IPV4
- ---help---
- Connection tracking keeps a record of what packets have passed
- through your machine, in order to figure out how they are related
- into connections.
-
- This is IPv4 support on Layer 3 independent connection tracking.
- Layer 3 independent connection tracking is experimental scheme
- which generalize ip_conntrack to support other layer 3 protocols.
-
- To compile it as a module, choose M here. If unsure, say N.
-
config NF_SOCKET_IPV4
tristate "IPv4 socket lookup support"
help
@@ -112,7 +96,7 @@ config NF_REJECT_IPV4
config NF_NAT_IPV4
tristate "IPv4 NAT"
- depends on NF_CONNTRACK_IPV4
+ depends on NF_CONNTRACK
default m if NETFILTER_ADVANCED=n
select NF_NAT
help
@@ -122,6 +106,10 @@ config NF_NAT_IPV4
if NF_NAT_IPV4
+config NF_NAT_MASQUERADE_IPV4
+ bool
+
+if NF_TABLES
config NFT_CHAIN_NAT_IPV4
depends on NF_TABLES_IPV4
tristate "IPv4 nf_tables nat chain support"
@@ -131,9 +119,6 @@ config NFT_CHAIN_NAT_IPV4
packet transformations such as the source, destination address and
source and destination ports.
-config NF_NAT_MASQUERADE_IPV4
- bool
-
config NFT_MASQ_IPV4
tristate "IPv4 masquerading support for nf_tables"
depends on NF_TABLES_IPV4
@@ -151,6 +136,7 @@ config NFT_REDIR_IPV4
help
This is the expression that provides IPv4 redirect support for
nf_tables.
+endif # NF_TABLES
config NF_NAT_SNMP_BASIC
tristate "Basic SNMP-ALG support"
@@ -279,7 +265,7 @@ config IP_NF_TARGET_SYNPROXY
# NAT + specific targets: nf_conntrack
config IP_NF_NAT
tristate "iptables NAT support"
- depends on NF_CONNTRACK_IPV4
+ depends on NF_CONNTRACK
default m if NETFILTER_ADVANCED=n
select NF_NAT
select NF_NAT_IPV4
@@ -340,7 +326,7 @@ config IP_NF_MANGLE
config IP_NF_TARGET_CLUSTERIP
tristate "CLUSTERIP target support"
depends on IP_NF_MANGLE
- depends on NF_CONNTRACK_IPV4
+ depends on NF_CONNTRACK
depends on NETFILTER_ADVANCED
select NF_CONNTRACK_MARK
select NETFILTER_FAMILY_ARP
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 8394c17c269f..367993adf4d3 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -3,12 +3,6 @@
# Makefile for the netfilter modules on top of IPv4.
#
-# objects for l3 independent conntrack
-nf_conntrack_ipv4-y := nf_conntrack_l3proto_ipv4.o nf_conntrack_proto_icmp.o
-
-# connection tracking
-obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o
-
nf_nat_ipv4-y := nf_nat_l3proto_ipv4.o nf_nat_proto_icmp.o
nf_nat_ipv4-$(CONFIG_NF_NAT_MASQUERADE_IPV4) += nf_nat_masquerade_ipv4.o
obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o
diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c
index 12843c9ef142..0b10d8812828 100644
--- a/net/ipv4/netfilter/ipt_rpfilter.c
+++ b/net/ipv4/netfilter/ipt_rpfilter.c
@@ -36,7 +36,6 @@ static bool rpfilter_lookup_reverse(struct net *net, struct flowi4 *fl4,
const struct net_device *dev, u8 flags)
{
struct fib_result res;
- bool dev_match;
int ret __maybe_unused;
if (fib_lookup(net, fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE))
@@ -46,21 +45,7 @@ static bool rpfilter_lookup_reverse(struct net *net, struct flowi4 *fl4,
if (res.type != RTN_LOCAL || !(flags & XT_RPFILTER_ACCEPT_LOCAL))
return false;
}
- dev_match = false;
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
- for (ret = 0; ret < res.fi->fib_nhs; ret++) {
- struct fib_nh *nh = &res.fi->fib_nh[ret];
-
- if (nh->nh_dev == dev) {
- dev_match = true;
- break;
- }
- }
-#else
- if (FIB_RES_DEV(res) == dev)
- dev_match = true;
-#endif
- return dev_match || flags & XT_RPFILTER_LOOSE;
+ return fib_info_nh_uses_dev(res.fi, dev) || flags & XT_RPFILTER_LOOSE;
}
static bool
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
deleted file mode 100644
index 9db988f9a4d7..000000000000
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ /dev/null
@@ -1,472 +0,0 @@
-
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/types.h>
-#include <linux/ip.h>
-#include <linux/netfilter.h>
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/icmp.h>
-#include <linux/sysctl.h>
-#include <net/route.h>
-#include <net/ip.h>
-
-#include <linux/netfilter_ipv4.h>
-#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_conntrack_helper.h>
-#include <net/netfilter/nf_conntrack_l4proto.h>
-#include <net/netfilter/nf_conntrack_l3proto.h>
-#include <net/netfilter/nf_conntrack_zones.h>
-#include <net/netfilter/nf_conntrack_core.h>
-#include <net/netfilter/nf_conntrack_seqadj.h>
-#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
-#include <net/netfilter/nf_nat_helper.h>
-#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
-#include <net/netfilter/nf_log.h>
-
-static int conntrack4_net_id __read_mostly;
-static DEFINE_MUTEX(register_ipv4_hooks);
-
-struct conntrack4_net {
- unsigned int users;
-};
-
-static bool ipv4_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,
- struct nf_conntrack_tuple *tuple)
-{
- const __be32 *ap;
- __be32 _addrs[2];
- ap = skb_header_pointer(skb, nhoff + offsetof(struct iphdr, saddr),
- sizeof(u_int32_t) * 2, _addrs);
- if (ap == NULL)
- return false;
-
- tuple->src.u3.ip = ap[0];
- tuple->dst.u3.ip = ap[1];
-
- return true;
-}
-
-static bool ipv4_invert_tuple(struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_tuple *orig)
-{
- tuple->src.u3.ip = orig->dst.u3.ip;
- tuple->dst.u3.ip = orig->src.u3.ip;
-
- return true;
-}
-
-static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
- unsigned int *dataoff, u_int8_t *protonum)
-{
- const struct iphdr *iph;
- struct iphdr _iph;
-
- iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
- if (iph == NULL)
- return -NF_ACCEPT;
-
- /* Conntrack defragments packets, we might still see fragments
- * inside ICMP packets though. */
- if (iph->frag_off & htons(IP_OFFSET))
- return -NF_ACCEPT;
-
- *dataoff = nhoff + (iph->ihl << 2);
- *protonum = iph->protocol;
-
- /* Check bogus IP headers */
- if (*dataoff > skb->len) {
- pr_debug("nf_conntrack_ipv4: bogus IPv4 packet: "
- "nhoff %u, ihl %u, skblen %u\n",
- nhoff, iph->ihl << 2, skb->len);
- return -NF_ACCEPT;
- }
-
- return NF_ACCEPT;
-}
-
-static unsigned int ipv4_helper(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- struct nf_conn *ct;
- enum ip_conntrack_info ctinfo;
- const struct nf_conn_help *help;
- const struct nf_conntrack_helper *helper;
-
- /* This is where we call the helper: as the packet goes out. */
- ct = nf_ct_get(skb, &ctinfo);
- if (!ct || ctinfo == IP_CT_RELATED_REPLY)
- return NF_ACCEPT;
-
- help = nfct_help(ct);
- if (!help)
- return NF_ACCEPT;
-
- /* rcu_read_lock()ed by nf_hook_thresh */
- helper = rcu_dereference(help->helper);
- if (!helper)
- return NF_ACCEPT;
-
- return helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb),
- ct, ctinfo);
-}
-
-static unsigned int ipv4_confirm(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- struct nf_conn *ct;
- enum ip_conntrack_info ctinfo;
-
- ct = nf_ct_get(skb, &ctinfo);
- if (!ct || ctinfo == IP_CT_RELATED_REPLY)
- goto out;
-
- /* adjust seqs for loopback traffic only in outgoing direction */
- if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
- !nf_is_loopback_packet(skb)) {
- if (!nf_ct_seq_adjust(skb, ct, ctinfo, ip_hdrlen(skb))) {
- NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
- return NF_DROP;
- }
- }
-out:
- /* We've seen it coming out the other side: confirm it */
- return nf_conntrack_confirm(skb);
-}
-
-static unsigned int ipv4_conntrack_in(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return nf_conntrack_in(state->net, PF_INET, state->hook, skb);
-}
-
-static unsigned int ipv4_conntrack_local(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- if (ip_is_fragment(ip_hdr(skb))) { /* IP_NODEFRAG setsockopt set */
- enum ip_conntrack_info ctinfo;
- struct nf_conn *tmpl;
-
- tmpl = nf_ct_get(skb, &ctinfo);
- if (tmpl && nf_ct_is_template(tmpl)) {
- /* when skipping ct, clear templates to avoid fooling
- * later targets/matches
- */
- skb->_nfct = 0;
- nf_ct_put(tmpl);
- }
- return NF_ACCEPT;
- }
-
- return nf_conntrack_in(state->net, PF_INET, state->hook, skb);
-}
-
-/* Connection tracking may drop packets, but never alters them, so
- make it the first hook. */
-static const struct nf_hook_ops ipv4_conntrack_ops[] = {
- {
- .hook = ipv4_conntrack_in,
- .pf = NFPROTO_IPV4,
- .hooknum = NF_INET_PRE_ROUTING,
- .priority = NF_IP_PRI_CONNTRACK,
- },
- {
- .hook = ipv4_conntrack_local,
- .pf = NFPROTO_IPV4,
- .hooknum = NF_INET_LOCAL_OUT,
- .priority = NF_IP_PRI_CONNTRACK,
- },
- {
- .hook = ipv4_helper,
- .pf = NFPROTO_IPV4,
- .hooknum = NF_INET_POST_ROUTING,
- .priority = NF_IP_PRI_CONNTRACK_HELPER,
- },
- {
- .hook = ipv4_confirm,
- .pf = NFPROTO_IPV4,
- .hooknum = NF_INET_POST_ROUTING,
- .priority = NF_IP_PRI_CONNTRACK_CONFIRM,
- },
- {
- .hook = ipv4_helper,
- .pf = NFPROTO_IPV4,
- .hooknum = NF_INET_LOCAL_IN,
- .priority = NF_IP_PRI_CONNTRACK_HELPER,
- },
- {
- .hook = ipv4_confirm,
- .pf = NFPROTO_IPV4,
- .hooknum = NF_INET_LOCAL_IN,
- .priority = NF_IP_PRI_CONNTRACK_CONFIRM,
- },
-};
-
-/* Fast function for those who don't want to parse /proc (and I don't
- blame them). */
-/* Reversing the socket's dst/src point of view gives us the reply
- mapping. */
-static int
-getorigdst(struct sock *sk, int optval, void __user *user, int *len)
-{
- const struct inet_sock *inet = inet_sk(sk);
- const struct nf_conntrack_tuple_hash *h;
- struct nf_conntrack_tuple tuple;
-
- memset(&tuple, 0, sizeof(tuple));
-
- lock_sock(sk);
- tuple.src.u3.ip = inet->inet_rcv_saddr;
- tuple.src.u.tcp.port = inet->inet_sport;
- tuple.dst.u3.ip = inet->inet_daddr;
- tuple.dst.u.tcp.port = inet->inet_dport;
- tuple.src.l3num = PF_INET;
- tuple.dst.protonum = sk->sk_protocol;
- release_sock(sk);
-
- /* We only do TCP and SCTP at the moment: is there a better way? */
- if (tuple.dst.protonum != IPPROTO_TCP &&
- tuple.dst.protonum != IPPROTO_SCTP) {
- pr_debug("SO_ORIGINAL_DST: Not a TCP/SCTP socket\n");
- return -ENOPROTOOPT;
- }
-
- if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
- pr_debug("SO_ORIGINAL_DST: len %d not %zu\n",
- *len, sizeof(struct sockaddr_in));
- return -EINVAL;
- }
-
- h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple);
- if (h) {
- struct sockaddr_in sin;
- struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
-
- sin.sin_family = AF_INET;
- sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
- .tuple.dst.u.tcp.port;
- sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
- .tuple.dst.u3.ip;
- memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
-
- pr_debug("SO_ORIGINAL_DST: %pI4 %u\n",
- &sin.sin_addr.s_addr, ntohs(sin.sin_port));
- nf_ct_put(ct);
- if (copy_to_user(user, &sin, sizeof(sin)) != 0)
- return -EFAULT;
- else
- return 0;
- }
- pr_debug("SO_ORIGINAL_DST: Can't find %pI4/%u-%pI4/%u.\n",
- &tuple.src.u3.ip, ntohs(tuple.src.u.tcp.port),
- &tuple.dst.u3.ip, ntohs(tuple.dst.u.tcp.port));
- return -ENOENT;
-}
-
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
-
-#include <linux/netfilter/nfnetlink.h>
-#include <linux/netfilter/nfnetlink_conntrack.h>
-
-static int ipv4_tuple_to_nlattr(struct sk_buff *skb,
- const struct nf_conntrack_tuple *tuple)
-{
- if (nla_put_in_addr(skb, CTA_IP_V4_SRC, tuple->src.u3.ip) ||
- nla_put_in_addr(skb, CTA_IP_V4_DST, tuple->dst.u3.ip))
- goto nla_put_failure;
- return 0;
-
-nla_put_failure:
- return -1;
-}
-
-static const struct nla_policy ipv4_nla_policy[CTA_IP_MAX+1] = {
- [CTA_IP_V4_SRC] = { .type = NLA_U32 },
- [CTA_IP_V4_DST] = { .type = NLA_U32 },
-};
-
-static int ipv4_nlattr_to_tuple(struct nlattr *tb[],
- struct nf_conntrack_tuple *t)
-{
- if (!tb[CTA_IP_V4_SRC] || !tb[CTA_IP_V4_DST])
- return -EINVAL;
-
- t->src.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_SRC]);
- t->dst.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_DST]);
-
- return 0;
-}
-#endif
-
-static struct nf_sockopt_ops so_getorigdst = {
- .pf = PF_INET,
- .get_optmin = SO_ORIGINAL_DST,
- .get_optmax = SO_ORIGINAL_DST+1,
- .get = getorigdst,
- .owner = THIS_MODULE,
-};
-
-static int ipv4_hooks_register(struct net *net)
-{
- struct conntrack4_net *cnet = net_generic(net, conntrack4_net_id);
- int err = 0;
-
- mutex_lock(&register_ipv4_hooks);
-
- cnet->users++;
- if (cnet->users > 1)
- goto out_unlock;
-
- err = nf_defrag_ipv4_enable(net);
- if (err) {
- cnet->users = 0;
- goto out_unlock;
- }
-
- err = nf_register_net_hooks(net, ipv4_conntrack_ops,
- ARRAY_SIZE(ipv4_conntrack_ops));
-
- if (err)
- cnet->users = 0;
- out_unlock:
- mutex_unlock(&register_ipv4_hooks);
- return err;
-}
-
-static void ipv4_hooks_unregister(struct net *net)
-{
- struct conntrack4_net *cnet = net_generic(net, conntrack4_net_id);
-
- mutex_lock(&register_ipv4_hooks);
- if (cnet->users && (--cnet->users == 0))
- nf_unregister_net_hooks(net, ipv4_conntrack_ops,
- ARRAY_SIZE(ipv4_conntrack_ops));
- mutex_unlock(&register_ipv4_hooks);
-}
-
-const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 = {
- .l3proto = PF_INET,
- .pkt_to_tuple = ipv4_pkt_to_tuple,
- .invert_tuple = ipv4_invert_tuple,
- .get_l4proto = ipv4_get_l4proto,
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
- .tuple_to_nlattr = ipv4_tuple_to_nlattr,
- .nlattr_to_tuple = ipv4_nlattr_to_tuple,
- .nla_policy = ipv4_nla_policy,
- .nla_size = NLA_ALIGN(NLA_HDRLEN + sizeof(u32)) + /* CTA_IP_V4_SRC */
- NLA_ALIGN(NLA_HDRLEN + sizeof(u32)), /* CTA_IP_V4_DST */
-#endif
- .net_ns_get = ipv4_hooks_register,
- .net_ns_put = ipv4_hooks_unregister,
- .me = THIS_MODULE,
-};
-
-module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
- &nf_conntrack_htable_size, 0600);
-
-MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET));
-MODULE_ALIAS("ip_conntrack");
-MODULE_LICENSE("GPL");
-
-static const struct nf_conntrack_l4proto * const builtin_l4proto4[] = {
- &nf_conntrack_l4proto_tcp4,
- &nf_conntrack_l4proto_udp4,
- &nf_conntrack_l4proto_icmp,
-#ifdef CONFIG_NF_CT_PROTO_DCCP
- &nf_conntrack_l4proto_dccp4,
-#endif
-#ifdef CONFIG_NF_CT_PROTO_SCTP
- &nf_conntrack_l4proto_sctp4,
-#endif
-#ifdef CONFIG_NF_CT_PROTO_UDPLITE
- &nf_conntrack_l4proto_udplite4,
-#endif
-};
-
-static int ipv4_net_init(struct net *net)
-{
- return nf_ct_l4proto_pernet_register(net, builtin_l4proto4,
- ARRAY_SIZE(builtin_l4proto4));
-}
-
-static void ipv4_net_exit(struct net *net)
-{
- nf_ct_l4proto_pernet_unregister(net, builtin_l4proto4,
- ARRAY_SIZE(builtin_l4proto4));
-}
-
-static struct pernet_operations ipv4_net_ops = {
- .init = ipv4_net_init,
- .exit = ipv4_net_exit,
- .id = &conntrack4_net_id,
- .size = sizeof(struct conntrack4_net),
-};
-
-static int __init nf_conntrack_l3proto_ipv4_init(void)
-{
- int ret = 0;
-
- need_conntrack();
-
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
- if (WARN_ON(nla_policy_len(ipv4_nla_policy, CTA_IP_MAX + 1) !=
- nf_conntrack_l3proto_ipv4.nla_size))
- return -EINVAL;
-#endif
- ret = nf_register_sockopt(&so_getorigdst);
- if (ret < 0) {
- pr_err("Unable to register netfilter socket option\n");
- return ret;
- }
-
- ret = register_pernet_subsys(&ipv4_net_ops);
- if (ret < 0) {
- pr_err("nf_conntrack_ipv4: can't register pernet ops\n");
- goto cleanup_sockopt;
- }
-
- ret = nf_ct_l4proto_register(builtin_l4proto4,
- ARRAY_SIZE(builtin_l4proto4));
- if (ret < 0)
- goto cleanup_pernet;
-
- ret = nf_ct_l3proto_register(&nf_conntrack_l3proto_ipv4);
- if (ret < 0) {
- pr_err("nf_conntrack_ipv4: can't register ipv4 proto.\n");
- goto cleanup_l4proto;
- }
-
- return ret;
-cleanup_l4proto:
- nf_ct_l4proto_unregister(builtin_l4proto4,
- ARRAY_SIZE(builtin_l4proto4));
- cleanup_pernet:
- unregister_pernet_subsys(&ipv4_net_ops);
- cleanup_sockopt:
- nf_unregister_sockopt(&so_getorigdst);
- return ret;
-}
-
-static void __exit nf_conntrack_l3proto_ipv4_fini(void)
-{
- synchronize_net();
- nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv4);
- nf_ct_l4proto_unregister(builtin_l4proto4,
- ARRAY_SIZE(builtin_l4proto4));
- unregister_pernet_subsys(&ipv4_net_ops);
- nf_unregister_sockopt(&so_getorigdst);
-}
-
-module_init(nf_conntrack_l3proto_ipv4_init);
-module_exit(nf_conntrack_l3proto_ipv4_fini);
diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c
index 4388de0e5380..1e6f28c97d3a 100644
--- a/net/ipv4/netfilter/nf_log_ipv4.c
+++ b/net/ipv4/netfilter/nf_log_ipv4.c
@@ -35,7 +35,7 @@ static const struct nf_loginfo default_loginfo = {
};
/* One level of recursion won't kill us */
-static void dump_ipv4_packet(struct nf_log_buf *m,
+static void dump_ipv4_packet(struct net *net, struct nf_log_buf *m,
const struct nf_loginfo *info,
const struct sk_buff *skb, unsigned int iphoff)
{
@@ -183,7 +183,7 @@ static void dump_ipv4_packet(struct nf_log_buf *m,
/* Max length: 3+maxlen */
if (!iphoff) { /* Only recurse once. */
nf_log_buf_add(m, "[");
- dump_ipv4_packet(m, info, skb,
+ dump_ipv4_packet(net, m, info, skb,
iphoff + ih->ihl*4+sizeof(_icmph));
nf_log_buf_add(m, "] ");
}
@@ -251,7 +251,7 @@ static void dump_ipv4_packet(struct nf_log_buf *m,
/* Max length: 15 "UID=4294967295 " */
if ((logflags & NF_LOG_UID) && !iphoff)
- nf_log_dump_sk_uid_gid(m, skb->sk);
+ nf_log_dump_sk_uid_gid(net, m, skb->sk);
/* Max length: 16 "MARK=0xFFFFFFFF " */
if (!iphoff && skb->mark)
@@ -333,7 +333,7 @@ static void nf_log_ip_packet(struct net *net, u_int8_t pf,
if (in != NULL)
dump_ipv4_mac_header(m, loginfo, skb);
- dump_ipv4_packet(m, loginfo, skb, 0);
+ dump_ipv4_packet(net, m, loginfo, skb, 0);
nf_log_buf_close(m);
}
diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
index 6115bf1ff6f0..78a67f961d86 100644
--- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
@@ -264,7 +264,6 @@ nf_nat_ipv4_fn(void *priv, struct sk_buff *skb,
return nf_nat_inet_fn(priv, skb, state);
}
-EXPORT_SYMBOL_GPL(nf_nat_ipv4_fn);
static unsigned int
nf_nat_ipv4_in(void *priv, struct sk_buff *skb,
diff --git a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
index ad3aeff152ed..a9d5e013e555 100644
--- a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
@@ -104,12 +104,26 @@ static int masq_device_event(struct notifier_block *this,
return NOTIFY_DONE;
}
+static int inet_cmp(struct nf_conn *ct, void *ptr)
+{
+ struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
+ struct net_device *dev = ifa->ifa_dev->dev;
+ struct nf_conntrack_tuple *tuple;
+
+ if (!device_cmp(ct, (void *)(long)dev->ifindex))
+ return 0;
+
+ tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+
+ return ifa->ifa_address == tuple->dst.u3.ip;
+}
+
static int masq_inet_event(struct notifier_block *this,
unsigned long event,
void *ptr)
{
struct in_device *idev = ((struct in_ifaddr *)ptr)->ifa_dev;
- struct netdev_notifier_info info;
+ struct net *net = dev_net(idev->dev);
/* The masq_dev_notifier will catch the case of the device going
* down. So if the inetdev is dead and being destroyed we have
@@ -119,8 +133,10 @@ static int masq_inet_event(struct notifier_block *this,
if (idev->dead)
return NOTIFY_DONE;
- netdev_notifier_info_init(&info, idev->dev);
- return masq_device_event(this, event, &info);
+ if (event == NETDEV_DOWN)
+ nf_ct_iterate_cleanup_net(net, inet_cmp, ptr, 0, 0);
+
+ return NOTIFY_DONE;
}
static struct notifier_block masq_dev_notifier = {
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic_main.c b/net/ipv4/netfilter/nf_nat_snmp_basic_main.c
index ac110c1d55b5..a0aa13bcabda 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic_main.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic_main.c
@@ -60,6 +60,7 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
MODULE_DESCRIPTION("Basic SNMP Application Layer Gateway");
MODULE_ALIAS("ip_nat_snmp_basic");
+MODULE_ALIAS_NFCT_HELPER("snmp_trap");
#define SNMP_PORT 161
#define SNMP_TRAP_PORT 162
diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c
index e50976e3c213..94eb25bc8d7e 100644
--- a/net/ipv4/netfilter/nft_fib_ipv4.c
+++ b/net/ipv4/netfilter/nft_fib_ipv4.c
@@ -76,10 +76,7 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs,
.flowi4_iif = LOOPBACK_IFINDEX,
};
const struct net_device *oif;
- struct net_device *found;
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
- int i;
-#endif
+ const struct net_device *found;
/*
* Do not set flowi4_oif, it restricts results (for example, asking
@@ -146,25 +143,13 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs,
if (!oif) {
found = FIB_RES_DEV(res);
- goto ok;
- }
-
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
- for (i = 0; i < res.fi->fib_nhs; i++) {
- struct fib_nh *nh = &res.fi->fib_nh[i];
+ } else {
+ if (!fib_info_nh_uses_dev(res.fi, oif))
+ return;
- if (nh->nh_dev == oif) {
- found = nh->nh_dev;
- goto ok;
- }
+ found = oif;
}
- return;
-#else
- found = FIB_RES_DEV(res);
- if (found != oif)
- return;
-#endif
-ok:
+
switch (priv->result) {
case NFT_FIB_RESULT_OIF:
*dest = found->ifindex;
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 2ed64bca54e3..7ccb5f87f70b 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -320,8 +320,7 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk,
if (addr->sin_addr.s_addr == htonl(INADDR_ANY))
chk_addr_ret = RTN_LOCAL;
- if ((net->ipv4.sysctl_ip_nonlocal_bind == 0 &&
- isk->freebind == 0 && isk->transparent == 0 &&
+ if ((!inet_can_nonlocal_bind(net, isk) &&
chk_addr_ret != RTN_LOCAL) ||
chk_addr_ret == RTN_MULTICAST ||
chk_addr_ret == RTN_BROADCAST)
@@ -361,8 +360,7 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk,
scoped);
rcu_read_unlock();
- if (!(net->ipv6.sysctl.ip_nonlocal_bind ||
- isk->freebind || isk->transparent || has_addr ||
+ if (!(ipv6_can_nonlocal_bind(net, isk) || has_addr ||
addr_type == IPV6_ADDR_ANY))
return -EADDRNOTAVAIL;
@@ -739,13 +737,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
/* no remote port */
}
- ipc.sockc.tsflags = sk->sk_tsflags;
- ipc.addr = inet->inet_saddr;
- ipc.opt = NULL;
- ipc.oif = sk->sk_bound_dev_if;
- ipc.tx_flags = 0;
- ipc.ttl = 0;
- ipc.tos = -1;
+ ipcm_init_sk(&ipc, inet);
if (msg->msg_controllen) {
err = ip_cmsg_send(sk, msg, &ipc, false);
@@ -769,8 +761,6 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
rcu_read_unlock();
}
- sock_tx_timestamp(sk, ipc.sockc.tsflags, &ipc.tx_flags);
-
saddr = ipc.addr;
ipc.addr = faddr = daddr;
@@ -789,7 +779,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
}
if (ipv4_is_multicast(daddr)) {
- if (!ipc.oif)
+ if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif))
ipc.oif = inet->mc_index;
if (!saddr)
saddr = inet->mc_addr;
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 77350c1256ce..70289682a670 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -119,6 +119,7 @@ static const struct snmp_mib snmp4_ipextstats_list[] = {
SNMP_MIB_ITEM("InECT1Pkts", IPSTATS_MIB_ECT1PKTS),
SNMP_MIB_ITEM("InECT0Pkts", IPSTATS_MIB_ECT0PKTS),
SNMP_MIB_ITEM("InCEPkts", IPSTATS_MIB_CEPKTS),
+ SNMP_MIB_ITEM("ReasmOverlaps", IPSTATS_MIB_REASM_OVERLAPS),
SNMP_MIB_SENTINEL
};
@@ -287,6 +288,8 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPDelivered", LINUX_MIB_TCPDELIVERED),
SNMP_MIB_ITEM("TCPDeliveredCE", LINUX_MIB_TCPDELIVEREDCE),
SNMP_MIB_ITEM("TCPAckCompressed", LINUX_MIB_TCPACKCOMPRESSED),
+ SNMP_MIB_ITEM("TCPZeroWindowDrop", LINUX_MIB_TCPZEROWINDOWDROP),
+ SNMP_MIB_ITEM("TCPRcvQDrop", LINUX_MIB_TCPRCVQDROP),
SNMP_MIB_SENTINEL
};
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index abb3c9490c55..8ca3eb06ba04 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -381,6 +381,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
skb->priority = sk->sk_priority;
skb->mark = sk->sk_mark;
+ skb->tstamp = sockc->transmit_time;
skb_dst_set(skb, &rt->dst);
*rtp = NULL;
@@ -561,13 +562,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
daddr = inet->inet_daddr;
}
- ipc.sockc.tsflags = sk->sk_tsflags;
- ipc.addr = inet->inet_saddr;
- ipc.opt = NULL;
- ipc.tx_flags = 0;
- ipc.ttl = 0;
- ipc.tos = -1;
- ipc.oif = sk->sk_bound_dev_if;
+ ipcm_init_sk(&ipc, inet);
if (msg->msg_controllen) {
err = ip_cmsg_send(sk, msg, &ipc, false);
@@ -613,7 +608,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
tos |= RTO_ONLINK;
if (ipv4_is_multicast(daddr)) {
- if (!ipc.oif)
+ if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif))
ipc.oif = inet->mc_index;
if (!saddr)
saddr = inet->mc_addr;
@@ -670,8 +665,6 @@ back_from_confirm:
&rt, msg->msg_flags, &ipc.sockc);
else {
- sock_tx_timestamp(sk, ipc.sockc.tsflags, &ipc.tx_flags);
-
if (!ipc.addr)
ipc.addr = fl4.daddr;
lock_sock(sk);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 1df6e97106d7..c0a9d26c06ce 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1001,21 +1001,22 @@ out: kfree_skb(skb);
static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
struct dst_entry *dst = &rt->dst;
+ u32 old_mtu = ipv4_mtu(dst);
struct fib_result res;
bool lock = false;
if (ip_mtu_locked(dst))
return;
- if (ipv4_mtu(dst) < mtu)
+ if (old_mtu < mtu)
return;
if (mtu < ip_rt_min_pmtu) {
lock = true;
- mtu = ip_rt_min_pmtu;
+ mtu = min(old_mtu, ip_rt_min_pmtu);
}
- if (rt->rt_pmtu == mtu &&
+ if (rt->rt_pmtu == mtu && !lock &&
time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
return;
@@ -1040,17 +1041,15 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
}
void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
- int oif, u32 mark, u8 protocol, int flow_flags)
+ int oif, u8 protocol)
{
const struct iphdr *iph = (const struct iphdr *) skb->data;
struct flowi4 fl4;
struct rtable *rt;
-
- if (!mark)
- mark = IP4_REPLY_MARK(net, skb->mark);
+ u32 mark = IP4_REPLY_MARK(net, skb->mark);
__build_flow_key(net, &fl4, NULL, iph, oif,
- RT_TOS(iph->tos), protocol, mark, flow_flags);
+ RT_TOS(iph->tos), protocol, mark, 0);
rt = __ip_route_output_key(net, &fl4);
if (!IS_ERR(rt)) {
__ip_rt_update_pmtu(rt, &fl4, mtu);
@@ -1132,14 +1131,14 @@ out:
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
void ipv4_redirect(struct sk_buff *skb, struct net *net,
- int oif, u32 mark, u8 protocol, int flow_flags)
+ int oif, u8 protocol)
{
const struct iphdr *iph = (const struct iphdr *) skb->data;
struct flowi4 fl4;
struct rtable *rt;
__build_flow_key(net, &fl4, NULL, iph, oif,
- RT_TOS(iph->tos), protocol, mark, flow_flags);
+ RT_TOS(iph->tos), protocol, 0, 0);
rt = __ip_route_output_key(net, &fl4);
if (!IS_ERR(rt)) {
__ip_do_redirect(rt, skb, &fl4, false);
@@ -1219,18 +1218,15 @@ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
src = ip_hdr(skb)->saddr;
else {
struct fib_result res;
- struct flowi4 fl4;
- struct iphdr *iph;
-
- iph = ip_hdr(skb);
-
- memset(&fl4, 0, sizeof(fl4));
- fl4.daddr = iph->daddr;
- fl4.saddr = iph->saddr;
- fl4.flowi4_tos = RT_TOS(iph->tos);
- fl4.flowi4_oif = rt->dst.dev->ifindex;
- fl4.flowi4_iif = skb->dev->ifindex;
- fl4.flowi4_mark = skb->mark;
+ struct iphdr *iph = ip_hdr(skb);
+ struct flowi4 fl4 = {
+ .daddr = iph->daddr,
+ .saddr = iph->saddr,
+ .flowi4_tos = RT_TOS(iph->tos),
+ .flowi4_oif = rt->dst.dev->ifindex,
+ .flowi4_iif = skb->dev->ifindex,
+ .flowi4_mark = skb->mark,
+ };
rcu_read_lock();
if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
@@ -1481,12 +1477,9 @@ void rt_del_uncached_list(struct rtable *rt)
static void ipv4_dst_destroy(struct dst_entry *dst)
{
- struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
struct rtable *rt = (struct rtable *)dst;
- if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
- kfree(p);
-
+ ip_dst_metrics_put(dst);
rt_del_uncached_list(rt);
}
@@ -1533,11 +1526,8 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
rt->rt_gateway = nh->nh_gw;
rt->rt_uses_gateway = 1;
}
- dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
- if (fi->fib_metrics != &dst_default_metrics) {
- rt->dst._metrics |= DST_METRICS_REFCOUNTED;
- refcount_inc(&fi->fib_metrics->refcnt);
- }
+ ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
+
#ifdef CONFIG_IP_ROUTE_CLASSID
rt->dst.tclassid = nh->nh_tclassid;
#endif
@@ -1996,8 +1986,11 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
goto no_route;
}
- if (res->type == RTN_BROADCAST)
+ if (res->type == RTN_BROADCAST) {
+ if (IN_DEV_BFORWARD(in_dev))
+ goto make_route;
goto brd_input;
+ }
if (res->type == RTN_LOCAL) {
err = fib_validate_source(skb, saddr, daddr, tos,
@@ -2014,6 +2007,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
if (res->type != RTN_UNICAST)
goto martian_destination;
+make_route:
err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
out: return err;
@@ -2781,7 +2775,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
struct rtable *rt = NULL;
struct sk_buff *skb;
struct rtmsg *rtm;
- struct flowi4 fl4;
+ struct flowi4 fl4 = {};
__be32 dst = 0;
__be32 src = 0;
kuid_t uid;
@@ -2821,7 +2815,6 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
if (!skb)
return -ENOBUFS;
- memset(&fl4, 0, sizeof(fl4));
fl4.daddr = dst;
fl4.saddr = src;
fl4.flowi4_tos = rtm->rtm_tos;
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index c3387dfd725b..606f868d9f3f 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -88,7 +88,7 @@ u64 cookie_init_timestamp(struct request_sock *req)
ts <<= TSBITS;
ts |= options;
}
- return (u64)ts * (USEC_PER_SEC / TCP_TS_HZ);
+ return (u64)ts * (NSEC_PER_SEC / TCP_TS_HZ);
}
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 5fa335fd3852..891ed2f91467 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -48,6 +48,7 @@ static int tcp_syn_retries_max = MAX_TCP_SYNCNT;
static int ip_ping_group_range_min[] = { 0, 0 };
static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
static int comp_sack_nr_max = 255;
+static u32 u32_max_div_HZ = UINT_MAX / HZ;
/* obsolete */
static int sysctl_tcp_low_latency __read_mostly;
@@ -201,6 +202,23 @@ static int ipv4_ping_group_range(struct ctl_table *table, int write,
return ret;
}
+static int ipv4_fwd_update_priority(struct ctl_table *table, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ struct net *net;
+ int ret;
+
+ net = container_of(table->data, struct net,
+ ipv4.sysctl_ip_fwd_update_priority);
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ if (write && ret == 0)
+ call_netevent_notifiers(NETEVENT_IPV4_FWD_UPDATE_PRIORITY_UPDATE,
+ net);
+
+ return ret;
+}
+
static int proc_tcp_congestion_control(struct ctl_table *ctl, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
@@ -664,6 +682,15 @@ static struct ctl_table ipv4_net_table[] = {
.proc_handler = proc_dointvec,
},
{
+ .procname = "ip_forward_update_priority",
+ .data = &init_net.ipv4.sysctl_ip_fwd_update_priority,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = ipv4_fwd_update_priority,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+ {
.procname = "ip_nonlocal_bind",
.data = &init_net.ipv4.sysctl_ip_nonlocal_bind,
.maxlen = sizeof(int),
@@ -719,9 +746,10 @@ static struct ctl_table ipv4_net_table[] = {
{
.procname = "tcp_probe_interval",
.data = &init_net.ipv4.sysctl_tcp_probe_interval,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u32),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_douintvec_minmax,
+ .extra2 = &u32_max_div_HZ,
},
{
.procname = "igmp_link_local_mcast_reports",
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 4491faf83f4f..1834818ed07b 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -507,7 +507,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
const struct tcp_sock *tp = tcp_sk(sk);
int state;
- sock_poll_wait(file, sk_sleep(sk), wait);
+ sock_poll_wait(file, sock, wait);
state = inet_sk_state_load(sk);
if (state == TCP_LISTEN)
@@ -817,8 +817,7 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
* This occurs when user tries to read
* from never connected socket.
*/
- if (!sock_flag(sk, SOCK_DONE))
- ret = -ENOTCONN;
+ ret = -ENOTCONN;
break;
}
if (!timeo) {
@@ -1186,7 +1185,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
flags = msg->msg_flags;
- if (flags & MSG_ZEROCOPY && size) {
+ if (flags & MSG_ZEROCOPY && size && sock_flag(sk, SOCK_ZEROCOPY)) {
if (sk->sk_state != TCP_ESTABLISHED) {
err = -EINVAL;
goto out_err;
@@ -1241,7 +1240,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
/* 'common' sending to sendq */
}
- sockc.tsflags = sk->sk_tsflags;
+ sockcm_init(&sockc, sk);
if (msg->msg_controllen) {
err = sock_cmsg_send(sk, msg, &sockc);
if (unlikely(err)) {
@@ -1275,9 +1274,6 @@ restart:
int linear;
new_segment:
- /* Allocate new segment. If the interface is SG,
- * allocate skb fitting to single page.
- */
if (!sk_stream_memory_free(sk))
goto wait_for_sndbuf;
@@ -1299,7 +1295,7 @@ new_segment:
copy = size_goal;
/* All packets are restored as if they have
- * already been sent. skb_mstamp isn't set to
+ * already been sent. skb_mstamp_ns isn't set to
* avoid wrong rtt estimation.
*/
if (tp->repair)
@@ -1757,6 +1753,7 @@ static int tcp_zerocopy_receive(struct sock *sk,
struct vm_area_struct *vma;
struct sk_buff *skb = NULL;
struct tcp_sock *tp;
+ int inq;
int ret;
if (address & (PAGE_SIZE - 1) || address != zc->address)
@@ -1777,12 +1774,15 @@ static int tcp_zerocopy_receive(struct sock *sk,
tp = tcp_sk(sk);
seq = tp->copied_seq;
- zc->length = min_t(u32, zc->length, tcp_inq(sk));
+ inq = tcp_inq(sk);
+ zc->length = min_t(u32, zc->length, inq);
zc->length &= ~(PAGE_SIZE - 1);
-
- zap_page_range(vma, address, zc->length);
-
- zc->recv_skip_hint = 0;
+ if (zc->length) {
+ zap_page_range(vma, address, zc->length);
+ zc->recv_skip_hint = 0;
+ } else {
+ zc->recv_skip_hint = inq;
+ }
ret = 0;
while (length + PAGE_SIZE <= zc->length) {
if (zc->recv_skip_hint < PAGE_SIZE) {
@@ -1805,8 +1805,17 @@ static int tcp_zerocopy_receive(struct sock *sk,
frags++;
}
}
- if (frags->size != PAGE_SIZE || frags->page_offset)
+ if (frags->size != PAGE_SIZE || frags->page_offset) {
+ int remaining = zc->recv_skip_hint;
+
+ while (remaining && (frags->size != PAGE_SIZE ||
+ frags->page_offset)) {
+ remaining -= frags->size;
+ frags++;
+ }
+ zc->recv_skip_hint -= remaining;
break;
+ }
ret = vm_insert_page(vma, address + length,
skb_frag_page(frags));
if (ret)
@@ -2042,13 +2051,10 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
break;
if (sk->sk_state == TCP_CLOSE) {
- if (!sock_flag(sk, SOCK_DONE)) {
- /* This occurs when user tries to read
- * from never connected socket.
- */
- copied = -ENOTCONN;
- break;
- }
+ /* This occurs when user tries to read
+ * from never connected socket.
+ */
+ copied = -ENOTCONN;
break;
}
@@ -2410,16 +2416,10 @@ adjudge_to_death:
sock_hold(sk);
sock_orphan(sk);
- /* It is the last release_sock in its life. It will remove backlog. */
- release_sock(sk);
-
-
- /* Now socket is owned by kernel and we acquire BH lock
- * to finish close. No need to check for user refs.
- */
local_bh_disable();
bh_lock_sock(sk);
- WARN_ON(sock_owned_by_user(sk));
+ /* remove backlog if any, without releasing ownership. */
+ __release_sock(sk);
percpu_counter_inc(sk->sk_prot->orphan_count);
@@ -2488,6 +2488,7 @@ adjudge_to_death:
out:
bh_unlock_sock(sk);
local_bh_enable();
+ release_sock(sk);
sock_put(sk);
}
EXPORT_SYMBOL(tcp_close);
@@ -2538,7 +2539,6 @@ int tcp_disconnect(struct sock *sk, int flags)
struct inet_sock *inet = inet_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
- int err = 0;
int old_state = sk->sk_state;
if (old_state != TCP_CLOSE)
@@ -2576,6 +2576,7 @@ int tcp_disconnect(struct sock *sk, int flags)
sk->sk_shutdown = 0;
sock_reset_flag(sk, SOCK_DONE);
tp->srtt_us = 0;
+ tp->rcv_rtt_last_tsecr = 0;
tp->write_seq += tp->max_window + 2;
if (tp->write_seq == 0)
tp->write_seq = 1;
@@ -2600,6 +2601,12 @@ int tcp_disconnect(struct sock *sk, int flags)
sk->sk_rx_dst = NULL;
tcp_saved_syn_free(tp);
tp->compressed_ack = 0;
+ tp->bytes_sent = 0;
+ tp->bytes_retrans = 0;
+ tp->duplicate_sack[0].start_seq = 0;
+ tp->duplicate_sack[0].end_seq = 0;
+ tp->dsack_dups = 0;
+ tp->reord_seen = 0;
/* Clean up fastopen related fields */
tcp_free_fastopen_req(tp);
@@ -2614,7 +2621,7 @@ int tcp_disconnect(struct sock *sk, int flags)
}
sk->sk_error_report(sk);
- return err;
+ return 0;
}
EXPORT_SYMBOL(tcp_disconnect);
@@ -2995,7 +3002,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
if (val < 0)
err = -EINVAL;
else
- icsk->icsk_user_timeout = msecs_to_jiffies(val);
+ icsk->icsk_user_timeout = val;
break;
case TCP_FASTOPEN:
@@ -3104,10 +3111,10 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
{
const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */
const struct inet_connection_sock *icsk = inet_csk(sk);
+ unsigned long rate;
u32 now;
u64 rate64;
bool slow;
- u32 rate;
memset(info, 0, sizeof(*info));
if (sk->sk_type != SOCK_STREAM)
@@ -3117,11 +3124,11 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
/* Report meaningful fields for all TCP states, including listeners */
rate = READ_ONCE(sk->sk_pacing_rate);
- rate64 = rate != ~0U ? rate : ~0ULL;
+ rate64 = (rate != ~0UL) ? rate : ~0ULL;
info->tcpi_pacing_rate = rate64;
rate = READ_ONCE(sk->sk_max_pacing_rate);
- rate64 = rate != ~0U ? rate : ~0ULL;
+ rate64 = (rate != ~0UL) ? rate : ~0ULL;
info->tcpi_max_pacing_rate = rate64;
info->tcpi_reordering = tp->reordering;
@@ -3207,21 +3214,50 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_delivery_rate = rate64;
info->tcpi_delivered = tp->delivered;
info->tcpi_delivered_ce = tp->delivered_ce;
+ info->tcpi_bytes_sent = tp->bytes_sent;
+ info->tcpi_bytes_retrans = tp->bytes_retrans;
+ info->tcpi_dsack_dups = tp->dsack_dups;
+ info->tcpi_reord_seen = tp->reord_seen;
unlock_sock_fast(sk, slow);
}
EXPORT_SYMBOL_GPL(tcp_get_info);
+static size_t tcp_opt_stats_get_size(void)
+{
+ return
+ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BUSY */
+ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_RWND_LIMITED */
+ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_SNDBUF_LIMITED */
+ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_DATA_SEGS_OUT */
+ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_TOTAL_RETRANS */
+ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_PACING_RATE */
+ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_DELIVERY_RATE */
+ nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_CWND */
+ nla_total_size(sizeof(u32)) + /* TCP_NLA_REORDERING */
+ nla_total_size(sizeof(u32)) + /* TCP_NLA_MIN_RTT */
+ nla_total_size(sizeof(u8)) + /* TCP_NLA_RECUR_RETRANS */
+ nla_total_size(sizeof(u8)) + /* TCP_NLA_DELIVERY_RATE_APP_LMT */
+ nla_total_size(sizeof(u32)) + /* TCP_NLA_SNDQ_SIZE */
+ nla_total_size(sizeof(u8)) + /* TCP_NLA_CA_STATE */
+ nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_SSTHRESH */
+ nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED */
+ nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED_CE */
+ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_SENT */
+ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_RETRANS */
+ nla_total_size(sizeof(u32)) + /* TCP_NLA_DSACK_DUPS */
+ nla_total_size(sizeof(u32)) + /* TCP_NLA_REORD_SEEN */
+ 0;
+}
+
struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *stats;
struct tcp_info info;
+ unsigned long rate;
u64 rate64;
- u32 rate;
- stats = alloc_skb(7 * nla_total_size_64bit(sizeof(u64)) +
- 7 * nla_total_size(sizeof(u32)) +
- 3 * nla_total_size(sizeof(u8)), GFP_ATOMIC);
+ stats = alloc_skb(tcp_opt_stats_get_size(), GFP_ATOMIC);
if (!stats)
return NULL;
@@ -3238,7 +3274,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
tp->total_retrans, TCP_NLA_PAD);
rate = READ_ONCE(sk->sk_pacing_rate);
- rate64 = rate != ~0U ? rate : ~0ULL;
+ rate64 = (rate != ~0UL) ? rate : ~0ULL;
nla_put_u64_64bit(stats, TCP_NLA_PACING_RATE, rate64, TCP_NLA_PAD);
rate64 = tcp_compute_delivery_rate(tp);
@@ -3257,6 +3293,13 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una);
nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state);
+ nla_put_u64_64bit(stats, TCP_NLA_BYTES_SENT, tp->bytes_sent,
+ TCP_NLA_PAD);
+ nla_put_u64_64bit(stats, TCP_NLA_BYTES_RETRANS, tp->bytes_retrans,
+ TCP_NLA_PAD);
+ nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups);
+ nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen);
+
return stats;
}
@@ -3451,7 +3494,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
break;
case TCP_USER_TIMEOUT:
- val = jiffies_to_msecs(icsk->icsk_user_timeout);
+ val = icsk->icsk_user_timeout;
break;
case TCP_FASTOPEN:
@@ -3861,8 +3904,8 @@ void __init tcp_init(void)
init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
init_net.ipv4.sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
- init_net.ipv4.sysctl_tcp_rmem[1] = 87380;
- init_net.ipv4.sysctl_tcp_rmem[2] = max(87380, max_rshare);
+ init_net.ipv4.sysctl_tcp_rmem[1] = 131072;
+ init_net.ipv4.sysctl_tcp_rmem[2] = max(131072, max_rshare);
pr_info("Hash tables configured (established %u bind %u)\n",
tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index 4bfff3c87e8e..9277abdd822a 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -95,11 +95,10 @@ struct bbr {
u32 mode:3, /* current bbr_mode in state machine */
prev_ca_state:3, /* CA state on previous ACK */
packet_conservation:1, /* use packet conservation? */
- restore_cwnd:1, /* decided to revert cwnd to old value */
round_start:1, /* start of packet-timed tx->ack round? */
idle_restart:1, /* restarting after idle? */
probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */
- unused:12,
+ unused:13,
lt_is_sampling:1, /* taking long-term ("LT") samples now? */
lt_rtt_cnt:7, /* round trips in long-term interval */
lt_use_bw:1; /* use lt_bw as our bw estimate? */
@@ -129,6 +128,9 @@ static const u32 bbr_probe_rtt_mode_ms = 200;
/* Skip TSO below the following bandwidth (bits/sec): */
static const int bbr_min_tso_rate = 1200000;
+/* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. */
+static const int bbr_pacing_margin_percent = 1;
+
/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain
* that will allow a smoothly increasing pacing rate that will double each RTT
* and send the same number of packets per RTT that an un-paced, slow-starting
@@ -175,6 +177,8 @@ static const u32 bbr_lt_bw_diff = 4000 / 8;
/* If we estimate we're policed, use lt_bw for this many round trips: */
static const u32 bbr_lt_bw_max_rtts = 48;
+static void bbr_check_probe_rtt_done(struct sock *sk);
+
/* Do we estimate that STARTUP filled the pipe? */
static bool bbr_full_bw_reached(const struct sock *sk)
{
@@ -205,15 +209,17 @@ static u32 bbr_bw(const struct sock *sk)
*/
static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain)
{
- rate *= tcp_mss_to_mtu(sk, tcp_sk(sk)->mss_cache);
+ unsigned int mss = tcp_sk(sk)->mss_cache;
+
+ rate *= mss;
rate *= gain;
rate >>= BBR_SCALE;
- rate *= USEC_PER_SEC;
+ rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_margin_percent);
return rate >> BW_SCALE;
}
/* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */
-static u32 bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain)
+static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain)
{
u64 rate = bw;
@@ -252,7 +258,7 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain)
{
struct tcp_sock *tp = tcp_sk(sk);
struct bbr *bbr = inet_csk_ca(sk);
- u32 rate = bbr_bw_to_pacing_rate(sk, bw, gain);
+ unsigned long rate = bbr_bw_to_pacing_rate(sk, bw, gain);
if (unlikely(!bbr->has_seen_rtt && tp->srtt_us))
bbr_init_pacing_rate_from_rtt(sk);
@@ -274,7 +280,7 @@ static u32 bbr_tso_segs_goal(struct sock *sk)
/* Sort of tcp_tso_autosize() but ignoring
* driver provided sk_gso_max_size.
*/
- bytes = min_t(u32, sk->sk_pacing_rate >> sk->sk_pacing_shift,
+ bytes = min_t(unsigned long, sk->sk_pacing_rate >> sk->sk_pacing_shift,
GSO_MAX_SIZE - 1 - MAX_TCP_HEADER);
segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk));
@@ -305,6 +311,8 @@ static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event)
*/
if (bbr->mode == BBR_PROBE_BW)
bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT);
+ else if (bbr->mode == BBR_PROBE_RTT)
+ bbr_check_probe_rtt_done(sk);
}
}
@@ -361,6 +369,39 @@ static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain)
return cwnd;
}
+/* With pacing at lower layers, there's often less data "in the network" than
+ * "in flight". With TSQ and departure time pacing at lower layers (e.g. fq),
+ * we often have several skbs queued in the pacing layer with a pre-scheduled
+ * earliest departure time (EDT). BBR adapts its pacing rate based on the
+ * inflight level that it estimates has already been "baked in" by previous
+ * departure time decisions. We calculate a rough estimate of the number of our
+ * packets that might be in the network at the earliest departure time for the
+ * next skb scheduled:
+ * in_network_at_edt = inflight_at_edt - (EDT - now) * bw
+ * If we're increasing inflight, then we want to know if the transmit of the
+ * EDT skb will push inflight above the target, so inflight_at_edt includes
+ * bbr_tso_segs_goal() from the skb departing at EDT. If decreasing inflight,
+ * then estimate if inflight will sink too low just before the EDT transmit.
+ */
+static u32 bbr_packets_in_net_at_edt(struct sock *sk, u32 inflight_now)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ u64 now_ns, edt_ns, interval_us;
+ u32 interval_delivered, inflight_at_edt;
+
+ now_ns = tp->tcp_clock_cache;
+ edt_ns = max(tp->tcp_wstamp_ns, now_ns);
+ interval_us = div_u64(edt_ns - now_ns, NSEC_PER_USEC);
+ interval_delivered = (u64)bbr_bw(sk) * interval_us >> BW_SCALE;
+ inflight_at_edt = inflight_now;
+ if (bbr->pacing_gain > BBR_UNIT) /* increasing inflight */
+ inflight_at_edt += bbr_tso_segs_goal(sk); /* include EDT skb */
+ if (interval_delivered >= inflight_at_edt)
+ return 0;
+ return inflight_at_edt - interval_delivered;
+}
+
/* An optimization in BBR to reduce losses: On the first round of recovery, we
* follow the packet conservation principle: send P packets per P packets acked.
* After that, we slow-start and send at most 2*P packets per P packets acked.
@@ -392,17 +433,11 @@ static bool bbr_set_cwnd_to_recover_or_restore(
cwnd = tcp_packets_in_flight(tp) + acked;
} else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) {
/* Exiting loss recovery; restore cwnd saved before recovery. */
- bbr->restore_cwnd = 1;
+ cwnd = max(cwnd, bbr->prior_cwnd);
bbr->packet_conservation = 0;
}
bbr->prev_ca_state = state;
- if (bbr->restore_cwnd) {
- /* Restore cwnd after exiting loss recovery or PROBE_RTT. */
- cwnd = max(cwnd, bbr->prior_cwnd);
- bbr->restore_cwnd = 0;
- }
-
if (bbr->packet_conservation) {
*new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked);
return true; /* yes, using packet conservation */
@@ -419,10 +454,10 @@ static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs,
{
struct tcp_sock *tp = tcp_sk(sk);
struct bbr *bbr = inet_csk_ca(sk);
- u32 cwnd = 0, target_cwnd = 0;
+ u32 cwnd = tp->snd_cwnd, target_cwnd = 0;
if (!acked)
- return;
+ goto done; /* no packet fully ACKed; just apply caps */
if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd))
goto done;
@@ -458,7 +493,7 @@ static bool bbr_is_next_cycle_phase(struct sock *sk,
if (bbr->pacing_gain == BBR_UNIT)
return is_full_length; /* just use wall clock time */
- inflight = rs->prior_in_flight; /* what was in-flight before ACK? */
+ inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight);
bw = bbr_max_bw(sk);
/* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at
@@ -486,8 +521,6 @@ static void bbr_advance_cycle_phase(struct sock *sk)
bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1);
bbr->cycle_mstamp = tp->delivered_mstamp;
- bbr->pacing_gain = bbr->lt_use_bw ? BBR_UNIT :
- bbr_pacing_gain[bbr->cycle_idx];
}
/* Gain cycling: cycle pacing gain to converge to fair share of available bw. */
@@ -505,8 +538,6 @@ static void bbr_reset_startup_mode(struct sock *sk)
struct bbr *bbr = inet_csk_ca(sk);
bbr->mode = BBR_STARTUP;
- bbr->pacing_gain = bbr_high_gain;
- bbr->cwnd_gain = bbr_high_gain;
}
static void bbr_reset_probe_bw_mode(struct sock *sk)
@@ -514,8 +545,6 @@ static void bbr_reset_probe_bw_mode(struct sock *sk)
struct bbr *bbr = inet_csk_ca(sk);
bbr->mode = BBR_PROBE_BW;
- bbr->pacing_gain = BBR_UNIT;
- bbr->cwnd_gain = bbr_cwnd_gain;
bbr->cycle_idx = CYCLE_LEN - 1 - prandom_u32_max(bbr_cycle_rand);
bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */
}
@@ -733,17 +762,29 @@ static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs)
if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) {
bbr->mode = BBR_DRAIN; /* drain queue we created */
- bbr->pacing_gain = bbr_drain_gain; /* pace slow to drain */
- bbr->cwnd_gain = bbr_high_gain; /* maintain cwnd */
tcp_sk(sk)->snd_ssthresh =
bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT);
} /* fall through to check if in-flight is already small: */
if (bbr->mode == BBR_DRAIN &&
- tcp_packets_in_flight(tcp_sk(sk)) <=
+ bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <=
bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT))
bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */
}
+static void bbr_check_probe_rtt_done(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ if (!(bbr->probe_rtt_done_stamp &&
+ after(tcp_jiffies32, bbr->probe_rtt_done_stamp)))
+ return;
+
+ bbr->min_rtt_stamp = tcp_jiffies32; /* wait a while until PROBE_RTT */
+ tp->snd_cwnd = max(tp->snd_cwnd, bbr->prior_cwnd);
+ bbr_reset_mode(sk);
+}
+
/* The goal of PROBE_RTT mode is to have BBR flows cooperatively and
* periodically drain the bottleneck queue, to converge to measure the true
* min_rtt (unloaded propagation delay). This allows the flows to keep queues
@@ -782,8 +823,6 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs)
if (bbr_probe_rtt_mode_ms > 0 && filter_expired &&
!bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) {
bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */
- bbr->pacing_gain = BBR_UNIT;
- bbr->cwnd_gain = BBR_UNIT;
bbr_save_cwnd(sk); /* note cwnd so we can restore it */
bbr->probe_rtt_done_stamp = 0;
}
@@ -802,12 +841,8 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs)
} else if (bbr->probe_rtt_done_stamp) {
if (bbr->round_start)
bbr->probe_rtt_round_done = 1;
- if (bbr->probe_rtt_round_done &&
- after(tcp_jiffies32, bbr->probe_rtt_done_stamp)) {
- bbr->min_rtt_stamp = tcp_jiffies32;
- bbr->restore_cwnd = 1; /* snap to prior_cwnd */
- bbr_reset_mode(sk);
- }
+ if (bbr->probe_rtt_round_done)
+ bbr_check_probe_rtt_done(sk);
}
}
/* Restart after idle ends only once we process a new S/ACK for data */
@@ -815,6 +850,35 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs)
bbr->idle_restart = 0;
}
+static void bbr_update_gains(struct sock *sk)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ switch (bbr->mode) {
+ case BBR_STARTUP:
+ bbr->pacing_gain = bbr_high_gain;
+ bbr->cwnd_gain = bbr_high_gain;
+ break;
+ case BBR_DRAIN:
+ bbr->pacing_gain = bbr_drain_gain; /* slow, to drain */
+ bbr->cwnd_gain = bbr_high_gain; /* keep cwnd */
+ break;
+ case BBR_PROBE_BW:
+ bbr->pacing_gain = (bbr->lt_use_bw ?
+ BBR_UNIT :
+ bbr_pacing_gain[bbr->cycle_idx]);
+ bbr->cwnd_gain = bbr_cwnd_gain;
+ break;
+ case BBR_PROBE_RTT:
+ bbr->pacing_gain = BBR_UNIT;
+ bbr->cwnd_gain = BBR_UNIT;
+ break;
+ default:
+ WARN_ONCE(1, "BBR bad mode: %u\n", bbr->mode);
+ break;
+ }
+}
+
static void bbr_update_model(struct sock *sk, const struct rate_sample *rs)
{
bbr_update_bw(sk, rs);
@@ -822,6 +886,7 @@ static void bbr_update_model(struct sock *sk, const struct rate_sample *rs)
bbr_check_full_bw_reached(sk, rs);
bbr_check_drain(sk, rs);
bbr_update_min_rtt(sk, rs);
+ bbr_update_gains(sk);
}
static void bbr_main(struct sock *sk, const struct rate_sample *rs)
@@ -858,7 +923,6 @@ static void bbr_init(struct sock *sk)
bbr->has_seen_rtt = 0;
bbr_init_pacing_rate_from_rtt(sk);
- bbr->restore_cwnd = 0;
bbr->round_start = 0;
bbr->idle_restart = 0;
bbr->full_bw_reached = 0;
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
new file mode 100644
index 000000000000..b7918d4caa30
--- /dev/null
+++ b/net/ipv4/tcp_bpf.c
@@ -0,0 +1,668 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */
+
+#include <linux/skmsg.h>
+#include <linux/filter.h>
+#include <linux/bpf.h>
+#include <linux/init.h>
+#include <linux/wait.h>
+
+#include <net/inet_common.h>
+
+static bool tcp_bpf_stream_read(const struct sock *sk)
+{
+ struct sk_psock *psock;
+ bool empty = true;
+
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (likely(psock))
+ empty = list_empty(&psock->ingress_msg);
+ rcu_read_unlock();
+ return !empty;
+}
+
+static int tcp_bpf_wait_data(struct sock *sk, struct sk_psock *psock,
+ int flags, long timeo, int *err)
+{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ int ret;
+
+ add_wait_queue(sk_sleep(sk), &wait);
+ sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ ret = sk_wait_event(sk, &timeo,
+ !list_empty(&psock->ingress_msg) ||
+ !skb_queue_empty(&sk->sk_receive_queue), &wait);
+ sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ remove_wait_queue(sk_sleep(sk), &wait);
+ return ret;
+}
+
+int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
+ struct msghdr *msg, int len, int flags)
+{
+ struct iov_iter *iter = &msg->msg_iter;
+ int peek = flags & MSG_PEEK;
+ int i, ret, copied = 0;
+ struct sk_msg *msg_rx;
+
+ msg_rx = list_first_entry_or_null(&psock->ingress_msg,
+ struct sk_msg, list);
+
+ while (copied != len) {
+ struct scatterlist *sge;
+
+ if (unlikely(!msg_rx))
+ break;
+
+ i = msg_rx->sg.start;
+ do {
+ struct page *page;
+ int copy;
+
+ sge = sk_msg_elem(msg_rx, i);
+ copy = sge->length;
+ page = sg_page(sge);
+ if (copied + copy > len)
+ copy = len - copied;
+ ret = copy_page_to_iter(page, sge->offset, copy, iter);
+ if (ret != copy) {
+ msg_rx->sg.start = i;
+ return -EFAULT;
+ }
+
+ copied += copy;
+ if (likely(!peek)) {
+ sge->offset += copy;
+ sge->length -= copy;
+ sk_mem_uncharge(sk, copy);
+ msg_rx->sg.size -= copy;
+
+ if (!sge->length) {
+ sk_msg_iter_var_next(i);
+ if (!msg_rx->skb)
+ put_page(page);
+ }
+ } else {
+ sk_msg_iter_var_next(i);
+ }
+
+ if (copied == len)
+ break;
+ } while (i != msg_rx->sg.end);
+
+ if (unlikely(peek)) {
+ msg_rx = list_next_entry(msg_rx, list);
+ continue;
+ }
+
+ msg_rx->sg.start = i;
+ if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) {
+ list_del(&msg_rx->list);
+ if (msg_rx->skb)
+ consume_skb(msg_rx->skb);
+ kfree(msg_rx);
+ }
+ msg_rx = list_first_entry_or_null(&psock->ingress_msg,
+ struct sk_msg, list);
+ }
+
+ return copied;
+}
+EXPORT_SYMBOL_GPL(__tcp_bpf_recvmsg);
+
+int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+ int nonblock, int flags, int *addr_len)
+{
+ struct sk_psock *psock;
+ int copied, ret;
+
+ if (unlikely(flags & MSG_ERRQUEUE))
+ return inet_recv_error(sk, msg, len, addr_len);
+ if (!skb_queue_empty(&sk->sk_receive_queue))
+ return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+
+ psock = sk_psock_get(sk);
+ if (unlikely(!psock))
+ return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+ lock_sock(sk);
+msg_bytes_ready:
+ copied = __tcp_bpf_recvmsg(sk, psock, msg, len, flags);
+ if (!copied) {
+ int data, err = 0;
+ long timeo;
+
+ timeo = sock_rcvtimeo(sk, nonblock);
+ data = tcp_bpf_wait_data(sk, psock, flags, timeo, &err);
+ if (data) {
+ if (skb_queue_empty(&sk->sk_receive_queue))
+ goto msg_bytes_ready;
+ release_sock(sk);
+ sk_psock_put(sk, psock);
+ return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+ }
+ if (err) {
+ ret = err;
+ goto out;
+ }
+ }
+ ret = copied;
+out:
+ release_sock(sk);
+ sk_psock_put(sk, psock);
+ return ret;
+}
+
+static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock,
+ struct sk_msg *msg, u32 apply_bytes, int flags)
+{
+ bool apply = apply_bytes;
+ struct scatterlist *sge;
+ u32 size, copied = 0;
+ struct sk_msg *tmp;
+ int i, ret = 0;
+
+ tmp = kzalloc(sizeof(*tmp), __GFP_NOWARN | GFP_KERNEL);
+ if (unlikely(!tmp))
+ return -ENOMEM;
+
+ lock_sock(sk);
+ tmp->sg.start = msg->sg.start;
+ i = msg->sg.start;
+ do {
+ sge = sk_msg_elem(msg, i);
+ size = (apply && apply_bytes < sge->length) ?
+ apply_bytes : sge->length;
+ if (!sk_wmem_schedule(sk, size)) {
+ if (!copied)
+ ret = -ENOMEM;
+ break;
+ }
+
+ sk_mem_charge(sk, size);
+ sk_msg_xfer(tmp, msg, i, size);
+ copied += size;
+ if (sge->length)
+ get_page(sk_msg_page(tmp, i));
+ sk_msg_iter_var_next(i);
+ tmp->sg.end = i;
+ if (apply) {
+ apply_bytes -= size;
+ if (!apply_bytes)
+ break;
+ }
+ } while (i != msg->sg.end);
+
+ if (!ret) {
+ msg->sg.start = i;
+ msg->sg.size -= apply_bytes;
+ sk_psock_queue_msg(psock, tmp);
+ sk->sk_data_ready(sk);
+ } else {
+ sk_msg_free(sk, tmp);
+ kfree(tmp);
+ }
+
+ release_sock(sk);
+ return ret;
+}
+
+static int tcp_bpf_push(struct sock *sk, struct sk_msg *msg, u32 apply_bytes,
+ int flags, bool uncharge)
+{
+ bool apply = apply_bytes;
+ struct scatterlist *sge;
+ struct page *page;
+ int size, ret = 0;
+ u32 off;
+
+ while (1) {
+ sge = sk_msg_elem(msg, msg->sg.start);
+ size = (apply && apply_bytes < sge->length) ?
+ apply_bytes : sge->length;
+ off = sge->offset;
+ page = sg_page(sge);
+
+ tcp_rate_check_app_limited(sk);
+retry:
+ ret = do_tcp_sendpages(sk, page, off, size, flags);
+ if (ret <= 0)
+ return ret;
+ if (apply)
+ apply_bytes -= ret;
+ msg->sg.size -= ret;
+ sge->offset += ret;
+ sge->length -= ret;
+ if (uncharge)
+ sk_mem_uncharge(sk, ret);
+ if (ret != size) {
+ size -= ret;
+ off += ret;
+ goto retry;
+ }
+ if (!sge->length) {
+ put_page(page);
+ sk_msg_iter_next(msg, start);
+ sg_init_table(sge, 1);
+ if (msg->sg.start == msg->sg.end)
+ break;
+ }
+ if (apply && !apply_bytes)
+ break;
+ }
+
+ return 0;
+}
+
+static int tcp_bpf_push_locked(struct sock *sk, struct sk_msg *msg,
+ u32 apply_bytes, int flags, bool uncharge)
+{
+ int ret;
+
+ lock_sock(sk);
+ ret = tcp_bpf_push(sk, msg, apply_bytes, flags, uncharge);
+ release_sock(sk);
+ return ret;
+}
+
+int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg,
+ u32 bytes, int flags)
+{
+ bool ingress = sk_msg_to_ingress(msg);
+ struct sk_psock *psock = sk_psock_get(sk);
+ int ret;
+
+ if (unlikely(!psock)) {
+ sk_msg_free(sk, msg);
+ return 0;
+ }
+ ret = ingress ? bpf_tcp_ingress(sk, psock, msg, bytes, flags) :
+ tcp_bpf_push_locked(sk, msg, bytes, flags, false);
+ sk_psock_put(sk, psock);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(tcp_bpf_sendmsg_redir);
+
+static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock,
+ struct sk_msg *msg, int *copied, int flags)
+{
+ bool cork = false, enospc = msg->sg.start == msg->sg.end;
+ struct sock *sk_redir;
+ u32 tosend;
+ int ret;
+
+more_data:
+ if (psock->eval == __SK_NONE)
+ psock->eval = sk_psock_msg_verdict(sk, psock, msg);
+
+ if (msg->cork_bytes &&
+ msg->cork_bytes > msg->sg.size && !enospc) {
+ psock->cork_bytes = msg->cork_bytes - msg->sg.size;
+ if (!psock->cork) {
+ psock->cork = kzalloc(sizeof(*psock->cork),
+ GFP_ATOMIC | __GFP_NOWARN);
+ if (!psock->cork)
+ return -ENOMEM;
+ }
+ memcpy(psock->cork, msg, sizeof(*msg));
+ return 0;
+ }
+
+ tosend = msg->sg.size;
+ if (psock->apply_bytes && psock->apply_bytes < tosend)
+ tosend = psock->apply_bytes;
+
+ switch (psock->eval) {
+ case __SK_PASS:
+ ret = tcp_bpf_push(sk, msg, tosend, flags, true);
+ if (unlikely(ret)) {
+ *copied -= sk_msg_free(sk, msg);
+ break;
+ }
+ sk_msg_apply_bytes(psock, tosend);
+ break;
+ case __SK_REDIRECT:
+ sk_redir = psock->sk_redir;
+ sk_msg_apply_bytes(psock, tosend);
+ if (psock->cork) {
+ cork = true;
+ psock->cork = NULL;
+ }
+ sk_msg_return(sk, msg, tosend);
+ release_sock(sk);
+ ret = tcp_bpf_sendmsg_redir(sk_redir, msg, tosend, flags);
+ lock_sock(sk);
+ if (unlikely(ret < 0)) {
+ int free = sk_msg_free_nocharge(sk, msg);
+
+ if (!cork)
+ *copied -= free;
+ }
+ if (cork) {
+ sk_msg_free(sk, msg);
+ kfree(msg);
+ msg = NULL;
+ ret = 0;
+ }
+ break;
+ case __SK_DROP:
+ default:
+ sk_msg_free_partial(sk, msg, tosend);
+ sk_msg_apply_bytes(psock, tosend);
+ *copied -= tosend;
+ return -EACCES;
+ }
+
+ if (likely(!ret)) {
+ if (!psock->apply_bytes) {
+ psock->eval = __SK_NONE;
+ if (psock->sk_redir) {
+ sock_put(psock->sk_redir);
+ psock->sk_redir = NULL;
+ }
+ }
+ if (msg &&
+ msg->sg.data[msg->sg.start].page_link &&
+ msg->sg.data[msg->sg.start].length)
+ goto more_data;
+ }
+ return ret;
+}
+
+static int tcp_bpf_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
+{
+ struct sk_msg tmp, *msg_tx = NULL;
+ int flags = msg->msg_flags | MSG_NO_SHARED_FRAGS;
+ int copied = 0, err = 0;
+ struct sk_psock *psock;
+ long timeo;
+
+ psock = sk_psock_get(sk);
+ if (unlikely(!psock))
+ return tcp_sendmsg(sk, msg, size);
+
+ lock_sock(sk);
+ timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
+ while (msg_data_left(msg)) {
+ bool enospc = false;
+ u32 copy, osize;
+
+ if (sk->sk_err) {
+ err = -sk->sk_err;
+ goto out_err;
+ }
+
+ copy = msg_data_left(msg);
+ if (!sk_stream_memory_free(sk))
+ goto wait_for_sndbuf;
+ if (psock->cork) {
+ msg_tx = psock->cork;
+ } else {
+ msg_tx = &tmp;
+ sk_msg_init(msg_tx);
+ }
+
+ osize = msg_tx->sg.size;
+ err = sk_msg_alloc(sk, msg_tx, msg_tx->sg.size + copy, msg_tx->sg.end - 1);
+ if (err) {
+ if (err != -ENOSPC)
+ goto wait_for_memory;
+ enospc = true;
+ copy = msg_tx->sg.size - osize;
+ }
+
+ err = sk_msg_memcopy_from_iter(sk, &msg->msg_iter, msg_tx,
+ copy);
+ if (err < 0) {
+ sk_msg_trim(sk, msg_tx, osize);
+ goto out_err;
+ }
+
+ copied += copy;
+ if (psock->cork_bytes) {
+ if (size > psock->cork_bytes)
+ psock->cork_bytes = 0;
+ else
+ psock->cork_bytes -= size;
+ if (psock->cork_bytes && !enospc)
+ goto out_err;
+ /* All cork bytes are accounted, rerun the prog. */
+ psock->eval = __SK_NONE;
+ psock->cork_bytes = 0;
+ }
+
+ err = tcp_bpf_send_verdict(sk, psock, msg_tx, &copied, flags);
+ if (unlikely(err < 0))
+ goto out_err;
+ continue;
+wait_for_sndbuf:
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+wait_for_memory:
+ err = sk_stream_wait_memory(sk, &timeo);
+ if (err) {
+ if (msg_tx && msg_tx != psock->cork)
+ sk_msg_free(sk, msg_tx);
+ goto out_err;
+ }
+ }
+out_err:
+ if (err < 0)
+ err = sk_stream_error(sk, msg->msg_flags, err);
+ release_sock(sk);
+ sk_psock_put(sk, psock);
+ return copied ? copied : err;
+}
+
+static int tcp_bpf_sendpage(struct sock *sk, struct page *page, int offset,
+ size_t size, int flags)
+{
+ struct sk_msg tmp, *msg = NULL;
+ int err = 0, copied = 0;
+ struct sk_psock *psock;
+ bool enospc = false;
+
+ psock = sk_psock_get(sk);
+ if (unlikely(!psock))
+ return tcp_sendpage(sk, page, offset, size, flags);
+
+ lock_sock(sk);
+ if (psock->cork) {
+ msg = psock->cork;
+ } else {
+ msg = &tmp;
+ sk_msg_init(msg);
+ }
+
+ /* Catch case where ring is full and sendpage is stalled. */
+ if (unlikely(sk_msg_full(msg)))
+ goto out_err;
+
+ sk_msg_page_add(msg, page, size, offset);
+ sk_mem_charge(sk, size);
+ copied = size;
+ if (sk_msg_full(msg))
+ enospc = true;
+ if (psock->cork_bytes) {
+ if (size > psock->cork_bytes)
+ psock->cork_bytes = 0;
+ else
+ psock->cork_bytes -= size;
+ if (psock->cork_bytes && !enospc)
+ goto out_err;
+ /* All cork bytes are accounted, rerun the prog. */
+ psock->eval = __SK_NONE;
+ psock->cork_bytes = 0;
+ }
+
+ err = tcp_bpf_send_verdict(sk, psock, msg, &copied, flags);
+out_err:
+ release_sock(sk);
+ sk_psock_put(sk, psock);
+ return copied ? copied : err;
+}
+
+static void tcp_bpf_remove(struct sock *sk, struct sk_psock *psock)
+{
+ struct sk_psock_link *link;
+
+ sk_psock_cork_free(psock);
+ __sk_psock_purge_ingress_msg(psock);
+ while ((link = sk_psock_link_pop(psock))) {
+ sk_psock_unlink(sk, link);
+ sk_psock_free_link(link);
+ }
+}
+
+static void tcp_bpf_unhash(struct sock *sk)
+{
+ void (*saved_unhash)(struct sock *sk);
+ struct sk_psock *psock;
+
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (unlikely(!psock)) {
+ rcu_read_unlock();
+ if (sk->sk_prot->unhash)
+ sk->sk_prot->unhash(sk);
+ return;
+ }
+
+ saved_unhash = psock->saved_unhash;
+ tcp_bpf_remove(sk, psock);
+ rcu_read_unlock();
+ saved_unhash(sk);
+}
+
+static void tcp_bpf_close(struct sock *sk, long timeout)
+{
+ void (*saved_close)(struct sock *sk, long timeout);
+ struct sk_psock *psock;
+
+ lock_sock(sk);
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (unlikely(!psock)) {
+ rcu_read_unlock();
+ release_sock(sk);
+ return sk->sk_prot->close(sk, timeout);
+ }
+
+ saved_close = psock->saved_close;
+ tcp_bpf_remove(sk, psock);
+ rcu_read_unlock();
+ release_sock(sk);
+ saved_close(sk, timeout);
+}
+
+enum {
+ TCP_BPF_IPV4,
+ TCP_BPF_IPV6,
+ TCP_BPF_NUM_PROTS,
+};
+
+enum {
+ TCP_BPF_BASE,
+ TCP_BPF_TX,
+ TCP_BPF_NUM_CFGS,
+};
+
+static struct proto *tcpv6_prot_saved __read_mostly;
+static DEFINE_SPINLOCK(tcpv6_prot_lock);
+static struct proto tcp_bpf_prots[TCP_BPF_NUM_PROTS][TCP_BPF_NUM_CFGS];
+
+static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS],
+ struct proto *base)
+{
+ prot[TCP_BPF_BASE] = *base;
+ prot[TCP_BPF_BASE].unhash = tcp_bpf_unhash;
+ prot[TCP_BPF_BASE].close = tcp_bpf_close;
+ prot[TCP_BPF_BASE].recvmsg = tcp_bpf_recvmsg;
+ prot[TCP_BPF_BASE].stream_memory_read = tcp_bpf_stream_read;
+
+ prot[TCP_BPF_TX] = prot[TCP_BPF_BASE];
+ prot[TCP_BPF_TX].sendmsg = tcp_bpf_sendmsg;
+ prot[TCP_BPF_TX].sendpage = tcp_bpf_sendpage;
+}
+
+static void tcp_bpf_check_v6_needs_rebuild(struct sock *sk, struct proto *ops)
+{
+ if (sk->sk_family == AF_INET6 &&
+ unlikely(ops != smp_load_acquire(&tcpv6_prot_saved))) {
+ spin_lock_bh(&tcpv6_prot_lock);
+ if (likely(ops != tcpv6_prot_saved)) {
+ tcp_bpf_rebuild_protos(tcp_bpf_prots[TCP_BPF_IPV6], ops);
+ smp_store_release(&tcpv6_prot_saved, ops);
+ }
+ spin_unlock_bh(&tcpv6_prot_lock);
+ }
+}
+
+static int __init tcp_bpf_v4_build_proto(void)
+{
+ tcp_bpf_rebuild_protos(tcp_bpf_prots[TCP_BPF_IPV4], &tcp_prot);
+ return 0;
+}
+core_initcall(tcp_bpf_v4_build_proto);
+
+static void tcp_bpf_update_sk_prot(struct sock *sk, struct sk_psock *psock)
+{
+ int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4;
+ int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE;
+
+ sk_psock_update_proto(sk, psock, &tcp_bpf_prots[family][config]);
+}
+
+static void tcp_bpf_reinit_sk_prot(struct sock *sk, struct sk_psock *psock)
+{
+ int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4;
+ int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE;
+
+ /* Reinit occurs when program types change e.g. TCP_BPF_TX is removed
+ * or added requiring sk_prot hook updates. We keep original saved
+ * hooks in this case.
+ */
+ sk->sk_prot = &tcp_bpf_prots[family][config];
+}
+
+static int tcp_bpf_assert_proto_ops(struct proto *ops)
+{
+ /* In order to avoid retpoline, we make assumptions when we call
+ * into ops if e.g. a psock is not present. Make sure they are
+ * indeed valid assumptions.
+ */
+ return ops->recvmsg == tcp_recvmsg &&
+ ops->sendmsg == tcp_sendmsg &&
+ ops->sendpage == tcp_sendpage ? 0 : -ENOTSUPP;
+}
+
+void tcp_bpf_reinit(struct sock *sk)
+{
+ struct sk_psock *psock;
+
+ sock_owned_by_me(sk);
+
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ tcp_bpf_reinit_sk_prot(sk, psock);
+ rcu_read_unlock();
+}
+
+int tcp_bpf_init(struct sock *sk)
+{
+ struct proto *ops = READ_ONCE(sk->sk_prot);
+ struct sk_psock *psock;
+
+ sock_owned_by_me(sk);
+
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (unlikely(!psock || psock->sk_proto ||
+ tcp_bpf_assert_proto_ops(ops))) {
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+ tcp_bpf_check_v6_needs_rebuild(sk, ops);
+ tcp_bpf_update_sk_prot(sk, psock);
+ rcu_read_unlock();
+ return 0;
+}
diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c
index 06fbe102a425..37eebd910396 100644
--- a/net/ipv4/tcp_cdg.c
+++ b/net/ipv4/tcp_cdg.c
@@ -146,7 +146,7 @@ static void tcp_cdg_hystart_update(struct sock *sk)
return;
if (hystart_detect & HYSTART_ACK_TRAIN) {
- u32 now_us = div_u64(local_clock(), NSEC_PER_USEC);
+ u32 now_us = tp->tcp_mstamp;
if (ca->last_ack == 0 || !tcp_is_cwnd_limited(sk)) {
ca->last_ack = now_us;
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
index 8b637f9f23a2..cd4814f7e962 100644
--- a/net/ipv4/tcp_dctcp.c
+++ b/net/ipv4/tcp_dctcp.c
@@ -44,6 +44,7 @@
#include <linux/mm.h>
#include <net/tcp.h>
#include <linux/inet_diag.h>
+#include "tcp_dctcp.h"
#define DCTCP_MAX_ALPHA 1024U
@@ -118,54 +119,6 @@ static u32 dctcp_ssthresh(struct sock *sk)
return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->dctcp_alpha) >> 11U), 2U);
}
-/* Minimal DCTP CE state machine:
- *
- * S: 0 <- last pkt was non-CE
- * 1 <- last pkt was CE
- */
-
-static void dctcp_ce_state_0_to_1(struct sock *sk)
-{
- struct dctcp *ca = inet_csk_ca(sk);
- struct tcp_sock *tp = tcp_sk(sk);
-
- if (!ca->ce_state) {
- /* State has changed from CE=0 to CE=1, force an immediate
- * ACK to reflect the new CE state. If an ACK was delayed,
- * send that first to reflect the prior CE state.
- */
- if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER)
- __tcp_send_ack(sk, ca->prior_rcv_nxt);
- tcp_enter_quickack_mode(sk, 1);
- }
-
- ca->prior_rcv_nxt = tp->rcv_nxt;
- ca->ce_state = 1;
-
- tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
-}
-
-static void dctcp_ce_state_1_to_0(struct sock *sk)
-{
- struct dctcp *ca = inet_csk_ca(sk);
- struct tcp_sock *tp = tcp_sk(sk);
-
- if (ca->ce_state) {
- /* State has changed from CE=1 to CE=0, force an immediate
- * ACK to reflect the new CE state. If an ACK was delayed,
- * send that first to reflect the prior CE state.
- */
- if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER)
- __tcp_send_ack(sk, ca->prior_rcv_nxt);
- tcp_enter_quickack_mode(sk, 1);
- }
-
- ca->prior_rcv_nxt = tp->rcv_nxt;
- ca->ce_state = 0;
-
- tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
-}
-
static void dctcp_update_alpha(struct sock *sk, u32 flags)
{
const struct tcp_sock *tp = tcp_sk(sk);
@@ -230,12 +183,12 @@ static void dctcp_state(struct sock *sk, u8 new_state)
static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
{
+ struct dctcp *ca = inet_csk_ca(sk);
+
switch (ev) {
case CA_EVENT_ECN_IS_CE:
- dctcp_ce_state_0_to_1(sk);
- break;
case CA_EVENT_ECN_NO_CE:
- dctcp_ce_state_1_to_0(sk);
+ dctcp_ece_ack_update(sk, ev, &ca->prior_rcv_nxt, &ca->ce_state);
break;
default:
/* Don't care for the rest. */
diff --git a/net/ipv4/tcp_dctcp.h b/net/ipv4/tcp_dctcp.h
new file mode 100644
index 000000000000..d69a77cbd0c7
--- /dev/null
+++ b/net/ipv4/tcp_dctcp.h
@@ -0,0 +1,40 @@
+#ifndef _TCP_DCTCP_H
+#define _TCP_DCTCP_H
+
+static inline void dctcp_ece_ack_cwr(struct sock *sk, u32 ce_state)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (ce_state == 1)
+ tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
+ else
+ tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
+}
+
+/* Minimal DCTP CE state machine:
+ *
+ * S: 0 <- last pkt was non-CE
+ * 1 <- last pkt was CE
+ */
+static inline void dctcp_ece_ack_update(struct sock *sk, enum tcp_ca_event evt,
+ u32 *prior_rcv_nxt, u32 *ce_state)
+{
+ u32 new_ce_state = (evt == CA_EVENT_ECN_IS_CE) ? 1 : 0;
+
+ if (*ce_state != new_ce_state) {
+ /* CE state has changed, force an immediate ACK to
+ * reflect the new CE state. If an ACK was delayed,
+ * send that first to reflect the prior CE state.
+ */
+ if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) {
+ dctcp_ece_ack_cwr(sk, *ce_state);
+ __tcp_send_ack(sk, *prior_rcv_nxt);
+ }
+ inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
+ }
+ *prior_rcv_nxt = tcp_sk(sk)->rcv_nxt;
+ *ce_state = new_ce_state;
+ dctcp_ece_ack_cwr(sk, new_ce_state);
+}
+
+#endif
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index f9dcb29be12d..2868ef28ce52 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -78,6 +78,7 @@
#include <linux/errqueue.h>
#include <trace/events/tcp.h>
#include <linux/static_key.h>
+#include <net/busy_poll.h>
int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
@@ -244,16 +245,16 @@ static void tcp_ecn_queue_cwr(struct tcp_sock *tp)
tp->ecn_flags |= TCP_ECN_QUEUE_CWR;
}
-static void tcp_ecn_accept_cwr(struct tcp_sock *tp, const struct sk_buff *skb)
+static void tcp_ecn_accept_cwr(struct sock *sk, const struct sk_buff *skb)
{
if (tcp_hdr(skb)->cwr) {
- tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
+ tcp_sk(sk)->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
/* If the sender is telling us it has entered CWR, then its
* cwnd may be very low (even just 1 packet), so we should ACK
* immediately.
*/
- tcp_enter_quickack_mode((struct sock *)tp, 2);
+ inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
}
}
@@ -425,26 +426,7 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
}
}
-/* 3. Tuning rcvbuf, when connection enters established state. */
-static void tcp_fixup_rcvbuf(struct sock *sk)
-{
- u32 mss = tcp_sk(sk)->advmss;
- int rcvmem;
-
- rcvmem = 2 * SKB_TRUESIZE(mss + MAX_TCP_HEADER) *
- tcp_default_init_rwnd(mss);
-
- /* Dynamic Right Sizing (DRS) has 2 to 3 RTT latency
- * Allow enough cushion so that sender is not limited by our window
- */
- if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf)
- rcvmem <<= 2;
-
- if (sk->sk_rcvbuf < rcvmem)
- sk->sk_rcvbuf = min(rcvmem, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
-}
-
-/* 4. Try to fixup all. It is made immediately after connection enters
+/* 3. Try to fixup all. It is made immediately after connection enters
* established state.
*/
void tcp_init_buffer_space(struct sock *sk)
@@ -453,12 +435,10 @@ void tcp_init_buffer_space(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
int maxwin;
- if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK))
- tcp_fixup_rcvbuf(sk);
if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK))
tcp_sndbuf_expand(sk);
- tp->rcvq_space.space = tp->rcv_wnd;
+ tp->rcvq_space.space = min_t(u32, tp->rcv_wnd, TCP_INIT_CWND * tp->advmss);
tcp_mstamp_refresh(tp);
tp->rcvq_space.time = tp->tcp_mstamp;
tp->rcvq_space.seq = tp->copied_seq;
@@ -484,7 +464,7 @@ void tcp_init_buffer_space(struct sock *sk)
tp->snd_cwnd_stamp = tcp_jiffies32;
}
-/* 5. Recalculate window clamp after socket hit its memory bounds. */
+/* 4. Recalculate window clamp after socket hit its memory bounds. */
static void tcp_clamp_window(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -590,9 +570,12 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
{
struct tcp_sock *tp = tcp_sk(sk);
- if (tp->rx_opt.rcv_tsecr &&
- (TCP_SKB_CB(skb)->end_seq -
- TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss)) {
+ if (tp->rx_opt.rcv_tsecr == tp->rcv_rtt_last_tsecr)
+ return;
+ tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr;
+
+ if (TCP_SKB_CB(skb)->end_seq -
+ TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss) {
u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr;
u32 delta_us;
@@ -877,6 +860,7 @@ static void tcp_dsack_seen(struct tcp_sock *tp)
{
tp->rx_opt.sack_ok |= TCP_DSACK_SEEN;
tp->rack.dsack_seen = 1;
+ tp->dsack_dups++;
}
/* It's reordering when higher sequence was delivered (i.e. sacked) before
@@ -908,8 +892,8 @@ static void tcp_check_sack_reordering(struct sock *sk, const u32 low_seq,
sock_net(sk)->ipv4.sysctl_tcp_max_reordering);
}
- tp->rack.reord = 1;
/* This exciting event is worth to be remembered. 8) */
+ tp->reord_seen++;
NET_INC_STATS(sock_net(sk),
ts ? LINUX_MIB_TCPTSREORDER : LINUX_MIB_TCPSACKREORDER);
}
@@ -1300,7 +1284,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
*/
tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
start_seq, end_seq, dup_sack, pcount,
- skb->skb_mstamp);
+ tcp_skb_timestamp_us(skb));
tcp_rate_skb_delivered(sk, skb, state->rate);
if (skb == tp->lost_skb_hint)
@@ -1575,7 +1559,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
TCP_SKB_CB(skb)->end_seq,
dup_sack,
tcp_skb_pcount(skb),
- skb->skb_mstamp);
+ tcp_skb_timestamp_us(skb));
tcp_rate_skb_delivered(sk, skb, state->rate);
if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
list_del_init(&skb->tcp_tsorted_anchor);
@@ -1873,6 +1857,7 @@ static void tcp_check_reno_reordering(struct sock *sk, const int addend)
tp->reordering = min_t(u32, tp->packets_out + addend,
sock_net(sk)->ipv4.sysctl_tcp_max_reordering);
+ tp->reord_seen++;
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRENOREORDER);
}
@@ -2994,8 +2979,8 @@ void tcp_rearm_rto(struct sock *sk)
*/
rto = usecs_to_jiffies(max_t(int, delta_us, 1));
}
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
- TCP_RTO_MAX);
+ tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
+ TCP_RTO_MAX, tcp_rtx_queue_head(sk));
}
}
@@ -3097,7 +3082,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
tp->retrans_out -= acked_pcount;
flag |= FLAG_RETRANS_DATA_ACKED;
} else if (!(sacked & TCPCB_SACKED_ACKED)) {
- last_ackt = skb->skb_mstamp;
+ last_ackt = tcp_skb_timestamp_us(skb);
WARN_ON_ONCE(last_ackt == 0);
if (!first_ackt)
first_ackt = last_ackt;
@@ -3115,7 +3100,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
tp->delivered += acked_pcount;
if (!tcp_skb_spurious_retrans(tp, skb))
tcp_rack_advance(tp, sacked, scb->end_seq,
- skb->skb_mstamp);
+ tcp_skb_timestamp_us(skb));
}
if (sacked & TCPCB_LOST)
tp->lost_out -= acked_pcount;
@@ -3209,7 +3194,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta);
}
} else if (skb && rtt_update && sack_rtt_us >= 0 &&
- sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp)) {
+ sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp,
+ tcp_skb_timestamp_us(skb))) {
/* Do not re-arm RTO if the sack RTT is measured from data sent
* after when the head was last (re)transmitted. Otherwise the
* timeout may continue to extend in loss recovery.
@@ -3269,8 +3255,8 @@ static void tcp_ack_probe(struct sock *sk)
} else {
unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX);
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
- when, TCP_RTO_MAX);
+ tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
+ when, TCP_RTO_MAX, NULL);
}
}
@@ -3466,7 +3452,7 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb)
static void tcp_store_ts_recent(struct tcp_sock *tp)
{
tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval;
- tp->rx_opt.ts_recent_stamp = get_seconds();
+ tp->rx_opt.ts_recent_stamp = ktime_get_seconds();
}
static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
@@ -4193,6 +4179,17 @@ static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq)
tcp_sack_extend(tp->duplicate_sack, seq, end_seq);
}
+static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb)
+{
+ /* When the ACK path fails or drops most ACKs, the sender would
+ * timeout and spuriously retransmit the same segment repeatedly.
+ * The receiver remembers and reflects via DSACKs. Leverage the
+ * DSACK state and change the txhash to re-route speculatively.
+ */
+ if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq)
+ sk_rethink_txhash(sk);
+}
+
static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -4205,6 +4202,7 @@ static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) {
u32 end_seq = TCP_SKB_CB(skb)->end_seq;
+ tcp_rcv_spurious_retrans(sk, skb);
if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))
end_seq = tp->rcv_nxt;
tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, end_seq);
@@ -4347,6 +4345,11 @@ static bool tcp_try_coalesce(struct sock *sk,
if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
return false;
+#ifdef CONFIG_TLS_DEVICE
+ if (from->decrypted != to->decrypted)
+ return false;
+#endif
+
if (!skb_try_coalesce(to, from, fragstolen, &delta))
return false;
@@ -4642,8 +4645,10 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
skb->data_len = data_len;
skb->len = size;
- if (tcp_try_rmem_schedule(sk, skb, skb->truesize))
+ if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
goto err_free;
+ }
err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
if (err)
@@ -4690,7 +4695,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
skb_dst_drop(skb);
__skb_pull(skb, tcp_hdr(skb)->doff * 4);
- tcp_ecn_accept_cwr(tp, skb);
+ tcp_ecn_accept_cwr(sk, skb);
tp->rx_opt.dsack = 0;
@@ -4699,18 +4704,21 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
* Out of sequence packets to the out_of_order_queue.
*/
if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
- if (tcp_receive_window(tp) == 0)
+ if (tcp_receive_window(tp) == 0) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
goto out_of_window;
+ }
/* Ok. In sequence. In window. */
queue_and_out:
if (skb_queue_len(&sk->sk_receive_queue) == 0)
sk_forced_mem_schedule(sk, skb->truesize);
- else if (tcp_try_rmem_schedule(sk, skb, skb->truesize))
+ else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
goto drop;
+ }
eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
- tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
if (skb->len)
tcp_event_data_recv(sk, skb);
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
@@ -4719,11 +4727,11 @@ queue_and_out:
if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
tcp_ofo_queue(sk);
- /* RFC2581. 4.2. SHOULD send immediate ACK, when
+ /* RFC5681. 4.2. SHOULD send immediate ACK, when
* gap in queue is filled.
*/
if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
- inet_csk(sk)->icsk_ack.pingpong = 0;
+ inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
}
if (tp->rx_opt.num_sacks)
@@ -4739,6 +4747,7 @@ queue_and_out:
}
if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
+ tcp_rcv_spurious_retrans(sk, skb);
/* A retransmit, 2nd most common case. Force an immediate ack. */
NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
@@ -4766,8 +4775,10 @@ drop:
/* If window is closed, drop tail of packet. But after
* remembering D-SACK for its head made in previous line.
*/
- if (!tcp_receive_window(tp))
+ if (!tcp_receive_window(tp)) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
goto out_of_window;
+ }
goto queue_and_out;
}
@@ -4885,6 +4896,9 @@ restart:
break;
memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
+#ifdef CONFIG_TLS_DEVICE
+ nskb->decrypted = skb->decrypted;
+#endif
TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
if (list)
__skb_queue_before(list, skb, nskb);
@@ -4912,6 +4926,10 @@ restart:
skb == tail ||
(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
goto end;
+#ifdef CONFIG_TLS_DEVICE
+ if (skb->decrypted != nskb->decrypted)
+ goto end;
+#endif
}
}
}
@@ -5154,7 +5172,9 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
(tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat ||
__tcp_select_window(sk) >= tp->rcv_wnd)) ||
/* We ACK each frame or... */
- tcp_in_quickack_mode(sk)) {
+ tcp_in_quickack_mode(sk) ||
+ /* Protocol state mandates a one-time immediate ACK */
+ inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOW) {
send_now:
tcp_send_ack(sk);
return;
@@ -5530,6 +5550,11 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
tcp_ack(sk, skb, 0);
__kfree_skb(skb);
tcp_data_snd_check(sk);
+ /* When receiving pure ack in fast path, update
+ * last ts ecr directly instead of calling
+ * tcp_rcv_rtt_measure_ts()
+ */
+ tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr;
return;
} else { /* Header too small */
TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
@@ -5631,6 +5656,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
if (skb) {
icsk->icsk_af_ops->sk_rx_dst_set(sk, skb);
security_inet_conn_established(sk, skb);
+ sk_mark_napi_id(sk, skb);
}
tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB);
@@ -5976,11 +6002,13 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
if (th->fin)
goto discard;
/* It is possible that we process SYN packets from backlog,
- * so we need to make sure to disable BH right there.
+ * so we need to make sure to disable BH and RCU right there.
*/
+ rcu_read_lock();
local_bh_disable();
acceptable = icsk->icsk_af_ops->conn_request(sk, skb) >= 0;
local_bh_enable();
+ rcu_read_unlock();
if (!acceptable)
return 1;
@@ -6334,8 +6362,8 @@ static bool tcp_syn_flood_action(const struct sock *sk,
if (!queue->synflood_warned &&
net->ipv4.sysctl_tcp_syncookies != 2 &&
xchg(&queue->synflood_warned, 1) == 0)
- pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
- proto, ntohs(tcp_hdr(skb)->dest), msg);
+ net_info_ratelimited("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
+ proto, ntohs(tcp_hdr(skb)->dest), msg);
return want_cookie;
}
@@ -6459,6 +6487,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
tcp_rsk(req)->snt_isn = isn;
tcp_rsk(req)->txhash = net_tx_rndhash();
tcp_openreq_init_rwin(req, sk, dst);
+ sk_rx_queue_set(req_to_sk(req), skb);
if (!want_cookie) {
tcp_reqsk_record_syn(sk, req, skb);
fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 3b2711e33e4c..de47038afdf0 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -155,7 +155,8 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
and use initial timestamp retrieved from peer table.
*/
if (tcptw->tw_ts_recent_stamp &&
- (!twp || (reuse && get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
+ (!twp || (reuse && time_after32(ktime_get_seconds(),
+ tcptw->tw_ts_recent_stamp)))) {
/* In case of repair and re-using TIME-WAIT sockets we still
* want to be sure that it is safe as above but honor the
* sequence numbers and time stamps set as part of the repair
@@ -543,7 +544,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
BUG_ON(!skb);
tcp_mstamp_refresh(tp);
- delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp);
+ delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
remaining = icsk->icsk_rto -
usecs_to_jiffies(delta_us);
@@ -942,9 +943,11 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
if (skb) {
__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
+ rcu_read_lock();
err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
ireq->ir_rmt_addr,
- ireq_opt_deref(ireq));
+ rcu_dereference(ireq->ireq_opt));
+ rcu_read_unlock();
err = net_xmit_eval(err);
}
@@ -2516,6 +2519,12 @@ static int __net_init tcp_sk_init(struct net *net)
if (res)
goto fail;
sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
+
+ /* Please enforce IP_DF and IPID==0 for RST and
+ * ACK sent in SYN-RECV and TIME-WAIT state.
+ */
+ inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
+
*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
}
@@ -2542,7 +2551,7 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.sysctl_tcp_tw_reuse = 2;
cnt = tcp_hashinfo.ehash_mask + 1;
- net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
+ net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 1dda1341a223..12affb7864d9 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -144,7 +144,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
tw->tw_substate = TCP_TIME_WAIT;
tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq;
if (tmp_opt.saw_tstamp) {
- tcptw->tw_ts_recent_stamp = get_seconds();
+ tcptw->tw_ts_recent_stamp = ktime_get_seconds();
tcptw->tw_ts_recent = tmp_opt.rcv_tsval;
}
@@ -184,12 +184,13 @@ kill:
inet_twsk_deschedule_put(tw);
return TCP_TW_SUCCESS;
}
+ } else {
+ inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN);
}
- inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN);
if (tmp_opt.saw_tstamp) {
tcptw->tw_ts_recent = tmp_opt.rcv_tsval;
- tcptw->tw_ts_recent_stamp = get_seconds();
+ tcptw->tw_ts_recent_stamp = ktime_get_seconds();
}
inet_twsk_put(tw);
@@ -449,119 +450,122 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
struct sk_buff *skb)
{
struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC);
+ const struct inet_request_sock *ireq = inet_rsk(req);
+ struct tcp_request_sock *treq = tcp_rsk(req);
+ struct inet_connection_sock *newicsk;
+ struct tcp_sock *oldtp, *newtp;
- if (newsk) {
- const struct inet_request_sock *ireq = inet_rsk(req);
- struct tcp_request_sock *treq = tcp_rsk(req);
- struct inet_connection_sock *newicsk = inet_csk(newsk);
- struct tcp_sock *newtp = tcp_sk(newsk);
- struct tcp_sock *oldtp = tcp_sk(sk);
-
- smc_check_reset_syn_req(oldtp, req, newtp);
-
- /* Now setup tcp_sock */
- newtp->pred_flags = 0;
-
- newtp->rcv_wup = newtp->copied_seq =
- newtp->rcv_nxt = treq->rcv_isn + 1;
- newtp->segs_in = 1;
-
- newtp->snd_sml = newtp->snd_una =
- newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1;
-
- INIT_LIST_HEAD(&newtp->tsq_node);
- INIT_LIST_HEAD(&newtp->tsorted_sent_queue);
-
- tcp_init_wl(newtp, treq->rcv_isn);
-
- newtp->srtt_us = 0;
- newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
- minmax_reset(&newtp->rtt_min, tcp_jiffies32, ~0U);
- newicsk->icsk_rto = TCP_TIMEOUT_INIT;
- newicsk->icsk_ack.lrcvtime = tcp_jiffies32;
-
- newtp->packets_out = 0;
- newtp->retrans_out = 0;
- newtp->sacked_out = 0;
- newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
- newtp->tlp_high_seq = 0;
- newtp->lsndtime = tcp_jiffies32;
- newsk->sk_txhash = treq->txhash;
- newtp->last_oow_ack_time = 0;
- newtp->total_retrans = req->num_retrans;
-
- /* So many TCP implementations out there (incorrectly) count the
- * initial SYN frame in their delayed-ACK and congestion control
- * algorithms that we must have the following bandaid to talk
- * efficiently to them. -DaveM
- */
- newtp->snd_cwnd = TCP_INIT_CWND;
- newtp->snd_cwnd_cnt = 0;
+ if (!newsk)
+ return NULL;
- /* There's a bubble in the pipe until at least the first ACK. */
- newtp->app_limited = ~0U;
+ newicsk = inet_csk(newsk);
+ newtp = tcp_sk(newsk);
+ oldtp = tcp_sk(sk);
- tcp_init_xmit_timers(newsk);
- newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1;
+ smc_check_reset_syn_req(oldtp, req, newtp);
- newtp->rx_opt.saw_tstamp = 0;
+ /* Now setup tcp_sock */
+ newtp->pred_flags = 0;
- newtp->rx_opt.dsack = 0;
- newtp->rx_opt.num_sacks = 0;
+ newtp->rcv_wup = newtp->copied_seq =
+ newtp->rcv_nxt = treq->rcv_isn + 1;
+ newtp->segs_in = 1;
- newtp->urg_data = 0;
+ newtp->snd_sml = newtp->snd_una =
+ newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1;
- if (sock_flag(newsk, SOCK_KEEPOPEN))
- inet_csk_reset_keepalive_timer(newsk,
- keepalive_time_when(newtp));
+ INIT_LIST_HEAD(&newtp->tsq_node);
+ INIT_LIST_HEAD(&newtp->tsorted_sent_queue);
- newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
- newtp->rx_opt.sack_ok = ireq->sack_ok;
- newtp->window_clamp = req->rsk_window_clamp;
- newtp->rcv_ssthresh = req->rsk_rcv_wnd;
- newtp->rcv_wnd = req->rsk_rcv_wnd;
- newtp->rx_opt.wscale_ok = ireq->wscale_ok;
- if (newtp->rx_opt.wscale_ok) {
- newtp->rx_opt.snd_wscale = ireq->snd_wscale;
- newtp->rx_opt.rcv_wscale = ireq->rcv_wscale;
- } else {
- newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0;
- newtp->window_clamp = min(newtp->window_clamp, 65535U);
- }
- newtp->snd_wnd = (ntohs(tcp_hdr(skb)->window) <<
- newtp->rx_opt.snd_wscale);
- newtp->max_window = newtp->snd_wnd;
-
- if (newtp->rx_opt.tstamp_ok) {
- newtp->rx_opt.ts_recent = req->ts_recent;
- newtp->rx_opt.ts_recent_stamp = get_seconds();
- newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
- } else {
- newtp->rx_opt.ts_recent_stamp = 0;
- newtp->tcp_header_len = sizeof(struct tcphdr);
- }
- newtp->tsoffset = treq->ts_off;
+ tcp_init_wl(newtp, treq->rcv_isn);
+
+ newtp->srtt_us = 0;
+ newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
+ minmax_reset(&newtp->rtt_min, tcp_jiffies32, ~0U);
+ newicsk->icsk_rto = TCP_TIMEOUT_INIT;
+ newicsk->icsk_ack.lrcvtime = tcp_jiffies32;
+
+ newtp->packets_out = 0;
+ newtp->retrans_out = 0;
+ newtp->sacked_out = 0;
+ newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
+ newtp->tlp_high_seq = 0;
+ newtp->lsndtime = tcp_jiffies32;
+ newsk->sk_txhash = treq->txhash;
+ newtp->last_oow_ack_time = 0;
+ newtp->total_retrans = req->num_retrans;
+
+ /* So many TCP implementations out there (incorrectly) count the
+ * initial SYN frame in their delayed-ACK and congestion control
+ * algorithms that we must have the following bandaid to talk
+ * efficiently to them. -DaveM
+ */
+ newtp->snd_cwnd = TCP_INIT_CWND;
+ newtp->snd_cwnd_cnt = 0;
+
+ /* There's a bubble in the pipe until at least the first ACK. */
+ newtp->app_limited = ~0U;
+
+ tcp_init_xmit_timers(newsk);
+ newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1;
+
+ newtp->rx_opt.saw_tstamp = 0;
+
+ newtp->rx_opt.dsack = 0;
+ newtp->rx_opt.num_sacks = 0;
+
+ newtp->urg_data = 0;
+
+ if (sock_flag(newsk, SOCK_KEEPOPEN))
+ inet_csk_reset_keepalive_timer(newsk,
+ keepalive_time_when(newtp));
+
+ newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
+ newtp->rx_opt.sack_ok = ireq->sack_ok;
+ newtp->window_clamp = req->rsk_window_clamp;
+ newtp->rcv_ssthresh = req->rsk_rcv_wnd;
+ newtp->rcv_wnd = req->rsk_rcv_wnd;
+ newtp->rx_opt.wscale_ok = ireq->wscale_ok;
+ if (newtp->rx_opt.wscale_ok) {
+ newtp->rx_opt.snd_wscale = ireq->snd_wscale;
+ newtp->rx_opt.rcv_wscale = ireq->rcv_wscale;
+ } else {
+ newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0;
+ newtp->window_clamp = min(newtp->window_clamp, 65535U);
+ }
+ newtp->snd_wnd = ntohs(tcp_hdr(skb)->window) << newtp->rx_opt.snd_wscale;
+ newtp->max_window = newtp->snd_wnd;
+
+ if (newtp->rx_opt.tstamp_ok) {
+ newtp->rx_opt.ts_recent = req->ts_recent;
+ newtp->rx_opt.ts_recent_stamp = ktime_get_seconds();
+ newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
+ } else {
+ newtp->rx_opt.ts_recent_stamp = 0;
+ newtp->tcp_header_len = sizeof(struct tcphdr);
+ }
+ newtp->tsoffset = treq->ts_off;
#ifdef CONFIG_TCP_MD5SIG
- newtp->md5sig_info = NULL; /*XXX*/
- if (newtp->af_specific->md5_lookup(sk, newsk))
- newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
+ newtp->md5sig_info = NULL; /*XXX*/
+ if (newtp->af_specific->md5_lookup(sk, newsk))
+ newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
#endif
- if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len)
- newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
- newtp->rx_opt.mss_clamp = req->mss;
- tcp_ecn_openreq_child(newtp, req);
- newtp->fastopen_req = NULL;
- newtp->fastopen_rsk = NULL;
- newtp->syn_data_acked = 0;
- newtp->rack.mstamp = 0;
- newtp->rack.advanced = 0;
- newtp->rack.reo_wnd_steps = 1;
- newtp->rack.last_delivered = 0;
- newtp->rack.reo_wnd_persist = 0;
- newtp->rack.dsack_seen = 0;
-
- __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS);
- }
+ if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len)
+ newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
+ newtp->rx_opt.mss_clamp = req->mss;
+ tcp_ecn_openreq_child(newtp, req);
+ newtp->fastopen_req = NULL;
+ newtp->fastopen_rsk = NULL;
+ newtp->syn_data_acked = 0;
+ newtp->rack.mstamp = 0;
+ newtp->rack.advanced = 0;
+ newtp->rack.reo_wnd_steps = 1;
+ newtp->rack.last_delivered = 0;
+ newtp->rack.reo_wnd_persist = 0;
+ newtp->rack.dsack_seen = 0;
+
+ __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS);
+
return newsk;
}
EXPORT_SYMBOL(tcp_create_openreq_child);
@@ -600,7 +604,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
* it can be estimated (approximately)
* from another data.
*/
- tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->num_timeout);
+ tmp_opt.ts_recent_stamp = ktime_get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->num_timeout);
paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
}
}
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 8cc7c3487330..870b0a335061 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -180,9 +180,9 @@ out:
return segs;
}
-struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
+struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb)
{
- struct sk_buff **pp = NULL;
+ struct sk_buff *pp = NULL;
struct sk_buff *p;
struct tcphdr *th;
struct tcphdr *th2;
@@ -220,7 +220,7 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
len = skb_gro_len(skb);
flags = tcp_flag_word(th);
- for (; (p = *head); head = &p->next) {
+ list_for_each_entry(p, head, list) {
if (!NAPI_GRO_CB(p)->same_flow)
continue;
@@ -233,7 +233,7 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
goto found;
}
-
+ p = NULL;
goto out_check_final;
found:
@@ -262,8 +262,11 @@ found:
flush |= (len - 1) >= mss;
flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);
+#ifdef CONFIG_TLS_DEVICE
+ flush |= p->decrypted ^ skb->decrypted;
+#endif
- if (flush || skb_gro_receive(head, skb)) {
+ if (flush || skb_gro_receive(p, skb)) {
mss = 1;
goto out_check_final;
}
@@ -277,7 +280,7 @@ out_check_final:
TCP_FLAG_FIN));
if (p && (!NAPI_GRO_CB(skb)->same_flow || flush))
- pp = head;
+ pp = p;
out:
NAPI_GRO_CB(skb)->flush |= (flush != 0);
@@ -302,7 +305,7 @@ int tcp_gro_complete(struct sk_buff *skb)
}
EXPORT_SYMBOL(tcp_gro_complete);
-static struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
+static struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb)
{
/* Don't bother verifying checksum if we're going to flush anyway. */
if (!NAPI_GRO_CB(skb)->flush &&
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index c4172c1fb198..9c34b97d365d 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -45,6 +45,21 @@
#include <trace/events/tcp.h>
+/* Refresh clocks of a TCP socket,
+ * ensuring monotically increasing values.
+ */
+void tcp_mstamp_refresh(struct tcp_sock *tp)
+{
+ u64 val = tcp_clock_ns();
+
+ if (val > tp->tcp_clock_cache)
+ tp->tcp_clock_cache = val;
+
+ val = div_u64(val, NSEC_PER_USEC);
+ if (val > tp->tcp_mstamp)
+ tp->tcp_mstamp = val;
+}
+
static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
int push_one, gfp_t gfp);
@@ -179,21 +194,6 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts,
inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
}
-
-u32 tcp_default_init_rwnd(u32 mss)
-{
- /* Initial receive window should be twice of TCP_INIT_CWND to
- * enable proper sending of new unsent data during fast recovery
- * (RFC 3517, Section 4, NextSeg() rule (2)). Further place a
- * limit when mss is larger than 1460.
- */
- u32 init_rwnd = TCP_INIT_CWND * 2;
-
- if (mss > 1460)
- init_rwnd = max((1460 * init_rwnd) / mss, 2U);
- return init_rwnd;
-}
-
/* Determine a window scaling and initial window to offer.
* Based on the assumption that the given amount of space
* will be offered. Store the results in the tp structure.
@@ -228,7 +228,10 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
if (sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows)
(*rcv_wnd) = min(space, MAX_TCP_WINDOW);
else
- (*rcv_wnd) = space;
+ (*rcv_wnd) = min_t(u32, space, U16_MAX);
+
+ if (init_rcv_wnd)
+ *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
(*rcv_wscale) = 0;
if (wscale_ok) {
@@ -241,11 +244,6 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
(*rcv_wscale)++;
}
}
-
- if (!init_rcv_wnd) /* Use default unless specified otherwise */
- init_rcv_wnd = tcp_default_init_rwnd(mss);
- *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
-
/* Set the clamp no higher than max representable value */
(*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp);
}
@@ -977,42 +975,28 @@ enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
-/* BBR congestion control needs pacing.
- * Same remark for SO_MAX_PACING_RATE.
- * sch_fq packet scheduler is efficiently handling pacing,
- * but is not always installed/used.
- * Return true if TCP stack should pace packets itself.
- */
-static bool tcp_needs_internal_pacing(const struct sock *sk)
-{
- return smp_load_acquire(&sk->sk_pacing_status) == SK_PACING_NEEDED;
-}
-
-static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb)
+static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb,
+ u64 prior_wstamp)
{
- u64 len_ns;
- u32 rate;
+ struct tcp_sock *tp = tcp_sk(sk);
- if (!tcp_needs_internal_pacing(sk))
- return;
- rate = sk->sk_pacing_rate;
- if (!rate || rate == ~0U)
- return;
+ skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
+ if (sk->sk_pacing_status != SK_PACING_NONE) {
+ unsigned long rate = sk->sk_pacing_rate;
- /* Should account for header sizes as sch_fq does,
- * but lets make things simple.
- */
- len_ns = (u64)skb->len * NSEC_PER_SEC;
- do_div(len_ns, rate);
- hrtimer_start(&tcp_sk(sk)->pacing_timer,
- ktime_add_ns(ktime_get(), len_ns),
- HRTIMER_MODE_ABS_PINNED_SOFT);
- sock_hold(sk);
-}
+ /* Original sch_fq does not pace first 10 MSS
+ * Note that tp->data_segs_out overflows after 2^32 packets,
+ * this is a minor annoyance.
+ */
+ if (rate != ~0UL && rate && tp->data_segs_out >= 10) {
+ u64 len_ns = div64_ul((u64)skb->len * NSEC_PER_SEC, rate);
+ u64 credit = tp->tcp_wstamp_ns - prior_wstamp;
-static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb)
-{
- skb->skb_mstamp = tp->tcp_mstamp;
+ /* take into account OS jitter */
+ len_ns -= min_t(u64, len_ns / 2, credit);
+ tp->tcp_wstamp_ns += len_ns;
+ }
+ }
list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
}
@@ -1039,6 +1023,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
struct sk_buff *oskb = NULL;
struct tcp_md5sig_key *md5;
struct tcphdr *th;
+ u64 prior_wstamp;
int err;
BUG_ON(!skb || !tcp_skb_pcount(skb));
@@ -1059,7 +1044,11 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
if (unlikely(!skb))
return -ENOBUFS;
}
- skb->skb_mstamp = tp->tcp_mstamp;
+
+ prior_wstamp = tp->tcp_wstamp_ns;
+ tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache);
+
+ skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
inet = inet_sk(sk);
tcb = TCP_SKB_CB(skb);
@@ -1150,7 +1139,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
if (skb->len != tcp_header_size) {
tcp_event_data_sent(tp, sk);
tp->data_segs_out += tcp_skb_pcount(skb);
- tcp_internal_pacing(sk, skb);
+ tp->bytes_sent += skb->len - tcp_header_size;
}
if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
@@ -1162,8 +1151,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb);
skb_shinfo(skb)->gso_size = tcp_skb_mss(skb);
- /* Our usage of tstamp should remain private */
- skb->tstamp = 0;
+ /* Leave earliest departure time in skb->tstamp (skb->skb_mstamp_ns) */
/* Cleanup our debris for IP stacks */
memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
@@ -1176,7 +1164,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
err = net_xmit_eval(err);
}
if (!err && oskb) {
- tcp_update_skb_after_send(tp, oskb);
+ tcp_update_skb_after_send(sk, oskb, prior_wstamp);
tcp_rate_skb_sent(sk, oskb);
}
return err;
@@ -1711,8 +1699,9 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
{
u32 bytes, segs;
- bytes = min(sk->sk_pacing_rate >> sk->sk_pacing_shift,
- sk->sk_gso_max_size - 1 - MAX_TCP_HEADER);
+ bytes = min_t(unsigned long,
+ sk->sk_pacing_rate >> sk->sk_pacing_shift,
+ sk->sk_gso_max_size - 1 - MAX_TCP_HEADER);
/* Goal is to send at least one packet per ms,
* not one big TSO packet every 100 ms.
@@ -1979,7 +1968,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
head = tcp_rtx_queue_head(sk);
if (!head)
goto send_now;
- age = tcp_stamp_us_delta(tp->tcp_mstamp, head->skb_mstamp);
+ age = tcp_stamp_us_delta(tp->tcp_mstamp, tcp_skb_timestamp_us(head));
/* If next ACK is likely to come too late (half srtt), do not defer */
if (age < (tp->srtt_us >> 4))
goto send_now;
@@ -2185,10 +2174,23 @@ static int tcp_mtu_probe(struct sock *sk)
return -1;
}
-static bool tcp_pacing_check(const struct sock *sk)
+static bool tcp_pacing_check(struct sock *sk)
{
- return tcp_needs_internal_pacing(sk) &&
- hrtimer_is_queued(&tcp_sk(sk)->pacing_timer);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (!tcp_needs_internal_pacing(sk))
+ return false;
+
+ if (tp->tcp_wstamp_ns <= tp->tcp_clock_cache)
+ return false;
+
+ if (!hrtimer_is_queued(&tp->pacing_timer)) {
+ hrtimer_start(&tp->pacing_timer,
+ ns_to_ktime(tp->tcp_wstamp_ns),
+ HRTIMER_MODE_ABS_PINNED_SOFT);
+ sock_hold(sk);
+ }
+ return true;
}
/* TCP Small Queues :
@@ -2205,10 +2207,12 @@ static bool tcp_pacing_check(const struct sock *sk)
static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
unsigned int factor)
{
- unsigned int limit;
+ unsigned long limit;
- limit = max(2 * skb->truesize, sk->sk_pacing_rate >> sk->sk_pacing_shift);
- limit = min_t(u32, limit,
+ limit = max_t(unsigned long,
+ 2 * skb->truesize,
+ sk->sk_pacing_rate >> sk->sk_pacing_shift);
+ limit = min_t(unsigned long, limit,
sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes);
limit <<= factor;
@@ -2317,18 +2321,19 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
while ((skb = tcp_send_head(sk))) {
unsigned int limit;
+ if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
+ /* "skb_mstamp_ns" is used as a start point for the retransmit timer */
+ skb->skb_mstamp_ns = tp->tcp_wstamp_ns = tp->tcp_clock_cache;
+ list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
+ goto repair; /* Skip network transmission */
+ }
+
if (tcp_pacing_check(sk))
break;
tso_segs = tcp_init_tso_segs(skb, mss_now);
BUG_ON(!tso_segs);
- if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
- /* "skb_mstamp" is used as a start point for the retransmit timer */
- tcp_update_skb_after_send(tp, skb);
- goto repair; /* Skip network transmission */
- }
-
cwnd_quota = tcp_cwnd_test(tp, skb);
if (!cwnd_quota) {
if (push_one == 2)
@@ -2450,8 +2455,8 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto)
if (rto_delta_us > 0)
timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us));
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout,
- TCP_RTO_MAX);
+ tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout,
+ TCP_RTO_MAX, NULL);
return true;
}
@@ -2711,9 +2716,8 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *next_skb = skb_rb_next(skb);
- int skb_size, next_skb_size;
+ int next_skb_size;
- skb_size = skb->len;
next_skb_size = next_skb->len;
BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1);
@@ -2884,6 +2888,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
tp->total_retrans += segs;
+ tp->bytes_retrans += skb->len;
/* make sure skb->data is aligned on arches that require it
* and check if ack-trimming & collapsing extended the headroom
@@ -2900,7 +2905,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
} tcp_skb_tsorted_restore(skb);
if (!err) {
- tcp_update_skb_after_send(tp, skb);
+ tcp_update_skb_after_send(sk, skb, tp->tcp_wstamp_ns);
tcp_rate_skb_sent(sk, skb);
}
} else {
@@ -3015,9 +3020,10 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
if (skb == rtx_head &&
icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT)
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
- inet_csk(sk)->icsk_rto,
- TCP_RTO_MAX);
+ tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ inet_csk(sk)->icsk_rto,
+ TCP_RTO_MAX,
+ skb);
}
}
@@ -3218,10 +3224,10 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
memset(&opts, 0, sizeof(opts));
#ifdef CONFIG_SYN_COOKIES
if (unlikely(req->cookie_ts))
- skb->skb_mstamp = cookie_init_timestamp(req);
+ skb->skb_mstamp_ns = cookie_init_timestamp(req);
else
#endif
- skb->skb_mstamp = tcp_clock_us();
+ skb->skb_mstamp_ns = tcp_clock_ns();
#ifdef CONFIG_TCP_MD5SIG
rcu_read_lock();
@@ -3437,7 +3443,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation);
- syn->skb_mstamp = syn_data->skb_mstamp;
+ syn->skb_mstamp_ns = syn_data->skb_mstamp_ns;
/* Now full SYN+DATA was cloned and sent (or not),
* remove the SYN from the original skb (syn_data)
@@ -3747,9 +3753,10 @@ void tcp_send_probe0(struct sock *sk)
icsk->icsk_probes_out = 1;
probe_max = TCP_RESOURCE_PROBE_INTERVAL;
}
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
- tcp_probe0_when(sk, probe_max),
- TCP_RTO_MAX);
+ tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
+ tcp_probe0_when(sk, probe_max),
+ TCP_RTO_MAX,
+ NULL);
}
int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c
index c61240e43923..baed2186c7c6 100644
--- a/net/ipv4/tcp_rate.c
+++ b/net/ipv4/tcp_rate.c
@@ -55,8 +55,10 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb)
* bandwidth estimate.
*/
if (!tp->packets_out) {
- tp->first_tx_mstamp = skb->skb_mstamp;
- tp->delivered_mstamp = skb->skb_mstamp;
+ u64 tstamp_us = tcp_skb_timestamp_us(skb);
+
+ tp->first_tx_mstamp = tstamp_us;
+ tp->delivered_mstamp = tstamp_us;
}
TCP_SKB_CB(skb)->tx.first_tx_mstamp = tp->first_tx_mstamp;
@@ -88,13 +90,12 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
rs->is_app_limited = scb->tx.is_app_limited;
rs->is_retrans = scb->sacked & TCPCB_RETRANS;
+ /* Record send time of most recently ACKed packet: */
+ tp->first_tx_mstamp = tcp_skb_timestamp_us(skb);
/* Find the duration of the "send phase" of this window: */
- rs->interval_us = tcp_stamp_us_delta(
- skb->skb_mstamp,
- scb->tx.first_tx_mstamp);
+ rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp,
+ scb->tx.first_tx_mstamp);
- /* Record send time of most recently ACKed packet: */
- tp->first_tx_mstamp = skb->skb_mstamp;
}
/* Mark off the skb delivered once it's sacked to avoid being
* used again when it's cumulatively acked. For acked packets
@@ -146,6 +147,10 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
rs->prior_mstamp); /* ack phase */
rs->interval_us = max(snd_us, ack_us);
+ /* Record both segment send and ack receive intervals */
+ rs->snd_interval_us = snd_us;
+ rs->rcv_interval_us = ack_us;
+
/* Normally we expect interval_us >= min-rtt.
* Note that rate may still be over-estimated when a spuriously
* retransmistted skb was first (s)acked because "interval_us"
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index 71593e4400ab..fdb715bdd2d1 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -25,7 +25,7 @@ static u32 tcp_rack_reo_wnd(const struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
- if (!tp->rack.reord) {
+ if (!tp->reord_seen) {
/* If reordering has not been observed, be aggressive during
* the recovery or starting the recovery by DUPACK threshold.
*/
@@ -50,7 +50,7 @@ static u32 tcp_rack_reo_wnd(const struct sock *sk)
s32 tcp_rack_skb_timeout(struct tcp_sock *tp, struct sk_buff *skb, u32 reo_wnd)
{
return tp->rack.rtt_us + reo_wnd -
- tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp);
+ tcp_stamp_us_delta(tp->tcp_mstamp, tcp_skb_timestamp_us(skb));
}
/* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01):
@@ -91,7 +91,8 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
!(scb->sacked & TCPCB_SACKED_RETRANS))
continue;
- if (!tcp_rack_sent_after(tp->rack.mstamp, skb->skb_mstamp,
+ if (!tcp_rack_sent_after(tp->rack.mstamp,
+ tcp_skb_timestamp_us(skb),
tp->rack.end_seq, scb->end_seq))
break;
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 3b3611729928..676020663ce8 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -22,6 +22,35 @@
#include <linux/gfp.h>
#include <net/tcp.h>
+static u32 tcp_retransmit_stamp(const struct sock *sk)
+{
+ u32 start_ts = tcp_sk(sk)->retrans_stamp;
+
+ if (unlikely(!start_ts)) {
+ struct sk_buff *head = tcp_rtx_queue_head(sk);
+
+ if (!head)
+ return 0;
+ start_ts = tcp_skb_timestamp(head);
+ }
+ return start_ts;
+}
+
+static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ u32 elapsed, start_ts;
+
+ start_ts = tcp_retransmit_stamp(sk);
+ if (!icsk->icsk_user_timeout || !start_ts)
+ return icsk->icsk_rto;
+ elapsed = tcp_time_stamp(tcp_sk(sk)) - start_ts;
+ if (elapsed >= icsk->icsk_user_timeout)
+ return 1; /* user timeout has passed; fire ASAP */
+ else
+ return min_t(u32, icsk->icsk_rto, msecs_to_jiffies(icsk->icsk_user_timeout - elapsed));
+}
+
/**
* tcp_write_err() - close socket and save error info
* @sk: The socket the error has appeared on.
@@ -166,14 +195,9 @@ static bool retransmits_timed_out(struct sock *sk,
if (!inet_csk(sk)->icsk_retransmits)
return false;
- start_ts = tcp_sk(sk)->retrans_stamp;
- if (unlikely(!start_ts)) {
- struct sk_buff *head = tcp_rtx_queue_head(sk);
-
- if (!head)
- return false;
- start_ts = tcp_skb_timestamp(head);
- }
+ start_ts = tcp_retransmit_stamp(sk);
+ if (!start_ts)
+ return false;
if (likely(timeout == 0)) {
linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base);
@@ -183,8 +207,9 @@ static bool retransmits_timed_out(struct sock *sk,
else
timeout = ((2 << linear_backoff_thresh) - 1) * rto_base +
(boundary - linear_backoff_thresh) * TCP_RTO_MAX;
+ timeout = jiffies_to_msecs(timeout);
}
- return (tcp_time_stamp(tcp_sk(sk)) - start_ts) >= jiffies_to_msecs(timeout);
+ return (tcp_time_stamp(tcp_sk(sk)) - start_ts) >= timeout;
}
/* A write timeout has occurred. Process the after effects. */
@@ -335,10 +360,9 @@ static void tcp_probe_timer(struct sock *sk)
*/
start_ts = tcp_skb_timestamp(skb);
if (!start_ts)
- skb->skb_mstamp = tp->tcp_mstamp;
+ skb->skb_mstamp_ns = tp->tcp_clock_cache;
else if (icsk->icsk_user_timeout &&
- (s32)(tcp_time_stamp(tp) - start_ts) >
- jiffies_to_msecs(icsk->icsk_user_timeout))
+ (s32)(tcp_time_stamp(tp) - start_ts) > icsk->icsk_user_timeout)
goto abort;
max_probes = sock_net(sk)->ipv4.sysctl_tcp_retries2;
@@ -535,7 +559,8 @@ out_reset_timer:
/* Use normal (exponential) backoff */
icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
}
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ tcp_clamp_rto_to_user_timeout(sk), TCP_RTO_MAX);
if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1 + 1, 0))
__sk_dst_reset(sk);
@@ -672,7 +697,7 @@ static void tcp_keepalive_timer (struct timer_list *t)
* to determine when to timeout instead.
*/
if ((icsk->icsk_user_timeout != 0 &&
- elapsed >= icsk->icsk_user_timeout &&
+ elapsed >= msecs_to_jiffies(icsk->icsk_user_timeout) &&
icsk->icsk_probes_out > 0) ||
(icsk->icsk_user_timeout == 0 &&
icsk->icsk_probes_out >= keepalive_probes(tp))) {
diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c
index 622caa4039e0..95df7f7f6328 100644
--- a/net/ipv4/tcp_ulp.c
+++ b/net/ipv4/tcp_ulp.c
@@ -6,7 +6,7 @@
*
*/
-#include<linux/module.h>
+#include <linux/module.h>
#include <linux/mm.h>
#include <linux/types.h>
#include <linux/list.h>
@@ -29,18 +29,6 @@ static struct tcp_ulp_ops *tcp_ulp_find(const char *name)
return NULL;
}
-static struct tcp_ulp_ops *tcp_ulp_find_id(const int ulp)
-{
- struct tcp_ulp_ops *e;
-
- list_for_each_entry_rcu(e, &tcp_ulp_list, list) {
- if (e->uid == ulp)
- return e;
- }
-
- return NULL;
-}
-
static const struct tcp_ulp_ops *__tcp_ulp_find_autoload(const char *name)
{
const struct tcp_ulp_ops *ulp = NULL;
@@ -51,7 +39,7 @@ static const struct tcp_ulp_ops *__tcp_ulp_find_autoload(const char *name)
#ifdef CONFIG_MODULES
if (!ulp && capable(CAP_NET_ADMIN)) {
rcu_read_unlock();
- request_module("%s", name);
+ request_module("tcp-ulp-%s", name);
rcu_read_lock();
ulp = tcp_ulp_find(name);
}
@@ -63,18 +51,6 @@ static const struct tcp_ulp_ops *__tcp_ulp_find_autoload(const char *name)
return ulp;
}
-static const struct tcp_ulp_ops *__tcp_ulp_lookup(const int uid)
-{
- const struct tcp_ulp_ops *ulp;
-
- rcu_read_lock();
- ulp = tcp_ulp_find_id(uid);
- if (!ulp || !try_module_get(ulp->owner))
- ulp = NULL;
- rcu_read_unlock();
- return ulp;
-}
-
/* Attach new upper layer protocol to the list
* of available protocols.
*/
@@ -123,62 +99,49 @@ void tcp_cleanup_ulp(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
+ /* No sock_owned_by_me() check here as at the time the
+ * stack calls this function, the socket is dead and
+ * about to be destroyed.
+ */
if (!icsk->icsk_ulp_ops)
return;
if (icsk->icsk_ulp_ops->release)
icsk->icsk_ulp_ops->release(sk);
module_put(icsk->icsk_ulp_ops->owner);
+
+ icsk->icsk_ulp_ops = NULL;
}
-/* Change upper layer protocol for socket */
-int tcp_set_ulp(struct sock *sk, const char *name)
+static int __tcp_set_ulp(struct sock *sk, const struct tcp_ulp_ops *ulp_ops)
{
struct inet_connection_sock *icsk = inet_csk(sk);
- const struct tcp_ulp_ops *ulp_ops;
- int err = 0;
+ int err;
+ err = -EEXIST;
if (icsk->icsk_ulp_ops)
- return -EEXIST;
-
- ulp_ops = __tcp_ulp_find_autoload(name);
- if (!ulp_ops)
- return -ENOENT;
-
- if (!ulp_ops->user_visible) {
- module_put(ulp_ops->owner);
- return -ENOENT;
- }
+ goto out_err;
err = ulp_ops->init(sk);
- if (err) {
- module_put(ulp_ops->owner);
- return err;
- }
+ if (err)
+ goto out_err;
icsk->icsk_ulp_ops = ulp_ops;
return 0;
+out_err:
+ module_put(ulp_ops->owner);
+ return err;
}
-int tcp_set_ulp_id(struct sock *sk, int ulp)
+int tcp_set_ulp(struct sock *sk, const char *name)
{
- struct inet_connection_sock *icsk = inet_csk(sk);
const struct tcp_ulp_ops *ulp_ops;
- int err;
- if (icsk->icsk_ulp_ops)
- return -EEXIST;
+ sock_owned_by_me(sk);
- ulp_ops = __tcp_ulp_lookup(ulp);
+ ulp_ops = __tcp_ulp_find_autoload(name);
if (!ulp_ops)
return -ENOENT;
- err = ulp_ops->init(sk);
- if (err) {
- module_put(ulp_ops->owner);
- return err;
- }
-
- icsk->icsk_ulp_ops = ulp_ops;
- return 0;
+ return __tcp_set_ulp(sk, ulp_ops);
}
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 24e116ddae79..cf8252d05a01 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -221,11 +221,12 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot)
(sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
inet_rcv_saddr_equal(sk, sk2, false)) {
- return reuseport_add_sock(sk, sk2);
+ return reuseport_add_sock(sk, sk2,
+ inet_rcv_saddr_any(sk));
}
}
- return reuseport_alloc(sk);
+ return reuseport_alloc(sk, inet_rcv_saddr_any(sk));
}
/**
@@ -498,6 +499,8 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
daddr, hnum, dif, sdif,
exact_dif, hslot2, skb);
}
+ if (unlikely(IS_ERR(result)))
+ return NULL;
return result;
}
begin:
@@ -512,6 +515,8 @@ begin:
saddr, sport);
result = reuseport_select_sock(sk, hash, skb,
sizeof(struct udphdr));
+ if (unlikely(IS_ERR(result)))
+ return NULL;
if (result)
return result;
}
@@ -926,11 +931,6 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message compatibility */
return -EOPNOTSUPP;
- ipc.opt = NULL;
- ipc.tx_flags = 0;
- ipc.ttl = 0;
- ipc.tos = -1;
-
getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag;
fl4 = &inet->cork.fl.u.ip4;
@@ -977,9 +977,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
connected = 1;
}
- ipc.sockc.tsflags = sk->sk_tsflags;
- ipc.addr = inet->inet_saddr;
- ipc.oif = sk->sk_bound_dev_if;
+ ipcm_init_sk(&ipc, inet);
ipc.gso_size = up->gso_size;
if (msg->msg_controllen) {
@@ -1027,8 +1025,6 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
saddr = ipc.addr;
ipc.addr = faddr = daddr;
- sock_tx_timestamp(sk, ipc.sockc.tsflags, &ipc.tx_flags);
-
if (ipc.opt && ipc.opt->opt.srr) {
if (!daddr) {
err = -EINVAL;
@@ -1046,7 +1042,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
}
if (ipv4_is_multicast(daddr)) {
- if (!ipc.oif)
+ if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif))
ipc.oif = inet->mc_index;
if (!saddr)
saddr = inet->mc_addr;
@@ -1631,7 +1627,7 @@ busy_check:
*err = error;
return NULL;
}
-EXPORT_SYMBOL_GPL(__skb_recv_udp);
+EXPORT_SYMBOL(__skb_recv_udp);
/*
* This should be easy, if there is something there we
@@ -1893,7 +1889,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
return 0;
}
-static DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key);
+DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key);
void udp_encap_enable(void)
{
static_branch_enable(&udp_encap_needed_key);
@@ -2128,6 +2124,28 @@ static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh,
inet_compute_pseudo);
}
+/* wrapper for udp_queue_rcv_skb tacking care of csum conversion and
+ * return code conversion for ip layer consumption
+ */
+static int udp_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb,
+ struct udphdr *uh)
+{
+ int ret;
+
+ if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk))
+ skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check,
+ inet_compute_pseudo);
+
+ ret = udp_queue_rcv_skb(sk, skb);
+
+ /* a return value > 0 means to resubmit the input, but
+ * it wants the return to be -protocol, or 0
+ */
+ if (ret > 0)
+ return -ret;
+ return 0;
+}
+
/*
* All we need to do is get the socket, and then do a checksum.
*/
@@ -2174,14 +2192,9 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
if (unlikely(sk->sk_rx_dst != dst))
udp_sk_rx_dst_set(sk, dst);
- ret = udp_queue_rcv_skb(sk, skb);
+ ret = udp_unicast_rcv_skb(sk, skb, uh);
sock_put(sk);
- /* a return value > 0 means to resubmit the input, but
- * it wants the return to be -protocol, or 0
- */
- if (ret > 0)
- return -ret;
- return 0;
+ return ret;
}
if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
@@ -2189,22 +2202,8 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
saddr, daddr, udptable, proto);
sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
- if (sk) {
- int ret;
-
- if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk))
- skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check,
- inet_compute_pseudo);
-
- ret = udp_queue_rcv_skb(sk, skb);
-
- /* a return value > 0 means to resubmit the input, but
- * it wants the return to be -protocol, or 0
- */
- if (ret > 0)
- return -ret;
- return 0;
- }
+ if (sk)
+ return udp_unicast_rcv_skb(sk, skb, uh);
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
goto drop;
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 69c54540d5b4..802f2bc00d69 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -343,10 +343,11 @@ out:
return segs;
}
-struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb,
- struct udphdr *uh, udp_lookup_t lookup)
+struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
+ struct udphdr *uh, udp_lookup_t lookup)
{
- struct sk_buff *p, **pp = NULL;
+ struct sk_buff *pp = NULL;
+ struct sk_buff *p;
struct udphdr *uh2;
unsigned int off = skb_gro_offset(skb);
int flush = 1;
@@ -371,7 +372,7 @@ struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb,
unflush:
flush = 0;
- for (p = *head; p; p = p->next) {
+ list_for_each_entry(p, head, list) {
if (!NAPI_GRO_CB(p)->same_flow)
continue;
@@ -399,12 +400,12 @@ out:
}
EXPORT_SYMBOL(udp_gro_receive);
-static struct sk_buff **udp4_gro_receive(struct sk_buff **head,
- struct sk_buff *skb)
+static struct sk_buff *udp4_gro_receive(struct list_head *head,
+ struct sk_buff *skb)
{
struct udphdr *uh = udp_gro_udphdr(skb);
- if (unlikely(!uh))
+ if (unlikely(!uh) || !static_branch_unlikely(&udp_encap_needed_key))
goto flush;
/* Don't bother verifying checksum if we're going to flush anyway. */
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index bcfc00e88756..f8de2482a529 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -67,6 +67,7 @@ int xfrm4_transport_finish(struct sk_buff *skb, int async)
if (xo && (xo->flags & XFRM_GRO)) {
skb_mac_header_rebuild(skb);
+ skb_reset_transport_header(skb);
return 0;
}
diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c
index 3d36644890bb..1ad2c2c4e250 100644
--- a/net/ipv4/xfrm4_mode_transport.c
+++ b/net/ipv4/xfrm4_mode_transport.c
@@ -46,7 +46,6 @@ static int xfrm4_transport_output(struct xfrm_state *x, struct sk_buff *skb)
static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb)
{
int ihl = skb->data - skb_transport_header(skb);
- struct xfrm_offload *xo = xfrm_offload(skb);
if (skb->transport_header != skb->network_header) {
memmove(skb_transport_header(skb),
@@ -54,8 +53,7 @@ static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb)
skb->network_header = skb->transport_header;
}
ip_hdr(skb)->tot_len = htons(skb->len + ihl);
- if (!xo || !(xo->flags & XFRM_GRO))
- skb_reset_transport_header(skb);
+ skb_reset_transport_header(skb);
return 0;
}
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index b3885ca22d6f..613282c65a10 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -15,7 +15,7 @@ menuconfig IPV6
Documentation/networking/ipv6.txt and read the HOWTO at
<http://www.tldp.org/HOWTO/Linux+IPv6-HOWTO/>
- To compile this protocol support as a module, choose M here: the
+ To compile this protocol support as a module, choose M here: the
module will be called ipv6.
if IPV6
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index f66a1cae3366..45b84dd5c4eb 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -385,8 +385,6 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
if (ndev->cnf.stable_secret.initialized)
ndev->cnf.addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY;
- else
- ndev->cnf.addr_gen_mode = ipv6_devconf_dflt.addr_gen_mode;
ndev->cnf.mtu6 = dev->mtu;
ndev->nd_parms = neigh_parms_alloc(dev, &nd_tbl);
@@ -668,6 +666,7 @@ errout:
static int inet6_netconf_dump_devconf(struct sk_buff *skb,
struct netlink_callback *cb)
{
+ const struct nlmsghdr *nlh = cb->nlh;
struct net *net = sock_net(skb->sk);
int h, s_h;
int idx, s_idx;
@@ -675,6 +674,21 @@ static int inet6_netconf_dump_devconf(struct sk_buff *skb,
struct inet6_dev *idev;
struct hlist_head *head;
+ if (cb->strict_check) {
+ struct netlink_ext_ack *extack = cb->extack;
+ struct netconfmsg *ncm;
+
+ if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ncm))) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid header for netconf dump request");
+ return -EINVAL;
+ }
+
+ if (nlmsg_attrlen(nlh, sizeof(*ncm))) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid data after header in netconf dump request");
+ return -EINVAL;
+ }
+ }
+
s_h = cb->args[0];
s_idx = idx = cb->args[1];
@@ -694,7 +708,7 @@ static int inet6_netconf_dump_devconf(struct sk_buff *skb,
if (inet6_netconf_fill_devconf(skb, dev->ifindex,
&idev->cnf,
NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
+ nlh->nlmsg_seq,
RTM_NEWNETCONF,
NLM_F_MULTI,
NETCONFA_ALL) < 0) {
@@ -711,7 +725,7 @@ cont:
if (inet6_netconf_fill_devconf(skb, NETCONFA_IFINDEX_ALL,
net->ipv6.devconf_all,
NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
+ nlh->nlmsg_seq,
RTM_NEWNETCONF, NLM_F_MULTI,
NETCONFA_ALL) < 0)
goto done;
@@ -722,7 +736,7 @@ cont:
if (inet6_netconf_fill_devconf(skb, NETCONFA_IFINDEX_DEFAULT,
net->ipv6.devconf_dflt,
NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
+ nlh->nlmsg_seq,
RTM_NEWNETCONF, NLM_F_MULTI,
NETCONFA_ALL) < 0)
goto done;
@@ -999,6 +1013,7 @@ ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config *cfg,
if (addr_type == IPV6_ADDR_ANY ||
addr_type & IPV6_ADDR_MULTICAST ||
(!(idev->dev->flags & IFF_LOOPBACK) &&
+ !netif_is_l3_master(idev->dev) &&
addr_type & IPV6_ADDR_LOOPBACK))
return ERR_PTR(-EADDRNOTAVAIL);
@@ -2400,7 +2415,7 @@ static void addrconf_add_mroute(struct net_device *dev)
ipv6_addr_set(&cfg.fc_dst, htonl(0xFF000000), 0, 0, 0);
- ip6_route_add(&cfg, GFP_ATOMIC, NULL);
+ ip6_route_add(&cfg, GFP_KERNEL, NULL);
}
static struct inet6_dev *addrconf_add_dev(struct net_device *dev)
@@ -3064,7 +3079,7 @@ static void sit_add_v4_addrs(struct inet6_dev *idev)
if (addr.s6_addr32[3]) {
add_addr(idev, &addr, plen, scope);
addrconf_prefix_route(&addr, plen, 0, idev->dev, 0, pflags,
- GFP_ATOMIC);
+ GFP_KERNEL);
return;
}
@@ -3089,7 +3104,7 @@ static void sit_add_v4_addrs(struct inet6_dev *idev)
add_addr(idev, &addr, plen, flag);
addrconf_prefix_route(&addr, plen, 0, idev->dev,
- 0, pflags, GFP_ATOMIC);
+ 0, pflags, GFP_KERNEL);
}
}
}
@@ -4203,7 +4218,6 @@ static struct inet6_ifaddr *if6_get_first(struct seq_file *seq, loff_t pos)
p++;
continue;
}
- state->offset++;
return ifa;
}
@@ -4227,13 +4241,12 @@ static struct inet6_ifaddr *if6_get_next(struct seq_file *seq,
return ifa;
}
+ state->offset = 0;
while (++state->bucket < IN6_ADDR_HSIZE) {
- state->offset = 0;
hlist_for_each_entry_rcu(ifa,
&inet6_addr_lst[state->bucket], addr_lst) {
if (!net_eq(dev_net(ifa->idev->dev), net))
continue;
- state->offset++;
return ifa;
}
}
@@ -4493,6 +4506,7 @@ static const struct nla_policy ifa_ipv6_policy[IFA_MAX+1] = {
[IFA_CACHEINFO] = { .len = sizeof(struct ifa_cacheinfo) },
[IFA_FLAGS] = { .len = sizeof(u32) },
[IFA_RT_PRIORITY] = { .len = sizeof(u32) },
+ [IFA_TARGET_NETNSID] = { .type = NLA_S32 },
};
static int
@@ -4795,19 +4809,40 @@ static inline int inet6_ifaddr_msgsize(void)
+ nla_total_size(4) /* IFA_RT_PRIORITY */;
}
+enum addr_type_t {
+ UNICAST_ADDR,
+ MULTICAST_ADDR,
+ ANYCAST_ADDR,
+};
+
+struct inet6_fill_args {
+ u32 portid;
+ u32 seq;
+ int event;
+ unsigned int flags;
+ int netnsid;
+ int ifindex;
+ enum addr_type_t type;
+};
+
static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa,
- u32 portid, u32 seq, int event, unsigned int flags)
+ struct inet6_fill_args *args)
{
struct nlmsghdr *nlh;
u32 preferred, valid;
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct ifaddrmsg), flags);
+ nlh = nlmsg_put(skb, args->portid, args->seq, args->event,
+ sizeof(struct ifaddrmsg), args->flags);
if (!nlh)
return -EMSGSIZE;
put_ifaddrmsg(nlh, ifa->prefix_len, ifa->flags, rt_scope(ifa->scope),
ifa->idev->dev->ifindex);
+ if (args->netnsid >= 0 &&
+ nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid))
+ goto error;
+
if (!((ifa->flags&IFA_F_PERMANENT) &&
(ifa->prefered_lft == INFINITY_LIFE_TIME))) {
preferred = ifa->prefered_lft;
@@ -4857,7 +4892,7 @@ error:
}
static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca,
- u32 portid, u32 seq, int event, u16 flags)
+ struct inet6_fill_args *args)
{
struct nlmsghdr *nlh;
u8 scope = RT_SCOPE_UNIVERSE;
@@ -4866,10 +4901,15 @@ static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca,
if (ipv6_addr_scope(&ifmca->mca_addr) & IFA_SITE)
scope = RT_SCOPE_SITE;
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct ifaddrmsg), flags);
+ nlh = nlmsg_put(skb, args->portid, args->seq, args->event,
+ sizeof(struct ifaddrmsg), args->flags);
if (!nlh)
return -EMSGSIZE;
+ if (args->netnsid >= 0 &&
+ nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid))
+ return -EMSGSIZE;
+
put_ifaddrmsg(nlh, 128, IFA_F_PERMANENT, scope, ifindex);
if (nla_put_in6_addr(skb, IFA_MULTICAST, &ifmca->mca_addr) < 0 ||
put_cacheinfo(skb, ifmca->mca_cstamp, ifmca->mca_tstamp,
@@ -4883,7 +4923,7 @@ static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca,
}
static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca,
- u32 portid, u32 seq, int event, unsigned int flags)
+ struct inet6_fill_args *args)
{
struct net_device *dev = fib6_info_nh_dev(ifaca->aca_rt);
int ifindex = dev ? dev->ifindex : 1;
@@ -4893,10 +4933,15 @@ static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca,
if (ipv6_addr_scope(&ifaca->aca_addr) & IFA_SITE)
scope = RT_SCOPE_SITE;
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct ifaddrmsg), flags);
+ nlh = nlmsg_put(skb, args->portid, args->seq, args->event,
+ sizeof(struct ifaddrmsg), args->flags);
if (!nlh)
return -EMSGSIZE;
+ if (args->netnsid >= 0 &&
+ nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid))
+ return -EMSGSIZE;
+
put_ifaddrmsg(nlh, 128, IFA_F_PERMANENT, scope, ifindex);
if (nla_put_in6_addr(skb, IFA_ANYCAST, &ifaca->aca_addr) < 0 ||
put_cacheinfo(skb, ifaca->aca_cstamp, ifaca->aca_tstamp,
@@ -4909,68 +4954,56 @@ static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca,
return 0;
}
-enum addr_type_t {
- UNICAST_ADDR,
- MULTICAST_ADDR,
- ANYCAST_ADDR,
-};
-
/* called with rcu_read_lock() */
static int in6_dump_addrs(struct inet6_dev *idev, struct sk_buff *skb,
- struct netlink_callback *cb, enum addr_type_t type,
- int s_ip_idx, int *p_ip_idx)
+ struct netlink_callback *cb, int s_ip_idx,
+ struct inet6_fill_args *fillargs)
{
struct ifmcaddr6 *ifmca;
struct ifacaddr6 *ifaca;
+ int ip_idx = 0;
int err = 1;
- int ip_idx = *p_ip_idx;
read_lock_bh(&idev->lock);
- switch (type) {
+ switch (fillargs->type) {
case UNICAST_ADDR: {
struct inet6_ifaddr *ifa;
+ fillargs->event = RTM_NEWADDR;
/* unicast address incl. temp addr */
list_for_each_entry(ifa, &idev->addr_list, if_list) {
- if (++ip_idx < s_ip_idx)
- continue;
- err = inet6_fill_ifaddr(skb, ifa,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- RTM_NEWADDR,
- NLM_F_MULTI);
+ if (ip_idx < s_ip_idx)
+ goto next;
+ err = inet6_fill_ifaddr(skb, ifa, fillargs);
if (err < 0)
break;
nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+next:
+ ip_idx++;
}
break;
}
case MULTICAST_ADDR:
+ fillargs->event = RTM_GETMULTICAST;
+
/* multicast address */
for (ifmca = idev->mc_list; ifmca;
ifmca = ifmca->next, ip_idx++) {
if (ip_idx < s_ip_idx)
continue;
- err = inet6_fill_ifmcaddr(skb, ifmca,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- RTM_GETMULTICAST,
- NLM_F_MULTI);
+ err = inet6_fill_ifmcaddr(skb, ifmca, fillargs);
if (err < 0)
break;
}
break;
case ANYCAST_ADDR:
+ fillargs->event = RTM_GETANYCAST;
/* anycast address */
for (ifaca = idev->ac_list; ifaca;
ifaca = ifaca->aca_next, ip_idx++) {
if (ip_idx < s_ip_idx)
continue;
- err = inet6_fill_ifacaddr(skb, ifaca,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- RTM_GETANYCAST,
- NLM_F_MULTI);
+ err = inet6_fill_ifacaddr(skb, ifaca, fillargs);
if (err < 0)
break;
}
@@ -4979,42 +5012,125 @@ static int in6_dump_addrs(struct inet6_dev *idev, struct sk_buff *skb,
break;
}
read_unlock_bh(&idev->lock);
- *p_ip_idx = ip_idx;
+ cb->args[2] = ip_idx;
return err;
}
+static int inet6_valid_dump_ifaddr_req(const struct nlmsghdr *nlh,
+ struct inet6_fill_args *fillargs,
+ struct net **tgt_net, struct sock *sk,
+ struct netlink_callback *cb)
+{
+ struct netlink_ext_ack *extack = cb->extack;
+ struct nlattr *tb[IFA_MAX+1];
+ struct ifaddrmsg *ifm;
+ int err, i;
+
+ if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid header for address dump request");
+ return -EINVAL;
+ }
+
+ ifm = nlmsg_data(nlh);
+ if (ifm->ifa_prefixlen || ifm->ifa_flags || ifm->ifa_scope) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for address dump request");
+ return -EINVAL;
+ }
+
+ fillargs->ifindex = ifm->ifa_index;
+ if (fillargs->ifindex) {
+ cb->answer_flags |= NLM_F_DUMP_FILTERED;
+ fillargs->flags |= NLM_F_DUMP_FILTERED;
+ }
+
+ err = nlmsg_parse_strict(nlh, sizeof(*ifm), tb, IFA_MAX,
+ ifa_ipv6_policy, extack);
+ if (err < 0)
+ return err;
+
+ for (i = 0; i <= IFA_MAX; ++i) {
+ if (!tb[i])
+ continue;
+
+ if (i == IFA_TARGET_NETNSID) {
+ struct net *net;
+
+ fillargs->netnsid = nla_get_s32(tb[i]);
+ net = rtnl_get_net_ns_capable(sk, fillargs->netnsid);
+ if (IS_ERR(net)) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid target network namespace id");
+ return PTR_ERR(net);
+ }
+ *tgt_net = net;
+ } else {
+ NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in dump request");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb,
enum addr_type_t type)
{
+ const struct nlmsghdr *nlh = cb->nlh;
+ struct inet6_fill_args fillargs = {
+ .portid = NETLINK_CB(cb->skb).portid,
+ .seq = cb->nlh->nlmsg_seq,
+ .flags = NLM_F_MULTI,
+ .netnsid = -1,
+ .type = type,
+ };
struct net *net = sock_net(skb->sk);
+ struct net *tgt_net = net;
+ int idx, s_idx, s_ip_idx;
int h, s_h;
- int idx, ip_idx;
- int s_idx, s_ip_idx;
struct net_device *dev;
struct inet6_dev *idev;
struct hlist_head *head;
s_h = cb->args[0];
s_idx = idx = cb->args[1];
- s_ip_idx = ip_idx = cb->args[2];
+ s_ip_idx = cb->args[2];
+
+ if (cb->strict_check) {
+ int err;
+
+ err = inet6_valid_dump_ifaddr_req(nlh, &fillargs, &tgt_net,
+ skb->sk, cb);
+ if (err < 0)
+ return err;
+
+ if (fillargs.ifindex) {
+ dev = __dev_get_by_index(tgt_net, fillargs.ifindex);
+ if (!dev)
+ return -ENODEV;
+ idev = __in6_dev_get(dev);
+ if (idev) {
+ err = in6_dump_addrs(idev, skb, cb, s_ip_idx,
+ &fillargs);
+ }
+ goto put_tgt_net;
+ }
+ }
rcu_read_lock();
- cb->seq = atomic_read(&net->ipv6.dev_addr_genid) ^ net->dev_base_seq;
+ cb->seq = atomic_read(&tgt_net->ipv6.dev_addr_genid) ^ tgt_net->dev_base_seq;
for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
idx = 0;
- head = &net->dev_index_head[h];
+ head = &tgt_net->dev_index_head[h];
hlist_for_each_entry_rcu(dev, head, index_hlist) {
if (idx < s_idx)
goto cont;
if (h > s_h || idx > s_idx)
s_ip_idx = 0;
- ip_idx = 0;
idev = __in6_dev_get(dev);
if (!idev)
goto cont;
- if (in6_dump_addrs(idev, skb, cb, type,
- s_ip_idx, &ip_idx) < 0)
+ if (in6_dump_addrs(idev, skb, cb, s_ip_idx,
+ &fillargs) < 0)
goto done;
cont:
idx++;
@@ -5024,7 +5140,9 @@ done:
rcu_read_unlock();
cb->args[0] = h;
cb->args[1] = idx;
- cb->args[2] = ip_idx;
+put_tgt_net:
+ if (fillargs.netnsid >= 0)
+ put_net(tgt_net);
return skb->len;
}
@@ -5055,6 +5173,14 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
struct net *net = sock_net(in_skb->sk);
+ struct inet6_fill_args fillargs = {
+ .portid = NETLINK_CB(in_skb).portid,
+ .seq = nlh->nlmsg_seq,
+ .event = RTM_NEWADDR,
+ .flags = 0,
+ .netnsid = -1,
+ };
+ struct net *tgt_net = net;
struct ifaddrmsg *ifm;
struct nlattr *tb[IFA_MAX+1];
struct in6_addr *addr = NULL, *peer;
@@ -5068,15 +5194,24 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh,
if (err < 0)
return err;
+ if (tb[IFA_TARGET_NETNSID]) {
+ fillargs.netnsid = nla_get_s32(tb[IFA_TARGET_NETNSID]);
+
+ tgt_net = rtnl_get_net_ns_capable(NETLINK_CB(in_skb).sk,
+ fillargs.netnsid);
+ if (IS_ERR(tgt_net))
+ return PTR_ERR(tgt_net);
+ }
+
addr = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL], &peer);
if (!addr)
return -EINVAL;
ifm = nlmsg_data(nlh);
if (ifm->ifa_index)
- dev = dev_get_by_index(net, ifm->ifa_index);
+ dev = dev_get_by_index(tgt_net, ifm->ifa_index);
- ifa = ipv6_get_ifaddr(net, addr, dev, 1);
+ ifa = ipv6_get_ifaddr(tgt_net, addr, dev, 1);
if (!ifa) {
err = -EADDRNOTAVAIL;
goto errout;
@@ -5088,20 +5223,22 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh,
goto errout_ifa;
}
- err = inet6_fill_ifaddr(skb, ifa, NETLINK_CB(in_skb).portid,
- nlh->nlmsg_seq, RTM_NEWADDR, 0);
+ err = inet6_fill_ifaddr(skb, ifa, &fillargs);
if (err < 0) {
/* -EMSGSIZE implies BUG in inet6_ifaddr_msgsize() */
WARN_ON(err == -EMSGSIZE);
kfree_skb(skb);
goto errout_ifa;
}
- err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
+ err = rtnl_unicast(skb, tgt_net, NETLINK_CB(in_skb).portid);
errout_ifa:
in6_ifa_put(ifa);
errout:
if (dev)
dev_put(dev);
+ if (fillargs.netnsid >= 0)
+ put_net(tgt_net);
+
return err;
}
@@ -5109,13 +5246,20 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa)
{
struct sk_buff *skb;
struct net *net = dev_net(ifa->idev->dev);
+ struct inet6_fill_args fillargs = {
+ .portid = 0,
+ .seq = 0,
+ .event = event,
+ .flags = 0,
+ .netnsid = -1,
+ };
int err = -ENOBUFS;
skb = nlmsg_new(inet6_ifaddr_msgsize(), GFP_ATOMIC);
if (!skb)
goto errout;
- err = inet6_fill_ifaddr(skb, ifa, 0, 0, event, 0);
+ err = inet6_fill_ifaddr(skb, ifa, &fillargs);
if (err < 0) {
/* -EMSGSIZE implies BUG in inet6_ifaddr_msgsize() */
WARN_ON(err == -EMSGSIZE);
@@ -5211,7 +5355,9 @@ static inline size_t inet6_ifla6_size(void)
+ nla_total_size(DEVCONF_MAX * 4) /* IFLA_INET6_CONF */
+ nla_total_size(IPSTATS_MIB_MAX * 8) /* IFLA_INET6_STATS */
+ nla_total_size(ICMP6_MIB_MAX * 8) /* IFLA_INET6_ICMP6STATS */
- + nla_total_size(sizeof(struct in6_addr)); /* IFLA_INET6_TOKEN */
+ + nla_total_size(sizeof(struct in6_addr)) /* IFLA_INET6_TOKEN */
+ + nla_total_size(1) /* IFLA_INET6_ADDR_GEN_MODE */
+ + 0;
}
static inline size_t inet6_if_nlmsg_size(void)
@@ -5529,6 +5675,31 @@ nla_put_failure:
return -EMSGSIZE;
}
+static int inet6_valid_dump_ifinfo(const struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct ifinfomsg *ifm;
+
+ if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid header for link dump request");
+ return -EINVAL;
+ }
+
+ if (nlmsg_attrlen(nlh, sizeof(*ifm))) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid data after header");
+ return -EINVAL;
+ }
+
+ ifm = nlmsg_data(nlh);
+ if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags ||
+ ifm->ifi_change || ifm->ifi_index) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for dump request");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int inet6_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
{
struct net *net = sock_net(skb->sk);
@@ -5538,6 +5709,16 @@ static int inet6_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
struct inet6_dev *idev;
struct hlist_head *head;
+ /* only requests using strict checking can pass data to
+ * influence the dump
+ */
+ if (cb->strict_check) {
+ int err = inet6_valid_dump_ifinfo(cb->nlh, cb->extack);
+
+ if (err < 0)
+ return err;
+ }
+
s_h = cb->args[0];
s_idx = cb->args[1];
@@ -5893,32 +6074,31 @@ static int addrconf_sysctl_addr_gen_mode(struct ctl_table *ctl, int write,
loff_t *ppos)
{
int ret = 0;
- int new_val;
+ u32 new_val;
struct inet6_dev *idev = (struct inet6_dev *)ctl->extra1;
struct net *net = (struct net *)ctl->extra2;
+ struct ctl_table tmp = {
+ .data = &new_val,
+ .maxlen = sizeof(new_val),
+ .mode = ctl->mode,
+ };
if (!rtnl_trylock())
return restart_syscall();
- ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
+ new_val = *((u32 *)ctl->data);
- if (write) {
- new_val = *((int *)ctl->data);
+ ret = proc_douintvec(&tmp, write, buffer, lenp, ppos);
+ if (ret != 0)
+ goto out;
+ if (write) {
if (check_addr_gen_mode(new_val) < 0) {
ret = -EINVAL;
goto out;
}
- /* request for default */
- if (&net->ipv6.devconf_dflt->addr_gen_mode == ctl->data) {
- ipv6_devconf_dflt.addr_gen_mode = new_val;
-
- /* request for individual net device */
- } else {
- if (!idev)
- goto out;
-
+ if (idev) {
if (check_stable_privacy(idev, net, new_val) < 0) {
ret = -EINVAL;
goto out;
@@ -5928,7 +6108,21 @@ static int addrconf_sysctl_addr_gen_mode(struct ctl_table *ctl, int write,
idev->cnf.addr_gen_mode = new_val;
addrconf_dev_config(idev->dev);
}
+ } else if (&net->ipv6.devconf_all->addr_gen_mode == ctl->data) {
+ struct net_device *dev;
+
+ net->ipv6.devconf_dflt->addr_gen_mode = new_val;
+ for_each_netdev(net, dev) {
+ idev = __in6_dev_get(dev);
+ if (idev &&
+ idev->cnf.addr_gen_mode != new_val) {
+ idev->cnf.addr_gen_mode = new_val;
+ addrconf_dev_config(idev->dev);
+ }
+ }
}
+
+ *((u32 *)ctl->data) = new_val;
}
out:
diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c
index 1d6ced37ad71..0d1ee82ee55b 100644
--- a/net/ipv6/addrlabel.c
+++ b/net/ipv6/addrlabel.c
@@ -458,20 +458,52 @@ static int ip6addrlbl_fill(struct sk_buff *skb,
return 0;
}
+static int ip6addrlbl_valid_dump_req(const struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct ifaddrlblmsg *ifal;
+
+ if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifal))) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid header for address label dump request");
+ return -EINVAL;
+ }
+
+ ifal = nlmsg_data(nlh);
+ if (ifal->__ifal_reserved || ifal->ifal_prefixlen ||
+ ifal->ifal_flags || ifal->ifal_index || ifal->ifal_seq) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for address label dump request");
+ return -EINVAL;
+ }
+
+ if (nlmsg_attrlen(nlh, sizeof(*ifal))) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid data after header for address label dump requewst");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
+ const struct nlmsghdr *nlh = cb->nlh;
struct net *net = sock_net(skb->sk);
struct ip6addrlbl_entry *p;
int idx = 0, s_idx = cb->args[0];
int err;
+ if (cb->strict_check) {
+ err = ip6addrlbl_valid_dump_req(nlh, cb->extack);
+ if (err < 0)
+ return err;
+ }
+
rcu_read_lock();
hlist_for_each_entry_rcu(p, &net->ipv6.ip6addrlbl_table.head, list) {
if (idx >= s_idx) {
err = ip6addrlbl_fill(skb, p,
net->ipv6.ip6addrlbl_table.seq,
NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
+ nlh->nlmsg_seq,
RTM_NEWADDRLABEL,
NLM_F_MULTI);
if (err < 0)
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 9ed0eae91758..3f4d61017a69 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -209,6 +209,7 @@ lookup_protocol:
np->hop_limit = -1;
np->mcast_hops = IPV6_DEFAULT_MCASTHOPS;
np->mc_loop = 1;
+ np->mc_all = 1;
np->pmtudisc = IPV6_PMTUDISC_WANT;
np->repflow = net->ipv6.sysctl.flowlabel_reflect;
sk->sk_ipv6only = net->ipv6.sysctl.bindv6only;
@@ -322,8 +323,7 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
/* Reproduce AF_INET checks to make the bindings consistent */
v4addr = addr->sin6_addr.s6_addr32[3];
chk_addr_ret = inet_addr_type(net, v4addr);
- if (!net->ipv4.sysctl_ip_nonlocal_bind &&
- !(inet->freebind || inet->transparent) &&
+ if (!inet_can_nonlocal_bind(net, inet) &&
v4addr != htonl(INADDR_ANY) &&
chk_addr_ret != RTN_LOCAL &&
chk_addr_ret != RTN_MULTICAST &&
@@ -362,8 +362,7 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
*/
v4addr = LOOPBACK4_IPV6;
if (!(addr_type & IPV6_ADDR_MULTICAST)) {
- if (!net->ipv6.sysctl.ip_nonlocal_bind &&
- !(inet->freebind || inet->transparent) &&
+ if (!ipv6_can_nonlocal_bind(net, inet) &&
!ipv6_chk_addr(net, &addr->sin6_addr,
dev, 0)) {
err = -EADDRNOTAVAIL;
@@ -469,12 +468,10 @@ void inet6_destroy_sock(struct sock *sk)
/* Release rx options */
skb = xchg(&np->pktoptions, NULL);
- if (skb)
- kfree_skb(skb);
+ kfree_skb(skb);
skb = xchg(&np->rxpmtu, NULL);
- if (skb)
- kfree_skb(skb);
+ kfree_skb(skb);
/* Free flowlabels */
fl6_free_socklist(sk);
@@ -764,6 +761,7 @@ EXPORT_SYMBOL_GPL(ipv6_opt_accepted);
static struct packet_type ipv6_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_IPV6),
.func = ipv6_rcv,
+ .list_func = ipv6_list_rcv,
};
static int __init ipv6_packet_init(void)
@@ -833,6 +831,7 @@ static int __net_init inet6_net_init(struct net *net)
net->ipv6.sysctl.bindv6only = 0;
net->ipv6.sysctl.icmpv6_time = 1*HZ;
+ net->ipv6.sysctl.icmpv6_echo_ignore_all = 0;
net->ipv6.sysctl.flowlabel_consistency = 1;
net->ipv6.sysctl.auto_flowlabels = IP6_DEFAULT_AUTO_FLOW_LABELS;
net->ipv6.sysctl.idgen_retries = 3;
@@ -902,6 +901,7 @@ static const struct ipv6_stub ipv6_stub_impl = {
static const struct ipv6_bpf_stub ipv6_bpf_stub_impl = {
.inet6_bind = __inet6_bind,
+ .udp6_lib_lookup = __udp6_lib_lookup,
};
static int __init inet6_init(void)
@@ -938,14 +938,14 @@ static int __init inet6_init(void)
err = proto_register(&pingv6_prot, 1);
if (err)
- goto out_unregister_ping_proto;
+ goto out_unregister_raw_proto;
/* We MUST register RAW sockets before we create the ICMP6,
* IGMP6, or NDISC control sockets.
*/
err = rawv6_init();
if (err)
- goto out_unregister_raw_proto;
+ goto out_unregister_ping_proto;
/* Register the family here so that the init calls below will
* be able to create sockets. (?? is this dangerous ??)
@@ -1113,11 +1113,11 @@ netfilter_fail:
igmp_fail:
ndisc_cleanup();
ndisc_fail:
- ip6_mr_cleanup();
+ icmpv6_cleanup();
icmp_fail:
- unregister_pernet_subsys(&inet6_net_ops);
+ ip6_mr_cleanup();
ipmr_fail:
- icmpv6_cleanup();
+ unregister_pernet_subsys(&inet6_net_ops);
register_pernet_fail:
sock_unregister(PF_INET6);
rtnl_unregister_all(PF_INET6);
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 1a1f876f8e28..1ede7a16a0be 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -739,7 +739,7 @@ EXPORT_SYMBOL_GPL(ip6_datagram_recv_ctl);
int ip6_datagram_send_ctl(struct net *net, struct sock *sk,
struct msghdr *msg, struct flowi6 *fl6,
- struct ipcm6_cookie *ipc6, struct sockcm_cookie *sockc)
+ struct ipcm6_cookie *ipc6)
{
struct in6_pktinfo *src_info;
struct cmsghdr *cmsg;
@@ -758,7 +758,7 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk,
}
if (cmsg->cmsg_level == SOL_SOCKET) {
- err = __sock_cmsg_send(sk, msg, cmsg, sockc);
+ err = __sock_cmsg_send(sk, msg, cmsg, &ipc6->sockc);
if (err)
return err;
continue;
@@ -803,7 +803,7 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk,
if (addr_type != IPV6_ADDR_ANY) {
int strict = __ipv6_addr_src_scope(addr_type) <= IPV6_ADDR_SCOPE_LINKLOCAL;
- if (!(inet_sk(sk)->freebind || inet_sk(sk)->transparent) &&
+ if (!ipv6_can_nonlocal_bind(net, inet_sk(sk)) &&
!ipv6_chk_addr_and_flags(net, &src_info->ipi6_addr,
dev, !strict, 0,
IFA_F_TENTATIVE) &&
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 88a7579c23bd..63b2b66f9dfa 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -601,12 +601,11 @@ static void esp_input_done_esn(struct crypto_async_request *base, int err)
static int esp6_input(struct xfrm_state *x, struct sk_buff *skb)
{
- struct ip_esp_hdr *esph;
struct crypto_aead *aead = x->data;
struct aead_request *req;
struct sk_buff *trailer;
int ivlen = crypto_aead_ivsize(aead);
- int elen = skb->len - sizeof(*esph) - ivlen;
+ int elen = skb->len - sizeof(struct ip_esp_hdr) - ivlen;
int nfrags;
int assoclen;
int seqhilen;
@@ -616,7 +615,7 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb)
u8 *iv;
struct scatterlist *sg;
- if (!pskb_may_pull(skb, sizeof(*esph) + ivlen)) {
+ if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr) + ivlen)) {
ret = -EINVAL;
goto out;
}
@@ -626,7 +625,7 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb)
goto out;
}
- assoclen = sizeof(*esph);
+ assoclen = sizeof(struct ip_esp_hdr);
seqhilen = 0;
if (x->props.flags & XFRM_STATE_ESN) {
diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c
index 27f59b61f70f..6177e2171171 100644
--- a/net/ipv6/esp6_offload.c
+++ b/net/ipv6/esp6_offload.c
@@ -49,8 +49,8 @@ static __u16 esp6_nexthdr_esp_offset(struct ipv6hdr *ipv6_hdr, int nhlen)
return 0;
}
-static struct sk_buff **esp6_gro_receive(struct sk_buff **head,
- struct sk_buff *skb)
+static struct sk_buff *esp6_gro_receive(struct list_head *head,
+ struct sk_buff *skb)
{
int offset = skb_gro_offset(skb);
struct xfrm_offload *xo;
@@ -162,8 +162,7 @@ static struct sk_buff *esp6_gso_segment(struct sk_buff *skb,
skb->encap_hdr_csum = 1;
- if (!(features & NETIF_F_HW_ESP) || !x->xso.offload_handle ||
- (x->xso.dev != skb->dev))
+ if (!(features & NETIF_F_HW_ESP) || x->xso.dev != skb->dev)
esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK);
else if (!(features & NETIF_F_HW_ESP_TX_CSUM))
esp_features = features & ~NETIF_F_CSUM_MASK;
@@ -207,8 +206,7 @@ static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features
if (!xo)
return -EINVAL;
- if (!(features & NETIF_F_HW_ESP) || !x->xso.offload_handle ||
- (x->xso.dev != skb->dev)) {
+ if (!(features & NETIF_F_HW_ESP) || x->xso.dev != skb->dev) {
xo->flags |= CRYPTO_FALLBACK;
hw_offload = false;
}
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index ef2505aefc15..c9c53ade55c3 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -92,7 +92,7 @@ static void icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
struct net *net = dev_net(skb->dev);
if (type == ICMPV6_PKT_TOOBIG)
- ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL));
+ ip6_update_pmtu(skb, net, info, skb->dev->ifindex, 0, sock_net_uid(net, NULL));
else if (type == NDISC_REDIRECT)
ip6_redirect(skb, net, skb->dev->ifindex, 0,
sock_net_uid(net, NULL));
@@ -431,7 +431,6 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
struct icmp6hdr tmp_hdr;
struct flowi6 fl6;
struct icmpv6_msg msg;
- struct sockcm_cookie sockc_unused = {0};
struct ipcm6_cookie ipc6;
int iif = 0;
int addr_type = 0;
@@ -546,7 +545,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
else if (!fl6.flowi6_oif)
fl6.flowi6_oif = np->ucast_oif;
- ipc6.tclass = np->tclass;
+ ipcm6_init_sk(&ipc6, np);
fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel);
dst = icmpv6_route_lookup(net, skb, sk, &fl6);
@@ -554,8 +553,6 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
goto out;
ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
- ipc6.dontfrag = np->dontfrag;
- ipc6.opt = NULL;
msg.skb = skb;
msg.offset = skb_network_offset(skb);
@@ -576,7 +573,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
len + sizeof(struct icmp6hdr),
sizeof(struct icmp6hdr),
&ipc6, &fl6, (struct rt6_info *)dst,
- MSG_DONTWAIT, &sockc_unused)) {
+ MSG_DONTWAIT)) {
ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS);
ip6_flush_pending_frames(sk);
} else {
@@ -680,7 +677,6 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
struct dst_entry *dst;
struct ipcm6_cookie ipc6;
u32 mark = IP6_REPLY_MARK(net, skb->mark);
- struct sockcm_cookie sockc_unused = {0};
saddr = &ipv6_hdr(skb)->daddr;
@@ -727,16 +723,14 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
msg.offset = 0;
msg.type = ICMPV6_ECHO_REPLY;
+ ipcm6_init_sk(&ipc6, np);
ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
ipc6.tclass = ipv6_get_dsfield(ipv6_hdr(skb));
- ipc6.dontfrag = np->dontfrag;
- ipc6.opt = NULL;
if (ip6_append_data(sk, icmpv6_getfrag, &msg,
skb->len + sizeof(struct icmp6hdr),
sizeof(struct icmp6hdr), &ipc6, &fl6,
- (struct rt6_info *)dst, MSG_DONTWAIT,
- &sockc_unused)) {
+ (struct rt6_info *)dst, MSG_DONTWAIT)) {
__ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS);
ip6_flush_pending_frames(sk);
} else {
@@ -800,6 +794,7 @@ out:
static int icmpv6_rcv(struct sk_buff *skb)
{
+ struct net *net = dev_net(skb->dev);
struct net_device *dev = skb->dev;
struct inet6_dev *idev = __in6_dev_get(dev);
const struct in6_addr *saddr, *daddr;
@@ -849,7 +844,8 @@ static int icmpv6_rcv(struct sk_buff *skb)
switch (type) {
case ICMPV6_ECHO_REQUEST:
- icmpv6_echo_reply(skb);
+ if (!net->ipv6.sysctl.icmpv6_echo_ignore_all)
+ icmpv6_echo_reply(skb);
break;
case ICMPV6_ECHO_REPLY:
@@ -1110,6 +1106,13 @@ static struct ctl_table ipv6_icmp_table_template[] = {
.mode = 0644,
.proc_handler = proc_dointvec_ms_jiffies,
},
+ {
+ .procname = "echo_ignore_all",
+ .data = &init_net.ipv6.sysctl.icmpv6_echo_ignore_all,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
{ },
};
@@ -1121,9 +1124,10 @@ struct ctl_table * __net_init ipv6_icmp_sysctl_init(struct net *net)
sizeof(ipv6_icmp_table_template),
GFP_KERNEL);
- if (table)
+ if (table) {
table[0].data = &net->ipv6.sysctl.icmpv6_time;
-
+ table[1].data = &net->ipv6.sysctl.icmpv6_echo_ignore_all;
+ }
return table;
}
#endif
diff --git a/net/ipv6/ila/Makefile b/net/ipv6/ila/Makefile
index 4b32e5921e5c..b7739aba6e68 100644
--- a/net/ipv6/ila/Makefile
+++ b/net/ipv6/ila/Makefile
@@ -4,4 +4,4 @@
obj-$(CONFIG_IPV6_ILA) += ila.o
-ila-objs := ila_common.o ila_lwt.o ila_xlat.o
+ila-objs := ila_main.o ila_common.o ila_lwt.o ila_xlat.o
diff --git a/net/ipv6/ila/ila.h b/net/ipv6/ila/ila.h
index 3c7a11b62334..1f747bcbec29 100644
--- a/net/ipv6/ila/ila.h
+++ b/net/ipv6/ila/ila.h
@@ -19,6 +19,7 @@
#include <linux/skbuff.h>
#include <linux/types.h>
#include <net/checksum.h>
+#include <net/genetlink.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <uapi/linux/ila.h>
@@ -104,9 +105,31 @@ void ila_update_ipv6_locator(struct sk_buff *skb, struct ila_params *p,
void ila_init_saved_csum(struct ila_params *p);
+struct ila_net {
+ struct {
+ struct rhashtable rhash_table;
+ spinlock_t *locks; /* Bucket locks for entry manipulation */
+ unsigned int locks_mask;
+ bool hooks_registered;
+ } xlat;
+};
+
int ila_lwt_init(void);
void ila_lwt_fini(void);
-int ila_xlat_init(void);
-void ila_xlat_fini(void);
+
+int ila_xlat_init_net(struct net *net);
+void ila_xlat_exit_net(struct net *net);
+
+int ila_xlat_nl_cmd_add_mapping(struct sk_buff *skb, struct genl_info *info);
+int ila_xlat_nl_cmd_del_mapping(struct sk_buff *skb, struct genl_info *info);
+int ila_xlat_nl_cmd_get_mapping(struct sk_buff *skb, struct genl_info *info);
+int ila_xlat_nl_cmd_flush(struct sk_buff *skb, struct genl_info *info);
+int ila_xlat_nl_dump_start(struct netlink_callback *cb);
+int ila_xlat_nl_dump_done(struct netlink_callback *cb);
+int ila_xlat_nl_dump(struct sk_buff *skb, struct netlink_callback *cb);
+
+extern unsigned int ila_net_id;
+
+extern struct genl_family ila_nl_family;
#endif /* __ILA_H */
diff --git a/net/ipv6/ila/ila_common.c b/net/ipv6/ila/ila_common.c
index 8c88ecf29b93..95e9146918cc 100644
--- a/net/ipv6/ila/ila_common.c
+++ b/net/ipv6/ila/ila_common.c
@@ -153,34 +153,3 @@ void ila_update_ipv6_locator(struct sk_buff *skb, struct ila_params *p,
/* Now change destination address */
iaddr->loc = p->locator;
}
-
-static int __init ila_init(void)
-{
- int ret;
-
- ret = ila_lwt_init();
-
- if (ret)
- goto fail_lwt;
-
- ret = ila_xlat_init();
- if (ret)
- goto fail_xlat;
-
- return 0;
-fail_xlat:
- ila_lwt_fini();
-fail_lwt:
- return ret;
-}
-
-static void __exit ila_fini(void)
-{
- ila_xlat_fini();
- ila_lwt_fini();
-}
-
-module_init(ila_init);
-module_exit(ila_fini);
-MODULE_AUTHOR("Tom Herbert <tom@herbertland.com>");
-MODULE_LICENSE("GPL");
diff --git a/net/ipv6/ila/ila_main.c b/net/ipv6/ila/ila_main.c
new file mode 100644
index 000000000000..18fac76b9520
--- /dev/null
+++ b/net/ipv6/ila/ila_main.c
@@ -0,0 +1,121 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <net/genetlink.h>
+#include <net/ila.h>
+#include <net/netns/generic.h>
+#include <uapi/linux/genetlink.h>
+#include "ila.h"
+
+static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = {
+ [ILA_ATTR_LOCATOR] = { .type = NLA_U64, },
+ [ILA_ATTR_LOCATOR_MATCH] = { .type = NLA_U64, },
+ [ILA_ATTR_IFINDEX] = { .type = NLA_U32, },
+ [ILA_ATTR_CSUM_MODE] = { .type = NLA_U8, },
+ [ILA_ATTR_IDENT_TYPE] = { .type = NLA_U8, },
+};
+
+static const struct genl_ops ila_nl_ops[] = {
+ {
+ .cmd = ILA_CMD_ADD,
+ .doit = ila_xlat_nl_cmd_add_mapping,
+ .policy = ila_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = ILA_CMD_DEL,
+ .doit = ila_xlat_nl_cmd_del_mapping,
+ .policy = ila_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = ILA_CMD_FLUSH,
+ .doit = ila_xlat_nl_cmd_flush,
+ .policy = ila_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = ILA_CMD_GET,
+ .doit = ila_xlat_nl_cmd_get_mapping,
+ .start = ila_xlat_nl_dump_start,
+ .dumpit = ila_xlat_nl_dump,
+ .done = ila_xlat_nl_dump_done,
+ .policy = ila_nl_policy,
+ },
+};
+
+unsigned int ila_net_id;
+
+struct genl_family ila_nl_family __ro_after_init = {
+ .hdrsize = 0,
+ .name = ILA_GENL_NAME,
+ .version = ILA_GENL_VERSION,
+ .maxattr = ILA_ATTR_MAX,
+ .netnsok = true,
+ .parallel_ops = true,
+ .module = THIS_MODULE,
+ .ops = ila_nl_ops,
+ .n_ops = ARRAY_SIZE(ila_nl_ops),
+};
+
+static __net_init int ila_init_net(struct net *net)
+{
+ int err;
+
+ err = ila_xlat_init_net(net);
+ if (err)
+ goto ila_xlat_init_fail;
+
+ return 0;
+
+ila_xlat_init_fail:
+ return err;
+}
+
+static __net_exit void ila_exit_net(struct net *net)
+{
+ ila_xlat_exit_net(net);
+}
+
+static struct pernet_operations ila_net_ops = {
+ .init = ila_init_net,
+ .exit = ila_exit_net,
+ .id = &ila_net_id,
+ .size = sizeof(struct ila_net),
+};
+
+static int __init ila_init(void)
+{
+ int ret;
+
+ ret = register_pernet_device(&ila_net_ops);
+ if (ret)
+ goto register_device_fail;
+
+ ret = genl_register_family(&ila_nl_family);
+ if (ret)
+ goto register_family_fail;
+
+ ret = ila_lwt_init();
+ if (ret)
+ goto fail_lwt;
+
+ return 0;
+
+fail_lwt:
+ genl_unregister_family(&ila_nl_family);
+register_family_fail:
+ unregister_pernet_device(&ila_net_ops);
+register_device_fail:
+ return ret;
+}
+
+static void __exit ila_fini(void)
+{
+ ila_lwt_fini();
+ genl_unregister_family(&ila_nl_family);
+ unregister_pernet_device(&ila_net_ops);
+}
+
+module_init(ila_init);
+module_exit(ila_fini);
+MODULE_AUTHOR("Tom Herbert <tom@herbertland.com>");
+MODULE_LICENSE("GPL");
diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c
index 10ae13560b40..17c455ff69ff 100644
--- a/net/ipv6/ila/ila_xlat.c
+++ b/net/ipv6/ila/ila_xlat.c
@@ -22,36 +22,14 @@ struct ila_map {
struct rcu_head rcu;
};
-static unsigned int ila_net_id;
-
-struct ila_net {
- struct rhashtable rhash_table;
- spinlock_t *locks; /* Bucket locks for entry manipulation */
- unsigned int locks_mask;
- bool hooks_registered;
-};
-
+#define MAX_LOCKS 1024
#define LOCKS_PER_CPU 10
static int alloc_ila_locks(struct ila_net *ilan)
{
- unsigned int i, size;
- unsigned int nr_pcpus = num_possible_cpus();
-
- nr_pcpus = min_t(unsigned int, nr_pcpus, 32UL);
- size = roundup_pow_of_two(nr_pcpus * LOCKS_PER_CPU);
-
- if (sizeof(spinlock_t) != 0) {
- ilan->locks = kvmalloc_array(size, sizeof(spinlock_t),
- GFP_KERNEL);
- if (!ilan->locks)
- return -ENOMEM;
- for (i = 0; i < size; i++)
- spin_lock_init(&ilan->locks[i]);
- }
- ilan->locks_mask = size - 1;
-
- return 0;
+ return alloc_bucket_spinlocks(&ilan->xlat.locks, &ilan->xlat.locks_mask,
+ MAX_LOCKS, LOCKS_PER_CPU,
+ GFP_KERNEL);
}
static u32 hashrnd __read_mostly;
@@ -71,7 +49,7 @@ static inline u32 ila_locator_hash(struct ila_locator loc)
static inline spinlock_t *ila_get_lock(struct ila_net *ilan,
struct ila_locator loc)
{
- return &ilan->locks[ila_locator_hash(loc) & ilan->locks_mask];
+ return &ilan->xlat.locks[ila_locator_hash(loc) & ilan->xlat.locks_mask];
}
static inline int ila_cmp_wildcards(struct ila_map *ila,
@@ -115,16 +93,6 @@ static const struct rhashtable_params rht_params = {
.obj_cmpfn = ila_cmpfn,
};
-static struct genl_family ila_nl_family;
-
-static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = {
- [ILA_ATTR_LOCATOR] = { .type = NLA_U64, },
- [ILA_ATTR_LOCATOR_MATCH] = { .type = NLA_U64, },
- [ILA_ATTR_IFINDEX] = { .type = NLA_U32, },
- [ILA_ATTR_CSUM_MODE] = { .type = NLA_U8, },
- [ILA_ATTR_IDENT_TYPE] = { .type = NLA_U8, },
-};
-
static int parse_nl_config(struct genl_info *info,
struct ila_xlat_params *xp)
{
@@ -162,7 +130,7 @@ static inline struct ila_map *ila_lookup_wildcards(struct ila_addr *iaddr,
{
struct ila_map *ila;
- ila = rhashtable_lookup_fast(&ilan->rhash_table, &iaddr->loc,
+ ila = rhashtable_lookup_fast(&ilan->xlat.rhash_table, &iaddr->loc,
rht_params);
while (ila) {
if (!ila_cmp_wildcards(ila, iaddr, ifindex))
@@ -179,7 +147,7 @@ static inline struct ila_map *ila_lookup_by_params(struct ila_xlat_params *xp,
{
struct ila_map *ila;
- ila = rhashtable_lookup_fast(&ilan->rhash_table,
+ ila = rhashtable_lookup_fast(&ilan->xlat.rhash_table,
&xp->ip.locator_match,
rht_params);
while (ila) {
@@ -196,9 +164,9 @@ static inline void ila_release(struct ila_map *ila)
kfree_rcu(ila, rcu);
}
-static void ila_free_cb(void *ptr, void *arg)
+static void ila_free_node(struct ila_map *ila)
{
- struct ila_map *ila = (struct ila_map *)ptr, *next;
+ struct ila_map *next;
/* Assume rcu_readlock held */
while (ila) {
@@ -208,6 +176,11 @@ static void ila_free_cb(void *ptr, void *arg)
}
}
+static void ila_free_cb(void *ptr, void *arg)
+{
+ ila_free_node((struct ila_map *)ptr);
+}
+
static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila);
static unsigned int
@@ -235,7 +208,7 @@ static int ila_add_mapping(struct net *net, struct ila_xlat_params *xp)
spinlock_t *lock = ila_get_lock(ilan, xp->ip.locator_match);
int err = 0, order;
- if (!ilan->hooks_registered) {
+ if (!ilan->xlat.hooks_registered) {
/* We defer registering net hooks in the namespace until the
* first mapping is added.
*/
@@ -244,7 +217,7 @@ static int ila_add_mapping(struct net *net, struct ila_xlat_params *xp)
if (err)
return err;
- ilan->hooks_registered = true;
+ ilan->xlat.hooks_registered = true;
}
ila = kzalloc(sizeof(*ila), GFP_KERNEL);
@@ -259,12 +232,12 @@ static int ila_add_mapping(struct net *net, struct ila_xlat_params *xp)
spin_lock(lock);
- head = rhashtable_lookup_fast(&ilan->rhash_table,
+ head = rhashtable_lookup_fast(&ilan->xlat.rhash_table,
&xp->ip.locator_match,
rht_params);
if (!head) {
/* New entry for the rhash_table */
- err = rhashtable_lookup_insert_fast(&ilan->rhash_table,
+ err = rhashtable_lookup_insert_fast(&ilan->xlat.rhash_table,
&ila->node, rht_params);
} else {
struct ila_map *tila = head, *prev = NULL;
@@ -290,7 +263,7 @@ static int ila_add_mapping(struct net *net, struct ila_xlat_params *xp)
} else {
/* Make this ila new head */
RCU_INIT_POINTER(ila->next, head);
- err = rhashtable_replace_fast(&ilan->rhash_table,
+ err = rhashtable_replace_fast(&ilan->xlat.rhash_table,
&head->node,
&ila->node, rht_params);
if (err)
@@ -316,7 +289,7 @@ static int ila_del_mapping(struct net *net, struct ila_xlat_params *xp)
spin_lock(lock);
- head = rhashtable_lookup_fast(&ilan->rhash_table,
+ head = rhashtable_lookup_fast(&ilan->xlat.rhash_table,
&xp->ip.locator_match, rht_params);
ila = head;
@@ -346,15 +319,15 @@ static int ila_del_mapping(struct net *net, struct ila_xlat_params *xp)
* table
*/
err = rhashtable_replace_fast(
- &ilan->rhash_table, &ila->node,
+ &ilan->xlat.rhash_table, &ila->node,
&head->node, rht_params);
if (err)
goto out;
} else {
/* Entry no longer used */
- err = rhashtable_remove_fast(&ilan->rhash_table,
- &ila->node,
- rht_params);
+ err = rhashtable_remove_fast(
+ &ilan->xlat.rhash_table,
+ &ila->node, rht_params);
}
}
@@ -369,7 +342,7 @@ out:
return err;
}
-static int ila_nl_cmd_add_mapping(struct sk_buff *skb, struct genl_info *info)
+int ila_xlat_nl_cmd_add_mapping(struct sk_buff *skb, struct genl_info *info)
{
struct net *net = genl_info_net(info);
struct ila_xlat_params p;
@@ -382,7 +355,7 @@ static int ila_nl_cmd_add_mapping(struct sk_buff *skb, struct genl_info *info)
return ila_add_mapping(net, &p);
}
-static int ila_nl_cmd_del_mapping(struct sk_buff *skb, struct genl_info *info)
+int ila_xlat_nl_cmd_del_mapping(struct sk_buff *skb, struct genl_info *info)
{
struct net *net = genl_info_net(info);
struct ila_xlat_params xp;
@@ -397,6 +370,59 @@ static int ila_nl_cmd_del_mapping(struct sk_buff *skb, struct genl_info *info)
return 0;
}
+static inline spinlock_t *lock_from_ila_map(struct ila_net *ilan,
+ struct ila_map *ila)
+{
+ return ila_get_lock(ilan, ila->xp.ip.locator_match);
+}
+
+int ila_xlat_nl_cmd_flush(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net *net = genl_info_net(info);
+ struct ila_net *ilan = net_generic(net, ila_net_id);
+ struct rhashtable_iter iter;
+ struct ila_map *ila;
+ spinlock_t *lock;
+ int ret;
+
+ ret = rhashtable_walk_init(&ilan->xlat.rhash_table, &iter, GFP_KERNEL);
+ if (ret)
+ goto done;
+
+ rhashtable_walk_start(&iter);
+
+ for (;;) {
+ ila = rhashtable_walk_next(&iter);
+
+ if (IS_ERR(ila)) {
+ if (PTR_ERR(ila) == -EAGAIN)
+ continue;
+ ret = PTR_ERR(ila);
+ goto done;
+ } else if (!ila) {
+ break;
+ }
+
+ lock = lock_from_ila_map(ilan, ila);
+
+ spin_lock(lock);
+
+ ret = rhashtable_remove_fast(&ilan->xlat.rhash_table,
+ &ila->node, rht_params);
+ if (!ret)
+ ila_free_node(ila);
+
+ spin_unlock(lock);
+
+ if (ret)
+ break;
+ }
+
+done:
+ rhashtable_walk_stop(&iter);
+ return ret;
+}
+
static int ila_fill_info(struct ila_map *ila, struct sk_buff *msg)
{
if (nla_put_u64_64bit(msg, ILA_ATTR_LOCATOR,
@@ -434,7 +460,7 @@ nla_put_failure:
return -EMSGSIZE;
}
-static int ila_nl_cmd_get_mapping(struct sk_buff *skb, struct genl_info *info)
+int ila_xlat_nl_cmd_get_mapping(struct sk_buff *skb, struct genl_info *info)
{
struct net *net = genl_info_net(info);
struct ila_net *ilan = net_generic(net, ila_net_id);
@@ -475,27 +501,34 @@ out_free:
struct ila_dump_iter {
struct rhashtable_iter rhiter;
+ int skip;
};
-static int ila_nl_dump_start(struct netlink_callback *cb)
+int ila_xlat_nl_dump_start(struct netlink_callback *cb)
{
struct net *net = sock_net(cb->skb->sk);
struct ila_net *ilan = net_generic(net, ila_net_id);
- struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args[0];
+ struct ila_dump_iter *iter;
+ int ret;
- if (!iter) {
- iter = kmalloc(sizeof(*iter), GFP_KERNEL);
- if (!iter)
- return -ENOMEM;
+ iter = kmalloc(sizeof(*iter), GFP_KERNEL);
+ if (!iter)
+ return -ENOMEM;
- cb->args[0] = (long)iter;
+ ret = rhashtable_walk_init(&ilan->xlat.rhash_table, &iter->rhiter,
+ GFP_KERNEL);
+ if (ret) {
+ kfree(iter);
+ return ret;
}
- return rhashtable_walk_init(&ilan->rhash_table, &iter->rhiter,
- GFP_KERNEL);
+ iter->skip = 0;
+ cb->args[0] = (long)iter;
+
+ return ret;
}
-static int ila_nl_dump_done(struct netlink_callback *cb)
+int ila_xlat_nl_dump_done(struct netlink_callback *cb)
{
struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args[0];
@@ -506,24 +539,49 @@ static int ila_nl_dump_done(struct netlink_callback *cb)
return 0;
}
-static int ila_nl_dump(struct sk_buff *skb, struct netlink_callback *cb)
+int ila_xlat_nl_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args[0];
struct rhashtable_iter *rhiter = &iter->rhiter;
+ int skip = iter->skip;
struct ila_map *ila;
int ret;
rhashtable_walk_start(rhiter);
- for (;;) {
- ila = rhashtable_walk_next(rhiter);
+ /* Get first entry */
+ ila = rhashtable_walk_peek(rhiter);
+
+ if (ila && !IS_ERR(ila) && skip) {
+ /* Skip over visited entries */
+
+ while (ila && skip) {
+ /* Skip over any ila entries in this list that we
+ * have already dumped.
+ */
+ ila = rcu_access_pointer(ila->next);
+ skip--;
+ }
+ }
+ skip = 0;
+
+ for (;;) {
if (IS_ERR(ila)) {
- if (PTR_ERR(ila) == -EAGAIN)
- continue;
ret = PTR_ERR(ila);
- goto done;
+ if (ret == -EAGAIN) {
+ /* Table has changed and iter has reset. Return
+ * -EAGAIN to the application even if we have
+ * written data to the skb. The application
+ * needs to deal with this.
+ */
+
+ goto out_ret;
+ } else {
+ break;
+ }
} else if (!ila) {
+ ret = 0;
break;
}
@@ -532,90 +590,54 @@ static int ila_nl_dump(struct sk_buff *skb, struct netlink_callback *cb)
cb->nlh->nlmsg_seq, NLM_F_MULTI,
skb, ILA_CMD_GET);
if (ret)
- goto done;
+ goto out;
+ skip++;
ila = rcu_access_pointer(ila->next);
}
+
+ skip = 0;
+ ila = rhashtable_walk_next(rhiter);
}
- ret = skb->len;
+out:
+ iter->skip = skip;
+ ret = (skb->len ? : ret);
-done:
+out_ret:
rhashtable_walk_stop(rhiter);
return ret;
}
-static const struct genl_ops ila_nl_ops[] = {
- {
- .cmd = ILA_CMD_ADD,
- .doit = ila_nl_cmd_add_mapping,
- .policy = ila_nl_policy,
- .flags = GENL_ADMIN_PERM,
- },
- {
- .cmd = ILA_CMD_DEL,
- .doit = ila_nl_cmd_del_mapping,
- .policy = ila_nl_policy,
- .flags = GENL_ADMIN_PERM,
- },
- {
- .cmd = ILA_CMD_GET,
- .doit = ila_nl_cmd_get_mapping,
- .start = ila_nl_dump_start,
- .dumpit = ila_nl_dump,
- .done = ila_nl_dump_done,
- .policy = ila_nl_policy,
- },
-};
-
-static struct genl_family ila_nl_family __ro_after_init = {
- .hdrsize = 0,
- .name = ILA_GENL_NAME,
- .version = ILA_GENL_VERSION,
- .maxattr = ILA_ATTR_MAX,
- .netnsok = true,
- .parallel_ops = true,
- .module = THIS_MODULE,
- .ops = ila_nl_ops,
- .n_ops = ARRAY_SIZE(ila_nl_ops),
-};
-
#define ILA_HASH_TABLE_SIZE 1024
-static __net_init int ila_init_net(struct net *net)
+int ila_xlat_init_net(struct net *net)
{
- int err;
struct ila_net *ilan = net_generic(net, ila_net_id);
+ int err;
err = alloc_ila_locks(ilan);
if (err)
return err;
- rhashtable_init(&ilan->rhash_table, &rht_params);
+ rhashtable_init(&ilan->xlat.rhash_table, &rht_params);
return 0;
}
-static __net_exit void ila_exit_net(struct net *net)
+void ila_xlat_exit_net(struct net *net)
{
struct ila_net *ilan = net_generic(net, ila_net_id);
- rhashtable_free_and_destroy(&ilan->rhash_table, ila_free_cb, NULL);
+ rhashtable_free_and_destroy(&ilan->xlat.rhash_table, ila_free_cb, NULL);
- kvfree(ilan->locks);
+ free_bucket_spinlocks(ilan->xlat.locks);
- if (ilan->hooks_registered)
+ if (ilan->xlat.hooks_registered)
nf_unregister_net_hooks(net, ila_nf_hook_ops,
ARRAY_SIZE(ila_nf_hook_ops));
}
-static struct pernet_operations ila_net_ops = {
- .init = ila_init_net,
- .exit = ila_exit_net,
- .id = &ila_net_id,
- .size = sizeof(struct ila_net),
-};
-
static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila)
{
struct ila_map *ila;
@@ -641,29 +663,3 @@ static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila)
return 0;
}
-
-int __init ila_xlat_init(void)
-{
- int ret;
-
- ret = register_pernet_device(&ila_net_ops);
- if (ret)
- goto exit;
-
- ret = genl_register_family(&ila_nl_family);
- if (ret < 0)
- goto unregister;
-
- return 0;
-
-unregister:
- unregister_pernet_device(&ila_net_ops);
-exit:
- return ret;
-}
-
-void ila_xlat_fini(void)
-{
- genl_unregister_family(&ila_nl_family);
- unregister_pernet_device(&ila_net_ops);
-}
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 595ad408dba0..3d7c7460a0c5 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -191,7 +191,7 @@ struct sock *inet6_lookup_listener(struct net *net,
saddr, sport, daddr, hnum,
dif, sdif);
if (result)
- return result;
+ goto done;
/* Lookup lhash2 with in6addr_any */
@@ -200,9 +200,10 @@ struct sock *inet6_lookup_listener(struct net *net,
if (ilb2->count > ilb->count)
goto port_lookup;
- return inet6_lhash2_lookup(net, ilb2, skb, doff,
- saddr, sport, daddr, hnum,
- dif, sdif);
+ result = inet6_lhash2_lookup(net, ilb2, skb, doff,
+ saddr, sport, daddr, hnum,
+ dif, sdif);
+ goto done;
port_lookup:
sk_for_each(sk, &ilb->head) {
@@ -214,12 +215,15 @@ port_lookup:
result = reuseport_select_sock(sk, phash,
skb, doff);
if (result)
- return result;
+ goto done;
}
result = sk;
hiscore = score;
}
}
+done:
+ if (unlikely(IS_ERR(result)))
+ return NULL;
return result;
}
EXPORT_SYMBOL_GPL(inet6_lookup_listener);
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index d212738e9d10..2a058b408a6a 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -29,6 +29,7 @@
#include <linux/list.h>
#include <linux/slab.h>
+#include <net/ip.h>
#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
@@ -46,6 +47,7 @@ struct fib6_cleaner {
int (*func)(struct fib6_info *, void *arg);
int sernum;
void *arg;
+ bool skip_notify;
};
#ifdef CONFIG_IPV6_SUBTREES
@@ -160,8 +162,6 @@ struct fib6_info *fib6_info_alloc(gfp_t gfp_flags)
}
INIT_LIST_HEAD(&f6i->fib6_siblings);
- f6i->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
-
atomic_inc(&f6i->fib6_ref);
return f6i;
@@ -171,7 +171,6 @@ void fib6_info_destroy_rcu(struct rcu_head *head)
{
struct fib6_info *f6i = container_of(head, struct fib6_info, rcu);
struct rt6_exception_bucket *bucket;
- struct dst_metrics *m;
WARN_ON(f6i->fib6_node);
@@ -196,14 +195,16 @@ void fib6_info_destroy_rcu(struct rcu_head *head)
*ppcpu_rt = NULL;
}
}
+
+ free_percpu(f6i->rt6i_pcpu);
}
+ lwtstate_put(f6i->fib6_nh.nh_lwtstate);
+
if (f6i->fib6_nh.nh_dev)
dev_put(f6i->fib6_nh.nh_dev);
- m = f6i->fib6_metrics;
- if (m != &dst_default_metrics && refcount_dec_and_test(&m->refcnt))
- kfree(m);
+ ip_fib_metrics_put(f6i->fib6_metrics);
kfree(f6i);
}
@@ -566,17 +567,31 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb,
static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
{
+ const struct nlmsghdr *nlh = cb->nlh;
struct net *net = sock_net(skb->sk);
+ struct rt6_rtnl_dump_arg arg = {};
unsigned int h, s_h;
unsigned int e = 0, s_e;
- struct rt6_rtnl_dump_arg arg;
struct fib6_walker *w;
struct fib6_table *tb;
struct hlist_head *head;
int res = 0;
- s_h = cb->args[0];
- s_e = cb->args[1];
+ if (cb->strict_check) {
+ int err;
+
+ err = ip_valid_fib_dump_req(net, nlh, &arg.filter, cb);
+ if (err < 0)
+ return err;
+ } else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) {
+ struct rtmsg *rtm = nlmsg_data(nlh);
+
+ arg.filter.flags = rtm->rtm_flags & (RTM_F_PREFIX|RTM_F_CLONED);
+ }
+
+ /* fib entries are never clones */
+ if (arg.filter.flags & RTM_F_CLONED)
+ return skb->len;
w = (void *)cb->args[2];
if (!w) {
@@ -602,6 +617,20 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
arg.net = net;
w->args = &arg;
+ if (arg.filter.table_id) {
+ tb = fib6_get_table(net, arg.filter.table_id);
+ if (!tb) {
+ NL_SET_ERR_MSG_MOD(cb->extack, "FIB table does not exist");
+ return -ENOENT;
+ }
+
+ res = fib6_dump_table(tb, skb, cb);
+ goto out;
+ }
+
+ s_h = cb->args[0];
+ s_e = cb->args[1];
+
rcu_read_lock();
for (h = s_h; h < FIB6_TABLE_HASHSZ; h++, s_e = 0) {
e = 0;
@@ -611,16 +640,16 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
goto next;
res = fib6_dump_table(tb, skb, cb);
if (res != 0)
- goto out;
+ goto out_unlock;
next:
e++;
}
}
-out:
+out_unlock:
rcu_read_unlock();
cb->args[1] = e;
cb->args[0] = h;
-
+out:
res = res < 0 ? res : skb->len;
if (res <= 0)
fib6_dump_end(cb);
@@ -987,7 +1016,10 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
fib6_clean_expires(iter);
else
fib6_set_expires(iter, rt->expires);
- fib6_metric_set(iter, RTAX_MTU, rt->fib6_pmtu);
+
+ if (rt->fib6_pmtu)
+ fib6_metric_set(iter, RTAX_MTU,
+ rt->fib6_pmtu);
return -EEXIST;
}
/* If we have the same destination and the same metric,
@@ -1947,6 +1979,7 @@ static int fib6_clean_node(struct fib6_walker *w)
struct fib6_cleaner *c = container_of(w, struct fib6_cleaner, w);
struct nl_info info = {
.nl_net = c->net,
+ .skip_notify = c->skip_notify,
};
if (c->sernum != FIB6_NO_SERNUM_CHANGE &&
@@ -1998,7 +2031,7 @@ static int fib6_clean_node(struct fib6_walker *w)
static void fib6_clean_tree(struct net *net, struct fib6_node *root,
int (*func)(struct fib6_info *, void *arg),
- int sernum, void *arg)
+ int sernum, void *arg, bool skip_notify)
{
struct fib6_cleaner c;
@@ -2010,13 +2043,14 @@ static void fib6_clean_tree(struct net *net, struct fib6_node *root,
c.sernum = sernum;
c.arg = arg;
c.net = net;
+ c.skip_notify = skip_notify;
fib6_walk(net, &c.w);
}
static void __fib6_clean_all(struct net *net,
int (*func)(struct fib6_info *, void *),
- int sernum, void *arg)
+ int sernum, void *arg, bool skip_notify)
{
struct fib6_table *table;
struct hlist_head *head;
@@ -2028,7 +2062,7 @@ static void __fib6_clean_all(struct net *net,
hlist_for_each_entry_rcu(table, head, tb6_hlist) {
spin_lock_bh(&table->tb6_lock);
fib6_clean_tree(net, &table->tb6_root,
- func, sernum, arg);
+ func, sernum, arg, skip_notify);
spin_unlock_bh(&table->tb6_lock);
}
}
@@ -2038,14 +2072,21 @@ static void __fib6_clean_all(struct net *net,
void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *),
void *arg)
{
- __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg);
+ __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, false);
+}
+
+void fib6_clean_all_skip_notify(struct net *net,
+ int (*func)(struct fib6_info *, void *),
+ void *arg)
+{
+ __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, true);
}
static void fib6_flush_trees(struct net *net)
{
int new_sernum = fib6_new_sernum(net);
- __fib6_clean_all(net, NULL, new_sernum, NULL);
+ __fib6_clean_all(net, NULL, new_sernum, NULL, false);
}
/*
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index 3eee7637bdfe..cb54a8a3c273 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -373,7 +373,6 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq,
if (olen > 0) {
struct msghdr msg;
struct flowi6 flowi6;
- struct sockcm_cookie sockc_junk;
struct ipcm6_cookie ipc6;
err = -ENOMEM;
@@ -392,7 +391,7 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq,
memset(&flowi6, 0, sizeof(flowi6));
ipc6.opt = fl->opt;
- err = ip6_datagram_send_ctl(net, sk, &msg, &flowi6, &ipc6, &sockc_junk);
+ err = ip6_datagram_send_ctl(net, sk, &msg, &flowi6, &ipc6);
if (err)
goto done;
err = -EINVAL;
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index cd2cfb04e5d8..515adbdba1d2 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -427,35 +427,17 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
u8 type, u8 code, int offset, __be32 info)
{
struct net *net = dev_net(skb->dev);
- const struct gre_base_hdr *greh;
const struct ipv6hdr *ipv6h;
- int grehlen = sizeof(*greh);
+ struct tnl_ptk_info tpi;
struct ip6_tnl *t;
- int key_off = 0;
- __be16 flags;
- __be32 key;
- if (!pskb_may_pull(skb, offset + grehlen))
+ if (gre_parse_header(skb, &tpi, NULL, htons(ETH_P_IPV6),
+ offset) < 0)
return;
- greh = (const struct gre_base_hdr *)(skb->data + offset);
- flags = greh->flags;
- if (flags & (GRE_VERSION | GRE_ROUTING))
- return;
- if (flags & GRE_CSUM)
- grehlen += 4;
- if (flags & GRE_KEY) {
- key_off = grehlen + offset;
- grehlen += 4;
- }
- if (!pskb_may_pull(skb, offset + grehlen))
- return;
ipv6h = (const struct ipv6hdr *)skb->data;
- greh = (const struct gre_base_hdr *)(skb->data + offset);
- key = key_off ? *(__be32 *)(skb->data + key_off) : 0;
-
t = ip6gre_tunnel_lookup(skb->dev, &ipv6h->daddr, &ipv6h->saddr,
- key, greh->protocol);
+ tpi.key, tpi.proto);
if (!t)
return;
@@ -989,6 +971,8 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL);
dsfield = key->tos;
+ if (!(tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT))
+ goto tx_err;
md = ip_tunnel_info_opts(tun_info);
if (!md)
goto tx_err;
@@ -1127,7 +1111,7 @@ static void ip6gre_tnl_link_config_route(struct ip6_tnl *t, int set_mtu,
return;
if (rt->dst.dev) {
- dev->hard_header_len = rt->dst.dev->hard_header_len +
+ dev->needed_headroom = rt->dst.dev->hard_header_len +
t_hlen;
if (set_mtu) {
@@ -1153,7 +1137,7 @@ static int ip6gre_calc_hlen(struct ip6_tnl *tunnel)
tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;
t_hlen = tunnel->hlen + sizeof(struct ipv6hdr);
- tunnel->dev->hard_header_len = LL_MAX_HEADER + t_hlen;
+ tunnel->dev->needed_headroom = LL_MAX_HEADER + t_hlen;
return t_hlen;
}
@@ -1776,6 +1760,7 @@ static void ip6gre_netlink_parms(struct nlattr *data[],
if (data[IFLA_GRE_COLLECT_METADATA])
parms->collect_md = true;
+ parms->erspan_ver = 1;
if (data[IFLA_GRE_ERSPAN_VER])
parms->erspan_ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]);
@@ -1823,7 +1808,7 @@ static int ip6erspan_calc_hlen(struct ip6_tnl *tunnel)
erspan_hdr_len(tunnel->parms.erspan_ver);
t_hlen = tunnel->hlen + sizeof(struct ipv6hdr);
- tunnel->dev->hard_header_len = LL_MAX_HEADER + t_hlen;
+ tunnel->dev->needed_headroom = LL_MAX_HEADER + t_hlen;
return t_hlen;
}
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index f08d34491ece..96577e742afd 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -47,17 +47,11 @@
#include <net/inet_ecn.h>
#include <net/dst_metadata.h>
-int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+static void ip6_rcv_finish_core(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
{
void (*edemux)(struct sk_buff *skb);
- /* if ingress device is enslaved to an L3 master device pass the
- * skb to its handler for processing
- */
- skb = l3mdev_ip6_rcv(skb);
- if (!skb)
- return NET_RX_SUCCESS;
-
if (net->ipv4.sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) {
const struct inet6_protocol *ipprot;
@@ -67,20 +61,73 @@ int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
}
if (!skb_valid_dst(skb))
ip6_route_input(skb);
+}
+
+int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ /* if ingress device is enslaved to an L3 master device pass the
+ * skb to its handler for processing
+ */
+ skb = l3mdev_ip6_rcv(skb);
+ if (!skb)
+ return NET_RX_SUCCESS;
+ ip6_rcv_finish_core(net, sk, skb);
return dst_input(skb);
}
-int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
+static void ip6_sublist_rcv_finish(struct list_head *head)
+{
+ struct sk_buff *skb, *next;
+
+ list_for_each_entry_safe(skb, next, head, list)
+ dst_input(skb);
+}
+
+static void ip6_list_rcv_finish(struct net *net, struct sock *sk,
+ struct list_head *head)
+{
+ struct dst_entry *curr_dst = NULL;
+ struct sk_buff *skb, *next;
+ struct list_head sublist;
+
+ INIT_LIST_HEAD(&sublist);
+ list_for_each_entry_safe(skb, next, head, list) {
+ struct dst_entry *dst;
+
+ list_del(&skb->list);
+ /* if ingress device is enslaved to an L3 master device pass the
+ * skb to its handler for processing
+ */
+ skb = l3mdev_ip6_rcv(skb);
+ if (!skb)
+ continue;
+ ip6_rcv_finish_core(net, sk, skb);
+ dst = skb_dst(skb);
+ if (curr_dst != dst) {
+ /* dispatch old sublist */
+ if (!list_empty(&sublist))
+ ip6_sublist_rcv_finish(&sublist);
+ /* start new sublist */
+ INIT_LIST_HEAD(&sublist);
+ curr_dst = dst;
+ }
+ list_add_tail(&skb->list, &sublist);
+ }
+ /* dispatch final sublist */
+ ip6_sublist_rcv_finish(&sublist);
+}
+
+static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev,
+ struct net *net)
{
const struct ipv6hdr *hdr;
u32 pkt_len;
struct inet6_dev *idev;
- struct net *net = dev_net(skb->dev);
if (skb->pkt_type == PACKET_OTHERHOST) {
kfree_skb(skb);
- return NET_RX_DROP;
+ return NULL;
}
rcu_read_lock();
@@ -131,7 +178,8 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
*/
if ((ipv6_addr_loopback(&hdr->saddr) ||
ipv6_addr_loopback(&hdr->daddr)) &&
- !(dev->flags & IFF_LOOPBACK))
+ !(dev->flags & IFF_LOOPBACK) &&
+ !netif_is_l3_master(dev))
goto err;
/* RFC4291 Errata ID: 3480
@@ -196,7 +244,7 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
if (ipv6_parse_hopopts(skb) < 0) {
__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
rcu_read_unlock();
- return NET_RX_DROP;
+ return NULL;
}
}
@@ -205,15 +253,67 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
/* Must drop socket now because of tproxy. */
skb_orphan(skb);
- return NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING,
- net, NULL, skb, dev, NULL,
- ip6_rcv_finish);
+ return skb;
err:
__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
drop:
rcu_read_unlock();
kfree_skb(skb);
- return NET_RX_DROP;
+ return NULL;
+}
+
+int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
+{
+ struct net *net = dev_net(skb->dev);
+
+ skb = ip6_rcv_core(skb, dev, net);
+ if (skb == NULL)
+ return NET_RX_DROP;
+ return NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING,
+ net, NULL, skb, dev, NULL,
+ ip6_rcv_finish);
+}
+
+static void ip6_sublist_rcv(struct list_head *head, struct net_device *dev,
+ struct net *net)
+{
+ NF_HOOK_LIST(NFPROTO_IPV6, NF_INET_PRE_ROUTING, net, NULL,
+ head, dev, NULL, ip6_rcv_finish);
+ ip6_list_rcv_finish(net, NULL, head);
+}
+
+/* Receive a list of IPv6 packets */
+void ipv6_list_rcv(struct list_head *head, struct packet_type *pt,
+ struct net_device *orig_dev)
+{
+ struct net_device *curr_dev = NULL;
+ struct net *curr_net = NULL;
+ struct sk_buff *skb, *next;
+ struct list_head sublist;
+
+ INIT_LIST_HEAD(&sublist);
+ list_for_each_entry_safe(skb, next, head, list) {
+ struct net_device *dev = skb->dev;
+ struct net *net = dev_net(dev);
+
+ list_del(&skb->list);
+ skb = ip6_rcv_core(skb, dev, net);
+ if (skb == NULL)
+ continue;
+
+ if (curr_dev != dev || curr_net != net) {
+ /* dispatch old sublist */
+ if (!list_empty(&sublist))
+ ip6_sublist_rcv(&sublist, curr_dev, curr_net);
+ /* start new sublist */
+ INIT_LIST_HEAD(&sublist);
+ curr_dev = dev;
+ curr_net = net;
+ }
+ list_add_tail(&skb->list, &sublist);
+ }
+ /* dispatch final sublist */
+ ip6_sublist_rcv(&sublist, curr_dev, curr_net);
}
/*
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index 5b3f2f89ef41..c7e495f12011 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -115,6 +115,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
payload_len = skb->len - nhoff - sizeof(*ipv6h);
ipv6h->payload_len = htons(payload_len);
skb->network_header = (u8 *)ipv6h - skb->head;
+ skb_reset_mac_len(skb);
if (udpfrag) {
int err = ip6_find_1stfragopt(skb, &prevhdr);
@@ -163,11 +164,11 @@ static int ipv6_exthdrs_len(struct ipv6hdr *iph,
return len;
}
-static struct sk_buff **ipv6_gro_receive(struct sk_buff **head,
- struct sk_buff *skb)
+static struct sk_buff *ipv6_gro_receive(struct list_head *head,
+ struct sk_buff *skb)
{
const struct net_offload *ops;
- struct sk_buff **pp = NULL;
+ struct sk_buff *pp = NULL;
struct sk_buff *p;
struct ipv6hdr *iph;
unsigned int nlen;
@@ -214,7 +215,7 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head,
flush--;
nlen = skb_network_header_len(skb);
- for (p = *head; p; p = p->next) {
+ list_for_each_entry(p, head, list) {
const struct ipv6hdr *iph2;
__be32 first_word; /* <Version:4><Traffic_Class:8><Flow_Label:20> */
@@ -263,8 +264,8 @@ out:
return pp;
}
-static struct sk_buff **sit_ip6ip6_gro_receive(struct sk_buff **head,
- struct sk_buff *skb)
+static struct sk_buff *sit_ip6ip6_gro_receive(struct list_head *head,
+ struct sk_buff *skb)
{
/* Common GRO receive for SIT and IP6IP6 */
@@ -278,8 +279,8 @@ static struct sk_buff **sit_ip6ip6_gro_receive(struct sk_buff **head,
return ipv6_gro_receive(head, skb);
}
-static struct sk_buff **ip4ip6_gro_receive(struct sk_buff **head,
- struct sk_buff *skb)
+static struct sk_buff *ip4ip6_gro_receive(struct list_head *head,
+ struct sk_buff *skb)
{
/* Common GRO receive for SIT and IP6IP6 */
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 3168847c30d1..89e0d5118afe 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -219,12 +219,10 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
kfree_skb(skb);
return -ENOBUFS;
}
+ if (skb->sk)
+ skb_set_owner_w(skb2, skb->sk);
consume_skb(skb);
skb = skb2;
- /* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
- * it is safe to call in our context (socket lock not held)
- */
- skb_set_owner_w(skb, (struct sock *)sk);
}
if (opt->opt_flen)
ipv6_push_frag_opts(skb, opt, &proto);
@@ -727,7 +725,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
skb = frag;
frag = skb->next;
- skb->next = NULL;
+ skb_mark_not_on_list(skb);
}
kfree(tmp_hdr);
@@ -1221,13 +1219,16 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
if (mtu < IPV6_MIN_MTU)
return -EINVAL;
cork->base.fragsize = mtu;
- cork->base.gso_size = sk->sk_type == SOCK_DGRAM &&
- sk->sk_protocol == IPPROTO_UDP ? ipc6->gso_size : 0;
+ cork->base.gso_size = ipc6->gso_size;
+ cork->base.tx_flags = 0;
+ sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
if (dst_allfrag(xfrm_dst_path(&rt->dst)))
cork->base.flags |= IPCORK_ALLFRAG;
cork->base.length = 0;
+ cork->base.transmit_time = ipc6->sockc.transmit_time;
+
return 0;
}
@@ -1240,8 +1241,7 @@ static int __ip6_append_data(struct sock *sk,
int getfrag(void *from, char *to, int offset,
int len, int odd, struct sk_buff *skb),
void *from, int length, int transhdrlen,
- unsigned int flags, struct ipcm6_cookie *ipc6,
- const struct sockcm_cookie *sockc)
+ unsigned int flags, struct ipcm6_cookie *ipc6)
{
struct sk_buff *skb, *skb_prev = NULL;
unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
@@ -1251,7 +1251,6 @@ static int __ip6_append_data(struct sock *sk,
int copy;
int err;
int offset = 0;
- __u8 tx_flags = 0;
u32 tskey = 0;
struct rt6_info *rt = (struct rt6_info *)cork->dst;
struct ipv6_txoptions *opt = v6_cork->opt;
@@ -1270,6 +1269,10 @@ static int __ip6_append_data(struct sock *sk,
mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
orig_mtu = mtu;
+ if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
+ sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
+ tskey = sk->sk_tskey++;
+
hh_len = LL_RESERVED_SPACE(rt->dst.dev);
fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
@@ -1319,13 +1322,6 @@ emsgsize:
rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
csummode = CHECKSUM_PARTIAL;
- if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
- sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
- if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
- sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
- tskey = sk->sk_tskey++;
- }
-
/*
* Let's try using as much space as possible.
* Use MTU if total length of the message fits into the MTU.
@@ -1444,8 +1440,8 @@ alloc_new_skb:
dst_exthdrlen);
/* Only the initial fragment is time stamped */
- skb_shinfo(skb)->tx_flags = tx_flags;
- tx_flags = 0;
+ skb_shinfo(skb)->tx_flags = cork->tx_flags;
+ cork->tx_flags = 0;
skb_shinfo(skb)->tskey = tskey;
tskey = 0;
@@ -1562,8 +1558,7 @@ int ip6_append_data(struct sock *sk,
int odd, struct sk_buff *skb),
void *from, int length, int transhdrlen,
struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
- struct rt6_info *rt, unsigned int flags,
- const struct sockcm_cookie *sockc)
+ struct rt6_info *rt, unsigned int flags)
{
struct inet_sock *inet = inet_sk(sk);
struct ipv6_pinfo *np = inet6_sk(sk);
@@ -1591,7 +1586,7 @@ int ip6_append_data(struct sock *sk,
return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
&np->cork, sk_page_frag(sk), getfrag,
- from, length, transhdrlen, flags, ipc6, sockc);
+ from, length, transhdrlen, flags, ipc6);
}
EXPORT_SYMBOL_GPL(ip6_append_data);
@@ -1675,6 +1670,8 @@ struct sk_buff *__ip6_make_skb(struct sock *sk,
skb->priority = sk->sk_priority;
skb->mark = sk->sk_mark;
+ skb->tstamp = cork->base.transmit_time;
+
skb_dst_set(skb, dst_clone(&rt->dst));
IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
if (proto == IPPROTO_ICMPV6) {
@@ -1749,8 +1746,7 @@ struct sk_buff *ip6_make_skb(struct sock *sk,
void *from, int length, int transhdrlen,
struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
struct rt6_info *rt, unsigned int flags,
- struct inet_cork_full *cork,
- const struct sockcm_cookie *sockc)
+ struct inet_cork_full *cork)
{
struct inet6_cork v6_cork;
struct sk_buff_head queue;
@@ -1778,7 +1774,7 @@ struct sk_buff *ip6_make_skb(struct sock *sk,
err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
&current->task_frag, getfrag, from,
length + exthdrlen, transhdrlen + exthdrlen,
- flags, ipc6, sockc);
+ flags, ipc6);
if (err) {
__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
return ERR_PTR(err);
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 1cc9650af9fb..a9d06d4dd057 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1113,7 +1113,7 @@ route_lookup:
dst = NULL;
goto tx_err_link_failure;
}
- if (t->parms.collect_md &&
+ if (t->parms.collect_md && ipv6_addr_any(&fl6->saddr) &&
ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev,
&fl6->daddr, 0, &fl6->saddr))
goto tx_err_link_failure;
@@ -1184,11 +1184,14 @@ route_lookup:
}
skb_dst_set(skb, dst);
- if (encap_limit >= 0) {
- init_tel_txopt(&opt, encap_limit);
- ipv6_push_frag_opts(skb, &opt.ops, &proto);
+ if (hop_limit == 0) {
+ if (skb->protocol == htons(ETH_P_IP))
+ hop_limit = ip_hdr(skb)->ttl;
+ else if (skb->protocol == htons(ETH_P_IPV6))
+ hop_limit = ipv6_hdr(skb)->hop_limit;
+ else
+ hop_limit = ip6_dst_hoplimit(dst);
}
- hop_limit = hop_limit ? : ip6_dst_hoplimit(dst);
/* Calculate max headroom for all the headers and adjust
* needed_headroom if necessary.
@@ -1202,6 +1205,11 @@ route_lookup:
if (err)
return err;
+ if (encap_limit >= 0) {
+ init_tel_txopt(&opt, encap_limit);
+ ipv6_push_frag_opts(skb, &opt.ops, &proto);
+ }
+
skb_push(skb, sizeof(struct ipv6hdr));
skb_reset_network_header(skb);
ipv6h = ipv6_hdr(skb);
@@ -1226,7 +1234,7 @@ static inline int
ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct ip6_tnl *t = netdev_priv(dev);
- const struct iphdr *iph = ip_hdr(skb);
+ const struct iphdr *iph;
int encap_limit = -1;
struct flowi6 fl6;
__u8 dsfield;
@@ -1234,6 +1242,11 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
u8 tproto;
int err;
+ /* ensure we can access the full inner ip header */
+ if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+ return -1;
+
+ iph = ip_hdr(skb);
memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
tproto = READ_ONCE(t->parms.proto);
@@ -1251,6 +1264,7 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
key = &tun_info->key;
memset(&fl6, 0, sizeof(fl6));
fl6.flowi6_proto = IPPROTO_IPIP;
+ fl6.saddr = key->u.ipv6.src;
fl6.daddr = key->u.ipv6.dst;
fl6.flowlabel = key->label;
dsfield = key->tos;
@@ -1297,7 +1311,7 @@ static inline int
ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct ip6_tnl *t = netdev_priv(dev);
- struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+ struct ipv6hdr *ipv6h;
int encap_limit = -1;
__u16 offset;
struct flowi6 fl6;
@@ -1306,6 +1320,10 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
u8 tproto;
int err;
+ if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h))))
+ return -1;
+
+ ipv6h = ipv6_hdr(skb);
tproto = READ_ONCE(t->parms.proto);
if ((tproto != IPPROTO_IPV6 && tproto != 0) ||
ip6_tnl_addr_conflict(t, ipv6h))
@@ -1322,6 +1340,7 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
key = &tun_info->key;
memset(&fl6, 0, sizeof(fl6));
fl6.flowi6_proto = IPPROTO_IPV6;
+ fl6.saddr = key->u.ipv6.src;
fl6.daddr = key->u.ipv6.dst;
fl6.flowlabel = key->label;
dsfield = key->tos;
diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index c72ae3a4fe09..eeaf7455d51e 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -481,7 +481,7 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl)
}
mtu = dst_mtu(dst);
- if (!skb->ignore_df && skb->len > mtu) {
+ if (skb->len > mtu) {
skb_dst_update_pmtu(skb, mtu);
if (skb->protocol == htons(ETH_P_IPV6)) {
@@ -503,17 +503,9 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl)
skb->dev = skb_dst(skb)->dev;
err = dst_output(t->net, skb->sk, skb);
- if (net_xmit_eval(err) == 0) {
- struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats);
-
- u64_stats_update_begin(&tstats->syncp);
- tstats->tx_bytes += pkt_len;
- tstats->tx_packets++;
- u64_stats_update_end(&tstats->syncp);
- } else {
- stats->tx_errors++;
- stats->tx_aborted_errors++;
- }
+ if (net_xmit_eval(err) == 0)
+ err = pkt_len;
+ iptunnel_xmit_stats(dev, err);
return 0;
tx_err_link_failure:
@@ -1102,7 +1094,8 @@ static void __net_exit vti6_destroy_tunnels(struct vti6_net *ip6n,
}
t = rtnl_dereference(ip6n->tnls_wc[0]);
- unregister_netdevice_queue(t->dev, list);
+ if (t)
+ unregister_netdevice_queue(t->dev, list);
}
static int __net_init vti6_init_net(struct net *net)
@@ -1114,6 +1107,8 @@ static int __net_init vti6_init_net(struct net *net)
ip6n->tnls[0] = ip6n->tnls_wc;
ip6n->tnls[1] = ip6n->tnls_r_l;
+ if (!net_has_fallback_tunnels(net))
+ return 0;
err = -ENOMEM;
ip6n->fb_tnl_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6_vti0",
NET_NAME_UNKNOWN, vti6_dev_setup);
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 0d0f0053bb11..c3317ffb09eb 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -32,6 +32,7 @@
#include <linux/seq_file.h>
#include <linux/init.h>
#include <linux/compat.h>
+#include <linux/rhashtable.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/raw.h>
@@ -84,7 +85,8 @@ static struct mr_table *ip6mr_new_table(struct net *net, u32 id);
static void ip6mr_free_table(struct mr_table *mrt);
static void ip6_mr_forward(struct net *net, struct mr_table *mrt,
- struct sk_buff *skb, struct mfc6_cache *cache);
+ struct net_device *dev, struct sk_buff *skb,
+ struct mfc6_cache *cache);
static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt,
mifi_t mifi, int assert);
static void mr6_netlink_event(struct mr_table *mrt, struct mfc6_cache *mfc,
@@ -137,6 +139,9 @@ static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6,
.flags = FIB_LOOKUP_NOREF,
};
+ /* update flow if oif or iif point to device enslaved to l3mdev */
+ l3mdev_update_flow(net, flowi6_to_flowi(flp6));
+
err = fib_rules_lookup(net->ipv6.mr6_rules_ops,
flowi6_to_flowi(flp6), 0, &arg);
if (err < 0)
@@ -163,7 +168,9 @@ static int ip6mr_rule_action(struct fib_rule *rule, struct flowi *flp,
return -EINVAL;
}
- mrt = ip6mr_get_table(rule->fr_net, rule->table);
+ arg->table = fib_rule_get_table(rule, arg);
+
+ mrt = ip6mr_get_table(rule->fr_net, arg->table);
if (!mrt)
return -EAGAIN;
res->mrt = mrt;
@@ -1013,7 +1020,7 @@ static void ip6mr_cache_resolve(struct net *net, struct mr_table *mrt,
}
rtnl_unicast(skb, net, NETLINK_CB(skb).portid);
} else
- ip6_mr_forward(net, mrt, skb, c);
+ ip6_mr_forward(net, mrt, skb->dev, skb, c);
}
}
@@ -1119,7 +1126,7 @@ static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt,
/* Queue a packet for resolution. It gets locked cache entry! */
static int ip6mr_cache_unresolved(struct mr_table *mrt, mifi_t mifi,
- struct sk_buff *skb)
+ struct sk_buff *skb, struct net_device *dev)
{
struct mfc6_cache *c;
bool found = false;
@@ -1179,6 +1186,10 @@ static int ip6mr_cache_unresolved(struct mr_table *mrt, mifi_t mifi,
kfree_skb(skb);
err = -ENOBUFS;
} else {
+ if (dev) {
+ skb->dev = dev;
+ skb->skb_iif = dev->ifindex;
+ }
skb_queue_tail(&c->_c.mfc_un.unres.unresolved, skb);
err = 0;
}
@@ -2042,11 +2053,12 @@ static int ip6mr_find_vif(struct mr_table *mrt, struct net_device *dev)
}
static void ip6_mr_forward(struct net *net, struct mr_table *mrt,
- struct sk_buff *skb, struct mfc6_cache *c)
+ struct net_device *dev, struct sk_buff *skb,
+ struct mfc6_cache *c)
{
int psend = -1;
int vif, ct;
- int true_vifi = ip6mr_find_vif(mrt, skb->dev);
+ int true_vifi = ip6mr_find_vif(mrt, dev);
vif = c->_c.mfc_parent;
c->_c.mfc_un.res.pkt++;
@@ -2072,7 +2084,7 @@ static void ip6_mr_forward(struct net *net, struct mr_table *mrt,
/*
* Wrong interface: drop packet and (maybe) send PIM assert.
*/
- if (mrt->vif_table[vif].dev != skb->dev) {
+ if (mrt->vif_table[vif].dev != dev) {
c->_c.mfc_un.res.wrong_if++;
if (true_vifi >= 0 && mrt->mroute_do_assert &&
@@ -2153,6 +2165,19 @@ int ip6_mr_input(struct sk_buff *skb)
.flowi6_mark = skb->mark,
};
int err;
+ struct net_device *dev;
+
+ /* skb->dev passed in is the master dev for vrfs.
+ * Get the proper interface that does have a vif associated with it.
+ */
+ dev = skb->dev;
+ if (netif_is_l3_master(skb->dev)) {
+ dev = dev_get_by_index_rcu(net, IPCB(skb)->iif);
+ if (!dev) {
+ kfree_skb(skb);
+ return -ENODEV;
+ }
+ }
err = ip6mr_fib_lookup(net, &fl6, &mrt);
if (err < 0) {
@@ -2164,7 +2189,7 @@ int ip6_mr_input(struct sk_buff *skb)
cache = ip6mr_cache_find(mrt,
&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr);
if (!cache) {
- int vif = ip6mr_find_vif(mrt, skb->dev);
+ int vif = ip6mr_find_vif(mrt, dev);
if (vif >= 0)
cache = ip6mr_cache_find_any(mrt,
@@ -2178,9 +2203,9 @@ int ip6_mr_input(struct sk_buff *skb)
if (!cache) {
int vif;
- vif = ip6mr_find_vif(mrt, skb->dev);
+ vif = ip6mr_find_vif(mrt, dev);
if (vif >= 0) {
- int err = ip6mr_cache_unresolved(mrt, vif, skb);
+ int err = ip6mr_cache_unresolved(mrt, vif, skb, dev);
read_unlock(&mrt_lock);
return err;
@@ -2190,7 +2215,7 @@ int ip6_mr_input(struct sk_buff *skb)
return -ENODEV;
}
- ip6_mr_forward(net, mrt, skb, cache);
+ ip6_mr_forward(net, mrt, dev, skb, cache);
read_unlock(&mrt_lock);
@@ -2256,7 +2281,7 @@ int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm,
iph->saddr = rt->rt6i_src.addr;
iph->daddr = rt->rt6i_dst.addr;
- err = ip6mr_cache_unresolved(mrt, vif, skb2);
+ err = ip6mr_cache_unresolved(mrt, vif, skb2, dev);
read_unlock(&mrt_lock);
return err;
@@ -2432,6 +2457,30 @@ errout:
static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
{
+ const struct nlmsghdr *nlh = cb->nlh;
+ struct fib_dump_filter filter = {};
+ int err;
+
+ if (cb->strict_check) {
+ err = ip_valid_fib_dump_req(sock_net(skb->sk), nlh,
+ &filter, cb);
+ if (err < 0)
+ return err;
+ }
+
+ if (filter.table_id) {
+ struct mr_table *mrt;
+
+ mrt = ip6mr_get_table(sock_net(skb->sk), filter.table_id);
+ if (!mrt) {
+ NL_SET_ERR_MSG_MOD(cb->extack, "MR table does not exist");
+ return -ENOENT;
+ }
+ err = mr_table_dump(mrt, skb, cb, _ip6mr_fill_mroute,
+ &mfc_unres_lock, &filter);
+ return skb->len ? : err;
+ }
+
return mr_rtm_dumproute(skb, cb, ip6mr_mr_table_iter,
- _ip6mr_fill_mroute, &mfc_unres_lock);
+ _ip6mr_fill_mroute, &mfc_unres_lock, &filter);
}
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 568ca4187cd1..381ce38940ae 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -500,7 +500,6 @@ sticky_done:
struct ipv6_txoptions *opt = NULL;
struct msghdr msg;
struct flowi6 fl6;
- struct sockcm_cookie sockc_junk;
struct ipcm6_cookie ipc6;
memset(&fl6, 0, sizeof(fl6));
@@ -533,7 +532,7 @@ sticky_done:
msg.msg_control = (void *)(opt+1);
ipc6.opt = opt;
- retv = ip6_datagram_send_ctl(net, sk, &msg, &fl6, &ipc6, &sockc_junk);
+ retv = ip6_datagram_send_ctl(net, sk, &msg, &fl6, &ipc6);
if (retv)
goto done;
update:
@@ -675,6 +674,13 @@ done:
retv = ipv6_sock_ac_drop(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_acaddr);
break;
}
+ case IPV6_MULTICAST_ALL:
+ if (optlen < sizeof(int))
+ goto e_inval;
+ np->mc_all = valbool;
+ retv = 0;
+ break;
+
case MCAST_JOIN_GROUP:
case MCAST_LEAVE_GROUP:
{
@@ -1267,6 +1273,10 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
val = np->mcast_oif;
break;
+ case IPV6_MULTICAST_ALL:
+ val = np->mc_all;
+ break;
+
case IPV6_UNICAST_IF:
val = (__force int)htonl((__u32) np->ucast_oif);
break;
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index f60f310785fd..21f6deb2aec9 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -636,7 +636,7 @@ bool inet6_mc_check(struct sock *sk, const struct in6_addr *mc_addr,
}
if (!mc) {
rcu_read_unlock();
- return true;
+ return np->mc_all;
}
read_lock(&mc->sflock);
psl = mc->sflist;
@@ -660,7 +660,7 @@ bool inet6_mc_check(struct sock *sk, const struct in6_addr *mc_addr,
return rv;
}
-static void igmp6_group_added(struct ifmcaddr6 *mc, unsigned int mode)
+static void igmp6_group_added(struct ifmcaddr6 *mc)
{
struct net_device *dev = mc->idev->dev;
char buf[MAX_ADDR_LEN];
@@ -690,7 +690,7 @@ static void igmp6_group_added(struct ifmcaddr6 *mc, unsigned int mode)
* should not send filter-mode change record as the mode
* should be from IN() to IN(A).
*/
- if (mode == MCAST_EXCLUDE)
+ if (mc->mca_sfmode == MCAST_EXCLUDE)
mc->mca_crcount = mc->idev->mc_qrv;
mld_ifc_event(mc->idev);
@@ -931,7 +931,7 @@ static int __ipv6_dev_mc_inc(struct net_device *dev,
write_unlock_bh(&idev->lock);
mld_del_delrec(idev, mc);
- igmp6_group_added(mc, mode);
+ igmp6_group_added(mc);
ma_put(mc);
return 0;
}
@@ -2436,17 +2436,17 @@ static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml,
{
int err;
- /* callers have the socket lock and rtnl lock
- * so no other readers or writers of iml or its sflist
- */
+ write_lock_bh(&iml->sflock);
if (!iml->sflist) {
/* any-source empty exclude case */
- return ip6_mc_del_src(idev, &iml->addr, iml->sfmode, 0, NULL, 0);
+ err = ip6_mc_del_src(idev, &iml->addr, iml->sfmode, 0, NULL, 0);
+ } else {
+ err = ip6_mc_del_src(idev, &iml->addr, iml->sfmode,
+ iml->sflist->sl_count, iml->sflist->sl_addr, 0);
+ sock_kfree_s(sk, iml->sflist, IP6_SFLSIZE(iml->sflist->sl_max));
+ iml->sflist = NULL;
}
- err = ip6_mc_del_src(idev, &iml->addr, iml->sfmode,
- iml->sflist->sl_count, iml->sflist->sl_addr, 0);
- sock_kfree_s(sk, iml->sflist, IP6_SFLSIZE(iml->sflist->sl_max));
- iml->sflist = NULL;
+ write_unlock_bh(&iml->sflock);
return err;
}
@@ -2571,7 +2571,7 @@ void ipv6_mc_up(struct inet6_dev *idev)
ipv6_mc_reset(idev);
for (i = idev->mc_list; i; i = i->next) {
mld_del_delrec(idev, i);
- igmp6_group_added(i, i->mca_sfmode);
+ igmp6_group_added(i);
}
read_unlock_bh(&idev->lock);
}
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 0ec273997d1d..a25cfdd47c89 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1533,7 +1533,7 @@ static void ndisc_redirect_rcv(struct sk_buff *skb)
if (!ndopts.nd_opts_rh) {
ip6_redirect_no_header(skb, dev_net(skb->dev),
- skb->dev->ifindex, 0);
+ skb->dev->ifindex);
return;
}
@@ -1784,6 +1784,8 @@ static int ndisc_netdev_event(struct notifier_block *this, unsigned long event,
change_info = ptr;
if (change_info->flags_changed & IFF_NOARP)
neigh_changeaddr(&nd_tbl, dev);
+ if (!netif_carrier_ok(dev))
+ neigh_carrier_down(&nd_tbl, dev);
break;
case NETDEV_DOWN:
neigh_ifdown(&nd_tbl, dev);
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index 531d6957af36..5ae8e1c51079 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -15,7 +15,6 @@
#include <net/ipv6.h>
#include <net/ip6_route.h>
#include <net/xfrm.h>
-#include <net/ip6_checksum.h>
#include <net/netfilter/nf_queue.h>
int ip6_route_me_harder(struct net *net, struct sk_buff *skb)
@@ -106,71 +105,10 @@ static int nf_ip6_route(struct net *net, struct dst_entry **dst,
return err;
}
-__sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook,
- unsigned int dataoff, u_int8_t protocol)
-{
- const struct ipv6hdr *ip6h = ipv6_hdr(skb);
- __sum16 csum = 0;
-
- switch (skb->ip_summed) {
- case CHECKSUM_COMPLETE:
- if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN)
- break;
- if (!csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
- skb->len - dataoff, protocol,
- csum_sub(skb->csum,
- skb_checksum(skb, 0,
- dataoff, 0)))) {
- skb->ip_summed = CHECKSUM_UNNECESSARY;
- break;
- }
- /* fall through */
- case CHECKSUM_NONE:
- skb->csum = ~csum_unfold(
- csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
- skb->len - dataoff,
- protocol,
- csum_sub(0,
- skb_checksum(skb, 0,
- dataoff, 0))));
- csum = __skb_checksum_complete(skb);
- }
- return csum;
-}
-EXPORT_SYMBOL(nf_ip6_checksum);
-
-static __sum16 nf_ip6_checksum_partial(struct sk_buff *skb, unsigned int hook,
- unsigned int dataoff, unsigned int len,
- u_int8_t protocol)
-{
- const struct ipv6hdr *ip6h = ipv6_hdr(skb);
- __wsum hsum;
- __sum16 csum = 0;
-
- switch (skb->ip_summed) {
- case CHECKSUM_COMPLETE:
- if (len == skb->len - dataoff)
- return nf_ip6_checksum(skb, hook, dataoff, protocol);
- /* fall through */
- case CHECKSUM_NONE:
- hsum = skb_checksum(skb, 0, dataoff, 0);
- skb->csum = ~csum_unfold(csum_ipv6_magic(&ip6h->saddr,
- &ip6h->daddr,
- skb->len - dataoff,
- protocol,
- csum_sub(0, hsum)));
- skb->ip_summed = CHECKSUM_NONE;
- return __skb_checksum_complete_head(skb, dataoff + len);
- }
- return csum;
-};
-
static const struct nf_ipv6_ops ipv6ops = {
.chk_addr = ipv6_chk_addr,
.route_input = ip6_route_input,
.fragment = ip6_fragment,
- .checksum = nf_ip6_checksum,
- .checksum_partial = nf_ip6_checksum_partial,
.route = nf_ip6_route,
.reroute = nf_ip6_reroute,
};
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index 37b14dc9d863..339d0762b027 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -5,26 +5,6 @@
menu "IPv6: Netfilter Configuration"
depends on INET && IPV6 && NETFILTER
-config NF_DEFRAG_IPV6
- tristate
- default n
-
-config NF_CONNTRACK_IPV6
- tristate "IPv6 connection tracking support"
- depends on INET && IPV6 && NF_CONNTRACK
- default m if NETFILTER_ADVANCED=n
- select NF_DEFRAG_IPV6
- ---help---
- Connection tracking keeps a record of what packets have passed
- through your machine, in order to figure out how they are related
- into connections.
-
- This is IPv6 support on Layer 3 independent connection tracking.
- Layer 3 independent connection tracking is experimental scheme
- which generalize ip_conntrack to support other layer 3 protocols.
-
- To compile it as a module, choose M here. If unsure, say N.
-
config NF_SOCKET_IPV6
tristate "IPv6 socket lookup support"
help
@@ -128,7 +108,7 @@ config NF_LOG_IPV6
config NF_NAT_IPV6
tristate "IPv6 NAT"
- depends on NF_CONNTRACK_IPV6
+ depends on NF_CONNTRACK
depends on NETFILTER_ADVANCED
select NF_NAT
help
@@ -328,7 +308,7 @@ config IP6_NF_SECURITY
config IP6_NF_NAT
tristate "ip6tables NAT support"
- depends on NF_CONNTRACK_IPV6
+ depends on NF_CONNTRACK
depends on NETFILTER_ADVANCED
select NF_NAT
select NF_NAT_IPV6
@@ -365,6 +345,7 @@ config IP6_NF_TARGET_NPT
endif # IP6_NF_NAT
endif # IP6_NF_IPTABLES
-
endmenu
+config NF_DEFRAG_IPV6
+ tristate
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
index 10a5a1c87320..200c0c235565 100644
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -11,12 +11,6 @@ obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o
obj-$(CONFIG_IP6_NF_SECURITY) += ip6table_security.o
obj-$(CONFIG_IP6_NF_NAT) += ip6table_nat.o
-# objects for l3 independent conntrack
-nf_conntrack_ipv6-y := nf_conntrack_l3proto_ipv6.o nf_conntrack_proto_icmpv6.o
-
-# l3 independent conntrack
-obj-$(CONFIG_NF_CONNTRACK_IPV6) += nf_conntrack_ipv6.o
-
nf_nat_ipv6-y := nf_nat_l3proto_ipv6.o nf_nat_proto_icmpv6.o
nf_nat_ipv6-$(CONFIG_NF_NAT_MASQUERADE_IPV6) += nf_nat_masquerade_ipv6.o
obj-$(CONFIG_NF_NAT_IPV6) += nf_nat_ipv6.o
diff --git a/net/ipv6/netfilter/ip6t_ipv6header.c b/net/ipv6/netfilter/ip6t_ipv6header.c
index 8b147440fbdc..af737b47b9b5 100644
--- a/net/ipv6/netfilter/ip6t_ipv6header.c
+++ b/net/ipv6/netfilter/ip6t_ipv6header.c
@@ -65,7 +65,10 @@ ipv6header_mt6(const struct sk_buff *skb, struct xt_action_param *par)
}
hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr);
- BUG_ON(hp == NULL);
+ if (!hp) {
+ par->hotdrop = true;
+ return false;
+ }
/* Calculate the header length */
if (nexthdr == NEXTHDR_FRAGMENT)
diff --git a/net/ipv6/netfilter/ip6t_rpfilter.c b/net/ipv6/netfilter/ip6t_rpfilter.c
index 0fe61ede77c6..c3c6b09acdc4 100644
--- a/net/ipv6/netfilter/ip6t_rpfilter.c
+++ b/net/ipv6/netfilter/ip6t_rpfilter.c
@@ -26,6 +26,12 @@ static bool rpfilter_addr_unicast(const struct in6_addr *addr)
return addr_type & IPV6_ADDR_UNICAST;
}
+static bool rpfilter_addr_linklocal(const struct in6_addr *addr)
+{
+ int addr_type = ipv6_addr_type(addr);
+ return addr_type & IPV6_ADDR_LINKLOCAL;
+}
+
static bool rpfilter_lookup_reverse6(struct net *net, const struct sk_buff *skb,
const struct net_device *dev, u8 flags)
{
@@ -48,7 +54,11 @@ static bool rpfilter_lookup_reverse6(struct net *net, const struct sk_buff *skb,
}
fl6.flowi6_mark = flags & XT_RPFILTER_VALID_MARK ? skb->mark : 0;
- if ((flags & XT_RPFILTER_LOOSE) == 0)
+
+ if (rpfilter_addr_linklocal(&iph->saddr)) {
+ lookup_flags |= RT6_LOOKUP_F_IFACE;
+ fl6.flowi6_oif = dev->ifindex;
+ } else if ((flags & XT_RPFILTER_LOOSE) == 0)
fl6.flowi6_oif = dev->ifindex;
rt = (void *)ip6_route_lookup(net, &fl6, skb, lookup_flags);
diff --git a/net/ipv6/netfilter/ip6t_rt.c b/net/ipv6/netfilter/ip6t_rt.c
index 2c99b94eeca3..21bf6bf04323 100644
--- a/net/ipv6/netfilter/ip6t_rt.c
+++ b/net/ipv6/netfilter/ip6t_rt.c
@@ -137,7 +137,10 @@ static bool rt_mt6(const struct sk_buff *skb, struct xt_action_param *par)
sizeof(_addr),
&_addr);
- BUG_ON(ap == NULL);
+ if (ap == NULL) {
+ par->hotdrop = true;
+ return false;
+ }
if (ipv6_addr_equal(ap, &rtinfo->addrs[i])) {
pr_debug("i=%d temp=%d;\n", i, temp);
@@ -166,7 +169,10 @@ static bool rt_mt6(const struct sk_buff *skb, struct xt_action_param *par)
+ temp * sizeof(_addr),
sizeof(_addr),
&_addr);
- BUG_ON(ap == NULL);
+ if (ap == NULL) {
+ par->hotdrop = true;
+ return false;
+ }
if (!ipv6_addr_equal(ap, &rtinfo->addrs[temp]))
break;
diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
deleted file mode 100644
index 663827ee3cf8..000000000000
--- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
+++ /dev/null
@@ -1,460 +0,0 @@
-/*
- * Copyright (C)2004 USAGI/WIDE Project
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Author:
- * Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
- */
-
-#include <linux/types.h>
-#include <linux/ipv6.h>
-#include <linux/in6.h>
-#include <linux/netfilter.h>
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/icmp.h>
-#include <net/ipv6.h>
-#include <net/inet_frag.h>
-
-#include <linux/netfilter_bridge.h>
-#include <linux/netfilter_ipv6.h>
-#include <linux/netfilter_ipv6/ip6_tables.h>
-#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_conntrack_helper.h>
-#include <net/netfilter/nf_conntrack_l4proto.h>
-#include <net/netfilter/nf_conntrack_l3proto.h>
-#include <net/netfilter/nf_conntrack_core.h>
-#include <net/netfilter/nf_conntrack_zones.h>
-#include <net/netfilter/nf_conntrack_seqadj.h>
-#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
-#include <net/netfilter/nf_nat_helper.h>
-#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
-#include <net/netfilter/nf_log.h>
-
-static int conntrack6_net_id;
-static DEFINE_MUTEX(register_ipv6_hooks);
-
-struct conntrack6_net {
- unsigned int users;
-};
-
-static bool ipv6_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,
- struct nf_conntrack_tuple *tuple)
-{
- const u_int32_t *ap;
- u_int32_t _addrs[8];
-
- ap = skb_header_pointer(skb, nhoff + offsetof(struct ipv6hdr, saddr),
- sizeof(_addrs), _addrs);
- if (ap == NULL)
- return false;
-
- memcpy(tuple->src.u3.ip6, ap, sizeof(tuple->src.u3.ip6));
- memcpy(tuple->dst.u3.ip6, ap + 4, sizeof(tuple->dst.u3.ip6));
-
- return true;
-}
-
-static bool ipv6_invert_tuple(struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_tuple *orig)
-{
- memcpy(tuple->src.u3.ip6, orig->dst.u3.ip6, sizeof(tuple->src.u3.ip6));
- memcpy(tuple->dst.u3.ip6, orig->src.u3.ip6, sizeof(tuple->dst.u3.ip6));
-
- return true;
-}
-
-static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
- unsigned int *dataoff, u_int8_t *protonum)
-{
- unsigned int extoff = nhoff + sizeof(struct ipv6hdr);
- __be16 frag_off;
- int protoff;
- u8 nexthdr;
-
- if (skb_copy_bits(skb, nhoff + offsetof(struct ipv6hdr, nexthdr),
- &nexthdr, sizeof(nexthdr)) != 0) {
- pr_debug("ip6_conntrack_core: can't get nexthdr\n");
- return -NF_ACCEPT;
- }
- protoff = ipv6_skip_exthdr(skb, extoff, &nexthdr, &frag_off);
- /*
- * (protoff == skb->len) means the packet has not data, just
- * IPv6 and possibly extensions headers, but it is tracked anyway
- */
- if (protoff < 0 || (frag_off & htons(~0x7)) != 0) {
- pr_debug("ip6_conntrack_core: can't find proto in pkt\n");
- return -NF_ACCEPT;
- }
-
- *dataoff = protoff;
- *protonum = nexthdr;
- return NF_ACCEPT;
-}
-
-static unsigned int ipv6_helper(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- struct nf_conn *ct;
- const struct nf_conn_help *help;
- const struct nf_conntrack_helper *helper;
- enum ip_conntrack_info ctinfo;
- __be16 frag_off;
- int protoff;
- u8 nexthdr;
-
- /* This is where we call the helper: as the packet goes out. */
- ct = nf_ct_get(skb, &ctinfo);
- if (!ct || ctinfo == IP_CT_RELATED_REPLY)
- return NF_ACCEPT;
-
- help = nfct_help(ct);
- if (!help)
- return NF_ACCEPT;
- /* rcu_read_lock()ed by nf_hook_thresh */
- helper = rcu_dereference(help->helper);
- if (!helper)
- return NF_ACCEPT;
-
- nexthdr = ipv6_hdr(skb)->nexthdr;
- protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr,
- &frag_off);
- if (protoff < 0 || (frag_off & htons(~0x7)) != 0) {
- pr_debug("proto header not found\n");
- return NF_ACCEPT;
- }
-
- return helper->help(skb, protoff, ct, ctinfo);
-}
-
-static unsigned int ipv6_confirm(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- struct nf_conn *ct;
- enum ip_conntrack_info ctinfo;
- unsigned char pnum = ipv6_hdr(skb)->nexthdr;
- int protoff;
- __be16 frag_off;
-
- ct = nf_ct_get(skb, &ctinfo);
- if (!ct || ctinfo == IP_CT_RELATED_REPLY)
- goto out;
-
- protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum,
- &frag_off);
- if (protoff < 0 || (frag_off & htons(~0x7)) != 0) {
- pr_debug("proto header not found\n");
- goto out;
- }
-
- /* adjust seqs for loopback traffic only in outgoing direction */
- if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
- !nf_is_loopback_packet(skb)) {
- if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) {
- NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
- return NF_DROP;
- }
- }
-out:
- /* We've seen it coming out the other side: confirm it */
- return nf_conntrack_confirm(skb);
-}
-
-static unsigned int ipv6_conntrack_in(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return nf_conntrack_in(state->net, PF_INET6, state->hook, skb);
-}
-
-static unsigned int ipv6_conntrack_local(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return nf_conntrack_in(state->net, PF_INET6, state->hook, skb);
-}
-
-static const struct nf_hook_ops ipv6_conntrack_ops[] = {
- {
- .hook = ipv6_conntrack_in,
- .pf = NFPROTO_IPV6,
- .hooknum = NF_INET_PRE_ROUTING,
- .priority = NF_IP6_PRI_CONNTRACK,
- },
- {
- .hook = ipv6_conntrack_local,
- .pf = NFPROTO_IPV6,
- .hooknum = NF_INET_LOCAL_OUT,
- .priority = NF_IP6_PRI_CONNTRACK,
- },
- {
- .hook = ipv6_helper,
- .pf = NFPROTO_IPV6,
- .hooknum = NF_INET_POST_ROUTING,
- .priority = NF_IP6_PRI_CONNTRACK_HELPER,
- },
- {
- .hook = ipv6_confirm,
- .pf = NFPROTO_IPV6,
- .hooknum = NF_INET_POST_ROUTING,
- .priority = NF_IP6_PRI_LAST,
- },
- {
- .hook = ipv6_helper,
- .pf = NFPROTO_IPV6,
- .hooknum = NF_INET_LOCAL_IN,
- .priority = NF_IP6_PRI_CONNTRACK_HELPER,
- },
- {
- .hook = ipv6_confirm,
- .pf = NFPROTO_IPV6,
- .hooknum = NF_INET_LOCAL_IN,
- .priority = NF_IP6_PRI_LAST-1,
- },
-};
-
-static int
-ipv6_getorigdst(struct sock *sk, int optval, void __user *user, int *len)
-{
- struct nf_conntrack_tuple tuple = { .src.l3num = NFPROTO_IPV6 };
- const struct ipv6_pinfo *inet6 = inet6_sk(sk);
- const struct inet_sock *inet = inet_sk(sk);
- const struct nf_conntrack_tuple_hash *h;
- struct sockaddr_in6 sin6;
- struct nf_conn *ct;
- __be32 flow_label;
- int bound_dev_if;
-
- lock_sock(sk);
- tuple.src.u3.in6 = sk->sk_v6_rcv_saddr;
- tuple.src.u.tcp.port = inet->inet_sport;
- tuple.dst.u3.in6 = sk->sk_v6_daddr;
- tuple.dst.u.tcp.port = inet->inet_dport;
- tuple.dst.protonum = sk->sk_protocol;
- bound_dev_if = sk->sk_bound_dev_if;
- flow_label = inet6->flow_label;
- release_sock(sk);
-
- if (tuple.dst.protonum != IPPROTO_TCP &&
- tuple.dst.protonum != IPPROTO_SCTP)
- return -ENOPROTOOPT;
-
- if (*len < 0 || (unsigned int) *len < sizeof(sin6))
- return -EINVAL;
-
- h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple);
- if (!h) {
- pr_debug("IP6T_SO_ORIGINAL_DST: Can't find %pI6c/%u-%pI6c/%u.\n",
- &tuple.src.u3.ip6, ntohs(tuple.src.u.tcp.port),
- &tuple.dst.u3.ip6, ntohs(tuple.dst.u.tcp.port));
- return -ENOENT;
- }
-
- ct = nf_ct_tuplehash_to_ctrack(h);
-
- sin6.sin6_family = AF_INET6;
- sin6.sin6_port = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.tcp.port;
- sin6.sin6_flowinfo = flow_label & IPV6_FLOWINFO_MASK;
- memcpy(&sin6.sin6_addr,
- &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.in6,
- sizeof(sin6.sin6_addr));
-
- nf_ct_put(ct);
- sin6.sin6_scope_id = ipv6_iface_scope_id(&sin6.sin6_addr, bound_dev_if);
- return copy_to_user(user, &sin6, sizeof(sin6)) ? -EFAULT : 0;
-}
-
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
-
-#include <linux/netfilter/nfnetlink.h>
-#include <linux/netfilter/nfnetlink_conntrack.h>
-
-static int ipv6_tuple_to_nlattr(struct sk_buff *skb,
- const struct nf_conntrack_tuple *tuple)
-{
- if (nla_put_in6_addr(skb, CTA_IP_V6_SRC, &tuple->src.u3.in6) ||
- nla_put_in6_addr(skb, CTA_IP_V6_DST, &tuple->dst.u3.in6))
- goto nla_put_failure;
- return 0;
-
-nla_put_failure:
- return -1;
-}
-
-static const struct nla_policy ipv6_nla_policy[CTA_IP_MAX+1] = {
- [CTA_IP_V6_SRC] = { .len = sizeof(u_int32_t)*4 },
- [CTA_IP_V6_DST] = { .len = sizeof(u_int32_t)*4 },
-};
-
-static int ipv6_nlattr_to_tuple(struct nlattr *tb[],
- struct nf_conntrack_tuple *t)
-{
- if (!tb[CTA_IP_V6_SRC] || !tb[CTA_IP_V6_DST])
- return -EINVAL;
-
- t->src.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_SRC]);
- t->dst.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_DST]);
-
- return 0;
-}
-#endif
-
-static int ipv6_hooks_register(struct net *net)
-{
- struct conntrack6_net *cnet = net_generic(net, conntrack6_net_id);
- int err = 0;
-
- mutex_lock(&register_ipv6_hooks);
- cnet->users++;
- if (cnet->users > 1)
- goto out_unlock;
-
- err = nf_defrag_ipv6_enable(net);
- if (err < 0) {
- cnet->users = 0;
- goto out_unlock;
- }
-
- err = nf_register_net_hooks(net, ipv6_conntrack_ops,
- ARRAY_SIZE(ipv6_conntrack_ops));
- if (err)
- cnet->users = 0;
- out_unlock:
- mutex_unlock(&register_ipv6_hooks);
- return err;
-}
-
-static void ipv6_hooks_unregister(struct net *net)
-{
- struct conntrack6_net *cnet = net_generic(net, conntrack6_net_id);
-
- mutex_lock(&register_ipv6_hooks);
- if (cnet->users && (--cnet->users == 0))
- nf_unregister_net_hooks(net, ipv6_conntrack_ops,
- ARRAY_SIZE(ipv6_conntrack_ops));
- mutex_unlock(&register_ipv6_hooks);
-}
-
-const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 = {
- .l3proto = PF_INET6,
- .pkt_to_tuple = ipv6_pkt_to_tuple,
- .invert_tuple = ipv6_invert_tuple,
- .get_l4proto = ipv6_get_l4proto,
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
- .tuple_to_nlattr = ipv6_tuple_to_nlattr,
- .nlattr_to_tuple = ipv6_nlattr_to_tuple,
- .nla_policy = ipv6_nla_policy,
- .nla_size = NLA_ALIGN(NLA_HDRLEN + sizeof(u32[4])) +
- NLA_ALIGN(NLA_HDRLEN + sizeof(u32[4])),
-#endif
- .net_ns_get = ipv6_hooks_register,
- .net_ns_put = ipv6_hooks_unregister,
- .me = THIS_MODULE,
-};
-
-MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET6));
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Yasuyuki KOZAKAI @USAGI <yasuyuki.kozakai@toshiba.co.jp>");
-
-static struct nf_sockopt_ops so_getorigdst6 = {
- .pf = NFPROTO_IPV6,
- .get_optmin = IP6T_SO_ORIGINAL_DST,
- .get_optmax = IP6T_SO_ORIGINAL_DST + 1,
- .get = ipv6_getorigdst,
- .owner = THIS_MODULE,
-};
-
-static const struct nf_conntrack_l4proto * const builtin_l4proto6[] = {
- &nf_conntrack_l4proto_tcp6,
- &nf_conntrack_l4proto_udp6,
- &nf_conntrack_l4proto_icmpv6,
-#ifdef CONFIG_NF_CT_PROTO_DCCP
- &nf_conntrack_l4proto_dccp6,
-#endif
-#ifdef CONFIG_NF_CT_PROTO_SCTP
- &nf_conntrack_l4proto_sctp6,
-#endif
-#ifdef CONFIG_NF_CT_PROTO_UDPLITE
- &nf_conntrack_l4proto_udplite6,
-#endif
-};
-
-static int ipv6_net_init(struct net *net)
-{
- return nf_ct_l4proto_pernet_register(net, builtin_l4proto6,
- ARRAY_SIZE(builtin_l4proto6));
-}
-
-static void ipv6_net_exit(struct net *net)
-{
- nf_ct_l4proto_pernet_unregister(net, builtin_l4proto6,
- ARRAY_SIZE(builtin_l4proto6));
-}
-
-static struct pernet_operations ipv6_net_ops = {
- .init = ipv6_net_init,
- .exit = ipv6_net_exit,
- .id = &conntrack6_net_id,
- .size = sizeof(struct conntrack6_net),
-};
-
-static int __init nf_conntrack_l3proto_ipv6_init(void)
-{
- int ret = 0;
-
- need_conntrack();
-
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
- if (WARN_ON(nla_policy_len(ipv6_nla_policy, CTA_IP_MAX + 1) !=
- nf_conntrack_l3proto_ipv6.nla_size))
- return -EINVAL;
-#endif
-
- ret = nf_register_sockopt(&so_getorigdst6);
- if (ret < 0) {
- pr_err("Unable to register netfilter socket option\n");
- return ret;
- }
-
- ret = register_pernet_subsys(&ipv6_net_ops);
- if (ret < 0)
- goto cleanup_sockopt;
-
- ret = nf_ct_l4proto_register(builtin_l4proto6,
- ARRAY_SIZE(builtin_l4proto6));
- if (ret < 0)
- goto cleanup_pernet;
-
- ret = nf_ct_l3proto_register(&nf_conntrack_l3proto_ipv6);
- if (ret < 0) {
- pr_err("nf_conntrack_ipv6: can't register ipv6 proto.\n");
- goto cleanup_l4proto;
- }
- return ret;
-cleanup_l4proto:
- nf_ct_l4proto_unregister(builtin_l4proto6,
- ARRAY_SIZE(builtin_l4proto6));
- cleanup_pernet:
- unregister_pernet_subsys(&ipv6_net_ops);
- cleanup_sockopt:
- nf_unregister_sockopt(&so_getorigdst6);
- return ret;
-}
-
-static void __exit nf_conntrack_l3proto_ipv6_fini(void)
-{
- synchronize_net();
- nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv6);
- nf_ct_l4proto_unregister(builtin_l4proto6,
- ARRAY_SIZE(builtin_l4proto6));
- unregister_pernet_subsys(&ipv6_net_ops);
- nf_unregister_sockopt(&so_getorigdst6);
-}
-
-module_init(nf_conntrack_l3proto_ipv6_init);
-module_exit(nf_conntrack_l3proto_ipv6_fini);
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index e4d9e6976d3c..b8ac369f98ad 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -33,9 +33,8 @@
#include <net/sock.h>
#include <net/snmp.h>
-#include <net/inet_frag.h>
+#include <net/ipv6_frag.h>
-#include <net/ipv6.h>
#include <net/protocol.h>
#include <net/transp_v6.h>
#include <net/rawv6.h>
@@ -151,7 +150,7 @@ static void nf_ct_frag6_expire(struct timer_list *t)
fq = container_of(frag, struct frag_queue, q);
net = container_of(fq->q.net, struct net, nf_frag.frags);
- ip6_expire_frag_queue(net, fq);
+ ip6frag_expire_frag_queue(net, fq);
}
/* Creation primitives. */
@@ -446,11 +445,12 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_devic
else if (head->ip_summed == CHECKSUM_COMPLETE)
head->csum = csum_add(head->csum, fp->csum);
head->truesize += fp->truesize;
+ fp->sk = NULL;
}
sub_frag_mem_limit(fq->q.net, head->truesize);
head->ignore_df = 1;
- head->next = NULL;
+ skb_mark_not_on_list(head);
head->dev = dev;
head->tstamp = fq->q.stamp;
ipv6_hdr(head)->payload_len = htons(payload_len);
@@ -464,6 +464,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_devic
head->csum);
fq->q.fragments = NULL;
+ fq->q.rb_fragments = RB_ROOT;
fq->q.fragments_tail = NULL;
return true;
@@ -558,6 +559,10 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
hdr = ipv6_hdr(skb);
fhdr = (struct frag_hdr *)skb_transport_header(skb);
+ if (skb->len - skb_network_offset(skb) < IPV6_MIN_MTU &&
+ fhdr->frag_off & htons(IP6_MF))
+ return -EINVAL;
+
skb_orphan(skb);
fq = fq_find(net, fhdr->identification, user, hdr,
skb->dev ? skb->dev->ifindex : 0);
@@ -624,16 +629,24 @@ static struct pernet_operations nf_ct_net_ops = {
.exit = nf_ct_net_exit,
};
+static const struct rhashtable_params nfct_rhash_params = {
+ .head_offset = offsetof(struct inet_frag_queue, node),
+ .hashfn = ip6frag_key_hashfn,
+ .obj_hashfn = ip6frag_obj_hashfn,
+ .obj_cmpfn = ip6frag_obj_cmpfn,
+ .automatic_shrinking = true,
+};
+
int nf_ct_frag6_init(void)
{
int ret = 0;
- nf_frags.constructor = ip6_frag_init;
+ nf_frags.constructor = ip6frag_init;
nf_frags.destructor = NULL;
nf_frags.qsize = sizeof(struct frag_queue);
nf_frags.frag_expire = nf_ct_frag6_expire;
nf_frags.frags_cache_name = nf_frags_cache_name;
- nf_frags.rhash_params = ip6_rhash_params;
+ nf_frags.rhash_params = nfct_rhash_params;
ret = inet_frags_init(&nf_frags);
if (ret)
goto out;
diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
index c87b48359e8f..72dd3e202375 100644
--- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
+++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
@@ -14,8 +14,7 @@
#include <linux/skbuff.h>
#include <linux/icmp.h>
#include <linux/sysctl.h>
-#include <net/ipv6.h>
-#include <net/inet_frag.h>
+#include <net/ipv6_frag.h>
#include <linux/netfilter_ipv6.h>
#include <linux/netfilter_bridge.h>
@@ -23,7 +22,6 @@
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
-#include <net/netfilter/nf_conntrack_l3proto.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
#endif
diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c
index b397a8fe88b9..c6bf580d0f33 100644
--- a/net/ipv6/netfilter/nf_log_ipv6.c
+++ b/net/ipv6/netfilter/nf_log_ipv6.c
@@ -36,7 +36,7 @@ static const struct nf_loginfo default_loginfo = {
};
/* One level of recursion won't kill us */
-static void dump_ipv6_packet(struct nf_log_buf *m,
+static void dump_ipv6_packet(struct net *net, struct nf_log_buf *m,
const struct nf_loginfo *info,
const struct sk_buff *skb, unsigned int ip6hoff,
int recurse)
@@ -258,7 +258,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m,
/* Max length: 3+maxlen */
if (recurse) {
nf_log_buf_add(m, "[");
- dump_ipv6_packet(m, info, skb,
+ dump_ipv6_packet(net, m, info, skb,
ptr + sizeof(_icmp6h), 0);
nf_log_buf_add(m, "] ");
}
@@ -278,7 +278,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m,
/* Max length: 15 "UID=4294967295 " */
if ((logflags & NF_LOG_UID) && recurse)
- nf_log_dump_sk_uid_gid(m, skb->sk);
+ nf_log_dump_sk_uid_gid(net, m, skb->sk);
/* Max length: 16 "MARK=0xFFFFFFFF " */
if (recurse && skb->mark)
@@ -365,7 +365,7 @@ static void nf_log_ip6_packet(struct net *net, u_int8_t pf,
if (in != NULL)
dump_ipv6_mac_header(m, loginfo, skb);
- dump_ipv6_packet(m, loginfo, skb, skb_network_offset(skb), 1);
+ dump_ipv6_packet(net, m, loginfo, skb, skb_network_offset(skb), 1);
nf_log_buf_close(m);
}
diff --git a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c
index e6eb7cf9b54f..3e4bf2286abe 100644
--- a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c
+++ b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c
@@ -87,18 +87,30 @@ static struct notifier_block masq_dev_notifier = {
struct masq_dev_work {
struct work_struct work;
struct net *net;
+ struct in6_addr addr;
int ifindex;
};
+static int inet_cmp(struct nf_conn *ct, void *work)
+{
+ struct masq_dev_work *w = (struct masq_dev_work *)work;
+ struct nf_conntrack_tuple *tuple;
+
+ if (!device_cmp(ct, (void *)(long)w->ifindex))
+ return 0;
+
+ tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+
+ return ipv6_addr_equal(&w->addr, &tuple->dst.u3.in6);
+}
+
static void iterate_cleanup_work(struct work_struct *work)
{
struct masq_dev_work *w;
- long index;
w = container_of(work, struct masq_dev_work, work);
- index = w->ifindex;
- nf_ct_iterate_cleanup_net(w->net, device_cmp, (void *)index, 0, 0);
+ nf_ct_iterate_cleanup_net(w->net, inet_cmp, (void *)w, 0, 0);
put_net(w->net);
kfree(w);
@@ -147,6 +159,7 @@ static int masq_inet_event(struct notifier_block *this,
INIT_WORK(&w->work, iterate_cleanup_work);
w->ifindex = dev->ifindex;
w->net = net;
+ w->addr = ifa->addr;
schedule_work(&w->work);
return NOTIFY_DONE;
diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
index 96f56bf49a30..4c04bccc7417 100644
--- a/net/ipv6/ping.c
+++ b/net/ipv6/ping.c
@@ -62,7 +62,6 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
struct dst_entry *dst;
struct rt6_info *rt;
struct pingfakehdr pfh;
- struct sockcm_cookie junk = {0};
struct ipcm6_cookie ipc6;
pr_debug("ping_v6_sendmsg(sk=%p,sk->num=%u)\n", inet, inet->inet_num);
@@ -119,7 +118,7 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
fl6.fl6_icmp_code = user_icmph.icmp6_code;
security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
- ipc6.tclass = np->tclass;
+ ipcm6_init_sk(&ipc6, np);
fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel);
dst = ip6_sk_dst_lookup_flow(sk, &fl6, daddr, false);
@@ -142,13 +141,11 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
pfh.family = AF_INET6;
ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
- ipc6.dontfrag = np->dontfrag;
- ipc6.opt = NULL;
lock_sock(sk);
err = ip6_append_data(sk, ping_getfrag, &pfh, len,
0, &ipc6, &fl6, rt,
- MSG_DONTWAIT, &junk);
+ MSG_DONTWAIT);
if (err) {
ICMP6_INC_STATS(sock_net(sk), rt->rt6i_idev,
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index afc307c89d1a..5e0efd3954e9 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -620,7 +620,7 @@ out:
static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length,
struct flowi6 *fl6, struct dst_entry **dstp,
- unsigned int flags)
+ unsigned int flags, const struct sockcm_cookie *sockc)
{
struct ipv6_pinfo *np = inet6_sk(sk);
struct net *net = sock_net(sk);
@@ -650,8 +650,7 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length,
skb->protocol = htons(ETH_P_IPV6);
skb->priority = sk->sk_priority;
skb->mark = sk->sk_mark;
- skb_dst_set(skb, &rt->dst);
- *dstp = NULL;
+ skb->tstamp = sockc->transmit_time;
skb_put(skb, length);
skb_reset_network_header(skb);
@@ -664,8 +663,14 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length,
skb->transport_header = skb->network_header;
err = memcpy_from_msg(iph, msg, length);
- if (err)
- goto error_fault;
+ if (err) {
+ err = -EFAULT;
+ kfree_skb(skb);
+ goto error;
+ }
+
+ skb_dst_set(skb, &rt->dst);
+ *dstp = NULL;
/* if egress device is enslaved to an L3 master device pass the
* skb to its handler for processing
@@ -674,21 +679,28 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length,
if (unlikely(!skb))
return 0;
+ /* Acquire rcu_read_lock() in case we need to use rt->rt6i_idev
+ * in the error path. Since skb has been freed, the dst could
+ * have been queued for deletion.
+ */
+ rcu_read_lock();
IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb,
NULL, rt->dst.dev, dst_output);
if (err > 0)
err = net_xmit_errno(err);
- if (err)
- goto error;
+ if (err) {
+ IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
+ rcu_read_unlock();
+ goto error_check;
+ }
+ rcu_read_unlock();
out:
return 0;
-error_fault:
- err = -EFAULT;
- kfree_skb(skb);
error:
IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
+error_check:
if (err == -ENOBUFS && !np->recverr)
err = 0;
return err;
@@ -766,7 +778,6 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
struct dst_entry *dst = NULL;
struct raw6_frag_vec rfv;
struct flowi6 fl6;
- struct sockcm_cookie sockc;
struct ipcm6_cookie ipc6;
int addr_len = msg->msg_namelen;
u16 proto;
@@ -790,10 +801,8 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
fl6.flowi6_mark = sk->sk_mark;
fl6.flowi6_uid = sk->sk_uid;
- ipc6.hlimit = -1;
- ipc6.tclass = -1;
- ipc6.dontfrag = -1;
- ipc6.opt = NULL;
+ ipcm6_init(&ipc6);
+ ipc6.sockc.tsflags = sk->sk_tsflags;
if (sin6) {
if (addr_len < SIN6_LEN_RFC2133)
@@ -847,14 +856,13 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if (fl6.flowi6_oif == 0)
fl6.flowi6_oif = sk->sk_bound_dev_if;
- sockc.tsflags = sk->sk_tsflags;
if (msg->msg_controllen) {
opt = &opt_space;
memset(opt, 0, sizeof(struct ipv6_txoptions));
opt->tot_len = sizeof(struct ipv6_txoptions);
ipc6.opt = opt;
- err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6, &sockc);
+ err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6);
if (err < 0) {
fl6_sock_release(flowlabel);
return err;
@@ -921,13 +929,14 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
back_from_confirm:
if (inet->hdrincl)
- err = rawv6_send_hdrinc(sk, msg, len, &fl6, &dst, msg->msg_flags);
+ err = rawv6_send_hdrinc(sk, msg, len, &fl6, &dst,
+ msg->msg_flags, &ipc6.sockc);
else {
ipc6.opt = opt;
lock_sock(sk);
err = ip6_append_data(sk, raw6_getfrag, &rfv,
len, 0, &ipc6, &fl6, (struct rt6_info *)dst,
- msg->msg_flags, &sockc);
+ msg->msg_flags);
if (err)
ip6_flush_pending_frames(sk);
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index b939b94e7e91..5c3c92713096 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -57,7 +57,7 @@
#include <net/rawv6.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
-#include <net/inet_frag.h>
+#include <net/ipv6_frag.h>
#include <net/inet_ecn.h>
static const char ip6_frag_cache_name[] = "ip6-frags";
@@ -72,61 +72,6 @@ static struct inet_frags ip6_frags;
static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
struct net_device *dev);
-void ip6_frag_init(struct inet_frag_queue *q, const void *a)
-{
- struct frag_queue *fq = container_of(q, struct frag_queue, q);
- const struct frag_v6_compare_key *key = a;
-
- q->key.v6 = *key;
- fq->ecn = 0;
-}
-EXPORT_SYMBOL(ip6_frag_init);
-
-void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq)
-{
- struct net_device *dev = NULL;
- struct sk_buff *head;
-
- rcu_read_lock();
- spin_lock(&fq->q.lock);
-
- if (fq->q.flags & INET_FRAG_COMPLETE)
- goto out;
-
- inet_frag_kill(&fq->q);
-
- dev = dev_get_by_index_rcu(net, fq->iif);
- if (!dev)
- goto out;
-
- __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS);
- __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT);
-
- /* Don't send error if the first segment did not arrive. */
- head = fq->q.fragments;
- if (!(fq->q.flags & INET_FRAG_FIRST_IN) || !head)
- goto out;
-
- /* But use as source device on which LAST ARRIVED
- * segment was received. And do not use fq->dev
- * pointer directly, device might already disappeared.
- */
- head->dev = dev;
- skb_get(head);
- spin_unlock(&fq->q.lock);
-
- icmpv6_send(head, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0);
- kfree_skb(head);
- goto out_rcu_unlock;
-
-out:
- spin_unlock(&fq->q.lock);
-out_rcu_unlock:
- rcu_read_unlock();
- inet_frag_put(&fq->q);
-}
-EXPORT_SYMBOL(ip6_expire_frag_queue);
-
static void ip6_frag_expire(struct timer_list *t)
{
struct inet_frag_queue *frag = from_timer(frag, t, timer);
@@ -136,7 +81,7 @@ static void ip6_frag_expire(struct timer_list *t)
fq = container_of(frag, struct frag_queue, q);
net = container_of(fq->q.net, struct net, ipv6.frags);
- ip6_expire_frag_queue(net, fq);
+ ip6frag_expire_frag_queue(net, fq);
}
static struct frag_queue *
@@ -200,7 +145,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
*/
if (end < fq->q.len ||
((fq->q.flags & INET_FRAG_LAST_IN) && end != fq->q.len))
- goto err;
+ goto discard_fq;
fq->q.flags |= INET_FRAG_LAST_IN;
fq->q.len = end;
} else {
@@ -217,20 +162,20 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
if (end > fq->q.len) {
/* Some bits beyond end -> corruption. */
if (fq->q.flags & INET_FRAG_LAST_IN)
- goto err;
+ goto discard_fq;
fq->q.len = end;
}
}
if (end == offset)
- goto err;
+ goto discard_fq;
/* Point into the IP datagram 'data' part. */
if (!pskb_pull(skb, (u8 *) (fhdr + 1) - skb->data))
- goto err;
+ goto discard_fq;
if (pskb_trim_rcsum(skb, end - offset))
- goto err;
+ goto discard_fq;
/* Find out which fragments are in front and at the back of us
* in the chain of fragments so far. We must know where to put
@@ -443,7 +388,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
}
sub_frag_mem_limit(fq->q.net, sum_truesize);
- head->next = NULL;
+ skb_mark_not_on_list(head);
head->dev = dev;
head->tstamp = fq->q.stamp;
ipv6_hdr(head)->payload_len = htons(payload_len);
@@ -460,6 +405,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
__IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS);
rcu_read_unlock();
fq->q.fragments = NULL;
+ fq->q.rb_fragments = RB_ROOT;
fq->q.fragments_tail = NULL;
return 1;
@@ -472,6 +418,7 @@ out_fail:
rcu_read_lock();
__IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS);
rcu_read_unlock();
+ inet_frag_kill(&fq->q);
return -1;
}
@@ -510,6 +457,10 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
return 1;
}
+ if (skb->len - skb_network_offset(skb) < IPV6_MIN_MTU &&
+ fhdr->frag_off & htons(IP6_MF))
+ goto fail_hdr;
+
iif = skb->dev ? skb->dev->ifindex : 0;
fq = fq_find(net, fhdr->identification, hdr, iif);
if (fq) {
@@ -603,7 +554,6 @@ static int __net_init ip6_frags_ns_sysctl_register(struct net *net)
table[0].data = &net->ipv6.frags.high_thresh;
table[0].extra1 = &net->ipv6.frags.low_thresh;
- table[0].extra2 = &init_net.ipv6.frags.high_thresh;
table[1].data = &net->ipv6.frags.low_thresh;
table[1].extra2 = &net->ipv6.frags.high_thresh;
table[2].data = &net->ipv6.frags.timeout;
@@ -696,42 +646,19 @@ static struct pernet_operations ip6_frags_ops = {
.exit = ipv6_frags_exit_net,
};
-static u32 ip6_key_hashfn(const void *data, u32 len, u32 seed)
-{
- return jhash2(data,
- sizeof(struct frag_v6_compare_key) / sizeof(u32), seed);
-}
-
-static u32 ip6_obj_hashfn(const void *data, u32 len, u32 seed)
-{
- const struct inet_frag_queue *fq = data;
-
- return jhash2((const u32 *)&fq->key.v6,
- sizeof(struct frag_v6_compare_key) / sizeof(u32), seed);
-}
-
-static int ip6_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr)
-{
- const struct frag_v6_compare_key *key = arg->key;
- const struct inet_frag_queue *fq = ptr;
-
- return !!memcmp(&fq->key, key, sizeof(*key));
-}
-
-const struct rhashtable_params ip6_rhash_params = {
+static const struct rhashtable_params ip6_rhash_params = {
.head_offset = offsetof(struct inet_frag_queue, node),
- .hashfn = ip6_key_hashfn,
- .obj_hashfn = ip6_obj_hashfn,
- .obj_cmpfn = ip6_obj_cmpfn,
+ .hashfn = ip6frag_key_hashfn,
+ .obj_hashfn = ip6frag_obj_hashfn,
+ .obj_cmpfn = ip6frag_obj_cmpfn,
.automatic_shrinking = true,
};
-EXPORT_SYMBOL(ip6_rhash_params);
int __init ipv6_frag_init(void)
{
int ret;
- ip6_frags.constructor = ip6_frag_init;
+ ip6_frags.constructor = ip6frag_init;
ip6_frags.destructor = NULL;
ip6_frags.qsize = sizeof(struct frag_queue);
ip6_frags.frag_expire = ip6_frag_expire;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 7208c16302f6..e3226284e480 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -368,7 +368,7 @@ static void ip6_dst_destroy(struct dst_entry *dst)
struct fib6_info *from;
struct inet6_dev *idev;
- dst_destroy_metrics_generic(dst);
+ ip_dst_metrics_put(dst);
rt6_uncached_list_del(rt);
idev = rt->rt6i_idev;
@@ -517,10 +517,11 @@ static void rt6_probe_deferred(struct work_struct *w)
static void rt6_probe(struct fib6_info *rt)
{
- struct __rt6_probe_work *work;
+ struct __rt6_probe_work *work = NULL;
const struct in6_addr *nh_gw;
struct neighbour *neigh;
struct net_device *dev;
+ struct inet6_dev *idev;
/*
* Okay, this does not seem to be appropriate
@@ -536,15 +537,12 @@ static void rt6_probe(struct fib6_info *rt)
nh_gw = &rt->fib6_nh.nh_gw;
dev = rt->fib6_nh.nh_dev;
rcu_read_lock_bh();
+ idev = __in6_dev_get(dev);
neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
if (neigh) {
- struct inet6_dev *idev;
-
if (neigh->nud_state & NUD_VALID)
goto out;
- idev = __in6_dev_get(dev);
- work = NULL;
write_lock(&neigh->lock);
if (!(neigh->nud_state & NUD_VALID) &&
time_after(jiffies,
@@ -554,11 +552,13 @@ static void rt6_probe(struct fib6_info *rt)
__neigh_set_probe_once(neigh);
}
write_unlock(&neigh->lock);
- } else {
+ } else if (time_after(jiffies, rt->last_probe +
+ idev->cnf.rtr_probe_interval)) {
work = kmalloc(sizeof(*work), GFP_ATOMIC);
}
if (work) {
+ rt->last_probe = jiffies;
INIT_WORK(&work->work, rt6_probe_deferred);
work->target = *nh_gw;
dev_hold(dev);
@@ -946,8 +946,6 @@ static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
{
- rt->dst.flags |= fib6_info_dst_flags(ort);
-
if (ort->fib6_flags & RTF_REJECT) {
ip6_rt_init_dst_reject(rt, ort);
return;
@@ -956,7 +954,7 @@ static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
rt->dst.error = 0;
rt->dst.output = ip6_output;
- if (ort->fib6_type == RTN_LOCAL) {
+ if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
rt->dst.input = ip6_input;
} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
rt->dst.input = ip6_mc_input;
@@ -977,7 +975,7 @@ static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
rt->rt6i_flags &= ~RTF_EXPIRES;
rcu_assign_pointer(rt->from, from);
- dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
+ ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
}
/* Caller must already hold reference to @ort */
@@ -995,8 +993,6 @@ static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
#ifdef CONFIG_IPV6_SUBTREES
rt->rt6i_src = ort->fib6_src;
#endif
- rt->rt6i_prefsrc = ort->fib6_prefsrc;
- rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
}
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
@@ -1450,11 +1446,6 @@ static int rt6_insert_exception(struct rt6_info *nrt,
if (ort->fib6_src.plen)
src_key = &nrt->rt6i_src.addr;
#endif
-
- /* Update rt6i_prefsrc as it could be changed
- * in rt6_remove_prefsrc()
- */
- nrt->rt6i_prefsrc = ort->fib6_prefsrc;
/* rt6_mtu_change() might lower mtu on ort.
* Only insert this exception route if its mtu
* is less than ort's mtu value.
@@ -1636,25 +1627,6 @@ static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
rcu_read_unlock();
}
-static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt)
-{
- struct rt6_exception_bucket *bucket;
- struct rt6_exception *rt6_ex;
- int i;
-
- bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
- lockdep_is_held(&rt6_exception_lock));
-
- if (bucket) {
- for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
- hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
- rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
- }
- bucket++;
- }
- }
-}
-
static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
struct rt6_info *rt, int mtu)
{
@@ -2099,7 +2071,8 @@ struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
{
bool any_src;
- if (rt6_need_strict(&fl6->daddr)) {
+ if (ipv6_addr_type(&fl6->daddr) &
+ (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
struct dst_entry *dst;
dst = l3mdev_link_scope_lookup(net, fl6);
@@ -2369,15 +2342,14 @@ void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
{
const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
struct dst_entry *dst;
- struct flowi6 fl6;
-
- memset(&fl6, 0, sizeof(fl6));
- fl6.flowi6_oif = oif;
- fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
- fl6.daddr = iph->daddr;
- fl6.saddr = iph->saddr;
- fl6.flowlabel = ip6_flowinfo(iph);
- fl6.flowi6_uid = uid;
+ struct flowi6 fl6 = {
+ .flowi6_oif = oif,
+ .flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
+ .daddr = iph->daddr,
+ .saddr = iph->saddr,
+ .flowlabel = ip6_flowinfo(iph),
+ .flowi6_uid = uid,
+ };
dst = ip6_route_output(net, NULL, &fl6);
if (!dst->error)
@@ -2528,16 +2500,15 @@ void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
{
const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
struct dst_entry *dst;
- struct flowi6 fl6;
-
- memset(&fl6, 0, sizeof(fl6));
- fl6.flowi6_iif = LOOPBACK_IFINDEX;
- fl6.flowi6_oif = oif;
- fl6.flowi6_mark = mark;
- fl6.daddr = iph->daddr;
- fl6.saddr = iph->saddr;
- fl6.flowlabel = ip6_flowinfo(iph);
- fl6.flowi6_uid = uid;
+ struct flowi6 fl6 = {
+ .flowi6_iif = LOOPBACK_IFINDEX,
+ .flowi6_oif = oif,
+ .flowi6_mark = mark,
+ .daddr = iph->daddr,
+ .saddr = iph->saddr,
+ .flowlabel = ip6_flowinfo(iph),
+ .flowi6_uid = uid,
+ };
dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
rt6_do_redirect(dst, NULL, skb);
@@ -2545,21 +2516,18 @@ void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
}
EXPORT_SYMBOL_GPL(ip6_redirect);
-void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
- u32 mark)
+void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
{
const struct ipv6hdr *iph = ipv6_hdr(skb);
const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
struct dst_entry *dst;
- struct flowi6 fl6;
-
- memset(&fl6, 0, sizeof(fl6));
- fl6.flowi6_iif = LOOPBACK_IFINDEX;
- fl6.flowi6_oif = oif;
- fl6.flowi6_mark = mark;
- fl6.daddr = msg->dest;
- fl6.saddr = iph->daddr;
- fl6.flowi6_uid = sock_net_uid(net, NULL);
+ struct flowi6 fl6 = {
+ .flowi6_iif = LOOPBACK_IFINDEX,
+ .flowi6_oif = oif,
+ .daddr = msg->dest,
+ .saddr = iph->daddr,
+ .flowi6_uid = sock_net_uid(net, NULL),
+ };
dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
rt6_do_redirect(dst, NULL, skb);
@@ -2730,24 +2698,6 @@ out:
return entries > rt_max_size;
}
-static int ip6_convert_metrics(struct net *net, struct fib6_info *rt,
- struct fib6_config *cfg)
-{
- struct dst_metrics *p;
-
- if (!cfg->fc_mx)
- return 0;
-
- p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL);
- if (unlikely(!p))
- return -ENOMEM;
-
- refcount_set(&p->refcnt, 1);
- rt->fib6_metrics = p;
-
- return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics);
-}
-
static struct rt6_info *ip6_nh_lookup_table(struct net *net,
struct fib6_config *cfg,
const struct in6_addr *gw_addr,
@@ -3023,13 +2973,17 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
if (!rt)
goto out;
+ rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len);
+ if (IS_ERR(rt->fib6_metrics)) {
+ err = PTR_ERR(rt->fib6_metrics);
+ /* Do not leave garbage there. */
+ rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
+ goto out;
+ }
+
if (cfg->fc_flags & RTF_ADDRCONF)
rt->dst_nocount = true;
- err = ip6_convert_metrics(net, rt, cfg);
- if (err < 0)
- goto out;
-
if (cfg->fc_flags & RTF_EXPIRES)
fib6_set_expires(rt, jiffies +
clock_t_to_jiffies(cfg->fc_expires));
@@ -3138,8 +3092,6 @@ install_route:
rt->fib6_nh.nh_dev = dev;
rt->fib6_table = table;
- cfg->fc_nlinfo.nl_net = dev_net(dev);
-
if (idev)
in6_dev_put(idev);
@@ -3631,23 +3583,23 @@ static void rtmsg_to_fib6_config(struct net *net,
struct in6_rtmsg *rtmsg,
struct fib6_config *cfg)
{
- memset(cfg, 0, sizeof(*cfg));
-
- cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
- : RT6_TABLE_MAIN;
- cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
- cfg->fc_metric = rtmsg->rtmsg_metric;
- cfg->fc_expires = rtmsg->rtmsg_info;
- cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
- cfg->fc_src_len = rtmsg->rtmsg_src_len;
- cfg->fc_flags = rtmsg->rtmsg_flags;
- cfg->fc_type = rtmsg->rtmsg_type;
+ *cfg = (struct fib6_config){
+ .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
+ : RT6_TABLE_MAIN,
+ .fc_ifindex = rtmsg->rtmsg_ifindex,
+ .fc_metric = rtmsg->rtmsg_metric,
+ .fc_expires = rtmsg->rtmsg_info,
+ .fc_dst_len = rtmsg->rtmsg_dst_len,
+ .fc_src_len = rtmsg->rtmsg_src_len,
+ .fc_flags = rtmsg->rtmsg_flags,
+ .fc_type = rtmsg->rtmsg_type,
- cfg->fc_nlinfo.nl_net = net;
+ .fc_nlinfo.nl_net = net,
- cfg->fc_dst = rtmsg->rtmsg_dst;
- cfg->fc_src = rtmsg->rtmsg_src;
- cfg->fc_gateway = rtmsg->rtmsg_gateway;
+ .fc_dst = rtmsg->rtmsg_dst,
+ .fc_src = rtmsg->rtmsg_src,
+ .fc_gateway = rtmsg->rtmsg_gateway,
+ };
}
int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
@@ -3754,6 +3706,7 @@ struct fib6_info *addrconf_f6i_alloc(struct net *net,
if (!f6i)
return ERR_PTR(-ENOMEM);
+ f6i->fib6_metrics = ip_fib_metrics_init(net, NULL, 0);
f6i->dst_nocount = true;
f6i->dst_host = true;
f6i->fib6_protocol = RTPROT_KERNEL;
@@ -3796,8 +3749,6 @@ static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
spin_lock_bh(&rt6_exception_lock);
/* remove prefsrc entry */
rt->fib6_prefsrc.plen = 0;
- /* need to update cache as well */
- rt6_exceptions_remove_prefsrc(rt);
spin_unlock_bh(&rt6_exception_lock);
}
return 0;
@@ -4075,8 +4026,12 @@ void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
.event = event,
},
};
+ struct net *net = dev_net(dev);
- fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
+ if (net->ipv6.sysctl.skip_notify_on_dev_down)
+ fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
+ else
+ fib6_clean_all(net, fib6_ifdown, &arg);
}
void rt6_disable_ip(struct net_device *dev, unsigned long event)
@@ -4166,20 +4121,25 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
int err;
err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
- NULL);
+ extack);
if (err < 0)
goto errout;
err = -EINVAL;
rtm = nlmsg_data(nlh);
- memset(cfg, 0, sizeof(*cfg));
- cfg->fc_table = rtm->rtm_table;
- cfg->fc_dst_len = rtm->rtm_dst_len;
- cfg->fc_src_len = rtm->rtm_src_len;
- cfg->fc_flags = RTF_UP;
- cfg->fc_protocol = rtm->rtm_protocol;
- cfg->fc_type = rtm->rtm_type;
+ *cfg = (struct fib6_config){
+ .fc_table = rtm->rtm_table,
+ .fc_dst_len = rtm->rtm_dst_len,
+ .fc_src_len = rtm->rtm_src_len,
+ .fc_flags = RTF_UP,
+ .fc_protocol = rtm->rtm_protocol,
+ .fc_type = rtm->rtm_type,
+
+ .fc_nlinfo.portid = NETLINK_CB(skb).portid,
+ .fc_nlinfo.nlh = nlh,
+ .fc_nlinfo.nl_net = sock_net(skb->sk),
+ };
if (rtm->rtm_type == RTN_UNREACHABLE ||
rtm->rtm_type == RTN_BLACKHOLE ||
@@ -4195,10 +4155,6 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
- cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
- cfg->fc_nlinfo.nlh = nlh;
- cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
-
if (tb[RTA_GATEWAY]) {
cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
cfg->fc_flags |= RTF_GATEWAY;
@@ -4317,11 +4273,6 @@ static int ip6_route_info_append(struct net *net,
if (!nh)
return -ENOMEM;
nh->fib6_info = rt;
- err = ip6_convert_metrics(net, rt, r_cfg);
- if (err) {
- kfree(nh);
- return err;
- }
memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
list_add_tail(&nh->next, rt6_nh_list);
@@ -4671,20 +4622,31 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
int iif, int type, u32 portid, u32 seq,
unsigned int flags)
{
- struct rtmsg *rtm;
+ struct rt6_info *rt6 = (struct rt6_info *)dst;
+ struct rt6key *rt6_dst, *rt6_src;
+ u32 *pmetrics, table, rt6_flags;
struct nlmsghdr *nlh;
+ struct rtmsg *rtm;
long expires = 0;
- u32 *pmetrics;
- u32 table;
nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
if (!nlh)
return -EMSGSIZE;
+ if (rt6) {
+ rt6_dst = &rt6->rt6i_dst;
+ rt6_src = &rt6->rt6i_src;
+ rt6_flags = rt6->rt6i_flags;
+ } else {
+ rt6_dst = &rt->fib6_dst;
+ rt6_src = &rt->fib6_src;
+ rt6_flags = rt->fib6_flags;
+ }
+
rtm = nlmsg_data(nlh);
rtm->rtm_family = AF_INET6;
- rtm->rtm_dst_len = rt->fib6_dst.plen;
- rtm->rtm_src_len = rt->fib6_src.plen;
+ rtm->rtm_dst_len = rt6_dst->plen;
+ rtm->rtm_src_len = rt6_src->plen;
rtm->rtm_tos = 0;
if (rt->fib6_table)
table = rt->fib6_table->tb6_id;
@@ -4699,7 +4661,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
rtm->rtm_scope = RT_SCOPE_UNIVERSE;
rtm->rtm_protocol = rt->fib6_protocol;
- if (rt->fib6_flags & RTF_CACHE)
+ if (rt6_flags & RTF_CACHE)
rtm->rtm_flags |= RTM_F_CLONED;
if (dest) {
@@ -4707,7 +4669,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
goto nla_put_failure;
rtm->rtm_dst_len = 128;
} else if (rtm->rtm_dst_len)
- if (nla_put_in6_addr(skb, RTA_DST, &rt->fib6_dst.addr))
+ if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
if (src) {
@@ -4715,12 +4677,12 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
goto nla_put_failure;
rtm->rtm_src_len = 128;
} else if (rtm->rtm_src_len &&
- nla_put_in6_addr(skb, RTA_SRC, &rt->fib6_src.addr))
+ nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
goto nla_put_failure;
#endif
if (iif) {
#ifdef CONFIG_IPV6_MROUTE
- if (ipv6_addr_is_multicast(&rt->fib6_dst.addr)) {
+ if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
int err = ip6mr_get_route(net, skb, rtm, portid);
if (err == 0)
@@ -4755,7 +4717,14 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
/* For multipath routes, walk the siblings list and add
* each as a nexthop within RTA_MULTIPATH.
*/
- if (rt->fib6_nsiblings) {
+ if (rt6) {
+ if (rt6_flags & RTF_GATEWAY &&
+ nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
+ goto nla_put_failure;
+
+ if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
+ goto nla_put_failure;
+ } else if (rt->fib6_nsiblings) {
struct fib6_info *sibling, *next_sibling;
struct nlattr *mp;
@@ -4778,7 +4747,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
goto nla_put_failure;
}
- if (rt->fib6_flags & RTF_EXPIRES) {
+ if (rt6_flags & RTF_EXPIRES) {
expires = dst ? dst->expires : rt->expires;
expires -= jiffies;
}
@@ -4786,7 +4755,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
goto nla_put_failure;
- if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->fib6_flags)))
+ if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
goto nla_put_failure;
@@ -4798,28 +4767,52 @@ nla_put_failure:
return -EMSGSIZE;
}
+static bool fib6_info_uses_dev(const struct fib6_info *f6i,
+ const struct net_device *dev)
+{
+ if (f6i->fib6_nh.nh_dev == dev)
+ return true;
+
+ if (f6i->fib6_nsiblings) {
+ struct fib6_info *sibling, *next_sibling;
+
+ list_for_each_entry_safe(sibling, next_sibling,
+ &f6i->fib6_siblings, fib6_siblings) {
+ if (sibling->fib6_nh.nh_dev == dev)
+ return true;
+ }
+ }
+
+ return false;
+}
+
int rt6_dump_route(struct fib6_info *rt, void *p_arg)
{
struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
+ struct fib_dump_filter *filter = &arg->filter;
+ unsigned int flags = NLM_F_MULTI;
struct net *net = arg->net;
if (rt == net->ipv6.fib6_null_entry)
return 0;
- if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
- struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
-
- /* user wants prefix routes only */
- if (rtm->rtm_flags & RTM_F_PREFIX &&
- !(rt->fib6_flags & RTF_PREFIX_RT)) {
- /* success since this is not a prefix route */
+ if ((filter->flags & RTM_F_PREFIX) &&
+ !(rt->fib6_flags & RTF_PREFIX_RT)) {
+ /* success since this is not a prefix route */
+ return 1;
+ }
+ if (filter->filter_set) {
+ if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
+ (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
+ (filter->protocol && rt->fib6_protocol != filter->protocol)) {
return 1;
}
+ flags |= NLM_F_DUMP_FILTERED;
}
return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
- arg->cb->nlh->nlmsg_seq, NLM_F_MULTI);
+ arg->cb->nlh->nlmsg_seq, flags);
}
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
@@ -4833,7 +4826,7 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
struct rt6_info *rt;
struct sk_buff *skb;
struct rtmsg *rtm;
- struct flowi6 fl6;
+ struct flowi6 fl6 = {};
bool fibmatch;
err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
@@ -4842,7 +4835,6 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
goto errout;
err = -EINVAL;
- memset(&fl6, 0, sizeof(fl6));
rtm = nlmsg_data(nlh);
fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
@@ -5067,7 +5059,10 @@ int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
return 0;
}
-struct ctl_table ipv6_route_table_template[] = {
+static int zero;
+static int one = 1;
+
+static struct ctl_table ipv6_route_table_template[] = {
{
.procname = "flush",
.data = &init_net.ipv6.sysctl.flush_delay,
@@ -5138,6 +5133,15 @@ struct ctl_table ipv6_route_table_template[] = {
.mode = 0644,
.proc_handler = proc_dointvec_ms_jiffies,
},
+ {
+ .procname = "skip_notify_on_dev_down",
+ .data = &init_net.ipv6.sysctl.skip_notify_on_dev_down,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
{ }
};
@@ -5161,6 +5165,7 @@ struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
+ table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;
/* Don't export sysctls to unprivileged users */
if (net->user_ns != &init_user_ns)
@@ -5225,6 +5230,7 @@ static int __net_init ip6_route_net_init(struct net *net)
net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
+ net->ipv6.sysctl.skip_notify_on_dev_down = 0;
net->ipv6.ip6_rt_gc_expire = 30*HZ;
diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c
index 0fdf2a55e746..8d0ba757a46c 100644
--- a/net/ipv6/seg6.c
+++ b/net/ipv6/seg6.c
@@ -17,6 +17,7 @@
#include <linux/net.h>
#include <linux/in6.h>
#include <linux/slab.h>
+#include <linux/rhashtable.h>
#include <net/ipv6.h>
#include <net/protocol.h>
diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c
index 558fe8cc6d43..8546f94f30d4 100644
--- a/net/ipv6/seg6_hmac.c
+++ b/net/ipv6/seg6_hmac.c
@@ -22,6 +22,7 @@
#include <linux/icmpv6.h>
#include <linux/mroute6.h>
#include <linux/slab.h>
+#include <linux/rhashtable.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>
diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c
index cd6e4cab63f6..60325dbfe88b 100644
--- a/net/ipv6/seg6_local.c
+++ b/net/ipv6/seg6_local.c
@@ -459,36 +459,57 @@ drop:
DEFINE_PER_CPU(struct seg6_bpf_srh_state, seg6_bpf_srh_states);
+bool seg6_bpf_has_valid_srh(struct sk_buff *skb)
+{
+ struct seg6_bpf_srh_state *srh_state =
+ this_cpu_ptr(&seg6_bpf_srh_states);
+ struct ipv6_sr_hdr *srh = srh_state->srh;
+
+ if (unlikely(srh == NULL))
+ return false;
+
+ if (unlikely(!srh_state->valid)) {
+ if ((srh_state->hdrlen & 7) != 0)
+ return false;
+
+ srh->hdrlen = (u8)(srh_state->hdrlen >> 3);
+ if (!seg6_validate_srh(srh, (srh->hdrlen + 1) << 3))
+ return false;
+
+ srh_state->valid = true;
+ }
+
+ return true;
+}
+
static int input_action_end_bpf(struct sk_buff *skb,
struct seg6_local_lwt *slwt)
{
struct seg6_bpf_srh_state *srh_state =
this_cpu_ptr(&seg6_bpf_srh_states);
- struct seg6_bpf_srh_state local_srh_state;
struct ipv6_sr_hdr *srh;
- int srhoff = 0;
int ret;
srh = get_and_validate_srh(skb);
- if (!srh)
- goto drop;
+ if (!srh) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
advance_nextseg(srh, &ipv6_hdr(skb)->daddr);
/* preempt_disable is needed to protect the per-CPU buffer srh_state,
* which is also accessed by the bpf_lwt_seg6_* helpers
*/
preempt_disable();
+ srh_state->srh = srh;
srh_state->hdrlen = srh->hdrlen << 3;
- srh_state->valid = 1;
+ srh_state->valid = true;
rcu_read_lock();
bpf_compute_data_pointers(skb);
ret = bpf_prog_run_save_cb(slwt->bpf.prog, skb);
rcu_read_unlock();
- local_srh_state = *srh_state;
- preempt_enable();
-
switch (ret) {
case BPF_OK:
case BPF_REDIRECT:
@@ -500,24 +521,17 @@ static int input_action_end_bpf(struct sk_buff *skb,
goto drop;
}
- if (unlikely((local_srh_state.hdrlen & 7) != 0))
- goto drop;
-
- if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
- goto drop;
- srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
- srh->hdrlen = (u8)(local_srh_state.hdrlen >> 3);
-
- if (!local_srh_state.valid &&
- unlikely(!seg6_validate_srh(srh, (srh->hdrlen + 1) << 3)))
+ if (srh_state->srh && !seg6_bpf_has_valid_srh(skb))
goto drop;
+ preempt_enable();
if (ret != BPF_REDIRECT)
seg6_lookup_nexthop(skb, NULL, 0);
return dst_input(skb);
drop:
+ preempt_enable();
kfree_skb(skb);
return -EINVAL;
}
@@ -637,12 +651,10 @@ static int parse_nla_srh(struct nlattr **attrs, struct seg6_local_lwt *slwt)
if (!seg6_validate_srh(srh, len))
return -EINVAL;
- slwt->srh = kmalloc(len, GFP_KERNEL);
+ slwt->srh = kmemdup(srh, len, GFP_KERNEL);
if (!slwt->srh)
return -ENOMEM;
- memcpy(slwt->srh, srh, len);
-
slwt->headroom += len;
return 0;
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index e9400ffa7875..51c9f75f34b9 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -534,13 +534,13 @@ static int ipip6_err(struct sk_buff *skb, u32 info)
if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
ipv4_update_pmtu(skb, dev_net(skb->dev), info,
- t->parms.link, 0, iph->protocol, 0);
+ t->parms.link, iph->protocol);
err = 0;
goto out;
}
if (type == ICMP_REDIRECT) {
- ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0,
- iph->protocol, 0);
+ ipv4_redirect(skb, dev_net(skb->dev), t->parms.link,
+ iph->protocol);
err = 0;
goto out;
}
diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c
index 278e49cd67d4..e72947c99454 100644
--- a/net/ipv6/tcpv6_offload.c
+++ b/net/ipv6/tcpv6_offload.c
@@ -15,8 +15,8 @@
#include <net/ip6_checksum.h>
#include "ip6_offload.h"
-static struct sk_buff **tcp6_gro_receive(struct sk_buff **head,
- struct sk_buff *skb)
+static struct sk_buff *tcp6_gro_receive(struct list_head *head,
+ struct sk_buff *skb)
{
/* Don't bother verifying checksum if we're going to flush anyway. */
if (!NAPI_GRO_CB(skb)->flush &&
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index e6645cae403e..06d17ff3562f 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -235,6 +235,8 @@ struct sock *__udp6_lib_lookup(struct net *net,
exact_dif, hslot2,
skb);
}
+ if (unlikely(IS_ERR(result)))
+ return NULL;
return result;
}
begin:
@@ -249,6 +251,8 @@ begin:
saddr, sport);
result = reuseport_select_sock(sk, hash, skb,
sizeof(struct udphdr));
+ if (unlikely(IS_ERR(result)))
+ return NULL;
if (result)
return result;
}
@@ -544,7 +548,7 @@ static __inline__ void udpv6_err(struct sk_buff *skb,
__udp6_lib_err(skb, opt, type, code, offset, info, &udp_table);
}
-static DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key);
+DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key);
void udpv6_encap_enable(void)
{
static_branch_enable(&udpv6_encap_needed_key);
@@ -748,6 +752,26 @@ static void udp6_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst)
}
}
+/* wrapper for udp_queue_rcv_skb tacking care of csum conversion and
+ * return code conversion for ip layer consumption
+ */
+static int udp6_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb,
+ struct udphdr *uh)
+{
+ int ret;
+
+ if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk))
+ skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check,
+ ip6_compute_pseudo);
+
+ ret = udpv6_queue_rcv_skb(sk, skb);
+
+ /* a return value > 0 means to resubmit the input */
+ if (ret > 0)
+ return ret;
+ return 0;
+}
+
int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
int proto)
{
@@ -799,13 +823,14 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
if (unlikely(sk->sk_rx_dst != dst))
udp6_sk_rx_dst_set(sk, dst);
- ret = udpv6_queue_rcv_skb(sk, skb);
- sock_put(sk);
+ if (!uh->check && !udp_sk(sk)->no_check6_rx) {
+ sock_put(sk);
+ goto report_csum_error;
+ }
- /* a return value > 0 means to resubmit the input */
- if (ret > 0)
- return ret;
- return 0;
+ ret = udp6_unicast_rcv_skb(sk, skb, uh);
+ sock_put(sk);
+ return ret;
}
/*
@@ -818,30 +843,13 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
/* Unicast */
sk = __udp6_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
if (sk) {
- int ret;
-
- if (!uh->check && !udp_sk(sk)->no_check6_rx) {
- udp6_csum_zero_error(skb);
- goto csum_error;
- }
-
- if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk))
- skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check,
- ip6_compute_pseudo);
-
- ret = udpv6_queue_rcv_skb(sk, skb);
-
- /* a return value > 0 means to resubmit the input */
- if (ret > 0)
- return ret;
-
- return 0;
+ if (!uh->check && !udp_sk(sk)->no_check6_rx)
+ goto report_csum_error;
+ return udp6_unicast_rcv_skb(sk, skb, uh);
}
- if (!uh->check) {
- udp6_csum_zero_error(skb);
- goto csum_error;
- }
+ if (!uh->check)
+ goto report_csum_error;
if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
goto discard;
@@ -862,6 +870,9 @@ short_packet:
ulen, skb->len,
daddr, ntohs(uh->dest));
goto discard;
+
+report_csum_error:
+ udp6_csum_zero_error(skb);
csum_error:
__UDP6_INC_STATS(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE);
discard:
@@ -1141,13 +1152,10 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
int err;
int is_udplite = IS_UDPLITE(sk);
int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
- struct sockcm_cookie sockc;
- ipc6.hlimit = -1;
- ipc6.tclass = -1;
- ipc6.dontfrag = -1;
+ ipcm6_init(&ipc6);
ipc6.gso_size = up->gso_size;
- sockc.tsflags = sk->sk_tsflags;
+ ipc6.sockc.tsflags = sk->sk_tsflags;
/* destination address check */
if (sin6) {
@@ -1282,7 +1290,7 @@ do_udp_sendmsg:
err = udp_cmsg_send(sk, msg, &ipc6.gso_size);
if (err > 0)
err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6,
- &ipc6, &sockc);
+ &ipc6);
if (err < 0) {
fl6_sock_release(flowlabel);
return err;
@@ -1376,7 +1384,7 @@ back_from_confirm:
skb = ip6_make_skb(sk, getfrag, msg, ulen,
sizeof(struct udphdr), &ipc6,
&fl6, (struct rt6_info *)dst,
- msg->msg_flags, &cork, &sockc);
+ msg->msg_flags, &cork);
err = PTR_ERR(skb);
if (!IS_ERR_OR_NULL(skb))
err = udp_v6_send_skb(skb, &fl6, &cork.base);
@@ -1402,7 +1410,7 @@ do_append_data:
up->len += ulen;
err = ip6_append_data(sk, getfrag, msg, ulen, sizeof(struct udphdr),
&ipc6, &fl6, (struct rt6_info *)dst,
- corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags, &sockc);
+ corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
if (err)
udp_v6_flush_pending_frames(sk);
else if (!corkreq)
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 03a2ff3fe1e6..1b8e161ac527 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -114,12 +114,12 @@ out:
return segs;
}
-static struct sk_buff **udp6_gro_receive(struct sk_buff **head,
- struct sk_buff *skb)
+static struct sk_buff *udp6_gro_receive(struct list_head *head,
+ struct sk_buff *skb)
{
struct udphdr *uh = udp_gro_udphdr(skb);
- if (unlikely(!uh))
+ if (unlikely(!uh) || !static_branch_unlikely(&udpv6_encap_needed_key))
goto flush;
/* Don't bother verifying checksum if we're going to flush anyway. */
diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c
index 841f4a07438e..9ef490dddcea 100644
--- a/net/ipv6/xfrm6_input.c
+++ b/net/ipv6/xfrm6_input.c
@@ -59,6 +59,7 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async)
if (xo && (xo->flags & XFRM_GRO)) {
skb_mac_header_rebuild(skb);
+ skb_reset_transport_header(skb);
return -1;
}
diff --git a/net/ipv6/xfrm6_mode_ro.c b/net/ipv6/xfrm6_mode_ro.c
index 07d36573f50b..da28e4407b8f 100644
--- a/net/ipv6/xfrm6_mode_ro.c
+++ b/net/ipv6/xfrm6_mode_ro.c
@@ -55,7 +55,7 @@ static int xfrm6_ro_output(struct xfrm_state *x, struct sk_buff *skb)
__skb_pull(skb, hdr_len);
memmove(ipv6_hdr(skb), iph, hdr_len);
- x->lastused = get_seconds();
+ x->lastused = ktime_get_real_seconds();
return 0;
}
diff --git a/net/ipv6/xfrm6_mode_transport.c b/net/ipv6/xfrm6_mode_transport.c
index 9ad07a91708e..3c29da5defe6 100644
--- a/net/ipv6/xfrm6_mode_transport.c
+++ b/net/ipv6/xfrm6_mode_transport.c
@@ -51,7 +51,6 @@ static int xfrm6_transport_output(struct xfrm_state *x, struct sk_buff *skb)
static int xfrm6_transport_input(struct xfrm_state *x, struct sk_buff *skb)
{
int ihl = skb->data - skb_transport_header(skb);
- struct xfrm_offload *xo = xfrm_offload(skb);
if (skb->transport_header != skb->network_header) {
memmove(skb_transport_header(skb),
@@ -60,8 +59,7 @@ static int xfrm6_transport_input(struct xfrm_state *x, struct sk_buff *skb)
}
ipv6_hdr(skb)->payload_len = htons(skb->len + ihl -
sizeof(struct ipv6hdr));
- if (!xo || !(xo->flags & XFRM_GRO))
- skb_reset_transport_header(skb);
+ skb_reset_transport_header(skb);
return 0;
}
diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c
index 5959ce9620eb..6a74080005cf 100644
--- a/net/ipv6/xfrm6_output.c
+++ b/net/ipv6/xfrm6_output.c
@@ -170,9 +170,11 @@ static int __xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
if (toobig && xfrm6_local_dontfrag(skb)) {
xfrm6_local_rxpmtu(skb, mtu);
+ kfree_skb(skb);
return -EMSGSIZE;
} else if (!skb->ignore_df && toobig && skb->sk) {
xfrm_local_error(skb, mtu);
+ kfree_skb(skb);
return -EMSGSIZE;
}
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index ef3defaf43b9..d35bcf92969c 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -146,8 +146,8 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse)
fl6->daddr = reverse ? hdr->saddr : hdr->daddr;
fl6->saddr = reverse ? hdr->daddr : hdr->saddr;
- while (nh + offset + 1 < skb->data ||
- pskb_may_pull(skb, nh + offset + 1 - skb->data)) {
+ while (nh + offset + sizeof(*exthdr) < skb->data ||
+ pskb_may_pull(skb, nh + offset + sizeof(*exthdr) - skb->data)) {
nh = skb_network_header(skb);
exthdr = (struct ipv6_opt_hdr *)(nh + offset);
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index 893a022f9620..0bed4cc20603 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -48,7 +48,7 @@ static struct iucv_interface *pr_iucv;
static const u8 iprm_shutdown[8] =
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01};
-#define TRGCLS_SIZE (sizeof(((struct iucv_message *)0)->class))
+#define TRGCLS_SIZE FIELD_SIZEOF(struct iucv_message, class)
#define __iucv_sock_wait(sk, condition, timeo, ret) \
do { \
@@ -150,7 +150,6 @@ static int afiucv_pm_freeze(struct device *dev)
{
struct iucv_sock *iucv;
struct sock *sk;
- int err = 0;
#ifdef CONFIG_PM_DEBUG
printk(KERN_WARNING "afiucv_pm_freeze\n");
@@ -175,7 +174,7 @@ static int afiucv_pm_freeze(struct device *dev)
skb_queue_purge(&iucv->backlog_skb_q);
}
read_unlock(&iucv_sk_list.lock);
- return err;
+ return 0;
}
/**
@@ -321,13 +320,9 @@ static int afiucv_hs_send(struct iucv_message *imsg, struct sock *sock,
struct sk_buff *nskb;
int err, confirm_recv = 0;
- memset(skb->head, 0, ETH_HLEN);
- phs_hdr = skb_push(skb, sizeof(struct af_iucv_trans_hdr));
- skb_reset_mac_header(skb);
+ phs_hdr = skb_push(skb, sizeof(*phs_hdr));
+ memset(phs_hdr, 0, sizeof(*phs_hdr));
skb_reset_network_header(skb);
- skb_push(skb, ETH_HLEN);
- skb_reset_mac_header(skb);
- memset(phs_hdr, 0, sizeof(struct af_iucv_trans_hdr));
phs_hdr->magic = ETH_P_AF_IUCV;
phs_hdr->version = 1;
@@ -351,21 +346,32 @@ static int afiucv_hs_send(struct iucv_message *imsg, struct sock *sock,
if (imsg)
memcpy(&phs_hdr->iucv_hdr, imsg, sizeof(struct iucv_message));
+ skb_push(skb, ETH_HLEN);
+ memset(skb->data, 0, ETH_HLEN);
+
skb->dev = iucv->hs_dev;
- if (!skb->dev)
- return -ENODEV;
- if (!(skb->dev->flags & IFF_UP) || !netif_carrier_ok(skb->dev))
- return -ENETDOWN;
+ if (!skb->dev) {
+ err = -ENODEV;
+ goto err_free;
+ }
+ if (!(skb->dev->flags & IFF_UP) || !netif_carrier_ok(skb->dev)) {
+ err = -ENETDOWN;
+ goto err_free;
+ }
if (skb->len > skb->dev->mtu) {
- if (sock->sk_type == SOCK_SEQPACKET)
- return -EMSGSIZE;
- else
- skb_trim(skb, skb->dev->mtu);
+ if (sock->sk_type == SOCK_SEQPACKET) {
+ err = -EMSGSIZE;
+ goto err_free;
+ }
+ skb_trim(skb, skb->dev->mtu);
}
skb->protocol = cpu_to_be16(ETH_P_AF_IUCV);
nskb = skb_clone(skb, GFP_ATOMIC);
- if (!nskb)
- return -ENOMEM;
+ if (!nskb) {
+ err = -ENOMEM;
+ goto err_free;
+ }
+
skb_queue_tail(&iucv->send_skb_q, nskb);
err = dev_queue_xmit(skb);
if (net_xmit_eval(err)) {
@@ -376,6 +382,10 @@ static int afiucv_hs_send(struct iucv_message *imsg, struct sock *sock,
WARN_ON(atomic_read(&iucv->msg_recv) < 0);
}
return net_xmit_eval(err);
+
+err_free:
+ kfree_skb(skb);
+ return err;
}
static struct sock *__iucv_get_sock_by_name(char *nm)
@@ -1168,7 +1178,7 @@ static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg,
err = afiucv_hs_send(&txmsg, sk, skb, 0);
if (err) {
atomic_dec(&iucv->msg_sent);
- goto fail;
+ goto out;
}
} else { /* Classic VM IUCV transport */
skb_queue_tail(&iucv->send_skb_q, skb);
@@ -1494,7 +1504,7 @@ __poll_t iucv_sock_poll(struct file *file, struct socket *sock,
struct sock *sk = sock->sk;
__poll_t mask = 0;
- sock_poll_wait(file, sk_sleep(sk), wait);
+ sock_poll_wait(file, sock, wait);
if (sk->sk_state == IUCV_LISTEN)
return iucv_accept_poll(sk);
@@ -1932,8 +1942,7 @@ static void iucv_callback_shutdown(struct iucv_path *path, u8 ipuser[16])
/***************** HiperSockets transport callbacks ********************/
static void afiucv_swap_src_dest(struct sk_buff *skb)
{
- struct af_iucv_trans_hdr *trans_hdr =
- (struct af_iucv_trans_hdr *)skb->data;
+ struct af_iucv_trans_hdr *trans_hdr = iucv_trans_hdr(skb);
char tmpID[8];
char tmpName[8];
@@ -1956,13 +1965,12 @@ static void afiucv_swap_src_dest(struct sk_buff *skb)
**/
static int afiucv_hs_callback_syn(struct sock *sk, struct sk_buff *skb)
{
+ struct af_iucv_trans_hdr *trans_hdr = iucv_trans_hdr(skb);
struct sock *nsk;
struct iucv_sock *iucv, *niucv;
- struct af_iucv_trans_hdr *trans_hdr;
int err;
iucv = iucv_sk(sk);
- trans_hdr = (struct af_iucv_trans_hdr *)skb->data;
if (!iucv) {
/* no sock - connection refused */
afiucv_swap_src_dest(skb);
@@ -2023,15 +2031,13 @@ out:
static int afiucv_hs_callback_synack(struct sock *sk, struct sk_buff *skb)
{
struct iucv_sock *iucv = iucv_sk(sk);
- struct af_iucv_trans_hdr *trans_hdr =
- (struct af_iucv_trans_hdr *)skb->data;
if (!iucv)
goto out;
if (sk->sk_state != IUCV_BOUND)
goto out;
bh_lock_sock(sk);
- iucv->msglimit_peer = trans_hdr->window;
+ iucv->msglimit_peer = iucv_trans_hdr(skb)->window;
sk->sk_state = IUCV_CONNECTED;
sk->sk_state_change(sk);
bh_unlock_sock(sk);
@@ -2087,8 +2093,6 @@ out:
static int afiucv_hs_callback_win(struct sock *sk, struct sk_buff *skb)
{
struct iucv_sock *iucv = iucv_sk(sk);
- struct af_iucv_trans_hdr *trans_hdr =
- (struct af_iucv_trans_hdr *)skb->data;
if (!iucv)
return NET_RX_SUCCESS;
@@ -2096,7 +2100,7 @@ static int afiucv_hs_callback_win(struct sock *sk, struct sk_buff *skb)
if (sk->sk_state != IUCV_CONNECTED)
return NET_RX_SUCCESS;
- atomic_sub(trans_hdr->window, &iucv->msg_sent);
+ atomic_sub(iucv_trans_hdr(skb)->window, &iucv->msg_sent);
iucv_sock_wake_msglim(sk);
return NET_RX_SUCCESS;
}
@@ -2156,25 +2160,16 @@ static int afiucv_hs_rcv(struct sk_buff *skb, struct net_device *dev,
struct sock *sk;
struct iucv_sock *iucv;
struct af_iucv_trans_hdr *trans_hdr;
+ int err = NET_RX_SUCCESS;
char nullstring[8];
- int err = 0;
- if (skb->len < (ETH_HLEN + sizeof(struct af_iucv_trans_hdr))) {
- WARN_ONCE(1, "AF_IUCV too short skb, len=%d, min=%d",
- (int)skb->len,
- (int)(ETH_HLEN + sizeof(struct af_iucv_trans_hdr)));
+ if (!pskb_may_pull(skb, sizeof(*trans_hdr))) {
+ WARN_ONCE(1, "AF_IUCV failed to receive skb, len=%u", skb->len);
kfree_skb(skb);
return NET_RX_SUCCESS;
}
- if (skb_headlen(skb) < (ETH_HLEN + sizeof(struct af_iucv_trans_hdr)))
- if (skb_linearize(skb)) {
- WARN_ONCE(1, "AF_IUCV skb_linearize failed, len=%d",
- (int)skb->len);
- kfree_skb(skb);
- return NET_RX_SUCCESS;
- }
- skb_pull(skb, ETH_HLEN);
- trans_hdr = (struct af_iucv_trans_hdr *)skb->data;
+
+ trans_hdr = iucv_trans_hdr(skb);
EBCASC(trans_hdr->destAppName, sizeof(trans_hdr->destAppName));
EBCASC(trans_hdr->destUserID, sizeof(trans_hdr->destUserID));
EBCASC(trans_hdr->srcAppName, sizeof(trans_hdr->srcAppName));
@@ -2255,7 +2250,7 @@ static int afiucv_hs_rcv(struct sk_buff *skb, struct net_device *dev,
err = afiucv_hs_callback_rx(sk, skb);
break;
default:
- ;
+ kfree_skb(skb);
}
return err;
@@ -2515,4 +2510,3 @@ MODULE_DESCRIPTION("IUCV Sockets ver " VERSION);
MODULE_VERSION(VERSION);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_IUCV);
-
diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c
index 8f7ef167c45a..eb502c6290c2 100644
--- a/net/iucv/iucv.c
+++ b/net/iucv/iucv.c
@@ -1874,7 +1874,7 @@ static void iucv_pm_complete(struct device *dev)
* Returns 0 if there are still iucv pathes defined
* 1 if there are no iucv pathes defined
*/
-int iucv_path_table_empty(void)
+static int iucv_path_table_empty(void)
{
int i;
diff --git a/net/kcm/Kconfig b/net/kcm/Kconfig
index 87fca36e6c47..9ca83f2ade6f 100644
--- a/net/kcm/Kconfig
+++ b/net/kcm/Kconfig
@@ -8,4 +8,3 @@ config AF_KCM
KCM (Kernel Connection Multiplexor) sockets provide a method
for multiplexing messages of a message based application
protocol over kernel connectons (e.g. TCP connections).
-
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index d3601d421571..571d824e4e24 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -2104,4 +2104,3 @@ module_exit(kcm_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_KCM);
-
diff --git a/net/key/af_key.c b/net/key/af_key.c
index 5e1d2946ffbf..9d61266526e7 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -1383,7 +1383,7 @@ static int pfkey_getspi(struct sock *sk, struct sk_buff *skb, const struct sadb_
}
if (!x)
- x = xfrm_find_acq(net, &dummy_mark, mode, reqid, proto, xdaddr, xsaddr, 1, family);
+ x = xfrm_find_acq(net, &dummy_mark, mode, reqid, 0, proto, xdaddr, xsaddr, 1, family);
if (x == NULL)
return -ENOENT;
@@ -2414,7 +2414,7 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, const struct sa
return err;
}
- xp = xfrm_policy_bysel_ctx(net, DUMMY_MARK, XFRM_POLICY_TYPE_MAIN,
+ xp = xfrm_policy_bysel_ctx(net, DUMMY_MARK, 0, XFRM_POLICY_TYPE_MAIN,
pol->sadb_x_policy_dir - 1, &sel, pol_ctx,
1, &err);
security_xfrm_policy_free(pol_ctx);
@@ -2663,7 +2663,7 @@ static int pfkey_spdget(struct sock *sk, struct sk_buff *skb, const struct sadb_
return -EINVAL;
delete = (hdr->sadb_msg_type == SADB_X_SPDDELETE2);
- xp = xfrm_policy_byid(net, DUMMY_MARK, XFRM_POLICY_TYPE_MAIN,
+ xp = xfrm_policy_byid(net, DUMMY_MARK, 0, XFRM_POLICY_TYPE_MAIN,
dir, pol->sadb_x_policy_id, delete, &err);
if (xp == NULL)
return -ENOENT;
diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index 40261cb68e83..82cdf9020b53 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -203,44 +203,44 @@ struct l2tp_tunnel *l2tp_tunnel_get_nth(const struct net *net, int nth)
}
EXPORT_SYMBOL_GPL(l2tp_tunnel_get_nth);
-/* Lookup a session. A new reference is held on the returned session. */
-struct l2tp_session *l2tp_session_get(const struct net *net,
- struct l2tp_tunnel *tunnel,
- u32 session_id)
+struct l2tp_session *l2tp_tunnel_get_session(struct l2tp_tunnel *tunnel,
+ u32 session_id)
{
struct hlist_head *session_list;
struct l2tp_session *session;
- if (!tunnel) {
- struct l2tp_net *pn = l2tp_pernet(net);
-
- session_list = l2tp_session_id_hash_2(pn, session_id);
+ session_list = l2tp_session_id_hash(tunnel, session_id);
- rcu_read_lock_bh();
- hlist_for_each_entry_rcu(session, session_list, global_hlist) {
- if (session->session_id == session_id) {
- l2tp_session_inc_refcount(session);
- rcu_read_unlock_bh();
+ read_lock_bh(&tunnel->hlist_lock);
+ hlist_for_each_entry(session, session_list, hlist)
+ if (session->session_id == session_id) {
+ l2tp_session_inc_refcount(session);
+ read_unlock_bh(&tunnel->hlist_lock);
- return session;
- }
+ return session;
}
- rcu_read_unlock_bh();
+ read_unlock_bh(&tunnel->hlist_lock);
- return NULL;
- }
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(l2tp_tunnel_get_session);
- session_list = l2tp_session_id_hash(tunnel, session_id);
- read_lock_bh(&tunnel->hlist_lock);
- hlist_for_each_entry(session, session_list, hlist) {
+struct l2tp_session *l2tp_session_get(const struct net *net, u32 session_id)
+{
+ struct hlist_head *session_list;
+ struct l2tp_session *session;
+
+ session_list = l2tp_session_id_hash_2(l2tp_pernet(net), session_id);
+
+ rcu_read_lock_bh();
+ hlist_for_each_entry_rcu(session, session_list, global_hlist)
if (session->session_id == session_id) {
l2tp_session_inc_refcount(session);
- read_unlock_bh(&tunnel->hlist_lock);
+ rcu_read_unlock_bh();
return session;
}
- }
- read_unlock_bh(&tunnel->hlist_lock);
+ rcu_read_unlock_bh();
return NULL;
}
@@ -322,8 +322,7 @@ int l2tp_session_register(struct l2tp_session *session,
if (tunnel->version == L2TP_HDR_VER_3) {
pn = l2tp_pernet(tunnel->l2tp_net);
- g_head = l2tp_session_id_hash_2(l2tp_pernet(tunnel->l2tp_net),
- session->session_id);
+ g_head = l2tp_session_id_hash_2(pn, session->session_id);
spin_lock_bh(&pn->l2tp_session_hlist_lock);
@@ -620,7 +619,7 @@ discard:
*/
void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
unsigned char *ptr, unsigned char *optr, u16 hdrflags,
- int length, int (*payload_hook)(struct sk_buff *skb))
+ int length)
{
struct l2tp_tunnel *tunnel = session->tunnel;
int offset;
@@ -741,13 +740,6 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
__skb_pull(skb, offset);
- /* If caller wants to process the payload before we queue the
- * packet, do so now.
- */
- if (payload_hook)
- if ((*payload_hook)(skb))
- goto discard;
-
/* Prepare skb for adding to the session's reorder_q. Hold
* packets for max reorder_timeout or 1 second if not
* reordering.
@@ -783,7 +775,7 @@ EXPORT_SYMBOL(l2tp_recv_common);
/* Drop skbs from the session's reorder_q
*/
-int l2tp_session_queue_purge(struct l2tp_session *session)
+static int l2tp_session_queue_purge(struct l2tp_session *session)
{
struct sk_buff *skb = NULL;
BUG_ON(!session);
@@ -794,7 +786,6 @@ int l2tp_session_queue_purge(struct l2tp_session *session)
}
return 0;
}
-EXPORT_SYMBOL_GPL(l2tp_session_queue_purge);
/* Internal UDP receive frame. Do the real work of receiving an L2TP data frame
* here. The skb is not on a list when we get here.
@@ -802,8 +793,7 @@ EXPORT_SYMBOL_GPL(l2tp_session_queue_purge);
* Returns 1 if the packet was not a good data packet and could not be
* forwarded. All such packets are passed up to userspace to deal with.
*/
-static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb,
- int (*payload_hook)(struct sk_buff *skb))
+static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb)
{
struct l2tp_session *session = NULL;
unsigned char *ptr, *optr;
@@ -882,7 +872,7 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb,
}
/* Find the session context */
- session = l2tp_session_get(tunnel->l2tp_net, tunnel, session_id);
+ session = l2tp_tunnel_get_session(tunnel, session_id);
if (!session || !session->recv_skb) {
if (session)
l2tp_session_dec_refcount(session);
@@ -894,7 +884,7 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb,
goto error;
}
- l2tp_recv_common(session, skb, ptr, optr, hdrflags, length, payload_hook);
+ l2tp_recv_common(session, skb, ptr, optr, hdrflags, length);
l2tp_session_dec_refcount(session);
return 0;
@@ -923,7 +913,7 @@ int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
l2tp_dbg(tunnel, L2TP_MSG_DATA, "%s: received %d bytes\n",
tunnel->name, skb->len);
- if (l2tp_udp_recv_core(tunnel, skb, tunnel->recv_payload_hook))
+ if (l2tp_udp_recv_core(tunnel, skb))
goto pass_up;
return 0;
@@ -1009,8 +999,8 @@ static int l2tp_build_l2tpv3_header(struct l2tp_session *session, void *buf)
return bufp - optr;
}
-static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb,
- struct flowi *fl, size_t data_len)
+static void l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb,
+ struct flowi *fl, size_t data_len)
{
struct l2tp_tunnel *tunnel = session->tunnel;
unsigned int len = skb->len;
@@ -1052,8 +1042,6 @@ static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb,
atomic_long_inc(&tunnel->stats.tx_errors);
atomic_long_inc(&session->stats.tx_errors);
}
-
- return 0;
}
/* If caller requires the skb to have a ppp header, the header must be
@@ -1110,7 +1098,7 @@ int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len
/* Get routing info from the tunnel socket */
skb_dst_drop(skb);
- skb_dst_set(skb, dst_clone(__sk_dst_check(sk, 0)));
+ skb_dst_set(skb, sk_dst_check(sk, 0));
inet = inet_sk(sk);
fl = &inet->cork.fl;
@@ -1193,7 +1181,7 @@ end:
/* When the tunnel is closed, all the attached sessions need to go too.
*/
-void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel)
+static void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel)
{
int hash;
struct hlist_node *walk;
@@ -1242,7 +1230,6 @@ again:
}
write_unlock_bh(&tunnel->hlist_lock);
}
-EXPORT_SYMBOL_GPL(l2tp_tunnel_closeall);
/* Tunnel socket destroy hook for UDP encapsulation */
static void l2tp_udp_encap_destroy(struct sock *sk)
@@ -1687,8 +1674,6 @@ struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunn
if (cfg) {
session->pwtype = cfg->pw_type;
session->debug = cfg->debug;
- session->mtu = cfg->mtu;
- session->mru = cfg->mru;
session->send_seq = cfg->send_seq;
session->recv_seq = cfg->recv_seq;
session->lns_mode = cfg->lns_mode;
@@ -1800,4 +1785,3 @@ MODULE_AUTHOR("James Chapman <jchapman@katalix.com>");
MODULE_DESCRIPTION("L2TP core");
MODULE_LICENSE("GPL");
MODULE_VERSION(L2TP_DRV_VERSION);
-
diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h
index c199020f8a8a..9c9afe94d389 100644
--- a/net/l2tp/l2tp_core.h
+++ b/net/l2tp/l2tp_core.h
@@ -12,6 +12,13 @@
#ifndef _L2TP_CORE_H_
#define _L2TP_CORE_H_
+#include <net/dst.h>
+#include <net/sock.h>
+
+#ifdef CONFIG_XFRM
+#include <net/xfrm.h>
+#endif
+
/* Just some random numbers */
#define L2TP_TUNNEL_MAGIC 0x42114DDA
#define L2TP_SESSION_MAGIC 0x0C04EB7D
@@ -45,10 +52,6 @@ struct l2tp_tunnel;
*/
struct l2tp_session_cfg {
enum l2tp_pwtype pw_type;
- unsigned int data_seq:2; /* data sequencing level
- * 0 => none, 1 => IP only,
- * 2 => all
- */
unsigned int recv_seq:1; /* expect receive packets with
* sequence numbers? */
unsigned int send_seq:1; /* send packets with sequence
@@ -58,7 +61,6 @@ struct l2tp_session_cfg {
* control of LNS. */
int debug; /* bitmask of debug message
* categories */
- u16 vlan_id; /* VLAN pseudowire only */
u16 l2specific_type; /* Layer 2 specific type */
u8 cookie[8]; /* optional cookie */
int cookie_len; /* 0, 4 or 8 bytes */
@@ -66,8 +68,6 @@ struct l2tp_session_cfg {
int peer_cookie_len; /* 0, 4 or 8 bytes */
int reorder_timeout; /* configured reorder timeout
* (in jiffies) */
- int mtu;
- int mru;
char *ifname;
};
@@ -99,10 +99,6 @@ struct l2tp_session {
char name[32]; /* for logging */
char ifname[IFNAMSIZ];
- unsigned int data_seq:2; /* data sequencing level
- * 0 => none, 1 => IP only,
- * 2 => all
- */
unsigned int recv_seq:1; /* expect receive packets with
* sequence numbers? */
unsigned int send_seq:1; /* send packets with sequence
@@ -115,8 +111,6 @@ struct l2tp_session {
int reorder_timeout; /* configured reorder timeout
* (in jiffies) */
int reorder_skip; /* set if skip to next nr */
- int mtu;
- int mru;
enum l2tp_pwtype pwtype;
struct l2tp_stats stats;
struct hlist_node global_hlist; /* Global hash list node */
@@ -124,9 +118,7 @@ struct l2tp_session {
int (*build_header)(struct l2tp_session *session, void *buf);
void (*recv_skb)(struct l2tp_session *session, struct sk_buff *skb, int data_len);
void (*session_close)(struct l2tp_session *session);
-#if IS_ENABLED(CONFIG_L2TP_DEBUGFS)
void (*show)(struct seq_file *m, void *priv);
-#endif
uint8_t priv[0]; /* private data */
};
@@ -180,18 +172,12 @@ struct l2tp_tunnel {
struct net *l2tp_net; /* the net we belong to */
refcount_t ref_count;
-#ifdef CONFIG_DEBUG_FS
- void (*show)(struct seq_file *m, void *arg);
-#endif
- int (*recv_payload_hook)(struct sk_buff *skb);
void (*old_sk_destruct)(struct sock *);
struct sock *sock; /* Parent socket */
int fd; /* Parent fd, if tunnel socket
* was created by userspace */
struct work_struct del_work;
-
- uint8_t priv[0]; /* private data */
};
struct l2tp_nl_cmd_ops {
@@ -201,11 +187,6 @@ struct l2tp_nl_cmd_ops {
int (*session_delete)(struct l2tp_session *session);
};
-static inline void *l2tp_tunnel_priv(struct l2tp_tunnel *tunnel)
-{
- return &tunnel->priv[0];
-}
-
static inline void *l2tp_session_priv(struct l2tp_session *session)
{
return &session->priv[0];
@@ -213,12 +194,12 @@ static inline void *l2tp_session_priv(struct l2tp_session *session)
struct l2tp_tunnel *l2tp_tunnel_get(const struct net *net, u32 tunnel_id);
struct l2tp_tunnel *l2tp_tunnel_get_nth(const struct net *net, int nth);
+struct l2tp_session *l2tp_tunnel_get_session(struct l2tp_tunnel *tunnel,
+ u32 session_id);
void l2tp_tunnel_free(struct l2tp_tunnel *tunnel);
-struct l2tp_session *l2tp_session_get(const struct net *net,
- struct l2tp_tunnel *tunnel,
- u32 session_id);
+struct l2tp_session *l2tp_session_get(const struct net *net, u32 session_id);
struct l2tp_session *l2tp_session_get_nth(struct l2tp_tunnel *tunnel, int nth);
struct l2tp_session *l2tp_session_get_by_ifname(const struct net *net,
const char *ifname);
@@ -229,7 +210,6 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id,
int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net,
struct l2tp_tunnel_cfg *cfg);
-void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel);
void l2tp_tunnel_delete(struct l2tp_tunnel *tunnel);
struct l2tp_session *l2tp_session_create(int priv_size,
struct l2tp_tunnel *tunnel,
@@ -243,8 +223,7 @@ int l2tp_session_delete(struct l2tp_session *session);
void l2tp_session_free(struct l2tp_session *session);
void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
unsigned char *ptr, unsigned char *optr, u16 hdrflags,
- int length, int (*payload_hook)(struct sk_buff *skb));
-int l2tp_session_queue_purge(struct l2tp_session *session);
+ int length);
int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb);
void l2tp_session_set_header_len(struct l2tp_session *session, int version);
@@ -292,6 +271,36 @@ static inline int l2tp_get_l2specific_len(struct l2tp_session *session)
}
}
+static inline u32 l2tp_tunnel_dst_mtu(const struct l2tp_tunnel *tunnel)
+{
+ struct dst_entry *dst;
+ u32 mtu;
+
+ dst = sk_dst_get(tunnel->sock);
+ if (!dst)
+ return 0;
+
+ mtu = dst_mtu(dst);
+ dst_release(dst);
+
+ return mtu;
+}
+
+#ifdef CONFIG_XFRM
+static inline bool l2tp_tunnel_uses_xfrm(const struct l2tp_tunnel *tunnel)
+{
+ struct sock *sk = tunnel->sock;
+
+ return sk && (rcu_access_pointer(sk->sk_policy[0]) ||
+ rcu_access_pointer(sk->sk_policy[1]));
+}
+#else
+static inline bool l2tp_tunnel_uses_xfrm(const struct l2tp_tunnel *tunnel)
+{
+ return false;
+}
+#endif
+
#define l2tp_printk(ptr, type, func, fmt, ...) \
do { \
if (((ptr)->debug) & (type)) \
diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c
index e87686f7d63c..9821a1458555 100644
--- a/net/l2tp/l2tp_debugfs.c
+++ b/net/l2tp/l2tp_debugfs.c
@@ -177,9 +177,6 @@ static void l2tp_dfs_seq_tunnel_show(struct seq_file *m, void *v)
atomic_long_read(&tunnel->stats.rx_packets),
atomic_long_read(&tunnel->stats.rx_bytes),
atomic_long_read(&tunnel->stats.rx_errors));
-
- if (tunnel->show != NULL)
- tunnel->show(m, tunnel);
}
static void l2tp_dfs_seq_session_show(struct seq_file *m, void *v)
@@ -194,12 +191,9 @@ static void l2tp_dfs_seq_session_show(struct seq_file *m, void *v)
if (session->send_seq || session->recv_seq)
seq_printf(m, " nr %hu, ns %hu\n", session->nr, session->ns);
seq_printf(m, " refcnt %d\n", refcount_read(&session->ref_count));
- seq_printf(m, " config %d/%d/%c/%c/%s/%s %08x %u\n",
- session->mtu, session->mru,
+ seq_printf(m, " config 0/0/%c/%c/-/%s %08x %u\n",
session->recv_seq ? 'R' : '-',
session->send_seq ? 'S' : '-',
- session->data_seq == 1 ? "IPSEQ" :
- session->data_seq == 2 ? "DATASEQ" : "-",
session->lns_mode ? "LNS" : "LAC",
session->debug,
jiffies_to_msecs(session->reorder_timeout));
diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c
index 5c366ecfa1cb..8aadc4f3bb9e 100644
--- a/net/l2tp/l2tp_eth.c
+++ b/net/l2tp/l2tp_eth.c
@@ -199,7 +199,6 @@ static void l2tp_eth_delete(struct l2tp_session *session)
}
}
-#if IS_ENABLED(CONFIG_L2TP_DEBUGFS)
static void l2tp_eth_show(struct seq_file *m, void *arg)
{
struct l2tp_session *session = arg;
@@ -219,29 +218,25 @@ static void l2tp_eth_show(struct seq_file *m, void *arg)
dev_put(dev);
}
-#endif
static void l2tp_eth_adjust_mtu(struct l2tp_tunnel *tunnel,
struct l2tp_session *session,
struct net_device *dev)
{
unsigned int overhead = 0;
- struct dst_entry *dst;
u32 l3_overhead = 0;
+ u32 mtu;
/* if the encap is UDP, account for UDP header size */
if (tunnel->encap == L2TP_ENCAPTYPE_UDP) {
overhead += sizeof(struct udphdr);
dev->needed_headroom += sizeof(struct udphdr);
}
- if (session->mtu != 0) {
- dev->mtu = session->mtu;
- dev->needed_headroom += session->hdr_len;
- return;
- }
+
lock_sock(tunnel->sock);
l3_overhead = kernel_sock_ip_overhead(tunnel->sock);
release_sock(tunnel->sock);
+
if (l3_overhead == 0) {
/* L3 Overhead couldn't be identified, this could be
* because tunnel->sock was NULL or the socket's
@@ -255,18 +250,12 @@ static void l2tp_eth_adjust_mtu(struct l2tp_tunnel *tunnel,
*/
overhead += session->hdr_len + ETH_HLEN + l3_overhead;
- /* If PMTU discovery was enabled, use discovered MTU on L2TP device */
- dst = sk_dst_get(tunnel->sock);
- if (dst) {
- /* dst_mtu will use PMTU if found, else fallback to intf MTU */
- u32 pmtu = dst_mtu(dst);
+ mtu = l2tp_tunnel_dst_mtu(tunnel) - overhead;
+ if (mtu < dev->min_mtu || mtu > dev->max_mtu)
+ dev->mtu = ETH_DATA_LEN - overhead;
+ else
+ dev->mtu = mtu;
- if (pmtu != 0)
- dev->mtu = pmtu;
- dst_release(dst);
- }
- session->mtu = dev->mtu - overhead;
- dev->mtu = session->mtu;
dev->needed_headroom += session->hdr_len;
}
@@ -314,9 +303,8 @@ static int l2tp_eth_create(struct net *net, struct l2tp_tunnel *tunnel,
session->recv_skb = l2tp_eth_dev_recv;
session->session_close = l2tp_eth_delete;
-#if IS_ENABLED(CONFIG_L2TP_DEBUGFS)
- session->show = l2tp_eth_show;
-#endif
+ if (IS_ENABLED(CONFIG_L2TP_DEBUGFS))
+ session->show = l2tp_eth_show;
spriv = l2tp_session_priv(session);
diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
index a9c05b2bc1b0..35f6f86d4dcc 100644
--- a/net/l2tp/l2tp_ip.c
+++ b/net/l2tp/l2tp_ip.c
@@ -144,7 +144,7 @@ static int l2tp_ip_recv(struct sk_buff *skb)
}
/* Ok, this is a data packet. Lookup the session. */
- session = l2tp_session_get(net, NULL, session_id);
+ session = l2tp_session_get(net, session_id);
if (!session)
goto discard;
@@ -165,7 +165,7 @@ static int l2tp_ip_recv(struct sk_buff *skb)
print_hex_dump_bytes("", DUMP_PREFIX_OFFSET, ptr, length);
}
- l2tp_recv_common(session, skb, ptr, optr, 0, skb->len, tunnel->recv_payload_hook);
+ l2tp_recv_common(session, skb, ptr, optr, 0, skb->len);
l2tp_session_dec_refcount(session);
return 0;
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index 957369192ca1..237f1a4a0b0c 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -157,7 +157,7 @@ static int l2tp_ip6_recv(struct sk_buff *skb)
}
/* Ok, this is a data packet. Lookup the session. */
- session = l2tp_session_get(net, NULL, session_id);
+ session = l2tp_session_get(net, session_id);
if (!session)
goto discard;
@@ -178,8 +178,7 @@ static int l2tp_ip6_recv(struct sk_buff *skb)
print_hex_dump_bytes("", DUMP_PREFIX_OFFSET, ptr, length);
}
- l2tp_recv_common(session, skb, ptr, optr, 0, skb->len,
- tunnel->recv_payload_hook);
+ l2tp_recv_common(session, skb, ptr, optr, 0, skb->len);
l2tp_session_dec_refcount(session);
return 0;
@@ -500,7 +499,6 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
struct ip6_flowlabel *flowlabel = NULL;
struct dst_entry *dst = NULL;
struct flowi6 fl6;
- struct sockcm_cookie sockc_unused = {0};
struct ipcm6_cookie ipc6;
int addr_len = msg->msg_namelen;
int transhdrlen = 4; /* zero session-id */
@@ -525,9 +523,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
fl6.flowi6_mark = sk->sk_mark;
fl6.flowi6_uid = sk->sk_uid;
- ipc6.hlimit = -1;
- ipc6.tclass = -1;
- ipc6.dontfrag = -1;
+ ipcm6_init(&ipc6);
if (lsa) {
if (addr_len < SIN6_LEN_RFC2133)
@@ -575,8 +571,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
opt->tot_len = sizeof(struct ipv6_txoptions);
ipc6.opt = opt;
- err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6,
- &sockc_unused);
+ err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6);
if (err < 0) {
fl6_sock_release(flowlabel);
return err;
@@ -641,7 +636,7 @@ back_from_confirm:
err = ip6_append_data(sk, ip_generic_getfrag, msg,
ulen, transhdrlen, &ipc6,
&fl6, (struct rt6_info *)dst,
- msg->msg_flags, &sockc_unused);
+ msg->msg_flags);
if (err)
ip6_flush_pending_frames(sk);
else if (!(msg->msg_flags & MSG_MORE))
diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c
index 5b9900889e31..edbd5d1fbcde 100644
--- a/net/l2tp/l2tp_netlink.c
+++ b/net/l2tp/l2tp_netlink.c
@@ -66,7 +66,7 @@ static struct l2tp_session *l2tp_nl_session_get(struct genl_info *info)
session_id = nla_get_u32(info->attrs[L2TP_ATTR_SESSION_ID]);
tunnel = l2tp_tunnel_get(net, tunnel_id);
if (tunnel) {
- session = l2tp_session_get(net, tunnel, session_id);
+ session = l2tp_tunnel_get_session(tunnel, session_id);
l2tp_tunnel_dec_refcount(tunnel);
}
}
@@ -560,9 +560,6 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf
}
if (tunnel->version > 2) {
- if (info->attrs[L2TP_ATTR_DATA_SEQ])
- cfg.data_seq = nla_get_u8(info->attrs[L2TP_ATTR_DATA_SEQ]);
-
if (info->attrs[L2TP_ATTR_L2SPEC_TYPE]) {
cfg.l2specific_type = nla_get_u8(info->attrs[L2TP_ATTR_L2SPEC_TYPE]);
if (cfg.l2specific_type != L2TP_L2SPECTYPE_DEFAULT &&
@@ -594,9 +591,6 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf
}
if (info->attrs[L2TP_ATTR_IFNAME])
cfg.ifname = nla_data(info->attrs[L2TP_ATTR_IFNAME]);
-
- if (info->attrs[L2TP_ATTR_VLAN_ID])
- cfg.vlan_id = nla_get_u16(info->attrs[L2TP_ATTR_VLAN_ID]);
}
if (info->attrs[L2TP_ATTR_DEBUG])
@@ -614,12 +608,6 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf
if (info->attrs[L2TP_ATTR_RECV_TIMEOUT])
cfg.reorder_timeout = nla_get_msecs(info->attrs[L2TP_ATTR_RECV_TIMEOUT]);
- if (info->attrs[L2TP_ATTR_MTU])
- cfg.mtu = nla_get_u16(info->attrs[L2TP_ATTR_MTU]);
-
- if (info->attrs[L2TP_ATTR_MRU])
- cfg.mru = nla_get_u16(info->attrs[L2TP_ATTR_MRU]);
-
#ifdef CONFIG_MODULES
if (l2tp_nl_cmd_ops[cfg.pw_type] == NULL) {
genl_unlock();
@@ -639,7 +627,7 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf
&cfg);
if (ret >= 0) {
- session = l2tp_session_get(net, tunnel, session_id);
+ session = l2tp_tunnel_get_session(tunnel, session_id);
if (session) {
ret = l2tp_session_notify(&l2tp_nl_family, info, session,
L2TP_CMD_SESSION_CREATE);
@@ -693,9 +681,6 @@ static int l2tp_nl_cmd_session_modify(struct sk_buff *skb, struct genl_info *inf
if (info->attrs[L2TP_ATTR_DEBUG])
session->debug = nla_get_u32(info->attrs[L2TP_ATTR_DEBUG]);
- if (info->attrs[L2TP_ATTR_DATA_SEQ])
- session->data_seq = nla_get_u8(info->attrs[L2TP_ATTR_DATA_SEQ]);
-
if (info->attrs[L2TP_ATTR_RECV_SEQ])
session->recv_seq = nla_get_u8(info->attrs[L2TP_ATTR_RECV_SEQ]);
@@ -710,12 +695,6 @@ static int l2tp_nl_cmd_session_modify(struct sk_buff *skb, struct genl_info *inf
if (info->attrs[L2TP_ATTR_RECV_TIMEOUT])
session->reorder_timeout = nla_get_msecs(info->attrs[L2TP_ATTR_RECV_TIMEOUT]);
- if (info->attrs[L2TP_ATTR_MTU])
- session->mtu = nla_get_u16(info->attrs[L2TP_ATTR_MTU]);
-
- if (info->attrs[L2TP_ATTR_MRU])
- session->mru = nla_get_u16(info->attrs[L2TP_ATTR_MRU]);
-
ret = l2tp_session_notify(&l2tp_nl_family, info,
session, L2TP_CMD_SESSION_MODIFY);
@@ -731,9 +710,6 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int fl
void *hdr;
struct nlattr *nest;
struct l2tp_tunnel *tunnel = session->tunnel;
- struct sock *sk = NULL;
-
- sk = tunnel->sock;
hdr = genlmsg_put(skb, portid, seq, &l2tp_nl_family, flags, cmd);
if (!hdr)
@@ -745,10 +721,7 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int fl
nla_put_u32(skb, L2TP_ATTR_PEER_SESSION_ID,
session->peer_session_id) ||
nla_put_u32(skb, L2TP_ATTR_DEBUG, session->debug) ||
- nla_put_u16(skb, L2TP_ATTR_PW_TYPE, session->pwtype) ||
- nla_put_u16(skb, L2TP_ATTR_MTU, session->mtu) ||
- (session->mru &&
- nla_put_u16(skb, L2TP_ATTR_MRU, session->mru)))
+ nla_put_u16(skb, L2TP_ATTR_PW_TYPE, session->pwtype))
goto nla_put_failure;
if ((session->ifname[0] &&
@@ -762,10 +735,8 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int fl
nla_put_u8(skb, L2TP_ATTR_RECV_SEQ, session->recv_seq) ||
nla_put_u8(skb, L2TP_ATTR_SEND_SEQ, session->send_seq) ||
nla_put_u8(skb, L2TP_ATTR_LNS_MODE, session->lns_mode) ||
-#ifdef CONFIG_XFRM
- (((sk) && (sk->sk_policy[0] || sk->sk_policy[1])) &&
+ (l2tp_tunnel_uses_xfrm(tunnel) &&
nla_put_u8(skb, L2TP_ATTR_USING_IPSEC, 1)) ||
-#endif
(session->reorder_timeout &&
nla_put_msecs(skb, L2TP_ATTR_RECV_TIMEOUT,
session->reorder_timeout, L2TP_ATTR_PAD)))
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index cf6cca260e7b..04d9946dcdba 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -93,10 +93,8 @@
#include <linux/nsproxy.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
-#include <net/dst.h>
#include <net/ip.h>
#include <net/udp.h>
-#include <net/xfrm.h>
#include <net/inet_common.h>
#include <asm/byteorder.h>
@@ -127,8 +125,6 @@ struct pppol2tp_session {
* PPPoX socket */
struct sock *__sk; /* Copy of .sk, for cleanup */
struct rcu_head rcu; /* For asynchronous release */
- int flags; /* accessed by PPPIOCGFLAGS.
- * Unused. */
};
static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb);
@@ -183,25 +179,6 @@ out:
* Receive data handling
*****************************************************************************/
-static int pppol2tp_recv_payload_hook(struct sk_buff *skb)
-{
- /* Skip PPP header, if present. In testing, Microsoft L2TP clients
- * don't send the PPP header (PPP header compression enabled), but
- * other clients can include the header. So we cope with both cases
- * here. The PPP header is always FF03 when using L2TP.
- *
- * Note that skb->data[] isn't dereferenced from a u16 ptr here since
- * the field may be unaligned.
- */
- if (!pskb_may_pull(skb, 2))
- return 1;
-
- if ((skb->data[0] == PPP_ALLSTATIONS) && (skb->data[1] == PPP_UI))
- skb_pull(skb, 2);
-
- return 0;
-}
-
/* Receive message. This is the recvmsg for the PPPoL2TP socket.
*/
static int pppol2tp_recvmsg(struct socket *sock, struct msghdr *msg,
@@ -248,6 +225,17 @@ static void pppol2tp_recv(struct l2tp_session *session, struct sk_buff *skb, int
if (sk == NULL)
goto no_sock;
+ /* If the first two bytes are 0xFF03, consider that it is the PPP's
+ * Address and Control fields and skip them. The L2TP module has always
+ * worked this way, although, in theory, the use of these fields should
+ * be negociated and handled at the PPP layer. These fields are
+ * constant: 0xFF is the All-Stations Address and 0x03 the Unnumbered
+ * Information command with Poll/Final bit set to zero (RFC 1662).
+ */
+ if (pskb_may_pull(skb, 2) && skb->data[0] == PPP_ALLSTATIONS &&
+ skb->data[1] == PPP_UI)
+ skb_pull(skb, 2);
+
if (sk->sk_state & PPPOX_BOUND) {
struct pppox_sock *po;
@@ -424,12 +412,6 @@ static void pppol2tp_put_sk(struct rcu_head *head)
sock_put(ps->__sk);
}
-/* Called by l2tp_core when a session socket is being closed.
- */
-static void pppol2tp_session_close(struct l2tp_session *session)
-{
-}
-
/* Really kill the session socket. (Called from sock_put() if
* refcnt == 0.)
*/
@@ -551,7 +533,6 @@ out:
return error;
}
-#if IS_ENABLED(CONFIG_L2TP_DEBUGFS)
static void pppol2tp_show(struct seq_file *m, void *arg)
{
struct l2tp_session *session = arg;
@@ -565,34 +546,118 @@ static void pppol2tp_show(struct seq_file *m, void *arg)
sock_put(sk);
}
}
-#endif
static void pppol2tp_session_init(struct l2tp_session *session)
{
struct pppol2tp_session *ps;
- struct dst_entry *dst;
session->recv_skb = pppol2tp_recv;
- session->session_close = pppol2tp_session_close;
-#if IS_ENABLED(CONFIG_L2TP_DEBUGFS)
- session->show = pppol2tp_show;
-#endif
+ if (IS_ENABLED(CONFIG_L2TP_DEBUGFS))
+ session->show = pppol2tp_show;
ps = l2tp_session_priv(session);
mutex_init(&ps->sk_lock);
ps->owner = current->pid;
+}
+
+struct l2tp_connect_info {
+ u8 version;
+ int fd;
+ u32 tunnel_id;
+ u32 peer_tunnel_id;
+ u32 session_id;
+ u32 peer_session_id;
+};
+
+static int pppol2tp_sockaddr_get_info(const void *sa, int sa_len,
+ struct l2tp_connect_info *info)
+{
+ switch (sa_len) {
+ case sizeof(struct sockaddr_pppol2tp):
+ {
+ const struct sockaddr_pppol2tp *sa_v2in4 = sa;
- /* If PMTU discovery was enabled, use the MTU that was discovered */
- dst = sk_dst_get(session->tunnel->sock);
- if (dst) {
- u32 pmtu = dst_mtu(dst);
+ if (sa_v2in4->sa_protocol != PX_PROTO_OL2TP)
+ return -EINVAL;
- if (pmtu) {
- session->mtu = pmtu - PPPOL2TP_HEADER_OVERHEAD;
- session->mru = pmtu - PPPOL2TP_HEADER_OVERHEAD;
- }
- dst_release(dst);
+ info->version = 2;
+ info->fd = sa_v2in4->pppol2tp.fd;
+ info->tunnel_id = sa_v2in4->pppol2tp.s_tunnel;
+ info->peer_tunnel_id = sa_v2in4->pppol2tp.d_tunnel;
+ info->session_id = sa_v2in4->pppol2tp.s_session;
+ info->peer_session_id = sa_v2in4->pppol2tp.d_session;
+
+ break;
}
+ case sizeof(struct sockaddr_pppol2tpv3):
+ {
+ const struct sockaddr_pppol2tpv3 *sa_v3in4 = sa;
+
+ if (sa_v3in4->sa_protocol != PX_PROTO_OL2TP)
+ return -EINVAL;
+
+ info->version = 3;
+ info->fd = sa_v3in4->pppol2tp.fd;
+ info->tunnel_id = sa_v3in4->pppol2tp.s_tunnel;
+ info->peer_tunnel_id = sa_v3in4->pppol2tp.d_tunnel;
+ info->session_id = sa_v3in4->pppol2tp.s_session;
+ info->peer_session_id = sa_v3in4->pppol2tp.d_session;
+
+ break;
+ }
+ case sizeof(struct sockaddr_pppol2tpin6):
+ {
+ const struct sockaddr_pppol2tpin6 *sa_v2in6 = sa;
+
+ if (sa_v2in6->sa_protocol != PX_PROTO_OL2TP)
+ return -EINVAL;
+
+ info->version = 2;
+ info->fd = sa_v2in6->pppol2tp.fd;
+ info->tunnel_id = sa_v2in6->pppol2tp.s_tunnel;
+ info->peer_tunnel_id = sa_v2in6->pppol2tp.d_tunnel;
+ info->session_id = sa_v2in6->pppol2tp.s_session;
+ info->peer_session_id = sa_v2in6->pppol2tp.d_session;
+
+ break;
+ }
+ case sizeof(struct sockaddr_pppol2tpv3in6):
+ {
+ const struct sockaddr_pppol2tpv3in6 *sa_v3in6 = sa;
+
+ if (sa_v3in6->sa_protocol != PX_PROTO_OL2TP)
+ return -EINVAL;
+
+ info->version = 3;
+ info->fd = sa_v3in6->pppol2tp.fd;
+ info->tunnel_id = sa_v3in6->pppol2tp.s_tunnel;
+ info->peer_tunnel_id = sa_v3in6->pppol2tp.d_tunnel;
+ info->session_id = sa_v3in6->pppol2tp.s_session;
+ info->peer_session_id = sa_v3in6->pppol2tp.d_session;
+
+ break;
+ }
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/* Rough estimation of the maximum payload size a tunnel can transmit without
+ * fragmenting at the lower IP layer. Assumes L2TPv2 with sequence
+ * numbers and no IP option. Not quite accurate, but the result is mostly
+ * unused anyway.
+ */
+static int pppol2tp_tunnel_mtu(const struct l2tp_tunnel *tunnel)
+{
+ int mtu;
+
+ mtu = l2tp_tunnel_dst_mtu(tunnel);
+ if (mtu <= PPPOL2TP_HEADER_OVERHEAD)
+ return 1500 - PPPOL2TP_HEADER_OVERHEAD;
+
+ return mtu - PPPOL2TP_HEADER_OVERHEAD;
}
/* connect() handler. Attach a PPPoX socket to a tunnel UDP socket
@@ -601,34 +666,23 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
int sockaddr_len, int flags)
{
struct sock *sk = sock->sk;
- struct sockaddr_pppol2tp *sp = (struct sockaddr_pppol2tp *) uservaddr;
struct pppox_sock *po = pppox_sk(sk);
struct l2tp_session *session = NULL;
+ struct l2tp_connect_info info;
struct l2tp_tunnel *tunnel;
struct pppol2tp_session *ps;
struct l2tp_session_cfg cfg = { 0, };
- int error = 0;
- u32 tunnel_id, peer_tunnel_id;
- u32 session_id, peer_session_id;
bool drop_refcnt = false;
bool drop_tunnel = false;
bool new_session = false;
bool new_tunnel = false;
- int ver = 2;
- int fd;
-
- lock_sock(sk);
-
- error = -EINVAL;
+ int error;
- if (sockaddr_len != sizeof(struct sockaddr_pppol2tp) &&
- sockaddr_len != sizeof(struct sockaddr_pppol2tpv3) &&
- sockaddr_len != sizeof(struct sockaddr_pppol2tpin6) &&
- sockaddr_len != sizeof(struct sockaddr_pppol2tpv3in6))
- goto end;
+ error = pppol2tp_sockaddr_get_info(uservaddr, sockaddr_len, &info);
+ if (error < 0)
+ return error;
- if (sp->sa_protocol != PX_PROTO_OL2TP)
- goto end;
+ lock_sock(sk);
/* Check for already bound sockets */
error = -EBUSY;
@@ -640,56 +694,12 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
if (sk->sk_user_data)
goto end; /* socket is already attached */
- /* Get params from socket address. Handle L2TPv2 and L2TPv3.
- * This is nasty because there are different sockaddr_pppol2tp
- * structs for L2TPv2, L2TPv3, over IPv4 and IPv6. We use
- * the sockaddr size to determine which structure the caller
- * is using.
- */
- peer_tunnel_id = 0;
- if (sockaddr_len == sizeof(struct sockaddr_pppol2tp)) {
- fd = sp->pppol2tp.fd;
- tunnel_id = sp->pppol2tp.s_tunnel;
- peer_tunnel_id = sp->pppol2tp.d_tunnel;
- session_id = sp->pppol2tp.s_session;
- peer_session_id = sp->pppol2tp.d_session;
- } else if (sockaddr_len == sizeof(struct sockaddr_pppol2tpv3)) {
- struct sockaddr_pppol2tpv3 *sp3 =
- (struct sockaddr_pppol2tpv3 *) sp;
- ver = 3;
- fd = sp3->pppol2tp.fd;
- tunnel_id = sp3->pppol2tp.s_tunnel;
- peer_tunnel_id = sp3->pppol2tp.d_tunnel;
- session_id = sp3->pppol2tp.s_session;
- peer_session_id = sp3->pppol2tp.d_session;
- } else if (sockaddr_len == sizeof(struct sockaddr_pppol2tpin6)) {
- struct sockaddr_pppol2tpin6 *sp6 =
- (struct sockaddr_pppol2tpin6 *) sp;
- fd = sp6->pppol2tp.fd;
- tunnel_id = sp6->pppol2tp.s_tunnel;
- peer_tunnel_id = sp6->pppol2tp.d_tunnel;
- session_id = sp6->pppol2tp.s_session;
- peer_session_id = sp6->pppol2tp.d_session;
- } else if (sockaddr_len == sizeof(struct sockaddr_pppol2tpv3in6)) {
- struct sockaddr_pppol2tpv3in6 *sp6 =
- (struct sockaddr_pppol2tpv3in6 *) sp;
- ver = 3;
- fd = sp6->pppol2tp.fd;
- tunnel_id = sp6->pppol2tp.s_tunnel;
- peer_tunnel_id = sp6->pppol2tp.d_tunnel;
- session_id = sp6->pppol2tp.s_session;
- peer_session_id = sp6->pppol2tp.d_session;
- } else {
- error = -EINVAL;
- goto end; /* bad socket address */
- }
-
/* Don't bind if tunnel_id is 0 */
error = -EINVAL;
- if (tunnel_id == 0)
+ if (!info.tunnel_id)
goto end;
- tunnel = l2tp_tunnel_get(sock_net(sk), tunnel_id);
+ tunnel = l2tp_tunnel_get(sock_net(sk), info.tunnel_id);
if (tunnel)
drop_tunnel = true;
@@ -697,7 +707,7 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
* peer_session_id is 0. Otherwise look up tunnel using supplied
* tunnel id.
*/
- if ((session_id == 0) && (peer_session_id == 0)) {
+ if (!info.session_id && !info.peer_session_id) {
if (tunnel == NULL) {
struct l2tp_tunnel_cfg tcfg = {
.encap = L2TP_ENCAPTYPE_UDP,
@@ -707,12 +717,16 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
/* Prevent l2tp_tunnel_register() from trying to set up
* a kernel socket.
*/
- if (fd < 0) {
+ if (info.fd < 0) {
error = -EBADF;
goto end;
}
- error = l2tp_tunnel_create(sock_net(sk), fd, ver, tunnel_id, peer_tunnel_id, &tcfg, &tunnel);
+ error = l2tp_tunnel_create(sock_net(sk), info.fd,
+ info.version,
+ info.tunnel_id,
+ info.peer_tunnel_id, &tcfg,
+ &tunnel);
if (error < 0)
goto end;
@@ -737,13 +751,10 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
goto end;
}
- if (tunnel->recv_payload_hook == NULL)
- tunnel->recv_payload_hook = pppol2tp_recv_payload_hook;
-
if (tunnel->peer_tunnel_id == 0)
- tunnel->peer_tunnel_id = peer_tunnel_id;
+ tunnel->peer_tunnel_id = info.peer_tunnel_id;
- session = l2tp_session_get(sock_net(sk), tunnel, session_id);
+ session = l2tp_tunnel_get_session(tunnel, info.session_id);
if (session) {
drop_refcnt = true;
@@ -766,14 +777,11 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
goto end;
}
} else {
- /* Default MTU must allow space for UDP/L2TP/PPP headers */
- cfg.mtu = 1500 - PPPOL2TP_HEADER_OVERHEAD;
- cfg.mru = cfg.mtu;
cfg.pw_type = L2TP_PWTYPE_PPP;
session = l2tp_session_create(sizeof(struct pppol2tp_session),
- tunnel, session_id,
- peer_session_id, &cfg);
+ tunnel, info.session_id,
+ info.peer_session_id, &cfg);
if (IS_ERR(session)) {
error = PTR_ERR(session);
goto end;
@@ -813,7 +821,7 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
po->chan.private = sk;
po->chan.ops = &pppol2tp_chan_ops;
- po->chan.mtu = session->mtu;
+ po->chan.mtu = pppol2tp_tunnel_mtu(tunnel);
error = ppp_register_net_channel(sock_net(sk), &po->chan);
if (error) {
@@ -869,12 +877,6 @@ static int pppol2tp_session_create(struct net *net, struct l2tp_tunnel *tunnel,
goto err;
}
- /* Default MTU values. */
- if (cfg->mtu == 0)
- cfg->mtu = 1500 - PPPOL2TP_HEADER_OVERHEAD;
- if (cfg->mru == 0)
- cfg->mru = cfg->mtu;
-
/* Allocate and initialize a new session context. */
session = l2tp_session_create(sizeof(struct pppol2tp_session),
tunnel, session_id,
@@ -1021,8 +1023,10 @@ end:
****************************************************************************/
static void pppol2tp_copy_stats(struct pppol2tp_ioc_stats *dest,
- struct l2tp_stats *stats)
+ const struct l2tp_stats *stats)
{
+ memset(dest, 0, sizeof(*dest));
+
dest->tx_packets = atomic_long_read(&stats->tx_packets);
dest->tx_bytes = atomic_long_read(&stats->tx_bytes);
dest->tx_errors = atomic_long_read(&stats->tx_errors);
@@ -1033,256 +1037,107 @@ static void pppol2tp_copy_stats(struct pppol2tp_ioc_stats *dest,
dest->rx_errors = atomic_long_read(&stats->rx_errors);
}
-/* Session ioctl helper.
- */
-static int pppol2tp_session_ioctl(struct l2tp_session *session,
- unsigned int cmd, unsigned long arg)
+static int pppol2tp_tunnel_copy_stats(struct pppol2tp_ioc_stats *stats,
+ struct l2tp_tunnel *tunnel)
{
- struct ifreq ifr;
- int err = 0;
- struct sock *sk;
- int val = (int) arg;
- struct pppol2tp_session *ps = l2tp_session_priv(session);
- struct l2tp_tunnel *tunnel = session->tunnel;
- struct pppol2tp_ioc_stats stats;
+ struct l2tp_session *session;
- l2tp_dbg(session, L2TP_MSG_CONTROL,
- "%s: pppol2tp_session_ioctl(cmd=%#x, arg=%#lx)\n",
- session->name, cmd, arg);
+ if (!stats->session_id) {
+ pppol2tp_copy_stats(stats, &tunnel->stats);
+ return 0;
+ }
- sk = pppol2tp_session_get_sock(session);
- if (!sk)
+ /* If session_id is set, search the corresponding session in the
+ * context of this tunnel and record the session's statistics.
+ */
+ session = l2tp_tunnel_get_session(tunnel, stats->session_id);
+ if (!session)
return -EBADR;
- switch (cmd) {
- case SIOCGIFMTU:
- err = -ENXIO;
- if (!(sk->sk_state & PPPOX_CONNECTED))
- break;
-
- err = -EFAULT;
- if (copy_from_user(&ifr, (void __user *) arg, sizeof(struct ifreq)))
- break;
- ifr.ifr_mtu = session->mtu;
- if (copy_to_user((void __user *) arg, &ifr, sizeof(struct ifreq)))
- break;
-
- l2tp_info(session, L2TP_MSG_CONTROL, "%s: get mtu=%d\n",
- session->name, session->mtu);
- err = 0;
- break;
-
- case SIOCSIFMTU:
- err = -ENXIO;
- if (!(sk->sk_state & PPPOX_CONNECTED))
- break;
+ if (session->pwtype != L2TP_PWTYPE_PPP) {
+ l2tp_session_dec_refcount(session);
+ return -EBADR;
+ }
- err = -EFAULT;
- if (copy_from_user(&ifr, (void __user *) arg, sizeof(struct ifreq)))
- break;
+ pppol2tp_copy_stats(stats, &session->stats);
+ l2tp_session_dec_refcount(session);
- session->mtu = ifr.ifr_mtu;
+ return 0;
+}
- l2tp_info(session, L2TP_MSG_CONTROL, "%s: set mtu=%d\n",
- session->name, session->mtu);
- err = 0;
- break;
+static int pppol2tp_ioctl(struct socket *sock, unsigned int cmd,
+ unsigned long arg)
+{
+ struct pppol2tp_ioc_stats stats;
+ struct l2tp_session *session;
+ int val;
+ switch (cmd) {
case PPPIOCGMRU:
- err = -ENXIO;
- if (!(sk->sk_state & PPPOX_CONNECTED))
- break;
+ case PPPIOCGFLAGS:
+ session = sock->sk->sk_user_data;
+ if (!session)
+ return -ENOTCONN;
- err = -EFAULT;
- if (put_user(session->mru, (int __user *) arg))
- break;
+ /* Not defined for tunnels */
+ if (!session->session_id && !session->peer_session_id)
+ return -ENOSYS;
- l2tp_info(session, L2TP_MSG_CONTROL, "%s: get mru=%d\n",
- session->name, session->mru);
- err = 0;
+ if (put_user(0, (int __user *)arg))
+ return -EFAULT;
break;
case PPPIOCSMRU:
- err = -ENXIO;
- if (!(sk->sk_state & PPPOX_CONNECTED))
- break;
-
- err = -EFAULT;
- if (get_user(val, (int __user *) arg))
- break;
-
- session->mru = val;
- l2tp_info(session, L2TP_MSG_CONTROL, "%s: set mru=%d\n",
- session->name, session->mru);
- err = 0;
- break;
-
- case PPPIOCGFLAGS:
- err = -EFAULT;
- if (put_user(ps->flags, (int __user *) arg))
- break;
-
- l2tp_info(session, L2TP_MSG_CONTROL, "%s: get flags=%d\n",
- session->name, ps->flags);
- err = 0;
- break;
-
case PPPIOCSFLAGS:
- err = -EFAULT;
- if (get_user(val, (int __user *) arg))
- break;
- ps->flags = val;
- l2tp_info(session, L2TP_MSG_CONTROL, "%s: set flags=%d\n",
- session->name, ps->flags);
- err = 0;
- break;
-
- case PPPIOCGL2TPSTATS:
- err = -ENXIO;
- if (!(sk->sk_state & PPPOX_CONNECTED))
- break;
+ session = sock->sk->sk_user_data;
+ if (!session)
+ return -ENOTCONN;
- memset(&stats, 0, sizeof(stats));
- stats.tunnel_id = tunnel->tunnel_id;
- stats.session_id = session->session_id;
- pppol2tp_copy_stats(&stats, &session->stats);
- if (copy_to_user((void __user *) arg, &stats,
- sizeof(stats)))
- break;
- l2tp_info(session, L2TP_MSG_CONTROL, "%s: get L2TP stats\n",
- session->name);
- err = 0;
- break;
+ /* Not defined for tunnels */
+ if (!session->session_id && !session->peer_session_id)
+ return -ENOSYS;
- default:
- err = -ENOSYS;
+ if (get_user(val, (int __user *)arg))
+ return -EFAULT;
break;
- }
- sock_put(sk);
-
- return err;
-}
-
-/* Tunnel ioctl helper.
- *
- * Note the special handling for PPPIOCGL2TPSTATS below. If the ioctl data
- * specifies a session_id, the session ioctl handler is called. This allows an
- * application to retrieve session stats via a tunnel socket.
- */
-static int pppol2tp_tunnel_ioctl(struct l2tp_tunnel *tunnel,
- unsigned int cmd, unsigned long arg)
-{
- int err = 0;
- struct sock *sk;
- struct pppol2tp_ioc_stats stats;
-
- l2tp_dbg(tunnel, L2TP_MSG_CONTROL,
- "%s: pppol2tp_tunnel_ioctl(cmd=%#x, arg=%#lx)\n",
- tunnel->name, cmd, arg);
-
- sk = tunnel->sock;
- sock_hold(sk);
-
- switch (cmd) {
case PPPIOCGL2TPSTATS:
- err = -ENXIO;
- if (!(sk->sk_state & PPPOX_CONNECTED))
- break;
-
- if (copy_from_user(&stats, (void __user *) arg,
- sizeof(stats))) {
- err = -EFAULT;
- break;
+ session = sock->sk->sk_user_data;
+ if (!session)
+ return -ENOTCONN;
+
+ /* Session 0 represents the parent tunnel */
+ if (!session->session_id && !session->peer_session_id) {
+ u32 session_id;
+ int err;
+
+ if (copy_from_user(&stats, (void __user *)arg,
+ sizeof(stats)))
+ return -EFAULT;
+
+ session_id = stats.session_id;
+ err = pppol2tp_tunnel_copy_stats(&stats,
+ session->tunnel);
+ if (err < 0)
+ return err;
+
+ stats.session_id = session_id;
+ } else {
+ pppol2tp_copy_stats(&stats, &session->stats);
+ stats.session_id = session->session_id;
}
- if (stats.session_id != 0) {
- /* resend to session ioctl handler */
- struct l2tp_session *session =
- l2tp_session_get(sock_net(sk), tunnel,
- stats.session_id);
-
- if (!session) {
- err = -EBADR;
- break;
- }
- if (session->pwtype != L2TP_PWTYPE_PPP) {
- l2tp_session_dec_refcount(session);
- err = -EBADR;
- break;
- }
+ stats.tunnel_id = session->tunnel->tunnel_id;
+ stats.using_ipsec = l2tp_tunnel_uses_xfrm(session->tunnel);
- err = pppol2tp_session_ioctl(session, cmd, arg);
- l2tp_session_dec_refcount(session);
- break;
- }
-#ifdef CONFIG_XFRM
- stats.using_ipsec = (sk->sk_policy[0] || sk->sk_policy[1]) ? 1 : 0;
-#endif
- pppol2tp_copy_stats(&stats, &tunnel->stats);
- if (copy_to_user((void __user *) arg, &stats, sizeof(stats))) {
- err = -EFAULT;
- break;
- }
- l2tp_info(tunnel, L2TP_MSG_CONTROL, "%s: get L2TP stats\n",
- tunnel->name);
- err = 0;
+ if (copy_to_user((void __user *)arg, &stats, sizeof(stats)))
+ return -EFAULT;
break;
default:
- err = -ENOSYS;
- break;
+ return -ENOIOCTLCMD;
}
- sock_put(sk);
-
- return err;
-}
-
-/* Main ioctl() handler.
- * Dispatch to tunnel or session helpers depending on the socket.
- */
-static int pppol2tp_ioctl(struct socket *sock, unsigned int cmd,
- unsigned long arg)
-{
- struct sock *sk = sock->sk;
- struct l2tp_session *session;
- struct l2tp_tunnel *tunnel;
- int err;
-
- if (!sk)
- return 0;
-
- err = -EBADF;
- if (sock_flag(sk, SOCK_DEAD) != 0)
- goto end;
-
- err = -ENOTCONN;
- if ((sk->sk_user_data == NULL) ||
- (!(sk->sk_state & (PPPOX_CONNECTED | PPPOX_BOUND))))
- goto end;
-
- /* Get session context from the socket */
- err = -EBADF;
- session = pppol2tp_sock_to_session(sk);
- if (session == NULL)
- goto end;
-
- /* Special case: if session's session_id is zero, treat ioctl as a
- * tunnel ioctl
- */
- if ((session->session_id == 0) &&
- (session->peer_session_id == 0)) {
- tunnel = session->tunnel;
- err = pppol2tp_tunnel_ioctl(tunnel, cmd, arg);
- goto end_put_sess;
- }
-
- err = pppol2tp_session_ioctl(session, cmd, arg);
-
-end_put_sess:
- sock_put(sk);
-end:
- return err;
+ return 0;
}
/*****************************************************************************
@@ -1722,8 +1577,7 @@ static void pppol2tp_seq_session_show(struct seq_file *m, void *v)
tunnel->peer_tunnel_id,
session->peer_session_id,
state, user_data_ok);
- seq_printf(m, " %d/%d/%c/%c/%s %08x %u\n",
- session->mtu, session->mru,
+ seq_printf(m, " 0/0/%c/%c/%s %08x %u\n",
session->recv_seq ? 'R' : '-',
session->send_seq ? 'S' : '-',
session->lns_mode ? "LNS" : "LAC",
diff --git a/net/llc/Kconfig b/net/llc/Kconfig
index b91c65108162..176a6c1521a5 100644
--- a/net/llc/Kconfig
+++ b/net/llc/Kconfig
@@ -6,5 +6,5 @@ config LLC2
tristate "ANSI/IEEE 802.2 LLC type 2 Support"
select LLC
help
- This is a Logical Link Layer type 2, connection oriented support.
+ This is a Logical Link Layer type 2, connection oriented support.
Select this if you want to have support for PF_LLC sockets.
diff --git a/net/llc/Makefile b/net/llc/Makefile
index 4e260cff3c5d..5e0ef436daae 100644
--- a/net/llc/Makefile
+++ b/net/llc/Makefile
@@ -4,7 +4,7 @@
# Copyright (c) 1997 by Procom Technology,Inc.
# 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br>
#
-# This program can be redistributed or modified under the terms of the
+# This program can be redistributed or modified under the terms of the
# GNU General Public License as published by the Free Software Foundation.
# This program is distributed without any warranty or implied warranty
# of merchantability or fitness for a particular purpose.
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 1beeea9549fa..b99e73a7e7e0 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -730,7 +730,6 @@ static int llc_ui_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
struct sk_buff *skb = NULL;
struct sock *sk = sock->sk;
struct llc_sock *llc = llc_sk(sk);
- unsigned long cpu_flags;
size_t copied = 0;
u32 peek_seq = 0;
u32 *seq, skb_len;
@@ -855,9 +854,8 @@ static int llc_ui_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
goto copy_uaddr;
if (!(flags & MSG_PEEK)) {
- spin_lock_irqsave(&sk->sk_receive_queue.lock, cpu_flags);
- sk_eat_skb(sk, skb);
- spin_unlock_irqrestore(&sk->sk_receive_queue.lock, cpu_flags);
+ skb_unlink(skb, &sk->sk_receive_queue);
+ kfree_skb(skb);
*seq = 0;
}
@@ -878,9 +876,8 @@ copy_uaddr:
llc_cmsg_rcv(msg, skb);
if (!(flags & MSG_PEEK)) {
- spin_lock_irqsave(&sk->sk_receive_queue.lock, cpu_flags);
- sk_eat_skb(sk, skb);
- spin_unlock_irqrestore(&sk->sk_receive_queue.lock, cpu_flags);
+ skb_unlink(skb, &sk->sk_receive_queue);
+ kfree_skb(skb);
*seq = 0;
}
diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c
index c0ac522b48a1..4ff89cb7c86f 100644
--- a/net/llc/llc_conn.c
+++ b/net/llc/llc_conn.c
@@ -734,6 +734,7 @@ void llc_sap_add_socket(struct llc_sap *sap, struct sock *sk)
llc_sk(sk)->sap = sap;
spin_lock_bh(&sap->sk_lock);
+ sock_set_flag(sk, SOCK_RCU_FREE);
sap->sk_count++;
sk_nulls_add_node_rcu(sk, laddr_hb);
hlist_add_head(&llc->dev_hash_node, dev_hb);
diff --git a/net/llc/llc_core.c b/net/llc/llc_core.c
index 260b3dc1b4a2..64d4bef04e73 100644
--- a/net/llc/llc_core.c
+++ b/net/llc/llc_core.c
@@ -127,9 +127,7 @@ void llc_sap_close(struct llc_sap *sap)
list_del_rcu(&sap->node);
spin_unlock_bh(&llc_sap_list_lock);
- synchronize_rcu();
-
- kfree(sap);
+ kfree_rcu(sap, rcu);
}
static struct packet_type llc_packet_type __read_mostly = {
diff --git a/net/llc/llc_if.c b/net/llc/llc_if.c
index 6daf391b3e84..8db03c2d5440 100644
--- a/net/llc/llc_if.c
+++ b/net/llc/llc_if.c
@@ -151,4 +151,3 @@ out:
sock_put(sk);
return rc;
}
-
diff --git a/net/mac80211/Kconfig b/net/mac80211/Kconfig
index 76e30f4797fb..f869e35d0974 100644
--- a/net/mac80211/Kconfig
+++ b/net/mac80211/Kconfig
@@ -27,20 +27,6 @@ config MAC80211_RC_MINSTREL
---help---
This option enables the 'minstrel' TX rate control algorithm
-config MAC80211_RC_MINSTREL_HT
- bool "Minstrel 802.11n support" if EXPERT
- depends on MAC80211_RC_MINSTREL
- default y
- ---help---
- This option enables the 'minstrel_ht' TX rate control algorithm
-
-config MAC80211_RC_MINSTREL_VHT
- bool "Minstrel 802.11ac support" if EXPERT
- depends on MAC80211_RC_MINSTREL_HT
- default n
- ---help---
- This option enables VHT in the 'minstrel_ht' TX rate control algorithm
-
choice
prompt "Default rate control algorithm"
depends on MAC80211_HAS_RC
@@ -62,8 +48,7 @@ endchoice
config MAC80211_RC_DEFAULT
string
- default "minstrel_ht" if MAC80211_RC_DEFAULT_MINSTREL && MAC80211_RC_MINSTREL_HT
- default "minstrel" if MAC80211_RC_DEFAULT_MINSTREL
+ default "minstrel_ht" if MAC80211_RC_DEFAULT_MINSTREL
default ""
endif
diff --git a/net/mac80211/Makefile b/net/mac80211/Makefile
index e3589ade62e0..4f03ebe732fa 100644
--- a/net/mac80211/Makefile
+++ b/net/mac80211/Makefile
@@ -12,6 +12,7 @@ mac80211-y := \
scan.o offchannel.o \
ht.o agg-tx.o agg-rx.o \
vht.o \
+ he.o \
ibss.o \
iface.o \
rate.o \
@@ -52,13 +53,14 @@ mac80211-$(CONFIG_PM) += pm.o
CFLAGS_trace.o := -I$(src)
-rc80211_minstrel-y := rc80211_minstrel.o
-rc80211_minstrel-$(CONFIG_MAC80211_DEBUGFS) += rc80211_minstrel_debugfs.o
+rc80211_minstrel-y := \
+ rc80211_minstrel.o \
+ rc80211_minstrel_ht.o
-rc80211_minstrel_ht-y := rc80211_minstrel_ht.o
-rc80211_minstrel_ht-$(CONFIG_MAC80211_DEBUGFS) += rc80211_minstrel_ht_debugfs.o
+rc80211_minstrel-$(CONFIG_MAC80211_DEBUGFS) += \
+ rc80211_minstrel_debugfs.o \
+ rc80211_minstrel_ht_debugfs.o
mac80211-$(CONFIG_MAC80211_RC_MINSTREL) += $(rc80211_minstrel-y)
-mac80211-$(CONFIG_MAC80211_RC_MINSTREL_HT) += $(rc80211_minstrel_ht-y)
ccflags-y += -DDEBUG
diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c
index e83c19d4c292..6a4f154c99f6 100644
--- a/net/mac80211/agg-rx.c
+++ b/net/mac80211/agg-rx.c
@@ -245,6 +245,7 @@ void ___ieee80211_start_rx_ba_session(struct sta_info *sta,
};
int i, ret = -EOPNOTSUPP;
u16 status = WLAN_STATUS_REQUEST_DECLINED;
+ u16 max_buf_size;
if (tid >= IEEE80211_FIRST_TSPEC_TSID) {
ht_dbg(sta->sdata,
@@ -268,13 +269,18 @@ void ___ieee80211_start_rx_ba_session(struct sta_info *sta,
goto end;
}
+ if (sta->sta.he_cap.has_he)
+ max_buf_size = IEEE80211_MAX_AMPDU_BUF;
+ else
+ max_buf_size = IEEE80211_MAX_AMPDU_BUF_HT;
+
/* sanity check for incoming parameters:
* check if configuration can support the BA policy
* and if buffer size does not exceeds max value */
/* XXX: check own ht delayed BA capability?? */
if (((ba_policy != 1) &&
(!(sta->sta.ht_cap.cap & IEEE80211_HT_CAP_DELAY_BA))) ||
- (buf_size > IEEE80211_MAX_AMPDU_BUF)) {
+ (buf_size > max_buf_size)) {
status = WLAN_STATUS_INVALID_QOS_PARAM;
ht_dbg_ratelimited(sta->sdata,
"AddBA Req with bad params from %pM on tid %u. policy %d, buffer size %d\n",
@@ -283,7 +289,7 @@ void ___ieee80211_start_rx_ba_session(struct sta_info *sta,
}
/* determine default buffer size */
if (buf_size == 0)
- buf_size = IEEE80211_MAX_AMPDU_BUF;
+ buf_size = max_buf_size;
/* make sure the size doesn't exceed the maximum supported by the hw */
if (buf_size > sta->sta.max_rx_aggregation_subframes)
diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c
index ac4295296514..69e831bc317b 100644
--- a/net/mac80211/agg-tx.c
+++ b/net/mac80211/agg-tx.c
@@ -463,6 +463,7 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid)
.timeout = 0,
};
int ret;
+ u16 buf_size;
tid_tx = rcu_dereference_protected_tid_tx(sta, tid);
@@ -511,11 +512,22 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid)
sta->ampdu_mlme.addba_req_num[tid]++;
spin_unlock_bh(&sta->lock);
+ if (sta->sta.he_cap.has_he) {
+ buf_size = local->hw.max_tx_aggregation_subframes;
+ } else {
+ /*
+ * We really should use what the driver told us it will
+ * transmit as the maximum, but certain APs (e.g. the
+ * LinkSys WRT120N with FW v1.0.07 build 002 Jun 18 2012)
+ * will crash when we use a lower number.
+ */
+ buf_size = IEEE80211_MAX_AMPDU_BUF_HT;
+ }
+
/* send AddBA request */
ieee80211_send_addba_request(sdata, sta->sta.addr, tid,
tid_tx->dialog_token, params.ssn,
- IEEE80211_MAX_AMPDU_BUF,
- tid_tx->timeout);
+ buf_size, tid_tx->timeout);
}
/*
@@ -905,8 +917,7 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local,
{
struct tid_ampdu_tx *tid_tx;
struct ieee80211_txq *txq;
- u16 capab, tid;
- u8 buf_size;
+ u16 capab, tid, buf_size;
bool amsdu;
capab = le16_to_cpu(mgmt->u.action.u.addba_resp.capab);
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index bdf6fa78d0d2..51622333d460 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -158,12 +158,10 @@ static int ieee80211_change_iface(struct wiphy *wiphy,
if (ret)
return ret;
- if (type == NL80211_IFTYPE_AP_VLAN &&
- params && params->use_4addr == 0) {
+ if (type == NL80211_IFTYPE_AP_VLAN && params->use_4addr == 0) {
RCU_INIT_POINTER(sdata->u.vlan.sta, NULL);
ieee80211_check_fast_rx_iface(sdata);
- } else if (type == NL80211_IFTYPE_STATION &&
- params && params->use_4addr >= 0) {
+ } else if (type == NL80211_IFTYPE_STATION && params->use_4addr >= 0) {
sdata->u.mgd.use_4addr = params->use_4addr;
}
@@ -427,7 +425,7 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev,
case NL80211_IFTYPE_AP:
case NL80211_IFTYPE_AP_VLAN:
/* Keys without a station are used for TX only */
- if (key->sta && test_sta_flag(key->sta, WLAN_STA_MFP))
+ if (sta && test_sta_flag(sta, WLAN_STA_MFP))
key->conf.flags |= IEEE80211_KEY_FLAG_RX_MGMT;
break;
case NL80211_IFTYPE_ADHOC:
@@ -495,7 +493,7 @@ static int ieee80211_del_key(struct wiphy *wiphy, struct net_device *dev,
goto out_unlock;
}
- ieee80211_key_free(key, true);
+ ieee80211_key_free(key, sdata->vif.type == NL80211_IFTYPE_STATION);
ret = 0;
out_unlock:
@@ -792,6 +790,48 @@ static int ieee80211_set_probe_resp(struct ieee80211_sub_if_data *sdata,
return 0;
}
+static int ieee80211_set_ftm_responder_params(
+ struct ieee80211_sub_if_data *sdata,
+ const u8 *lci, size_t lci_len,
+ const u8 *civicloc, size_t civicloc_len)
+{
+ struct ieee80211_ftm_responder_params *new, *old;
+ struct ieee80211_bss_conf *bss_conf;
+ u8 *pos;
+ int len;
+
+ if ((!lci || !lci_len) && (!civicloc || !civicloc_len))
+ return 1;
+
+ bss_conf = &sdata->vif.bss_conf;
+ old = bss_conf->ftmr_params;
+ len = lci_len + civicloc_len;
+
+ new = kzalloc(sizeof(*new) + len, GFP_KERNEL);
+ if (!new)
+ return -ENOMEM;
+
+ pos = (u8 *)(new + 1);
+ if (lci_len) {
+ new->lci_len = lci_len;
+ new->lci = pos;
+ memcpy(pos, lci, lci_len);
+ pos += lci_len;
+ }
+
+ if (civicloc_len) {
+ new->civicloc_len = civicloc_len;
+ new->civicloc = pos;
+ memcpy(pos, civicloc, civicloc_len);
+ pos += civicloc_len;
+ }
+
+ bss_conf->ftmr_params = new;
+ kfree(old);
+
+ return 0;
+}
+
static int ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata,
struct cfg80211_beacon_data *params,
const struct ieee80211_csa_settings *csa)
@@ -865,6 +905,20 @@ static int ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata,
if (err == 0)
changed |= BSS_CHANGED_AP_PROBE_RESP;
+ if (params->ftm_responder != -1) {
+ sdata->vif.bss_conf.ftm_responder = params->ftm_responder;
+ err = ieee80211_set_ftm_responder_params(sdata,
+ params->lci,
+ params->lci_len,
+ params->civicloc,
+ params->civicloc_len);
+
+ if (err < 0)
+ return err;
+
+ changed |= BSS_CHANGED_FTM_RESPONDER;
+ }
+
rcu_assign_pointer(sdata->u.ap.beacon, new);
if (old)
@@ -911,6 +965,9 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
sdata->vif.bss_conf.beacon_int = params->beacon_interval;
+ if (params->he_cap)
+ sdata->vif.bss_conf.he_support = true;
+
mutex_lock(&local->mtx);
err = ieee80211_vif_use_channel(sdata, &params->chandef,
IEEE80211_CHANCTX_SHARED);
@@ -1062,6 +1119,9 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev)
kfree_rcu(old_probe_resp, rcu_head);
sdata->u.ap.driver_smps_mode = IEEE80211_SMPS_OFF;
+ kfree(sdata->vif.bss_conf.ftmr_params);
+ sdata->vif.bss_conf.ftmr_params = NULL;
+
__sta_info_flush(sdata, true);
ieee80211_free_keys(sdata, true);
@@ -1092,50 +1152,6 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev)
return 0;
}
-/* Layer 2 Update frame (802.2 Type 1 LLC XID Update response) */
-struct iapp_layer2_update {
- u8 da[ETH_ALEN]; /* broadcast */
- u8 sa[ETH_ALEN]; /* STA addr */
- __be16 len; /* 6 */
- u8 dsap; /* 0 */
- u8 ssap; /* 0 */
- u8 control;
- u8 xid_info[3];
-} __packed;
-
-static void ieee80211_send_layer2_update(struct sta_info *sta)
-{
- struct iapp_layer2_update *msg;
- struct sk_buff *skb;
-
- /* Send Level 2 Update Frame to update forwarding tables in layer 2
- * bridge devices */
-
- skb = dev_alloc_skb(sizeof(*msg));
- if (!skb)
- return;
- msg = skb_put(skb, sizeof(*msg));
-
- /* 802.2 Type 1 Logical Link Control (LLC) Exchange Identifier (XID)
- * Update response frame; IEEE Std 802.2-1998, 5.4.1.2.1 */
-
- eth_broadcast_addr(msg->da);
- memcpy(msg->sa, sta->sta.addr, ETH_ALEN);
- msg->len = htons(6);
- msg->dsap = 0;
- msg->ssap = 0x01; /* NULL LSAP, CR Bit: Response */
- msg->control = 0xaf; /* XID response lsb.1111F101.
- * F=0 (no poll command; unsolicited frame) */
- msg->xid_info[0] = 0x81; /* XID format identifier */
- msg->xid_info[1] = 1; /* LLC types/classes: Type 1 LLC */
- msg->xid_info[2] = 0; /* XID sender's receive window size (RW) */
-
- skb->dev = sta->sdata->dev;
- skb->protocol = eth_type_trans(skb, sta->sdata->dev);
- memset(skb->cb, 0, sizeof(skb->cb));
- netif_rx_ni(skb);
-}
-
static int sta_apply_auth_flags(struct ieee80211_local *local,
struct sta_info *sta,
u32 mask, u32 set)
@@ -1412,6 +1428,11 @@ static int sta_apply_parameters(struct ieee80211_local *local,
ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband,
params->vht_capa, sta);
+ if (params->he_capa)
+ ieee80211_he_cap_ie_to_sta_he_cap(sdata, sband,
+ (void *)params->he_capa,
+ params->he_capa_len, sta);
+
if (params->opmode_notif_used) {
/* returned value is only needed for rc update, but the
* rc isn't initialized here yet, so ignore it
@@ -1494,7 +1515,7 @@ static int ieee80211_add_station(struct wiphy *wiphy, struct net_device *dev,
}
if (layer2_update)
- ieee80211_send_layer2_update(sta);
+ cfg80211_send_layer2_update(sta->sdata->dev, sta->sta.addr);
rcu_read_unlock();
@@ -1596,7 +1617,7 @@ static int ieee80211_change_station(struct wiphy *wiphy,
if (test_sta_flag(sta, WLAN_STA_AUTHORIZED))
ieee80211_vif_inc_num_mcast(sta->sdata);
- ieee80211_send_layer2_update(sta);
+ cfg80211_send_layer2_update(sta->sdata->dev, sta->sta.addr);
}
err = sta_apply_parameters(local, sta, params);
@@ -2913,6 +2934,20 @@ cfg80211_beacon_dup(struct cfg80211_beacon_data *beacon)
memcpy(pos, beacon->probe_resp, beacon->probe_resp_len);
pos += beacon->probe_resp_len;
}
+ if (beacon->ftm_responder)
+ new_beacon->ftm_responder = beacon->ftm_responder;
+ if (beacon->lci) {
+ new_beacon->lci_len = beacon->lci_len;
+ new_beacon->lci = pos;
+ memcpy(pos, beacon->lci, beacon->lci_len);
+ pos += beacon->lci_len;
+ }
+ if (beacon->civicloc) {
+ new_beacon->civicloc_len = beacon->civicloc_len;
+ new_beacon->civicloc = pos;
+ memcpy(pos, beacon->civicloc, beacon->civicloc_len);
+ pos += beacon->civicloc_len;
+ }
return new_beacon;
}
@@ -3486,7 +3521,7 @@ static int ieee80211_probe_client(struct wiphy *wiphy, struct net_device *dev,
}
local_bh_disable();
- ieee80211_xmit(sdata, sta, skb);
+ ieee80211_xmit(sdata, sta, skb, 0);
local_bh_enable();
ret = 0;
@@ -3803,6 +3838,17 @@ out:
return ret;
}
+static int
+ieee80211_get_ftm_responder_stats(struct wiphy *wiphy,
+ struct net_device *dev,
+ struct cfg80211_ftm_responder_stats *ftm_stats)
+{
+ struct ieee80211_local *local = wiphy_priv(wiphy);
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+
+ return drv_get_ftm_responder_stats(local, sdata, ftm_stats);
+}
+
const struct cfg80211_ops mac80211_config_ops = {
.add_virtual_intf = ieee80211_add_iface,
.del_virtual_intf = ieee80211_del_iface,
@@ -3897,4 +3943,5 @@ const struct cfg80211_ops mac80211_config_ops = {
.set_multicast_to_unicast = ieee80211_set_multicast_to_unicast,
.tx_control_port = ieee80211_tx_control_port,
.get_txq_stats = ieee80211_get_txq_stats,
+ .get_ftm_responder_stats = ieee80211_get_ftm_responder_stats,
};
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index b5adf3625d16..3fe541e358f3 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -3,6 +3,7 @@
*
* Copyright 2007 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
+ * Copyright (C) 2018 Intel Corporation
*
* GPLv2
*
@@ -214,6 +215,9 @@ static const char *hw_flag_names[] = {
FLAG(SUPPORTS_TDLS_BUFFER_STA),
FLAG(DEAUTH_NEED_MGD_TX_PREP),
FLAG(DOESNT_SUPPORT_QOS_NDP),
+ FLAG(BUFF_MMPDU_TXQ),
+ FLAG(SUPPORTS_VHT_EXT_NSS_BW),
+ FLAG(STA_MMPDU_TXQ),
#undef FLAG
};
diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c
index 4105081dc1df..af5185a836e5 100644
--- a/net/mac80211/debugfs_sta.c
+++ b/net/mac80211/debugfs_sta.c
@@ -4,6 +4,7 @@
* Copyright 2007 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
* Copyright(c) 2016 Intel Deutschland GmbH
+ * Copyright (C) 2018 Intel Corporation
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -140,7 +141,7 @@ static ssize_t sta_aqm_read(struct file *file, char __user *userbuf,
{
struct sta_info *sta = file->private_data;
struct ieee80211_local *local = sta->local;
- size_t bufsz = AQM_TXQ_ENTRY_LEN*(IEEE80211_NUM_TIDS+1);
+ size_t bufsz = AQM_TXQ_ENTRY_LEN * (IEEE80211_NUM_TIDS + 2);
char *buf = kzalloc(bufsz, GFP_KERNEL), *p = buf;
struct txq_info *txqi;
ssize_t rv;
@@ -162,7 +163,9 @@ static ssize_t sta_aqm_read(struct file *file, char __user *userbuf,
bufsz+buf-p,
"tid ac backlog-bytes backlog-packets new-flows drops marks overlimit collisions tx-bytes tx-packets flags\n");
- for (i = 0; i < IEEE80211_NUM_TIDS; i++) {
+ for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) {
+ if (!sta->sta.txq[i])
+ continue;
txqi = to_txq_info(sta->sta.txq[i]);
p += scnprintf(p, bufsz+buf-p,
"%d %d %u %u %u %u %u %u %u %u %u 0x%lx(%s%s%s)\n",
@@ -487,12 +490,368 @@ static ssize_t sta_vht_capa_read(struct file *file, char __user *userbuf,
p += scnprintf(p, sizeof(buf)+buf-p,
"MCS TX highest: %d Mbps\n",
le16_to_cpu(vhtc->vht_mcs.tx_highest));
+#undef PFLAG
}
return simple_read_from_buffer(userbuf, count, ppos, buf, p - buf);
}
STA_OPS(vht_capa);
+static ssize_t sta_he_capa_read(struct file *file, char __user *userbuf,
+ size_t count, loff_t *ppos)
+{
+ char *buf, *p;
+ size_t buf_sz = PAGE_SIZE;
+ struct sta_info *sta = file->private_data;
+ struct ieee80211_sta_he_cap *hec = &sta->sta.he_cap;
+ struct ieee80211_he_mcs_nss_supp *nss = &hec->he_mcs_nss_supp;
+ u8 ppe_size;
+ u8 *cap;
+ int i;
+ ssize_t ret;
+
+ buf = kmalloc(buf_sz, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+ p = buf;
+
+ p += scnprintf(p, buf_sz + buf - p, "HE %ssupported\n",
+ hec->has_he ? "" : "not ");
+ if (!hec->has_he)
+ goto out;
+
+ cap = hec->he_cap_elem.mac_cap_info;
+ p += scnprintf(p, buf_sz + buf - p,
+ "MAC-CAP: %#.2x %#.2x %#.2x %#.2x %#.2x %#.2x\n",
+ cap[0], cap[1], cap[2], cap[3], cap[4], cap[5]);
+
+#define PRINT(fmt, ...) \
+ p += scnprintf(p, buf_sz + buf - p, "\t\t" fmt "\n", \
+ ##__VA_ARGS__)
+
+#define PFLAG(t, n, a, b) \
+ do { \
+ if (cap[n] & IEEE80211_HE_##t##_CAP##n##_##a) \
+ PRINT("%s", b); \
+ } while (0)
+
+#define PFLAG_RANGE(t, i, n, s, m, off, fmt) \
+ do { \
+ u8 msk = IEEE80211_HE_##t##_CAP##i##_##n##_MASK; \
+ u8 idx = ((cap[i] & msk) >> (ffs(msk) - 1)) + off; \
+ PRINT(fmt, (s << idx) + (m * idx)); \
+ } while (0)
+
+#define PFLAG_RANGE_DEFAULT(t, i, n, s, m, off, fmt, a, b) \
+ do { \
+ if (cap[i] == IEEE80211_HE_##t ##_CAP##i##_##n##_##a) { \
+ PRINT("%s", b); \
+ break; \
+ } \
+ PFLAG_RANGE(t, i, n, s, m, off, fmt); \
+ } while (0)
+
+ PFLAG(MAC, 0, HTC_HE, "HTC-HE");
+ PFLAG(MAC, 0, TWT_REQ, "TWT-REQ");
+ PFLAG(MAC, 0, TWT_RES, "TWT-RES");
+ PFLAG_RANGE_DEFAULT(MAC, 0, DYNAMIC_FRAG, 0, 1, 0,
+ "DYNAMIC-FRAG-LEVEL-%d", NOT_SUPP, "NOT-SUPP");
+ PFLAG_RANGE_DEFAULT(MAC, 0, MAX_NUM_FRAG_MSDU, 1, 0, 0,
+ "MAX-NUM-FRAG-MSDU-%d", UNLIMITED, "UNLIMITED");
+
+ PFLAG_RANGE_DEFAULT(MAC, 1, MIN_FRAG_SIZE, 128, 0, -1,
+ "MIN-FRAG-SIZE-%d", UNLIMITED, "UNLIMITED");
+ PFLAG_RANGE_DEFAULT(MAC, 1, TF_MAC_PAD_DUR, 0, 8, 0,
+ "TF-MAC-PAD-DUR-%dUS", MASK, "UNKNOWN");
+ PFLAG_RANGE(MAC, 1, MULTI_TID_AGG_RX_QOS, 0, 1, 1,
+ "MULTI-TID-AGG-RX-QOS-%d");
+
+ if (cap[0] & IEEE80211_HE_MAC_CAP0_HTC_HE) {
+ switch (((cap[2] << 1) | (cap[1] >> 7)) & 0x3) {
+ case 0:
+ PRINT("LINK-ADAPTATION-NO-FEEDBACK");
+ break;
+ case 1:
+ PRINT("LINK-ADAPTATION-RESERVED");
+ break;
+ case 2:
+ PRINT("LINK-ADAPTATION-UNSOLICITED-FEEDBACK");
+ break;
+ case 3:
+ PRINT("LINK-ADAPTATION-BOTH");
+ break;
+ }
+ }
+
+ PFLAG(MAC, 2, ALL_ACK, "ALL-ACK");
+ PFLAG(MAC, 2, TRS, "TRS");
+ PFLAG(MAC, 2, BSR, "BSR");
+ PFLAG(MAC, 2, BCAST_TWT, "BCAST-TWT");
+ PFLAG(MAC, 2, 32BIT_BA_BITMAP, "32BIT-BA-BITMAP");
+ PFLAG(MAC, 2, MU_CASCADING, "MU-CASCADING");
+ PFLAG(MAC, 2, ACK_EN, "ACK-EN");
+
+ PFLAG(MAC, 3, OMI_CONTROL, "OMI-CONTROL");
+ PFLAG(MAC, 3, OFDMA_RA, "OFDMA-RA");
+
+ switch (cap[3] & IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_MASK) {
+ case IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_USE_VHT:
+ PRINT("MAX-AMPDU-LEN-EXP-USE-VHT");
+ break;
+ case IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_VHT_1:
+ PRINT("MAX-AMPDU-LEN-EXP-VHT-1");
+ break;
+ case IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_VHT_2:
+ PRINT("MAX-AMPDU-LEN-EXP-VHT-2");
+ break;
+ case IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_RESERVED:
+ PRINT("MAX-AMPDU-LEN-EXP-RESERVED");
+ break;
+ }
+
+ PFLAG(MAC, 3, AMSDU_FRAG, "AMSDU-FRAG");
+ PFLAG(MAC, 3, FLEX_TWT_SCHED, "FLEX-TWT-SCHED");
+ PFLAG(MAC, 3, RX_CTRL_FRAME_TO_MULTIBSS, "RX-CTRL-FRAME-TO-MULTIBSS");
+
+ PFLAG(MAC, 4, BSRP_BQRP_A_MPDU_AGG, "BSRP-BQRP-A-MPDU-AGG");
+ PFLAG(MAC, 4, QTP, "QTP");
+ PFLAG(MAC, 4, BQR, "BQR");
+ PFLAG(MAC, 4, SRP_RESP, "SRP-RESP");
+ PFLAG(MAC, 4, NDP_FB_REP, "NDP-FB-REP");
+ PFLAG(MAC, 4, OPS, "OPS");
+ PFLAG(MAC, 4, AMDSU_IN_AMPDU, "AMSDU-IN-AMPDU");
+
+ PRINT("MULTI-TID-AGG-TX-QOS-%d", ((cap[5] << 1) | (cap[4] >> 7)) & 0x7);
+
+ PFLAG(MAC, 5, SUBCHAN_SELECVITE_TRANSMISSION,
+ "SUBCHAN-SELECVITE-TRANSMISSION");
+ PFLAG(MAC, 5, UL_2x996_TONE_RU, "UL-2x996-TONE-RU");
+ PFLAG(MAC, 5, OM_CTRL_UL_MU_DATA_DIS_RX, "OM-CTRL-UL-MU-DATA-DIS-RX");
+
+ cap = hec->he_cap_elem.phy_cap_info;
+ p += scnprintf(p, buf_sz + buf - p,
+ "PHY CAP: %#.2x %#.2x %#.2x %#.2x %#.2x %#.2x %#.2x %#.2x %#.2x %#.2x %#.2x\n",
+ cap[0], cap[1], cap[2], cap[3], cap[4], cap[5], cap[6],
+ cap[7], cap[8], cap[9], cap[10]);
+
+ PFLAG(PHY, 0, CHANNEL_WIDTH_SET_40MHZ_IN_2G,
+ "CHANNEL-WIDTH-SET-40MHZ-IN-2G");
+ PFLAG(PHY, 0, CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G,
+ "CHANNEL-WIDTH-SET-40MHZ-80MHZ-IN-5G");
+ PFLAG(PHY, 0, CHANNEL_WIDTH_SET_160MHZ_IN_5G,
+ "CHANNEL-WIDTH-SET-160MHZ-IN-5G");
+ PFLAG(PHY, 0, CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G,
+ "CHANNEL-WIDTH-SET-80PLUS80-MHZ-IN-5G");
+ PFLAG(PHY, 0, CHANNEL_WIDTH_SET_RU_MAPPING_IN_2G,
+ "CHANNEL-WIDTH-SET-RU-MAPPING-IN-2G");
+ PFLAG(PHY, 0, CHANNEL_WIDTH_SET_RU_MAPPING_IN_5G,
+ "CHANNEL-WIDTH-SET-RU-MAPPING-IN-5G");
+
+ switch (cap[1] & IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK) {
+ case IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_80MHZ_ONLY_SECOND_20MHZ:
+ PRINT("PREAMBLE-PUNC-RX-80MHZ-ONLY-SECOND-20MHZ");
+ break;
+ case IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_80MHZ_ONLY_SECOND_40MHZ:
+ PRINT("PREAMBLE-PUNC-RX-80MHZ-ONLY-SECOND-40MHZ");
+ break;
+ case IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_160MHZ_ONLY_SECOND_20MHZ:
+ PRINT("PREAMBLE-PUNC-RX-160MHZ-ONLY-SECOND-20MHZ");
+ break;
+ case IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_160MHZ_ONLY_SECOND_40MHZ:
+ PRINT("PREAMBLE-PUNC-RX-160MHZ-ONLY-SECOND-40MHZ");
+ break;
+ }
+
+ PFLAG(PHY, 1, DEVICE_CLASS_A,
+ "IEEE80211-HE-PHY-CAP1-DEVICE-CLASS-A");
+ PFLAG(PHY, 1, LDPC_CODING_IN_PAYLOAD,
+ "LDPC-CODING-IN-PAYLOAD");
+ PFLAG(PHY, 1, HE_LTF_AND_GI_FOR_HE_PPDUS_0_8US,
+ "HY-CAP1-HE-LTF-AND-GI-FOR-HE-PPDUS-0-8US");
+ PRINT("MIDAMBLE-RX-MAX-NSTS-%d", ((cap[2] << 1) | (cap[1] >> 7)) & 0x3);
+
+ PFLAG(PHY, 2, NDP_4x_LTF_AND_3_2US, "NDP-4X-LTF-AND-3-2US");
+ PFLAG(PHY, 2, STBC_TX_UNDER_80MHZ, "STBC-TX-UNDER-80MHZ");
+ PFLAG(PHY, 2, STBC_RX_UNDER_80MHZ, "STBC-RX-UNDER-80MHZ");
+ PFLAG(PHY, 2, DOPPLER_TX, "DOPPLER-TX");
+ PFLAG(PHY, 2, DOPPLER_RX, "DOPPLER-RX");
+ PFLAG(PHY, 2, UL_MU_FULL_MU_MIMO, "UL-MU-FULL-MU-MIMO");
+ PFLAG(PHY, 2, UL_MU_PARTIAL_MU_MIMO, "UL-MU-PARTIAL-MU-MIMO");
+
+ switch (cap[3] & IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_MASK) {
+ case IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_NO_DCM:
+ PRINT("DCM-MAX-CONST-TX-NO-DCM");
+ break;
+ case IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_BPSK:
+ PRINT("DCM-MAX-CONST-TX-BPSK");
+ break;
+ case IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_QPSK:
+ PRINT("DCM-MAX-CONST-TX-QPSK");
+ break;
+ case IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_16_QAM:
+ PRINT("DCM-MAX-CONST-TX-16-QAM");
+ break;
+ }
+
+ PFLAG(PHY, 3, DCM_MAX_TX_NSS_1, "DCM-MAX-TX-NSS-1");
+ PFLAG(PHY, 3, DCM_MAX_TX_NSS_2, "DCM-MAX-TX-NSS-2");
+
+ switch (cap[3] & IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_MASK) {
+ case IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_NO_DCM:
+ PRINT("DCM-MAX-CONST-RX-NO-DCM");
+ break;
+ case IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_BPSK:
+ PRINT("DCM-MAX-CONST-RX-BPSK");
+ break;
+ case IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_QPSK:
+ PRINT("DCM-MAX-CONST-RX-QPSK");
+ break;
+ case IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_16_QAM:
+ PRINT("DCM-MAX-CONST-RX-16-QAM");
+ break;
+ }
+
+ PFLAG(PHY, 3, DCM_MAX_RX_NSS_1, "DCM-MAX-RX-NSS-1");
+ PFLAG(PHY, 3, DCM_MAX_RX_NSS_2, "DCM-MAX-RX-NSS-2");
+ PFLAG(PHY, 3, RX_HE_MU_PPDU_FROM_NON_AP_STA,
+ "RX-HE-MU-PPDU-FROM-NON-AP-STA");
+ PFLAG(PHY, 3, SU_BEAMFORMER, "SU-BEAMFORMER");
+
+ PFLAG(PHY, 4, SU_BEAMFORMEE, "SU-BEAMFORMEE");
+ PFLAG(PHY, 4, MU_BEAMFORMER, "MU-BEAMFORMER");
+
+ PFLAG_RANGE(PHY, 4, BEAMFORMEE_MAX_STS_UNDER_80MHZ, 0, 1, 4,
+ "BEAMFORMEE-MAX-STS-UNDER-%d");
+ PFLAG_RANGE(PHY, 4, BEAMFORMEE_MAX_STS_ABOVE_80MHZ, 0, 1, 4,
+ "BEAMFORMEE-MAX-STS-ABOVE-%d");
+
+ PFLAG_RANGE(PHY, 5, BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ, 0, 1, 1,
+ "NUM-SND-DIM-UNDER-80MHZ-%d");
+ PFLAG_RANGE(PHY, 5, BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ, 0, 1, 1,
+ "NUM-SND-DIM-ABOVE-80MHZ-%d");
+ PFLAG(PHY, 5, NG16_SU_FEEDBACK, "NG16-SU-FEEDBACK");
+ PFLAG(PHY, 5, NG16_MU_FEEDBACK, "NG16-MU-FEEDBACK");
+
+ PFLAG(PHY, 6, CODEBOOK_SIZE_42_SU, "CODEBOOK-SIZE-42-SU");
+ PFLAG(PHY, 6, CODEBOOK_SIZE_75_MU, "CODEBOOK-SIZE-75-MU");
+ PFLAG(PHY, 6, TRIG_SU_BEAMFORMER_FB, "TRIG-SU-BEAMFORMER-FB");
+ PFLAG(PHY, 6, TRIG_MU_BEAMFORMER_FB, "TRIG-MU-BEAMFORMER-FB");
+ PFLAG(PHY, 6, TRIG_CQI_FB, "TRIG-CQI-FB");
+ PFLAG(PHY, 6, PARTIAL_BW_EXT_RANGE, "PARTIAL-BW-EXT-RANGE");
+ PFLAG(PHY, 6, PARTIAL_BANDWIDTH_DL_MUMIMO,
+ "PARTIAL-BANDWIDTH-DL-MUMIMO");
+ PFLAG(PHY, 6, PPE_THRESHOLD_PRESENT, "PPE-THRESHOLD-PRESENT");
+
+ PFLAG(PHY, 7, SRP_BASED_SR, "SRP-BASED-SR");
+ PFLAG(PHY, 7, POWER_BOOST_FACTOR_AR, "POWER-BOOST-FACTOR-AR");
+ PFLAG(PHY, 7, HE_SU_MU_PPDU_4XLTF_AND_08_US_GI,
+ "HE-SU-MU-PPDU-4XLTF-AND-08-US-GI");
+ PFLAG_RANGE(PHY, 7, MAX_NC, 0, 1, 1, "MAX-NC-%d");
+ PFLAG(PHY, 7, STBC_TX_ABOVE_80MHZ, "STBC-TX-ABOVE-80MHZ");
+ PFLAG(PHY, 7, STBC_RX_ABOVE_80MHZ, "STBC-RX-ABOVE-80MHZ");
+
+ PFLAG(PHY, 8, HE_ER_SU_PPDU_4XLTF_AND_08_US_GI,
+ "HE-ER-SU-PPDU-4XLTF-AND-08-US-GI");
+ PFLAG(PHY, 8, 20MHZ_IN_40MHZ_HE_PPDU_IN_2G,
+ "20MHZ-IN-40MHZ-HE-PPDU-IN-2G");
+ PFLAG(PHY, 8, 20MHZ_IN_160MHZ_HE_PPDU, "20MHZ-IN-160MHZ-HE-PPDU");
+ PFLAG(PHY, 8, 80MHZ_IN_160MHZ_HE_PPDU, "80MHZ-IN-160MHZ-HE-PPDU");
+ PFLAG(PHY, 8, HE_ER_SU_1XLTF_AND_08_US_GI,
+ "HE-ER-SU-1XLTF-AND-08-US-GI");
+ PFLAG(PHY, 8, MIDAMBLE_RX_TX_2X_AND_1XLTF,
+ "MIDAMBLE-RX-TX-2X-AND-1XLTF");
+
+ switch (cap[8] & IEEE80211_HE_PHY_CAP8_DCM_MAX_BW_MASK) {
+ case IEEE80211_HE_PHY_CAP8_DCM_MAX_BW_20MHZ:
+ PRINT("DDCM-MAX-BW-20MHZ");
+ break;
+ case IEEE80211_HE_PHY_CAP8_DCM_MAX_BW_40MHZ:
+ PRINT("DCM-MAX-BW-40MHZ");
+ break;
+ case IEEE80211_HE_PHY_CAP8_DCM_MAX_BW_80MHZ:
+ PRINT("DCM-MAX-BW-80MHZ");
+ break;
+ case IEEE80211_HE_PHY_CAP8_DCM_MAX_BW_160_OR_80P80_MHZ:
+ PRINT("DCM-MAX-BW-160-OR-80P80-MHZ");
+ break;
+ }
+
+ PFLAG(PHY, 9, LONGER_THAN_16_SIGB_OFDM_SYM,
+ "LONGER-THAN-16-SIGB-OFDM-SYM");
+ PFLAG(PHY, 9, NON_TRIGGERED_CQI_FEEDBACK,
+ "NON-TRIGGERED-CQI-FEEDBACK");
+ PFLAG(PHY, 9, TX_1024_QAM_LESS_THAN_242_TONE_RU,
+ "TX-1024-QAM-LESS-THAN-242-TONE-RU");
+ PFLAG(PHY, 9, RX_1024_QAM_LESS_THAN_242_TONE_RU,
+ "RX-1024-QAM-LESS-THAN-242-TONE-RU");
+ PFLAG(PHY, 9, RX_FULL_BW_SU_USING_MU_WITH_COMP_SIGB,
+ "RX-FULL-BW-SU-USING-MU-WITH-COMP-SIGB");
+ PFLAG(PHY, 9, RX_FULL_BW_SU_USING_MU_WITH_NON_COMP_SIGB,
+ "RX-FULL-BW-SU-USING-MU-WITH-NON-COMP-SIGB");
+
+#undef PFLAG_RANGE_DEFAULT
+#undef PFLAG_RANGE
+#undef PFLAG
+
+#define PRINT_NSS_SUPP(f, n) \
+ do { \
+ int i; \
+ u16 v = le16_to_cpu(nss->f); \
+ p += scnprintf(p, buf_sz + buf - p, n ": %#.4x\n", v); \
+ for (i = 0; i < 8; i += 2) { \
+ switch ((v >> i) & 0x3) { \
+ case 0: \
+ PRINT(n "-%d-SUPPORT-0-7", i / 2); \
+ break; \
+ case 1: \
+ PRINT(n "-%d-SUPPORT-0-9", i / 2); \
+ break; \
+ case 2: \
+ PRINT(n "-%d-SUPPORT-0-11", i / 2); \
+ break; \
+ case 3: \
+ PRINT(n "-%d-NOT-SUPPORTED", i / 2); \
+ break; \
+ } \
+ } \
+ } while (0)
+
+ PRINT_NSS_SUPP(rx_mcs_80, "RX-MCS-80");
+ PRINT_NSS_SUPP(tx_mcs_80, "TX-MCS-80");
+
+ if (cap[0] & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G) {
+ PRINT_NSS_SUPP(rx_mcs_160, "RX-MCS-160");
+ PRINT_NSS_SUPP(tx_mcs_160, "TX-MCS-160");
+ }
+
+ if (cap[0] &
+ IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G) {
+ PRINT_NSS_SUPP(rx_mcs_80p80, "RX-MCS-80P80");
+ PRINT_NSS_SUPP(tx_mcs_80p80, "TX-MCS-80P80");
+ }
+
+#undef PRINT_NSS_SUPP
+#undef PRINT
+
+ if (!(cap[6] & IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT))
+ goto out;
+
+ p += scnprintf(p, buf_sz + buf - p, "PPE-THRESHOLDS: %#.2x",
+ hec->ppe_thres[0]);
+
+ ppe_size = ieee80211_he_ppe_size(hec->ppe_thres[0], cap);
+ for (i = 1; i < ppe_size; i++) {
+ p += scnprintf(p, buf_sz + buf - p, " %#.2x",
+ hec->ppe_thres[i]);
+ }
+ p += scnprintf(p, buf_sz + buf - p, "\n");
+
+out:
+ ret = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf);
+ kfree(buf);
+ return ret;
+}
+STA_OPS(he_capa);
#define DEBUGFS_ADD(name) \
debugfs_create_file(#name, 0400, \
@@ -538,6 +897,7 @@ void ieee80211_sta_debugfs_add(struct sta_info *sta)
DEBUGFS_ADD(agg_status);
DEBUGFS_ADD(ht_capa);
DEBUGFS_ADD(vht_capa);
+ DEBUGFS_ADD(he_capa);
DEBUGFS_ADD_COUNTER(rx_duplicates, rx_stats.num_duplicates);
DEBUGFS_ADD_COUNTER(rx_fragments, rx_stats.fragments);
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index 8f6998091d26..0b1747a2313d 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -1173,6 +1173,32 @@ static inline void drv_wake_tx_queue(struct ieee80211_local *local,
local->ops->wake_tx_queue(&local->hw, &txq->txq);
}
+static inline int drv_can_aggregate_in_amsdu(struct ieee80211_local *local,
+ struct sk_buff *head,
+ struct sk_buff *skb)
+{
+ if (!local->ops->can_aggregate_in_amsdu)
+ return true;
+
+ return local->ops->can_aggregate_in_amsdu(&local->hw, head, skb);
+}
+
+static inline int
+drv_get_ftm_responder_stats(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct cfg80211_ftm_responder_stats *ftm_stats)
+{
+ u32 ret = -EOPNOTSUPP;
+
+ if (local->ops->get_ftm_responder_stats)
+ ret = local->ops->get_ftm_responder_stats(&local->hw,
+ &sdata->vif,
+ ftm_stats);
+ trace_drv_get_ftm_responder_stats(local, sdata, ftm_stats);
+
+ return ret;
+}
+
static inline int drv_start_nan(struct ieee80211_local *local,
struct ieee80211_sub_if_data *sdata,
struct cfg80211_nan_conf *conf)
diff --git a/net/mac80211/ethtool.c b/net/mac80211/ethtool.c
index 690c142a7a44..5ac743816b59 100644
--- a/net/mac80211/ethtool.c
+++ b/net/mac80211/ethtool.c
@@ -116,16 +116,16 @@ static void ieee80211_get_stats(struct net_device *dev,
data[i++] = sta->sta_state;
- if (sinfo.filled & BIT(NL80211_STA_INFO_TX_BITRATE))
+ if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_TX_BITRATE))
data[i] = 100000ULL *
cfg80211_calculate_bitrate(&sinfo.txrate);
i++;
- if (sinfo.filled & BIT(NL80211_STA_INFO_RX_BITRATE))
+ if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_RX_BITRATE))
data[i] = 100000ULL *
cfg80211_calculate_bitrate(&sinfo.rxrate);
i++;
- if (sinfo.filled & BIT(NL80211_STA_INFO_SIGNAL_AVG))
+ if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_SIGNAL_AVG))
data[i] = (u8)sinfo.signal_avg;
i++;
} else {
diff --git a/net/mac80211/he.c b/net/mac80211/he.c
new file mode 100644
index 000000000000..769078ed5a12
--- /dev/null
+++ b/net/mac80211/he.c
@@ -0,0 +1,55 @@
+/*
+ * HE handling
+ *
+ * Copyright(c) 2017 Intel Deutschland GmbH
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include "ieee80211_i.h"
+
+void
+ieee80211_he_cap_ie_to_sta_he_cap(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_supported_band *sband,
+ const u8 *he_cap_ie, u8 he_cap_len,
+ struct sta_info *sta)
+{
+ struct ieee80211_sta_he_cap *he_cap = &sta->sta.he_cap;
+ struct ieee80211_he_cap_elem *he_cap_ie_elem = (void *)he_cap_ie;
+ u8 he_ppe_size;
+ u8 mcs_nss_size;
+ u8 he_total_size;
+
+ memset(he_cap, 0, sizeof(*he_cap));
+
+ if (!he_cap_ie || !ieee80211_get_he_sta_cap(sband))
+ return;
+
+ /* Make sure size is OK */
+ mcs_nss_size = ieee80211_he_mcs_nss_size(he_cap_ie_elem);
+ he_ppe_size =
+ ieee80211_he_ppe_size(he_cap_ie[sizeof(he_cap->he_cap_elem) +
+ mcs_nss_size],
+ he_cap_ie_elem->phy_cap_info);
+ he_total_size = sizeof(he_cap->he_cap_elem) + mcs_nss_size +
+ he_ppe_size;
+ if (he_cap_len < he_total_size)
+ return;
+
+ memcpy(&he_cap->he_cap_elem, he_cap_ie, sizeof(he_cap->he_cap_elem));
+
+ /* HE Tx/Rx HE MCS NSS Support Field */
+ memcpy(&he_cap->he_mcs_nss_supp,
+ &he_cap_ie[sizeof(he_cap->he_cap_elem)], mcs_nss_size);
+
+ /* Check if there are (optional) PPE Thresholds */
+ if (he_cap->he_cap_elem.phy_cap_info[6] &
+ IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT)
+ memcpy(he_cap->ppe_thres,
+ &he_cap_ie[sizeof(he_cap->he_cap_elem) + mcs_nss_size],
+ he_ppe_size);
+
+ he_cap->has_he = true;
+}
diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c
index 26a7ba3b698f..f849ea814993 100644
--- a/net/mac80211/ht.c
+++ b/net/mac80211/ht.c
@@ -352,7 +352,7 @@ void ieee80211_ba_session_work(struct work_struct *work)
test_and_clear_bit(tid,
sta->ampdu_mlme.tid_rx_manage_offl))
___ieee80211_start_rx_ba_session(sta, 0, 0, 0, 1, tid,
- IEEE80211_MAX_AMPDU_BUF,
+ IEEE80211_MAX_AMPDU_BUF_HT,
false, true);
if (test_and_clear_bit(tid + IEEE80211_NUM_TIDS,
diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c
index 6449a1c2283b..0d704e8d7078 100644
--- a/net/mac80211/ibss.c
+++ b/net/mac80211/ibss.c
@@ -947,8 +947,8 @@ static void ieee80211_rx_mgmt_deauth_ibss(struct ieee80211_sub_if_data *sdata,
if (len < IEEE80211_DEAUTH_FRAME_LEN)
return;
- ibss_dbg(sdata, "RX DeAuth SA=%pM DA=%pM BSSID=%pM (reason: %d)\n",
- mgmt->sa, mgmt->da, mgmt->bssid, reason);
+ ibss_dbg(sdata, "RX DeAuth SA=%pM DA=%pM\n", mgmt->sa, mgmt->da);
+ ibss_dbg(sdata, "\tBSSID=%pM (reason: %d)\n", mgmt->bssid, reason);
sta_info_destroy_addr(sdata, mgmt->sa);
}
@@ -966,9 +966,9 @@ static void ieee80211_rx_mgmt_auth_ibss(struct ieee80211_sub_if_data *sdata,
auth_alg = le16_to_cpu(mgmt->u.auth.auth_alg);
auth_transaction = le16_to_cpu(mgmt->u.auth.auth_transaction);
- ibss_dbg(sdata,
- "RX Auth SA=%pM DA=%pM BSSID=%pM (auth_transaction=%d)\n",
- mgmt->sa, mgmt->da, mgmt->bssid, auth_transaction);
+ ibss_dbg(sdata, "RX Auth SA=%pM DA=%pM\n", mgmt->sa, mgmt->da);
+ ibss_dbg(sdata, "\tBSSID=%pM (auth_transaction=%d)\n",
+ mgmt->bssid, auth_transaction);
if (auth_alg != WLAN_AUTH_OPEN || auth_transaction != 1)
return;
@@ -1070,7 +1070,9 @@ static void ieee80211_update_sta_info(struct ieee80211_sub_if_data *sdata,
struct ieee80211_vht_cap cap_ie;
struct ieee80211_sta_vht_cap cap = sta->sta.vht_cap;
- ieee80211_chandef_vht_oper(elems->vht_operation,
+ ieee80211_chandef_vht_oper(&local->hw,
+ elems->vht_operation,
+ elems->ht_operation,
&chandef);
memcpy(&cap_ie, elems->vht_cap_elem, sizeof(cap_ie));
ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband,
@@ -1175,10 +1177,10 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
rx_timestamp = drv_get_tsf(local, sdata);
}
- ibss_dbg(sdata,
- "RX beacon SA=%pM BSSID=%pM TSF=0x%llx BCN=0x%llx diff=%lld @%lu\n",
+ ibss_dbg(sdata, "RX beacon SA=%pM BSSID=%pM TSF=0x%llx\n",
mgmt->sa, mgmt->bssid,
- (unsigned long long)rx_timestamp,
+ (unsigned long long)rx_timestamp);
+ ibss_dbg(sdata, "\tBCN=0x%llx diff=%lld @%lu\n",
(unsigned long long)beacon_timestamp,
(unsigned long long)(rx_timestamp - beacon_timestamp),
jiffies);
@@ -1537,9 +1539,9 @@ static void ieee80211_rx_mgmt_probe_req(struct ieee80211_sub_if_data *sdata,
tx_last_beacon = drv_tx_last_beacon(local);
- ibss_dbg(sdata,
- "RX ProbeReq SA=%pM DA=%pM BSSID=%pM (tx_last_beacon=%d)\n",
- mgmt->sa, mgmt->da, mgmt->bssid, tx_last_beacon);
+ ibss_dbg(sdata, "RX ProbeReq SA=%pM DA=%pM\n", mgmt->sa, mgmt->da);
+ ibss_dbg(sdata, "\tBSSID=%pM (tx_last_beacon=%d)\n",
+ mgmt->bssid, tx_last_beacon);
if (!tx_last_beacon && is_multicast_ether_addr(mgmt->da))
return;
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index d1978aa1c15d..10a05062e4a0 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -165,6 +165,7 @@ typedef unsigned __bitwise ieee80211_tx_result;
#define TX_DROP ((__force ieee80211_tx_result) 1u)
#define TX_QUEUED ((__force ieee80211_tx_result) 2u)
+#define IEEE80211_TX_NO_SEQNO BIT(0)
#define IEEE80211_TX_UNICAST BIT(1)
#define IEEE80211_TX_PS_BUFFERED BIT(2)
@@ -364,6 +365,7 @@ enum ieee80211_sta_flags {
IEEE80211_STA_DISABLE_160MHZ = BIT(13),
IEEE80211_STA_DISABLE_WMM = BIT(14),
IEEE80211_STA_ENABLE_RRM = BIT(15),
+ IEEE80211_STA_DISABLE_HE = BIT(16),
};
struct ieee80211_mgd_auth_data {
@@ -375,6 +377,7 @@ struct ieee80211_mgd_auth_data {
u8 key[WLAN_KEY_LEN_WEP104];
u8 key_len, key_idx;
bool done;
+ bool peer_confirmed;
bool timeout_started;
u16 sae_trans, sae_status;
@@ -816,6 +819,7 @@ enum txq_info_flags {
IEEE80211_TXQ_STOP,
IEEE80211_TXQ_AMPDU,
IEEE80211_TXQ_NO_AMSDU,
+ IEEE80211_TXQ_STOP_NETIF_TX,
};
/**
@@ -1196,6 +1200,9 @@ struct ieee80211_local {
/* number of RX chains the hardware has */
u8 rx_chains;
+ /* bitmap of which sbands were copied */
+ u8 sband_allocated;
+
int tx_headroom; /* required headroom for hardware/radiotap */
/* Tasklet and skb queue to process calls from IRQ mode. All frames
@@ -1224,6 +1231,7 @@ struct ieee80211_local {
struct sk_buff_head pending[IEEE80211_MAX_QUEUES];
struct tasklet_struct tx_pending_tasklet;
+ struct tasklet_struct wake_txqs_tasklet;
atomic_t agg_queue_stop[IEEE80211_MAX_QUEUES];
@@ -1453,6 +1461,10 @@ struct ieee802_11_elems {
const struct ieee80211_vht_cap *vht_cap_elem;
const struct ieee80211_vht_operation *vht_operation;
const struct ieee80211_meshconf_ie *mesh_config;
+ const u8 *he_cap;
+ const struct ieee80211_he_operation *he_operation;
+ const struct ieee80211_mu_edca_param_set *mu_edca_param_set;
+ const u8 *uora_element;
const u8 *mesh_id;
const u8 *peering;
const __le16 *awake_window;
@@ -1482,6 +1494,7 @@ struct ieee802_11_elems {
u8 ext_supp_rates_len;
u8 wmm_info_len;
u8 wmm_param_len;
+ u8 he_cap_len;
u8 mesh_id_len;
u8 peering_len;
u8 preq_len;
@@ -1824,6 +1837,13 @@ void ieee80211_get_vht_mask_from_cap(__le16 vht_cap,
enum nl80211_chan_width
ieee80211_sta_rx_bw_to_chan_width(struct sta_info *sta);
+/* HE */
+void
+ieee80211_he_cap_ie_to_sta_he_cap(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_supported_band *sband,
+ const u8 *he_cap_ie, u8 he_cap_len,
+ struct sta_info *sta);
+
/* Spectrum management */
void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata,
struct ieee80211_mgmt *mgmt,
@@ -1880,19 +1900,20 @@ void ieee80211_regulatory_limit_wmm_params(struct ieee80211_sub_if_data *sdata,
void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata,
bool bss_notify, bool enable_qos);
void ieee80211_xmit(struct ieee80211_sub_if_data *sdata,
- struct sta_info *sta, struct sk_buff *skb);
+ struct sta_info *sta, struct sk_buff *skb,
+ u32 txdata_flags);
void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata,
struct sk_buff *skb, int tid,
- enum nl80211_band band);
+ enum nl80211_band band, u32 txdata_flags);
static inline void
ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata,
struct sk_buff *skb, int tid,
- enum nl80211_band band)
+ enum nl80211_band band, u32 txdata_flags)
{
rcu_read_lock();
- __ieee80211_tx_skb_tid_band(sdata, skb, tid, band);
+ __ieee80211_tx_skb_tid_band(sdata, skb, tid, band, txdata_flags);
rcu_read_unlock();
}
@@ -1910,7 +1931,7 @@ static inline void ieee80211_tx_skb_tid(struct ieee80211_sub_if_data *sdata,
}
__ieee80211_tx_skb_tid_band(sdata, skb, tid,
- chanctx_conf->def.chan->band);
+ chanctx_conf->def.chan->band, 0);
rcu_read_unlock();
}
@@ -2023,6 +2044,7 @@ void ieee80211_txq_remove_vlan(struct ieee80211_local *local,
struct ieee80211_sub_if_data *sdata);
void ieee80211_fill_txq_stats(struct cfg80211_txq_stats *txqstats,
struct txq_info *txqi);
+void ieee80211_wake_txqs(unsigned long data);
void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata,
u16 transaction, u16 auth_alg, u16 status,
const u8 *extra, size_t extra_len, const u8 *bssid,
@@ -2031,26 +2053,27 @@ void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata,
void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata,
const u8 *bssid, u16 stype, u16 reason,
bool send_frame, u8 *frame_buf);
+
+enum {
+ IEEE80211_PROBE_FLAG_DIRECTED = BIT(0),
+ IEEE80211_PROBE_FLAG_MIN_CONTENT = BIT(1),
+ IEEE80211_PROBE_FLAG_RANDOM_SN = BIT(2),
+};
+
int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer,
size_t buffer_len,
struct ieee80211_scan_ies *ie_desc,
const u8 *ie, size_t ie_len,
u8 bands_used, u32 *rate_masks,
- struct cfg80211_chan_def *chandef);
+ struct cfg80211_chan_def *chandef,
+ u32 flags);
struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata,
const u8 *src, const u8 *dst,
u32 ratemask,
struct ieee80211_channel *chan,
const u8 *ssid, size_t ssid_len,
const u8 *ie, size_t ie_len,
- bool directed);
-void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata,
- const u8 *src, const u8 *dst,
- const u8 *ssid, size_t ssid_len,
- const u8 *ie, size_t ie_len,
- u32 ratemask, bool directed, u32 tx_flags,
- struct ieee80211_channel *channel, bool scan);
-
+ u32 flags);
u32 ieee80211_sta_get_rates(struct ieee80211_sub_if_data *sdata,
struct ieee802_11_elems *elems,
enum nl80211_band band, u32 *basic_rates);
@@ -2073,6 +2096,9 @@ u8 *ieee80211_ie_build_vht_cap(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap,
u32 cap);
u8 *ieee80211_ie_build_vht_oper(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap,
const struct cfg80211_chan_def *chandef);
+u8 *ieee80211_ie_build_he_cap(u8 *pos,
+ const struct ieee80211_sta_he_cap *he_cap,
+ u8 *end);
int ieee80211_parse_bitrates(struct cfg80211_chan_def *chandef,
const struct ieee80211_supported_band *sband,
const u8 *srates, int srates_len, u32 *rates);
@@ -2087,7 +2113,9 @@ u8 *ieee80211_add_wmm_info_ie(u8 *buf, u8 qosinfo);
/* channel management */
bool ieee80211_chandef_ht_oper(const struct ieee80211_ht_operation *ht_oper,
struct cfg80211_chan_def *chandef);
-bool ieee80211_chandef_vht_oper(const struct ieee80211_vht_operation *oper,
+bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw,
+ const struct ieee80211_vht_operation *oper,
+ const struct ieee80211_ht_operation *htop,
struct cfg80211_chan_def *chandef);
u32 ieee80211_chandef_downgrade(struct cfg80211_chan_def *c);
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 555e389b7dfa..5836ddeac9e3 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -1130,7 +1130,7 @@ static void ieee80211_uninit(struct net_device *dev)
static u16 ieee80211_netdev_select_queue(struct net_device *dev,
struct sk_buff *skb,
- void *accel_priv,
+ struct net_device *sb_dev,
select_queue_fallback_t fallback)
{
return ieee80211_select_queue(IEEE80211_DEV_TO_SUB_IF(dev), skb);
@@ -1176,7 +1176,7 @@ static const struct net_device_ops ieee80211_dataif_ops = {
static u16 ieee80211_monitor_select_queue(struct net_device *dev,
struct sk_buff *skb,
- void *accel_priv,
+ struct net_device *sb_dev,
select_queue_fallback_t fallback)
{
struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
@@ -1756,7 +1756,8 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name,
if (local->ops->wake_tx_queue &&
type != NL80211_IFTYPE_AP_VLAN &&
- type != NL80211_IFTYPE_MONITOR)
+ (type != NL80211_IFTYPE_MONITOR ||
+ (params->flags & MONITOR_FLAG_ACTIVE)))
txq_size += sizeof(struct txq_info) +
local->hw.txq_data_size;
diff --git a/net/mac80211/key.c b/net/mac80211/key.c
index ee0d0cc8dc3b..4700718e010f 100644
--- a/net/mac80211/key.c
+++ b/net/mac80211/key.c
@@ -248,6 +248,7 @@ static void ieee80211_key_disable_hw_accel(struct ieee80211_key *key)
(key->conf.flags & IEEE80211_KEY_FLAG_RESERVE_TAILROOM)))
increment_tailroom_need_count(sdata);
+ key->flags &= ~KEY_FLAG_UPLOADED_TO_HARDWARE;
ret = drv_set_key(key->local, DISABLE_KEY, sdata,
sta ? &sta->sta : NULL, &key->conf);
@@ -256,8 +257,65 @@ static void ieee80211_key_disable_hw_accel(struct ieee80211_key *key)
"failed to remove key (%d, %pM) from hardware (%d)\n",
key->conf.keyidx,
sta ? sta->sta.addr : bcast_addr, ret);
+}
- key->flags &= ~KEY_FLAG_UPLOADED_TO_HARDWARE;
+static int ieee80211_hw_key_replace(struct ieee80211_key *old_key,
+ struct ieee80211_key *new_key,
+ bool ptk0rekey)
+{
+ struct ieee80211_sub_if_data *sdata;
+ struct ieee80211_local *local;
+ struct sta_info *sta;
+ int ret;
+
+ /* Aggregation sessions are OK when running on SW crypto.
+ * A broken remote STA may cause issues not observed with HW
+ * crypto, though.
+ */
+ if (!(old_key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE))
+ return 0;
+
+ assert_key_lock(old_key->local);
+ sta = old_key->sta;
+
+ /* PTK only using key ID 0 needs special handling on rekey */
+ if (new_key && sta && ptk0rekey) {
+ local = old_key->local;
+ sdata = old_key->sdata;
+
+ /* Stop TX till we are on the new key */
+ old_key->flags |= KEY_FLAG_TAINTED;
+ ieee80211_clear_fast_xmit(sta);
+
+ /* Aggregation sessions during rekey are complicated due to the
+ * reorder buffer and retransmits. Side step that by blocking
+ * aggregation during rekey and tear down running sessions.
+ */
+ if (ieee80211_hw_check(&local->hw, AMPDU_AGGREGATION)) {
+ set_sta_flag(sta, WLAN_STA_BLOCK_BA);
+ ieee80211_sta_tear_down_BA_sessions(sta,
+ AGG_STOP_LOCAL_REQUEST);
+ }
+
+ if (!wiphy_ext_feature_isset(local->hw.wiphy,
+ NL80211_EXT_FEATURE_CAN_REPLACE_PTK0)) {
+ pr_warn_ratelimited("Rekeying PTK for STA %pM but driver can't safely do that.",
+ sta->sta.addr);
+ /* Flushing the driver queues *may* help prevent
+ * the clear text leaks and freezes.
+ */
+ ieee80211_flush_queues(local, sdata, false);
+ }
+ }
+
+ ieee80211_key_disable_hw_accel(old_key);
+
+ if (new_key)
+ ret = ieee80211_key_enable_hw_accel(new_key);
+ else
+ ret = 0;
+
+ return ret;
}
static void __ieee80211_set_default_key(struct ieee80211_sub_if_data *sdata,
@@ -316,38 +374,57 @@ void ieee80211_set_default_mgmt_key(struct ieee80211_sub_if_data *sdata,
}
-static void ieee80211_key_replace(struct ieee80211_sub_if_data *sdata,
+static int ieee80211_key_replace(struct ieee80211_sub_if_data *sdata,
struct sta_info *sta,
bool pairwise,
struct ieee80211_key *old,
struct ieee80211_key *new)
{
int idx;
+ int ret;
bool defunikey, defmultikey, defmgmtkey;
/* caller must provide at least one old/new */
if (WARN_ON(!new && !old))
- return;
+ return 0;
if (new)
list_add_tail_rcu(&new->list, &sdata->key_list);
WARN_ON(new && old && new->conf.keyidx != old->conf.keyidx);
- if (old)
+ if (old) {
idx = old->conf.keyidx;
- else
+ /* TODO: proper implement and test "Extended Key ID for
+ * Individually Addressed Frames" from IEEE 802.11-2016.
+ * Till then always assume only key ID 0 is used for
+ * pairwise keys.*/
+ ret = ieee80211_hw_key_replace(old, new, pairwise);
+ } else {
+ /* new must be provided in case old is not */
idx = new->conf.keyidx;
+ if (!new->local->wowlan)
+ ret = ieee80211_key_enable_hw_accel(new);
+ else
+ ret = 0;
+ }
+
+ if (ret)
+ return ret;
if (sta) {
if (pairwise) {
rcu_assign_pointer(sta->ptk[idx], new);
sta->ptk_idx = idx;
- ieee80211_check_fast_xmit(sta);
+ if (new) {
+ clear_sta_flag(sta, WLAN_STA_BLOCK_BA);
+ ieee80211_check_fast_xmit(sta);
+ }
} else {
rcu_assign_pointer(sta->gtk[idx], new);
}
- ieee80211_check_fast_rx(sta);
+ if (new)
+ ieee80211_check_fast_rx(sta);
} else {
defunikey = old &&
old == key_mtx_dereference(sdata->local,
@@ -380,6 +457,8 @@ static void ieee80211_key_replace(struct ieee80211_sub_if_data *sdata,
if (old)
list_del_rcu(&old->list);
+
+ return 0;
}
struct ieee80211_key *
@@ -575,9 +654,6 @@ static void ieee80211_key_free_common(struct ieee80211_key *key)
static void __ieee80211_key_destroy(struct ieee80211_key *key,
bool delay_tailroom)
{
- if (key->local)
- ieee80211_key_disable_hw_accel(key);
-
if (key->local) {
struct ieee80211_sub_if_data *sdata = key->sdata;
@@ -654,13 +730,16 @@ int ieee80211_key_link(struct ieee80211_key *key,
struct ieee80211_sub_if_data *sdata,
struct sta_info *sta)
{
- struct ieee80211_local *local = sdata->local;
struct ieee80211_key *old_key;
- int idx, ret;
- bool pairwise;
-
- pairwise = key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE;
- idx = key->conf.keyidx;
+ int idx = key->conf.keyidx;
+ bool pairwise = key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE;
+ /*
+ * We want to delay tailroom updates only for station - in that
+ * case it helps roaming speed, but in other cases it hurts and
+ * can cause warnings to appear.
+ */
+ bool delay_tailroom = sdata->vif.type == NL80211_IFTYPE_STATION;
+ int ret;
mutex_lock(&sdata->local->key_mtx);
@@ -687,17 +766,13 @@ int ieee80211_key_link(struct ieee80211_key *key,
increment_tailroom_need_count(sdata);
- ieee80211_key_replace(sdata, sta, pairwise, old_key, key);
- ieee80211_key_destroy(old_key, true);
-
- ieee80211_debugfs_key_add(key);
+ ret = ieee80211_key_replace(sdata, sta, pairwise, old_key, key);
- if (!local->wowlan) {
- ret = ieee80211_key_enable_hw_accel(key);
- if (ret)
- ieee80211_key_free(key, true);
+ if (!ret) {
+ ieee80211_debugfs_key_add(key);
+ ieee80211_key_destroy(old_key, delay_tailroom);
} else {
- ret = 0;
+ ieee80211_key_free(key, delay_tailroom);
}
out:
@@ -930,7 +1005,8 @@ void ieee80211_free_sta_keys(struct ieee80211_local *local,
ieee80211_key_replace(key->sdata, key->sta,
key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE,
key, NULL);
- __ieee80211_key_destroy(key, true);
+ __ieee80211_key_destroy(key, key->sdata->vif.type ==
+ NL80211_IFTYPE_STATION);
}
for (i = 0; i < NUM_DEFAULT_KEYS; i++) {
@@ -940,7 +1016,8 @@ void ieee80211_free_sta_keys(struct ieee80211_local *local,
ieee80211_key_replace(key->sdata, key->sta,
key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE,
key, NULL);
- __ieee80211_key_destroy(key, true);
+ __ieee80211_key_destroy(key, key->sdata->vif.type ==
+ NL80211_IFTYPE_STATION);
}
mutex_unlock(&local->key_mtx);
diff --git a/net/mac80211/led.c b/net/mac80211/led.c
index ba0b507ea691..d6c66fc19716 100644
--- a/net/mac80211/led.c
+++ b/net/mac80211/led.c
@@ -52,13 +52,15 @@ void ieee80211_free_led_names(struct ieee80211_local *local)
kfree(local->radio_led.name);
}
-static void ieee80211_tx_led_activate(struct led_classdev *led_cdev)
+static int ieee80211_tx_led_activate(struct led_classdev *led_cdev)
{
struct ieee80211_local *local = container_of(led_cdev->trigger,
struct ieee80211_local,
tx_led);
atomic_inc(&local->tx_led_active);
+
+ return 0;
}
static void ieee80211_tx_led_deactivate(struct led_classdev *led_cdev)
@@ -70,13 +72,15 @@ static void ieee80211_tx_led_deactivate(struct led_classdev *led_cdev)
atomic_dec(&local->tx_led_active);
}
-static void ieee80211_rx_led_activate(struct led_classdev *led_cdev)
+static int ieee80211_rx_led_activate(struct led_classdev *led_cdev)
{
struct ieee80211_local *local = container_of(led_cdev->trigger,
struct ieee80211_local,
rx_led);
atomic_inc(&local->rx_led_active);
+
+ return 0;
}
static void ieee80211_rx_led_deactivate(struct led_classdev *led_cdev)
@@ -88,13 +92,15 @@ static void ieee80211_rx_led_deactivate(struct led_classdev *led_cdev)
atomic_dec(&local->rx_led_active);
}
-static void ieee80211_assoc_led_activate(struct led_classdev *led_cdev)
+static int ieee80211_assoc_led_activate(struct led_classdev *led_cdev)
{
struct ieee80211_local *local = container_of(led_cdev->trigger,
struct ieee80211_local,
assoc_led);
atomic_inc(&local->assoc_led_active);
+
+ return 0;
}
static void ieee80211_assoc_led_deactivate(struct led_classdev *led_cdev)
@@ -106,13 +112,15 @@ static void ieee80211_assoc_led_deactivate(struct led_classdev *led_cdev)
atomic_dec(&local->assoc_led_active);
}
-static void ieee80211_radio_led_activate(struct led_classdev *led_cdev)
+static int ieee80211_radio_led_activate(struct led_classdev *led_cdev)
{
struct ieee80211_local *local = container_of(led_cdev->trigger,
struct ieee80211_local,
radio_led);
atomic_inc(&local->radio_led_active);
+
+ return 0;
}
static void ieee80211_radio_led_deactivate(struct led_classdev *led_cdev)
@@ -124,13 +132,15 @@ static void ieee80211_radio_led_deactivate(struct led_classdev *led_cdev)
atomic_dec(&local->radio_led_active);
}
-static void ieee80211_tpt_led_activate(struct led_classdev *led_cdev)
+static int ieee80211_tpt_led_activate(struct led_classdev *led_cdev)
{
struct ieee80211_local *local = container_of(led_cdev->trigger,
struct ieee80211_local,
tpt_led);
atomic_inc(&local->tpt_led_active);
+
+ return 0;
}
static void ieee80211_tpt_led_deactivate(struct led_classdev *led_cdev)
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index fb73451ed85e..83e71e6b2ebe 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -3,6 +3,8 @@
* Copyright 2005-2006, Devicescape Software, Inc.
* Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
* Copyright 2013-2014 Intel Mobile Communications GmbH
+ * Copyright (C) 2017 Intel Deutschland GmbH
+ * Copyright (C) 2018 Intel Corporation
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -255,8 +257,27 @@ static void ieee80211_restart_work(struct work_struct *work)
flush_work(&local->radar_detected_work);
rtnl_lock();
- list_for_each_entry(sdata, &local->interfaces, list)
+ list_for_each_entry(sdata, &local->interfaces, list) {
+ /*
+ * XXX: there may be more work for other vif types and even
+ * for station mode: a good thing would be to run most of
+ * the iface type's dependent _stop (ieee80211_mg_stop,
+ * ieee80211_ibss_stop) etc...
+ * For now, fix only the specific bug that was seen: race
+ * between csa_connection_drop_work and us.
+ */
+ if (sdata->vif.type == NL80211_IFTYPE_STATION) {
+ /*
+ * This worker is scheduled from the iface worker that
+ * runs on mac80211's workqueue, so we can't be
+ * scheduling this worker after the cancel right here.
+ * The exception is ieee80211_chswitch_done.
+ * Then we can have a race...
+ */
+ cancel_work_sync(&sdata->u.mgd.csa_connection_drop_work);
+ }
flush_delayed_work(&sdata->dec_tailroom_needed_wk);
+ }
ieee80211_scan_cancel(local);
/* make sure any new ROC will consider local->in_reconfig */
@@ -470,10 +491,7 @@ static const struct ieee80211_vht_cap mac80211_vht_capa_mod_mask = {
cpu_to_le32(IEEE80211_VHT_CAP_RXLDPC |
IEEE80211_VHT_CAP_SHORT_GI_80 |
IEEE80211_VHT_CAP_SHORT_GI_160 |
- IEEE80211_VHT_CAP_RXSTBC_1 |
- IEEE80211_VHT_CAP_RXSTBC_2 |
- IEEE80211_VHT_CAP_RXSTBC_3 |
- IEEE80211_VHT_CAP_RXSTBC_4 |
+ IEEE80211_VHT_CAP_RXSTBC_MASK |
IEEE80211_VHT_CAP_TXSTBC |
IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE |
IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE |
@@ -557,10 +575,19 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len,
wiphy_ext_feature_set(wiphy,
NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211);
- if (!ops->hw_scan)
+ if (!ops->hw_scan) {
wiphy->features |= NL80211_FEATURE_LOW_PRIORITY_SCAN |
NL80211_FEATURE_AP_SCAN;
-
+ /*
+ * if the driver behaves correctly using the probe request
+ * (template) from mac80211, then both of these should be
+ * supported even with hw scan - but let drivers opt in.
+ */
+ wiphy_ext_feature_set(wiphy,
+ NL80211_EXT_FEATURE_SCAN_RANDOM_SN);
+ wiphy_ext_feature_set(wiphy,
+ NL80211_EXT_FEATURE_SCAN_MIN_PREQ_CONTENT);
+ }
if (!ops->set_key)
wiphy->flags |= WIPHY_FLAG_IBSS_RSN;
@@ -584,12 +611,24 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len,
local->ops = ops;
local->use_chanctx = use_chanctx;
+ /*
+ * We need a bit of data queued to build aggregates properly, so
+ * instruct the TCP stack to allow more than a single ms of data
+ * to be queued in the stack. The value is a bit-shift of 1
+ * second, so 8 is ~4ms of queued data. Only affects local TCP
+ * sockets.
+ * This is the default, anyhow - drivers may need to override it
+ * for local reasons (longer buffers, longer completion time, or
+ * similar).
+ */
+ local->hw.tx_sk_pacing_shift = 8;
+
/* set up some defaults */
local->hw.queues = 1;
local->hw.max_rates = 1;
local->hw.max_report_rates = 0;
- local->hw.max_rx_aggregation_subframes = IEEE80211_MAX_AMPDU_BUF;
- local->hw.max_tx_aggregation_subframes = IEEE80211_MAX_AMPDU_BUF;
+ local->hw.max_rx_aggregation_subframes = IEEE80211_MAX_AMPDU_BUF_HT;
+ local->hw.max_tx_aggregation_subframes = IEEE80211_MAX_AMPDU_BUF_HT;
local->hw.offchannel_tx_hw_queue = IEEE80211_INVAL_HW_QUEUE;
local->hw.conf.long_frame_max_tx_count = wiphy->retry_long;
local->hw.conf.short_frame_max_tx_count = wiphy->retry_short;
@@ -658,6 +697,10 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len,
tasklet_init(&local->tx_pending_tasklet, ieee80211_tx_pending,
(unsigned long)local);
+ if (ops->wake_tx_queue)
+ tasklet_init(&local->wake_txqs_tasklet, ieee80211_wake_txqs,
+ (unsigned long)local);
+
tasklet_init(&local->tasklet,
ieee80211_tasklet_handler,
(unsigned long) local);
@@ -816,7 +859,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
int result, i;
enum nl80211_band band;
int channels, max_bitrates;
- bool supp_ht, supp_vht;
+ bool supp_ht, supp_vht, supp_he;
netdev_features_t feature_whitelist;
struct cfg80211_chan_def dflt_chandef = {};
@@ -896,6 +939,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
max_bitrates = 0;
supp_ht = false;
supp_vht = false;
+ supp_he = false;
for (band = 0; band < NUM_NL80211_BANDS; band++) {
struct ieee80211_supported_band *sband;
@@ -922,6 +966,9 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
supp_ht = supp_ht || sband->ht_cap.ht_supported;
supp_vht = supp_vht || sband->vht_cap.vht_supported;
+ if (!supp_he)
+ supp_he = !!ieee80211_get_he_sta_cap(sband);
+
if (!sband->ht_cap.ht_supported)
continue;
@@ -1011,6 +1058,18 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
local->scan_ies_len +=
2 + sizeof(struct ieee80211_vht_cap);
+ /* HE cap element is variable in size - set len to allow max size */
+ /*
+ * TODO: 1 is added at the end of the calculation to accommodate for
+ * the temporary placing of the HE capabilities IE under EXT.
+ * Remove it once it is placed in the final place.
+ */
+ if (supp_he)
+ local->scan_ies_len +=
+ 2 + sizeof(struct ieee80211_he_cap_elem) +
+ sizeof(struct ieee80211_he_mcs_nss_supp) +
+ IEEE80211_HE_PPE_THRES_MAX_LEN + 1;
+
if (!local->ops->hw_scan) {
/* For hw_scan, driver needs to set these up. */
local->hw.wiphy->max_scan_ssids = 4;
@@ -1112,6 +1171,53 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
goto fail_rate;
}
+ if (local->rate_ctrl) {
+ clear_bit(IEEE80211_HW_SUPPORTS_VHT_EXT_NSS_BW, hw->flags);
+ if (local->rate_ctrl->ops->capa & RATE_CTRL_CAPA_VHT_EXT_NSS_BW)
+ ieee80211_hw_set(hw, SUPPORTS_VHT_EXT_NSS_BW);
+ }
+
+ /*
+ * If the VHT capabilities don't have IEEE80211_VHT_EXT_NSS_BW_CAPABLE,
+ * or have it when we don't, copy the sband structure and set/clear it.
+ * This is necessary because rate scaling algorithms could be switched
+ * and have different support values.
+ * Print a message so that in the common case the reallocation can be
+ * avoided.
+ */
+ BUILD_BUG_ON(NUM_NL80211_BANDS > 8 * sizeof(local->sband_allocated));
+ for (band = 0; band < NUM_NL80211_BANDS; band++) {
+ struct ieee80211_supported_band *sband;
+ bool local_cap, ie_cap;
+
+ local_cap = ieee80211_hw_check(hw, SUPPORTS_VHT_EXT_NSS_BW);
+
+ sband = local->hw.wiphy->bands[band];
+ if (!sband || !sband->vht_cap.vht_supported)
+ continue;
+
+ ie_cap = !!(sband->vht_cap.vht_mcs.tx_highest &
+ cpu_to_le16(IEEE80211_VHT_EXT_NSS_BW_CAPABLE));
+
+ if (local_cap == ie_cap)
+ continue;
+
+ sband = kmemdup(sband, sizeof(*sband), GFP_KERNEL);
+ if (!sband) {
+ result = -ENOMEM;
+ goto fail_rate;
+ }
+
+ wiphy_dbg(hw->wiphy, "copying sband (band %d) due to VHT EXT NSS BW flag\n",
+ band);
+
+ sband->vht_cap.vht_mcs.tx_highest ^=
+ cpu_to_le16(IEEE80211_VHT_EXT_NSS_BW_CAPABLE);
+
+ local->hw.wiphy->bands[band] = sband;
+ local->sband_allocated |= BIT(band);
+ }
+
/* add one default STA interface if supported */
if (local->hw.wiphy->interface_modes & BIT(NL80211_IFTYPE_STATION) &&
!ieee80211_hw_check(hw, NO_AUTO_VIF)) {
@@ -1182,6 +1288,7 @@ void ieee80211_unregister_hw(struct ieee80211_hw *hw)
#if IS_ENABLED(CONFIG_IPV6)
unregister_inet6addr_notifier(&local->ifa6_notifier);
#endif
+ ieee80211_txq_teardown_flows(local);
rtnl_lock();
@@ -1210,7 +1317,6 @@ void ieee80211_unregister_hw(struct ieee80211_hw *hw)
skb_queue_purge(&local->skb_queue);
skb_queue_purge(&local->skb_queue_unreliable);
skb_queue_purge(&local->skb_queue_tdls_chsw);
- ieee80211_txq_teardown_flows(local);
destroy_workqueue(local->workqueue);
wiphy_unregister(local->hw.wiphy);
@@ -1230,6 +1336,7 @@ static int ieee80211_free_ack_frame(int id, void *p, void *data)
void ieee80211_free_hw(struct ieee80211_hw *hw)
{
struct ieee80211_local *local = hw_to_local(hw);
+ enum nl80211_band band;
mutex_destroy(&local->iflist_mtx);
mutex_destroy(&local->mtx);
@@ -1245,6 +1352,12 @@ void ieee80211_free_hw(struct ieee80211_hw *hw)
ieee80211_free_led_names(local);
+ for (band = 0; band < NUM_NL80211_BANDS; band++) {
+ if (!(local->sband_allocated & BIT(band)))
+ continue;
+ kfree(local->hw.wiphy->bands[band]);
+ }
+
wiphy_free(local->hw.wiphy);
}
EXPORT_SYMBOL(ieee80211_free_hw);
@@ -1262,18 +1375,12 @@ static int __init ieee80211_init(void)
if (ret)
return ret;
- ret = rc80211_minstrel_ht_init();
- if (ret)
- goto err_minstrel;
-
ret = ieee80211_iface_init();
if (ret)
goto err_netdev;
return 0;
err_netdev:
- rc80211_minstrel_ht_exit();
- err_minstrel:
rc80211_minstrel_exit();
return ret;
@@ -1281,7 +1388,6 @@ static int __init ieee80211_init(void)
static void __exit ieee80211_exit(void)
{
- rc80211_minstrel_ht_exit();
rc80211_minstrel_exit();
ieee80211s_stop();
diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
index d51da26e9c18..8bad414c52ad 100644
--- a/net/mac80211/mesh.c
+++ b/net/mac80211/mesh.c
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2008, 2009 open80211s Ltd.
+ * Copyright (C) 2018 Intel Corporation
* Authors: Luis Carlos Cobo <luisca@cozybit.com>
* Javier Cardona <javier@cozybit.com>
*
@@ -98,7 +99,9 @@ bool mesh_matches_local(struct ieee80211_sub_if_data *sdata,
cfg80211_chandef_create(&sta_chan_def, sdata->vif.bss_conf.chandef.chan,
NL80211_CHAN_NO_HT);
ieee80211_chandef_ht_oper(ie->ht_operation, &sta_chan_def);
- ieee80211_chandef_vht_oper(ie->vht_operation, &sta_chan_def);
+ ieee80211_chandef_vht_oper(&sdata->local->hw,
+ ie->vht_operation, ie->ht_operation,
+ &sta_chan_def);
if (!cfg80211_chandef_compatible(&sdata->vif.bss_conf.chandef,
&sta_chan_def))
diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h
index ee56f18cad3f..21526630bf65 100644
--- a/net/mac80211/mesh.h
+++ b/net/mac80211/mesh.h
@@ -217,7 +217,8 @@ void mesh_rmc_free(struct ieee80211_sub_if_data *sdata);
int mesh_rmc_init(struct ieee80211_sub_if_data *sdata);
void ieee80211s_init(void);
void ieee80211s_update_metric(struct ieee80211_local *local,
- struct sta_info *sta, struct sk_buff *skb);
+ struct sta_info *sta,
+ struct ieee80211_tx_status *st);
void ieee80211_mesh_init_sdata(struct ieee80211_sub_if_data *sdata);
void ieee80211_mesh_teardown_sdata(struct ieee80211_sub_if_data *sdata);
int ieee80211_start_mesh(struct ieee80211_sub_if_data *sdata);
diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c
index 35ad3983ae4b..6950cd0bf594 100644
--- a/net/mac80211/mesh_hwmp.c
+++ b/net/mac80211/mesh_hwmp.c
@@ -295,15 +295,12 @@ int mesh_path_error_tx(struct ieee80211_sub_if_data *sdata,
}
void ieee80211s_update_metric(struct ieee80211_local *local,
- struct sta_info *sta, struct sk_buff *skb)
+ struct sta_info *sta,
+ struct ieee80211_tx_status *st)
{
- struct ieee80211_tx_info *txinfo = IEEE80211_SKB_CB(skb);
- struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
+ struct ieee80211_tx_info *txinfo = st->info;
int failed;
- if (!ieee80211_is_data(hdr->frame_control))
- return;
-
failed = !(txinfo->flags & IEEE80211_TX_STAT_ACK);
/* moving average, scaled to 100.
@@ -572,6 +569,10 @@ static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata,
forward = false;
reply = true;
target_metric = 0;
+
+ if (SN_GT(target_sn, ifmsh->sn))
+ ifmsh->sn = target_sn;
+
if (time_after(jiffies, ifmsh->last_sn_update +
net_traversal_jiffies(sdata)) ||
time_before(jiffies, ifmsh->last_sn_update)) {
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index a59187c016e0..d2bc8d57c87e 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -149,6 +149,7 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata,
struct ieee80211_channel *channel,
const struct ieee80211_ht_operation *ht_oper,
const struct ieee80211_vht_operation *vht_oper,
+ const struct ieee80211_he_operation *he_oper,
struct cfg80211_chan_def *chandef, bool tracking)
{
struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
@@ -207,7 +208,29 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata,
}
vht_chandef = *chandef;
- if (!ieee80211_chandef_vht_oper(vht_oper, &vht_chandef)) {
+ if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE) && he_oper &&
+ (le32_to_cpu(he_oper->he_oper_params) &
+ IEEE80211_HE_OPERATION_VHT_OPER_INFO)) {
+ struct ieee80211_vht_operation he_oper_vht_cap;
+
+ /*
+ * Set only first 3 bytes (other 2 aren't used in
+ * ieee80211_chandef_vht_oper() anyway)
+ */
+ memcpy(&he_oper_vht_cap, he_oper->optional, 3);
+ he_oper_vht_cap.basic_mcs_set = cpu_to_le16(0);
+
+ if (!ieee80211_chandef_vht_oper(&sdata->local->hw,
+ &he_oper_vht_cap, ht_oper,
+ &vht_chandef)) {
+ if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE))
+ sdata_info(sdata,
+ "HE AP VHT information is invalid, disable HE\n");
+ ret = IEEE80211_STA_DISABLE_HE;
+ goto out;
+ }
+ } else if (!ieee80211_chandef_vht_oper(&sdata->local->hw, vht_oper,
+ ht_oper, &vht_chandef)) {
if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT))
sdata_info(sdata,
"AP VHT information is invalid, disable VHT\n");
@@ -300,12 +323,14 @@ static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata,
const struct ieee80211_ht_cap *ht_cap,
const struct ieee80211_ht_operation *ht_oper,
const struct ieee80211_vht_operation *vht_oper,
+ const struct ieee80211_he_operation *he_oper,
const u8 *bssid, u32 *changed)
{
struct ieee80211_local *local = sdata->local;
struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
- struct ieee80211_supported_band *sband;
- struct ieee80211_channel *chan;
+ struct ieee80211_channel *chan = sdata->vif.bss_conf.chandef.chan;
+ struct ieee80211_supported_band *sband =
+ local->hw.wiphy->bands[chan->band];
struct cfg80211_chan_def chandef;
u16 ht_opmode;
u32 flags;
@@ -320,6 +345,11 @@ static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata,
if (ifmgd->flags & IEEE80211_STA_DISABLE_VHT)
vht_oper = NULL;
+ /* don't check HE if we associated as non-HE station */
+ if (ifmgd->flags & IEEE80211_STA_DISABLE_HE ||
+ !ieee80211_get_he_sta_cap(sband))
+ he_oper = NULL;
+
if (WARN_ON_ONCE(!sta))
return -EINVAL;
@@ -333,12 +363,9 @@ static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata,
sdata->vif.bss_conf.ht_operation_mode = ht_opmode;
}
- chan = sdata->vif.bss_conf.chandef.chan;
- sband = local->hw.wiphy->bands[chan->band];
-
- /* calculate new channel (type) based on HT/VHT operation IEs */
+ /* calculate new channel (type) based on HT/VHT/HE operation IEs */
flags = ieee80211_determine_chantype(sdata, sband, chan,
- ht_oper, vht_oper,
+ ht_oper, vht_oper, he_oper,
&chandef, true);
/*
@@ -582,6 +609,34 @@ static void ieee80211_add_vht_ie(struct ieee80211_sub_if_data *sdata,
ieee80211_ie_build_vht_cap(pos, &vht_cap, cap);
}
+/* This function determines HE capability flags for the association
+ * and builds the IE.
+ */
+static void ieee80211_add_he_ie(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb,
+ struct ieee80211_supported_band *sband)
+{
+ u8 *pos;
+ const struct ieee80211_sta_he_cap *he_cap = NULL;
+ u8 he_cap_size;
+
+ he_cap = ieee80211_get_he_sta_cap(sband);
+ if (!he_cap)
+ return;
+
+ /*
+ * TODO: the 1 added is because this temporarily is under the EXTENSION
+ * IE. Get rid of it when it moves.
+ */
+ he_cap_size =
+ 2 + 1 + sizeof(he_cap->he_cap_elem) +
+ ieee80211_he_mcs_nss_size(&he_cap->he_cap_elem) +
+ ieee80211_he_ppe_size(he_cap->ppe_thres[0],
+ he_cap->he_cap_elem.phy_cap_info);
+ pos = skb_put(skb, he_cap_size);
+ ieee80211_ie_build_he_cap(pos, he_cap, pos + he_cap_size);
+}
+
static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata)
{
struct ieee80211_local *local = sdata->local;
@@ -643,6 +698,9 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata)
2 + 2 * sband->n_channels + /* supported channels */
2 + sizeof(struct ieee80211_ht_cap) + /* HT */
2 + sizeof(struct ieee80211_vht_cap) + /* VHT */
+ 2 + 1 + sizeof(struct ieee80211_he_cap_elem) + /* HE */
+ sizeof(struct ieee80211_he_mcs_nss_supp) +
+ IEEE80211_HE_PPE_THRES_MAX_LEN +
assoc_data->ie_len + /* extra IEs */
(assoc_data->fils_kek_len ? 16 /* AES-SIV */ : 0) +
9, /* WMM */
@@ -827,11 +885,41 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata)
offset = noffset;
}
+ /* if present, add any custom IEs that go before HE */
+ if (assoc_data->ie_len) {
+ static const u8 before_he[] = {
+ /*
+ * no need to list the ones split off before VHT
+ * or generated here
+ */
+ WLAN_EID_OPMODE_NOTIF,
+ WLAN_EID_EXTENSION, WLAN_EID_EXT_FUTURE_CHAN_GUIDANCE,
+ /* 11ai elements */
+ WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_SESSION,
+ WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_PUBLIC_KEY,
+ WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_KEY_CONFIRM,
+ WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_HLP_CONTAINER,
+ WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_IP_ADDR_ASSIGN,
+ /* TODO: add 11ah/11aj/11ak elements */
+ };
+
+ /* RIC already taken above, so no need to handle here anymore */
+ noffset = ieee80211_ie_split(assoc_data->ie, assoc_data->ie_len,
+ before_he, ARRAY_SIZE(before_he),
+ offset);
+ pos = skb_put(skb, noffset - offset);
+ memcpy(pos, assoc_data->ie + offset, noffset - offset);
+ offset = noffset;
+ }
+
if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT))
ieee80211_add_vht_ie(sdata, skb, sband,
&assoc_data->ap_vht_cap);
- /* if present, add any custom non-vendor IEs that go after HT */
+ if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE))
+ ieee80211_add_he_ie(sdata, skb, sband);
+
+ /* if present, add any custom non-vendor IEs that go after HE */
if (assoc_data->ie_len) {
noffset = ieee80211_ie_split_vendor(assoc_data->ie,
assoc_data->ie_len,
@@ -898,6 +986,11 @@ void ieee80211_send_nullfunc(struct ieee80211_local *local,
struct ieee80211_hdr_3addr *nullfunc;
struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+ /* Don't send NDPs when STA is connected HE */
+ if (sdata->vif.type == NL80211_IFTYPE_STATION &&
+ !(ifmgd->flags & IEEE80211_STA_DISABLE_HE))
+ return;
+
skb = ieee80211_nullfunc_get(&local->hw, &sdata->vif,
!ieee80211_hw_check(&local->hw, DOESNT_SUPPORT_QOS_NDP));
if (!skb)
@@ -929,6 +1022,10 @@ static void ieee80211_send_4addr_nullfunc(struct ieee80211_local *local,
if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION))
return;
+ /* Don't send NDPs when connected HE */
+ if (!(sdata->u.mgd.flags & IEEE80211_STA_DISABLE_HE))
+ return;
+
skb = dev_alloc_skb(local->hw.extra_tx_headroom + 30);
if (!skb)
return;
@@ -978,6 +1075,10 @@ static void ieee80211_chswitch_work(struct work_struct *work)
*/
if (sdata->reserved_chanctx) {
+ struct ieee80211_supported_band *sband = NULL;
+ struct sta_info *mgd_sta = NULL;
+ enum ieee80211_sta_rx_bandwidth bw = IEEE80211_STA_RX_BW_20;
+
/*
* with multi-vif csa driver may call ieee80211_csa_finish()
* many times while waiting for other interfaces to use their
@@ -986,6 +1087,48 @@ static void ieee80211_chswitch_work(struct work_struct *work)
if (sdata->reserved_ready)
goto out;
+ if (sdata->vif.bss_conf.chandef.width !=
+ sdata->csa_chandef.width) {
+ /*
+ * For managed interface, we need to also update the AP
+ * station bandwidth and align the rate scale algorithm
+ * on the bandwidth change. Here we only consider the
+ * bandwidth of the new channel definition (as channel
+ * switch flow does not have the full HT/VHT/HE
+ * information), assuming that if additional changes are
+ * required they would be done as part of the processing
+ * of the next beacon from the AP.
+ */
+ switch (sdata->csa_chandef.width) {
+ case NL80211_CHAN_WIDTH_20_NOHT:
+ case NL80211_CHAN_WIDTH_20:
+ default:
+ bw = IEEE80211_STA_RX_BW_20;
+ break;
+ case NL80211_CHAN_WIDTH_40:
+ bw = IEEE80211_STA_RX_BW_40;
+ break;
+ case NL80211_CHAN_WIDTH_80:
+ bw = IEEE80211_STA_RX_BW_80;
+ break;
+ case NL80211_CHAN_WIDTH_80P80:
+ case NL80211_CHAN_WIDTH_160:
+ bw = IEEE80211_STA_RX_BW_160;
+ break;
+ }
+
+ mgd_sta = sta_info_get(sdata, ifmgd->bssid);
+ sband =
+ local->hw.wiphy->bands[sdata->csa_chandef.chan->band];
+ }
+
+ if (sdata->vif.bss_conf.chandef.width >
+ sdata->csa_chandef.width) {
+ mgd_sta->sta.bandwidth = bw;
+ rate_control_rate_update(local, sband, mgd_sta,
+ IEEE80211_RC_BW_CHANGED);
+ }
+
ret = ieee80211_vif_use_reserved_context(sdata);
if (ret) {
sdata_info(sdata,
@@ -996,6 +1139,13 @@ static void ieee80211_chswitch_work(struct work_struct *work)
goto out;
}
+ if (sdata->vif.bss_conf.chandef.width <
+ sdata->csa_chandef.width) {
+ mgd_sta->sta.bandwidth = bw;
+ rate_control_rate_update(local, sband, mgd_sta,
+ IEEE80211_RC_BW_CHANGED);
+ }
+
goto out;
}
@@ -1217,6 +1367,16 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata,
cbss->beacon_interval));
return;
drop_connection:
+ /*
+ * This is just so that the disconnect flow will know that
+ * we were trying to switch channel and failed. In case the
+ * mode is 1 (we are not allowed to Tx), we will know not to
+ * send a deauthentication frame. Those two fields will be
+ * reset when the disconnection worker runs.
+ */
+ sdata->vif.csa_active = true;
+ sdata->csa_block_tx = csa_ie.mode;
+
ieee80211_queue_work(&local->hw, &ifmgd->csa_connection_drop_work);
mutex_unlock(&local->chanctx_mtx);
mutex_unlock(&local->mtx);
@@ -1700,9 +1860,11 @@ static void ieee80211_sta_handle_tspec_ac_params_wk(struct work_struct *work)
}
/* MLME */
-static bool ieee80211_sta_wmm_params(struct ieee80211_local *local,
- struct ieee80211_sub_if_data *sdata,
- const u8 *wmm_param, size_t wmm_param_len)
+static bool
+ieee80211_sta_wmm_params(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ const u8 *wmm_param, size_t wmm_param_len,
+ const struct ieee80211_mu_edca_param_set *mu_edca)
{
struct ieee80211_tx_queue_params params[IEEE80211_NUM_ACS];
struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
@@ -1749,6 +1911,9 @@ static bool ieee80211_sta_wmm_params(struct ieee80211_local *local,
sdata->wmm_acm |= BIT(1) | BIT(2); /* BK/- */
if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_BK)
uapsd = true;
+ params[ac].mu_edca = !!mu_edca;
+ if (mu_edca)
+ params[ac].mu_edca_param_rec = mu_edca->ac_bk;
break;
case 2: /* AC_VI */
ac = IEEE80211_AC_VI;
@@ -1756,6 +1921,9 @@ static bool ieee80211_sta_wmm_params(struct ieee80211_local *local,
sdata->wmm_acm |= BIT(4) | BIT(5); /* CL/VI */
if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_VI)
uapsd = true;
+ params[ac].mu_edca = !!mu_edca;
+ if (mu_edca)
+ params[ac].mu_edca_param_rec = mu_edca->ac_vi;
break;
case 3: /* AC_VO */
ac = IEEE80211_AC_VO;
@@ -1763,6 +1931,9 @@ static bool ieee80211_sta_wmm_params(struct ieee80211_local *local,
sdata->wmm_acm |= BIT(6) | BIT(7); /* VO/NC */
if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_VO)
uapsd = true;
+ params[ac].mu_edca = !!mu_edca;
+ if (mu_edca)
+ params[ac].mu_edca_param_rec = mu_edca->ac_vo;
break;
case 0: /* AC_BE */
default:
@@ -1771,6 +1942,9 @@ static bool ieee80211_sta_wmm_params(struct ieee80211_local *local,
sdata->wmm_acm |= BIT(0) | BIT(3); /* BE/EE */
if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_BE)
uapsd = true;
+ params[ac].mu_edca = !!mu_edca;
+ if (mu_edca)
+ params[ac].mu_edca_param_rec = mu_edca->ac_be;
break;
}
@@ -2219,6 +2393,20 @@ void ieee80211_sta_tx_notify(struct ieee80211_sub_if_data *sdata,
ieee80211_sta_reset_conn_monitor(sdata);
}
+static void ieee80211_mlme_send_probe_req(struct ieee80211_sub_if_data *sdata,
+ const u8 *src, const u8 *dst,
+ const u8 *ssid, size_t ssid_len,
+ struct ieee80211_channel *channel)
+{
+ struct sk_buff *skb;
+
+ skb = ieee80211_build_probe_req(sdata, src, dst, (u32)-1, channel,
+ ssid, ssid_len, NULL, 0,
+ IEEE80211_PROBE_FLAG_DIRECTED);
+ if (skb)
+ ieee80211_tx_skb(sdata, skb);
+}
+
static void ieee80211_mgd_probe_ap_send(struct ieee80211_sub_if_data *sdata)
{
struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
@@ -2265,10 +2453,9 @@ static void ieee80211_mgd_probe_ap_send(struct ieee80211_sub_if_data *sdata)
else
ssid_len = ssid[1];
- ieee80211_send_probe_req(sdata, sdata->vif.addr, dst,
- ssid + 2, ssid_len, NULL,
- 0, (u32) -1, true, 0,
- ifmgd->associated->channel, false);
+ ieee80211_mlme_send_probe_req(sdata, sdata->vif.addr, dst,
+ ssid + 2, ssid_len,
+ ifmgd->associated->channel);
rcu_read_unlock();
}
@@ -2370,7 +2557,7 @@ struct sk_buff *ieee80211_ap_probereq_get(struct ieee80211_hw *hw,
skb = ieee80211_build_probe_req(sdata, sdata->vif.addr, cbss->bssid,
(u32) -1, cbss->channel,
ssid + 2, ssid_len,
- NULL, 0, true);
+ NULL, 0, IEEE80211_PROBE_FLAG_DIRECTED);
rcu_read_unlock();
return skb;
@@ -2400,6 +2587,7 @@ static void __ieee80211_disconnect(struct ieee80211_sub_if_data *sdata)
struct ieee80211_local *local = sdata->local;
struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN];
+ bool tx;
sdata_lock(sdata);
if (!ifmgd->associated) {
@@ -2407,6 +2595,8 @@ static void __ieee80211_disconnect(struct ieee80211_sub_if_data *sdata)
return;
}
+ tx = !sdata->csa_block_tx;
+
/* AP is probably out of range (or not reachable for another reason) so
* remove the bss struct for that AP.
*/
@@ -2414,7 +2604,7 @@ static void __ieee80211_disconnect(struct ieee80211_sub_if_data *sdata)
ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH,
WLAN_REASON_DISASSOC_DUE_TO_INACTIVITY,
- true, frame_buf);
+ tx, frame_buf);
mutex_lock(&local->mtx);
sdata->vif.csa_active = false;
ifmgd->csa_waiting_bcn = false;
@@ -2425,7 +2615,7 @@ static void __ieee80211_disconnect(struct ieee80211_sub_if_data *sdata)
}
mutex_unlock(&local->mtx);
- ieee80211_report_disconnect(sdata, frame_buf, sizeof(frame_buf), true,
+ ieee80211_report_disconnect(sdata, frame_buf, sizeof(frame_buf), tx,
WLAN_REASON_DISASSOC_DUE_TO_INACTIVITY);
sdata_unlock(sdata);
@@ -2571,13 +2761,40 @@ static void ieee80211_auth_challenge(struct ieee80211_sub_if_data *sdata,
auth_data->key_idx, tx_flags);
}
+static bool ieee80211_mark_sta_auth(struct ieee80211_sub_if_data *sdata,
+ const u8 *bssid)
+{
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+ struct sta_info *sta;
+
+ sdata_info(sdata, "authenticated\n");
+ ifmgd->auth_data->done = true;
+ ifmgd->auth_data->timeout = jiffies + IEEE80211_AUTH_WAIT_ASSOC;
+ ifmgd->auth_data->timeout_started = true;
+ run_again(sdata, ifmgd->auth_data->timeout);
+
+ /* move station state to auth */
+ mutex_lock(&sdata->local->sta_mtx);
+ sta = sta_info_get(sdata, bssid);
+ if (!sta) {
+ WARN_ONCE(1, "%s: STA %pM not found", sdata->name, bssid);
+ return false;
+ }
+ if (sta_info_move_state(sta, IEEE80211_STA_AUTH)) {
+ sdata_info(sdata, "failed moving %pM to auth\n", bssid);
+ return false;
+ }
+ mutex_unlock(&sdata->local->sta_mtx);
+
+ return true;
+}
+
static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata,
struct ieee80211_mgmt *mgmt, size_t len)
{
struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
u8 bssid[ETH_ALEN];
u16 auth_alg, auth_transaction, status_code;
- struct sta_info *sta;
struct ieee80211_event event = {
.type = MLME_EVENT,
.u.mlme.data = AUTH_EVENT,
@@ -2601,7 +2818,11 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata,
status_code = le16_to_cpu(mgmt->u.auth.status_code);
if (auth_alg != ifmgd->auth_data->algorithm ||
- auth_transaction != ifmgd->auth_data->expected_transaction) {
+ (auth_alg != WLAN_AUTH_SAE &&
+ auth_transaction != ifmgd->auth_data->expected_transaction) ||
+ (auth_alg == WLAN_AUTH_SAE &&
+ (auth_transaction < ifmgd->auth_data->expected_transaction ||
+ auth_transaction > 2))) {
sdata_info(sdata, "%pM unexpected authentication state: alg %d (expected %d) transact %d (expected %d)\n",
mgmt->sa, auth_alg, ifmgd->auth_data->algorithm,
auth_transaction,
@@ -2644,35 +2865,17 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata,
event.u.mlme.status = MLME_SUCCESS;
drv_event_callback(sdata->local, sdata, &event);
- sdata_info(sdata, "authenticated\n");
- ifmgd->auth_data->done = true;
- ifmgd->auth_data->timeout = jiffies + IEEE80211_AUTH_WAIT_ASSOC;
- ifmgd->auth_data->timeout_started = true;
- run_again(sdata, ifmgd->auth_data->timeout);
-
- if (ifmgd->auth_data->algorithm == WLAN_AUTH_SAE &&
- ifmgd->auth_data->expected_transaction != 2) {
- /*
- * Report auth frame to user space for processing since another
- * round of Authentication frames is still needed.
- */
- cfg80211_rx_mlme_mgmt(sdata->dev, (u8 *)mgmt, len);
- return;
+ if (ifmgd->auth_data->algorithm != WLAN_AUTH_SAE ||
+ (auth_transaction == 2 &&
+ ifmgd->auth_data->expected_transaction == 2)) {
+ if (!ieee80211_mark_sta_auth(sdata, bssid))
+ goto out_err;
+ } else if (ifmgd->auth_data->algorithm == WLAN_AUTH_SAE &&
+ auth_transaction == 2) {
+ sdata_info(sdata, "SAE peer confirmed\n");
+ ifmgd->auth_data->peer_confirmed = true;
}
- /* move station state to auth */
- mutex_lock(&sdata->local->sta_mtx);
- sta = sta_info_get(sdata, bssid);
- if (!sta) {
- WARN_ONCE(1, "%s: STA %pM not found", sdata->name, bssid);
- goto out_err;
- }
- if (sta_info_move_state(sta, IEEE80211_STA_AUTH)) {
- sdata_info(sdata, "failed moving %pM to auth\n", bssid);
- goto out_err;
- }
- mutex_unlock(&sdata->local->sta_mtx);
-
cfg80211_rx_mlme_mgmt(sdata->dev, (u8 *)mgmt, len);
return;
out_err:
@@ -3008,6 +3211,25 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
goto out;
}
+ /*
+ * If AP doesn't support HT, or it doesn't have HE mandatory IEs, mark
+ * HE as disabled. If on the 5GHz band, make sure it supports VHT.
+ */
+ if (ifmgd->flags & IEEE80211_STA_DISABLE_HT ||
+ (sband->band == NL80211_BAND_5GHZ &&
+ ifmgd->flags & IEEE80211_STA_DISABLE_VHT) ||
+ (!elems.he_cap && !elems.he_operation))
+ ifmgd->flags |= IEEE80211_STA_DISABLE_HE;
+
+ if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE) &&
+ (!elems.he_cap || !elems.he_operation)) {
+ mutex_unlock(&sdata->local->sta_mtx);
+ sdata_info(sdata,
+ "HE AP is missing HE capability/operation\n");
+ ret = false;
+ goto out;
+ }
+
/* Set up internal HT/VHT capabilities */
if (elems.ht_cap_elem && !(ifmgd->flags & IEEE80211_STA_DISABLE_HT))
ieee80211_ht_cap_ie_to_sta_ht_cap(sdata, sband,
@@ -3017,6 +3239,45 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband,
elems.vht_cap_elem, sta);
+ if (elems.he_operation && !(ifmgd->flags & IEEE80211_STA_DISABLE_HE) &&
+ elems.he_cap) {
+ ieee80211_he_cap_ie_to_sta_he_cap(sdata, sband,
+ elems.he_cap,
+ elems.he_cap_len,
+ sta);
+
+ bss_conf->he_support = sta->sta.he_cap.has_he;
+ } else {
+ bss_conf->he_support = false;
+ }
+
+ if (bss_conf->he_support) {
+ bss_conf->bss_color =
+ le32_get_bits(elems.he_operation->he_oper_params,
+ IEEE80211_HE_OPERATION_BSS_COLOR_MASK);
+
+ bss_conf->htc_trig_based_pkt_ext =
+ le32_get_bits(elems.he_operation->he_oper_params,
+ IEEE80211_HE_OPERATION_DFLT_PE_DURATION_MASK);
+ bss_conf->frame_time_rts_th =
+ le32_get_bits(elems.he_operation->he_oper_params,
+ IEEE80211_HE_OPERATION_RTS_THRESHOLD_MASK);
+
+ bss_conf->multi_sta_back_32bit =
+ sta->sta.he_cap.he_cap_elem.mac_cap_info[2] &
+ IEEE80211_HE_MAC_CAP2_32BIT_BA_BITMAP;
+
+ bss_conf->ack_enabled =
+ sta->sta.he_cap.he_cap_elem.mac_cap_info[2] &
+ IEEE80211_HE_MAC_CAP2_ACK_EN;
+
+ bss_conf->uora_exists = !!elems.uora_element;
+ if (elems.uora_element)
+ bss_conf->uora_ocw_range = elems.uora_element[0];
+
+ /* TODO: OPEN: what happens if BSS color disable is set? */
+ }
+
/*
* Some APs, e.g. Netgear WNDR3700, report invalid HT operation data
* in their association response, so ignore that data for our own
@@ -3076,7 +3337,8 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
if (ifmgd->flags & IEEE80211_STA_DISABLE_WMM) {
ieee80211_set_wmm_default(sdata, false, false);
} else if (!ieee80211_sta_wmm_params(local, sdata, elems.wmm_param,
- elems.wmm_param_len)) {
+ elems.wmm_param_len,
+ elems.mu_edca_param_set)) {
/* still enable QoS since we might have HT/VHT */
ieee80211_set_wmm_default(sdata, false, true);
/* set the disable-WMM flag in this case to disable
@@ -3590,7 +3852,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
if (!(ifmgd->flags & IEEE80211_STA_DISABLE_WMM) &&
ieee80211_sta_wmm_params(local, sdata, elems.wmm_param,
- elems.wmm_param_len))
+ elems.wmm_param_len,
+ elems.mu_edca_param_set))
changed |= BSS_CHANGED_QOS;
/*
@@ -3629,7 +3892,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
if (ieee80211_config_bw(sdata, sta,
elems.ht_cap_elem, elems.ht_operation,
- elems.vht_operation, bssid, &changed)) {
+ elems.vht_operation, elems.he_operation,
+ bssid, &changed)) {
mutex_unlock(&local->sta_mtx);
sdata_info(sdata,
"failed to follow AP %pM bandwidth change, disconnect\n",
@@ -4266,6 +4530,68 @@ static u8 ieee80211_ht_vht_rx_chains(struct ieee80211_sub_if_data *sdata,
return chains;
}
+static bool
+ieee80211_verify_sta_he_mcs_support(struct ieee80211_supported_band *sband,
+ const struct ieee80211_he_operation *he_op)
+{
+ const struct ieee80211_sta_he_cap *sta_he_cap =
+ ieee80211_get_he_sta_cap(sband);
+ u16 ap_min_req_set;
+ int i;
+
+ if (!sta_he_cap || !he_op)
+ return false;
+
+ ap_min_req_set = le16_to_cpu(he_op->he_mcs_nss_set);
+
+ /* Need to go over for 80MHz, 160MHz and for 80+80 */
+ for (i = 0; i < 3; i++) {
+ const struct ieee80211_he_mcs_nss_supp *sta_mcs_nss_supp =
+ &sta_he_cap->he_mcs_nss_supp;
+ u16 sta_mcs_map_rx =
+ le16_to_cpu(((__le16 *)sta_mcs_nss_supp)[2 * i]);
+ u16 sta_mcs_map_tx =
+ le16_to_cpu(((__le16 *)sta_mcs_nss_supp)[2 * i + 1]);
+ u8 nss;
+ bool verified = true;
+
+ /*
+ * For each band there is a maximum of 8 spatial streams
+ * possible. Each of the sta_mcs_map_* is a 16-bit struct built
+ * of 2 bits per NSS (1-8), with the values defined in enum
+ * ieee80211_he_mcs_support. Need to make sure STA TX and RX
+ * capabilities aren't less than the AP's minimum requirements
+ * for this HE BSS per SS.
+ * It is enough to find one such band that meets the reqs.
+ */
+ for (nss = 8; nss > 0; nss--) {
+ u8 sta_rx_val = (sta_mcs_map_rx >> (2 * (nss - 1))) & 3;
+ u8 sta_tx_val = (sta_mcs_map_tx >> (2 * (nss - 1))) & 3;
+ u8 ap_val = (ap_min_req_set >> (2 * (nss - 1))) & 3;
+
+ if (ap_val == IEEE80211_HE_MCS_NOT_SUPPORTED)
+ continue;
+
+ /*
+ * Make sure the HE AP doesn't require MCSs that aren't
+ * supported by the client
+ */
+ if (sta_rx_val == IEEE80211_HE_MCS_NOT_SUPPORTED ||
+ sta_tx_val == IEEE80211_HE_MCS_NOT_SUPPORTED ||
+ (ap_val > sta_rx_val) || (ap_val > sta_tx_val)) {
+ verified = false;
+ break;
+ }
+ }
+
+ if (verified)
+ return true;
+ }
+
+ /* If here, STA doesn't meet AP's HE min requirements */
+ return false;
+}
+
static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata,
struct cfg80211_bss *cbss)
{
@@ -4274,6 +4600,7 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata,
const struct ieee80211_ht_cap *ht_cap = NULL;
const struct ieee80211_ht_operation *ht_oper = NULL;
const struct ieee80211_vht_operation *vht_oper = NULL;
+ const struct ieee80211_he_operation *he_oper = NULL;
struct ieee80211_supported_band *sband;
struct cfg80211_chan_def chandef;
int ret;
@@ -4329,6 +4656,24 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata,
}
}
+ if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE) &&
+ ieee80211_get_he_sta_cap(sband)) {
+ const struct cfg80211_bss_ies *ies;
+ const u8 *he_oper_ie;
+
+ ies = rcu_dereference(cbss->ies);
+ he_oper_ie = cfg80211_find_ext_ie(WLAN_EID_EXT_HE_OPERATION,
+ ies->data, ies->len);
+ if (he_oper_ie &&
+ he_oper_ie[1] == ieee80211_he_oper_size(&he_oper_ie[3]))
+ he_oper = (void *)(he_oper_ie + 3);
+ else
+ he_oper = NULL;
+
+ if (!ieee80211_verify_sta_he_mcs_support(sband, he_oper))
+ ifmgd->flags |= IEEE80211_STA_DISABLE_HE;
+ }
+
/* Allow VHT if at least one channel on the sband supports 80 MHz */
have_80mhz = false;
for (i = 0; i < sband->n_channels; i++) {
@@ -4345,7 +4690,7 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata,
ifmgd->flags |= ieee80211_determine_chantype(sdata, sband,
cbss->channel,
- ht_oper, vht_oper,
+ ht_oper, vht_oper, he_oper,
&chandef, false);
sdata->needed_rx_chains = min(ieee80211_ht_vht_rx_chains(sdata, cbss),
@@ -4546,6 +4891,7 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata,
struct ieee80211_mgd_auth_data *auth_data;
u16 auth_alg;
int err;
+ bool cont_auth;
/* prepare auth data structure */
@@ -4580,6 +4926,9 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata,
return -EOPNOTSUPP;
}
+ if (ifmgd->assoc_data)
+ return -EBUSY;
+
auth_data = kzalloc(sizeof(*auth_data) + req->auth_data_len +
req->ie_len, GFP_KERNEL);
if (!auth_data)
@@ -4599,6 +4948,13 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata,
auth_data->data_len += req->auth_data_len - 4;
}
+ /* Check if continuing authentication or trying to authenticate with the
+ * same BSS that we were in the process of authenticating with and avoid
+ * removal and re-addition of the STA entry in
+ * ieee80211_prep_connection().
+ */
+ cont_auth = ifmgd->auth_data && req->bss == ifmgd->auth_data->bss;
+
if (req->ie && req->ie_len) {
memcpy(&auth_data->data[auth_data->data_len],
req->ie, req->ie_len);
@@ -4615,18 +4971,26 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata,
/* try to authenticate/probe */
- if ((ifmgd->auth_data && !ifmgd->auth_data->done) ||
- ifmgd->assoc_data) {
- err = -EBUSY;
- goto err_free;
+ if (ifmgd->auth_data) {
+ if (cont_auth && req->auth_type == NL80211_AUTHTYPE_SAE) {
+ auth_data->peer_confirmed =
+ ifmgd->auth_data->peer_confirmed;
+ }
+ ieee80211_destroy_auth_data(sdata, cont_auth);
}
- if (ifmgd->auth_data)
- ieee80211_destroy_auth_data(sdata, false);
-
/* prep auth_data so we don't go into idle on disassoc */
ifmgd->auth_data = auth_data;
+ /* If this is continuation of an ongoing SAE authentication exchange
+ * (i.e., request to send SAE Confirm) and the peer has already
+ * confirmed, mark authentication completed since we are about to send
+ * out SAE Confirm.
+ */
+ if (cont_auth && req->auth_type == NL80211_AUTHTYPE_SAE &&
+ auth_data->peer_confirmed && auth_data->sae_trans == 2)
+ ieee80211_mark_sta_auth(sdata, req->bss->bssid);
+
if (ifmgd->associated) {
u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN];
@@ -4644,7 +5008,7 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata,
sdata_info(sdata, "authenticate with %pM\n", req->bss->bssid);
- err = ieee80211_prep_connection(sdata, req->bss, false, false);
+ err = ieee80211_prep_connection(sdata, req->bss, cont_auth, false);
if (err)
goto err_clear;
@@ -4665,7 +5029,6 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata,
mutex_lock(&sdata->local->mtx);
ieee80211_vif_release_channel(sdata);
mutex_unlock(&sdata->local->mtx);
- err_free:
kfree(auth_data);
return err;
}
@@ -4751,8 +5114,9 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata,
req->crypto.ciphers_pairwise[i] == WLAN_CIPHER_SUITE_WEP104) {
ifmgd->flags |= IEEE80211_STA_DISABLE_HT;
ifmgd->flags |= IEEE80211_STA_DISABLE_VHT;
+ ifmgd->flags |= IEEE80211_STA_DISABLE_HE;
netdev_info(sdata->dev,
- "disabling HT/VHT due to WEP/TKIP use\n");
+ "disabling HE/HT/VHT due to WEP/TKIP use\n");
}
}
diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c
index f1d40b6645ff..8ef4153cd299 100644
--- a/net/mac80211/offchannel.c
+++ b/net/mac80211/offchannel.c
@@ -262,7 +262,7 @@ static void ieee80211_handle_roc_started(struct ieee80211_roc_work *roc,
if (roc->mgmt_tx_cookie) {
if (!WARN_ON(!roc->frame)) {
ieee80211_tx_skb_tid_band(roc->sdata, roc->frame, 7,
- roc->chan->band);
+ roc->chan->band, 0);
roc->frame = NULL;
}
} else {
diff --git a/net/mac80211/rate.h b/net/mac80211/rate.h
index 8212bfeb71d6..d59198191a79 100644
--- a/net/mac80211/rate.h
+++ b/net/mac80211/rate.h
@@ -95,18 +95,5 @@ static inline void rc80211_minstrel_exit(void)
}
#endif
-#ifdef CONFIG_MAC80211_RC_MINSTREL_HT
-int rc80211_minstrel_ht_init(void);
-void rc80211_minstrel_ht_exit(void);
-#else
-static inline int rc80211_minstrel_ht_init(void)
-{
- return 0;
-}
-static inline void rc80211_minstrel_ht_exit(void)
-{
-}
-#endif
-
#endif /* IEEE80211_RATE_H */
diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c
index 76048b53c5b2..a34e9c2ca626 100644
--- a/net/mac80211/rc80211_minstrel.c
+++ b/net/mac80211/rc80211_minstrel.c
@@ -167,12 +167,6 @@ minstrel_calc_rate_stats(struct minstrel_rate_stats *mrs)
if (unlikely(!mrs->att_hist)) {
mrs->prob_ewma = cur_prob;
} else {
- /* update exponential weighted moving variance */
- mrs->prob_ewmv = minstrel_ewmv(mrs->prob_ewmv,
- cur_prob,
- mrs->prob_ewma,
- EWMA_LEVEL);
-
/*update exponential weighted moving avarage */
mrs->prob_ewma = minstrel_ewma(mrs->prob_ewma,
cur_prob,
@@ -572,141 +566,6 @@ minstrel_rate_init(void *priv, struct ieee80211_supported_band *sband,
minstrel_update_rates(mp, mi);
}
-static void *
-minstrel_alloc_sta(void *priv, struct ieee80211_sta *sta, gfp_t gfp)
-{
- struct ieee80211_supported_band *sband;
- struct minstrel_sta_info *mi;
- struct minstrel_priv *mp = priv;
- struct ieee80211_hw *hw = mp->hw;
- int max_rates = 0;
- int i;
-
- mi = kzalloc(sizeof(struct minstrel_sta_info), gfp);
- if (!mi)
- return NULL;
-
- for (i = 0; i < NUM_NL80211_BANDS; i++) {
- sband = hw->wiphy->bands[i];
- if (sband && sband->n_bitrates > max_rates)
- max_rates = sband->n_bitrates;
- }
-
- mi->r = kcalloc(max_rates, sizeof(struct minstrel_rate), gfp);
- if (!mi->r)
- goto error;
-
- mi->sample_table = kmalloc_array(max_rates, SAMPLE_COLUMNS, gfp);
- if (!mi->sample_table)
- goto error1;
-
- mi->last_stats_update = jiffies;
- return mi;
-
-error1:
- kfree(mi->r);
-error:
- kfree(mi);
- return NULL;
-}
-
-static void
-minstrel_free_sta(void *priv, struct ieee80211_sta *sta, void *priv_sta)
-{
- struct minstrel_sta_info *mi = priv_sta;
-
- kfree(mi->sample_table);
- kfree(mi->r);
- kfree(mi);
-}
-
-static void
-minstrel_init_cck_rates(struct minstrel_priv *mp)
-{
- static const int bitrates[4] = { 10, 20, 55, 110 };
- struct ieee80211_supported_band *sband;
- u32 rate_flags = ieee80211_chandef_rate_flags(&mp->hw->conf.chandef);
- int i, j;
-
- sband = mp->hw->wiphy->bands[NL80211_BAND_2GHZ];
- if (!sband)
- return;
-
- for (i = 0, j = 0; i < sband->n_bitrates; i++) {
- struct ieee80211_rate *rate = &sband->bitrates[i];
-
- if (rate->flags & IEEE80211_RATE_ERP_G)
- continue;
-
- if ((rate_flags & sband->bitrates[i].flags) != rate_flags)
- continue;
-
- for (j = 0; j < ARRAY_SIZE(bitrates); j++) {
- if (rate->bitrate != bitrates[j])
- continue;
-
- mp->cck_rates[j] = i;
- break;
- }
- }
-}
-
-static void *
-minstrel_alloc(struct ieee80211_hw *hw, struct dentry *debugfsdir)
-{
- struct minstrel_priv *mp;
-
- mp = kzalloc(sizeof(struct minstrel_priv), GFP_ATOMIC);
- if (!mp)
- return NULL;
-
- /* contention window settings
- * Just an approximation. Using the per-queue values would complicate
- * the calculations and is probably unnecessary */
- mp->cw_min = 15;
- mp->cw_max = 1023;
-
- /* number of packets (in %) to use for sampling other rates
- * sample less often for non-mrr packets, because the overhead
- * is much higher than with mrr */
- mp->lookaround_rate = 5;
- mp->lookaround_rate_mrr = 10;
-
- /* maximum time that the hw is allowed to stay in one MRR segment */
- mp->segment_size = 6000;
-
- if (hw->max_rate_tries > 0)
- mp->max_retry = hw->max_rate_tries;
- else
- /* safe default, does not necessarily have to match hw properties */
- mp->max_retry = 7;
-
- if (hw->max_rates >= 4)
- mp->has_mrr = true;
-
- mp->hw = hw;
- mp->update_interval = 100;
-
-#ifdef CONFIG_MAC80211_DEBUGFS
- mp->fixed_rate_idx = (u32) -1;
- mp->dbg_fixed_rate = debugfs_create_u32("fixed_rate_idx",
- 0666, debugfsdir, &mp->fixed_rate_idx);
-#endif
-
- minstrel_init_cck_rates(mp);
-
- return mp;
-}
-
-static void
-minstrel_free(void *priv)
-{
-#ifdef CONFIG_MAC80211_DEBUGFS
- debugfs_remove(((struct minstrel_priv *)priv)->dbg_fixed_rate);
-#endif
- kfree(priv);
-}
-
static u32 minstrel_get_expected_throughput(void *priv_sta)
{
struct minstrel_sta_info *mi = priv_sta;
@@ -725,30 +584,8 @@ static u32 minstrel_get_expected_throughput(void *priv_sta)
}
const struct rate_control_ops mac80211_minstrel = {
- .name = "minstrel",
.tx_status_ext = minstrel_tx_status,
.get_rate = minstrel_get_rate,
.rate_init = minstrel_rate_init,
- .alloc = minstrel_alloc,
- .free = minstrel_free,
- .alloc_sta = minstrel_alloc_sta,
- .free_sta = minstrel_free_sta,
-#ifdef CONFIG_MAC80211_DEBUGFS
- .add_sta_debugfs = minstrel_add_sta_debugfs,
- .remove_sta_debugfs = minstrel_remove_sta_debugfs,
-#endif
.get_expected_throughput = minstrel_get_expected_throughput,
};
-
-int __init
-rc80211_minstrel_init(void)
-{
- return ieee80211_rate_control_register(&mac80211_minstrel);
-}
-
-void
-rc80211_minstrel_exit(void)
-{
- ieee80211_rate_control_unregister(&mac80211_minstrel);
-}
-
diff --git a/net/mac80211/rc80211_minstrel.h b/net/mac80211/rc80211_minstrel.h
index be6c3f35f48b..23ec953e3a24 100644
--- a/net/mac80211/rc80211_minstrel.h
+++ b/net/mac80211/rc80211_minstrel.h
@@ -35,19 +35,6 @@ minstrel_ewma(int old, int new, int weight)
return old + incr;
}
-/*
- * Perform EWMV (Exponentially Weighted Moving Variance) calculation
- */
-static inline int
-minstrel_ewmv(int old_ewmv, int cur_prob, int prob_ewma, int weight)
-{
- int diff, incr;
-
- diff = cur_prob - prob_ewma;
- incr = (EWMA_DIV - weight) * diff / EWMA_DIV;
- return weight * (old_ewmv + MINSTREL_TRUNC(diff * incr)) / EWMA_DIV;
-}
-
struct minstrel_rate_stats {
/* current / last sampling period attempts/success counters */
u16 attempts, last_attempts;
@@ -56,11 +43,8 @@ struct minstrel_rate_stats {
/* total attempts/success counters */
u32 att_hist, succ_hist;
- /* statistis of packet delivery probability
- * prob_ewma - exponential weighted moving average of prob
- * prob_ewmsd - exp. weighted moving standard deviation of prob */
+ /* prob_ewma - exponential weighted moving average of prob */
u16 prob_ewma;
- u16 prob_ewmv;
/* maximum retry counts */
u8 retry_count;
@@ -109,11 +93,6 @@ struct minstrel_sta_info {
/* sampling table */
u8 *sample_table;
-
-#ifdef CONFIG_MAC80211_DEBUGFS
- struct dentry *dbg_stats;
- struct dentry *dbg_stats_csv;
-#endif
};
struct minstrel_priv {
@@ -137,7 +116,6 @@ struct minstrel_priv {
* - setting will be applied on next update
*/
u32 fixed_rate_idx;
- struct dentry *dbg_fixed_rate;
#endif
};
@@ -146,17 +124,8 @@ struct minstrel_debugfs_info {
char buf[];
};
-/* Get EWMSD (Exponentially Weighted Moving Standard Deviation) * 10 */
-static inline int
-minstrel_get_ewmsd10(struct minstrel_rate_stats *mrs)
-{
- unsigned int ewmv = mrs->prob_ewmv;
- return int_sqrt(MINSTREL_TRUNC(ewmv * 1000 * 1000));
-}
-
extern const struct rate_control_ops mac80211_minstrel;
void minstrel_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir);
-void minstrel_remove_sta_debugfs(void *priv, void *priv_sta);
/* Recalculate success probabilities and counters for a given rate using EWMA */
void minstrel_calc_rate_stats(struct minstrel_rate_stats *mrs);
@@ -165,7 +134,5 @@ int minstrel_get_tp_avg(struct minstrel_rate *mr, int prob_ewma);
/* debugfs */
int minstrel_stats_open(struct inode *inode, struct file *file);
int minstrel_stats_csv_open(struct inode *inode, struct file *file);
-ssize_t minstrel_stats_read(struct file *file, char __user *buf, size_t len, loff_t *ppos);
-int minstrel_stats_release(struct inode *inode, struct file *file);
#endif
diff --git a/net/mac80211/rc80211_minstrel_debugfs.c b/net/mac80211/rc80211_minstrel_debugfs.c
index 9ad7d63d3e5b..c8afd85b51a0 100644
--- a/net/mac80211/rc80211_minstrel_debugfs.c
+++ b/net/mac80211/rc80211_minstrel_debugfs.c
@@ -54,22 +54,6 @@
#include <net/mac80211.h>
#include "rc80211_minstrel.h"
-ssize_t
-minstrel_stats_read(struct file *file, char __user *buf, size_t len, loff_t *ppos)
-{
- struct minstrel_debugfs_info *ms;
-
- ms = file->private_data;
- return simple_read_from_buffer(buf, len, ppos, ms->buf, ms->len);
-}
-
-int
-minstrel_stats_release(struct inode *inode, struct file *file)
-{
- kfree(file->private_data);
- return 0;
-}
-
int
minstrel_stats_open(struct inode *inode, struct file *file)
{
@@ -86,14 +70,13 @@ minstrel_stats_open(struct inode *inode, struct file *file)
p = ms->buf;
p += sprintf(p, "\n");
p += sprintf(p,
- "best __________rate_________ ________statistics________ ____last_____ ______sum-of________\n");
+ "best __________rate_________ ____statistics___ ____last_____ ______sum-of________\n");
p += sprintf(p,
- "rate [name idx airtime max_tp] [avg(tp) avg(prob) sd(prob)] [retry|suc|att] [#success | #attempts]\n");
+ "rate [name idx airtime max_tp] [avg(tp) avg(prob)] [retry|suc|att] [#success | #attempts]\n");
for (i = 0; i < mi->n_rates; i++) {
struct minstrel_rate *mr = &mi->r[i];
struct minstrel_rate_stats *mrs = &mi->r[i].stats;
- unsigned int prob_ewmsd;
*(p++) = (i == mi->max_tp_rate[0]) ? 'A' : ' ';
*(p++) = (i == mi->max_tp_rate[1]) ? 'B' : ' ';
@@ -109,15 +92,13 @@ minstrel_stats_open(struct inode *inode, struct file *file)
tp_max = minstrel_get_tp_avg(mr, MINSTREL_FRAC(100,100));
tp_avg = minstrel_get_tp_avg(mr, mrs->prob_ewma);
eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000);
- prob_ewmsd = minstrel_get_ewmsd10(mrs);
- p += sprintf(p, "%4u.%1u %4u.%1u %3u.%1u %3u.%1u"
+ p += sprintf(p, "%4u.%1u %4u.%1u %3u.%1u"
" %3u %3u %-3u "
"%9llu %-9llu\n",
tp_max / 10, tp_max % 10,
tp_avg / 10, tp_avg % 10,
eprob / 10, eprob % 10,
- prob_ewmsd / 10, prob_ewmsd % 10,
mrs->retry_count,
mrs->last_success,
mrs->last_attempts,
@@ -135,14 +116,6 @@ minstrel_stats_open(struct inode *inode, struct file *file)
return 0;
}
-static const struct file_operations minstrel_stat_fops = {
- .owner = THIS_MODULE,
- .open = minstrel_stats_open,
- .read = minstrel_stats_read,
- .release = minstrel_stats_release,
- .llseek = default_llseek,
-};
-
int
minstrel_stats_csv_open(struct inode *inode, struct file *file)
{
@@ -161,7 +134,6 @@ minstrel_stats_csv_open(struct inode *inode, struct file *file)
for (i = 0; i < mi->n_rates; i++) {
struct minstrel_rate *mr = &mi->r[i];
struct minstrel_rate_stats *mrs = &mi->r[i].stats;
- unsigned int prob_ewmsd;
p += sprintf(p, "%s" ,((i == mi->max_tp_rate[0]) ? "A" : ""));
p += sprintf(p, "%s" ,((i == mi->max_tp_rate[1]) ? "B" : ""));
@@ -177,14 +149,12 @@ minstrel_stats_csv_open(struct inode *inode, struct file *file)
tp_max = minstrel_get_tp_avg(mr, MINSTREL_FRAC(100,100));
tp_avg = minstrel_get_tp_avg(mr, mrs->prob_ewma);
eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000);
- prob_ewmsd = minstrel_get_ewmsd10(mrs);
- p += sprintf(p, "%u.%u,%u.%u,%u.%u,%u.%u,%u,%u,%u,"
+ p += sprintf(p, "%u.%u,%u.%u,%u.%u,%u,%u,%u,"
"%llu,%llu,%d,%d\n",
tp_max / 10, tp_max % 10,
tp_avg / 10, tp_avg % 10,
eprob / 10, eprob % 10,
- prob_ewmsd / 10, prob_ewmsd % 10,
mrs->retry_count,
mrs->last_success,
mrs->last_attempts,
@@ -200,33 +170,3 @@ minstrel_stats_csv_open(struct inode *inode, struct file *file)
return 0;
}
-
-static const struct file_operations minstrel_stat_csv_fops = {
- .owner = THIS_MODULE,
- .open = minstrel_stats_csv_open,
- .read = minstrel_stats_read,
- .release = minstrel_stats_release,
- .llseek = default_llseek,
-};
-
-void
-minstrel_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir)
-{
- struct minstrel_sta_info *mi = priv_sta;
-
- mi->dbg_stats = debugfs_create_file("rc_stats", 0444, dir, mi,
- &minstrel_stat_fops);
-
- mi->dbg_stats_csv = debugfs_create_file("rc_stats_csv", 0444, dir, mi,
- &minstrel_stat_csv_fops);
-}
-
-void
-minstrel_remove_sta_debugfs(void *priv, void *priv_sta)
-{
- struct minstrel_sta_info *mi = priv_sta;
-
- debugfs_remove(mi->dbg_stats);
-
- debugfs_remove(mi->dbg_stats_csv);
-}
diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c
index 67ebdeaffbbc..f466ec37d161 100644
--- a/net/mac80211/rc80211_minstrel_ht.c
+++ b/net/mac80211/rc80211_minstrel_ht.c
@@ -52,22 +52,23 @@
_streams - 1
/* MCS rate information for an MCS group */
-#define MCS_GROUP(_streams, _sgi, _ht40) \
+#define MCS_GROUP(_streams, _sgi, _ht40, _s) \
[GROUP_IDX(_streams, _sgi, _ht40)] = { \
.streams = _streams, \
+ .shift = _s, \
.flags = \
IEEE80211_TX_RC_MCS | \
(_sgi ? IEEE80211_TX_RC_SHORT_GI : 0) | \
(_ht40 ? IEEE80211_TX_RC_40_MHZ_WIDTH : 0), \
.duration = { \
- MCS_DURATION(_streams, _sgi, _ht40 ? 54 : 26), \
- MCS_DURATION(_streams, _sgi, _ht40 ? 108 : 52), \
- MCS_DURATION(_streams, _sgi, _ht40 ? 162 : 78), \
- MCS_DURATION(_streams, _sgi, _ht40 ? 216 : 104), \
- MCS_DURATION(_streams, _sgi, _ht40 ? 324 : 156), \
- MCS_DURATION(_streams, _sgi, _ht40 ? 432 : 208), \
- MCS_DURATION(_streams, _sgi, _ht40 ? 486 : 234), \
- MCS_DURATION(_streams, _sgi, _ht40 ? 540 : 260) \
+ MCS_DURATION(_streams, _sgi, _ht40 ? 54 : 26) >> _s, \
+ MCS_DURATION(_streams, _sgi, _ht40 ? 108 : 52) >> _s, \
+ MCS_DURATION(_streams, _sgi, _ht40 ? 162 : 78) >> _s, \
+ MCS_DURATION(_streams, _sgi, _ht40 ? 216 : 104) >> _s, \
+ MCS_DURATION(_streams, _sgi, _ht40 ? 324 : 156) >> _s, \
+ MCS_DURATION(_streams, _sgi, _ht40 ? 432 : 208) >> _s, \
+ MCS_DURATION(_streams, _sgi, _ht40 ? 486 : 234) >> _s, \
+ MCS_DURATION(_streams, _sgi, _ht40 ? 540 : 260) >> _s \
} \
}
@@ -80,9 +81,10 @@
#define BW2VBPS(_bw, r3, r2, r1) \
(_bw == BW_80 ? r3 : _bw == BW_40 ? r2 : r1)
-#define VHT_GROUP(_streams, _sgi, _bw) \
+#define VHT_GROUP(_streams, _sgi, _bw, _s) \
[VHT_GROUP_IDX(_streams, _sgi, _bw)] = { \
.streams = _streams, \
+ .shift = _s, \
.flags = \
IEEE80211_TX_RC_VHT_MCS | \
(_sgi ? IEEE80211_TX_RC_SHORT_GI : 0) | \
@@ -90,25 +92,25 @@
_bw == BW_40 ? IEEE80211_TX_RC_40_MHZ_WIDTH : 0), \
.duration = { \
MCS_DURATION(_streams, _sgi, \
- BW2VBPS(_bw, 117, 54, 26)), \
+ BW2VBPS(_bw, 117, 54, 26)) >> _s, \
MCS_DURATION(_streams, _sgi, \
- BW2VBPS(_bw, 234, 108, 52)), \
+ BW2VBPS(_bw, 234, 108, 52)) >> _s, \
MCS_DURATION(_streams, _sgi, \
- BW2VBPS(_bw, 351, 162, 78)), \
+ BW2VBPS(_bw, 351, 162, 78)) >> _s, \
MCS_DURATION(_streams, _sgi, \
- BW2VBPS(_bw, 468, 216, 104)), \
+ BW2VBPS(_bw, 468, 216, 104)) >> _s, \
MCS_DURATION(_streams, _sgi, \
- BW2VBPS(_bw, 702, 324, 156)), \
+ BW2VBPS(_bw, 702, 324, 156)) >> _s, \
MCS_DURATION(_streams, _sgi, \
- BW2VBPS(_bw, 936, 432, 208)), \
+ BW2VBPS(_bw, 936, 432, 208)) >> _s, \
MCS_DURATION(_streams, _sgi, \
- BW2VBPS(_bw, 1053, 486, 234)), \
+ BW2VBPS(_bw, 1053, 486, 234)) >> _s, \
MCS_DURATION(_streams, _sgi, \
- BW2VBPS(_bw, 1170, 540, 260)), \
+ BW2VBPS(_bw, 1170, 540, 260)) >> _s, \
MCS_DURATION(_streams, _sgi, \
- BW2VBPS(_bw, 1404, 648, 312)), \
+ BW2VBPS(_bw, 1404, 648, 312)) >> _s, \
MCS_DURATION(_streams, _sgi, \
- BW2VBPS(_bw, 1560, 720, 346)) \
+ BW2VBPS(_bw, 1560, 720, 346)) >> _s \
} \
}
@@ -121,28 +123,27 @@
(CCK_DURATION((_bitrate > 10 ? 20 : 10), false, 60) + \
CCK_DURATION(_bitrate, _short, AVG_PKT_SIZE))
-#define CCK_DURATION_LIST(_short) \
- CCK_ACK_DURATION(10, _short), \
- CCK_ACK_DURATION(20, _short), \
- CCK_ACK_DURATION(55, _short), \
- CCK_ACK_DURATION(110, _short)
+#define CCK_DURATION_LIST(_short, _s) \
+ CCK_ACK_DURATION(10, _short) >> _s, \
+ CCK_ACK_DURATION(20, _short) >> _s, \
+ CCK_ACK_DURATION(55, _short) >> _s, \
+ CCK_ACK_DURATION(110, _short) >> _s
-#define CCK_GROUP \
+#define CCK_GROUP(_s) \
[MINSTREL_CCK_GROUP] = { \
- .streams = 0, \
+ .streams = 1, \
.flags = 0, \
+ .shift = _s, \
.duration = { \
- CCK_DURATION_LIST(false), \
- CCK_DURATION_LIST(true) \
+ CCK_DURATION_LIST(false, _s), \
+ CCK_DURATION_LIST(true, _s) \
} \
}
-#ifdef CONFIG_MAC80211_RC_MINSTREL_VHT
static bool minstrel_vht_only = true;
module_param(minstrel_vht_only, bool, 0644);
MODULE_PARM_DESC(minstrel_vht_only,
"Use only VHT rates when VHT is supported by sta.");
-#endif
/*
* To enable sufficiently targeted rate sampling, MCS rates are divided into
@@ -153,49 +154,47 @@ MODULE_PARM_DESC(minstrel_vht_only,
* BW -> SGI -> #streams
*/
const struct mcs_group minstrel_mcs_groups[] = {
- MCS_GROUP(1, 0, BW_20),
- MCS_GROUP(2, 0, BW_20),
- MCS_GROUP(3, 0, BW_20),
+ MCS_GROUP(1, 0, BW_20, 5),
+ MCS_GROUP(2, 0, BW_20, 4),
+ MCS_GROUP(3, 0, BW_20, 4),
- MCS_GROUP(1, 1, BW_20),
- MCS_GROUP(2, 1, BW_20),
- MCS_GROUP(3, 1, BW_20),
+ MCS_GROUP(1, 1, BW_20, 5),
+ MCS_GROUP(2, 1, BW_20, 4),
+ MCS_GROUP(3, 1, BW_20, 4),
- MCS_GROUP(1, 0, BW_40),
- MCS_GROUP(2, 0, BW_40),
- MCS_GROUP(3, 0, BW_40),
+ MCS_GROUP(1, 0, BW_40, 4),
+ MCS_GROUP(2, 0, BW_40, 4),
+ MCS_GROUP(3, 0, BW_40, 4),
- MCS_GROUP(1, 1, BW_40),
- MCS_GROUP(2, 1, BW_40),
- MCS_GROUP(3, 1, BW_40),
+ MCS_GROUP(1, 1, BW_40, 4),
+ MCS_GROUP(2, 1, BW_40, 4),
+ MCS_GROUP(3, 1, BW_40, 4),
- CCK_GROUP,
+ CCK_GROUP(8),
-#ifdef CONFIG_MAC80211_RC_MINSTREL_VHT
- VHT_GROUP(1, 0, BW_20),
- VHT_GROUP(2, 0, BW_20),
- VHT_GROUP(3, 0, BW_20),
+ VHT_GROUP(1, 0, BW_20, 5),
+ VHT_GROUP(2, 0, BW_20, 4),
+ VHT_GROUP(3, 0, BW_20, 4),
- VHT_GROUP(1, 1, BW_20),
- VHT_GROUP(2, 1, BW_20),
- VHT_GROUP(3, 1, BW_20),
+ VHT_GROUP(1, 1, BW_20, 5),
+ VHT_GROUP(2, 1, BW_20, 4),
+ VHT_GROUP(3, 1, BW_20, 4),
- VHT_GROUP(1, 0, BW_40),
- VHT_GROUP(2, 0, BW_40),
- VHT_GROUP(3, 0, BW_40),
+ VHT_GROUP(1, 0, BW_40, 4),
+ VHT_GROUP(2, 0, BW_40, 4),
+ VHT_GROUP(3, 0, BW_40, 4),
- VHT_GROUP(1, 1, BW_40),
- VHT_GROUP(2, 1, BW_40),
- VHT_GROUP(3, 1, BW_40),
+ VHT_GROUP(1, 1, BW_40, 4),
+ VHT_GROUP(2, 1, BW_40, 4),
+ VHT_GROUP(3, 1, BW_40, 4),
- VHT_GROUP(1, 0, BW_80),
- VHT_GROUP(2, 0, BW_80),
- VHT_GROUP(3, 0, BW_80),
+ VHT_GROUP(1, 0, BW_80, 4),
+ VHT_GROUP(2, 0, BW_80, 4),
+ VHT_GROUP(3, 0, BW_80, 4),
- VHT_GROUP(1, 1, BW_80),
- VHT_GROUP(2, 1, BW_80),
- VHT_GROUP(3, 1, BW_80),
-#endif
+ VHT_GROUP(1, 1, BW_80, 4),
+ VHT_GROUP(2, 1, BW_80, 4),
+ VHT_GROUP(3, 1, BW_80, 4),
};
static u8 sample_table[SAMPLE_COLUMNS][MCS_GROUP_RATES] __read_mostly;
@@ -282,7 +281,8 @@ minstrel_ht_get_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi,
break;
/* short preamble */
- if (!(mi->supported[group] & BIT(idx)))
+ if ((mi->supported[group] & BIT(idx + 4)) &&
+ (rate->flags & IEEE80211_TX_RC_USE_SHORT_PREAMBLE))
idx += 4;
}
return &mi->groups[group].rates[idx];
@@ -311,7 +311,8 @@ minstrel_ht_get_tp_avg(struct minstrel_ht_sta *mi, int group, int rate,
if (group != MINSTREL_CCK_GROUP)
nsecs = 1000 * mi->overhead / MINSTREL_TRUNC(mi->avg_ampdu_len);
- nsecs += minstrel_mcs_groups[group].duration[rate];
+ nsecs += minstrel_mcs_groups[group].duration[rate] <<
+ minstrel_mcs_groups[group].shift;
/*
* For the throughput calculation, limit the probability value to 90% to
@@ -759,12 +760,19 @@ minstrel_ht_tx_status(void *priv, struct ieee80211_supported_band *sband,
minstrel_ht_update_rates(mp, mi);
}
+static inline int
+minstrel_get_duration(int index)
+{
+ const struct mcs_group *group = &minstrel_mcs_groups[index / MCS_GROUP_RATES];
+ unsigned int duration = group->duration[index % MCS_GROUP_RATES];
+ return duration << group->shift;
+}
+
static void
minstrel_calc_retransmit(struct minstrel_priv *mp, struct minstrel_ht_sta *mi,
int index)
{
struct minstrel_rate_stats *mrs;
- const struct mcs_group *group;
unsigned int tx_time, tx_time_rtscts, tx_time_data;
unsigned int cw = mp->cw_min;
unsigned int ctime = 0;
@@ -783,8 +791,7 @@ minstrel_calc_retransmit(struct minstrel_priv *mp, struct minstrel_ht_sta *mi,
mrs->retry_count_rtscts = 2;
mrs->retry_updated = true;
- group = &minstrel_mcs_groups[index / MCS_GROUP_RATES];
- tx_time_data = group->duration[index % MCS_GROUP_RATES] * ampdu_len / 1000;
+ tx_time_data = minstrel_get_duration(index) * ampdu_len / 1000;
/* Contention time for first 2 tries */
ctime = (t_slot * cw) >> 1;
@@ -878,20 +885,24 @@ minstrel_ht_get_max_amsdu_len(struct minstrel_ht_sta *mi)
int group = mi->max_prob_rate / MCS_GROUP_RATES;
const struct mcs_group *g = &minstrel_mcs_groups[group];
int rate = mi->max_prob_rate % MCS_GROUP_RATES;
+ unsigned int duration;
/* Disable A-MSDU if max_prob_rate is bad */
if (mi->groups[group].rates[rate].prob_ewma < MINSTREL_FRAC(50, 100))
return 1;
+ duration = g->duration[rate];
+ duration <<= g->shift;
+
/* If the rate is slower than single-stream MCS1, make A-MSDU limit small */
- if (g->duration[rate] > MCS_DURATION(1, 0, 52))
+ if (duration > MCS_DURATION(1, 0, 52))
return 500;
/*
* If the rate is slower than single-stream MCS4, limit A-MSDU to usual
* data packet size
*/
- if (g->duration[rate] > MCS_DURATION(1, 0, 104))
+ if (duration > MCS_DURATION(1, 0, 104))
return 1600;
/*
@@ -899,7 +910,7 @@ minstrel_ht_get_max_amsdu_len(struct minstrel_ht_sta *mi)
* rate success probability is less than 75%, limit A-MSDU to twice the usual
* data packet size
*/
- if (g->duration[rate] > MCS_DURATION(1, 0, 260) ||
+ if (duration > MCS_DURATION(1, 0, 260) ||
(minstrel_ht_get_prob_ewma(mi, mi->max_tp_rate[0]) <
MINSTREL_FRAC(75, 100)))
return 3200;
@@ -946,13 +957,6 @@ minstrel_ht_update_rates(struct minstrel_priv *mp, struct minstrel_ht_sta *mi)
rate_control_set_rates(mp->hw, mi->sta, rates);
}
-static inline int
-minstrel_get_duration(int index)
-{
- const struct mcs_group *group = &minstrel_mcs_groups[index / MCS_GROUP_RATES];
- return group->duration[index % MCS_GROUP_RATES];
-}
-
static int
minstrel_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi)
{
@@ -1000,10 +1004,13 @@ minstrel_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi)
return -1;
/*
- * Do not sample if the probability is already higher than 95%
- * to avoid wasting airtime.
+ * Do not sample if the probability is already higher than 95%,
+ * or if the rate is 3 times slower than the current max probability
+ * rate, to avoid wasting airtime.
*/
- if (mrs->prob_ewma > MINSTREL_FRAC(95, 100))
+ sample_dur = minstrel_get_duration(sample_idx);
+ if (mrs->prob_ewma > MINSTREL_FRAC(95, 100) ||
+ minstrel_get_duration(mi->max_prob_rate) * 3 < sample_dur)
return -1;
/*
@@ -1013,7 +1020,6 @@ minstrel_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi)
cur_max_tp_streams = minstrel_mcs_groups[tp_rate1 /
MCS_GROUP_RATES].streams;
- sample_dur = minstrel_get_duration(sample_idx);
if (sample_dur >= minstrel_get_duration(tp_rate2) &&
(cur_max_tp_streams - 1 <
minstrel_mcs_groups[sample_group].streams ||
@@ -1077,18 +1083,23 @@ minstrel_ht_get_rate(void *priv, struct ieee80211_sta *sta, void *priv_sta,
return;
sample_group = &minstrel_mcs_groups[sample_idx / MCS_GROUP_RATES];
+ sample_idx %= MCS_GROUP_RATES;
+
+ if (sample_group == &minstrel_mcs_groups[MINSTREL_CCK_GROUP] &&
+ (sample_idx >= 4) != txrc->short_preamble)
+ return;
+
info->flags |= IEEE80211_TX_CTL_RATE_CTRL_PROBE;
rate->count = 1;
- if (sample_idx / MCS_GROUP_RATES == MINSTREL_CCK_GROUP) {
+ if (sample_group == &minstrel_mcs_groups[MINSTREL_CCK_GROUP]) {
int idx = sample_idx % ARRAY_SIZE(mp->cck_rates);
rate->idx = mp->cck_rates[idx];
} else if (sample_group->flags & IEEE80211_TX_RC_VHT_MCS) {
ieee80211_rate_set_vht(rate, sample_idx % MCS_GROUP_RATES,
sample_group->streams);
} else {
- rate->idx = sample_idx % MCS_GROUP_RATES +
- (sample_group->streams - 1) * 8;
+ rate->idx = sample_idx + (sample_group->streams - 1) * 8;
}
rate->flags = sample_group->flags;
@@ -1130,14 +1141,14 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband,
struct minstrel_ht_sta_priv *msp = priv_sta;
struct minstrel_ht_sta *mi = &msp->ht;
struct ieee80211_mcs_info *mcs = &sta->ht_cap.mcs;
- u16 sta_cap = sta->ht_cap.cap;
+ u16 ht_cap = sta->ht_cap.cap;
struct ieee80211_sta_vht_cap *vht_cap = &sta->vht_cap;
- struct sta_info *sinfo = container_of(sta, struct sta_info, sta);
int use_vht;
int n_supported = 0;
int ack_dur;
int stbc;
int i;
+ bool ldpc;
/* fall back to the old minstrel for legacy stations */
if (!sta->ht_cap.ht_supported)
@@ -1145,12 +1156,10 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband,
BUILD_BUG_ON(ARRAY_SIZE(minstrel_mcs_groups) != MINSTREL_GROUPS_NB);
-#ifdef CONFIG_MAC80211_RC_MINSTREL_VHT
if (vht_cap->vht_supported)
use_vht = vht_cap->vht_mcs.tx_mcs_map != cpu_to_le16(~0);
else
-#endif
- use_vht = 0;
+ use_vht = 0;
msp->is_ht = true;
memset(mi, 0, sizeof(*mi));
@@ -1175,16 +1184,22 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband,
}
mi->sample_tries = 4;
- /* TODO tx_flags for vht - ATM the RC API is not fine-grained enough */
if (!use_vht) {
- stbc = (sta_cap & IEEE80211_HT_CAP_RX_STBC) >>
+ stbc = (ht_cap & IEEE80211_HT_CAP_RX_STBC) >>
IEEE80211_HT_CAP_RX_STBC_SHIFT;
- mi->tx_flags |= stbc << IEEE80211_TX_CTL_STBC_SHIFT;
- if (sta_cap & IEEE80211_HT_CAP_LDPC_CODING)
- mi->tx_flags |= IEEE80211_TX_CTL_LDPC;
+ ldpc = ht_cap & IEEE80211_HT_CAP_LDPC_CODING;
+ } else {
+ stbc = (vht_cap->cap & IEEE80211_VHT_CAP_RXSTBC_MASK) >>
+ IEEE80211_VHT_CAP_RXSTBC_SHIFT;
+
+ ldpc = vht_cap->cap & IEEE80211_VHT_CAP_RXLDPC;
}
+ mi->tx_flags |= stbc << IEEE80211_TX_CTL_STBC_SHIFT;
+ if (ldpc)
+ mi->tx_flags |= IEEE80211_TX_CTL_LDPC;
+
for (i = 0; i < ARRAY_SIZE(mi->groups); i++) {
u32 gflags = minstrel_mcs_groups[i].flags;
int bw, nss;
@@ -1197,10 +1212,10 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband,
if (gflags & IEEE80211_TX_RC_SHORT_GI) {
if (gflags & IEEE80211_TX_RC_40_MHZ_WIDTH) {
- if (!(sta_cap & IEEE80211_HT_CAP_SGI_40))
+ if (!(ht_cap & IEEE80211_HT_CAP_SGI_40))
continue;
} else {
- if (!(sta_cap & IEEE80211_HT_CAP_SGI_20))
+ if (!(ht_cap & IEEE80211_HT_CAP_SGI_20))
continue;
}
}
@@ -1217,10 +1232,9 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband,
/* HT rate */
if (gflags & IEEE80211_TX_RC_MCS) {
-#ifdef CONFIG_MAC80211_RC_MINSTREL_VHT
if (use_vht && minstrel_vht_only)
continue;
-#endif
+
mi->supported[i] = mcs->rx_mask[nss - 1];
if (mi->supported[i])
n_supported++;
@@ -1258,8 +1272,7 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband,
if (!n_supported)
goto use_legacy;
- if (test_sta_flag(sinfo, WLAN_STA_SHORT_PREAMBLE))
- mi->cck_supported_short |= mi->cck_supported_short << 4;
+ mi->supported[MINSTREL_CCK_GROUP] |= mi->cck_supported_short << 4;
/* create an initial rate table with the lowest supported rates */
minstrel_ht_update_stats(mp, mi);
@@ -1340,16 +1353,88 @@ minstrel_ht_free_sta(void *priv, struct ieee80211_sta *sta, void *priv_sta)
kfree(msp);
}
+static void
+minstrel_ht_init_cck_rates(struct minstrel_priv *mp)
+{
+ static const int bitrates[4] = { 10, 20, 55, 110 };
+ struct ieee80211_supported_band *sband;
+ u32 rate_flags = ieee80211_chandef_rate_flags(&mp->hw->conf.chandef);
+ int i, j;
+
+ sband = mp->hw->wiphy->bands[NL80211_BAND_2GHZ];
+ if (!sband)
+ return;
+
+ for (i = 0; i < sband->n_bitrates; i++) {
+ struct ieee80211_rate *rate = &sband->bitrates[i];
+
+ if (rate->flags & IEEE80211_RATE_ERP_G)
+ continue;
+
+ if ((rate_flags & sband->bitrates[i].flags) != rate_flags)
+ continue;
+
+ for (j = 0; j < ARRAY_SIZE(bitrates); j++) {
+ if (rate->bitrate != bitrates[j])
+ continue;
+
+ mp->cck_rates[j] = i;
+ break;
+ }
+ }
+}
+
static void *
minstrel_ht_alloc(struct ieee80211_hw *hw, struct dentry *debugfsdir)
{
- return mac80211_minstrel.alloc(hw, debugfsdir);
+ struct minstrel_priv *mp;
+
+ mp = kzalloc(sizeof(struct minstrel_priv), GFP_ATOMIC);
+ if (!mp)
+ return NULL;
+
+ /* contention window settings
+ * Just an approximation. Using the per-queue values would complicate
+ * the calculations and is probably unnecessary */
+ mp->cw_min = 15;
+ mp->cw_max = 1023;
+
+ /* number of packets (in %) to use for sampling other rates
+ * sample less often for non-mrr packets, because the overhead
+ * is much higher than with mrr */
+ mp->lookaround_rate = 5;
+ mp->lookaround_rate_mrr = 10;
+
+ /* maximum time that the hw is allowed to stay in one MRR segment */
+ mp->segment_size = 6000;
+
+ if (hw->max_rate_tries > 0)
+ mp->max_retry = hw->max_rate_tries;
+ else
+ /* safe default, does not necessarily have to match hw properties */
+ mp->max_retry = 7;
+
+ if (hw->max_rates >= 4)
+ mp->has_mrr = true;
+
+ mp->hw = hw;
+ mp->update_interval = 100;
+
+#ifdef CONFIG_MAC80211_DEBUGFS
+ mp->fixed_rate_idx = (u32) -1;
+ debugfs_create_u32("fixed_rate_idx", S_IRUGO | S_IWUGO, debugfsdir,
+ &mp->fixed_rate_idx);
+#endif
+
+ minstrel_ht_init_cck_rates(mp);
+
+ return mp;
}
static void
minstrel_ht_free(void *priv)
{
- mac80211_minstrel.free(priv);
+ kfree(priv);
}
static u32 minstrel_ht_get_expected_throughput(void *priv_sta)
@@ -1384,7 +1469,6 @@ static const struct rate_control_ops mac80211_minstrel_ht = {
.free = minstrel_ht_free,
#ifdef CONFIG_MAC80211_DEBUGFS
.add_sta_debugfs = minstrel_ht_add_sta_debugfs,
- .remove_sta_debugfs = minstrel_ht_remove_sta_debugfs,
#endif
.get_expected_throughput = minstrel_ht_get_expected_throughput,
};
@@ -1409,14 +1493,14 @@ static void __init init_sample_table(void)
}
int __init
-rc80211_minstrel_ht_init(void)
+rc80211_minstrel_init(void)
{
init_sample_table();
return ieee80211_rate_control_register(&mac80211_minstrel_ht);
}
void
-rc80211_minstrel_ht_exit(void)
+rc80211_minstrel_exit(void)
{
ieee80211_rate_control_unregister(&mac80211_minstrel_ht);
}
diff --git a/net/mac80211/rc80211_minstrel_ht.h b/net/mac80211/rc80211_minstrel_ht.h
index de1646c42e82..26b7a3244b47 100644
--- a/net/mac80211/rc80211_minstrel_ht.h
+++ b/net/mac80211/rc80211_minstrel_ht.h
@@ -15,11 +15,7 @@
*/
#define MINSTREL_MAX_STREAMS 3
#define MINSTREL_HT_STREAM_GROUPS 4 /* BW(=2) * SGI(=2) */
-#ifdef CONFIG_MAC80211_RC_MINSTREL_VHT
#define MINSTREL_VHT_STREAM_GROUPS 6 /* BW(=3) * SGI(=2) */
-#else
-#define MINSTREL_VHT_STREAM_GROUPS 0
-#endif
#define MINSTREL_HT_GROUPS_NB (MINSTREL_MAX_STREAMS * \
MINSTREL_HT_STREAM_GROUPS)
@@ -34,16 +30,13 @@
#define MINSTREL_CCK_GROUP (MINSTREL_HT_GROUP_0 + MINSTREL_HT_GROUPS_NB)
#define MINSTREL_VHT_GROUP_0 (MINSTREL_CCK_GROUP + 1)
-#ifdef CONFIG_MAC80211_RC_MINSTREL_VHT
#define MCS_GROUP_RATES 10
-#else
-#define MCS_GROUP_RATES 8
-#endif
struct mcs_group {
- u32 flags;
- unsigned int streams;
- unsigned int duration[MCS_GROUP_RATES];
+ u16 flags;
+ u8 streams;
+ u8 shift;
+ u16 duration[MCS_GROUP_RATES];
};
extern const struct mcs_group minstrel_mcs_groups[];
@@ -110,17 +103,12 @@ struct minstrel_ht_sta_priv {
struct minstrel_ht_sta ht;
struct minstrel_sta_info legacy;
};
-#ifdef CONFIG_MAC80211_DEBUGFS
- struct dentry *dbg_stats;
- struct dentry *dbg_stats_csv;
-#endif
void *ratelist;
void *sample_table;
bool is_ht;
};
void minstrel_ht_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir);
-void minstrel_ht_remove_sta_debugfs(void *priv, void *priv_sta);
int minstrel_ht_get_tp_avg(struct minstrel_ht_sta *mi, int group, int rate,
int prob_ewma);
diff --git a/net/mac80211/rc80211_minstrel_ht_debugfs.c b/net/mac80211/rc80211_minstrel_ht_debugfs.c
index bfcc03152dc6..57820a5f2c16 100644
--- a/net/mac80211/rc80211_minstrel_ht_debugfs.c
+++ b/net/mac80211/rc80211_minstrel_ht_debugfs.c
@@ -15,6 +15,22 @@
#include "rc80211_minstrel.h"
#include "rc80211_minstrel_ht.h"
+static ssize_t
+minstrel_stats_read(struct file *file, char __user *buf, size_t len, loff_t *ppos)
+{
+ struct minstrel_debugfs_info *ms;
+
+ ms = file->private_data;
+ return simple_read_from_buffer(buf, len, ppos, ms->buf, ms->len);
+}
+
+static int
+minstrel_stats_release(struct inode *inode, struct file *file)
+{
+ kfree(file->private_data);
+ return 0;
+}
+
static char *
minstrel_ht_stats_dump(struct minstrel_ht_sta *mi, int i, char *p)
{
@@ -41,7 +57,7 @@ minstrel_ht_stats_dump(struct minstrel_ht_sta *mi, int i, char *p)
struct minstrel_rate_stats *mrs = &mi->groups[i].rates[j];
static const int bitrates[4] = { 10, 20, 55, 110 };
int idx = i * MCS_GROUP_RATES + j;
- unsigned int prob_ewmsd;
+ unsigned int duration;
if (!(mi->supported[i] & BIT(j)))
continue;
@@ -79,21 +95,21 @@ minstrel_ht_stats_dump(struct minstrel_ht_sta *mi, int i, char *p)
p += sprintf(p, " %3u ", idx);
/* tx_time[rate(i)] in usec */
- tx_time = DIV_ROUND_CLOSEST(mg->duration[j], 1000);
+ duration = mg->duration[j];
+ duration <<= mg->shift;
+ tx_time = DIV_ROUND_CLOSEST(duration, 1000);
p += sprintf(p, "%6u ", tx_time);
tp_max = minstrel_ht_get_tp_avg(mi, i, j, MINSTREL_FRAC(100, 100));
tp_avg = minstrel_ht_get_tp_avg(mi, i, j, mrs->prob_ewma);
eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000);
- prob_ewmsd = minstrel_get_ewmsd10(mrs);
- p += sprintf(p, "%4u.%1u %4u.%1u %3u.%1u %3u.%1u"
+ p += sprintf(p, "%4u.%1u %4u.%1u %3u.%1u"
" %3u %3u %-3u "
"%9llu %-9llu\n",
tp_max / 10, tp_max % 10,
tp_avg / 10, tp_avg % 10,
eprob / 10, eprob % 10,
- prob_ewmsd / 10, prob_ewmsd % 10,
mrs->retry_count,
mrs->last_success,
mrs->last_attempts,
@@ -130,9 +146,9 @@ minstrel_ht_stats_open(struct inode *inode, struct file *file)
p += sprintf(p, "\n");
p += sprintf(p,
- " best ____________rate__________ ________statistics________ _____last____ ______sum-of________\n");
+ " best ____________rate__________ ____statistics___ _____last____ ______sum-of________\n");
p += sprintf(p,
- "mode guard # rate [name idx airtime max_tp] [avg(tp) avg(prob) sd(prob)] [retry|suc|att] [#success | #attempts]\n");
+ "mode guard # rate [name idx airtime max_tp] [avg(tp) avg(prob)] [retry|suc|att] [#success | #attempts]\n");
p = minstrel_ht_stats_dump(mi, MINSTREL_CCK_GROUP, p);
for (i = 0; i < MINSTREL_CCK_GROUP; i++)
@@ -187,7 +203,7 @@ minstrel_ht_stats_csv_dump(struct minstrel_ht_sta *mi, int i, char *p)
struct minstrel_rate_stats *mrs = &mi->groups[i].rates[j];
static const int bitrates[4] = { 10, 20, 55, 110 };
int idx = i * MCS_GROUP_RATES + j;
- unsigned int prob_ewmsd;
+ unsigned int duration;
if (!(mi->supported[i] & BIT(j)))
continue;
@@ -222,20 +238,21 @@ minstrel_ht_stats_csv_dump(struct minstrel_ht_sta *mi, int i, char *p)
}
p += sprintf(p, "%u,", idx);
- tx_time = DIV_ROUND_CLOSEST(mg->duration[j], 1000);
+
+ duration = mg->duration[j];
+ duration <<= mg->shift;
+ tx_time = DIV_ROUND_CLOSEST(duration, 1000);
p += sprintf(p, "%u,", tx_time);
tp_max = minstrel_ht_get_tp_avg(mi, i, j, MINSTREL_FRAC(100, 100));
tp_avg = minstrel_ht_get_tp_avg(mi, i, j, mrs->prob_ewma);
eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000);
- prob_ewmsd = minstrel_get_ewmsd10(mrs);
- p += sprintf(p, "%u.%u,%u.%u,%u.%u,%u.%u,%u,%u,"
+ p += sprintf(p, "%u.%u,%u.%u,%u.%u,%u,%u,"
"%u,%llu,%llu,",
tp_max / 10, tp_max % 10,
tp_avg / 10, tp_avg % 10,
eprob / 10, eprob % 10,
- prob_ewmsd / 10, prob_ewmsd % 10,
mrs->retry_count,
mrs->last_success,
mrs->last_attempts,
@@ -303,17 +320,8 @@ minstrel_ht_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir)
{
struct minstrel_ht_sta_priv *msp = priv_sta;
- msp->dbg_stats = debugfs_create_file("rc_stats", 0444, dir, msp,
- &minstrel_ht_stat_fops);
- msp->dbg_stats_csv = debugfs_create_file("rc_stats_csv", 0444, dir, msp,
- &minstrel_ht_stat_csv_fops);
-}
-
-void
-minstrel_ht_remove_sta_debugfs(void *priv, void *priv_sta)
-{
- struct minstrel_ht_sta_priv *msp = priv_sta;
-
- debugfs_remove(msp->dbg_stats);
- debugfs_remove(msp->dbg_stats_csv);
+ debugfs_create_file("rc_stats", 0444, dir, msp,
+ &minstrel_ht_stat_fops);
+ debugfs_create_file("rc_stats_csv", 0444, dir, msp,
+ &minstrel_ht_stat_csv_fops);
}
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 932985ca4e66..3bd3b5769797 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -115,7 +115,8 @@ static inline bool should_drop_frame(struct sk_buff *skb, int present_fcs_len,
if (status->flag & (RX_FLAG_FAILED_FCS_CRC |
RX_FLAG_FAILED_PLCP_CRC |
- RX_FLAG_ONLY_MONITOR))
+ RX_FLAG_ONLY_MONITOR |
+ RX_FLAG_NO_PSDU))
return true;
if (unlikely(skb->len < 16 + present_fcs_len + rtap_space))
@@ -175,6 +176,29 @@ ieee80211_rx_radiotap_hdrlen(struct ieee80211_local *local,
len += 12;
}
+ if (status->encoding == RX_ENC_HE &&
+ status->flag & RX_FLAG_RADIOTAP_HE) {
+ len = ALIGN(len, 2);
+ len += 12;
+ BUILD_BUG_ON(sizeof(struct ieee80211_radiotap_he) != 12);
+ }
+
+ if (status->encoding == RX_ENC_HE &&
+ status->flag & RX_FLAG_RADIOTAP_HE_MU) {
+ len = ALIGN(len, 2);
+ len += 12;
+ BUILD_BUG_ON(sizeof(struct ieee80211_radiotap_he_mu) != 12);
+ }
+
+ if (status->flag & RX_FLAG_NO_PSDU)
+ len += 1;
+
+ if (status->flag & RX_FLAG_RADIOTAP_LSIG) {
+ len = ALIGN(len, 2);
+ len += 4;
+ BUILD_BUG_ON(sizeof(struct ieee80211_radiotap_lsig) != 4);
+ }
+
if (status->chains) {
/* antenna and antenna signal fields */
len += 2 * hweight8(status->chains);
@@ -263,6 +287,25 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local,
int mpdulen, chain;
unsigned long chains = status->chains;
struct ieee80211_vendor_radiotap rtap = {};
+ struct ieee80211_radiotap_he he = {};
+ struct ieee80211_radiotap_he_mu he_mu = {};
+ struct ieee80211_radiotap_lsig lsig = {};
+
+ if (status->flag & RX_FLAG_RADIOTAP_HE) {
+ he = *(struct ieee80211_radiotap_he *)skb->data;
+ skb_pull(skb, sizeof(he));
+ WARN_ON_ONCE(status->encoding != RX_ENC_HE);
+ }
+
+ if (status->flag & RX_FLAG_RADIOTAP_HE_MU) {
+ he_mu = *(struct ieee80211_radiotap_he_mu *)skb->data;
+ skb_pull(skb, sizeof(he_mu));
+ }
+
+ if (status->flag & RX_FLAG_RADIOTAP_LSIG) {
+ lsig = *(struct ieee80211_radiotap_lsig *)skb->data;
+ skb_pull(skb, sizeof(lsig));
+ }
if (status->flag & RX_FLAG_RADIOTAP_VENDOR_DATA) {
rtap = *(struct ieee80211_vendor_radiotap *)skb->data;
@@ -520,6 +563,104 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local,
*pos++ = flags;
}
+ if (status->encoding == RX_ENC_HE &&
+ status->flag & RX_FLAG_RADIOTAP_HE) {
+#define HE_PREP(f, val) le16_encode_bits(val, IEEE80211_RADIOTAP_HE_##f)
+
+ if (status->enc_flags & RX_ENC_FLAG_STBC_MASK) {
+ he.data6 |= HE_PREP(DATA6_NSTS,
+ FIELD_GET(RX_ENC_FLAG_STBC_MASK,
+ status->enc_flags));
+ he.data3 |= HE_PREP(DATA3_STBC, 1);
+ } else {
+ he.data6 |= HE_PREP(DATA6_NSTS, status->nss);
+ }
+
+#define CHECK_GI(s) \
+ BUILD_BUG_ON(IEEE80211_RADIOTAP_HE_DATA5_GI_##s != \
+ (int)NL80211_RATE_INFO_HE_GI_##s)
+
+ CHECK_GI(0_8);
+ CHECK_GI(1_6);
+ CHECK_GI(3_2);
+
+ he.data3 |= HE_PREP(DATA3_DATA_MCS, status->rate_idx);
+ he.data3 |= HE_PREP(DATA3_DATA_DCM, status->he_dcm);
+ he.data3 |= HE_PREP(DATA3_CODING,
+ !!(status->enc_flags & RX_ENC_FLAG_LDPC));
+
+ he.data5 |= HE_PREP(DATA5_GI, status->he_gi);
+
+ switch (status->bw) {
+ case RATE_INFO_BW_20:
+ he.data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC,
+ IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_20MHZ);
+ break;
+ case RATE_INFO_BW_40:
+ he.data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC,
+ IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_40MHZ);
+ break;
+ case RATE_INFO_BW_80:
+ he.data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC,
+ IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_80MHZ);
+ break;
+ case RATE_INFO_BW_160:
+ he.data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC,
+ IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_160MHZ);
+ break;
+ case RATE_INFO_BW_HE_RU:
+#define CHECK_RU_ALLOC(s) \
+ BUILD_BUG_ON(IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_##s##T != \
+ NL80211_RATE_INFO_HE_RU_ALLOC_##s + 4)
+
+ CHECK_RU_ALLOC(26);
+ CHECK_RU_ALLOC(52);
+ CHECK_RU_ALLOC(106);
+ CHECK_RU_ALLOC(242);
+ CHECK_RU_ALLOC(484);
+ CHECK_RU_ALLOC(996);
+ CHECK_RU_ALLOC(2x996);
+
+ he.data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC,
+ status->he_ru + 4);
+ break;
+ default:
+ WARN_ONCE(1, "Invalid SU BW %d\n", status->bw);
+ }
+
+ /* ensure 2 byte alignment */
+ while ((pos - (u8 *)rthdr) & 1)
+ pos++;
+ rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_HE);
+ memcpy(pos, &he, sizeof(he));
+ pos += sizeof(he);
+ }
+
+ if (status->encoding == RX_ENC_HE &&
+ status->flag & RX_FLAG_RADIOTAP_HE_MU) {
+ /* ensure 2 byte alignment */
+ while ((pos - (u8 *)rthdr) & 1)
+ pos++;
+ rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_HE_MU);
+ memcpy(pos, &he_mu, sizeof(he_mu));
+ pos += sizeof(he_mu);
+ }
+
+ if (status->flag & RX_FLAG_NO_PSDU) {
+ rthdr->it_present |=
+ cpu_to_le32(1 << IEEE80211_RADIOTAP_ZERO_LEN_PSDU);
+ *pos++ = status->zero_length_psdu_type;
+ }
+
+ if (status->flag & RX_FLAG_RADIOTAP_LSIG) {
+ /* ensure 2 byte alignment */
+ while ((pos - (u8 *)rthdr) & 1)
+ pos++;
+ rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_LSIG);
+ memcpy(pos, &lsig, sizeof(lsig));
+ pos += sizeof(lsig);
+ }
+
for_each_set_bit(chain, &chains, IEEE80211_MAX_CHAINS) {
*pos++ = status->chain_signal[chain];
*pos++ = chain;
@@ -613,6 +754,12 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
rcu_dereference(local->monitor_sdata);
bool only_monitor = false;
+ if (status->flag & RX_FLAG_RADIOTAP_HE)
+ rtap_space += sizeof(struct ieee80211_radiotap_he);
+
+ if (status->flag & RX_FLAG_RADIOTAP_HE_MU)
+ rtap_space += sizeof(struct ieee80211_radiotap_he_mu);
+
if (unlikely(status->flag & RX_FLAG_RADIOTAP_VENDOR_DATA)) {
struct ieee80211_vendor_radiotap *rtap = (void *)origskb->data;
@@ -1389,7 +1536,7 @@ static void sta_ps_start(struct sta_info *sta)
if (!sta->sta.txq[0])
return;
- for (tid = 0; tid < ARRAY_SIZE(sta->sta.txq); tid++) {
+ for (tid = 0; tid < IEEE80211_NUM_TIDS; tid++) {
if (txq_has_queue(sta->sta.txq[tid]))
set_bit(tid, &sta->txq_buffered_tids);
else
@@ -1612,6 +1759,7 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx)
*/
if (!ieee80211_hw_check(&sta->local->hw, AP_LINK_PS) &&
!ieee80211_has_morefrags(hdr->frame_control) &&
+ !is_multicast_ether_addr(hdr->addr1) &&
(ieee80211_is_mgmt(hdr->frame_control) ||
ieee80211_is_data(hdr->frame_control)) &&
!(status->rx_flags & IEEE80211_RX_DEFERRED_RELEASE) &&
@@ -1929,6 +2077,7 @@ ieee80211_reassemble_find(struct ieee80211_sub_if_data *sdata,
idx = sdata->fragment_next;
for (i = 0; i < IEEE80211_FRAGMENT_MAX; i++) {
struct ieee80211_hdr *f_hdr;
+ struct sk_buff *f_skb;
idx--;
if (idx < 0)
@@ -1940,7 +2089,8 @@ ieee80211_reassemble_find(struct ieee80211_sub_if_data *sdata,
entry->last_frag + 1 != frag)
continue;
- f_hdr = (struct ieee80211_hdr *)entry->skb_list.next->data;
+ f_skb = __skb_peek(&entry->skb_list);
+ f_hdr = (struct ieee80211_hdr *) f_skb->data;
/*
* Check ftype and addresses are equal, else check next fragment
@@ -2197,7 +2347,7 @@ __ieee80211_data_to_8023(struct ieee80211_rx_data *rx, bool *port_control)
if (!sdata->u.mgd.use_4addr)
return -1;
- else
+ else if (!ether_addr_equal(hdr->addr1, sdata->vif.addr))
check_port_control = true;
}
@@ -2308,8 +2458,9 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx)
if (!xmit_skb)
net_info_ratelimited("%s: failed to clone multicast frame\n",
dev->name);
- } else if (!is_multicast_ether_addr(ehdr->h_dest)) {
- dsta = sta_info_get(sdata, skb->data);
+ } else if (!is_multicast_ether_addr(ehdr->h_dest) &&
+ !ether_addr_equal(ehdr->h_dest, ehdr->h_source)) {
+ dsta = sta_info_get(sdata, ehdr->h_dest);
if (dsta) {
/*
* The destination station is associated to
@@ -3238,7 +3389,7 @@ ieee80211_rx_h_action_return(struct ieee80211_rx_data *rx)
}
__ieee80211_tx_skb_tid_band(rx->sdata, nskb, 7,
- status->band);
+ status->band, 0);
}
dev_kfree_skb(rx->skb);
return RX_QUEUED;
@@ -3383,8 +3534,7 @@ static void ieee80211_rx_handlers_result(struct ieee80211_rx_data *rx,
status = IEEE80211_SKB_RXCB((rx->skb));
sband = rx->local->hw.wiphy->bands[status->band];
- if (!(status->encoding == RX_ENC_HT) &&
- !(status->encoding == RX_ENC_VHT))
+ if (status->encoding == RX_ENC_LEGACY)
rate = &sband->bitrates[status->rate_idx];
ieee80211_rx_cooked_monitor(rx, rate);
@@ -4091,11 +4241,10 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx,
if (fast_rx->internal_forward) {
struct sk_buff *xmit_skb = NULL;
- bool multicast = is_multicast_ether_addr(skb->data);
-
- if (multicast) {
+ if (is_multicast_ether_addr(addrs.da)) {
xmit_skb = skb_copy(skb, GFP_ATOMIC);
- } else if (sta_info_get(rx->sdata, skb->data)) {
+ } else if (!ether_addr_equal(addrs.da, addrs.sa) &&
+ sta_info_get(rx->sdata, addrs.da)) {
xmit_skb = skb;
skb = NULL;
}
@@ -4383,6 +4532,14 @@ void ieee80211_rx_napi(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta,
status->rate_idx, status->nss))
goto drop;
break;
+ case RX_ENC_HE:
+ if (WARN_ONCE(status->rate_idx > 11 ||
+ !status->nss ||
+ status->nss > 8,
+ "Rate marked as an HE rate but data is invalid: MCS: %d, NSS: %d\n",
+ status->rate_idx, status->nss))
+ goto drop;
+ break;
default:
WARN_ON_ONCE(1);
/* fall through */
diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
index 2e917a6d239d..5d2a11777718 100644
--- a/net/mac80211/scan.c
+++ b/net/mac80211/scan.c
@@ -20,6 +20,7 @@
#include <net/sch_generic.h>
#include <linux/slab.h>
#include <linux/export.h>
+#include <linux/random.h>
#include <net/mac80211.h>
#include "ieee80211_i.h"
@@ -293,6 +294,7 @@ static bool ieee80211_prep_hw_scan(struct ieee80211_local *local)
struct cfg80211_chan_def chandef;
u8 bands_used = 0;
int i, ielen, n_chans;
+ u32 flags = 0;
req = rcu_dereference_protected(local->scan_req,
lockdep_is_held(&local->mtx));
@@ -331,12 +333,16 @@ static bool ieee80211_prep_hw_scan(struct ieee80211_local *local)
local->hw_scan_req->req.n_channels = n_chans;
ieee80211_prepare_scan_chandef(&chandef, req->scan_width);
+ if (req->flags & NL80211_SCAN_FLAG_MIN_PREQ_CONTENT)
+ flags |= IEEE80211_PROBE_FLAG_MIN_CONTENT;
+
ielen = ieee80211_build_preq_ies(local,
(u8 *)local->hw_scan_req->req.ie,
local->hw_scan_ies_bufsize,
&local->hw_scan_req->ies,
req->ie, req->ie_len,
- bands_used, req->rates, &chandef);
+ bands_used, req->rates, &chandef,
+ flags);
local->hw_scan_req->req.ie_len = ielen;
local->hw_scan_req->req.no_cck = req->no_cck;
ether_addr_copy(local->hw_scan_req->req.mac_addr, req->mac_addr);
@@ -528,6 +534,35 @@ void ieee80211_run_deferred_scan(struct ieee80211_local *local)
round_jiffies_relative(0));
}
+static void ieee80211_send_scan_probe_req(struct ieee80211_sub_if_data *sdata,
+ const u8 *src, const u8 *dst,
+ const u8 *ssid, size_t ssid_len,
+ const u8 *ie, size_t ie_len,
+ u32 ratemask, u32 flags, u32 tx_flags,
+ struct ieee80211_channel *channel)
+{
+ struct sk_buff *skb;
+ u32 txdata_flags = 0;
+
+ skb = ieee80211_build_probe_req(sdata, src, dst, ratemask, channel,
+ ssid, ssid_len,
+ ie, ie_len, flags);
+
+ if (skb) {
+ if (flags & IEEE80211_PROBE_FLAG_RANDOM_SN) {
+ struct ieee80211_hdr *hdr = (void *)skb->data;
+ u16 sn = get_random_u32();
+
+ txdata_flags |= IEEE80211_TX_NO_SEQNO;
+ hdr->seq_ctrl =
+ cpu_to_le16(IEEE80211_SN_TO_SEQ(sn));
+ }
+ IEEE80211_SKB_CB(skb)->flags |= tx_flags;
+ ieee80211_tx_skb_tid_band(sdata, skb, 7, channel->band,
+ txdata_flags);
+ }
+}
+
static void ieee80211_scan_state_send_probe(struct ieee80211_local *local,
unsigned long *next_delay)
{
@@ -535,7 +570,7 @@ static void ieee80211_scan_state_send_probe(struct ieee80211_local *local,
struct ieee80211_sub_if_data *sdata;
struct cfg80211_scan_request *scan_req;
enum nl80211_band band = local->hw.conf.chandef.chan->band;
- u32 tx_flags;
+ u32 flags = 0, tx_flags;
scan_req = rcu_dereference_protected(local->scan_req,
lockdep_is_held(&local->mtx));
@@ -543,17 +578,21 @@ static void ieee80211_scan_state_send_probe(struct ieee80211_local *local,
tx_flags = IEEE80211_TX_INTFL_OFFCHAN_TX_OK;
if (scan_req->no_cck)
tx_flags |= IEEE80211_TX_CTL_NO_CCK_RATE;
+ if (scan_req->flags & NL80211_SCAN_FLAG_MIN_PREQ_CONTENT)
+ flags |= IEEE80211_PROBE_FLAG_MIN_CONTENT;
+ if (scan_req->flags & NL80211_SCAN_FLAG_RANDOM_SN)
+ flags |= IEEE80211_PROBE_FLAG_RANDOM_SN;
sdata = rcu_dereference_protected(local->scan_sdata,
lockdep_is_held(&local->mtx));
for (i = 0; i < scan_req->n_ssids; i++)
- ieee80211_send_probe_req(
+ ieee80211_send_scan_probe_req(
sdata, local->scan_addr, scan_req->bssid,
scan_req->ssids[i].ssid, scan_req->ssids[i].ssid_len,
scan_req->ie, scan_req->ie_len,
- scan_req->rates[band], false,
- tx_flags, local->hw.conf.chandef.chan, true);
+ scan_req->rates[band], flags,
+ tx_flags, local->hw.conf.chandef.chan);
/*
* After sending probe requests, wait for probe responses
@@ -1141,6 +1180,7 @@ int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata,
u32 rate_masks[NUM_NL80211_BANDS] = {};
u8 bands_used = 0;
u8 *ie;
+ u32 flags = 0;
iebufsz = local->scan_ies_len + req->ie_len;
@@ -1157,6 +1197,9 @@ int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata,
}
}
+ if (req->flags & NL80211_SCAN_FLAG_MIN_PREQ_CONTENT)
+ flags |= IEEE80211_PROBE_FLAG_MIN_CONTENT;
+
ie = kcalloc(iebufsz, num_bands, GFP_KERNEL);
if (!ie) {
ret = -ENOMEM;
@@ -1167,7 +1210,8 @@ int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata,
ieee80211_build_preq_ies(local, ie, num_bands * iebufsz,
&sched_scan_ies, req->ie,
- req->ie_len, bands_used, rate_masks, &chandef);
+ req->ie_len, bands_used, rate_masks, &chandef,
+ flags);
ret = drv_sched_scan_start(local, sdata, req, &sched_scan_ies);
if (ret == 0) {
diff --git a/net/mac80211/spectmgmt.c b/net/mac80211/spectmgmt.c
index 029334835747..4e4902bdbef8 100644
--- a/net/mac80211/spectmgmt.c
+++ b/net/mac80211/spectmgmt.c
@@ -144,6 +144,7 @@ int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata,
wide_bw_chansw_ie->new_center_freq_seg1,
/* .basic_mcs_set doesn't matter */
};
+ struct ieee80211_ht_operation ht_oper = {};
/* default, for the case of IEEE80211_VHT_CHANWIDTH_USE_HT,
* to the previously parsed chandef
@@ -151,7 +152,9 @@ int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata,
new_vht_chandef = csa_ie->chandef;
/* ignore if parsing fails */
- if (!ieee80211_chandef_vht_oper(&vht_oper, &new_vht_chandef))
+ if (!ieee80211_chandef_vht_oper(&sdata->local->hw,
+ &vht_oper, &ht_oper,
+ &new_vht_chandef))
new_vht_chandef.chan = NULL;
if (sta_flags & IEEE80211_STA_DISABLE_80P80MHZ &&
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 6428f1ac37b6..fb8c2252ac0e 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -113,7 +113,12 @@ static void __cleanup_single_sta(struct sta_info *sta)
if (sta->sta.txq[0]) {
for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) {
- struct txq_info *txqi = to_txq_info(sta->sta.txq[i]);
+ struct txq_info *txqi;
+
+ if (!sta->sta.txq[i])
+ continue;
+
+ txqi = to_txq_info(sta->sta.txq[i]);
spin_lock_bh(&fq->lock);
ieee80211_txq_purge(local, txqi);
@@ -374,6 +379,7 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata,
for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) {
struct txq_info *txq = txq_data + i * size;
+ /* might not do anything for the bufferable MMPDU TXQ */
ieee80211_txq_init(sdata, sta, txq, i);
}
}
@@ -1239,13 +1245,11 @@ void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta)
if (!ieee80211_hw_check(&local->hw, AP_LINK_PS))
drv_sta_notify(local, sdata, STA_NOTIFY_AWAKE, &sta->sta);
- if (sta->sta.txq[0]) {
- for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) {
- if (!txq_has_queue(sta->sta.txq[i]))
- continue;
+ for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) {
+ if (!sta->sta.txq[i] || !txq_has_queue(sta->sta.txq[i]))
+ continue;
- drv_wake_tx_queue(local, to_txq_info(sta->sta.txq[i]));
- }
+ drv_wake_tx_queue(local, to_txq_info(sta->sta.txq[i]));
}
skb_queue_head_init(&pending);
@@ -1323,6 +1327,11 @@ static void ieee80211_send_null_response(struct sta_info *sta, int tid,
struct ieee80211_tx_info *info;
struct ieee80211_chanctx_conf *chanctx_conf;
+ /* Don't send NDPs when STA is connected HE */
+ if (sdata->vif.type == NL80211_IFTYPE_STATION &&
+ !(sdata->u.mgd.flags & IEEE80211_STA_DISABLE_HE))
+ return;
+
if (qos) {
fc = cpu_to_le16(IEEE80211_FTYPE_DATA |
IEEE80211_STYPE_QOS_NULLFUNC |
@@ -1391,7 +1400,7 @@ static void ieee80211_send_null_response(struct sta_info *sta, int tid,
}
info->band = chanctx_conf->def.chan->band;
- ieee80211_xmit(sdata, sta, skb);
+ ieee80211_xmit(sdata, sta, skb, 0);
rcu_read_unlock();
}
@@ -1678,7 +1687,8 @@ ieee80211_sta_ps_deliver_response(struct sta_info *sta,
return;
for (tid = 0; tid < ARRAY_SIZE(sta->sta.txq); tid++) {
- if (!(driver_release_tids & BIT(tid)) ||
+ if (!sta->sta.txq[tid] ||
+ !(driver_release_tids & BIT(tid)) ||
txq_has_queue(sta->sta.txq[tid]))
continue;
@@ -1968,7 +1978,7 @@ sta_get_last_rx_stats(struct sta_info *sta)
return stats;
}
-static void sta_stats_decode_rate(struct ieee80211_local *local, u16 rate,
+static void sta_stats_decode_rate(struct ieee80211_local *local, u32 rate,
struct rate_info *rinfo)
{
rinfo->bw = STA_STATS_GET(BW, rate);
@@ -2005,6 +2015,14 @@ static void sta_stats_decode_rate(struct ieee80211_local *local, u16 rate,
rinfo->legacy = DIV_ROUND_UP(brate, 1 << shift);
break;
}
+ case STA_STATS_RATE_TYPE_HE:
+ rinfo->flags = RATE_INFO_FLAGS_HE_MCS;
+ rinfo->mcs = STA_STATS_GET(HE_MCS, rate);
+ rinfo->nss = STA_STATS_GET(HE_NSS, rate);
+ rinfo->he_gi = STA_STATS_GET(HE_GI, rate);
+ rinfo->he_ru_alloc = STA_STATS_GET(HE_RU, rate);
+ rinfo->he_dcm = STA_STATS_GET(HE_DCM, rate);
+ break;
}
}
@@ -2101,38 +2119,38 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo,
drv_sta_statistics(local, sdata, &sta->sta, sinfo);
- sinfo->filled |= BIT(NL80211_STA_INFO_INACTIVE_TIME) |
- BIT(NL80211_STA_INFO_STA_FLAGS) |
- BIT(NL80211_STA_INFO_BSS_PARAM) |
- BIT(NL80211_STA_INFO_CONNECTED_TIME) |
- BIT(NL80211_STA_INFO_RX_DROP_MISC);
+ sinfo->filled |= BIT_ULL(NL80211_STA_INFO_INACTIVE_TIME) |
+ BIT_ULL(NL80211_STA_INFO_STA_FLAGS) |
+ BIT_ULL(NL80211_STA_INFO_BSS_PARAM) |
+ BIT_ULL(NL80211_STA_INFO_CONNECTED_TIME) |
+ BIT_ULL(NL80211_STA_INFO_RX_DROP_MISC);
if (sdata->vif.type == NL80211_IFTYPE_STATION) {
sinfo->beacon_loss_count = sdata->u.mgd.beacon_loss_count;
- sinfo->filled |= BIT(NL80211_STA_INFO_BEACON_LOSS);
+ sinfo->filled |= BIT_ULL(NL80211_STA_INFO_BEACON_LOSS);
}
sinfo->connected_time = ktime_get_seconds() - sta->last_connected;
sinfo->inactive_time =
jiffies_to_msecs(jiffies - ieee80211_sta_last_active(sta));
- if (!(sinfo->filled & (BIT(NL80211_STA_INFO_TX_BYTES64) |
- BIT(NL80211_STA_INFO_TX_BYTES)))) {
+ if (!(sinfo->filled & (BIT_ULL(NL80211_STA_INFO_TX_BYTES64) |
+ BIT_ULL(NL80211_STA_INFO_TX_BYTES)))) {
sinfo->tx_bytes = 0;
for (ac = 0; ac < IEEE80211_NUM_ACS; ac++)
sinfo->tx_bytes += sta->tx_stats.bytes[ac];
- sinfo->filled |= BIT(NL80211_STA_INFO_TX_BYTES64);
+ sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_BYTES64);
}
- if (!(sinfo->filled & BIT(NL80211_STA_INFO_TX_PACKETS))) {
+ if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_PACKETS))) {
sinfo->tx_packets = 0;
for (ac = 0; ac < IEEE80211_NUM_ACS; ac++)
sinfo->tx_packets += sta->tx_stats.packets[ac];
- sinfo->filled |= BIT(NL80211_STA_INFO_TX_PACKETS);
+ sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_PACKETS);
}
- if (!(sinfo->filled & (BIT(NL80211_STA_INFO_RX_BYTES64) |
- BIT(NL80211_STA_INFO_RX_BYTES)))) {
+ if (!(sinfo->filled & (BIT_ULL(NL80211_STA_INFO_RX_BYTES64) |
+ BIT_ULL(NL80211_STA_INFO_RX_BYTES)))) {
sinfo->rx_bytes += sta_get_stats_bytes(&sta->rx_stats);
if (sta->pcpu_rx_stats) {
@@ -2144,10 +2162,10 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo,
}
}
- sinfo->filled |= BIT(NL80211_STA_INFO_RX_BYTES64);
+ sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_BYTES64);
}
- if (!(sinfo->filled & BIT(NL80211_STA_INFO_RX_PACKETS))) {
+ if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_PACKETS))) {
sinfo->rx_packets = sta->rx_stats.packets;
if (sta->pcpu_rx_stats) {
for_each_possible_cpu(cpu) {
@@ -2157,17 +2175,17 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo,
sinfo->rx_packets += cpurxs->packets;
}
}
- sinfo->filled |= BIT(NL80211_STA_INFO_RX_PACKETS);
+ sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_PACKETS);
}
- if (!(sinfo->filled & BIT(NL80211_STA_INFO_TX_RETRIES))) {
+ if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_RETRIES))) {
sinfo->tx_retries = sta->status_stats.retry_count;
- sinfo->filled |= BIT(NL80211_STA_INFO_TX_RETRIES);
+ sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_RETRIES);
}
- if (!(sinfo->filled & BIT(NL80211_STA_INFO_TX_FAILED))) {
+ if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_FAILED))) {
sinfo->tx_failed = sta->status_stats.retry_failed;
- sinfo->filled |= BIT(NL80211_STA_INFO_TX_FAILED);
+ sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_FAILED);
}
sinfo->rx_dropped_misc = sta->rx_stats.dropped;
@@ -2182,23 +2200,23 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo,
if (sdata->vif.type == NL80211_IFTYPE_STATION &&
!(sdata->vif.driver_flags & IEEE80211_VIF_BEACON_FILTER)) {
- sinfo->filled |= BIT(NL80211_STA_INFO_BEACON_RX) |
- BIT(NL80211_STA_INFO_BEACON_SIGNAL_AVG);
+ sinfo->filled |= BIT_ULL(NL80211_STA_INFO_BEACON_RX) |
+ BIT_ULL(NL80211_STA_INFO_BEACON_SIGNAL_AVG);
sinfo->rx_beacon_signal_avg = ieee80211_ave_rssi(&sdata->vif);
}
if (ieee80211_hw_check(&sta->local->hw, SIGNAL_DBM) ||
ieee80211_hw_check(&sta->local->hw, SIGNAL_UNSPEC)) {
- if (!(sinfo->filled & BIT(NL80211_STA_INFO_SIGNAL))) {
+ if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_SIGNAL))) {
sinfo->signal = (s8)last_rxstats->last_signal;
- sinfo->filled |= BIT(NL80211_STA_INFO_SIGNAL);
+ sinfo->filled |= BIT_ULL(NL80211_STA_INFO_SIGNAL);
}
if (!sta->pcpu_rx_stats &&
- !(sinfo->filled & BIT(NL80211_STA_INFO_SIGNAL_AVG))) {
+ !(sinfo->filled & BIT_ULL(NL80211_STA_INFO_SIGNAL_AVG))) {
sinfo->signal_avg =
-ewma_signal_read(&sta->rx_stats_avg.signal);
- sinfo->filled |= BIT(NL80211_STA_INFO_SIGNAL_AVG);
+ sinfo->filled |= BIT_ULL(NL80211_STA_INFO_SIGNAL_AVG);
}
}
@@ -2207,11 +2225,11 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo,
* pcpu statistics
*/
if (last_rxstats->chains &&
- !(sinfo->filled & (BIT(NL80211_STA_INFO_CHAIN_SIGNAL) |
- BIT(NL80211_STA_INFO_CHAIN_SIGNAL_AVG)))) {
- sinfo->filled |= BIT(NL80211_STA_INFO_CHAIN_SIGNAL);
+ !(sinfo->filled & (BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL) |
+ BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL_AVG)))) {
+ sinfo->filled |= BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL);
if (!sta->pcpu_rx_stats)
- sinfo->filled |= BIT(NL80211_STA_INFO_CHAIN_SIGNAL_AVG);
+ sinfo->filled |= BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL_AVG);
sinfo->chains = last_rxstats->chains;
@@ -2223,15 +2241,15 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo,
}
}
- if (!(sinfo->filled & BIT(NL80211_STA_INFO_TX_BITRATE))) {
+ if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_BITRATE))) {
sta_set_rate_info_tx(sta, &sta->tx_stats.last_rate,
&sinfo->txrate);
- sinfo->filled |= BIT(NL80211_STA_INFO_TX_BITRATE);
+ sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_BITRATE);
}
- if (!(sinfo->filled & BIT(NL80211_STA_INFO_RX_BITRATE))) {
+ if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_BITRATE))) {
if (sta_set_rate_info_rx(sta, &sinfo->rxrate) == 0)
- sinfo->filled |= BIT(NL80211_STA_INFO_RX_BITRATE);
+ sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_BITRATE);
}
if (tidstats && !cfg80211_sinfo_alloc_tid_stats(sinfo, GFP_KERNEL)) {
@@ -2244,18 +2262,18 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo,
if (ieee80211_vif_is_mesh(&sdata->vif)) {
#ifdef CONFIG_MAC80211_MESH
- sinfo->filled |= BIT(NL80211_STA_INFO_LLID) |
- BIT(NL80211_STA_INFO_PLID) |
- BIT(NL80211_STA_INFO_PLINK_STATE) |
- BIT(NL80211_STA_INFO_LOCAL_PM) |
- BIT(NL80211_STA_INFO_PEER_PM) |
- BIT(NL80211_STA_INFO_NONPEER_PM);
+ sinfo->filled |= BIT_ULL(NL80211_STA_INFO_LLID) |
+ BIT_ULL(NL80211_STA_INFO_PLID) |
+ BIT_ULL(NL80211_STA_INFO_PLINK_STATE) |
+ BIT_ULL(NL80211_STA_INFO_LOCAL_PM) |
+ BIT_ULL(NL80211_STA_INFO_PEER_PM) |
+ BIT_ULL(NL80211_STA_INFO_NONPEER_PM);
sinfo->llid = sta->mesh->llid;
sinfo->plid = sta->mesh->plid;
sinfo->plink_state = sta->mesh->plink_state;
if (test_sta_flag(sta, WLAN_STA_TOFFSET_KNOWN)) {
- sinfo->filled |= BIT(NL80211_STA_INFO_T_OFFSET);
+ sinfo->filled |= BIT_ULL(NL80211_STA_INFO_T_OFFSET);
sinfo->t_offset = sta->mesh->t_offset;
}
sinfo->local_pm = sta->mesh->local_pm;
@@ -2300,7 +2318,7 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo,
thr = sta_get_expected_throughput(sta);
if (thr != 0) {
- sinfo->filled |= BIT(NL80211_STA_INFO_EXPECTED_THROUGHPUT);
+ sinfo->filled |= BIT_ULL(NL80211_STA_INFO_EXPECTED_THROUGHPUT);
sinfo->expected_throughput = thr;
}
@@ -2310,13 +2328,13 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo,
sinfo->filled |= BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL);
}
- if (ieee80211_hw_check(&sta->local->hw, REPORTS_TX_ACK_STATUS) &&
- !(sinfo->filled & BIT_ULL(NL80211_STA_INFO_DATA_ACK_SIGNAL_AVG))) {
+ if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL_AVG)) &&
+ sta->status_stats.ack_signal_filled) {
sinfo->avg_ack_signal =
-(s8)ewma_avg_signal_read(
&sta->status_stats.avg_ack_signal);
sinfo->filled |=
- BIT_ULL(NL80211_STA_INFO_DATA_ACK_SIGNAL_AVG);
+ BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL_AVG);
}
}
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index 81b35f623792..9a04327d71d1 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -170,7 +170,7 @@ struct tid_ampdu_tx {
u8 dialog_token;
u8 stop_initiator;
bool tx_stop;
- u8 buf_size;
+ u16 buf_size;
u16 failed_bar_ssn;
bool bar_pending;
@@ -405,7 +405,7 @@ struct ieee80211_sta_rx_stats {
int last_signal;
u8 chains;
s8 chain_signal_last[IEEE80211_MAX_CHAINS];
- u16 last_rate;
+ u32 last_rate;
struct u64_stats_sync syncp;
u64 bytes;
u64 msdu[IEEE80211_NUM_TIDS + 1];
@@ -764,6 +764,7 @@ enum sta_stats_type {
STA_STATS_RATE_TYPE_LEGACY,
STA_STATS_RATE_TYPE_HT,
STA_STATS_RATE_TYPE_VHT,
+ STA_STATS_RATE_TYPE_HE,
};
#define STA_STATS_FIELD_HT_MCS GENMASK( 7, 0)
@@ -771,9 +772,14 @@ enum sta_stats_type {
#define STA_STATS_FIELD_LEGACY_BAND GENMASK( 7, 4)
#define STA_STATS_FIELD_VHT_MCS GENMASK( 3, 0)
#define STA_STATS_FIELD_VHT_NSS GENMASK( 7, 4)
+#define STA_STATS_FIELD_HE_MCS GENMASK( 3, 0)
+#define STA_STATS_FIELD_HE_NSS GENMASK( 7, 4)
#define STA_STATS_FIELD_BW GENMASK(11, 8)
#define STA_STATS_FIELD_SGI GENMASK(12, 12)
#define STA_STATS_FIELD_TYPE GENMASK(15, 13)
+#define STA_STATS_FIELD_HE_RU GENMASK(18, 16)
+#define STA_STATS_FIELD_HE_GI GENMASK(20, 19)
+#define STA_STATS_FIELD_HE_DCM GENMASK(21, 21)
#define STA_STATS_FIELD(_n, _v) FIELD_PREP(STA_STATS_FIELD_ ## _n, _v)
#define STA_STATS_GET(_n, _v) FIELD_GET(STA_STATS_FIELD_ ## _n, _v)
@@ -782,7 +788,7 @@ enum sta_stats_type {
static inline u32 sta_stats_encode_rate(struct ieee80211_rx_status *s)
{
- u16 r;
+ u32 r;
r = STA_STATS_FIELD(BW, s->bw);
@@ -804,6 +810,14 @@ static inline u32 sta_stats_encode_rate(struct ieee80211_rx_status *s)
r |= STA_STATS_FIELD(LEGACY_BAND, s->band);
r |= STA_STATS_FIELD(LEGACY_IDX, s->rate_idx);
break;
+ case RX_ENC_HE:
+ r |= STA_STATS_FIELD(TYPE, STA_STATS_RATE_TYPE_HE);
+ r |= STA_STATS_FIELD(HE_NSS, s->nss);
+ r |= STA_STATS_FIELD(HE_MCS, s->rate_idx);
+ r |= STA_STATS_FIELD(HE_GI, s->he_gi);
+ r |= STA_STATS_FIELD(HE_RU, s->he_ru);
+ r |= STA_STATS_FIELD(HE_DCM, s->he_dcm);
+ break;
default:
WARN_ON(1);
return STA_STATS_RATE_INVALID;
diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index 9a6d7208bf4f..aa4afbf0abaf 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -479,11 +479,6 @@ static void ieee80211_report_ack_skb(struct ieee80211_local *local,
if (!skb)
return;
- if (dropped) {
- dev_kfree_skb_any(skb);
- return;
- }
-
if (info->flags & IEEE80211_TX_INTFL_NL80211_FRAME_TX) {
u64 cookie = IEEE80211_SKB_CB(skb)->ack.cookie;
struct ieee80211_sub_if_data *sdata;
@@ -507,6 +502,8 @@ static void ieee80211_report_ack_skb(struct ieee80211_local *local,
rcu_read_unlock();
dev_kfree_skb_any(skb);
+ } else if (dropped) {
+ dev_kfree_skb_any(skb);
} else {
/* consumes skb */
skb_complete_wifi_ack(skb, acked);
@@ -811,7 +808,7 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw,
rate_control_tx_status(local, sband, status);
if (ieee80211_vif_is_mesh(&sta->sdata->vif))
- ieee80211s_update_metric(local, sta, skb);
+ ieee80211s_update_metric(local, sta, status);
if (!(info->flags & IEEE80211_TX_CTL_INJECTED) && acked)
ieee80211_frame_acked(sta, skb);
@@ -972,6 +969,8 @@ void ieee80211_tx_status_ext(struct ieee80211_hw *hw,
}
rate_control_tx_status(local, sband, status);
+ if (ieee80211_vif_is_mesh(&sta->sdata->vif))
+ ieee80211s_update_metric(local, sta, status);
}
if (acked || noack_success) {
@@ -988,6 +987,25 @@ void ieee80211_tx_status_ext(struct ieee80211_hw *hw,
}
EXPORT_SYMBOL(ieee80211_tx_status_ext);
+void ieee80211_tx_rate_update(struct ieee80211_hw *hw,
+ struct ieee80211_sta *pubsta,
+ struct ieee80211_tx_info *info)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
+ struct ieee80211_supported_band *sband = hw->wiphy->bands[info->band];
+ struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
+ struct ieee80211_tx_status status = {
+ .info = info,
+ .sta = pubsta,
+ };
+
+ rate_control_tx_status(local, sband, &status);
+
+ if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL))
+ sta->tx_stats.last_rate = info->status.rates[0];
+}
+EXPORT_SYMBOL(ieee80211_tx_rate_update);
+
void ieee80211_report_low_ack(struct ieee80211_sta *pubsta, u32 num_packets)
{
struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c
index 5cd5e6e5834e..6c647f425e05 100644
--- a/net/mac80211/tdls.c
+++ b/net/mac80211/tdls.c
@@ -16,6 +16,7 @@
#include "ieee80211_i.h"
#include "driver-ops.h"
#include "rate.h"
+#include "wme.h"
/* give usermode some time for retries in setting up the TDLS session */
#define TDLS_PEER_SETUP_TIMEOUT (15 * HZ)
@@ -1010,14 +1011,13 @@ ieee80211_tdls_prep_mgmt_packet(struct wiphy *wiphy, struct net_device *dev,
switch (action_code) {
case WLAN_TDLS_SETUP_REQUEST:
case WLAN_TDLS_SETUP_RESPONSE:
- skb_set_queue_mapping(skb, IEEE80211_AC_BK);
- skb->priority = 2;
+ skb->priority = 256 + 2;
break;
default:
- skb_set_queue_mapping(skb, IEEE80211_AC_VI);
- skb->priority = 5;
+ skb->priority = 256 + 5;
break;
}
+ skb_set_queue_mapping(skb, ieee80211_select_queue(sdata, skb));
/*
* Set the WLAN_TDLS_TEARDOWN flag to indicate a teardown in progress.
diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h
index 80a7edf8d314..588c51a67c89 100644
--- a/net/mac80211/trace.h
+++ b/net/mac80211/trace.h
@@ -92,7 +92,7 @@
STA_ENTRY \
__field(u16, tid) \
__field(u16, ssn) \
- __field(u8, buf_size) \
+ __field(u16, buf_size) \
__field(bool, amsdu) \
__field(u16, timeout) \
__field(u16, action)
@@ -2600,6 +2600,29 @@ TRACE_EVENT(drv_wake_tx_queue,
)
);
+TRACE_EVENT(drv_get_ftm_responder_stats,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct cfg80211_ftm_responder_stats *ftm_stats),
+
+ TP_ARGS(local, sdata, ftm_stats),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ VIF_ENTRY
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ VIF_ASSIGN;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT VIF_PR_FMT,
+ LOCAL_PR_ARG, VIF_PR_ARG
+ )
+);
+
#endif /* !__MAC80211_DRIVER_TRACE || TRACE_HEADER_MULTI_READ */
#undef TRACE_INCLUDE_PATH
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index fa1f1e63a264..e0ccee23fbcd 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -214,6 +214,7 @@ ieee80211_tx_h_dynamic_ps(struct ieee80211_tx_data *tx)
{
struct ieee80211_local *local = tx->local;
struct ieee80211_if_managed *ifmgd;
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
/* driver doesn't support power save */
if (!ieee80211_hw_check(&local->hw, SUPPORTS_PS))
@@ -242,6 +243,9 @@ ieee80211_tx_h_dynamic_ps(struct ieee80211_tx_data *tx)
if (tx->sdata->vif.type != NL80211_IFTYPE_STATION)
return TX_CONTINUE;
+ if (unlikely(info->flags & IEEE80211_TX_INTFL_OFFCHAN_TX_OK))
+ return TX_CONTINUE;
+
ifmgd = &tx->sdata->u.mgd;
/*
@@ -825,6 +829,8 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx)
*/
if (!ieee80211_is_data_qos(hdr->frame_control) ||
is_multicast_ether_addr(hdr->addr1)) {
+ if (tx->flags & IEEE80211_TX_NO_SEQNO)
+ return TX_CONTINUE;
/* driver should assign sequence number */
info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ;
/* for pure STA mode without beacons, we can do it */
@@ -1247,10 +1253,18 @@ static struct txq_info *ieee80211_get_txq(struct ieee80211_local *local,
(info->control.flags & IEEE80211_TX_CTRL_PS_RESPONSE))
return NULL;
- if (!ieee80211_is_data(hdr->frame_control))
- return NULL;
-
- if (sta) {
+ if (unlikely(!ieee80211_is_data_present(hdr->frame_control))) {
+ if ((!ieee80211_is_mgmt(hdr->frame_control) ||
+ ieee80211_is_bufferable_mmpdu(hdr->frame_control) ||
+ vif->type == NL80211_IFTYPE_STATION) &&
+ sta && sta->uploaded) {
+ /*
+ * This will be NULL if the driver didn't set the
+ * opt-in hardware flag.
+ */
+ txq = sta->sta.txq[IEEE80211_NUM_TIDS];
+ }
+ } else if (sta) {
u8 tid = skb->priority & IEEE80211_QOS_CTL_TID_MASK;
if (!sta->uploaded)
@@ -1438,16 +1452,33 @@ void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata,
txqi->txq.vif = &sdata->vif;
- if (sta) {
- txqi->txq.sta = &sta->sta;
- sta->sta.txq[tid] = &txqi->txq;
- txqi->txq.tid = tid;
- txqi->txq.ac = ieee80211_ac_from_tid(tid);
- } else {
+ if (!sta) {
sdata->vif.txq = &txqi->txq;
txqi->txq.tid = 0;
txqi->txq.ac = IEEE80211_AC_BE;
+
+ return;
}
+
+ if (tid == IEEE80211_NUM_TIDS) {
+ if (sdata->vif.type == NL80211_IFTYPE_STATION) {
+ /* Drivers need to opt in to the management MPDU TXQ */
+ if (!ieee80211_hw_check(&sdata->local->hw,
+ STA_MMPDU_TXQ))
+ return;
+ } else if (!ieee80211_hw_check(&sdata->local->hw,
+ BUFF_MMPDU_TXQ)) {
+ /* Drivers need to opt in to the bufferable MMPDU TXQ */
+ return;
+ }
+ txqi->txq.ac = IEEE80211_AC_VO;
+ } else {
+ txqi->txq.ac = ieee80211_ac_from_tid(tid);
+ }
+
+ txqi->txq.sta = &sta->sta;
+ txqi->txq.tid = tid;
+ sta->sta.txq[tid] = &txqi->txq;
}
void ieee80211_txq_purge(struct ieee80211_local *local,
@@ -1854,7 +1885,7 @@ EXPORT_SYMBOL(ieee80211_tx_prepare_skb);
*/
static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata,
struct sta_info *sta, struct sk_buff *skb,
- bool txpending)
+ bool txpending, u32 txdata_flags)
{
struct ieee80211_local *local = sdata->local;
struct ieee80211_tx_data tx;
@@ -1872,6 +1903,8 @@ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata,
led_len = skb->len;
res_prepare = ieee80211_tx_prepare(sdata, &tx, sta, skb);
+ tx.flags |= txdata_flags;
+
if (unlikely(res_prepare == TX_DROP)) {
ieee80211_free_txskb(&local->hw, skb);
return true;
@@ -1886,7 +1919,7 @@ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata,
sdata->vif.hw_queue[skb_get_queue_mapping(skb)];
if (invoke_tx_handlers_early(&tx))
- return false;
+ return true;
if (ieee80211_queue_skb(local, sdata, tx.sta, tx.skb))
return true;
@@ -1933,7 +1966,8 @@ static int ieee80211_skb_resize(struct ieee80211_sub_if_data *sdata,
}
void ieee80211_xmit(struct ieee80211_sub_if_data *sdata,
- struct sta_info *sta, struct sk_buff *skb)
+ struct sta_info *sta, struct sk_buff *skb,
+ u32 txdata_flags)
{
struct ieee80211_local *local = sdata->local;
struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
@@ -1968,7 +2002,7 @@ void ieee80211_xmit(struct ieee80211_sub_if_data *sdata,
}
ieee80211_set_qos_hdr(sdata, skb);
- ieee80211_tx(sdata, sta, skb, false);
+ ieee80211_tx(sdata, sta, skb, false, txdata_flags);
}
static bool ieee80211_parse_tx_radiotap(struct ieee80211_local *local,
@@ -2289,7 +2323,7 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb,
if (!ieee80211_parse_tx_radiotap(local, skb))
goto fail_rcu;
- ieee80211_xmit(sdata, NULL, skb);
+ ieee80211_xmit(sdata, NULL, skb, 0);
rcu_read_unlock();
return NETDEV_TX_OK;
@@ -2946,6 +2980,10 @@ void ieee80211_check_fast_xmit(struct sta_info *sta)
if (!(build.key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE))
goto out;
+ /* Key is being removed */
+ if (build.key->flags & KEY_FLAG_TAINTED)
+ goto out;
+
switch (build.key->conf.cipher) {
case WLAN_CIPHER_SUITE_CCMP:
case WLAN_CIPHER_SUITE_CCMP_256:
@@ -3073,27 +3111,18 @@ void ieee80211_clear_fast_xmit(struct sta_info *sta)
}
static bool ieee80211_amsdu_realloc_pad(struct ieee80211_local *local,
- struct sk_buff *skb, int headroom,
- int *subframe_len)
+ struct sk_buff *skb, int headroom)
{
- int amsdu_len = *subframe_len + sizeof(struct ethhdr);
- int padding = (4 - amsdu_len) & 3;
-
- if (skb_headroom(skb) < headroom || skb_tailroom(skb) < padding) {
+ if (skb_headroom(skb) < headroom) {
I802_DEBUG_INC(local->tx_expand_skb_head);
- if (pskb_expand_head(skb, headroom, padding, GFP_ATOMIC)) {
+ if (pskb_expand_head(skb, headroom, 0, GFP_ATOMIC)) {
wiphy_debug(local->hw.wiphy,
"failed to reallocate TX buffer\n");
return false;
}
}
- if (padding) {
- *subframe_len += padding;
- skb_put_zero(skb, padding);
- }
-
return true;
}
@@ -3117,8 +3146,7 @@ static bool ieee80211_amsdu_prepare_head(struct ieee80211_sub_if_data *sdata,
if (info->control.flags & IEEE80211_TX_CTRL_AMSDU)
return true;
- if (!ieee80211_amsdu_realloc_pad(local, skb, sizeof(*amsdu_hdr),
- &subframe_len))
+ if (!ieee80211_amsdu_realloc_pad(local, skb, sizeof(*amsdu_hdr)))
return false;
data = skb_push(skb, sizeof(*amsdu_hdr));
@@ -3184,7 +3212,8 @@ static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata,
void *data;
bool ret = false;
unsigned int orig_len;
- int n = 1, nfrags;
+ int n = 2, nfrags, pad = 0;
+ u16 hdrlen;
if (!ieee80211_hw_check(&local->hw, TX_AMSDU))
return false;
@@ -3200,6 +3229,10 @@ static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata,
max_amsdu_len = min_t(int, max_amsdu_len,
sta->sta.max_rc_amsdu_len);
+ if (sta->sta.max_tid_amsdu_len[tid])
+ max_amsdu_len = min_t(int, max_amsdu_len,
+ sta->sta.max_tid_amsdu_len[tid]);
+
spin_lock_bh(&fq->lock);
/* TODO: Ideally aggregation should be done on dequeue to remain
@@ -3217,9 +3250,6 @@ static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata,
if (skb->len + head->len > max_amsdu_len)
goto out;
- if (!ieee80211_amsdu_prepare_head(sdata, fast_tx, head))
- goto out;
-
nfrags = 1 + skb_shinfo(skb)->nr_frags;
nfrags += 1 + skb_shinfo(head)->nr_frags;
frag_tail = &skb_shinfo(head)->frag_list;
@@ -3235,10 +3265,27 @@ static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata,
if (max_frags && nfrags > max_frags)
goto out;
- if (!ieee80211_amsdu_realloc_pad(local, skb, sizeof(rfc1042_header) + 2,
- &subframe_len))
+ if (!drv_can_aggregate_in_amsdu(local, head, skb))
+ goto out;
+
+ if (!ieee80211_amsdu_prepare_head(sdata, fast_tx, head))
goto out;
+ /*
+ * Pad out the previous subframe to a multiple of 4 by adding the
+ * padding to the next one, that's being added. Note that head->len
+ * is the length of the full A-MSDU, but that works since each time
+ * we add a new subframe we pad out the previous one to a multiple
+ * of 4 and thus it no longer matters in the next round.
+ */
+ hdrlen = fast_tx->hdr_len - sizeof(rfc1042_header);
+ if ((head->len - hdrlen) & 3)
+ pad = 4 - ((head->len - hdrlen) & 3);
+
+ if (!ieee80211_amsdu_realloc_pad(local, skb, sizeof(rfc1042_header) +
+ 2 + pad))
+ goto out_recalc;
+
ret = true;
data = skb_push(skb, ETH_ALEN + 2);
memmove(data, data + ETH_ALEN + 2, 2 * ETH_ALEN);
@@ -3248,15 +3295,19 @@ static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata,
memcpy(data, &len, 2);
memcpy(data + 2, rfc1042_header, sizeof(rfc1042_header));
+ memset(skb_push(skb, pad), 0, pad);
+
head->len += skb->len;
head->data_len += skb->len;
*frag_tail = skb;
- flow->backlog += head->len - orig_len;
- tin->backlog_bytes += head->len - orig_len;
-
- fq_recalc_backlog(fq, tin, flow);
+out_recalc:
+ if (head->len != orig_len) {
+ flow->backlog += head->len - orig_len;
+ tin->backlog_bytes += head->len - orig_len;
+ fq_recalc_backlog(fq, tin, flow);
+ }
out:
spin_unlock_bh(&fq->lock);
@@ -3461,12 +3512,18 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
struct ieee80211_tx_info *info;
struct ieee80211_tx_data tx;
ieee80211_tx_result r;
- struct ieee80211_vif *vif;
+ struct ieee80211_vif *vif = txq->vif;
spin_lock_bh(&fq->lock);
- if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags))
+ if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags) ||
+ test_bit(IEEE80211_TXQ_STOP_NETIF_TX, &txqi->flags))
+ goto out;
+
+ if (vif->txqs_stopped[ieee80211_ac_from_tid(txq->tid)]) {
+ set_bit(IEEE80211_TXQ_STOP_NETIF_TX, &txqi->flags);
goto out;
+ }
/* Make sure fragments stay together. */
skb = __skb_dequeue(&txqi->frags);
@@ -3562,6 +3619,7 @@ begin:
}
IEEE80211_SKB_CB(skb)->control.vif = vif;
+
out:
spin_unlock_bh(&fq->lock);
@@ -3590,13 +3648,7 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb,
if (!IS_ERR_OR_NULL(sta)) {
struct ieee80211_fast_tx *fast_tx;
- /* We need a bit of data queued to build aggregates properly, so
- * instruct the TCP stack to allow more than a single ms of data
- * to be queued in the stack. The value is a bit-shift of 1
- * second, so 8 is ~4ms of queued data. Only affects local TCP
- * sockets.
- */
- sk_pacing_shift_update(skb->sk, 8);
+ sk_pacing_shift_update(skb->sk, sdata->local->hw.tx_sk_pacing_shift);
fast_tx = rcu_dereference(sta->fast_tx);
@@ -3648,7 +3700,7 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb,
ieee80211_tx_stats(dev, skb->len);
- ieee80211_xmit(sdata, sta, skb);
+ ieee80211_xmit(sdata, sta, skb, 0);
}
goto out;
out_free:
@@ -3867,7 +3919,7 @@ static bool ieee80211_tx_pending_skb(struct ieee80211_local *local,
return true;
}
info->band = chanctx_conf->def.chan->band;
- result = ieee80211_tx(sdata, NULL, skb, true);
+ result = ieee80211_tx(sdata, NULL, skb, true, 0);
} else {
struct sk_buff_head skbs;
@@ -4783,7 +4835,7 @@ EXPORT_SYMBOL(ieee80211_unreserve_tid);
void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata,
struct sk_buff *skb, int tid,
- enum nl80211_band band)
+ enum nl80211_band band, u32 txdata_flags)
{
int ac = ieee80211_ac_from_tid(tid);
@@ -4800,7 +4852,7 @@ void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata,
*/
local_bh_disable();
IEEE80211_SKB_CB(skb)->band = band;
- ieee80211_xmit(sdata, NULL, skb);
+ ieee80211_xmit(sdata, NULL, skb, txdata_flags);
local_bh_enable();
}
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index d02fbfec3783..bec424316ea4 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -240,6 +240,102 @@ __le16 ieee80211_ctstoself_duration(struct ieee80211_hw *hw,
}
EXPORT_SYMBOL(ieee80211_ctstoself_duration);
+static void __ieee80211_wake_txqs(struct ieee80211_sub_if_data *sdata, int ac)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_vif *vif = &sdata->vif;
+ struct fq *fq = &local->fq;
+ struct ps_data *ps = NULL;
+ struct txq_info *txqi;
+ struct sta_info *sta;
+ int i;
+
+ spin_lock_bh(&fq->lock);
+
+ if (sdata->vif.type == NL80211_IFTYPE_AP)
+ ps = &sdata->bss->ps;
+
+ sdata->vif.txqs_stopped[ac] = false;
+
+ list_for_each_entry_rcu(sta, &local->sta_list, list) {
+ if (sdata != sta->sdata)
+ continue;
+
+ for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) {
+ struct ieee80211_txq *txq = sta->sta.txq[i];
+
+ if (!txq)
+ continue;
+
+ txqi = to_txq_info(txq);
+
+ if (ac != txq->ac)
+ continue;
+
+ if (!test_and_clear_bit(IEEE80211_TXQ_STOP_NETIF_TX,
+ &txqi->flags))
+ continue;
+
+ spin_unlock_bh(&fq->lock);
+ drv_wake_tx_queue(local, txqi);
+ spin_lock_bh(&fq->lock);
+ }
+ }
+
+ if (!vif->txq)
+ goto out;
+
+ txqi = to_txq_info(vif->txq);
+
+ if (!test_and_clear_bit(IEEE80211_TXQ_STOP_NETIF_TX, &txqi->flags) ||
+ (ps && atomic_read(&ps->num_sta_ps)) || ac != vif->txq->ac)
+ goto out;
+
+ spin_unlock_bh(&fq->lock);
+
+ drv_wake_tx_queue(local, txqi);
+ return;
+out:
+ spin_unlock_bh(&fq->lock);
+}
+
+void ieee80211_wake_txqs(unsigned long data)
+{
+ struct ieee80211_local *local = (struct ieee80211_local *)data;
+ struct ieee80211_sub_if_data *sdata;
+ int n_acs = IEEE80211_NUM_ACS;
+ unsigned long flags;
+ int i;
+
+ rcu_read_lock();
+ spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
+
+ if (local->hw.queues < IEEE80211_NUM_ACS)
+ n_acs = 1;
+
+ for (i = 0; i < local->hw.queues; i++) {
+ if (local->queue_stop_reasons[i])
+ continue;
+
+ spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
+ list_for_each_entry_rcu(sdata, &local->interfaces, list) {
+ int ac;
+
+ for (ac = 0; ac < n_acs; ac++) {
+ int ac_queue = sdata->vif.hw_queue[ac];
+
+ if (ac_queue == i ||
+ sdata->vif.cab_queue == i)
+ __ieee80211_wake_txqs(sdata, ac);
+ }
+ }
+ spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
+ }
+
+ spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
+ rcu_read_unlock();
+}
+
void ieee80211_propagate_queue_wake(struct ieee80211_local *local, int queue)
{
struct ieee80211_sub_if_data *sdata;
@@ -308,6 +404,9 @@ static void __ieee80211_wake_queue(struct ieee80211_hw *hw, int queue,
rcu_read_unlock();
} else
tasklet_schedule(&local->tx_pending_tasklet);
+
+ if (local->ops->wake_tx_queue)
+ tasklet_schedule(&local->wake_txqs_tasklet);
}
void ieee80211_wake_queue_by_reason(struct ieee80211_hw *hw, int queue,
@@ -351,9 +450,6 @@ static void __ieee80211_stop_queue(struct ieee80211_hw *hw, int queue,
if (__test_and_set_bit(reason, &local->queue_stop_reasons[queue]))
return;
- if (local->ops->wake_tx_queue)
- return;
-
if (local->hw.queues < IEEE80211_NUM_ACS)
n_acs = 1;
@@ -366,8 +462,15 @@ static void __ieee80211_stop_queue(struct ieee80211_hw *hw, int queue,
for (ac = 0; ac < n_acs; ac++) {
if (sdata->vif.hw_queue[ac] == queue ||
- sdata->vif.cab_queue == queue)
- netif_stop_subqueue(sdata->dev, ac);
+ sdata->vif.cab_queue == queue) {
+ if (!local->ops->wake_tx_queue) {
+ netif_stop_subqueue(sdata->dev, ac);
+ continue;
+ }
+ spin_lock(&local->fq.lock);
+ sdata->vif.txqs_stopped[ac] = true;
+ spin_unlock(&local->fq.lock);
+ }
}
}
rcu_read_unlock();
@@ -1095,6 +1198,21 @@ u32 ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action,
if (elen >= sizeof(*elems->max_idle_period_ie))
elems->max_idle_period_ie = (void *)pos;
break;
+ case WLAN_EID_EXTENSION:
+ if (pos[0] == WLAN_EID_EXT_HE_MU_EDCA &&
+ elen >= (sizeof(*elems->mu_edca_param_set) + 1)) {
+ elems->mu_edca_param_set = (void *)&pos[1];
+ } else if (pos[0] == WLAN_EID_EXT_HE_CAPABILITY) {
+ elems->he_cap = (void *)&pos[1];
+ elems->he_cap_len = elen - 1;
+ } else if (pos[0] == WLAN_EID_EXT_HE_OPERATION &&
+ elen >= sizeof(*elems->he_operation) &&
+ elen >= ieee80211_he_oper_size(&pos[1])) {
+ elems->he_operation = (void *)&pos[1];
+ } else if (pos[0] == WLAN_EID_EXT_UORA && elen >= 1) {
+ elems->uora_element = (void *)&pos[1];
+ }
+ break;
default:
break;
}
@@ -1120,7 +1238,7 @@ void ieee80211_regulatory_limit_wmm_params(struct ieee80211_sub_if_data *sdata,
{
struct ieee80211_chanctx_conf *chanctx_conf;
const struct ieee80211_reg_rule *rrule;
- struct ieee80211_wmm_ac *wmm_ac;
+ const struct ieee80211_wmm_ac *wmm_ac;
u16 center_freq = 0;
if (sdata->vif.type != NL80211_IFTYPE_AP &&
@@ -1139,20 +1257,19 @@ void ieee80211_regulatory_limit_wmm_params(struct ieee80211_sub_if_data *sdata,
rrule = freq_reg_info(sdata->wdev.wiphy, MHZ_TO_KHZ(center_freq));
- if (IS_ERR_OR_NULL(rrule) || !rrule->wmm_rule) {
+ if (IS_ERR_OR_NULL(rrule) || !rrule->has_wmm) {
rcu_read_unlock();
return;
}
if (sdata->vif.type == NL80211_IFTYPE_AP)
- wmm_ac = &rrule->wmm_rule->ap[ac];
+ wmm_ac = &rrule->wmm_rule.ap[ac];
else
- wmm_ac = &rrule->wmm_rule->client[ac];
+ wmm_ac = &rrule->wmm_rule.client[ac];
qparam->cw_min = max_t(u16, qparam->cw_min, wmm_ac->cw_min);
qparam->cw_max = max_t(u16, qparam->cw_max, wmm_ac->cw_max);
qparam->aifs = max_t(u8, qparam->aifs, wmm_ac->aifsn);
- qparam->txop = !qparam->txop ? wmm_ac->cot / 32 :
- min_t(u16, qparam->txop, wmm_ac->cot / 32);
+ qparam->txop = min_t(u16, qparam->txop, wmm_ac->cot / 32);
rcu_read_unlock();
}
@@ -1353,9 +1470,10 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_local *local,
enum nl80211_band band,
u32 rate_mask,
struct cfg80211_chan_def *chandef,
- size_t *offset)
+ size_t *offset, u32 flags)
{
struct ieee80211_supported_band *sband;
+ const struct ieee80211_sta_he_cap *he_cap;
u8 *pos = buffer, *end = buffer + buffer_len;
size_t noffset;
int supp_rates_len, i;
@@ -1433,6 +1551,9 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_local *local,
chandef->chan->center_freq);
}
+ if (flags & IEEE80211_PROBE_FLAG_MIN_CONTENT)
+ goto done;
+
/* insert custom IEs that go before HT */
if (ie && ie_len) {
static const u8 before_ht[] = {
@@ -1460,11 +1581,6 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_local *local,
sband->ht_cap.cap);
}
- /*
- * If adding more here, adjust code in main.c
- * that calculates local->scan_ies_len.
- */
-
/* insert custom IEs that go before VHT */
if (ie && ie_len) {
static const u8 before_vht[] = {
@@ -1507,9 +1623,43 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_local *local,
sband->vht_cap.cap);
}
+ /* insert custom IEs that go before HE */
+ if (ie && ie_len) {
+ static const u8 before_he[] = {
+ /*
+ * no need to list the ones split off before VHT
+ * or generated here
+ */
+ WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_REQ_PARAMS,
+ WLAN_EID_AP_CSN,
+ /* TODO: add 11ah/11aj/11ak elements */
+ };
+ noffset = ieee80211_ie_split(ie, ie_len,
+ before_he, ARRAY_SIZE(before_he),
+ *offset);
+ if (end - pos < noffset - *offset)
+ goto out_err;
+ memcpy(pos, ie + *offset, noffset - *offset);
+ pos += noffset - *offset;
+ *offset = noffset;
+ }
+
+ he_cap = ieee80211_get_he_sta_cap(sband);
+ if (he_cap) {
+ pos = ieee80211_ie_build_he_cap(pos, he_cap, end);
+ if (!pos)
+ goto out_err;
+ }
+
+ /*
+ * If adding more here, adjust code in main.c
+ * that calculates local->scan_ies_len.
+ */
+
return pos - buffer;
out_err:
WARN_ONCE(1, "not enough space for preq IEs\n");
+ done:
return pos - buffer;
}
@@ -1518,7 +1668,8 @@ int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer,
struct ieee80211_scan_ies *ie_desc,
const u8 *ie, size_t ie_len,
u8 bands_used, u32 *rate_masks,
- struct cfg80211_chan_def *chandef)
+ struct cfg80211_chan_def *chandef,
+ u32 flags)
{
size_t pos = 0, old_pos = 0, custom_ie_offset = 0;
int i;
@@ -1533,7 +1684,8 @@ int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer,
ie, ie_len, i,
rate_masks[i],
chandef,
- &custom_ie_offset);
+ &custom_ie_offset,
+ flags);
ie_desc->ies[i] = buffer + old_pos;
ie_desc->len[i] = pos - old_pos;
old_pos = pos;
@@ -1561,7 +1713,7 @@ struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata,
struct ieee80211_channel *chan,
const u8 *ssid, size_t ssid_len,
const u8 *ie, size_t ie_len,
- bool directed)
+ u32 flags)
{
struct ieee80211_local *local = sdata->local;
struct cfg80211_chan_def chandef;
@@ -1577,7 +1729,7 @@ struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata,
* badly-behaved APs don't respond when this parameter is included.
*/
chandef.width = sdata->vif.bss_conf.chandef.width;
- if (directed)
+ if (flags & IEEE80211_PROBE_FLAG_DIRECTED)
chandef.chan = NULL;
else
chandef.chan = chan;
@@ -1591,7 +1743,7 @@ struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata,
ies_len = ieee80211_build_preq_ies(local, skb_tail_pointer(skb),
skb_tailroom(skb), &dummy_ie_desc,
ie, ie_len, BIT(chan->band),
- rate_masks, &chandef);
+ rate_masks, &chandef, flags);
skb_put(skb, ies_len);
if (dst) {
@@ -1605,27 +1757,6 @@ struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata,
return skb;
}
-void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata,
- const u8 *src, const u8 *dst,
- const u8 *ssid, size_t ssid_len,
- const u8 *ie, size_t ie_len,
- u32 ratemask, bool directed, u32 tx_flags,
- struct ieee80211_channel *channel, bool scan)
-{
- struct sk_buff *skb;
-
- skb = ieee80211_build_probe_req(sdata, src, dst, ratemask, channel,
- ssid, ssid_len,
- ie, ie_len, directed);
- if (skb) {
- IEEE80211_SKB_CB(skb)->flags |= tx_flags;
- if (scan)
- ieee80211_tx_skb_tid_band(sdata, skb, 7, channel->band);
- else
- ieee80211_tx_skb(sdata, skb);
- }
-}
-
u32 ieee80211_sta_get_rates(struct ieee80211_sub_if_data *sdata,
struct ieee802_11_elems *elems,
enum nl80211_band band, u32 *basic_rates)
@@ -2047,6 +2178,11 @@ int ieee80211_reconfig(struct ieee80211_local *local)
case NL80211_IFTYPE_AP:
changed |= BSS_CHANGED_SSID | BSS_CHANGED_P2P_PS;
+ if (sdata->vif.bss_conf.ftm_responder == 1 &&
+ wiphy_ext_feature_isset(sdata->local->hw.wiphy,
+ NL80211_EXT_FEATURE_ENABLE_FTM_RESPONDER))
+ changed |= BSS_CHANGED_FTM_RESPONDER;
+
if (sdata->vif.type == NL80211_IFTYPE_AP) {
changed |= BSS_CHANGED_AP_PROBE_RESP;
@@ -2413,6 +2549,72 @@ u8 *ieee80211_ie_build_vht_cap(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap,
return pos;
}
+u8 *ieee80211_ie_build_he_cap(u8 *pos,
+ const struct ieee80211_sta_he_cap *he_cap,
+ u8 *end)
+{
+ u8 n;
+ u8 ie_len;
+ u8 *orig_pos = pos;
+
+ /* Make sure we have place for the IE */
+ /*
+ * TODO: the 1 added is because this temporarily is under the EXTENSION
+ * IE. Get rid of it when it moves.
+ */
+ if (!he_cap)
+ return orig_pos;
+
+ n = ieee80211_he_mcs_nss_size(&he_cap->he_cap_elem);
+ ie_len = 2 + 1 +
+ sizeof(he_cap->he_cap_elem) + n +
+ ieee80211_he_ppe_size(he_cap->ppe_thres[0],
+ he_cap->he_cap_elem.phy_cap_info);
+
+ if ((end - pos) < ie_len)
+ return orig_pos;
+
+ *pos++ = WLAN_EID_EXTENSION;
+ pos++; /* We'll set the size later below */
+ *pos++ = WLAN_EID_EXT_HE_CAPABILITY;
+
+ /* Fixed data */
+ memcpy(pos, &he_cap->he_cap_elem, sizeof(he_cap->he_cap_elem));
+ pos += sizeof(he_cap->he_cap_elem);
+
+ memcpy(pos, &he_cap->he_mcs_nss_supp, n);
+ pos += n;
+
+ /* Check if PPE Threshold should be present */
+ if ((he_cap->he_cap_elem.phy_cap_info[6] &
+ IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT) == 0)
+ goto end;
+
+ /*
+ * Calculate how many PPET16/PPET8 pairs are to come. Algorithm:
+ * (NSS_M1 + 1) x (num of 1 bits in RU_INDEX_BITMASK)
+ */
+ n = hweight8(he_cap->ppe_thres[0] &
+ IEEE80211_PPE_THRES_RU_INDEX_BITMASK_MASK);
+ n *= (1 + ((he_cap->ppe_thres[0] & IEEE80211_PPE_THRES_NSS_MASK) >>
+ IEEE80211_PPE_THRES_NSS_POS));
+
+ /*
+ * Each pair is 6 bits, and we need to add the 7 "header" bits to the
+ * total size.
+ */
+ n = (n * IEEE80211_PPE_THRES_INFO_PPET_SIZE * 2) + 7;
+ n = DIV_ROUND_UP(n, 8);
+
+ /* Copy PPE Thresholds */
+ memcpy(pos, &he_cap->ppe_thres, n);
+ pos += n;
+
+end:
+ orig_pos[1] = (pos - orig_pos) - 2;
+ return pos;
+}
+
u8 *ieee80211_ie_build_ht_oper(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap,
const struct cfg80211_chan_def *chandef,
u16 prot_mode, bool rifs_mode)
@@ -2563,49 +2765,65 @@ bool ieee80211_chandef_ht_oper(const struct ieee80211_ht_operation *ht_oper,
return true;
}
-bool ieee80211_chandef_vht_oper(const struct ieee80211_vht_operation *oper,
+bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw,
+ const struct ieee80211_vht_operation *oper,
+ const struct ieee80211_ht_operation *htop,
struct cfg80211_chan_def *chandef)
{
struct cfg80211_chan_def new = *chandef;
- int cf1, cf2;
+ int cf0, cf1;
+ int ccfs0, ccfs1, ccfs2;
+ int ccf0, ccf1;
- if (!oper)
+ if (!oper || !htop)
return false;
- cf1 = ieee80211_channel_to_frequency(oper->center_freq_seg0_idx,
- chandef->chan->band);
- cf2 = ieee80211_channel_to_frequency(oper->center_freq_seg1_idx,
- chandef->chan->band);
+ ccfs0 = oper->center_freq_seg0_idx;
+ ccfs1 = oper->center_freq_seg1_idx;
+ ccfs2 = (le16_to_cpu(htop->operation_mode) &
+ IEEE80211_HT_OP_MODE_CCFS2_MASK)
+ >> IEEE80211_HT_OP_MODE_CCFS2_SHIFT;
+
+ /* when parsing (and we know how to) CCFS1 and CCFS2 are equivalent */
+ ccf0 = ccfs0;
+ ccf1 = ccfs1;
+ if (!ccfs1 && ieee80211_hw_check(hw, SUPPORTS_VHT_EXT_NSS_BW))
+ ccf1 = ccfs2;
+
+ cf0 = ieee80211_channel_to_frequency(ccf0, chandef->chan->band);
+ cf1 = ieee80211_channel_to_frequency(ccf1, chandef->chan->band);
switch (oper->chan_width) {
case IEEE80211_VHT_CHANWIDTH_USE_HT:
+ /* just use HT information directly */
break;
case IEEE80211_VHT_CHANWIDTH_80MHZ:
new.width = NL80211_CHAN_WIDTH_80;
- new.center_freq1 = cf1;
+ new.center_freq1 = cf0;
/* If needed, adjust based on the newer interop workaround. */
- if (oper->center_freq_seg1_idx) {
+ if (ccf1) {
unsigned int diff;
- diff = abs(oper->center_freq_seg1_idx -
- oper->center_freq_seg0_idx);
+ diff = abs(ccf1 - ccf0);
if (diff == 8) {
new.width = NL80211_CHAN_WIDTH_160;
- new.center_freq1 = cf2;
+ new.center_freq1 = cf1;
} else if (diff > 8) {
new.width = NL80211_CHAN_WIDTH_80P80;
- new.center_freq2 = cf2;
+ new.center_freq2 = cf1;
}
}
break;
case IEEE80211_VHT_CHANWIDTH_160MHZ:
+ /* deprecated encoding */
new.width = NL80211_CHAN_WIDTH_160;
- new.center_freq1 = cf1;
+ new.center_freq1 = cf0;
break;
case IEEE80211_VHT_CHANWIDTH_80P80MHZ:
+ /* deprecated encoding */
new.width = NL80211_CHAN_WIDTH_80P80;
- new.center_freq1 = cf1;
- new.center_freq2 = cf2;
+ new.center_freq1 = cf0;
+ new.center_freq2 = cf1;
break;
default:
return false;
diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c
index 259325cbcc31..006d82e4a397 100644
--- a/net/mac80211/vht.c
+++ b/net/mac80211/vht.c
@@ -3,6 +3,7 @@
*
* Portions of this file
* Copyright(c) 2015 - 2016 Intel Deutschland GmbH
+ * Copyright (C) 2018 Intel Corporation
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -231,6 +232,13 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata,
memcpy(&vht_cap->vht_mcs, &vht_cap_ie->supp_mcs,
sizeof(struct ieee80211_vht_mcs_info));
+ /* copy EXT_NSS_BW Support value or remove the capability */
+ if (ieee80211_hw_check(&sdata->local->hw, SUPPORTS_VHT_EXT_NSS_BW))
+ vht_cap->cap |= (cap_info & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK);
+ else
+ vht_cap->vht_mcs.tx_highest &=
+ ~cpu_to_le16(IEEE80211_VHT_EXT_NSS_BW_CAPABLE);
+
/* but also restrict MCSes */
for (i = 0; i < 8; i++) {
u16 own_rx, own_tx, peer_rx, peer_tx;
@@ -294,6 +302,18 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata,
break;
default:
sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_80;
+
+ if (!(vht_cap->vht_mcs.tx_highest &
+ cpu_to_le16(IEEE80211_VHT_EXT_NSS_BW_CAPABLE)))
+ break;
+
+ /*
+ * If this is non-zero, then it does support 160 MHz after all,
+ * in one form or the other. We don't distinguish here (or even
+ * above) between 160 and 80+80 yet.
+ */
+ if (cap_info & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK)
+ sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_160;
}
sta->sta.bandwidth = ieee80211_sta_cur_vht_bw(sta);
diff --git a/net/mac802154/tx.c b/net/mac802154/tx.c
index 7e253455f9dd..bcd1a5e6ebf4 100644
--- a/net/mac802154/tx.c
+++ b/net/mac802154/tx.c
@@ -63,8 +63,21 @@ ieee802154_tx(struct ieee802154_local *local, struct sk_buff *skb)
int ret;
if (!(local->hw.flags & IEEE802154_HW_TX_OMIT_CKSUM)) {
- u16 crc = crc_ccitt(0, skb->data, skb->len);
+ struct sk_buff *nskb;
+ u16 crc;
+
+ if (unlikely(skb_tailroom(skb) < IEEE802154_FCS_LEN)) {
+ nskb = skb_copy_expand(skb, 0, IEEE802154_FCS_LEN,
+ GFP_ATOMIC);
+ if (likely(nskb)) {
+ consume_skb(skb);
+ skb = nskb;
+ } else {
+ goto err_tx;
+ }
+ }
+ crc = crc_ccitt(0, skb->data, skb->len);
put_unaligned_le16(crc, skb_put(skb, 2));
}
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 7a4de6d618b1..7d55d4c04088 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -1223,7 +1223,7 @@ static int mpls_netconf_get_devconf(struct sk_buff *in_skb,
int err;
err = nlmsg_parse(nlh, sizeof(*ncm), tb, NETCONFA_MAX,
- devconf_mpls_policy, NULL);
+ devconf_mpls_policy, extack);
if (err < 0)
goto errout;
@@ -1263,6 +1263,7 @@ errout:
static int mpls_netconf_dump_devconf(struct sk_buff *skb,
struct netlink_callback *cb)
{
+ const struct nlmsghdr *nlh = cb->nlh;
struct net *net = sock_net(skb->sk);
struct hlist_head *head;
struct net_device *dev;
@@ -1270,6 +1271,21 @@ static int mpls_netconf_dump_devconf(struct sk_buff *skb,
int idx, s_idx;
int h, s_h;
+ if (cb->strict_check) {
+ struct netlink_ext_ack *extack = cb->extack;
+ struct netconfmsg *ncm;
+
+ if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ncm))) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid header for netconf dump request");
+ return -EINVAL;
+ }
+
+ if (nlmsg_attrlen(nlh, sizeof(*ncm))) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid data after header in netconf dump request");
+ return -EINVAL;
+ }
+ }
+
s_h = cb->args[0];
s_idx = idx = cb->args[1];
@@ -1286,7 +1302,7 @@ static int mpls_netconf_dump_devconf(struct sk_buff *skb,
goto cont;
if (mpls_netconf_fill_devconf(skb, mdev,
NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
+ nlh->nlmsg_seq,
RTM_NEWNETCONF,
NLM_F_MULTI,
NETCONFA_ALL) < 0) {
@@ -1533,10 +1549,14 @@ static int mpls_dev_notify(struct notifier_block *this, unsigned long event,
unsigned int flags;
if (event == NETDEV_REGISTER) {
- /* For now just support Ethernet, IPGRE, SIT and IPIP devices */
+
+ /* For now just support Ethernet, IPGRE, IP6GRE, SIT and
+ * IPIP devices
+ */
if (dev->type == ARPHRD_ETHER ||
dev->type == ARPHRD_LOOPBACK ||
dev->type == ARPHRD_IPGRE ||
+ dev->type == ARPHRD_IP6GRE ||
dev->type == ARPHRD_SIT ||
dev->type == ARPHRD_TUNNEL) {
mdev = mpls_add_dev(dev);
@@ -2011,30 +2031,140 @@ nla_put_failure:
return -EMSGSIZE;
}
+#if IS_ENABLED(CONFIG_INET)
+static int mpls_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
+ struct fib_dump_filter *filter,
+ struct netlink_callback *cb)
+{
+ return ip_valid_fib_dump_req(net, nlh, filter, cb);
+}
+#else
+static int mpls_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
+ struct fib_dump_filter *filter,
+ struct netlink_callback *cb)
+{
+ struct netlink_ext_ack *extack = cb->extack;
+ struct nlattr *tb[RTA_MAX + 1];
+ struct rtmsg *rtm;
+ int err, i;
+
+ if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid header for FIB dump request");
+ return -EINVAL;
+ }
+
+ rtm = nlmsg_data(nlh);
+ if (rtm->rtm_dst_len || rtm->rtm_src_len || rtm->rtm_tos ||
+ rtm->rtm_table || rtm->rtm_scope || rtm->rtm_type ||
+ rtm->rtm_flags) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for FIB dump request");
+ return -EINVAL;
+ }
+
+ if (rtm->rtm_protocol) {
+ filter->protocol = rtm->rtm_protocol;
+ filter->filter_set = 1;
+ cb->answer_flags = NLM_F_DUMP_FILTERED;
+ }
+
+ err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
+ rtm_mpls_policy, extack);
+ if (err < 0)
+ return err;
+
+ for (i = 0; i <= RTA_MAX; ++i) {
+ int ifindex;
+
+ if (i == RTA_OIF) {
+ ifindex = nla_get_u32(tb[i]);
+ filter->dev = __dev_get_by_index(net, ifindex);
+ if (!filter->dev)
+ return -ENODEV;
+ filter->filter_set = 1;
+ } else if (tb[i]) {
+ NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in dump request");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+#endif
+
+static bool mpls_rt_uses_dev(struct mpls_route *rt,
+ const struct net_device *dev)
+{
+ struct net_device *nh_dev;
+
+ if (rt->rt_nhn == 1) {
+ struct mpls_nh *nh = rt->rt_nh;
+
+ nh_dev = rtnl_dereference(nh->nh_dev);
+ if (dev == nh_dev)
+ return true;
+ } else {
+ for_nexthops(rt) {
+ nh_dev = rtnl_dereference(nh->nh_dev);
+ if (nh_dev == dev)
+ return true;
+ } endfor_nexthops(rt);
+ }
+
+ return false;
+}
+
static int mpls_dump_routes(struct sk_buff *skb, struct netlink_callback *cb)
{
+ const struct nlmsghdr *nlh = cb->nlh;
struct net *net = sock_net(skb->sk);
struct mpls_route __rcu **platform_label;
+ struct fib_dump_filter filter = {};
+ unsigned int flags = NLM_F_MULTI;
size_t platform_labels;
unsigned int index;
ASSERT_RTNL();
+ if (cb->strict_check) {
+ int err;
+
+ err = mpls_valid_fib_dump_req(net, nlh, &filter, cb);
+ if (err < 0)
+ return err;
+
+ /* for MPLS, there is only 1 table with fixed type and flags.
+ * If either are set in the filter then return nothing.
+ */
+ if ((filter.table_id && filter.table_id != RT_TABLE_MAIN) ||
+ (filter.rt_type && filter.rt_type != RTN_UNICAST) ||
+ filter.flags)
+ return skb->len;
+ }
+
index = cb->args[0];
if (index < MPLS_LABEL_FIRST_UNRESERVED)
index = MPLS_LABEL_FIRST_UNRESERVED;
platform_label = rtnl_dereference(net->mpls.platform_label);
platform_labels = net->mpls.platform_labels;
+
+ if (filter.filter_set)
+ flags |= NLM_F_DUMP_FILTERED;
+
for (; index < platform_labels; index++) {
struct mpls_route *rt;
+
rt = rtnl_dereference(platform_label[index]);
if (!rt)
continue;
+ if ((filter.dev && !mpls_rt_uses_dev(rt, filter.dev)) ||
+ (filter.protocol && rt->rt_protocol != filter.protocol))
+ continue;
+
if (mpls_dump_route(skb, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, RTM_NEWROUTE,
- index, rt, NLM_F_MULTI) < 0)
+ index, rt, flags) < 0)
break;
}
cb->args[0] = index;
diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c
index 6e558a419f60..94f53a9b7d1a 100644
--- a/net/mpls/mpls_iptunnel.c
+++ b/net/mpls/mpls_iptunnel.c
@@ -224,7 +224,7 @@ static int mpls_fill_encap_info(struct sk_buff *skb,
struct lwtunnel_state *lwtstate)
{
struct mpls_iptunnel_encap *tun_encap_info;
-
+
tun_encap_info = mpls_lwtunnel_encap(lwtstate);
if (nla_put_labels(skb, MPLS_IPTUNNEL_DST, tun_encap_info->labels,
diff --git a/net/ncsi/Kconfig b/net/ncsi/Kconfig
index 08a8a6031fd7..7f2b46108a24 100644
--- a/net/ncsi/Kconfig
+++ b/net/ncsi/Kconfig
@@ -10,3 +10,9 @@ config NET_NCSI
support. Enable this only if your system connects to a network
device via NCSI and the ethernet driver you're using supports
the protocol explicitly.
+config NCSI_OEM_CMD_GET_MAC
+ bool "Get NCSI OEM MAC Address"
+ depends on NET_NCSI
+ ---help---
+ This allows to get MAC address from NCSI firmware and set them back to
+ controller.
diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h
index 8055e3965cef..1dae77c54009 100644
--- a/net/ncsi/internal.h
+++ b/net/ncsi/internal.h
@@ -68,6 +68,17 @@ enum {
NCSI_MODE_MAX
};
+/* OEM Vendor Manufacture ID */
+#define NCSI_OEM_MFR_MLX_ID 0x8119
+#define NCSI_OEM_MFR_BCM_ID 0x113d
+/* Broadcom specific OEM Command */
+#define NCSI_OEM_BCM_CMD_GMA 0x01 /* CMD ID for Get MAC */
+/* OEM Command payload lengths*/
+#define NCSI_OEM_BCM_CMD_GMA_LEN 12
+/* Mac address offset in OEM response */
+#define BCM_MAC_ADDR_OFFSET 28
+
+
struct ncsi_channel_version {
u32 version; /* Supported BCD encoded NCSI version */
u32 alpha2; /* Supported BCD encoded NCSI version */
@@ -171,6 +182,8 @@ struct ncsi_package;
#define NCSI_RESERVED_CHANNEL 0x1f
#define NCSI_CHANNEL_INDEX(c) ((c) & ((1 << NCSI_PACKAGE_SHIFT) - 1))
#define NCSI_TO_CHANNEL(p, c) (((p) << NCSI_PACKAGE_SHIFT) | (c))
+#define NCSI_MAX_PACKAGE 8
+#define NCSI_MAX_CHANNEL 32
struct ncsi_channel {
unsigned char id;
@@ -216,11 +229,15 @@ struct ncsi_request {
bool used; /* Request that has been assigned */
unsigned int flags; /* NCSI request property */
#define NCSI_REQ_FLAG_EVENT_DRIVEN 1
+#define NCSI_REQ_FLAG_NETLINK_DRIVEN 2
struct ncsi_dev_priv *ndp; /* Associated NCSI device */
struct sk_buff *cmd; /* Associated NCSI command packet */
struct sk_buff *rsp; /* Associated NCSI response packet */
struct timer_list timer; /* Timer on waiting for response */
bool enabled; /* Time has been enabled or not */
+ u32 snd_seq; /* netlink sending sequence number */
+ u32 snd_portid; /* netlink portid of sender */
+ struct nlmsghdr nlhdr; /* netlink message header */
};
enum {
@@ -236,6 +253,7 @@ enum {
ncsi_dev_state_probe_dp,
ncsi_dev_state_config_sp = 0x0301,
ncsi_dev_state_config_cis,
+ ncsi_dev_state_config_oem_gma,
ncsi_dev_state_config_clear_vids,
ncsi_dev_state_config_svf,
ncsi_dev_state_config_ev,
@@ -269,6 +287,7 @@ struct ncsi_dev_priv {
#define NCSI_DEV_PROBED 1 /* Finalized NCSI topology */
#define NCSI_DEV_HWA 2 /* Enabled HW arbitration */
#define NCSI_DEV_RESHUFFLE 4
+ unsigned int gma_flag; /* OEM GMA flag */
spinlock_t lock; /* Protect the NCSI device */
#if IS_ENABLED(CONFIG_IPV6)
unsigned int inet6_addr_num; /* Number of IPv6 addresses */
@@ -305,6 +324,8 @@ struct ncsi_cmd_arg {
unsigned short words[8];
unsigned int dwords[4];
};
+ unsigned char *data; /* NCSI OEM data */
+ struct genl_info *info; /* Netlink information */
};
extern struct list_head ncsi_dev_list;
diff --git a/net/ncsi/ncsi-cmd.c b/net/ncsi/ncsi-cmd.c
index 7567ca63aae2..356af474e43c 100644
--- a/net/ncsi/ncsi-cmd.c
+++ b/net/ncsi/ncsi-cmd.c
@@ -17,6 +17,7 @@
#include <net/ncsi.h>
#include <net/net_namespace.h>
#include <net/sock.h>
+#include <net/genetlink.h>
#include "internal.h"
#include "ncsi-pkt.h"
@@ -211,6 +212,25 @@ static int ncsi_cmd_handler_snfc(struct sk_buff *skb,
return 0;
}
+static int ncsi_cmd_handler_oem(struct sk_buff *skb,
+ struct ncsi_cmd_arg *nca)
+{
+ struct ncsi_cmd_oem_pkt *cmd;
+ unsigned int len;
+
+ len = sizeof(struct ncsi_cmd_pkt_hdr) + 4;
+ if (nca->payload < 26)
+ len += 26;
+ else
+ len += nca->payload;
+
+ cmd = skb_put_zero(skb, len);
+ memcpy(&cmd->mfr_id, nca->data, nca->payload);
+ ncsi_cmd_build_header(&cmd->cmd.common, nca);
+
+ return 0;
+}
+
static struct ncsi_cmd_handler {
unsigned char type;
int payload;
@@ -244,7 +264,7 @@ static struct ncsi_cmd_handler {
{ NCSI_PKT_CMD_GNS, 0, ncsi_cmd_handler_default },
{ NCSI_PKT_CMD_GNPTS, 0, ncsi_cmd_handler_default },
{ NCSI_PKT_CMD_GPS, 0, ncsi_cmd_handler_default },
- { NCSI_PKT_CMD_OEM, 0, NULL },
+ { NCSI_PKT_CMD_OEM, -1, ncsi_cmd_handler_oem },
{ NCSI_PKT_CMD_PLDM, 0, NULL },
{ NCSI_PKT_CMD_GPUUID, 0, ncsi_cmd_handler_default }
};
@@ -316,12 +336,24 @@ int ncsi_xmit_cmd(struct ncsi_cmd_arg *nca)
return -ENOENT;
}
- /* Get packet payload length and allocate the request */
- nca->payload = nch->payload;
+ /* Get packet payload length and allocate the request
+ * It is expected that if length set as negative in
+ * handler structure means caller is initializing it
+ * and setting length in nca before calling xmit function
+ */
+ if (nch->payload >= 0)
+ nca->payload = nch->payload;
nr = ncsi_alloc_command(nca);
if (!nr)
return -ENOMEM;
+ /* track netlink information */
+ if (nca->req_flags == NCSI_REQ_FLAG_NETLINK_DRIVEN) {
+ nr->snd_seq = nca->info->snd_seq;
+ nr->snd_portid = nca->info->snd_portid;
+ nr->nlhdr = *nca->info->nlhdr;
+ }
+
/* Prepare the packet */
nca->id = nr->id;
ret = nch->handler(nr->cmd, nca);
diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c
index 091284760d21..bfc43b28c7a6 100644
--- a/net/ncsi/ncsi-manage.c
+++ b/net/ncsi/ncsi-manage.c
@@ -19,6 +19,7 @@
#include <net/addrconf.h>
#include <net/ipv6.h>
#include <net/if_inet6.h>
+#include <net/genetlink.h>
#include "internal.h"
#include "ncsi-pkt.h"
@@ -406,6 +407,9 @@ static void ncsi_request_timeout(struct timer_list *t)
{
struct ncsi_request *nr = from_timer(nr, t, timer);
struct ncsi_dev_priv *ndp = nr->ndp;
+ struct ncsi_cmd_pkt *cmd;
+ struct ncsi_package *np;
+ struct ncsi_channel *nc;
unsigned long flags;
/* If the request already had associated response,
@@ -419,6 +423,18 @@ static void ncsi_request_timeout(struct timer_list *t)
}
spin_unlock_irqrestore(&ndp->lock, flags);
+ if (nr->flags == NCSI_REQ_FLAG_NETLINK_DRIVEN) {
+ if (nr->cmd) {
+ /* Find the package */
+ cmd = (struct ncsi_cmd_pkt *)
+ skb_network_header(nr->cmd);
+ ncsi_find_package_and_channel(ndp,
+ cmd->cmd.common.channel,
+ &np, &nc);
+ ncsi_send_netlink_timeout(nr, np, nc);
+ }
+ }
+
/* Release the request */
ncsi_free_request(nr);
}
@@ -635,6 +651,72 @@ static int set_one_vid(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc,
return 0;
}
+#if IS_ENABLED(CONFIG_NCSI_OEM_CMD_GET_MAC)
+
+/* NCSI OEM Command APIs */
+static int ncsi_oem_gma_handler_bcm(struct ncsi_cmd_arg *nca)
+{
+ unsigned char data[NCSI_OEM_BCM_CMD_GMA_LEN];
+ int ret = 0;
+
+ nca->payload = NCSI_OEM_BCM_CMD_GMA_LEN;
+
+ memset(data, 0, NCSI_OEM_BCM_CMD_GMA_LEN);
+ *(unsigned int *)data = ntohl(NCSI_OEM_MFR_BCM_ID);
+ data[5] = NCSI_OEM_BCM_CMD_GMA;
+
+ nca->data = data;
+
+ ret = ncsi_xmit_cmd(nca);
+ if (ret)
+ netdev_err(nca->ndp->ndev.dev,
+ "NCSI: Failed to transmit cmd 0x%x during configure\n",
+ nca->type);
+ return ret;
+}
+
+/* OEM Command handlers initialization */
+static struct ncsi_oem_gma_handler {
+ unsigned int mfr_id;
+ int (*handler)(struct ncsi_cmd_arg *nca);
+} ncsi_oem_gma_handlers[] = {
+ { NCSI_OEM_MFR_BCM_ID, ncsi_oem_gma_handler_bcm }
+};
+
+static int ncsi_gma_handler(struct ncsi_cmd_arg *nca, unsigned int mf_id)
+{
+ struct ncsi_oem_gma_handler *nch = NULL;
+ int i;
+
+ /* This function should only be called once, return if flag set */
+ if (nca->ndp->gma_flag == 1)
+ return -1;
+
+ /* Find gma handler for given manufacturer id */
+ for (i = 0; i < ARRAY_SIZE(ncsi_oem_gma_handlers); i++) {
+ if (ncsi_oem_gma_handlers[i].mfr_id == mf_id) {
+ if (ncsi_oem_gma_handlers[i].handler)
+ nch = &ncsi_oem_gma_handlers[i];
+ break;
+ }
+ }
+
+ if (!nch) {
+ netdev_err(nca->ndp->ndev.dev,
+ "NCSI: No GMA handler available for MFR-ID (0x%x)\n",
+ mf_id);
+ return -1;
+ }
+
+ /* Set the flag for GMA command which should only be called once */
+ nca->ndp->gma_flag = 1;
+
+ /* Get Mac address from NCSI device */
+ return nch->handler(nca);
+}
+
+#endif /* CONFIG_NCSI_OEM_CMD_GET_MAC */
+
static void ncsi_configure_channel(struct ncsi_dev_priv *ndp)
{
struct ncsi_dev *nd = &ndp->ndev;
@@ -685,7 +767,23 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp)
goto error;
}
+ nd->state = ncsi_dev_state_config_oem_gma;
+ break;
+ case ncsi_dev_state_config_oem_gma:
nd->state = ncsi_dev_state_config_clear_vids;
+ ret = -1;
+
+#if IS_ENABLED(CONFIG_NCSI_OEM_CMD_GET_MAC)
+ nca.type = NCSI_PKT_CMD_OEM;
+ nca.package = np->id;
+ nca.channel = nc->id;
+ ndp->pending_req_num = 1;
+ ret = ncsi_gma_handler(&nca, nc->version.mf_id);
+#endif /* CONFIG_NCSI_OEM_CMD_GET_MAC */
+
+ if (ret < 0)
+ schedule_work(&ndp->work);
+
break;
case ncsi_dev_state_config_clear_vids:
case ncsi_dev_state_config_svf:
diff --git a/net/ncsi/ncsi-netlink.c b/net/ncsi/ncsi-netlink.c
index 82e6edf9c5d9..33314381b4f5 100644
--- a/net/ncsi/ncsi-netlink.c
+++ b/net/ncsi/ncsi-netlink.c
@@ -12,7 +12,6 @@
#include <linux/if_arp.h>
#include <linux/rtnetlink.h>
#include <linux/etherdevice.h>
-#include <linux/module.h>
#include <net/genetlink.h>
#include <net/ncsi.h>
#include <linux/skbuff.h>
@@ -20,6 +19,7 @@
#include <uapi/linux/ncsi.h>
#include "internal.h"
+#include "ncsi-pkt.h"
#include "ncsi-netlink.h"
static struct genl_family ncsi_genl_family;
@@ -29,6 +29,7 @@ static const struct nla_policy ncsi_genl_policy[NCSI_ATTR_MAX + 1] = {
[NCSI_ATTR_PACKAGE_LIST] = { .type = NLA_NESTED },
[NCSI_ATTR_PACKAGE_ID] = { .type = NLA_U32 },
[NCSI_ATTR_CHANNEL_ID] = { .type = NLA_U32 },
+ [NCSI_ATTR_DATA] = { .type = NLA_BINARY, .len = 2048 },
};
static struct ncsi_dev_priv *ndp_from_ifindex(struct net *net, u32 ifindex)
@@ -100,7 +101,7 @@ static int ncsi_write_package_info(struct sk_buff *skb,
bool found;
int rc;
- if (id > ndp->package_num) {
+ if (id > ndp->package_num - 1) {
netdev_info(ndp->ndev.dev, "NCSI: No package with id %u\n", id);
return -ENODEV;
}
@@ -240,7 +241,7 @@ static int ncsi_pkg_info_all_nl(struct sk_buff *skb,
return 0; /* done */
hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
- &ncsi_genl_family, 0, NCSI_CMD_PKG_INFO);
+ &ncsi_genl_family, NLM_F_MULTI, NCSI_CMD_PKG_INFO);
if (!hdr) {
rc = -EMSGSIZE;
goto err;
@@ -366,6 +367,202 @@ static int ncsi_clear_interface_nl(struct sk_buff *msg, struct genl_info *info)
return 0;
}
+static int ncsi_send_cmd_nl(struct sk_buff *msg, struct genl_info *info)
+{
+ struct ncsi_dev_priv *ndp;
+ struct ncsi_pkt_hdr *hdr;
+ struct ncsi_cmd_arg nca;
+ unsigned char *data;
+ u32 package_id;
+ u32 channel_id;
+ int len, ret;
+
+ if (!info || !info->attrs) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (!info->attrs[NCSI_ATTR_IFINDEX]) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (!info->attrs[NCSI_ATTR_PACKAGE_ID]) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (!info->attrs[NCSI_ATTR_CHANNEL_ID]) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (!info->attrs[NCSI_ATTR_DATA]) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ndp = ndp_from_ifindex(get_net(sock_net(msg->sk)),
+ nla_get_u32(info->attrs[NCSI_ATTR_IFINDEX]));
+ if (!ndp) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ package_id = nla_get_u32(info->attrs[NCSI_ATTR_PACKAGE_ID]);
+ channel_id = nla_get_u32(info->attrs[NCSI_ATTR_CHANNEL_ID]);
+
+ if (package_id >= NCSI_MAX_PACKAGE || channel_id >= NCSI_MAX_CHANNEL) {
+ ret = -ERANGE;
+ goto out_netlink;
+ }
+
+ len = nla_len(info->attrs[NCSI_ATTR_DATA]);
+ if (len < sizeof(struct ncsi_pkt_hdr)) {
+ netdev_info(ndp->ndev.dev, "NCSI: no command to send %u\n",
+ package_id);
+ ret = -EINVAL;
+ goto out_netlink;
+ } else {
+ data = (unsigned char *)nla_data(info->attrs[NCSI_ATTR_DATA]);
+ }
+
+ hdr = (struct ncsi_pkt_hdr *)data;
+
+ nca.ndp = ndp;
+ nca.package = (unsigned char)package_id;
+ nca.channel = (unsigned char)channel_id;
+ nca.type = hdr->type;
+ nca.req_flags = NCSI_REQ_FLAG_NETLINK_DRIVEN;
+ nca.info = info;
+ nca.payload = ntohs(hdr->length);
+ nca.data = data + sizeof(*hdr);
+
+ ret = ncsi_xmit_cmd(&nca);
+out_netlink:
+ if (ret != 0) {
+ netdev_err(ndp->ndev.dev,
+ "NCSI: Error %d sending command\n",
+ ret);
+ ncsi_send_netlink_err(ndp->ndev.dev,
+ info->snd_seq,
+ info->snd_portid,
+ info->nlhdr,
+ ret);
+ }
+out:
+ return ret;
+}
+
+int ncsi_send_netlink_rsp(struct ncsi_request *nr,
+ struct ncsi_package *np,
+ struct ncsi_channel *nc)
+{
+ struct sk_buff *skb;
+ struct net *net;
+ void *hdr;
+ int rc;
+
+ net = dev_net(nr->rsp->dev);
+
+ skb = genlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
+ if (!skb)
+ return -ENOMEM;
+
+ hdr = genlmsg_put(skb, nr->snd_portid, nr->snd_seq,
+ &ncsi_genl_family, 0, NCSI_CMD_SEND_CMD);
+ if (!hdr) {
+ kfree_skb(skb);
+ return -EMSGSIZE;
+ }
+
+ nla_put_u32(skb, NCSI_ATTR_IFINDEX, nr->rsp->dev->ifindex);
+ if (np)
+ nla_put_u32(skb, NCSI_ATTR_PACKAGE_ID, np->id);
+ if (nc)
+ nla_put_u32(skb, NCSI_ATTR_CHANNEL_ID, nc->id);
+ else
+ nla_put_u32(skb, NCSI_ATTR_CHANNEL_ID, NCSI_RESERVED_CHANNEL);
+
+ rc = nla_put(skb, NCSI_ATTR_DATA, nr->rsp->len, (void *)nr->rsp->data);
+ if (rc)
+ goto err;
+
+ genlmsg_end(skb, hdr);
+ return genlmsg_unicast(net, skb, nr->snd_portid);
+
+err:
+ kfree_skb(skb);
+ return rc;
+}
+
+int ncsi_send_netlink_timeout(struct ncsi_request *nr,
+ struct ncsi_package *np,
+ struct ncsi_channel *nc)
+{
+ struct sk_buff *skb;
+ struct net *net;
+ void *hdr;
+
+ skb = genlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
+ if (!skb)
+ return -ENOMEM;
+
+ hdr = genlmsg_put(skb, nr->snd_portid, nr->snd_seq,
+ &ncsi_genl_family, 0, NCSI_CMD_SEND_CMD);
+ if (!hdr) {
+ kfree_skb(skb);
+ return -EMSGSIZE;
+ }
+
+ net = dev_net(nr->cmd->dev);
+
+ nla_put_u32(skb, NCSI_ATTR_IFINDEX, nr->cmd->dev->ifindex);
+
+ if (np)
+ nla_put_u32(skb, NCSI_ATTR_PACKAGE_ID, np->id);
+ else
+ nla_put_u32(skb, NCSI_ATTR_PACKAGE_ID,
+ NCSI_PACKAGE_INDEX((((struct ncsi_pkt_hdr *)
+ nr->cmd->data)->channel)));
+
+ if (nc)
+ nla_put_u32(skb, NCSI_ATTR_CHANNEL_ID, nc->id);
+ else
+ nla_put_u32(skb, NCSI_ATTR_CHANNEL_ID, NCSI_RESERVED_CHANNEL);
+
+ genlmsg_end(skb, hdr);
+ return genlmsg_unicast(net, skb, nr->snd_portid);
+}
+
+int ncsi_send_netlink_err(struct net_device *dev,
+ u32 snd_seq,
+ u32 snd_portid,
+ struct nlmsghdr *nlhdr,
+ int err)
+{
+ struct nlmsghdr *nlh;
+ struct nlmsgerr *nle;
+ struct sk_buff *skb;
+ struct net *net;
+
+ skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
+ if (!skb)
+ return -ENOMEM;
+
+ net = dev_net(dev);
+
+ nlh = nlmsg_put(skb, snd_portid, snd_seq,
+ NLMSG_ERROR, sizeof(*nle), 0);
+ nle = (struct nlmsgerr *)nlmsg_data(nlh);
+ nle->error = err;
+ memcpy(&nle->msg, nlhdr, sizeof(*nlh));
+
+ nlmsg_end(skb, nlh);
+
+ return nlmsg_unicast(net->genl_sock, skb, snd_portid);
+}
+
static const struct genl_ops ncsi_ops[] = {
{
.cmd = NCSI_CMD_PKG_INFO,
@@ -386,6 +583,12 @@ static const struct genl_ops ncsi_ops[] = {
.doit = ncsi_clear_interface_nl,
.flags = GENL_ADMIN_PERM,
},
+ {
+ .cmd = NCSI_CMD_SEND_CMD,
+ .policy = ncsi_genl_policy,
+ .doit = ncsi_send_cmd_nl,
+ .flags = GENL_ADMIN_PERM,
+ },
};
static struct genl_family ncsi_genl_family __ro_after_init = {
diff --git a/net/ncsi/ncsi-netlink.h b/net/ncsi/ncsi-netlink.h
index 91a5c256f8c4..c4a46887a932 100644
--- a/net/ncsi/ncsi-netlink.h
+++ b/net/ncsi/ncsi-netlink.h
@@ -14,6 +14,18 @@
#include "internal.h"
+int ncsi_send_netlink_rsp(struct ncsi_request *nr,
+ struct ncsi_package *np,
+ struct ncsi_channel *nc);
+int ncsi_send_netlink_timeout(struct ncsi_request *nr,
+ struct ncsi_package *np,
+ struct ncsi_channel *nc);
+int ncsi_send_netlink_err(struct net_device *dev,
+ u32 snd_seq,
+ u32 snd_portid,
+ struct nlmsghdr *nlhdr,
+ int err);
+
int ncsi_init_netlink(struct net_device *dev);
int ncsi_unregister_netlink(struct net_device *dev);
diff --git a/net/ncsi/ncsi-pkt.h b/net/ncsi/ncsi-pkt.h
index 91b4b66438df..4d3f06be38bd 100644
--- a/net/ncsi/ncsi-pkt.h
+++ b/net/ncsi/ncsi-pkt.h
@@ -151,6 +151,28 @@ struct ncsi_cmd_snfc_pkt {
unsigned char pad[22];
};
+/* OEM Request Command as per NCSI Specification */
+struct ncsi_cmd_oem_pkt {
+ struct ncsi_cmd_pkt_hdr cmd; /* Command header */
+ __be32 mfr_id; /* Manufacture ID */
+ unsigned char data[]; /* OEM Payload Data */
+};
+
+/* OEM Response Packet as per NCSI Specification */
+struct ncsi_rsp_oem_pkt {
+ struct ncsi_rsp_pkt_hdr rsp; /* Command header */
+ __be32 mfr_id; /* Manufacture ID */
+ unsigned char data[]; /* Payload data */
+};
+
+/* Broadcom Response Data */
+struct ncsi_rsp_oem_bcm_pkt {
+ unsigned char ver; /* Payload Version */
+ unsigned char type; /* OEM Command type */
+ __be16 len; /* Payload Length */
+ unsigned char data[]; /* Cmd specific Data */
+};
+
/* Get Link Status */
struct ncsi_rsp_gls_pkt {
struct ncsi_rsp_pkt_hdr rsp; /* Response header */
diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c
index 930c1d3796f0..77e07ba3f493 100644
--- a/net/ncsi/ncsi-rsp.c
+++ b/net/ncsi/ncsi-rsp.c
@@ -16,9 +16,11 @@
#include <net/ncsi.h>
#include <net/net_namespace.h>
#include <net/sock.h>
+#include <net/genetlink.h>
#include "internal.h"
#include "ncsi-pkt.h"
+#include "ncsi-netlink.h"
static int ncsi_validate_rsp_pkt(struct ncsi_request *nr,
unsigned short payload)
@@ -32,15 +34,25 @@ static int ncsi_validate_rsp_pkt(struct ncsi_request *nr,
* before calling this function.
*/
h = (struct ncsi_rsp_pkt_hdr *)skb_network_header(nr->rsp);
- if (h->common.revision != NCSI_PKT_REVISION)
+
+ if (h->common.revision != NCSI_PKT_REVISION) {
+ netdev_dbg(nr->ndp->ndev.dev,
+ "NCSI: unsupported header revision\n");
return -EINVAL;
- if (ntohs(h->common.length) != payload)
+ }
+ if (ntohs(h->common.length) != payload) {
+ netdev_dbg(nr->ndp->ndev.dev,
+ "NCSI: payload length mismatched\n");
return -EINVAL;
+ }
/* Check on code and reason */
if (ntohs(h->code) != NCSI_PKT_RSP_C_COMPLETED ||
- ntohs(h->reason) != NCSI_PKT_RSP_R_NO_ERROR)
- return -EINVAL;
+ ntohs(h->reason) != NCSI_PKT_RSP_R_NO_ERROR) {
+ netdev_dbg(nr->ndp->ndev.dev,
+ "NCSI: non zero response/reason code\n");
+ return -EPERM;
+ }
/* Validate checksum, which might be zeroes if the
* sender doesn't support checksum according to NCSI
@@ -52,8 +64,11 @@ static int ncsi_validate_rsp_pkt(struct ncsi_request *nr,
checksum = ncsi_calculate_checksum((unsigned char *)h,
sizeof(*h) + payload - 4);
- if (*pchecksum != htonl(checksum))
+
+ if (*pchecksum != htonl(checksum)) {
+ netdev_dbg(nr->ndp->ndev.dev, "NCSI: checksum mismatched\n");
return -EINVAL;
+ }
return 0;
}
@@ -596,6 +611,87 @@ static int ncsi_rsp_handler_snfc(struct ncsi_request *nr)
return 0;
}
+/* Response handler for Broadcom command Get Mac Address */
+static int ncsi_rsp_handler_oem_bcm_gma(struct ncsi_request *nr)
+{
+ struct ncsi_dev_priv *ndp = nr->ndp;
+ struct net_device *ndev = ndp->ndev.dev;
+ const struct net_device_ops *ops = ndev->netdev_ops;
+ struct ncsi_rsp_oem_pkt *rsp;
+ struct sockaddr saddr;
+ int ret = 0;
+
+ /* Get the response header */
+ rsp = (struct ncsi_rsp_oem_pkt *)skb_network_header(nr->rsp);
+
+ saddr.sa_family = ndev->type;
+ ndev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
+ memcpy(saddr.sa_data, &rsp->data[BCM_MAC_ADDR_OFFSET], ETH_ALEN);
+ /* Increase mac address by 1 for BMC's address */
+ saddr.sa_data[ETH_ALEN - 1]++;
+ ret = ops->ndo_set_mac_address(ndev, &saddr);
+ if (ret < 0)
+ netdev_warn(ndev, "NCSI: 'Writing mac address to device failed\n");
+
+ return ret;
+}
+
+/* Response handler for Broadcom card */
+static int ncsi_rsp_handler_oem_bcm(struct ncsi_request *nr)
+{
+ struct ncsi_rsp_oem_bcm_pkt *bcm;
+ struct ncsi_rsp_oem_pkt *rsp;
+
+ /* Get the response header */
+ rsp = (struct ncsi_rsp_oem_pkt *)skb_network_header(nr->rsp);
+ bcm = (struct ncsi_rsp_oem_bcm_pkt *)(rsp->data);
+
+ if (bcm->type == NCSI_OEM_BCM_CMD_GMA)
+ return ncsi_rsp_handler_oem_bcm_gma(nr);
+ return 0;
+}
+
+static struct ncsi_rsp_oem_handler {
+ unsigned int mfr_id;
+ int (*handler)(struct ncsi_request *nr);
+} ncsi_rsp_oem_handlers[] = {
+ { NCSI_OEM_MFR_MLX_ID, NULL },
+ { NCSI_OEM_MFR_BCM_ID, ncsi_rsp_handler_oem_bcm }
+};
+
+/* Response handler for OEM command */
+static int ncsi_rsp_handler_oem(struct ncsi_request *nr)
+{
+ struct ncsi_rsp_oem_handler *nrh = NULL;
+ struct ncsi_rsp_oem_pkt *rsp;
+ unsigned int mfr_id, i;
+
+ /* Get the response header */
+ rsp = (struct ncsi_rsp_oem_pkt *)skb_network_header(nr->rsp);
+ mfr_id = ntohl(rsp->mfr_id);
+
+ /* Check for manufacturer id and Find the handler */
+ for (i = 0; i < ARRAY_SIZE(ncsi_rsp_oem_handlers); i++) {
+ if (ncsi_rsp_oem_handlers[i].mfr_id == mfr_id) {
+ if (ncsi_rsp_oem_handlers[i].handler)
+ nrh = &ncsi_rsp_oem_handlers[i];
+ else
+ nrh = NULL;
+
+ break;
+ }
+ }
+
+ if (!nrh) {
+ netdev_err(nr->ndp->ndev.dev, "Received unrecognized OEM packet with MFR-ID (0x%x)\n",
+ mfr_id);
+ return -ENOENT;
+ }
+
+ /* Process the packet */
+ return nrh->handler(nr);
+}
+
static int ncsi_rsp_handler_gvi(struct ncsi_request *nr)
{
struct ncsi_rsp_gvi_pkt *rsp;
@@ -900,6 +996,26 @@ static int ncsi_rsp_handler_gpuuid(struct ncsi_request *nr)
return 0;
}
+static int ncsi_rsp_handler_netlink(struct ncsi_request *nr)
+{
+ struct ncsi_dev_priv *ndp = nr->ndp;
+ struct ncsi_rsp_pkt *rsp;
+ struct ncsi_package *np;
+ struct ncsi_channel *nc;
+ int ret;
+
+ /* Find the package */
+ rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp);
+ ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel,
+ &np, &nc);
+ if (!np)
+ return -ENODEV;
+
+ ret = ncsi_send_netlink_rsp(nr, np, nc);
+
+ return ret;
+}
+
static struct ncsi_rsp_handler {
unsigned char type;
int payload;
@@ -932,7 +1048,7 @@ static struct ncsi_rsp_handler {
{ NCSI_PKT_RSP_GNS, 172, ncsi_rsp_handler_gns },
{ NCSI_PKT_RSP_GNPTS, 172, ncsi_rsp_handler_gnpts },
{ NCSI_PKT_RSP_GPS, 8, ncsi_rsp_handler_gps },
- { NCSI_PKT_RSP_OEM, 0, NULL },
+ { NCSI_PKT_RSP_OEM, -1, ncsi_rsp_handler_oem },
{ NCSI_PKT_RSP_PLDM, 0, NULL },
{ NCSI_PKT_RSP_GPUUID, 20, ncsi_rsp_handler_gpuuid }
};
@@ -1002,6 +1118,17 @@ int ncsi_rcv_rsp(struct sk_buff *skb, struct net_device *dev,
netdev_warn(ndp->ndev.dev,
"NCSI: 'bad' packet ignored for type 0x%x\n",
hdr->type);
+
+ if (nr->flags == NCSI_REQ_FLAG_NETLINK_DRIVEN) {
+ if (ret == -EPERM)
+ goto out_netlink;
+ else
+ ncsi_send_netlink_err(ndp->ndev.dev,
+ nr->snd_seq,
+ nr->snd_portid,
+ &nr->nlhdr,
+ ret);
+ }
goto out;
}
@@ -1011,6 +1138,17 @@ int ncsi_rcv_rsp(struct sk_buff *skb, struct net_device *dev,
netdev_err(ndp->ndev.dev,
"NCSI: Handler for packet type 0x%x returned %d\n",
hdr->type, ret);
+
+out_netlink:
+ if (nr->flags == NCSI_REQ_FLAG_NETLINK_DRIVEN) {
+ ret = ncsi_rsp_handler_netlink(nr);
+ if (ret) {
+ netdev_err(ndp->ndev.dev,
+ "NCSI: Netlink handler for packet type 0x%x returned %d\n",
+ hdr->type, ret);
+ }
+ }
+
out:
ncsi_free_request(nr);
return ret;
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index f0a1c536ef15..2ab870ef233a 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -46,9 +46,19 @@ config NETFILTER_NETLINK_LOG
and is also scheduled to replace the old syslog-based ipt_LOG
and ip6t_LOG modules.
+config NETFILTER_NETLINK_OSF
+ tristate "Netfilter OSF over NFNETLINK interface"
+ depends on NETFILTER_ADVANCED
+ select NETFILTER_NETLINK
+ help
+ If this option is enabled, the kernel will include support
+ for passive OS fingerprint via NFNETLINK.
+
config NF_CONNTRACK
tristate "Netfilter connection tracking support"
default m if NETFILTER_ADVANCED=n
+ select NF_DEFRAG_IPV4
+ select NF_DEFRAG_IPV6 if IPV6 != n
help
Connection tracking keeps a record of what packets have passed
through your machine, in order to figure out how they are related
@@ -96,7 +106,6 @@ config NF_CONNTRACK_SECMARK
config NF_CONNTRACK_ZONES
bool 'Connection tracking zones'
depends on NETFILTER_ADVANCED
- depends on NETFILTER_XT_TARGET_CT
help
This option enables support for connection tracking zones.
Normally, each connection needs to have a unique system wide
@@ -148,10 +157,11 @@ config NF_CONNTRACK_TIMESTAMP
If unsure, say `N'.
config NF_CONNTRACK_LABELS
- bool
+ bool "Connection tracking labels"
help
This option enables support for assigning user-defined flag bits
- to connection tracking entries. It selected by the connlabel match.
+ to connection tracking entries. It can be used with xtables connlabel
+ match and the nftables ct expression.
config NF_CT_PROTO_DCCP
bool 'DCCP protocol connection tracking support'
@@ -355,6 +365,7 @@ config NF_CT_NETLINK_TIMEOUT
tristate 'Connection tracking timeout tuning via Netlink'
select NETFILTER_NETLINK
depends on NETFILTER_ADVANCED
+ depends on NF_CONNTRACK_TIMEOUT
help
This option enables support for connection tracking timeout
fine-grain tuning. This allows you to attach specific timeout
@@ -440,9 +451,6 @@ config NETFILTER_SYNPROXY
endif # NF_CONNTRACK
-config NF_OSF
- tristate
-
config NF_TABLES
select NETFILTER_NETLINK
tristate "Netfilter nf_tables support"
@@ -551,6 +559,12 @@ config NFT_NAT
This option adds the "nat" expression that you can use to perform
typical Network Address Translation (NAT) packet transformations.
+config NFT_TUNNEL
+ tristate "Netfilter nf_tables tunnel module"
+ help
+ This option adds the "tunnel" expression that you can use to set
+ tunneling policies.
+
config NFT_OBJREF
tristate "Netfilter nf_tables stateful object reference module"
help
@@ -611,15 +625,39 @@ config NFT_FIB_INET
The lookup will be delegated to the IPv4 or IPv6 FIB depending
on the protocol of the packet.
+config NFT_XFRM
+ tristate "Netfilter nf_tables xfrm/IPSec security association matching"
+ depends on XFRM
+ help
+ This option adds an expression that you can use to extract properties
+ of a packets security association.
+
config NFT_SOCKET
tristate "Netfilter nf_tables socket match support"
depends on IPV6 || IPV6=n
select NF_SOCKET_IPV4
- select NF_SOCKET_IPV6 if IPV6
+ select NF_SOCKET_IPV6 if NF_TABLES_IPV6
help
This option allows matching for the presence or absence of a
corresponding socket and its attributes.
+config NFT_OSF
+ tristate "Netfilter nf_tables passive OS fingerprint support"
+ depends on NETFILTER_ADVANCED
+ select NETFILTER_NETLINK_OSF
+ help
+ This option allows matching packets from an specific OS.
+
+config NFT_TPROXY
+ tristate "Netfilter nf_tables tproxy support"
+ depends on IPV6 || IPV6=n
+ select NF_DEFRAG_IPV4
+ select NF_DEFRAG_IPV6 if NF_TABLES_IPV6
+ select NF_TPROXY_IPV4
+ select NF_TPROXY_IPV6 if NF_TABLES_IPV6
+ help
+ This makes transparent proxy support available in nftables.
+
if NF_TABLES_NETDEV
config NF_DUP_NETDEV
@@ -740,13 +778,13 @@ config NETFILTER_XT_TARGET_CHECKSUM
depends on NETFILTER_ADVANCED
---help---
This option adds a `CHECKSUM' target, which can be used in the iptables mangle
- table.
+ table to work around buggy DHCP clients in virtualized environments.
- You can use this target to compute and fill in the checksum in
- a packet that lacks a checksum. This is particularly useful,
- if you need to work around old applications such as dhcp clients,
- that do not work well with checksum offloads, but don't want to disable
- checksum offload in your device.
+ Some old DHCP clients drop packets because they are not aware
+ that the checksum would normally be offloaded to hardware and
+ thus should be considered valid.
+ This target can be used to fill in the checksum using iptables
+ when such packets are sent via a virtual network device.
To compile it as a module, choose M here. If unsure, say N.
@@ -881,7 +919,7 @@ config NETFILTER_XT_TARGET_LOG
tristate "LOG target support"
select NF_LOG_COMMON
select NF_LOG_IPV4
- select NF_LOG_IPV6 if IPV6
+ select NF_LOG_IPV6 if IP6_NF_IPTABLES
default m if NETFILTER_ADVANCED=n
help
This option adds a `LOG' target, which allows you to create rules in
@@ -973,7 +1011,7 @@ config NETFILTER_XT_TARGET_TEE
depends on IPV6 || IPV6=n
depends on !NF_CONNTRACK || NF_CONNTRACK
select NF_DUP_IPV4
- select NF_DUP_IPV6 if IPV6
+ select NF_DUP_IPV6 if IP6_NF_IPTABLES
---help---
This option adds a "TEE" target with which a packet can be cloned and
this clone be rerouted to another nexthop.
@@ -1366,8 +1404,8 @@ config NETFILTER_XT_MATCH_NFACCT
config NETFILTER_XT_MATCH_OSF
tristate '"osf" Passive OS fingerprint match'
- depends on NETFILTER_ADVANCED && NETFILTER_NETLINK
- select NF_OSF
+ depends on NETFILTER_ADVANCED
+ select NETFILTER_NETLINK_OSF
help
This option selects the Passive OS Fingerprinting match module
that allows to passively match the remote operating system by
@@ -1481,8 +1519,8 @@ config NETFILTER_XT_MATCH_SOCKET
depends on NETFILTER_ADVANCED
depends on IPV6 || IPV6=n
depends on IP6_NF_IPTABLES || IP6_NF_IPTABLES=n
- depends on NF_SOCKET_IPV4
- depends on NF_SOCKET_IPV6
+ select NF_SOCKET_IPV4
+ select NF_SOCKET_IPV6 if IP6_NF_IPTABLES
select NF_DEFRAG_IPV4
select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES != n
help
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 8a76dced974d..4ddf3ef51ece 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -1,7 +1,12 @@
# SPDX-License-Identifier: GPL-2.0
netfilter-objs := core.o nf_log.o nf_queue.o nf_sockopt.o utils.o
-nf_conntrack-y := nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_expect.o nf_conntrack_helper.o nf_conntrack_proto.o nf_conntrack_l3proto_generic.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o nf_conntrack_extend.o nf_conntrack_acct.o nf_conntrack_seqadj.o
+nf_conntrack-y := nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_expect.o nf_conntrack_helper.o \
+ nf_conntrack_proto.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o \
+ nf_conntrack_proto_icmp.o \
+ nf_conntrack_extend.o nf_conntrack_acct.o nf_conntrack_seqadj.o
+
+nf_conntrack-$(subst m,y,$(CONFIG_IPV6)) += nf_conntrack_proto_icmpv6.o
nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMEOUT) += nf_conntrack_timeout.o
nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMESTAMP) += nf_conntrack_timestamp.o
nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o
@@ -15,6 +20,7 @@ obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o
obj-$(CONFIG_NETFILTER_NETLINK_ACCT) += nfnetlink_acct.o
obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += nfnetlink_queue.o
obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o
+obj-$(CONFIG_NETFILTER_NETLINK_OSF) += nfnetlink_osf.o
# connection tracking
obj-$(CONFIG_NF_CONNTRACK) += nf_conntrack.o
@@ -95,6 +101,7 @@ obj-$(CONFIG_NFT_QUEUE) += nft_queue.o
obj-$(CONFIG_NFT_QUOTA) += nft_quota.o
obj-$(CONFIG_NFT_REJECT) += nft_reject.o
obj-$(CONFIG_NFT_REJECT_INET) += nft_reject_inet.o
+obj-$(CONFIG_NFT_TUNNEL) += nft_tunnel.o
obj-$(CONFIG_NFT_COUNTER) += nft_counter.o
obj-$(CONFIG_NFT_LOG) += nft_log.o
obj-$(CONFIG_NFT_MASQ) += nft_masq.o
@@ -103,8 +110,10 @@ obj-$(CONFIG_NFT_HASH) += nft_hash.o
obj-$(CONFIG_NFT_FIB) += nft_fib.o
obj-$(CONFIG_NFT_FIB_INET) += nft_fib_inet.o
obj-$(CONFIG_NFT_FIB_NETDEV) += nft_fib_netdev.o
-obj-$(CONFIG_NF_OSF) += nf_osf.o
obj-$(CONFIG_NFT_SOCKET) += nft_socket.o
+obj-$(CONFIG_NFT_OSF) += nft_osf.o
+obj-$(CONFIG_NFT_TPROXY) += nft_tproxy.o
+obj-$(CONFIG_NFT_XFRM) += nft_xfrm.o
# nf_tables netdev
obj-$(CONFIG_NFT_DUP_NETDEV) += nft_dup_netdev.o
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 168af54db975..dc240cb47ddf 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -603,6 +603,21 @@ void nf_conntrack_destroy(struct nf_conntrack *nfct)
}
EXPORT_SYMBOL(nf_conntrack_destroy);
+bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
+ const struct sk_buff *skb)
+{
+ struct nf_ct_hook *ct_hook;
+ bool ret = false;
+
+ rcu_read_lock();
+ ct_hook = rcu_dereference(nf_ct_hook);
+ if (ct_hook)
+ ret = ct_hook->get_tuple_skb(dst_tuple, skb);
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL(nf_ct_get_tuple_skb);
+
/* Built-in default zone used e.g. by modules. */
const struct nf_conntrack_zone nf_ct_zone_dflt = {
.id = NF_CT_DEFAULT_ZONE_ID,
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index 8a33dac4e805..e287da68d5fa 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -15,7 +15,7 @@
#define __ipset_dereference_protected(p, c) rcu_dereference_protected(p, c)
#define ipset_dereference_protected(p, set) \
- __ipset_dereference_protected(p, spin_is_locked(&(set)->lock))
+ __ipset_dereference_protected(p, lockdep_is_held(&(set)->lock))
#define rcu_dereference_bh_nfnl(p) rcu_dereference_bh_check(p, 1)
diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig
index 05dc1b77e466..cad48d07c818 100644
--- a/net/netfilter/ipvs/Kconfig
+++ b/net/netfilter/ipvs/Kconfig
@@ -296,10 +296,10 @@ config IP_VS_MH_TAB_INDEX
stored in a hash table. This table is assigned by a preference
list of the positions to each destination until all slots in
the table are filled. The index determines the prime for size of
- the table as 251, 509, 1021, 2039, 4093, 8191, 16381, 32749,
- 65521 or 131071. When using weights to allow destinations to
- receive more connections, the table is assigned an amount
- proportional to the weights specified. The table needs to be large
+ the table as 251, 509, 1021, 2039, 4093, 8191, 16381, 32749,
+ 65521 or 131071. When using weights to allow destinations to
+ receive more connections, the table is assigned an amount
+ proportional to the weights specified. The table needs to be large
enough to effectively fit all the destinations multiplied by their
respective weights.
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 99e0aa350dc5..5b2b17867cb1 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -825,12 +825,23 @@ static void ip_vs_conn_expire(struct timer_list *t)
/* Unlink conn if not referenced anymore */
if (likely(ip_vs_conn_unlink(cp))) {
+ struct ip_vs_conn *ct = cp->control;
+
/* delete the timer if it is activated by other users */
del_timer(&cp->timer);
/* does anybody control me? */
- if (cp->control)
+ if (ct) {
ip_vs_control_del(cp);
+ /* Drop CTL or non-assured TPL if not used anymore */
+ if (!cp->timeout && !atomic_read(&ct->n_control) &&
+ (!(ct->flags & IP_VS_CONN_F_TEMPLATE) ||
+ !(ct->state & IP_VS_CTPL_S_ASSURED))) {
+ IP_VS_DBG(4, "drop controlling connection\n");
+ ct->timeout = 0;
+ ip_vs_conn_expire_now(ct);
+ }
+ }
if ((cp->flags & IP_VS_CONN_F_NFCT) &&
!(cp->flags & IP_VS_CONN_F_ONE_PACKET)) {
@@ -872,6 +883,10 @@ static void ip_vs_conn_expire(struct timer_list *t)
/* Modify timer, so that it expires as soon as possible.
* Can be called without reference only if under RCU lock.
+ * We can have such chain of conns linked with ->control: DATA->CTL->TPL
+ * - DATA (eg. FTP) and TPL (persistence) can be present depending on setup
+ * - cp->timeout=0 indicates all conns from chain should be dropped but
+ * TPL is not dropped if in assured state
*/
void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
{
@@ -1102,24 +1117,28 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
#ifdef CONFIG_IP_VS_IPV6
if (cp->af == AF_INET6)
seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X "
- "%s %04X %-11s %7lu%s\n",
+ "%s %04X %-11s %7u%s\n",
ip_vs_proto_name(cp->protocol),
&cp->caddr.in6, ntohs(cp->cport),
&cp->vaddr.in6, ntohs(cp->vport),
dbuf, ntohs(cp->dport),
- ip_vs_state_name(cp->protocol, cp->state),
- (cp->timer.expires-jiffies)/HZ, pe_data);
+ ip_vs_state_name(cp),
+ jiffies_delta_to_msecs(cp->timer.expires -
+ jiffies) / 1000,
+ pe_data);
else
#endif
seq_printf(seq,
"%-3s %08X %04X %08X %04X"
- " %s %04X %-11s %7lu%s\n",
+ " %s %04X %-11s %7u%s\n",
ip_vs_proto_name(cp->protocol),
ntohl(cp->caddr.ip), ntohs(cp->cport),
ntohl(cp->vaddr.ip), ntohs(cp->vport),
dbuf, ntohs(cp->dport),
- ip_vs_state_name(cp->protocol, cp->state),
- (cp->timer.expires-jiffies)/HZ, pe_data);
+ ip_vs_state_name(cp),
+ jiffies_delta_to_msecs(cp->timer.expires -
+ jiffies) / 1000,
+ pe_data);
}
return 0;
}
@@ -1164,26 +1183,28 @@ static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v)
#ifdef CONFIG_IP_VS_IPV6
if (cp->af == AF_INET6)
seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X "
- "%s %04X %-11s %-6s %7lu\n",
+ "%s %04X %-11s %-6s %7u\n",
ip_vs_proto_name(cp->protocol),
&cp->caddr.in6, ntohs(cp->cport),
&cp->vaddr.in6, ntohs(cp->vport),
dbuf, ntohs(cp->dport),
- ip_vs_state_name(cp->protocol, cp->state),
+ ip_vs_state_name(cp),
ip_vs_origin_name(cp->flags),
- (cp->timer.expires-jiffies)/HZ);
+ jiffies_delta_to_msecs(cp->timer.expires -
+ jiffies) / 1000);
else
#endif
seq_printf(seq,
"%-3s %08X %04X %08X %04X "
- "%s %04X %-11s %-6s %7lu\n",
+ "%s %04X %-11s %-6s %7u\n",
ip_vs_proto_name(cp->protocol),
ntohl(cp->caddr.ip), ntohs(cp->cport),
ntohl(cp->vaddr.ip), ntohs(cp->vport),
dbuf, ntohs(cp->dport),
- ip_vs_state_name(cp->protocol, cp->state),
+ ip_vs_state_name(cp),
ip_vs_origin_name(cp->flags),
- (cp->timer.expires-jiffies)/HZ);
+ jiffies_delta_to_msecs(cp->timer.expires -
+ jiffies) / 1000);
}
return 0;
}
@@ -1197,8 +1218,11 @@ static const struct seq_operations ip_vs_conn_sync_seq_ops = {
#endif
-/*
- * Randomly drop connection entries before running out of memory
+/* Randomly drop connection entries before running out of memory
+ * Can be used for DATA and CTL conns. For TPL conns there are exceptions:
+ * - traffic for services in OPS mode increases ct->in_pkts, so it is supported
+ * - traffic for services not in OPS mode does not increase ct->in_pkts in
+ * all cases, so it is not supported
*/
static inline int todrop_entry(struct ip_vs_conn *cp)
{
@@ -1242,7 +1266,7 @@ static inline bool ip_vs_conn_ops_mode(struct ip_vs_conn *cp)
void ip_vs_random_dropentry(struct netns_ipvs *ipvs)
{
int idx;
- struct ip_vs_conn *cp, *cp_c;
+ struct ip_vs_conn *cp;
rcu_read_lock();
/*
@@ -1254,13 +1278,15 @@ void ip_vs_random_dropentry(struct netns_ipvs *ipvs)
hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
if (cp->ipvs != ipvs)
continue;
+ if (atomic_read(&cp->n_control))
+ continue;
if (cp->flags & IP_VS_CONN_F_TEMPLATE) {
- if (atomic_read(&cp->n_control) ||
- !ip_vs_conn_ops_mode(cp))
- continue;
- else
- /* connection template of OPS */
+ /* connection template of OPS */
+ if (ip_vs_conn_ops_mode(cp))
goto try_drop;
+ if (!(cp->state & IP_VS_CTPL_S_ASSURED))
+ goto drop;
+ continue;
}
if (cp->protocol == IPPROTO_TCP) {
switch(cp->state) {
@@ -1294,15 +1320,10 @@ try_drop:
continue;
}
- IP_VS_DBG(4, "del connection\n");
+drop:
+ IP_VS_DBG(4, "drop connection\n");
+ cp->timeout = 0;
ip_vs_conn_expire_now(cp);
- cp_c = cp->control;
- /* cp->control is valid only with reference to cp */
- if (cp_c && __ip_vs_conn_get(cp)) {
- IP_VS_DBG(4, "del conn template\n");
- ip_vs_conn_expire_now(cp_c);
- __ip_vs_conn_put(cp);
- }
}
cond_resched_rcu();
}
@@ -1325,15 +1346,19 @@ flush_again:
hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
if (cp->ipvs != ipvs)
continue;
- IP_VS_DBG(4, "del connection\n");
- ip_vs_conn_expire_now(cp);
+ /* As timers are expired in LIFO order, restart
+ * the timer of controlling connection first, so
+ * that it is expired after us.
+ */
cp_c = cp->control;
/* cp->control is valid only with reference to cp */
if (cp_c && __ip_vs_conn_get(cp)) {
- IP_VS_DBG(4, "del conn template\n");
+ IP_VS_DBG(4, "del controlling connection\n");
ip_vs_conn_expire_now(cp_c);
__ip_vs_conn_put(cp);
}
+ IP_VS_DBG(4, "del connection\n");
+ ip_vs_conn_expire_now(cp);
}
cond_resched_rcu();
}
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 0679dd101e72..fe9abf3cc10a 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -1686,8 +1686,7 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
skb_reset_network_header(skb);
IP_VS_DBG(12, "ICMP for IPIP %pI4->%pI4: mtu=%u\n",
&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, mtu);
- ipv4_update_pmtu(skb, ipvs->net,
- mtu, 0, 0, 0, 0);
+ ipv4_update_pmtu(skb, ipvs->net, mtu, 0, 0);
/* Client uses PMTUD? */
if (!(frag_off & htons(IP_DF)))
goto ignore_ipip;
@@ -1972,13 +1971,20 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int
if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
/* the destination server is not available */
- if (sysctl_expire_nodest_conn(ipvs)) {
+ __u32 flags = cp->flags;
+
+ /* when timer already started, silently drop the packet.*/
+ if (timer_pending(&cp->timer))
+ __ip_vs_conn_put(cp);
+ else
+ ip_vs_conn_put(cp);
+
+ if (sysctl_expire_nodest_conn(ipvs) &&
+ !(flags & IP_VS_CONN_F_ONE_PACKET)) {
/* try to expire the connection immediately */
ip_vs_conn_expire_now(cp);
}
- /* don't restart its timer, and silently
- drop the packet. */
- __ip_vs_conn_put(cp);
+
return NF_DROP;
}
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index dd21782e2f12..83395bf6dc35 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -134,7 +134,7 @@ static void update_defense_level(struct netns_ipvs *ipvs)
} else {
atomic_set(&ipvs->dropentry, 0);
ipvs->sysctl_drop_entry = 1;
- };
+ }
break;
case 3:
atomic_set(&ipvs->dropentry, 1);
@@ -3234,7 +3234,7 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb,
/* Try to find the service for which to dump destinations */
if (nlmsg_parse(cb->nlh, GENL_HDRLEN, attrs, IPVS_CMD_ATTR_MAX,
- ip_vs_cmd_policy, NULL))
+ ip_vs_cmd_policy, cb->extack))
goto out_err;
diff --git a/net/netfilter/ipvs/ip_vs_mh.c b/net/netfilter/ipvs/ip_vs_mh.c
index 0f795b186eb3..94d9d349ebb0 100644
--- a/net/netfilter/ipvs/ip_vs_mh.c
+++ b/net/netfilter/ipvs/ip_vs_mh.c
@@ -5,10 +5,10 @@
*
*/
-/* The mh algorithm is to assign a preference list of all the lookup
+/* The mh algorithm is to assign a preference list of all the lookup
* table positions to each destination and populate the table with
* the most-preferred position of destinations. Then it is to select
- * destination with the hash key of source IP address through looking
+ * destination with the hash key of source IP address through looking
* up a the lookup table.
*
* The algorithm is detailed in:
diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c
index ca880a3ad033..54ee84adf0bd 100644
--- a/net/netfilter/ipvs/ip_vs_proto.c
+++ b/net/netfilter/ipvs/ip_vs_proto.c
@@ -42,6 +42,11 @@
static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE];
+/* States for conn templates: NONE or words separated with ",", max 15 chars */
+static const char *ip_vs_ctpl_state_name_table[IP_VS_CTPL_S_LAST] = {
+ [IP_VS_CTPL_S_NONE] = "NONE",
+ [IP_VS_CTPL_S_ASSURED] = "ASSURED",
+};
/*
* register an ipvs protocol
@@ -193,12 +198,20 @@ ip_vs_create_timeout_table(int *table, int size)
}
-const char * ip_vs_state_name(__u16 proto, int state)
+const char *ip_vs_state_name(const struct ip_vs_conn *cp)
{
- struct ip_vs_protocol *pp = ip_vs_proto_get(proto);
+ unsigned int state = cp->state;
+ struct ip_vs_protocol *pp;
+
+ if (cp->flags & IP_VS_CONN_F_TEMPLATE) {
+ if (state >= IP_VS_CTPL_S_LAST)
+ return "ERR!";
+ return ip_vs_ctpl_state_name_table[state] ? : "?";
+ }
+ pp = ip_vs_proto_get(cp->protocol);
if (pp == NULL || pp->state_name == NULL)
- return (IPPROTO_IP == proto) ? "NONE" : "ERR!";
+ return (cp->protocol == IPPROTO_IP) ? "NONE" : "ERR!";
return pp->state_name(state);
}
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index 3250c4a1111e..b0cd7d08f2a7 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -461,6 +461,8 @@ set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
cp->flags &= ~IP_VS_CONN_F_INACTIVE;
}
}
+ if (next_state == IP_VS_SCTP_S_ESTABLISHED)
+ ip_vs_control_assure_ct(cp);
}
if (likely(pd))
cp->timeout = pd->timeout_table[cp->state = next_state];
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index 80d10ad12a15..1770fc6ce960 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -569,6 +569,8 @@ set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
cp->flags &= ~IP_VS_CONN_F_INACTIVE;
}
}
+ if (new_state == IP_VS_TCP_S_ESTABLISHED)
+ ip_vs_control_assure_ct(cp);
}
if (likely(pd))
diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c
index e0ef11c3691e..0f53c49025f8 100644
--- a/net/netfilter/ipvs/ip_vs_proto_udp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_udp.c
@@ -460,6 +460,8 @@ udp_state_transition(struct ip_vs_conn *cp, int direction,
}
cp->timeout = pd->timeout_table[IP_VS_UDP_S_NORMAL];
+ if (direction == IP_VS_DIR_OUTPUT)
+ ip_vs_control_assure_ct(cp);
}
static int __udp_init(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd)
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 001501e25625..d4020c5e831d 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -1003,12 +1003,9 @@ static void ip_vs_process_message_v0(struct netns_ipvs *ipvs, const char *buffer
continue;
}
} else {
- /* protocol in templates is not used for state/timeout */
- if (state > 0) {
- IP_VS_DBG(2, "BACKUP v0, Invalid template state %u\n",
- state);
- state = 0;
- }
+ if (state >= IP_VS_CTPL_S_LAST)
+ IP_VS_DBG(7, "BACKUP v0, Invalid tpl state %u\n",
+ state);
}
ip_vs_conn_fill_param(ipvs, AF_INET, s->protocol,
@@ -1166,12 +1163,9 @@ static inline int ip_vs_proc_sync_conn(struct netns_ipvs *ipvs, __u8 *p, __u8 *m
goto out;
}
} else {
- /* protocol in templates is not used for state/timeout */
- if (state > 0) {
- IP_VS_DBG(3, "BACKUP, Invalid template state %u\n",
- state);
- state = 0;
- }
+ if (state >= IP_VS_CTPL_S_LAST)
+ IP_VS_DBG(7, "BACKUP, Invalid tpl state %u\n",
+ state);
}
if (ip_vs_conn_fill_param_sync(ipvs, af, s, &param, pe_data,
pe_data_len, pe_name, pe_name_len)) {
diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c
index 510039862aa9..02ca7df793f5 100644
--- a/net/netfilter/nf_conncount.c
+++ b/net/netfilter/nf_conncount.c
@@ -44,17 +44,19 @@
/* we will save the tuples of all connections we care about */
struct nf_conncount_tuple {
- struct hlist_node node;
+ struct list_head node;
struct nf_conntrack_tuple tuple;
struct nf_conntrack_zone zone;
int cpu;
u32 jiffies32;
+ struct rcu_head rcu_head;
};
struct nf_conncount_rb {
struct rb_node node;
- struct hlist_head hhead; /* connections/hosts in same subnet */
+ struct nf_conncount_list list;
u32 key[MAX_KEYLEN];
+ struct rcu_head rcu_head;
};
static spinlock_t nf_conncount_locks[CONNCOUNT_LOCK_SLOTS] __cacheline_aligned_in_smp;
@@ -62,6 +64,10 @@ static spinlock_t nf_conncount_locks[CONNCOUNT_LOCK_SLOTS] __cacheline_aligned_i
struct nf_conncount_data {
unsigned int keylen;
struct rb_root root[CONNCOUNT_SLOTS];
+ struct net *net;
+ struct work_struct gc_work;
+ unsigned long pending_trees[BITS_TO_LONGS(CONNCOUNT_SLOTS)];
+ unsigned int gc_tree;
};
static u_int32_t conncount_rnd __read_mostly;
@@ -82,26 +88,70 @@ static int key_diff(const u32 *a, const u32 *b, unsigned int klen)
return memcmp(a, b, klen * sizeof(u32));
}
-bool nf_conncount_add(struct hlist_head *head,
- const struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_zone *zone)
+enum nf_conncount_list_add
+nf_conncount_add(struct nf_conncount_list *list,
+ const struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_zone *zone)
{
struct nf_conncount_tuple *conn;
+ if (WARN_ON_ONCE(list->count > INT_MAX))
+ return NF_CONNCOUNT_ERR;
+
conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC);
if (conn == NULL)
- return false;
+ return NF_CONNCOUNT_ERR;
+
conn->tuple = *tuple;
conn->zone = *zone;
conn->cpu = raw_smp_processor_id();
conn->jiffies32 = (u32)jiffies;
- hlist_add_head(&conn->node, head);
- return true;
+ spin_lock(&list->list_lock);
+ if (list->dead == true) {
+ kmem_cache_free(conncount_conn_cachep, conn);
+ spin_unlock(&list->list_lock);
+ return NF_CONNCOUNT_SKIP;
+ }
+ list_add_tail(&conn->node, &list->head);
+ list->count++;
+ spin_unlock(&list->list_lock);
+ return NF_CONNCOUNT_ADDED;
}
EXPORT_SYMBOL_GPL(nf_conncount_add);
+static void __conn_free(struct rcu_head *h)
+{
+ struct nf_conncount_tuple *conn;
+
+ conn = container_of(h, struct nf_conncount_tuple, rcu_head);
+ kmem_cache_free(conncount_conn_cachep, conn);
+}
+
+static bool conn_free(struct nf_conncount_list *list,
+ struct nf_conncount_tuple *conn)
+{
+ bool free_entry = false;
+
+ spin_lock(&list->list_lock);
+
+ if (list->count == 0) {
+ spin_unlock(&list->list_lock);
+ return free_entry;
+ }
+
+ list->count--;
+ list_del_rcu(&conn->node);
+ if (list->count == 0)
+ free_entry = true;
+
+ spin_unlock(&list->list_lock);
+ call_rcu(&conn->rcu_head, __conn_free);
+ return free_entry;
+}
+
static const struct nf_conntrack_tuple_hash *
-find_or_evict(struct net *net, struct nf_conncount_tuple *conn)
+find_or_evict(struct net *net, struct nf_conncount_list *list,
+ struct nf_conncount_tuple *conn, bool *free_entry)
{
const struct nf_conntrack_tuple_hash *found;
unsigned long a, b;
@@ -121,34 +171,37 @@ find_or_evict(struct net *net, struct nf_conncount_tuple *conn)
*/
age = a - b;
if (conn->cpu == cpu || age >= 2) {
- hlist_del(&conn->node);
- kmem_cache_free(conncount_conn_cachep, conn);
+ *free_entry = conn_free(list, conn);
return ERR_PTR(-ENOENT);
}
return ERR_PTR(-EAGAIN);
}
-unsigned int nf_conncount_lookup(struct net *net, struct hlist_head *head,
- const struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_zone *zone,
- bool *addit)
+void nf_conncount_lookup(struct net *net,
+ struct nf_conncount_list *list,
+ const struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_zone *zone,
+ bool *addit)
{
const struct nf_conntrack_tuple_hash *found;
- struct nf_conncount_tuple *conn;
+ struct nf_conncount_tuple *conn, *conn_n;
struct nf_conn *found_ct;
- struct hlist_node *n;
- unsigned int length = 0;
+ unsigned int collect = 0;
+ bool free_entry = false;
+ /* best effort only */
*addit = tuple ? true : false;
/* check the saved connections */
- hlist_for_each_entry_safe(conn, n, head, node) {
- found = find_or_evict(net, conn);
+ list_for_each_entry_safe(conn, conn_n, &list->head, node) {
+ if (collect > CONNCOUNT_GC_MAX_NODES)
+ break;
+
+ found = find_or_evict(net, list, conn, &free_entry);
if (IS_ERR(found)) {
/* Not found, but might be about to be confirmed */
if (PTR_ERR(found) == -EAGAIN) {
- length++;
if (!tuple)
continue;
@@ -156,7 +209,8 @@ unsigned int nf_conncount_lookup(struct net *net, struct hlist_head *head,
nf_ct_zone_id(&conn->zone, conn->zone.dir) ==
nf_ct_zone_id(zone, zone->dir))
*addit = false;
- }
+ } else if (PTR_ERR(found) == -ENOENT)
+ collect++;
continue;
}
@@ -165,9 +219,10 @@ unsigned int nf_conncount_lookup(struct net *net, struct hlist_head *head,
if (tuple && nf_ct_tuple_equal(&conn->tuple, tuple) &&
nf_ct_zone_equal(found_ct, zone, zone->dir)) {
/*
- * Just to be sure we have it only once in the list.
* We should not see tuples twice unless someone hooks
* this into a table without "-p tcp --syn".
+ *
+ * Attempt to avoid a re-add in this case.
*/
*addit = false;
} else if (already_closed(found_ct)) {
@@ -176,19 +231,75 @@ unsigned int nf_conncount_lookup(struct net *net, struct hlist_head *head,
* closed already -> ditch it
*/
nf_ct_put(found_ct);
- hlist_del(&conn->node);
- kmem_cache_free(conncount_conn_cachep, conn);
+ conn_free(list, conn);
+ collect++;
continue;
}
nf_ct_put(found_ct);
- length++;
}
-
- return length;
}
EXPORT_SYMBOL_GPL(nf_conncount_lookup);
+void nf_conncount_list_init(struct nf_conncount_list *list)
+{
+ spin_lock_init(&list->list_lock);
+ INIT_LIST_HEAD(&list->head);
+ list->count = 1;
+ list->dead = false;
+}
+EXPORT_SYMBOL_GPL(nf_conncount_list_init);
+
+/* Return true if the list is empty */
+bool nf_conncount_gc_list(struct net *net,
+ struct nf_conncount_list *list)
+{
+ const struct nf_conntrack_tuple_hash *found;
+ struct nf_conncount_tuple *conn, *conn_n;
+ struct nf_conn *found_ct;
+ unsigned int collected = 0;
+ bool free_entry = false;
+
+ list_for_each_entry_safe(conn, conn_n, &list->head, node) {
+ found = find_or_evict(net, list, conn, &free_entry);
+ if (IS_ERR(found)) {
+ if (PTR_ERR(found) == -ENOENT) {
+ if (free_entry)
+ return true;
+ collected++;
+ }
+ continue;
+ }
+
+ found_ct = nf_ct_tuplehash_to_ctrack(found);
+ if (already_closed(found_ct)) {
+ /*
+ * we do not care about connections which are
+ * closed already -> ditch it
+ */
+ nf_ct_put(found_ct);
+ if (conn_free(list, conn))
+ return true;
+ collected++;
+ continue;
+ }
+
+ nf_ct_put(found_ct);
+ if (collected > CONNCOUNT_GC_MAX_NODES)
+ return false;
+ }
+ return false;
+}
+EXPORT_SYMBOL_GPL(nf_conncount_gc_list);
+
+static void __tree_nodes_free(struct rcu_head *h)
+{
+ struct nf_conncount_rb *rbconn;
+
+ rbconn = container_of(h, struct nf_conncount_rb, rcu_head);
+ kmem_cache_free(conncount_rb_cachep, rbconn);
+}
+
static void tree_nodes_free(struct rb_root *root,
struct nf_conncount_rb *gc_nodes[],
unsigned int gc_count)
@@ -197,32 +308,46 @@ static void tree_nodes_free(struct rb_root *root,
while (gc_count) {
rbconn = gc_nodes[--gc_count];
- rb_erase(&rbconn->node, root);
- kmem_cache_free(conncount_rb_cachep, rbconn);
+ spin_lock(&rbconn->list.list_lock);
+ if (rbconn->list.count == 0 && rbconn->list.dead == false) {
+ rbconn->list.dead = true;
+ rb_erase(&rbconn->node, root);
+ call_rcu(&rbconn->rcu_head, __tree_nodes_free);
+ }
+ spin_unlock(&rbconn->list.list_lock);
}
}
+static void schedule_gc_worker(struct nf_conncount_data *data, int tree)
+{
+ set_bit(tree, data->pending_trees);
+ schedule_work(&data->gc_work);
+}
+
static unsigned int
-count_tree(struct net *net, struct rb_root *root,
- const u32 *key, u8 keylen,
- const struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_zone *zone)
+insert_tree(struct net *net,
+ struct nf_conncount_data *data,
+ struct rb_root *root,
+ unsigned int hash,
+ const u32 *key,
+ u8 keylen,
+ const struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_zone *zone)
{
+ enum nf_conncount_list_add ret;
struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES];
struct rb_node **rbnode, *parent;
struct nf_conncount_rb *rbconn;
struct nf_conncount_tuple *conn;
- unsigned int gc_count;
- bool no_gc = false;
+ unsigned int count = 0, gc_count = 0;
+ bool node_found = false;
+
+ spin_lock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]);
- restart:
- gc_count = 0;
parent = NULL;
rbnode = &(root->rb_node);
while (*rbnode) {
int diff;
- bool addit;
-
rbconn = rb_entry(*rbnode, struct nf_conncount_rb, node);
parent = *rbnode;
@@ -232,33 +357,30 @@ count_tree(struct net *net, struct rb_root *root,
} else if (diff > 0) {
rbnode = &((*rbnode)->rb_right);
} else {
- /* same source network -> be counted! */
- unsigned int count;
-
- count = nf_conncount_lookup(net, &rbconn->hhead, tuple,
- zone, &addit);
-
- tree_nodes_free(root, gc_nodes, gc_count);
- if (!addit)
- return count;
-
- if (!nf_conncount_add(&rbconn->hhead, tuple, zone))
- return 0; /* hotdrop */
-
- return count + 1;
+ /* unlikely: other cpu added node already */
+ node_found = true;
+ ret = nf_conncount_add(&rbconn->list, tuple, zone);
+ if (ret == NF_CONNCOUNT_ERR) {
+ count = 0; /* hotdrop */
+ } else if (ret == NF_CONNCOUNT_ADDED) {
+ count = rbconn->list.count;
+ } else {
+ /* NF_CONNCOUNT_SKIP, rbconn is already
+ * reclaimed by gc, insert a new tree node
+ */
+ node_found = false;
+ }
+ break;
}
- if (no_gc || gc_count >= ARRAY_SIZE(gc_nodes))
+ if (gc_count >= ARRAY_SIZE(gc_nodes))
continue;
- /* only used for GC on hhead, retval and 'addit' ignored */
- nf_conncount_lookup(net, &rbconn->hhead, tuple, zone, &addit);
- if (hlist_empty(&rbconn->hhead))
+ if (nf_conncount_gc_list(net, &rbconn->list))
gc_nodes[gc_count++] = rbconn;
}
if (gc_count) {
- no_gc = true;
tree_nodes_free(root, gc_nodes, gc_count);
/* tree_node_free before new allocation permits
* allocator to re-use newly free'd object.
@@ -266,58 +388,146 @@ count_tree(struct net *net, struct rb_root *root,
* This is a rare event; in most cases we will find
* existing node to re-use. (or gc_count is 0).
*/
- goto restart;
+
+ if (gc_count >= ARRAY_SIZE(gc_nodes))
+ schedule_gc_worker(data, hash);
}
- if (!tuple)
- return 0;
+ if (node_found)
+ goto out_unlock;
- /* no match, need to insert new node */
+ /* expected case: match, insert new node */
rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC);
if (rbconn == NULL)
- return 0;
+ goto out_unlock;
conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC);
if (conn == NULL) {
kmem_cache_free(conncount_rb_cachep, rbconn);
- return 0;
+ goto out_unlock;
}
conn->tuple = *tuple;
conn->zone = *zone;
memcpy(rbconn->key, key, sizeof(u32) * keylen);
- INIT_HLIST_HEAD(&rbconn->hhead);
- hlist_add_head(&conn->node, &rbconn->hhead);
+ nf_conncount_list_init(&rbconn->list);
+ list_add(&conn->node, &rbconn->list.head);
+ count = 1;
rb_link_node(&rbconn->node, parent, rbnode);
rb_insert_color(&rbconn->node, root);
- return 1;
+out_unlock:
+ spin_unlock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]);
+ return count;
}
-/* Count and return number of conntrack entries in 'net' with particular 'key'.
- * If 'tuple' is not null, insert it into the accounting data structure.
- */
-unsigned int nf_conncount_count(struct net *net,
- struct nf_conncount_data *data,
- const u32 *key,
- const struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_zone *zone)
+static unsigned int
+count_tree(struct net *net,
+ struct nf_conncount_data *data,
+ const u32 *key,
+ const struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_zone *zone)
{
+ enum nf_conncount_list_add ret;
struct rb_root *root;
- int count;
- u32 hash;
+ struct rb_node *parent;
+ struct nf_conncount_rb *rbconn;
+ unsigned int hash;
+ u8 keylen = data->keylen;
hash = jhash2(key, data->keylen, conncount_rnd) % CONNCOUNT_SLOTS;
root = &data->root[hash];
- spin_lock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]);
+ parent = rcu_dereference_raw(root->rb_node);
+ while (parent) {
+ int diff;
+ bool addit;
- count = count_tree(net, root, key, data->keylen, tuple, zone);
+ rbconn = rb_entry(parent, struct nf_conncount_rb, node);
- spin_unlock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]);
+ diff = key_diff(key, rbconn->key, keylen);
+ if (diff < 0) {
+ parent = rcu_dereference_raw(parent->rb_left);
+ } else if (diff > 0) {
+ parent = rcu_dereference_raw(parent->rb_right);
+ } else {
+ /* same source network -> be counted! */
+ nf_conncount_lookup(net, &rbconn->list, tuple, zone,
+ &addit);
- return count;
+ if (!addit)
+ return rbconn->list.count;
+
+ ret = nf_conncount_add(&rbconn->list, tuple, zone);
+ if (ret == NF_CONNCOUNT_ERR) {
+ return 0; /* hotdrop */
+ } else if (ret == NF_CONNCOUNT_ADDED) {
+ return rbconn->list.count;
+ } else {
+ /* NF_CONNCOUNT_SKIP, rbconn is already
+ * reclaimed by gc, insert a new tree node
+ */
+ break;
+ }
+ }
+ }
+
+ if (!tuple)
+ return 0;
+
+ return insert_tree(net, data, root, hash, key, keylen, tuple, zone);
+}
+
+static void tree_gc_worker(struct work_struct *work)
+{
+ struct nf_conncount_data *data = container_of(work, struct nf_conncount_data, gc_work);
+ struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES], *rbconn;
+ struct rb_root *root;
+ struct rb_node *node;
+ unsigned int tree, next_tree, gc_count = 0;
+
+ tree = data->gc_tree % CONNCOUNT_LOCK_SLOTS;
+ root = &data->root[tree];
+
+ rcu_read_lock();
+ for (node = rb_first(root); node != NULL; node = rb_next(node)) {
+ rbconn = rb_entry(node, struct nf_conncount_rb, node);
+ if (nf_conncount_gc_list(data->net, &rbconn->list))
+ gc_nodes[gc_count++] = rbconn;
+ }
+ rcu_read_unlock();
+
+ spin_lock_bh(&nf_conncount_locks[tree]);
+
+ if (gc_count) {
+ tree_nodes_free(root, gc_nodes, gc_count);
+ }
+
+ clear_bit(tree, data->pending_trees);
+
+ next_tree = (tree + 1) % CONNCOUNT_SLOTS;
+ next_tree = find_next_bit(data->pending_trees, next_tree, CONNCOUNT_SLOTS);
+
+ if (next_tree < CONNCOUNT_SLOTS) {
+ data->gc_tree = next_tree;
+ schedule_work(work);
+ }
+
+ spin_unlock_bh(&nf_conncount_locks[tree]);
+}
+
+/* Count and return number of conntrack entries in 'net' with particular 'key'.
+ * If 'tuple' is not null, insert it into the accounting data structure.
+ * Call with RCU read lock.
+ */
+unsigned int nf_conncount_count(struct net *net,
+ struct nf_conncount_data *data,
+ const u32 *key,
+ const struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_zone *zone)
+{
+ return count_tree(net, data, key, tuple, zone);
}
EXPORT_SYMBOL_GPL(nf_conncount_count);
@@ -348,17 +558,18 @@ struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family
data->root[i] = RB_ROOT;
data->keylen = keylen / sizeof(u32);
+ data->net = net;
+ INIT_WORK(&data->gc_work, tree_gc_worker);
return data;
}
EXPORT_SYMBOL_GPL(nf_conncount_init);
-void nf_conncount_cache_free(struct hlist_head *hhead)
+void nf_conncount_cache_free(struct nf_conncount_list *list)
{
- struct nf_conncount_tuple *conn;
- struct hlist_node *n;
+ struct nf_conncount_tuple *conn, *conn_n;
- hlist_for_each_entry_safe(conn, n, hhead, node)
+ list_for_each_entry_safe(conn, conn_n, &list->head, node)
kmem_cache_free(conncount_conn_cachep, conn);
}
EXPORT_SYMBOL_GPL(nf_conncount_cache_free);
@@ -373,7 +584,7 @@ static void destroy_tree(struct rb_root *r)
rb_erase(node, r);
- nf_conncount_cache_free(&rbconn->hhead);
+ nf_conncount_cache_free(&rbconn->list);
kmem_cache_free(conncount_rb_cachep, rbconn);
}
@@ -384,6 +595,7 @@ void nf_conncount_destroy(struct net *net, unsigned int family,
{
unsigned int i;
+ cancel_work_sync(&data->gc_work);
nf_ct_netns_put(net, family);
for (i = 0; i < ARRAY_SIZE(data->root); ++i)
diff --git a/net/netfilter/nf_conntrack_broadcast.c b/net/netfilter/nf_conntrack_broadcast.c
index a1086bdec242..5423b197d98a 100644
--- a/net/netfilter/nf_conntrack_broadcast.c
+++ b/net/netfilter/nf_conntrack_broadcast.c
@@ -32,7 +32,7 @@ int nf_conntrack_broadcast_help(struct sk_buff *skb,
__be32 mask = 0;
/* we're only interested in locally generated packets */
- if (skb->sk == NULL)
+ if (skb->sk == NULL || !net_eq(nf_ct_net(ct), sock_net(skb->sk)))
goto out;
if (rt == NULL || !(rt->rt_flags & RTCF_BROADCAST))
goto out;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 3d5280425027..ca1168d67fac 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -37,7 +37,6 @@
#include <linux/rculist_nulls.h>
#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_conntrack_l3proto.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_expect.h>
#include <net/netfilter/nf_conntrack_helper.h>
@@ -55,6 +54,7 @@
#include <net/netfilter/nf_nat_core.h>
#include <net/netfilter/nf_nat_helper.h>
#include <net/netns/hash.h>
+#include <net/ip.h>
#include "nf_internals.h"
@@ -222,7 +222,7 @@ static u32 hash_conntrack(const struct net *net,
return scale_hash(hash_conntrack_raw(tuple, net));
}
-bool
+static bool
nf_ct_get_tuple(const struct sk_buff *skb,
unsigned int nhoff,
unsigned int dataoff,
@@ -230,45 +230,159 @@ nf_ct_get_tuple(const struct sk_buff *skb,
u_int8_t protonum,
struct net *net,
struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_l3proto *l3proto,
const struct nf_conntrack_l4proto *l4proto)
{
+ unsigned int size;
+ const __be32 *ap;
+ __be32 _addrs[8];
+ struct {
+ __be16 sport;
+ __be16 dport;
+ } _inet_hdr, *inet_hdr;
+
memset(tuple, 0, sizeof(*tuple));
tuple->src.l3num = l3num;
- if (l3proto->pkt_to_tuple(skb, nhoff, tuple) == 0)
+ switch (l3num) {
+ case NFPROTO_IPV4:
+ nhoff += offsetof(struct iphdr, saddr);
+ size = 2 * sizeof(__be32);
+ break;
+ case NFPROTO_IPV6:
+ nhoff += offsetof(struct ipv6hdr, saddr);
+ size = sizeof(_addrs);
+ break;
+ default:
+ return true;
+ }
+
+ ap = skb_header_pointer(skb, nhoff, size, _addrs);
+ if (!ap)
return false;
+ switch (l3num) {
+ case NFPROTO_IPV4:
+ tuple->src.u3.ip = ap[0];
+ tuple->dst.u3.ip = ap[1];
+ break;
+ case NFPROTO_IPV6:
+ memcpy(tuple->src.u3.ip6, ap, sizeof(tuple->src.u3.ip6));
+ memcpy(tuple->dst.u3.ip6, ap + 4, sizeof(tuple->dst.u3.ip6));
+ break;
+ }
+
tuple->dst.protonum = protonum;
tuple->dst.dir = IP_CT_DIR_ORIGINAL;
- return l4proto->pkt_to_tuple(skb, dataoff, net, tuple);
+ if (unlikely(l4proto->pkt_to_tuple))
+ return l4proto->pkt_to_tuple(skb, dataoff, net, tuple);
+
+ /* Actually only need first 4 bytes to get ports. */
+ inet_hdr = skb_header_pointer(skb, dataoff, sizeof(_inet_hdr), &_inet_hdr);
+ if (!inet_hdr)
+ return false;
+
+ tuple->src.u.udp.port = inet_hdr->sport;
+ tuple->dst.u.udp.port = inet_hdr->dport;
+ return true;
+}
+
+static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
+ u_int8_t *protonum)
+{
+ int dataoff = -1;
+ const struct iphdr *iph;
+ struct iphdr _iph;
+
+ iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
+ if (!iph)
+ return -1;
+
+ /* Conntrack defragments packets, we might still see fragments
+ * inside ICMP packets though.
+ */
+ if (iph->frag_off & htons(IP_OFFSET))
+ return -1;
+
+ dataoff = nhoff + (iph->ihl << 2);
+ *protonum = iph->protocol;
+
+ /* Check bogus IP headers */
+ if (dataoff > skb->len) {
+ pr_debug("bogus IPv4 packet: nhoff %u, ihl %u, skblen %u\n",
+ nhoff, iph->ihl << 2, skb->len);
+ return -1;
+ }
+ return dataoff;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
+ u8 *protonum)
+{
+ int protoff = -1;
+ unsigned int extoff = nhoff + sizeof(struct ipv6hdr);
+ __be16 frag_off;
+ u8 nexthdr;
+
+ if (skb_copy_bits(skb, nhoff + offsetof(struct ipv6hdr, nexthdr),
+ &nexthdr, sizeof(nexthdr)) != 0) {
+ pr_debug("can't get nexthdr\n");
+ return -1;
+ }
+ protoff = ipv6_skip_exthdr(skb, extoff, &nexthdr, &frag_off);
+ /*
+ * (protoff == skb->len) means the packet has not data, just
+ * IPv6 and possibly extensions headers, but it is tracked anyway
+ */
+ if (protoff < 0 || (frag_off & htons(~0x7)) != 0) {
+ pr_debug("can't find proto in pkt\n");
+ return -1;
+ }
+
+ *protonum = nexthdr;
+ return protoff;
+}
+#endif
+
+static int get_l4proto(const struct sk_buff *skb,
+ unsigned int nhoff, u8 pf, u8 *l4num)
+{
+ switch (pf) {
+ case NFPROTO_IPV4:
+ return ipv4_get_l4proto(skb, nhoff, l4num);
+#if IS_ENABLED(CONFIG_IPV6)
+ case NFPROTO_IPV6:
+ return ipv6_get_l4proto(skb, nhoff, l4num);
+#endif
+ default:
+ *l4num = 0;
+ break;
+ }
+ return -1;
}
-EXPORT_SYMBOL_GPL(nf_ct_get_tuple);
bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff,
u_int16_t l3num,
struct net *net, struct nf_conntrack_tuple *tuple)
{
- const struct nf_conntrack_l3proto *l3proto;
const struct nf_conntrack_l4proto *l4proto;
- unsigned int protoff;
- u_int8_t protonum;
+ u8 protonum;
+ int protoff;
int ret;
rcu_read_lock();
- l3proto = __nf_ct_l3proto_find(l3num);
- ret = l3proto->get_l4proto(skb, nhoff, &protoff, &protonum);
- if (ret != NF_ACCEPT) {
+ protoff = get_l4proto(skb, nhoff, l3num, &protonum);
+ if (protoff <= 0) {
rcu_read_unlock();
return false;
}
- l4proto = __nf_ct_l4proto_find(l3num, protonum);
+ l4proto = __nf_ct_l4proto_find(protonum);
ret = nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, net, tuple,
- l3proto, l4proto);
+ l4proto);
rcu_read_unlock();
return ret;
@@ -278,19 +392,35 @@ EXPORT_SYMBOL_GPL(nf_ct_get_tuplepr);
bool
nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
const struct nf_conntrack_tuple *orig,
- const struct nf_conntrack_l3proto *l3proto,
const struct nf_conntrack_l4proto *l4proto)
{
memset(inverse, 0, sizeof(*inverse));
inverse->src.l3num = orig->src.l3num;
- if (l3proto->invert_tuple(inverse, orig) == 0)
- return false;
+
+ switch (orig->src.l3num) {
+ case NFPROTO_IPV4:
+ inverse->src.u3.ip = orig->dst.u3.ip;
+ inverse->dst.u3.ip = orig->src.u3.ip;
+ break;
+ case NFPROTO_IPV6:
+ inverse->src.u3.in6 = orig->dst.u3.in6;
+ inverse->dst.u3.in6 = orig->src.u3.in6;
+ break;
+ default:
+ break;
+ }
inverse->dst.dir = !orig->dst.dir;
inverse->dst.protonum = orig->dst.protonum;
- return l4proto->invert_tuple(inverse, orig);
+
+ if (unlikely(l4proto->invert_tuple))
+ return l4proto->invert_tuple(inverse, orig);
+
+ inverse->src.u.all = orig->dst.u.all;
+ inverse->dst.u.all = orig->src.u.all;
+ return true;
}
EXPORT_SYMBOL_GPL(nf_ct_invert_tuple);
@@ -409,7 +539,7 @@ destroy_conntrack(struct nf_conntrack *nfct)
nf_ct_tmpl_free(ct);
return;
}
- l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
+ l4proto = __nf_ct_l4proto_find(nf_ct_protonum(ct));
if (l4proto->destroy)
l4proto->destroy(ct);
@@ -502,6 +632,18 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
net_eq(net, nf_ct_net(ct));
}
+static inline bool
+nf_ct_match(const struct nf_conn *ct1, const struct nf_conn *ct2)
+{
+ return nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+ &ct2->tuplehash[IP_CT_DIR_ORIGINAL].tuple) &&
+ nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_REPLY].tuple,
+ &ct2->tuplehash[IP_CT_DIR_REPLY].tuple) &&
+ nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_ORIGINAL) &&
+ nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_REPLY) &&
+ net_eq(nf_ct_net(ct1), nf_ct_net(ct2));
+}
+
/* caller must hold rcu readlock and none of the nf_conntrack_locks */
static void nf_ct_gc_expired(struct nf_conn *ct)
{
@@ -695,19 +837,21 @@ static int nf_ct_resolve_clash(struct net *net, struct sk_buff *skb,
/* This is the conntrack entry already in hashes that won race. */
struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
const struct nf_conntrack_l4proto *l4proto;
+ enum ip_conntrack_info oldinfo;
+ struct nf_conn *loser_ct = nf_ct_get(skb, &oldinfo);
- l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
+ l4proto = __nf_ct_l4proto_find(nf_ct_protonum(ct));
if (l4proto->allow_clash &&
- ((ct->status & IPS_NAT_DONE_MASK) == 0) &&
!nf_ct_is_dying(ct) &&
atomic_inc_not_zero(&ct->ct_general.use)) {
- enum ip_conntrack_info oldinfo;
- struct nf_conn *loser_ct = nf_ct_get(skb, &oldinfo);
-
- nf_ct_acct_merge(ct, ctinfo, loser_ct);
- nf_conntrack_put(&loser_ct->ct_general);
- nf_ct_set(skb, ct, oldinfo);
- return NF_ACCEPT;
+ if (((ct->status & IPS_NAT_DONE_MASK) == 0) ||
+ nf_ct_match(ct, loser_ct)) {
+ nf_ct_acct_merge(ct, ctinfo, loser_ct);
+ nf_conntrack_put(&loser_ct->ct_general);
+ nf_ct_set(skb, ct, oldinfo);
+ return NF_ACCEPT;
+ }
+ nf_ct_put(ct);
}
NF_CT_STAT_INC(net, drop);
return NF_DROP;
@@ -965,7 +1109,7 @@ static bool gc_worker_can_early_drop(const struct nf_conn *ct)
if (!test_bit(IPS_ASSURED_BIT, &ct->status))
return true;
- l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
+ l4proto = __nf_ct_l4proto_find(nf_ct_protonum(ct));
if (l4proto->can_early_drop && l4proto->can_early_drop(ct))
return true;
@@ -1195,7 +1339,6 @@ EXPORT_SYMBOL_GPL(nf_conntrack_free);
static noinline struct nf_conntrack_tuple_hash *
init_conntrack(struct net *net, struct nf_conn *tmpl,
const struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_l3proto *l3proto,
const struct nf_conntrack_l4proto *l4proto,
struct sk_buff *skb,
unsigned int dataoff, u32 hash)
@@ -1208,9 +1351,8 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
const struct nf_conntrack_zone *zone;
struct nf_conn_timeout *timeout_ext;
struct nf_conntrack_zone tmp;
- unsigned int *timeouts;
- if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) {
+ if (!nf_ct_invert_tuple(&repl_tuple, tuple, l4proto)) {
pr_debug("Can't invert tuple.\n");
return NULL;
}
@@ -1227,19 +1369,6 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
}
timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL;
- if (timeout_ext) {
- timeouts = nf_ct_timeout_data(timeout_ext);
- if (unlikely(!timeouts))
- timeouts = l4proto->get_timeouts(net);
- } else {
- timeouts = l4proto->get_timeouts(net);
- }
-
- if (!l4proto->new(ct, skb, dataoff, timeouts)) {
- nf_conntrack_free(ct);
- pr_debug("can't track with proto module\n");
- return NULL;
- }
if (timeout_ext)
nf_ct_timeout_ext_add(ct, rcu_dereference(timeout_ext->timeout),
@@ -1266,8 +1395,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
/* exp->master safe, refcnt bumped in nf_ct_find_expectation */
ct->master = exp->master;
if (exp->helper) {
- help = nf_ct_helper_ext_add(ct, exp->helper,
- GFP_ATOMIC);
+ help = nf_ct_helper_ext_add(ct, GFP_ATOMIC);
if (help)
rcu_assign_pointer(help->helper, exp->helper);
}
@@ -1302,13 +1430,12 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
/* On success, returns 0, sets skb->_nfct | ctinfo */
static int
-resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
+resolve_normal_ct(struct nf_conn *tmpl,
struct sk_buff *skb,
unsigned int dataoff,
- u_int16_t l3num,
u_int8_t protonum,
- const struct nf_conntrack_l3proto *l3proto,
- const struct nf_conntrack_l4proto *l4proto)
+ const struct nf_conntrack_l4proto *l4proto,
+ const struct nf_hook_state *state)
{
const struct nf_conntrack_zone *zone;
struct nf_conntrack_tuple tuple;
@@ -1319,18 +1446,18 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
u32 hash;
if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
- dataoff, l3num, protonum, net, &tuple, l3proto,
- l4proto)) {
+ dataoff, state->pf, protonum, state->net,
+ &tuple, l4proto)) {
pr_debug("Can't get tuple\n");
return 0;
}
/* look for tuple match */
zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
- hash = hash_conntrack_raw(&tuple, net);
- h = __nf_conntrack_find_get(net, zone, &tuple, hash);
+ hash = hash_conntrack_raw(&tuple, state->net);
+ h = __nf_conntrack_find_get(state->net, zone, &tuple, hash);
if (!h) {
- h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto,
+ h = init_conntrack(state->net, tmpl, &tuple, l4proto,
skb, dataoff, hash);
if (!h)
return 0;
@@ -1359,52 +1486,75 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
return 0;
}
+/*
+ * icmp packets need special treatment to handle error messages that are
+ * related to a connection.
+ *
+ * Callers need to check if skb has a conntrack assigned when this
+ * helper returns; in such case skb belongs to an already known connection.
+ */
+static unsigned int __cold
+nf_conntrack_handle_icmp(struct nf_conn *tmpl,
+ struct sk_buff *skb,
+ unsigned int dataoff,
+ u8 protonum,
+ const struct nf_hook_state *state)
+{
+ int ret;
+
+ if (state->pf == NFPROTO_IPV4 && protonum == IPPROTO_ICMP)
+ ret = nf_conntrack_icmpv4_error(tmpl, skb, dataoff, state);
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (state->pf == NFPROTO_IPV6 && protonum == IPPROTO_ICMPV6)
+ ret = nf_conntrack_icmpv6_error(tmpl, skb, dataoff, state);
+#endif
+ else
+ return NF_ACCEPT;
+
+ if (ret <= 0) {
+ NF_CT_STAT_INC_ATOMIC(state->net, error);
+ NF_CT_STAT_INC_ATOMIC(state->net, invalid);
+ }
+
+ return ret;
+}
+
unsigned int
-nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
- struct sk_buff *skb)
+nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state)
{
- const struct nf_conntrack_l3proto *l3proto;
const struct nf_conntrack_l4proto *l4proto;
- struct nf_conn *ct, *tmpl;
enum ip_conntrack_info ctinfo;
- unsigned int *timeouts;
- unsigned int dataoff;
+ struct nf_conn *ct, *tmpl;
u_int8_t protonum;
- int ret;
+ int dataoff, ret;
tmpl = nf_ct_get(skb, &ctinfo);
if (tmpl || ctinfo == IP_CT_UNTRACKED) {
/* Previously seen (loopback or untracked)? Ignore. */
if ((tmpl && !nf_ct_is_template(tmpl)) ||
ctinfo == IP_CT_UNTRACKED) {
- NF_CT_STAT_INC_ATOMIC(net, ignore);
+ NF_CT_STAT_INC_ATOMIC(state->net, ignore);
return NF_ACCEPT;
}
skb->_nfct = 0;
}
/* rcu_read_lock()ed by nf_hook_thresh */
- l3proto = __nf_ct_l3proto_find(pf);
- ret = l3proto->get_l4proto(skb, skb_network_offset(skb),
- &dataoff, &protonum);
- if (ret <= 0) {
+ dataoff = get_l4proto(skb, skb_network_offset(skb), state->pf, &protonum);
+ if (dataoff <= 0) {
pr_debug("not prepared to track yet or error occurred\n");
- NF_CT_STAT_INC_ATOMIC(net, error);
- NF_CT_STAT_INC_ATOMIC(net, invalid);
- ret = -ret;
+ NF_CT_STAT_INC_ATOMIC(state->net, error);
+ NF_CT_STAT_INC_ATOMIC(state->net, invalid);
+ ret = NF_ACCEPT;
goto out;
}
- l4proto = __nf_ct_l4proto_find(pf, protonum);
+ l4proto = __nf_ct_l4proto_find(protonum);
- /* It may be an special packet, error, unclean...
- * inverse of the return code tells to the netfilter
- * core what to do with the packet. */
- if (l4proto->error != NULL) {
- ret = l4proto->error(net, tmpl, skb, dataoff, pf, hooknum);
+ if (protonum == IPPROTO_ICMP || protonum == IPPROTO_ICMPV6) {
+ ret = nf_conntrack_handle_icmp(tmpl, skb, dataoff,
+ protonum, state);
if (ret <= 0) {
- NF_CT_STAT_INC_ATOMIC(net, error);
- NF_CT_STAT_INC_ATOMIC(net, invalid);
ret = -ret;
goto out;
}
@@ -1413,11 +1563,11 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
goto out;
}
repeat:
- ret = resolve_normal_ct(net, tmpl, skb, dataoff, pf, protonum,
- l3proto, l4proto);
+ ret = resolve_normal_ct(tmpl, skb, dataoff,
+ protonum, l4proto, state);
if (ret < 0) {
/* Too stressed to deal. */
- NF_CT_STAT_INC_ATOMIC(net, drop);
+ NF_CT_STAT_INC_ATOMIC(state->net, drop);
ret = NF_DROP;
goto out;
}
@@ -1425,24 +1575,21 @@ repeat:
ct = nf_ct_get(skb, &ctinfo);
if (!ct) {
/* Not valid part of a connection */
- NF_CT_STAT_INC_ATOMIC(net, invalid);
+ NF_CT_STAT_INC_ATOMIC(state->net, invalid);
ret = NF_ACCEPT;
goto out;
}
- /* Decide what timeout policy we want to apply to this flow. */
- timeouts = nf_ct_timeout_lookup(net, ct, l4proto);
-
- ret = l4proto->packet(ct, skb, dataoff, ctinfo, timeouts);
+ ret = l4proto->packet(ct, skb, dataoff, ctinfo, state);
if (ret <= 0) {
/* Invalid: inverse of the return code tells
* the netfilter core what to do */
pr_debug("nf_conntrack_in: Can't track with proto module\n");
nf_conntrack_put(&ct->ct_general);
skb->_nfct = 0;
- NF_CT_STAT_INC_ATOMIC(net, invalid);
+ NF_CT_STAT_INC_ATOMIC(state->net, invalid);
if (ret == -NF_DROP)
- NF_CT_STAT_INC_ATOMIC(net, drop);
+ NF_CT_STAT_INC_ATOMIC(state->net, drop);
/* Special case: TCP tracker reports an attempt to reopen a
* closed/aborted connection. We have to go back and create a
* fresh conntrack.
@@ -1471,9 +1618,7 @@ bool nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse,
rcu_read_lock();
ret = nf_ct_invert_tuple(inverse, orig,
- __nf_ct_l3proto_find(orig->src.l3num),
- __nf_ct_l4proto_find(orig->src.l3num,
- orig->dst.protonum));
+ __nf_ct_l4proto_find(orig->dst.protonum));
rcu_read_unlock();
return ret;
}
@@ -1609,14 +1754,14 @@ static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb)
static int nf_conntrack_update(struct net *net, struct sk_buff *skb)
{
- const struct nf_conntrack_l3proto *l3proto;
const struct nf_conntrack_l4proto *l4proto;
struct nf_conntrack_tuple_hash *h;
struct nf_conntrack_tuple tuple;
enum ip_conntrack_info ctinfo;
struct nf_nat_hook *nat_hook;
- unsigned int dataoff, status;
+ unsigned int status;
struct nf_conn *ct;
+ int dataoff;
u16 l3num;
u8 l4num;
@@ -1625,16 +1770,15 @@ static int nf_conntrack_update(struct net *net, struct sk_buff *skb)
return 0;
l3num = nf_ct_l3num(ct);
- l3proto = nf_ct_l3proto_find_get(l3num);
- if (l3proto->get_l4proto(skb, skb_network_offset(skb), &dataoff,
- &l4num) <= 0)
+ dataoff = get_l4proto(skb, skb_network_offset(skb), l3num, &l4num);
+ if (dataoff <= 0)
return -1;
- l4proto = nf_ct_l4proto_find_get(l3num, l4num);
+ l4proto = nf_ct_l4proto_find_get(l4num);
if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num,
- l4num, net, &tuple, l3proto, l4proto))
+ l4num, net, &tuple, l4proto))
return -1;
if (ct->status & IPS_SRC_NAT) {
@@ -1683,6 +1827,41 @@ static int nf_conntrack_update(struct net *net, struct sk_buff *skb)
return 0;
}
+static bool nf_conntrack_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
+ const struct sk_buff *skb)
+{
+ const struct nf_conntrack_tuple *src_tuple;
+ const struct nf_conntrack_tuple_hash *hash;
+ struct nf_conntrack_tuple srctuple;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct) {
+ src_tuple = nf_ct_tuple(ct, CTINFO2DIR(ctinfo));
+ memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple));
+ return true;
+ }
+
+ if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
+ NFPROTO_IPV4, dev_net(skb->dev),
+ &srctuple))
+ return false;
+
+ hash = nf_conntrack_find_get(dev_net(skb->dev),
+ &nf_ct_zone_dflt,
+ &srctuple);
+ if (!hash)
+ return false;
+
+ ct = nf_ct_tuplehash_to_ctrack(hash);
+ src_tuple = nf_ct_tuple(ct, !hash->tuple.dst.dir);
+ memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple));
+ nf_ct_put(ct);
+
+ return true;
+}
+
/* Bring out ya dead! */
static struct nf_conn *
get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
@@ -1866,16 +2045,6 @@ static int kill_all(struct nf_conn *i, void *data)
return net_eq(nf_ct_net(i), data);
}
-void nf_ct_free_hashtable(void *hash, unsigned int size)
-{
- if (is_vmalloc_addr(hash))
- vfree(hash);
- else
- free_pages((unsigned long)hash,
- get_order(sizeof(struct hlist_head) * size));
-}
-EXPORT_SYMBOL_GPL(nf_ct_free_hashtable);
-
void nf_conntrack_cleanup_start(void)
{
conntrack_gc_work.exiting = true;
@@ -1886,7 +2055,7 @@ void nf_conntrack_cleanup_end(void)
{
RCU_INIT_POINTER(nf_ct_hook, NULL);
cancel_delayed_work_sync(&conntrack_gc_work.dwork);
- nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size);
+ kvfree(nf_conntrack_hash);
nf_conntrack_proto_fini();
nf_conntrack_seqadj_fini();
@@ -1952,7 +2121,6 @@ void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls)
{
struct hlist_nulls_head *hash;
unsigned int nr_slots, i;
- size_t sz;
if (*sizep > (UINT_MAX / sizeof(struct hlist_nulls_head)))
return NULL;
@@ -1960,14 +2128,8 @@ void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls)
BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head));
nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head));
- if (nr_slots > (UINT_MAX / sizeof(struct hlist_nulls_head)))
- return NULL;
-
- sz = nr_slots * sizeof(struct hlist_nulls_head);
- hash = (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO,
- get_order(sz));
- if (!hash)
- hash = vzalloc(sz);
+ hash = kvmalloc_array(nr_slots, sizeof(struct hlist_nulls_head),
+ GFP_KERNEL | __GFP_ZERO);
if (hash && nulls)
for (i = 0; i < nr_slots; i++)
@@ -1994,7 +2156,7 @@ int nf_conntrack_hash_resize(unsigned int hashsize)
old_size = nf_conntrack_htable_size;
if (old_size == hashsize) {
- nf_ct_free_hashtable(hash, hashsize);
+ kvfree(hash);
return 0;
}
@@ -2030,7 +2192,7 @@ int nf_conntrack_hash_resize(unsigned int hashsize)
local_bh_enable();
synchronize_net();
- nf_ct_free_hashtable(old_hash, old_size);
+ kvfree(old_hash);
return 0;
}
@@ -2054,9 +2216,6 @@ int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp)
}
EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize);
-module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
- &nf_conntrack_htable_size, 0600);
-
static __always_inline unsigned int total_extension_size(void)
{
/* remember to add new extensions below */
@@ -2197,13 +2356,14 @@ err_acct:
err_expect:
kmem_cache_destroy(nf_conntrack_cachep);
err_cachep:
- nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size);
+ kvfree(nf_conntrack_hash);
return ret;
}
static struct nf_ct_hook nf_conntrack_hook = {
.update = nf_conntrack_update,
.destroy = destroy_conntrack,
+ .get_tuple_skb = nf_conntrack_get_tuple_skb,
};
void nf_conntrack_init_end(void)
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index 853b23206bb7..3034038bfdf0 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -610,9 +610,7 @@ static int exp_seq_show(struct seq_file *s, void *v)
expect->tuple.src.l3num,
expect->tuple.dst.protonum);
print_tuple(s, &expect->tuple,
- __nf_ct_l3proto_find(expect->tuple.src.l3num),
- __nf_ct_l4proto_find(expect->tuple.src.l3num,
- expect->tuple.dst.protonum));
+ __nf_ct_l4proto_find(expect->tuple.dst.protonum));
if (expect->flags & NF_CT_EXPECT_PERMANENT) {
seq_puts(s, "PERMANENT");
@@ -713,5 +711,5 @@ void nf_conntrack_expect_fini(void)
{
rcu_barrier(); /* Wait for call_rcu() before destroy */
kmem_cache_destroy(nf_ct_expect_cachep);
- nf_ct_free_hashtable(nf_ct_expect_hash, nf_ct_expect_hsize);
+ kvfree(nf_ct_expect_hash);
}
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index a75b11c39312..e24b762ffa1d 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -24,7 +24,6 @@
#include <linux/rtnetlink.h>
#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_conntrack_l3proto.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_core.h>
@@ -193,8 +192,7 @@ void nf_conntrack_helper_put(struct nf_conntrack_helper *helper)
EXPORT_SYMBOL_GPL(nf_conntrack_helper_put);
struct nf_conn_help *
-nf_ct_helper_ext_add(struct nf_conn *ct,
- struct nf_conntrack_helper *helper, gfp_t gfp)
+nf_ct_helper_ext_add(struct nf_conn *ct, gfp_t gfp)
{
struct nf_conn_help *help;
@@ -263,7 +261,7 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl,
}
if (help == NULL) {
- help = nf_ct_helper_ext_add(ct, helper, flags);
+ help = nf_ct_helper_ext_add(ct, flags);
if (help == NULL)
return -ENOMEM;
} else {
@@ -564,12 +562,12 @@ int nf_conntrack_helper_init(void)
return 0;
out_extend:
- nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_hsize);
+ kvfree(nf_ct_helper_hash);
return ret;
}
void nf_conntrack_helper_fini(void)
{
nf_ct_extend_unregister(&helper_extend);
- nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_hsize);
+ kvfree(nf_ct_helper_hash);
}
diff --git a/net/netfilter/nf_conntrack_l3proto_generic.c b/net/netfilter/nf_conntrack_l3proto_generic.c
deleted file mode 100644
index 397e6911214f..000000000000
--- a/net/netfilter/nf_conntrack_l3proto_generic.c
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
- *
- * Based largely upon the original ip_conntrack code which
- * had the following copyright information:
- *
- * (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Author:
- * Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
- */
-
-#include <linux/types.h>
-#include <linux/ip.h>
-#include <linux/netfilter.h>
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/icmp.h>
-#include <linux/sysctl.h>
-#include <net/ip.h>
-
-#include <linux/netfilter_ipv4.h>
-#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_conntrack_l4proto.h>
-#include <net/netfilter/nf_conntrack_l3proto.h>
-#include <net/netfilter/nf_conntrack_core.h>
-#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
-
-static bool generic_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,
- struct nf_conntrack_tuple *tuple)
-{
- memset(&tuple->src.u3, 0, sizeof(tuple->src.u3));
- memset(&tuple->dst.u3, 0, sizeof(tuple->dst.u3));
-
- return true;
-}
-
-static bool generic_invert_tuple(struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_tuple *orig)
-{
- memset(&tuple->src.u3, 0, sizeof(tuple->src.u3));
- memset(&tuple->dst.u3, 0, sizeof(tuple->dst.u3));
-
- return true;
-}
-
-static int generic_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
- unsigned int *dataoff, u_int8_t *protonum)
-{
- /* Never track !!! */
- return -NF_ACCEPT;
-}
-
-
-struct nf_conntrack_l3proto nf_conntrack_l3proto_generic __read_mostly = {
- .l3proto = PF_UNSPEC,
- .pkt_to_tuple = generic_pkt_to_tuple,
- .invert_tuple = generic_invert_tuple,
- .get_l4proto = generic_get_l4proto,
-};
-EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_generic);
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 20a2e37c76d1..4ae8e528943a 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -38,7 +38,6 @@
#include <net/netfilter/nf_conntrack_expect.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_seqadj.h>
-#include <net/netfilter/nf_conntrack_l3proto.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_tuple.h>
#include <net/netfilter/nf_conntrack_acct.h>
@@ -81,9 +80,26 @@ nla_put_failure:
return -1;
}
+static int ipv4_tuple_to_nlattr(struct sk_buff *skb,
+ const struct nf_conntrack_tuple *tuple)
+{
+ if (nla_put_in_addr(skb, CTA_IP_V4_SRC, tuple->src.u3.ip) ||
+ nla_put_in_addr(skb, CTA_IP_V4_DST, tuple->dst.u3.ip))
+ return -EMSGSIZE;
+ return 0;
+}
+
+static int ipv6_tuple_to_nlattr(struct sk_buff *skb,
+ const struct nf_conntrack_tuple *tuple)
+{
+ if (nla_put_in6_addr(skb, CTA_IP_V6_SRC, &tuple->src.u3.in6) ||
+ nla_put_in6_addr(skb, CTA_IP_V6_DST, &tuple->dst.u3.in6))
+ return -EMSGSIZE;
+ return 0;
+}
+
static int ctnetlink_dump_tuples_ip(struct sk_buff *skb,
- const struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_l3proto *l3proto)
+ const struct nf_conntrack_tuple *tuple)
{
int ret = 0;
struct nlattr *nest_parms;
@@ -92,8 +108,14 @@ static int ctnetlink_dump_tuples_ip(struct sk_buff *skb,
if (!nest_parms)
goto nla_put_failure;
- if (likely(l3proto->tuple_to_nlattr))
- ret = l3proto->tuple_to_nlattr(skb, tuple);
+ switch (tuple->src.l3num) {
+ case NFPROTO_IPV4:
+ ret = ipv4_tuple_to_nlattr(skb, tuple);
+ break;
+ case NFPROTO_IPV6:
+ ret = ipv6_tuple_to_nlattr(skb, tuple);
+ break;
+ }
nla_nest_end(skb, nest_parms);
@@ -106,17 +128,14 @@ nla_put_failure:
static int ctnetlink_dump_tuples(struct sk_buff *skb,
const struct nf_conntrack_tuple *tuple)
{
- const struct nf_conntrack_l3proto *l3proto;
const struct nf_conntrack_l4proto *l4proto;
int ret;
rcu_read_lock();
- l3proto = __nf_ct_l3proto_find(tuple->src.l3num);
- ret = ctnetlink_dump_tuples_ip(skb, tuple, l3proto);
+ ret = ctnetlink_dump_tuples_ip(skb, tuple);
if (ret >= 0) {
- l4proto = __nf_ct_l4proto_find(tuple->src.l3num,
- tuple->dst.protonum);
+ l4proto = __nf_ct_l4proto_find(tuple->dst.protonum);
ret = ctnetlink_dump_tuples_proto(skb, tuple, l4proto);
}
rcu_read_unlock();
@@ -164,7 +183,7 @@ static int ctnetlink_dump_protoinfo(struct sk_buff *skb, struct nf_conn *ct)
struct nlattr *nest_proto;
int ret;
- l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
+ l4proto = __nf_ct_l4proto_find(nf_ct_protonum(ct));
if (!l4proto->to_nlattr)
return 0;
@@ -556,18 +575,23 @@ nla_put_failure:
return -1;
}
+static const struct nla_policy cta_ip_nla_policy[CTA_IP_MAX + 1] = {
+ [CTA_IP_V4_SRC] = { .type = NLA_U32 },
+ [CTA_IP_V4_DST] = { .type = NLA_U32 },
+ [CTA_IP_V6_SRC] = { .len = sizeof(__be32) * 4 },
+ [CTA_IP_V6_DST] = { .len = sizeof(__be32) * 4 },
+};
+
#if defined(CONFIG_NETFILTER_NETLINK_GLUE_CT) || defined(CONFIG_NF_CONNTRACK_EVENTS)
static size_t ctnetlink_proto_size(const struct nf_conn *ct)
{
- const struct nf_conntrack_l3proto *l3proto;
const struct nf_conntrack_l4proto *l4proto;
size_t len, len4 = 0;
- l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct));
- len = l3proto->nla_size;
+ len = nla_policy_len(cta_ip_nla_policy, CTA_IP_MAX + 1);
len *= 3u; /* ORIG, REPLY, MASTER */
- l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
+ l4proto = __nf_ct_l4proto_find(nf_ct_protonum(ct));
len += l4proto->nlattr_size;
if (l4proto->nlattr_tuple_size) {
len4 = l4proto->nlattr_tuple_size();
@@ -796,6 +820,7 @@ static int ctnetlink_done(struct netlink_callback *cb)
}
struct ctnetlink_filter {
+ u8 family;
struct {
u_int32_t val;
u_int32_t mask;
@@ -803,22 +828,45 @@ struct ctnetlink_filter {
};
static struct ctnetlink_filter *
-ctnetlink_alloc_filter(const struct nlattr * const cda[])
+ctnetlink_alloc_filter(const struct nlattr * const cda[], u8 family)
{
-#ifdef CONFIG_NF_CONNTRACK_MARK
struct ctnetlink_filter *filter;
+#ifndef CONFIG_NF_CONNTRACK_MARK
+ if (cda[CTA_MARK] && cda[CTA_MARK_MASK])
+ return ERR_PTR(-EOPNOTSUPP);
+#endif
+
filter = kzalloc(sizeof(*filter), GFP_KERNEL);
if (filter == NULL)
return ERR_PTR(-ENOMEM);
- filter->mark.val = ntohl(nla_get_be32(cda[CTA_MARK]));
- filter->mark.mask = ntohl(nla_get_be32(cda[CTA_MARK_MASK]));
+ filter->family = family;
- return filter;
-#else
- return ERR_PTR(-EOPNOTSUPP);
+#ifdef CONFIG_NF_CONNTRACK_MARK
+ if (cda[CTA_MARK] && cda[CTA_MARK_MASK]) {
+ filter->mark.val = ntohl(nla_get_be32(cda[CTA_MARK]));
+ filter->mark.mask = ntohl(nla_get_be32(cda[CTA_MARK_MASK]));
+ }
#endif
+ return filter;
+}
+
+static int ctnetlink_start(struct netlink_callback *cb)
+{
+ const struct nlattr * const *cda = cb->data;
+ struct ctnetlink_filter *filter = NULL;
+ struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
+ u8 family = nfmsg->nfgen_family;
+
+ if (family || (cda[CTA_MARK] && cda[CTA_MARK_MASK])) {
+ filter = ctnetlink_alloc_filter(cda, family);
+ if (IS_ERR(filter))
+ return PTR_ERR(filter);
+ }
+
+ cb->data = filter;
+ return 0;
}
static int ctnetlink_filter_match(struct nf_conn *ct, void *data)
@@ -826,13 +874,24 @@ static int ctnetlink_filter_match(struct nf_conn *ct, void *data)
struct ctnetlink_filter *filter = data;
if (filter == NULL)
- return 1;
+ goto out;
+
+ /* Match entries of a given L3 protocol number.
+ * If it is not specified, ie. l3proto == 0,
+ * then match everything.
+ */
+ if (filter->family && nf_ct_l3num(ct) != filter->family)
+ goto ignore_entry;
#ifdef CONFIG_NF_CONNTRACK_MARK
- if ((ct->mark & filter->mark.mask) == filter->mark.val)
- return 1;
+ if ((ct->mark & filter->mark.mask) != filter->mark.val)
+ goto ignore_entry;
#endif
+out:
+ return 1;
+
+ignore_entry:
return 0;
}
@@ -843,8 +902,6 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
struct nf_conn *ct, *last;
struct nf_conntrack_tuple_hash *h;
struct hlist_nulls_node *n;
- struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
- u_int8_t l3proto = nfmsg->nfgen_family;
struct nf_conn *nf_ct_evict[8];
int res, i;
spinlock_t *lockp;
@@ -883,11 +940,6 @@ restart:
if (!net_eq(net, nf_ct_net(ct)))
continue;
- /* Dump entries of a given L3 protocol number.
- * If it is not specified, ie. l3proto == 0,
- * then dump everything. */
- if (l3proto && nf_ct_l3num(ct) != l3proto)
- continue;
if (cb->args[1]) {
if (ct != last)
continue;
@@ -936,29 +988,54 @@ out:
return skb->len;
}
+static int ipv4_nlattr_to_tuple(struct nlattr *tb[],
+ struct nf_conntrack_tuple *t)
+{
+ if (!tb[CTA_IP_V4_SRC] || !tb[CTA_IP_V4_DST])
+ return -EINVAL;
+
+ t->src.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_SRC]);
+ t->dst.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_DST]);
+
+ return 0;
+}
+
+static int ipv6_nlattr_to_tuple(struct nlattr *tb[],
+ struct nf_conntrack_tuple *t)
+{
+ if (!tb[CTA_IP_V6_SRC] || !tb[CTA_IP_V6_DST])
+ return -EINVAL;
+
+ t->src.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_SRC]);
+ t->dst.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_DST]);
+
+ return 0;
+}
+
static int ctnetlink_parse_tuple_ip(struct nlattr *attr,
struct nf_conntrack_tuple *tuple)
{
struct nlattr *tb[CTA_IP_MAX+1];
- struct nf_conntrack_l3proto *l3proto;
int ret = 0;
ret = nla_parse_nested(tb, CTA_IP_MAX, attr, NULL, NULL);
if (ret < 0)
return ret;
- rcu_read_lock();
- l3proto = __nf_ct_l3proto_find(tuple->src.l3num);
+ ret = nla_validate_nested(attr, CTA_IP_MAX,
+ cta_ip_nla_policy, NULL);
+ if (ret)
+ return ret;
- if (likely(l3proto->nlattr_to_tuple)) {
- ret = nla_validate_nested(attr, CTA_IP_MAX,
- l3proto->nla_policy, NULL);
- if (ret == 0)
- ret = l3proto->nlattr_to_tuple(tb, tuple);
+ switch (tuple->src.l3num) {
+ case NFPROTO_IPV4:
+ ret = ipv4_nlattr_to_tuple(tb, tuple);
+ break;
+ case NFPROTO_IPV6:
+ ret = ipv6_nlattr_to_tuple(tb, tuple);
+ break;
}
- rcu_read_unlock();
-
return ret;
}
@@ -983,7 +1060,7 @@ static int ctnetlink_parse_tuple_proto(struct nlattr *attr,
tuple->dst.protonum = nla_get_u8(tb[CTA_PROTO_NUM]);
rcu_read_lock();
- l4proto = __nf_ct_l4proto_find(tuple->src.l3num, tuple->dst.protonum);
+ l4proto = __nf_ct_l4proto_find(tuple->dst.protonum);
if (likely(l4proto->nlattr_to_tuple)) {
ret = nla_validate_nested(attr, CTA_PROTO_MAX,
@@ -1148,12 +1225,12 @@ static int ctnetlink_flush_iterate(struct nf_conn *ct, void *data)
static int ctnetlink_flush_conntrack(struct net *net,
const struct nlattr * const cda[],
- u32 portid, int report)
+ u32 portid, int report, u8 family)
{
struct ctnetlink_filter *filter = NULL;
- if (cda[CTA_MARK] && cda[CTA_MARK_MASK]) {
- filter = ctnetlink_alloc_filter(cda);
+ if (family || (cda[CTA_MARK] && cda[CTA_MARK_MASK])) {
+ filter = ctnetlink_alloc_filter(cda, family);
if (IS_ERR(filter))
return PTR_ERR(filter);
}
@@ -1192,7 +1269,7 @@ static int ctnetlink_del_conntrack(struct net *net, struct sock *ctnl,
else {
return ctnetlink_flush_conntrack(net, cda,
NETLINK_CB(skb).portid,
- nlmsg_report(nlh));
+ nlmsg_report(nlh), u3);
}
if (err < 0)
@@ -1240,19 +1317,12 @@ static int ctnetlink_get_conntrack(struct net *net, struct sock *ctnl,
if (nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
+ .start = ctnetlink_start,
.dump = ctnetlink_dump_table,
.done = ctnetlink_done,
+ .data = (void *)cda,
};
- if (cda[CTA_MARK] && cda[CTA_MARK_MASK]) {
- struct ctnetlink_filter *filter;
-
- filter = ctnetlink_alloc_filter(cda);
- if (IS_ERR(filter))
- return PTR_ERR(filter);
-
- c.data = filter;
- }
return netlink_dump_start(ctnl, skb, nlh, &c);
}
@@ -1638,7 +1708,7 @@ static int ctnetlink_change_protoinfo(struct nf_conn *ct,
return err;
rcu_read_lock();
- l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
+ l4proto = __nf_ct_l4proto_find(nf_ct_protonum(ct));
if (l4proto->from_nlattr)
err = l4proto->from_nlattr(tb, ct);
rcu_read_unlock();
@@ -1897,7 +1967,7 @@ ctnetlink_create_conntrack(struct net *net,
} else {
struct nf_conn_help *help;
- help = nf_ct_helper_ext_add(ct, helper, GFP_ATOMIC);
+ help = nf_ct_helper_ext_add(ct, GFP_ATOMIC);
if (help == NULL) {
err = -ENOMEM;
goto err2;
@@ -2581,7 +2651,6 @@ static int ctnetlink_exp_dump_mask(struct sk_buff *skb,
const struct nf_conntrack_tuple *tuple,
const struct nf_conntrack_tuple_mask *mask)
{
- const struct nf_conntrack_l3proto *l3proto;
const struct nf_conntrack_l4proto *l4proto;
struct nf_conntrack_tuple m;
struct nlattr *nest_parms;
@@ -2597,11 +2666,9 @@ static int ctnetlink_exp_dump_mask(struct sk_buff *skb,
goto nla_put_failure;
rcu_read_lock();
- l3proto = __nf_ct_l3proto_find(tuple->src.l3num);
- ret = ctnetlink_dump_tuples_ip(skb, &m, l3proto);
+ ret = ctnetlink_dump_tuples_ip(skb, &m);
if (ret >= 0) {
- l4proto = __nf_ct_l4proto_find(tuple->src.l3num,
- tuple->dst.protonum);
+ l4proto = __nf_ct_l4proto_find(tuple->dst.protonum);
ret = ctnetlink_dump_tuples_proto(skb, &m, l4proto);
}
rcu_read_unlock();
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index d88841fbc560..40643af7137e 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -1,14 +1,4 @@
-/* L3/L4 protocol support for nf_conntrack. */
-
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
- * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
- * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
+// SPDX-License-Identifier: GPL-2.0
#include <linux/types.h>
#include <linux/netfilter.h>
@@ -24,14 +14,36 @@
#include <linux/netdevice.h>
#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_conntrack_l3proto.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_log.h>
-static struct nf_conntrack_l4proto __rcu **nf_ct_protos[NFPROTO_NUMPROTO] __read_mostly;
-struct nf_conntrack_l3proto __rcu *nf_ct_l3protos[NFPROTO_NUMPROTO] __read_mostly;
-EXPORT_SYMBOL_GPL(nf_ct_l3protos);
+#include <linux/ip.h>
+#include <linux/icmp.h>
+#include <linux/sysctl.h>
+#include <net/route.h>
+#include <net/ip.h>
+
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
+#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
+#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
+#include <net/netfilter/nf_nat_helper.h>
+#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+
+#include <linux/ipv6.h>
+#include <linux/in6.h>
+#include <net/ipv6.h>
+#include <net/inet_frag.h>
+
+extern unsigned int nf_conntrack_net_id;
+
+static struct nf_conntrack_l4proto __rcu *nf_ct_protos[MAX_NF_CT_PROTO + 1] __read_mostly;
static DEFINE_MUTEX(nf_ct_proto_mutex);
@@ -112,154 +124,21 @@ void nf_ct_l4proto_log_invalid(const struct sk_buff *skb,
EXPORT_SYMBOL_GPL(nf_ct_l4proto_log_invalid);
#endif
-const struct nf_conntrack_l4proto *
-__nf_ct_l4proto_find(u_int16_t l3proto, u_int8_t l4proto)
+const struct nf_conntrack_l4proto *__nf_ct_l4proto_find(u8 l4proto)
{
- if (unlikely(l3proto >= NFPROTO_NUMPROTO || nf_ct_protos[l3proto] == NULL))
+ if (unlikely(l4proto >= ARRAY_SIZE(nf_ct_protos)))
return &nf_conntrack_l4proto_generic;
- return rcu_dereference(nf_ct_protos[l3proto][l4proto]);
+ return rcu_dereference(nf_ct_protos[l4proto]);
}
EXPORT_SYMBOL_GPL(__nf_ct_l4proto_find);
-/* this is guaranteed to always return a valid protocol helper, since
- * it falls back to generic_protocol */
-const struct nf_conntrack_l3proto *
-nf_ct_l3proto_find_get(u_int16_t l3proto)
-{
- struct nf_conntrack_l3proto *p;
-
- rcu_read_lock();
- p = __nf_ct_l3proto_find(l3proto);
- if (!try_module_get(p->me))
- p = &nf_conntrack_l3proto_generic;
- rcu_read_unlock();
-
- return p;
-}
-EXPORT_SYMBOL_GPL(nf_ct_l3proto_find_get);
-
-int
-nf_ct_l3proto_try_module_get(unsigned short l3proto)
-{
- const struct nf_conntrack_l3proto *p;
- int ret;
-
-retry: p = nf_ct_l3proto_find_get(l3proto);
- if (p == &nf_conntrack_l3proto_generic) {
- ret = request_module("nf_conntrack-%d", l3proto);
- if (!ret)
- goto retry;
-
- return -EPROTOTYPE;
- }
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(nf_ct_l3proto_try_module_get);
-
-void nf_ct_l3proto_module_put(unsigned short l3proto)
-{
- struct nf_conntrack_l3proto *p;
-
- /* rcu_read_lock not necessary since the caller holds a reference, but
- * taken anyways to avoid lockdep warnings in __nf_ct_l3proto_find()
- */
- rcu_read_lock();
- p = __nf_ct_l3proto_find(l3proto);
- module_put(p->me);
- rcu_read_unlock();
-}
-EXPORT_SYMBOL_GPL(nf_ct_l3proto_module_put);
-
-static int nf_ct_netns_do_get(struct net *net, u8 nfproto)
-{
- const struct nf_conntrack_l3proto *l3proto;
- int ret;
-
- might_sleep();
-
- ret = nf_ct_l3proto_try_module_get(nfproto);
- if (ret < 0)
- return ret;
-
- /* we already have a reference, can't fail */
- rcu_read_lock();
- l3proto = __nf_ct_l3proto_find(nfproto);
- rcu_read_unlock();
-
- if (!l3proto->net_ns_get)
- return 0;
-
- ret = l3proto->net_ns_get(net);
- if (ret < 0)
- nf_ct_l3proto_module_put(nfproto);
-
- return ret;
-}
-
-int nf_ct_netns_get(struct net *net, u8 nfproto)
-{
- int err;
-
- if (nfproto == NFPROTO_INET) {
- err = nf_ct_netns_do_get(net, NFPROTO_IPV4);
- if (err < 0)
- goto err1;
- err = nf_ct_netns_do_get(net, NFPROTO_IPV6);
- if (err < 0)
- goto err2;
- } else {
- err = nf_ct_netns_do_get(net, nfproto);
- if (err < 0)
- goto err1;
- }
- return 0;
-
-err2:
- nf_ct_netns_put(net, NFPROTO_IPV4);
-err1:
- return err;
-}
-EXPORT_SYMBOL_GPL(nf_ct_netns_get);
-
-static void nf_ct_netns_do_put(struct net *net, u8 nfproto)
-{
- const struct nf_conntrack_l3proto *l3proto;
-
- might_sleep();
-
- /* same as nf_conntrack_netns_get(), reference assumed */
- rcu_read_lock();
- l3proto = __nf_ct_l3proto_find(nfproto);
- rcu_read_unlock();
-
- if (WARN_ON(!l3proto))
- return;
-
- if (l3proto->net_ns_put)
- l3proto->net_ns_put(net);
-
- nf_ct_l3proto_module_put(nfproto);
-}
-
-void nf_ct_netns_put(struct net *net, uint8_t nfproto)
-{
- if (nfproto == NFPROTO_INET) {
- nf_ct_netns_do_put(net, NFPROTO_IPV4);
- nf_ct_netns_do_put(net, NFPROTO_IPV6);
- } else
- nf_ct_netns_do_put(net, nfproto);
-}
-EXPORT_SYMBOL_GPL(nf_ct_netns_put);
-
-const struct nf_conntrack_l4proto *
-nf_ct_l4proto_find_get(u_int16_t l3num, u_int8_t l4num)
+const struct nf_conntrack_l4proto *nf_ct_l4proto_find_get(u8 l4num)
{
const struct nf_conntrack_l4proto *p;
rcu_read_lock();
- p = __nf_ct_l4proto_find(l3num, l4num);
+ p = __nf_ct_l4proto_find(l4num);
if (!try_module_get(p->me))
p = &nf_conntrack_l4proto_generic;
rcu_read_unlock();
@@ -274,65 +153,13 @@ void nf_ct_l4proto_put(const struct nf_conntrack_l4proto *p)
}
EXPORT_SYMBOL_GPL(nf_ct_l4proto_put);
-static int kill_l3proto(struct nf_conn *i, void *data)
-{
- return nf_ct_l3num(i) == ((const struct nf_conntrack_l3proto *)data)->l3proto;
-}
-
static int kill_l4proto(struct nf_conn *i, void *data)
{
const struct nf_conntrack_l4proto *l4proto;
l4proto = data;
- return nf_ct_protonum(i) == l4proto->l4proto &&
- nf_ct_l3num(i) == l4proto->l3proto;
+ return nf_ct_protonum(i) == l4proto->l4proto;
}
-int nf_ct_l3proto_register(const struct nf_conntrack_l3proto *proto)
-{
- int ret = 0;
- struct nf_conntrack_l3proto *old;
-
- if (proto->l3proto >= NFPROTO_NUMPROTO)
- return -EBUSY;
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
- if (proto->tuple_to_nlattr && proto->nla_size == 0)
- return -EINVAL;
-#endif
- mutex_lock(&nf_ct_proto_mutex);
- old = rcu_dereference_protected(nf_ct_l3protos[proto->l3proto],
- lockdep_is_held(&nf_ct_proto_mutex));
- if (old != &nf_conntrack_l3proto_generic) {
- ret = -EBUSY;
- goto out_unlock;
- }
-
- rcu_assign_pointer(nf_ct_l3protos[proto->l3proto], proto);
-
-out_unlock:
- mutex_unlock(&nf_ct_proto_mutex);
- return ret;
-
-}
-EXPORT_SYMBOL_GPL(nf_ct_l3proto_register);
-
-void nf_ct_l3proto_unregister(const struct nf_conntrack_l3proto *proto)
-{
- BUG_ON(proto->l3proto >= NFPROTO_NUMPROTO);
-
- mutex_lock(&nf_ct_proto_mutex);
- BUG_ON(rcu_dereference_protected(nf_ct_l3protos[proto->l3proto],
- lockdep_is_held(&nf_ct_proto_mutex)
- ) != proto);
- rcu_assign_pointer(nf_ct_l3protos[proto->l3proto],
- &nf_conntrack_l3proto_generic);
- mutex_unlock(&nf_ct_proto_mutex);
-
- synchronize_rcu();
- /* Remove all contrack entries for this protocol */
- nf_ct_iterate_destroy(kill_l3proto, (void*)proto);
-}
-EXPORT_SYMBOL_GPL(nf_ct_l3proto_unregister);
-
static struct nf_proto_net *nf_ct_l4proto_net(struct net *net,
const struct nf_conntrack_l4proto *l4proto)
{
@@ -389,48 +216,20 @@ int nf_ct_l4proto_register_one(const struct nf_conntrack_l4proto *l4proto)
{
int ret = 0;
- if (l4proto->l3proto >= ARRAY_SIZE(nf_ct_protos))
- return -EBUSY;
-
if ((l4proto->to_nlattr && l4proto->nlattr_size == 0) ||
(l4proto->tuple_to_nlattr && !l4proto->nlattr_tuple_size))
return -EINVAL;
mutex_lock(&nf_ct_proto_mutex);
- if (!nf_ct_protos[l4proto->l3proto]) {
- /* l3proto may be loaded latter. */
- struct nf_conntrack_l4proto __rcu **proto_array;
- int i;
-
- proto_array =
- kmalloc_array(MAX_NF_CT_PROTO,
- sizeof(struct nf_conntrack_l4proto *),
- GFP_KERNEL);
- if (proto_array == NULL) {
- ret = -ENOMEM;
- goto out_unlock;
- }
-
- for (i = 0; i < MAX_NF_CT_PROTO; i++)
- RCU_INIT_POINTER(proto_array[i],
- &nf_conntrack_l4proto_generic);
-
- /* Before making proto_array visible to lockless readers,
- * we must make sure its content is committed to memory.
- */
- smp_wmb();
-
- nf_ct_protos[l4proto->l3proto] = proto_array;
- } else if (rcu_dereference_protected(
- nf_ct_protos[l4proto->l3proto][l4proto->l4proto],
+ if (rcu_dereference_protected(
+ nf_ct_protos[l4proto->l4proto],
lockdep_is_held(&nf_ct_proto_mutex)
) != &nf_conntrack_l4proto_generic) {
ret = -EBUSY;
goto out_unlock;
}
- rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto],
- l4proto);
+ rcu_assign_pointer(nf_ct_protos[l4proto->l4proto], l4proto);
out_unlock:
mutex_unlock(&nf_ct_proto_mutex);
return ret;
@@ -444,7 +243,7 @@ int nf_ct_l4proto_pernet_register_one(struct net *net,
struct nf_proto_net *pn = NULL;
if (l4proto->init_net) {
- ret = l4proto->init_net(net, l4proto->l3proto);
+ ret = l4proto->init_net(net);
if (ret < 0)
goto out;
}
@@ -466,13 +265,13 @@ EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_register_one);
static void __nf_ct_l4proto_unregister_one(const struct nf_conntrack_l4proto *l4proto)
{
- BUG_ON(l4proto->l3proto >= ARRAY_SIZE(nf_ct_protos));
+ BUG_ON(l4proto->l4proto >= ARRAY_SIZE(nf_ct_protos));
BUG_ON(rcu_dereference_protected(
- nf_ct_protos[l4proto->l3proto][l4proto->l4proto],
+ nf_ct_protos[l4proto->l4proto],
lockdep_is_held(&nf_ct_proto_mutex)
) != l4proto);
- rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto],
+ rcu_assign_pointer(nf_ct_protos[l4proto->l4proto],
&nf_conntrack_l4proto_generic);
}
@@ -482,7 +281,9 @@ void nf_ct_l4proto_unregister_one(const struct nf_conntrack_l4proto *l4proto)
__nf_ct_l4proto_unregister_one(l4proto);
mutex_unlock(&nf_ct_proto_mutex);
- synchronize_rcu();
+ synchronize_net();
+ /* Remove all contrack entries for this protocol */
+ nf_ct_iterate_destroy(kill_l4proto, (void *)l4proto);
}
EXPORT_SYMBOL_GPL(nf_ct_l4proto_unregister_one);
@@ -499,10 +300,28 @@ void nf_ct_l4proto_pernet_unregister_one(struct net *net,
}
EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_unregister_one);
-int nf_ct_l4proto_register(const struct nf_conntrack_l4proto * const l4proto[],
- unsigned int num_proto)
+static void
+nf_ct_l4proto_unregister(const struct nf_conntrack_l4proto * const l4proto[],
+ unsigned int num_proto)
{
- int ret = -EINVAL, ver;
+ int i;
+
+ mutex_lock(&nf_ct_proto_mutex);
+ for (i = 0; i < num_proto; i++)
+ __nf_ct_l4proto_unregister_one(l4proto[i]);
+ mutex_unlock(&nf_ct_proto_mutex);
+
+ synchronize_net();
+
+ for (i = 0; i < num_proto; i++)
+ nf_ct_iterate_destroy(kill_l4proto, (void *)l4proto[i]);
+}
+
+static int
+nf_ct_l4proto_register(const struct nf_conntrack_l4proto * const l4proto[],
+ unsigned int num_proto)
+{
+ int ret = -EINVAL;
unsigned int i;
for (i = 0; i < num_proto; i++) {
@@ -511,14 +330,12 @@ int nf_ct_l4proto_register(const struct nf_conntrack_l4proto * const l4proto[],
break;
}
if (i != num_proto) {
- ver = l4proto[i]->l3proto == PF_INET6 ? 6 : 4;
- pr_err("nf_conntrack_ipv%d: can't register l4 %d proto.\n",
- ver, l4proto[i]->l4proto);
+ pr_err("nf_conntrack: can't register l4 %d proto.\n",
+ l4proto[i]->l4proto);
nf_ct_l4proto_unregister(l4proto, i);
}
return ret;
}
-EXPORT_SYMBOL_GPL(nf_ct_l4proto_register);
int nf_ct_l4proto_pernet_register(struct net *net,
const struct nf_conntrack_l4proto *const l4proto[],
@@ -533,29 +350,14 @@ int nf_ct_l4proto_pernet_register(struct net *net,
break;
}
if (i != num_proto) {
- pr_err("nf_conntrack_proto_%d %d: pernet registration failed\n",
- l4proto[i]->l4proto,
- l4proto[i]->l3proto == PF_INET6 ? 6 : 4);
+ pr_err("nf_conntrack %d: pernet registration failed\n",
+ l4proto[i]->l4proto);
nf_ct_l4proto_pernet_unregister(net, l4proto, i);
}
return ret;
}
EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_register);
-void nf_ct_l4proto_unregister(const struct nf_conntrack_l4proto * const l4proto[],
- unsigned int num_proto)
-{
- mutex_lock(&nf_ct_proto_mutex);
- while (num_proto-- != 0)
- __nf_ct_l4proto_unregister_one(l4proto[num_proto]);
- mutex_unlock(&nf_ct_proto_mutex);
-
- synchronize_net();
- /* Remove all contrack entries for this protocol */
- nf_ct_iterate_destroy(kill_l4proto, (void *)l4proto);
-}
-EXPORT_SYMBOL_GPL(nf_ct_l4proto_unregister);
-
void nf_ct_l4proto_pernet_unregister(struct net *net,
const struct nf_conntrack_l4proto *const l4proto[],
unsigned int num_proto)
@@ -565,14 +367,582 @@ void nf_ct_l4proto_pernet_unregister(struct net *net,
}
EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_unregister);
+static unsigned int ipv4_helper(void *priv,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+ const struct nf_conn_help *help;
+ const struct nf_conntrack_helper *helper;
+
+ /* This is where we call the helper: as the packet goes out. */
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct || ctinfo == IP_CT_RELATED_REPLY)
+ return NF_ACCEPT;
+
+ help = nfct_help(ct);
+ if (!help)
+ return NF_ACCEPT;
+
+ /* rcu_read_lock()ed by nf_hook_thresh */
+ helper = rcu_dereference(help->helper);
+ if (!helper)
+ return NF_ACCEPT;
+
+ return helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb),
+ ct, ctinfo);
+}
+
+static unsigned int ipv4_confirm(void *priv,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct || ctinfo == IP_CT_RELATED_REPLY)
+ goto out;
+
+ /* adjust seqs for loopback traffic only in outgoing direction */
+ if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
+ !nf_is_loopback_packet(skb)) {
+ if (!nf_ct_seq_adjust(skb, ct, ctinfo, ip_hdrlen(skb))) {
+ NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
+ return NF_DROP;
+ }
+ }
+out:
+ /* We've seen it coming out the other side: confirm it */
+ return nf_conntrack_confirm(skb);
+}
+
+static unsigned int ipv4_conntrack_in(void *priv,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ return nf_conntrack_in(skb, state);
+}
+
+static unsigned int ipv4_conntrack_local(void *priv,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ if (ip_is_fragment(ip_hdr(skb))) { /* IP_NODEFRAG setsockopt set */
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *tmpl;
+
+ tmpl = nf_ct_get(skb, &ctinfo);
+ if (tmpl && nf_ct_is_template(tmpl)) {
+ /* when skipping ct, clear templates to avoid fooling
+ * later targets/matches
+ */
+ skb->_nfct = 0;
+ nf_ct_put(tmpl);
+ }
+ return NF_ACCEPT;
+ }
+
+ return nf_conntrack_in(skb, state);
+}
+
+/* Connection tracking may drop packets, but never alters them, so
+ * make it the first hook.
+ */
+static const struct nf_hook_ops ipv4_conntrack_ops[] = {
+ {
+ .hook = ipv4_conntrack_in,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_PRE_ROUTING,
+ .priority = NF_IP_PRI_CONNTRACK,
+ },
+ {
+ .hook = ipv4_conntrack_local,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_LOCAL_OUT,
+ .priority = NF_IP_PRI_CONNTRACK,
+ },
+ {
+ .hook = ipv4_helper,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_POST_ROUTING,
+ .priority = NF_IP_PRI_CONNTRACK_HELPER,
+ },
+ {
+ .hook = ipv4_confirm,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_POST_ROUTING,
+ .priority = NF_IP_PRI_CONNTRACK_CONFIRM,
+ },
+ {
+ .hook = ipv4_helper,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_LOCAL_IN,
+ .priority = NF_IP_PRI_CONNTRACK_HELPER,
+ },
+ {
+ .hook = ipv4_confirm,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_LOCAL_IN,
+ .priority = NF_IP_PRI_CONNTRACK_CONFIRM,
+ },
+};
+
+/* Fast function for those who don't want to parse /proc (and I don't
+ * blame them).
+ * Reversing the socket's dst/src point of view gives us the reply
+ * mapping.
+ */
+static int
+getorigdst(struct sock *sk, int optval, void __user *user, int *len)
+{
+ const struct inet_sock *inet = inet_sk(sk);
+ const struct nf_conntrack_tuple_hash *h;
+ struct nf_conntrack_tuple tuple;
+
+ memset(&tuple, 0, sizeof(tuple));
+
+ lock_sock(sk);
+ tuple.src.u3.ip = inet->inet_rcv_saddr;
+ tuple.src.u.tcp.port = inet->inet_sport;
+ tuple.dst.u3.ip = inet->inet_daddr;
+ tuple.dst.u.tcp.port = inet->inet_dport;
+ tuple.src.l3num = PF_INET;
+ tuple.dst.protonum = sk->sk_protocol;
+ release_sock(sk);
+
+ /* We only do TCP and SCTP at the moment: is there a better way? */
+ if (tuple.dst.protonum != IPPROTO_TCP &&
+ tuple.dst.protonum != IPPROTO_SCTP) {
+ pr_debug("SO_ORIGINAL_DST: Not a TCP/SCTP socket\n");
+ return -ENOPROTOOPT;
+ }
+
+ if ((unsigned int)*len < sizeof(struct sockaddr_in)) {
+ pr_debug("SO_ORIGINAL_DST: len %d not %zu\n",
+ *len, sizeof(struct sockaddr_in));
+ return -EINVAL;
+ }
+
+ h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple);
+ if (h) {
+ struct sockaddr_in sin;
+ struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
+
+ sin.sin_family = AF_INET;
+ sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
+ .tuple.dst.u.tcp.port;
+ sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
+ .tuple.dst.u3.ip;
+ memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
+
+ pr_debug("SO_ORIGINAL_DST: %pI4 %u\n",
+ &sin.sin_addr.s_addr, ntohs(sin.sin_port));
+ nf_ct_put(ct);
+ if (copy_to_user(user, &sin, sizeof(sin)) != 0)
+ return -EFAULT;
+ else
+ return 0;
+ }
+ pr_debug("SO_ORIGINAL_DST: Can't find %pI4/%u-%pI4/%u.\n",
+ &tuple.src.u3.ip, ntohs(tuple.src.u.tcp.port),
+ &tuple.dst.u3.ip, ntohs(tuple.dst.u.tcp.port));
+ return -ENOENT;
+}
+
+static struct nf_sockopt_ops so_getorigdst = {
+ .pf = PF_INET,
+ .get_optmin = SO_ORIGINAL_DST,
+ .get_optmax = SO_ORIGINAL_DST + 1,
+ .get = getorigdst,
+ .owner = THIS_MODULE,
+};
+
+#if IS_ENABLED(CONFIG_IPV6)
+static int
+ipv6_getorigdst(struct sock *sk, int optval, void __user *user, int *len)
+{
+ struct nf_conntrack_tuple tuple = { .src.l3num = NFPROTO_IPV6 };
+ const struct ipv6_pinfo *inet6 = inet6_sk(sk);
+ const struct inet_sock *inet = inet_sk(sk);
+ const struct nf_conntrack_tuple_hash *h;
+ struct sockaddr_in6 sin6;
+ struct nf_conn *ct;
+ __be32 flow_label;
+ int bound_dev_if;
+
+ lock_sock(sk);
+ tuple.src.u3.in6 = sk->sk_v6_rcv_saddr;
+ tuple.src.u.tcp.port = inet->inet_sport;
+ tuple.dst.u3.in6 = sk->sk_v6_daddr;
+ tuple.dst.u.tcp.port = inet->inet_dport;
+ tuple.dst.protonum = sk->sk_protocol;
+ bound_dev_if = sk->sk_bound_dev_if;
+ flow_label = inet6->flow_label;
+ release_sock(sk);
+
+ if (tuple.dst.protonum != IPPROTO_TCP &&
+ tuple.dst.protonum != IPPROTO_SCTP)
+ return -ENOPROTOOPT;
+
+ if (*len < 0 || (unsigned int)*len < sizeof(sin6))
+ return -EINVAL;
+
+ h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple);
+ if (!h) {
+ pr_debug("IP6T_SO_ORIGINAL_DST: Can't find %pI6c/%u-%pI6c/%u.\n",
+ &tuple.src.u3.ip6, ntohs(tuple.src.u.tcp.port),
+ &tuple.dst.u3.ip6, ntohs(tuple.dst.u.tcp.port));
+ return -ENOENT;
+ }
+
+ ct = nf_ct_tuplehash_to_ctrack(h);
+
+ sin6.sin6_family = AF_INET6;
+ sin6.sin6_port = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.tcp.port;
+ sin6.sin6_flowinfo = flow_label & IPV6_FLOWINFO_MASK;
+ memcpy(&sin6.sin6_addr,
+ &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.in6,
+ sizeof(sin6.sin6_addr));
+
+ nf_ct_put(ct);
+ sin6.sin6_scope_id = ipv6_iface_scope_id(&sin6.sin6_addr, bound_dev_if);
+ return copy_to_user(user, &sin6, sizeof(sin6)) ? -EFAULT : 0;
+}
+
+static struct nf_sockopt_ops so_getorigdst6 = {
+ .pf = NFPROTO_IPV6,
+ .get_optmin = IP6T_SO_ORIGINAL_DST,
+ .get_optmax = IP6T_SO_ORIGINAL_DST + 1,
+ .get = ipv6_getorigdst,
+ .owner = THIS_MODULE,
+};
+
+static unsigned int ipv6_confirm(void *priv,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+ unsigned char pnum = ipv6_hdr(skb)->nexthdr;
+ int protoff;
+ __be16 frag_off;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct || ctinfo == IP_CT_RELATED_REPLY)
+ goto out;
+
+ protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum,
+ &frag_off);
+ if (protoff < 0 || (frag_off & htons(~0x7)) != 0) {
+ pr_debug("proto header not found\n");
+ goto out;
+ }
+
+ /* adjust seqs for loopback traffic only in outgoing direction */
+ if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
+ !nf_is_loopback_packet(skb)) {
+ if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) {
+ NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
+ return NF_DROP;
+ }
+ }
+out:
+ /* We've seen it coming out the other side: confirm it */
+ return nf_conntrack_confirm(skb);
+}
+
+static unsigned int ipv6_conntrack_in(void *priv,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ return nf_conntrack_in(skb, state);
+}
+
+static unsigned int ipv6_conntrack_local(void *priv,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ return nf_conntrack_in(skb, state);
+}
+
+static unsigned int ipv6_helper(void *priv,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct nf_conn *ct;
+ const struct nf_conn_help *help;
+ const struct nf_conntrack_helper *helper;
+ enum ip_conntrack_info ctinfo;
+ __be16 frag_off;
+ int protoff;
+ u8 nexthdr;
+
+ /* This is where we call the helper: as the packet goes out. */
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct || ctinfo == IP_CT_RELATED_REPLY)
+ return NF_ACCEPT;
+
+ help = nfct_help(ct);
+ if (!help)
+ return NF_ACCEPT;
+ /* rcu_read_lock()ed by nf_hook_thresh */
+ helper = rcu_dereference(help->helper);
+ if (!helper)
+ return NF_ACCEPT;
+
+ nexthdr = ipv6_hdr(skb)->nexthdr;
+ protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr,
+ &frag_off);
+ if (protoff < 0 || (frag_off & htons(~0x7)) != 0) {
+ pr_debug("proto header not found\n");
+ return NF_ACCEPT;
+ }
+
+ return helper->help(skb, protoff, ct, ctinfo);
+}
+
+static const struct nf_hook_ops ipv6_conntrack_ops[] = {
+ {
+ .hook = ipv6_conntrack_in,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_PRE_ROUTING,
+ .priority = NF_IP6_PRI_CONNTRACK,
+ },
+ {
+ .hook = ipv6_conntrack_local,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_LOCAL_OUT,
+ .priority = NF_IP6_PRI_CONNTRACK,
+ },
+ {
+ .hook = ipv6_helper,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_POST_ROUTING,
+ .priority = NF_IP6_PRI_CONNTRACK_HELPER,
+ },
+ {
+ .hook = ipv6_confirm,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_POST_ROUTING,
+ .priority = NF_IP6_PRI_LAST,
+ },
+ {
+ .hook = ipv6_helper,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_LOCAL_IN,
+ .priority = NF_IP6_PRI_CONNTRACK_HELPER,
+ },
+ {
+ .hook = ipv6_confirm,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_LOCAL_IN,
+ .priority = NF_IP6_PRI_LAST - 1,
+ },
+};
+#endif
+
+static int nf_ct_tcp_fixup(struct nf_conn *ct, void *_nfproto)
+{
+ u8 nfproto = (unsigned long)_nfproto;
+
+ if (nf_ct_l3num(ct) != nfproto)
+ return 0;
+
+ if (nf_ct_protonum(ct) == IPPROTO_TCP &&
+ ct->proto.tcp.state == TCP_CONNTRACK_ESTABLISHED) {
+ ct->proto.tcp.seen[0].td_maxwin = 0;
+ ct->proto.tcp.seen[1].td_maxwin = 0;
+ }
+
+ return 0;
+}
+
+static int nf_ct_netns_do_get(struct net *net, u8 nfproto)
+{
+ struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id);
+ bool fixup_needed = false;
+ int err = 0;
+
+ mutex_lock(&nf_ct_proto_mutex);
+
+ switch (nfproto) {
+ case NFPROTO_IPV4:
+ cnet->users4++;
+ if (cnet->users4 > 1)
+ goto out_unlock;
+ err = nf_defrag_ipv4_enable(net);
+ if (err) {
+ cnet->users4 = 0;
+ goto out_unlock;
+ }
+
+ err = nf_register_net_hooks(net, ipv4_conntrack_ops,
+ ARRAY_SIZE(ipv4_conntrack_ops));
+ if (err)
+ cnet->users4 = 0;
+ else
+ fixup_needed = true;
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case NFPROTO_IPV6:
+ cnet->users6++;
+ if (cnet->users6 > 1)
+ goto out_unlock;
+ err = nf_defrag_ipv6_enable(net);
+ if (err < 0) {
+ cnet->users6 = 0;
+ goto out_unlock;
+ }
+
+ err = nf_register_net_hooks(net, ipv6_conntrack_ops,
+ ARRAY_SIZE(ipv6_conntrack_ops));
+ if (err)
+ cnet->users6 = 0;
+ else
+ fixup_needed = true;
+ break;
+#endif
+ default:
+ err = -EPROTO;
+ break;
+ }
+ out_unlock:
+ mutex_unlock(&nf_ct_proto_mutex);
+
+ if (fixup_needed)
+ nf_ct_iterate_cleanup_net(net, nf_ct_tcp_fixup,
+ (void *)(unsigned long)nfproto, 0, 0);
+
+ return err;
+}
+
+static void nf_ct_netns_do_put(struct net *net, u8 nfproto)
+{
+ struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id);
+
+ mutex_lock(&nf_ct_proto_mutex);
+ switch (nfproto) {
+ case NFPROTO_IPV4:
+ if (cnet->users4 && (--cnet->users4 == 0))
+ nf_unregister_net_hooks(net, ipv4_conntrack_ops,
+ ARRAY_SIZE(ipv4_conntrack_ops));
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case NFPROTO_IPV6:
+ if (cnet->users6 && (--cnet->users6 == 0))
+ nf_unregister_net_hooks(net, ipv6_conntrack_ops,
+ ARRAY_SIZE(ipv6_conntrack_ops));
+ break;
+#endif
+ }
+
+ mutex_unlock(&nf_ct_proto_mutex);
+}
+
+int nf_ct_netns_get(struct net *net, u8 nfproto)
+{
+ int err;
+
+ if (nfproto == NFPROTO_INET) {
+ err = nf_ct_netns_do_get(net, NFPROTO_IPV4);
+ if (err < 0)
+ goto err1;
+ err = nf_ct_netns_do_get(net, NFPROTO_IPV6);
+ if (err < 0)
+ goto err2;
+ } else {
+ err = nf_ct_netns_do_get(net, nfproto);
+ if (err < 0)
+ goto err1;
+ }
+ return 0;
+
+err2:
+ nf_ct_netns_put(net, NFPROTO_IPV4);
+err1:
+ return err;
+}
+EXPORT_SYMBOL_GPL(nf_ct_netns_get);
+
+void nf_ct_netns_put(struct net *net, uint8_t nfproto)
+{
+ if (nfproto == NFPROTO_INET) {
+ nf_ct_netns_do_put(net, NFPROTO_IPV4);
+ nf_ct_netns_do_put(net, NFPROTO_IPV6);
+ } else {
+ nf_ct_netns_do_put(net, nfproto);
+ }
+}
+EXPORT_SYMBOL_GPL(nf_ct_netns_put);
+
+static const struct nf_conntrack_l4proto * const builtin_l4proto[] = {
+ &nf_conntrack_l4proto_tcp,
+ &nf_conntrack_l4proto_udp,
+ &nf_conntrack_l4proto_icmp,
+#ifdef CONFIG_NF_CT_PROTO_DCCP
+ &nf_conntrack_l4proto_dccp,
+#endif
+#ifdef CONFIG_NF_CT_PROTO_SCTP
+ &nf_conntrack_l4proto_sctp,
+#endif
+#ifdef CONFIG_NF_CT_PROTO_UDPLITE
+ &nf_conntrack_l4proto_udplite,
+#endif
+#if IS_ENABLED(CONFIG_IPV6)
+ &nf_conntrack_l4proto_icmpv6,
+#endif /* CONFIG_IPV6 */
+};
+
+int nf_conntrack_proto_init(void)
+{
+ int ret = 0, i;
+
+ ret = nf_register_sockopt(&so_getorigdst);
+ if (ret < 0)
+ return ret;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ ret = nf_register_sockopt(&so_getorigdst6);
+ if (ret < 0)
+ goto cleanup_sockopt;
+#endif
+
+ for (i = 0; i < ARRAY_SIZE(nf_ct_protos); i++)
+ RCU_INIT_POINTER(nf_ct_protos[i],
+ &nf_conntrack_l4proto_generic);
+
+ ret = nf_ct_l4proto_register(builtin_l4proto,
+ ARRAY_SIZE(builtin_l4proto));
+ if (ret < 0)
+ goto cleanup_sockopt2;
+
+ return ret;
+cleanup_sockopt2:
+ nf_unregister_sockopt(&so_getorigdst);
+#if IS_ENABLED(CONFIG_IPV6)
+cleanup_sockopt:
+ nf_unregister_sockopt(&so_getorigdst6);
+#endif
+ return ret;
+}
+
+void nf_conntrack_proto_fini(void)
+{
+ nf_unregister_sockopt(&so_getorigdst);
+#if IS_ENABLED(CONFIG_IPV6)
+ nf_unregister_sockopt(&so_getorigdst6);
+#endif
+}
+
int nf_conntrack_proto_pernet_init(struct net *net)
{
int err;
struct nf_proto_net *pn = nf_ct_l4proto_net(net,
&nf_conntrack_l4proto_generic);
- err = nf_conntrack_l4proto_generic.init_net(net,
- nf_conntrack_l4proto_generic.l3proto);
+ err = nf_conntrack_l4proto_generic.init_net(net);
if (err < 0)
return err;
err = nf_ct_l4proto_register_sysctl(net,
@@ -581,6 +951,14 @@ int nf_conntrack_proto_pernet_init(struct net *net)
if (err < 0)
return err;
+ err = nf_ct_l4proto_pernet_register(net, builtin_l4proto,
+ ARRAY_SIZE(builtin_l4proto));
+ if (err < 0) {
+ nf_ct_l4proto_unregister_sysctl(net, pn,
+ &nf_conntrack_l4proto_generic);
+ return err;
+ }
+
pn->users++;
return 0;
}
@@ -590,25 +968,19 @@ void nf_conntrack_proto_pernet_fini(struct net *net)
struct nf_proto_net *pn = nf_ct_l4proto_net(net,
&nf_conntrack_l4proto_generic);
+ nf_ct_l4proto_pernet_unregister(net, builtin_l4proto,
+ ARRAY_SIZE(builtin_l4proto));
pn->users--;
nf_ct_l4proto_unregister_sysctl(net,
pn,
&nf_conntrack_l4proto_generic);
}
-int nf_conntrack_proto_init(void)
-{
- unsigned int i;
- for (i = 0; i < NFPROTO_NUMPROTO; i++)
- rcu_assign_pointer(nf_ct_l3protos[i],
- &nf_conntrack_l3proto_generic);
- return 0;
-}
-void nf_conntrack_proto_fini(void)
-{
- unsigned int i;
- /* free l3proto protocol tables */
- for (i = 0; i < ARRAY_SIZE(nf_ct_protos); i++)
- kfree(nf_ct_protos[i]);
-}
+module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
+ &nf_conntrack_htable_size, 0600);
+
+MODULE_ALIAS("ip_conntrack");
+MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET));
+MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET6));
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
index 9ce6336d1e55..171e9e122e5f 100644
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -23,6 +23,7 @@
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_timeout.h>
#include <net/netfilter/nf_log.h>
/* Timeouts are based on values from RFC4340:
@@ -388,41 +389,15 @@ static inline struct nf_dccp_net *dccp_pernet(struct net *net)
return &net->ct.nf_ct_proto.dccp;
}
-static bool dccp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
- struct net *net, struct nf_conntrack_tuple *tuple)
-{
- struct dccp_hdr _hdr, *dh;
-
- /* Actually only need first 4 bytes to get ports. */
- dh = skb_header_pointer(skb, dataoff, 4, &_hdr);
- if (dh == NULL)
- return false;
-
- tuple->src.u.dccp.port = dh->dccph_sport;
- tuple->dst.u.dccp.port = dh->dccph_dport;
- return true;
-}
-
-static bool dccp_invert_tuple(struct nf_conntrack_tuple *inv,
- const struct nf_conntrack_tuple *tuple)
-{
- inv->src.u.dccp.port = tuple->dst.u.dccp.port;
- inv->dst.u.dccp.port = tuple->src.u.dccp.port;
- return true;
-}
-
-static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb,
- unsigned int dataoff, unsigned int *timeouts)
+static noinline bool
+dccp_new(struct nf_conn *ct, const struct sk_buff *skb,
+ const struct dccp_hdr *dh)
{
struct net *net = nf_ct_net(ct);
struct nf_dccp_net *dn;
- struct dccp_hdr _dh, *dh;
const char *msg;
u_int8_t state;
- dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh);
- BUG_ON(dh == NULL);
-
state = dccp_state_table[CT_DCCP_ROLE_CLIENT][dh->dccph_type][CT_DCCP_NONE];
switch (state) {
default:
@@ -460,23 +435,68 @@ static u64 dccp_ack_seq(const struct dccp_hdr *dh)
ntohl(dhack->dccph_ack_nr_low);
}
-static unsigned int *dccp_get_timeouts(struct net *net)
+static bool dccp_error(const struct dccp_hdr *dh,
+ struct sk_buff *skb, unsigned int dataoff,
+ const struct nf_hook_state *state)
{
- return dccp_pernet(net)->dccp_timeout;
+ unsigned int dccp_len = skb->len - dataoff;
+ unsigned int cscov;
+ const char *msg;
+
+ if (dh->dccph_doff * 4 < sizeof(struct dccp_hdr) ||
+ dh->dccph_doff * 4 > dccp_len) {
+ msg = "nf_ct_dccp: truncated/malformed packet ";
+ goto out_invalid;
+ }
+
+ cscov = dccp_len;
+ if (dh->dccph_cscov) {
+ cscov = (dh->dccph_cscov - 1) * 4;
+ if (cscov > dccp_len) {
+ msg = "nf_ct_dccp: bad checksum coverage ";
+ goto out_invalid;
+ }
+ }
+
+ if (state->hook == NF_INET_PRE_ROUTING &&
+ state->net->ct.sysctl_checksum &&
+ nf_checksum_partial(skb, state->hook, dataoff, cscov,
+ IPPROTO_DCCP, state->pf)) {
+ msg = "nf_ct_dccp: bad checksum ";
+ goto out_invalid;
+ }
+
+ if (dh->dccph_type >= DCCP_PKT_INVALID) {
+ msg = "nf_ct_dccp: reserved packet type ";
+ goto out_invalid;
+ }
+ return false;
+out_invalid:
+ nf_l4proto_log_invalid(skb, state->net, state->pf,
+ IPPROTO_DCCP, "%s", msg);
+ return true;
}
-static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb,
+static int dccp_packet(struct nf_conn *ct, struct sk_buff *skb,
unsigned int dataoff, enum ip_conntrack_info ctinfo,
- unsigned int *timeouts)
+ const struct nf_hook_state *state)
{
enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
struct dccp_hdr _dh, *dh;
u_int8_t type, old_state, new_state;
enum ct_dccp_roles role;
+ unsigned int *timeouts;
dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh);
- BUG_ON(dh == NULL);
+ if (!dh)
+ return NF_DROP;
+
+ if (dccp_error(dh, skb, dataoff, state))
+ return -NF_ACCEPT;
+
type = dh->dccph_type;
+ if (!nf_ct_is_confirmed(ct) && !dccp_new(ct, skb, dh))
+ return -NF_ACCEPT;
if (type == DCCP_PKT_RESET &&
!test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
@@ -546,60 +566,14 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb,
if (new_state != old_state)
nf_conntrack_event_cache(IPCT_PROTOINFO, ct);
+ timeouts = nf_ct_timeout_lookup(ct);
+ if (!timeouts)
+ timeouts = dccp_pernet(nf_ct_net(ct))->dccp_timeout;
nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]);
return NF_ACCEPT;
}
-static int dccp_error(struct net *net, struct nf_conn *tmpl,
- struct sk_buff *skb, unsigned int dataoff,
- u_int8_t pf, unsigned int hooknum)
-{
- struct dccp_hdr _dh, *dh;
- unsigned int dccp_len = skb->len - dataoff;
- unsigned int cscov;
- const char *msg;
-
- dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh);
- if (dh == NULL) {
- msg = "nf_ct_dccp: short packet ";
- goto out_invalid;
- }
-
- if (dh->dccph_doff * 4 < sizeof(struct dccp_hdr) ||
- dh->dccph_doff * 4 > dccp_len) {
- msg = "nf_ct_dccp: truncated/malformed packet ";
- goto out_invalid;
- }
-
- cscov = dccp_len;
- if (dh->dccph_cscov) {
- cscov = (dh->dccph_cscov - 1) * 4;
- if (cscov > dccp_len) {
- msg = "nf_ct_dccp: bad checksum coverage ";
- goto out_invalid;
- }
- }
-
- if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
- nf_checksum_partial(skb, hooknum, dataoff, cscov, IPPROTO_DCCP,
- pf)) {
- msg = "nf_ct_dccp: bad checksum ";
- goto out_invalid;
- }
-
- if (dh->dccph_type >= DCCP_PKT_INVALID) {
- msg = "nf_ct_dccp: reserved packet type ";
- goto out_invalid;
- }
-
- return NF_ACCEPT;
-
-out_invalid:
- nf_l4proto_log_invalid(skb, net, pf, IPPROTO_DCCP, "%s", msg);
- return -NF_ACCEPT;
-}
-
static bool dccp_can_early_drop(const struct nf_conn *ct)
{
switch (ct->proto.dccp.state) {
@@ -699,7 +673,7 @@ static int nlattr_to_dccp(struct nlattr *cda[], struct nf_conn *ct)
}
#endif
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_cttimeout.h>
@@ -721,6 +695,8 @@ static int dccp_timeout_nlattr_to_obj(struct nlattr *tb[],
timeouts[i] = ntohl(nla_get_be32(tb[i])) * HZ;
}
}
+
+ timeouts[CTA_TIMEOUT_DCCP_UNSPEC] = timeouts[CTA_TIMEOUT_DCCP_REQUEST];
return 0;
}
@@ -750,7 +726,7 @@ dccp_timeout_nla_policy[CTA_TIMEOUT_DCCP_MAX+1] = {
[CTA_TIMEOUT_DCCP_CLOSING] = { .type = NLA_U32 },
[CTA_TIMEOUT_DCCP_TIMEWAIT] = { .type = NLA_U32 },
};
-#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
#ifdef CONFIG_SYSCTL
/* template, data assigned later */
@@ -836,7 +812,7 @@ static int dccp_kmemdup_sysctl_table(struct net *net, struct nf_proto_net *pn,
return 0;
}
-static int dccp_init_net(struct net *net, u_int16_t proto)
+static int dccp_init_net(struct net *net)
{
struct nf_dccp_net *dn = dccp_pernet(net);
struct nf_proto_net *pn = &dn->pn;
@@ -851,6 +827,11 @@ static int dccp_init_net(struct net *net, u_int16_t proto)
dn->dccp_timeout[CT_DCCP_CLOSEREQ] = 64 * HZ;
dn->dccp_timeout[CT_DCCP_CLOSING] = 64 * HZ;
dn->dccp_timeout[CT_DCCP_TIMEWAIT] = 2 * DCCP_MSL;
+
+ /* timeouts[0] is unused, make it same as SYN_SENT so
+ * ->timeouts[0] contains 'new' timeout, like udp or icmp.
+ */
+ dn->dccp_timeout[CT_DCCP_NONE] = dn->dccp_timeout[CT_DCCP_REQUEST];
}
return dccp_kmemdup_sysctl_table(net, pn, dn);
@@ -861,51 +842,9 @@ static struct nf_proto_net *dccp_get_net_proto(struct net *net)
return &net->ct.nf_ct_proto.dccp.pn;
}
-const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4 = {
- .l3proto = AF_INET,
- .l4proto = IPPROTO_DCCP,
- .pkt_to_tuple = dccp_pkt_to_tuple,
- .invert_tuple = dccp_invert_tuple,
- .new = dccp_new,
- .packet = dccp_packet,
- .get_timeouts = dccp_get_timeouts,
- .error = dccp_error,
- .can_early_drop = dccp_can_early_drop,
-#ifdef CONFIG_NF_CONNTRACK_PROCFS
- .print_conntrack = dccp_print_conntrack,
-#endif
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
- .nlattr_size = DCCP_NLATTR_SIZE,
- .to_nlattr = dccp_to_nlattr,
- .from_nlattr = nlattr_to_dccp,
- .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
- .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size,
- .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
- .nla_policy = nf_ct_port_nla_policy,
-#endif
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
- .ctnl_timeout = {
- .nlattr_to_obj = dccp_timeout_nlattr_to_obj,
- .obj_to_nlattr = dccp_timeout_obj_to_nlattr,
- .nlattr_max = CTA_TIMEOUT_DCCP_MAX,
- .obj_size = sizeof(unsigned int) * CT_DCCP_MAX,
- .nla_policy = dccp_timeout_nla_policy,
- },
-#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
- .init_net = dccp_init_net,
- .get_net_proto = dccp_get_net_proto,
-};
-EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_dccp4);
-
-const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6 = {
- .l3proto = AF_INET6,
+const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp = {
.l4proto = IPPROTO_DCCP,
- .pkt_to_tuple = dccp_pkt_to_tuple,
- .invert_tuple = dccp_invert_tuple,
- .new = dccp_new,
.packet = dccp_packet,
- .get_timeouts = dccp_get_timeouts,
- .error = dccp_error,
.can_early_drop = dccp_can_early_drop,
#ifdef CONFIG_NF_CONNTRACK_PROCFS
.print_conntrack = dccp_print_conntrack,
@@ -919,7 +858,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6 = {
.nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
.nla_policy = nf_ct_port_nla_policy,
#endif
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
.ctnl_timeout = {
.nlattr_to_obj = dccp_timeout_nlattr_to_obj,
.obj_to_nlattr = dccp_timeout_obj_to_nlattr,
@@ -927,8 +866,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6 = {
.obj_size = sizeof(unsigned int) * CT_DCCP_MAX,
.nla_policy = dccp_timeout_nla_policy,
},
-#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
.init_net = dccp_init_net,
.get_net_proto = dccp_get_net_proto,
};
-EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_dccp6);
diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c
index 6c6896d21cd7..e10e867e0b55 100644
--- a/net/netfilter/nf_conntrack_proto_generic.c
+++ b/net/netfilter/nf_conntrack_proto_generic.c
@@ -11,6 +11,7 @@
#include <linux/timer.h>
#include <linux/netfilter.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_timeout.h>
static const unsigned int nf_ct_generic_timeout = 600*HZ;
@@ -41,45 +42,29 @@ static bool generic_pkt_to_tuple(const struct sk_buff *skb,
return true;
}
-static bool generic_invert_tuple(struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_tuple *orig)
-{
- tuple->src.u.all = 0;
- tuple->dst.u.all = 0;
-
- return true;
-}
-
-static unsigned int *generic_get_timeouts(struct net *net)
-{
- return &(generic_pernet(net)->timeout);
-}
-
/* Returns verdict for packet, or -1 for invalid. */
static int generic_packet(struct nf_conn *ct,
- const struct sk_buff *skb,
+ struct sk_buff *skb,
unsigned int dataoff,
enum ip_conntrack_info ctinfo,
- unsigned int *timeout)
+ const struct nf_hook_state *state)
{
- nf_ct_refresh_acct(ct, ctinfo, skb, *timeout);
- return NF_ACCEPT;
-}
-
-/* Called when a new connection for this protocol found. */
-static bool generic_new(struct nf_conn *ct, const struct sk_buff *skb,
- unsigned int dataoff, unsigned int *timeouts)
-{
- bool ret;
+ const unsigned int *timeout = nf_ct_timeout_lookup(ct);
- ret = nf_generic_should_process(nf_ct_protonum(ct));
- if (!ret)
+ if (!nf_generic_should_process(nf_ct_protonum(ct))) {
pr_warn_once("conntrack: generic helper won't handle protocol %d. Please consider loading the specific helper module.\n",
nf_ct_protonum(ct));
- return ret;
+ return -NF_ACCEPT;
+ }
+
+ if (!timeout)
+ timeout = &generic_pernet(nf_ct_net(ct))->timeout;
+
+ nf_ct_refresh_acct(ct, ctinfo, skb, *timeout);
+ return NF_ACCEPT;
}
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_cttimeout.h>
@@ -87,8 +72,11 @@ static bool generic_new(struct nf_conn *ct, const struct sk_buff *skb,
static int generic_timeout_nlattr_to_obj(struct nlattr *tb[],
struct net *net, void *data)
{
- unsigned int *timeout = data;
struct nf_generic_net *gn = generic_pernet(net);
+ unsigned int *timeout = data;
+
+ if (!timeout)
+ timeout = &gn->timeout;
if (tb[CTA_TIMEOUT_GENERIC_TIMEOUT])
*timeout =
@@ -119,7 +107,7 @@ static const struct nla_policy
generic_timeout_nla_policy[CTA_TIMEOUT_GENERIC_MAX+1] = {
[CTA_TIMEOUT_GENERIC_TIMEOUT] = { .type = NLA_U32 },
};
-#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
#ifdef CONFIG_SYSCTL
static struct ctl_table generic_sysctl_table[] = {
@@ -148,7 +136,7 @@ static int generic_kmemdup_sysctl_table(struct nf_proto_net *pn,
return 0;
}
-static int generic_init_net(struct net *net, u_int16_t proto)
+static int generic_init_net(struct net *net)
{
struct nf_generic_net *gn = generic_pernet(net);
struct nf_proto_net *pn = &gn->pn;
@@ -165,14 +153,10 @@ static struct nf_proto_net *generic_get_net_proto(struct net *net)
const struct nf_conntrack_l4proto nf_conntrack_l4proto_generic =
{
- .l3proto = PF_UNSPEC,
.l4proto = 255,
.pkt_to_tuple = generic_pkt_to_tuple,
- .invert_tuple = generic_invert_tuple,
.packet = generic_packet,
- .get_timeouts = generic_get_timeouts,
- .new = generic_new,
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
.ctnl_timeout = {
.nlattr_to_obj = generic_timeout_nlattr_to_obj,
.obj_to_nlattr = generic_timeout_obj_to_nlattr,
@@ -180,7 +164,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_generic =
.obj_size = sizeof(unsigned int),
.nla_policy = generic_timeout_nla_policy,
},
-#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
.init_net = generic_init_net,
.get_net_proto = generic_get_net_proto,
};
diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c
index d049ea5a3770..9b48dc8b4b88 100644
--- a/net/netfilter/nf_conntrack_proto_gre.c
+++ b/net/netfilter/nf_conntrack_proto_gre.c
@@ -39,6 +39,7 @@
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_timeout.h>
#include <linux/netfilter/nf_conntrack_proto_gre.h>
#include <linux/netfilter/nf_conntrack_pptp.h>
@@ -179,15 +180,6 @@ EXPORT_SYMBOL_GPL(nf_ct_gre_keymap_destroy);
/* PUBLIC CONNTRACK PROTO HELPER FUNCTIONS */
-/* invert gre part of tuple */
-static bool gre_invert_tuple(struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_tuple *orig)
-{
- tuple->dst.u.gre.key = orig->src.u.gre.key;
- tuple->src.u.gre.key = orig->dst.u.gre.key;
- return true;
-}
-
/* gre hdr info to tuple */
static bool gre_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
struct net *net, struct nf_conntrack_tuple *tuple)
@@ -241,11 +233,26 @@ static unsigned int *gre_get_timeouts(struct net *net)
/* Returns verdict for packet, and may modify conntrack */
static int gre_packet(struct nf_conn *ct,
- const struct sk_buff *skb,
+ struct sk_buff *skb,
unsigned int dataoff,
enum ip_conntrack_info ctinfo,
- unsigned int *timeouts)
+ const struct nf_hook_state *state)
{
+ if (state->pf != NFPROTO_IPV4)
+ return -NF_ACCEPT;
+
+ if (!nf_ct_is_confirmed(ct)) {
+ unsigned int *timeouts = nf_ct_timeout_lookup(ct);
+
+ if (!timeouts)
+ timeouts = gre_get_timeouts(nf_ct_net(ct));
+
+ /* initialize to sane value. Ideally a conntrack helper
+ * (e.g. in case of pptp) is increasing them */
+ ct->proto.gre.stream_timeout = timeouts[GRE_CT_REPLIED];
+ ct->proto.gre.timeout = timeouts[GRE_CT_UNREPLIED];
+ }
+
/* If we've seen traffic both ways, this is a GRE connection.
* Extend timeout. */
if (ct->status & IPS_SEEN_REPLY) {
@@ -261,21 +268,6 @@ static int gre_packet(struct nf_conn *ct,
return NF_ACCEPT;
}
-/* Called when a new connection for this protocol found. */
-static bool gre_new(struct nf_conn *ct, const struct sk_buff *skb,
- unsigned int dataoff, unsigned int *timeouts)
-{
- pr_debug(": ");
- nf_ct_dump_tuple(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
-
- /* initialize to sane value. Ideally a conntrack helper
- * (e.g. in case of pptp) is increasing them */
- ct->proto.gre.stream_timeout = timeouts[GRE_CT_REPLIED];
- ct->proto.gre.timeout = timeouts[GRE_CT_UNREPLIED];
-
- return true;
-}
-
/* Called when a conntrack entry has already been removed from the hashes
* and is about to be deleted from memory */
static void gre_destroy(struct nf_conn *ct)
@@ -289,7 +281,7 @@ static void gre_destroy(struct nf_conn *ct)
nf_ct_gre_keymap_destroy(master);
}
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_cttimeout.h>
@@ -300,6 +292,8 @@ static int gre_timeout_nlattr_to_obj(struct nlattr *tb[],
unsigned int *timeouts = data;
struct netns_proto_gre *net_gre = gre_pernet(net);
+ if (!timeouts)
+ timeouts = gre_get_timeouts(net);
/* set default timeouts for GRE. */
timeouts[GRE_CT_UNREPLIED] = net_gre->gre_timeouts[GRE_CT_UNREPLIED];
timeouts[GRE_CT_REPLIED] = net_gre->gre_timeouts[GRE_CT_REPLIED];
@@ -336,9 +330,9 @@ gre_timeout_nla_policy[CTA_TIMEOUT_GRE_MAX+1] = {
[CTA_TIMEOUT_GRE_UNREPLIED] = { .type = NLA_U32 },
[CTA_TIMEOUT_GRE_REPLIED] = { .type = NLA_U32 },
};
-#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
-static int gre_init_net(struct net *net, u_int16_t proto)
+static int gre_init_net(struct net *net)
{
struct netns_proto_gre *net_gre = gre_pernet(net);
int i;
@@ -353,16 +347,12 @@ static int gre_init_net(struct net *net, u_int16_t proto)
/* protocol helper struct */
static const struct nf_conntrack_l4proto nf_conntrack_l4proto_gre4 = {
- .l3proto = AF_INET,
.l4proto = IPPROTO_GRE,
.pkt_to_tuple = gre_pkt_to_tuple,
- .invert_tuple = gre_invert_tuple,
#ifdef CONFIG_NF_CONNTRACK_PROCFS
.print_conntrack = gre_print_conntrack,
#endif
- .get_timeouts = gre_get_timeouts,
.packet = gre_packet,
- .new = gre_new,
.destroy = gre_destroy,
.me = THIS_MODULE,
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
@@ -371,7 +361,7 @@ static const struct nf_conntrack_l4proto nf_conntrack_l4proto_gre4 = {
.nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
.nla_policy = nf_ct_port_nla_policy,
#endif
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
.ctnl_timeout = {
.nlattr_to_obj = gre_timeout_nlattr_to_obj,
.obj_to_nlattr = gre_timeout_obj_to_nlattr,
@@ -379,7 +369,7 @@ static const struct nf_conntrack_l4proto nf_conntrack_l4proto_gre4 = {
.obj_size = sizeof(unsigned int) * GRE_CT_MAX,
.nla_policy = gre_timeout_nla_policy,
},
-#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
.net_id = &proto_gre_net_id,
.init_net = gre_init_net,
};
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/netfilter/nf_conntrack_proto_icmp.c
index 5c15beafa711..3598520bd19b 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/netfilter/nf_conntrack_proto_icmp.c
@@ -19,6 +19,7 @@
#include <net/netfilter/nf_conntrack_tuple.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_timeout.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/nf_log.h>
@@ -71,30 +72,17 @@ static bool icmp_invert_tuple(struct nf_conntrack_tuple *tuple,
return true;
}
-static unsigned int *icmp_get_timeouts(struct net *net)
-{
- return &icmp_pernet(net)->timeout;
-}
-
/* Returns verdict for packet, or -1 for invalid. */
static int icmp_packet(struct nf_conn *ct,
- const struct sk_buff *skb,
+ struct sk_buff *skb,
unsigned int dataoff,
enum ip_conntrack_info ctinfo,
- unsigned int *timeout)
+ const struct nf_hook_state *state)
{
/* Do not immediately delete the connection after the first
successful reply to avoid excessive conntrackd traffic
and also to handle correctly ICMP echo reply duplicates. */
- nf_ct_refresh_acct(ct, ctinfo, skb, *timeout);
-
- return NF_ACCEPT;
-}
-
-/* Called when a new connection for this protocol found. */
-static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb,
- unsigned int dataoff, unsigned int *timeouts)
-{
+ unsigned int *timeout = nf_ct_timeout_lookup(ct);
static const u_int8_t valid_new[] = {
[ICMP_ECHO] = 1,
[ICMP_TIMESTAMP] = 1,
@@ -102,21 +90,29 @@ static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb,
[ICMP_ADDRESS] = 1
};
+ if (state->pf != NFPROTO_IPV4)
+ return -NF_ACCEPT;
+
if (ct->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new) ||
!valid_new[ct->tuplehash[0].tuple.dst.u.icmp.type]) {
/* Can't create a new ICMP `conn' with this. */
pr_debug("icmp: can't create new conn with type %u\n",
ct->tuplehash[0].tuple.dst.u.icmp.type);
nf_ct_dump_tuple_ip(&ct->tuplehash[0].tuple);
- return false;
+ return -NF_ACCEPT;
}
- return true;
+
+ if (!timeout)
+ timeout = &icmp_pernet(nf_ct_net(ct))->timeout;
+
+ nf_ct_refresh_acct(ct, ctinfo, skb, *timeout);
+ return NF_ACCEPT;
}
/* Returns conntrack if it dealt with ICMP, and filled in skb fields */
static int
-icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
- unsigned int hooknum)
+icmp_error_message(struct nf_conn *tmpl, struct sk_buff *skb,
+ const struct nf_hook_state *state)
{
struct nf_conntrack_tuple innertuple, origtuple;
const struct nf_conntrack_l4proto *innerproto;
@@ -132,25 +128,24 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
if (!nf_ct_get_tuplepr(skb,
skb_network_offset(skb) + ip_hdrlen(skb)
+ sizeof(struct icmphdr),
- PF_INET, net, &origtuple)) {
+ PF_INET, state->net, &origtuple)) {
pr_debug("icmp_error_message: failed to get tuple\n");
return -NF_ACCEPT;
}
/* rcu_read_lock()ed by nf_hook_thresh */
- innerproto = __nf_ct_l4proto_find(PF_INET, origtuple.dst.protonum);
+ innerproto = __nf_ct_l4proto_find(origtuple.dst.protonum);
/* Ordinarily, we'd expect the inverted tupleproto, but it's
been preserved inside the ICMP. */
- if (!nf_ct_invert_tuple(&innertuple, &origtuple,
- &nf_conntrack_l3proto_ipv4, innerproto)) {
+ if (!nf_ct_invert_tuple(&innertuple, &origtuple, innerproto)) {
pr_debug("icmp_error_message: no match\n");
return -NF_ACCEPT;
}
ctinfo = IP_CT_RELATED;
- h = nf_conntrack_find_get(net, zone, &innertuple);
+ h = nf_conntrack_find_get(state->net, zone, &innertuple);
if (!h) {
pr_debug("icmp_error_message: no match\n");
return -NF_ACCEPT;
@@ -164,17 +159,18 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
return NF_ACCEPT;
}
-static void icmp_error_log(const struct sk_buff *skb, struct net *net,
- u8 pf, const char *msg)
+static void icmp_error_log(const struct sk_buff *skb,
+ const struct nf_hook_state *state,
+ const char *msg)
{
- nf_l4proto_log_invalid(skb, net, pf, IPPROTO_ICMP, "%s", msg);
+ nf_l4proto_log_invalid(skb, state->net, state->pf,
+ IPPROTO_ICMP, "%s", msg);
}
/* Small and modified version of icmp_rcv */
-static int
-icmp_error(struct net *net, struct nf_conn *tmpl,
- struct sk_buff *skb, unsigned int dataoff,
- u8 pf, unsigned int hooknum)
+int nf_conntrack_icmpv4_error(struct nf_conn *tmpl,
+ struct sk_buff *skb, unsigned int dataoff,
+ const struct nf_hook_state *state)
{
const struct icmphdr *icmph;
struct icmphdr _ih;
@@ -182,14 +178,15 @@ icmp_error(struct net *net, struct nf_conn *tmpl,
/* Not enough header? */
icmph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_ih), &_ih);
if (icmph == NULL) {
- icmp_error_log(skb, net, pf, "short packet");
+ icmp_error_log(skb, state, "short packet");
return -NF_ACCEPT;
}
/* See ip_conntrack_proto_tcp.c */
- if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
- nf_ip_checksum(skb, hooknum, dataoff, 0)) {
- icmp_error_log(skb, net, pf, "bad hw icmp checksum");
+ if (state->net->ct.sysctl_checksum &&
+ state->hook == NF_INET_PRE_ROUTING &&
+ nf_ip_checksum(skb, state->hook, dataoff, 0)) {
+ icmp_error_log(skb, state, "bad hw icmp checksum");
return -NF_ACCEPT;
}
@@ -200,7 +197,7 @@ icmp_error(struct net *net, struct nf_conn *tmpl,
* discarded.
*/
if (icmph->type > NR_ICMP_TYPES) {
- icmp_error_log(skb, net, pf, "invalid icmp type");
+ icmp_error_log(skb, state, "invalid icmp type");
return -NF_ACCEPT;
}
@@ -212,7 +209,7 @@ icmp_error(struct net *net, struct nf_conn *tmpl,
icmph->type != ICMP_REDIRECT)
return NF_ACCEPT;
- return icmp_error_message(net, tmpl, skb, hooknum);
+ return icmp_error_message(tmpl, skb, state);
}
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
@@ -269,7 +266,7 @@ static unsigned int icmp_nlattr_tuple_size(void)
}
#endif
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_cttimeout.h>
@@ -281,9 +278,11 @@ static int icmp_timeout_nlattr_to_obj(struct nlattr *tb[],
struct nf_icmp_net *in = icmp_pernet(net);
if (tb[CTA_TIMEOUT_ICMP_TIMEOUT]) {
+ if (!timeout)
+ timeout = &in->timeout;
*timeout =
ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMP_TIMEOUT])) * HZ;
- } else {
+ } else if (timeout) {
/* Set default ICMP timeout. */
*timeout = in->timeout;
}
@@ -307,7 +306,7 @@ static const struct nla_policy
icmp_timeout_nla_policy[CTA_TIMEOUT_ICMP_MAX+1] = {
[CTA_TIMEOUT_ICMP_TIMEOUT] = { .type = NLA_U32 },
};
-#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
#ifdef CONFIG_SYSCTL
static struct ctl_table icmp_sysctl_table[] = {
@@ -336,7 +335,7 @@ static int icmp_kmemdup_sysctl_table(struct nf_proto_net *pn,
return 0;
}
-static int icmp_init_net(struct net *net, u_int16_t proto)
+static int icmp_init_net(struct net *net)
{
struct nf_icmp_net *in = icmp_pernet(net);
struct nf_proto_net *pn = &in->pn;
@@ -353,14 +352,10 @@ static struct nf_proto_net *icmp_get_net_proto(struct net *net)
const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp =
{
- .l3proto = PF_INET,
.l4proto = IPPROTO_ICMP,
.pkt_to_tuple = icmp_pkt_to_tuple,
.invert_tuple = icmp_invert_tuple,
.packet = icmp_packet,
- .get_timeouts = icmp_get_timeouts,
- .new = icmp_new,
- .error = icmp_error,
.destroy = NULL,
.me = NULL,
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
@@ -369,7 +364,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp =
.nlattr_to_tuple = icmp_nlattr_to_tuple,
.nla_policy = icmp_nla_policy,
#endif
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
.ctnl_timeout = {
.nlattr_to_obj = icmp_timeout_nlattr_to_obj,
.obj_to_nlattr = icmp_timeout_obj_to_nlattr,
@@ -377,7 +372,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp =
.obj_size = sizeof(unsigned int),
.nla_policy = icmp_timeout_nla_policy,
},
-#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
.init_net = icmp_init_net,
.get_net_proto = icmp_get_net_proto,
};
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/netfilter/nf_conntrack_proto_icmpv6.c
index 2548e2c8aedd..378618feed5d 100644
--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/netfilter/nf_conntrack_proto_icmpv6.c
@@ -23,6 +23,7 @@
#include <net/netfilter/nf_conntrack_tuple.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_timeout.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/ipv6/nf_conntrack_icmpv6.h>
#include <net/netfilter/nf_log.h>
@@ -91,11 +92,35 @@ static unsigned int *icmpv6_get_timeouts(struct net *net)
/* Returns verdict for packet, or -1 for invalid. */
static int icmpv6_packet(struct nf_conn *ct,
- const struct sk_buff *skb,
- unsigned int dataoff,
- enum ip_conntrack_info ctinfo,
- unsigned int *timeout)
+ struct sk_buff *skb,
+ unsigned int dataoff,
+ enum ip_conntrack_info ctinfo,
+ const struct nf_hook_state *state)
{
+ unsigned int *timeout = nf_ct_timeout_lookup(ct);
+ static const u8 valid_new[] = {
+ [ICMPV6_ECHO_REQUEST - 128] = 1,
+ [ICMPV6_NI_QUERY - 128] = 1
+ };
+
+ if (state->pf != NFPROTO_IPV6)
+ return -NF_ACCEPT;
+
+ if (!nf_ct_is_confirmed(ct)) {
+ int type = ct->tuplehash[0].tuple.dst.u.icmp.type - 128;
+
+ if (type < 0 || type >= sizeof(valid_new) || !valid_new[type]) {
+ /* Can't create a new ICMPv6 `conn' with this. */
+ pr_debug("icmpv6: can't create new conn with type %u\n",
+ type + 128);
+ nf_ct_dump_tuple_ipv6(&ct->tuplehash[0].tuple);
+ return -NF_ACCEPT;
+ }
+ }
+
+ if (!timeout)
+ timeout = icmpv6_get_timeouts(nf_ct_net(ct));
+
/* Do not immediately delete the connection after the first
successful reply to avoid excessive conntrackd traffic
and also to handle correctly ICMP echo reply duplicates. */
@@ -104,26 +129,6 @@ static int icmpv6_packet(struct nf_conn *ct,
return NF_ACCEPT;
}
-/* Called when a new connection for this protocol found. */
-static bool icmpv6_new(struct nf_conn *ct, const struct sk_buff *skb,
- unsigned int dataoff, unsigned int *timeouts)
-{
- static const u_int8_t valid_new[] = {
- [ICMPV6_ECHO_REQUEST - 128] = 1,
- [ICMPV6_NI_QUERY - 128] = 1
- };
- int type = ct->tuplehash[0].tuple.dst.u.icmp.type - 128;
-
- if (type < 0 || type >= sizeof(valid_new) || !valid_new[type]) {
- /* Can't create a new ICMPv6 `conn' with this. */
- pr_debug("icmpv6: can't create new conn with type %u\n",
- type + 128);
- nf_ct_dump_tuple_ipv6(&ct->tuplehash[0].tuple);
- return false;
- }
- return true;
-}
-
static int
icmpv6_error_message(struct net *net, struct nf_conn *tmpl,
struct sk_buff *skb,
@@ -148,12 +153,11 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl,
}
/* rcu_read_lock()ed by nf_hook_thresh */
- inproto = __nf_ct_l4proto_find(PF_INET6, origtuple.dst.protonum);
+ inproto = __nf_ct_l4proto_find(origtuple.dst.protonum);
/* Ordinarily, we'd expect the inverted tupleproto, but it's
been preserved inside the ICMP. */
- if (!nf_ct_invert_tuple(&intuple, &origtuple,
- &nf_conntrack_l3proto_ipv6, inproto)) {
+ if (!nf_ct_invert_tuple(&intuple, &origtuple, inproto)) {
pr_debug("icmpv6_error: Can't invert tuple\n");
return -NF_ACCEPT;
}
@@ -175,16 +179,18 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl,
return NF_ACCEPT;
}
-static void icmpv6_error_log(const struct sk_buff *skb, struct net *net,
- u8 pf, const char *msg)
+static void icmpv6_error_log(const struct sk_buff *skb,
+ const struct nf_hook_state *state,
+ const char *msg)
{
- nf_l4proto_log_invalid(skb, net, pf, IPPROTO_ICMPV6, "%s", msg);
+ nf_l4proto_log_invalid(skb, state->net, state->pf,
+ IPPROTO_ICMPV6, "%s", msg);
}
-static int
-icmpv6_error(struct net *net, struct nf_conn *tmpl,
- struct sk_buff *skb, unsigned int dataoff,
- u8 pf, unsigned int hooknum)
+int nf_conntrack_icmpv6_error(struct nf_conn *tmpl,
+ struct sk_buff *skb,
+ unsigned int dataoff,
+ const struct nf_hook_state *state)
{
const struct icmp6hdr *icmp6h;
struct icmp6hdr _ih;
@@ -192,13 +198,14 @@ icmpv6_error(struct net *net, struct nf_conn *tmpl,
icmp6h = skb_header_pointer(skb, dataoff, sizeof(_ih), &_ih);
if (icmp6h == NULL) {
- icmpv6_error_log(skb, net, pf, "short packet");
+ icmpv6_error_log(skb, state, "short packet");
return -NF_ACCEPT;
}
- if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
- nf_ip6_checksum(skb, hooknum, dataoff, IPPROTO_ICMPV6)) {
- icmpv6_error_log(skb, net, pf, "ICMPv6 checksum failed");
+ if (state->hook == NF_INET_PRE_ROUTING &&
+ state->net->ct.sysctl_checksum &&
+ nf_ip6_checksum(skb, state->hook, dataoff, IPPROTO_ICMPV6)) {
+ icmpv6_error_log(skb, state, "ICMPv6 checksum failed");
return -NF_ACCEPT;
}
@@ -213,7 +220,7 @@ icmpv6_error(struct net *net, struct nf_conn *tmpl,
if (icmp6h->icmp6_type >= 128)
return NF_ACCEPT;
- return icmpv6_error_message(net, tmpl, skb, dataoff);
+ return icmpv6_error_message(state->net, tmpl, skb, dataoff);
}
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
@@ -270,7 +277,7 @@ static unsigned int icmpv6_nlattr_tuple_size(void)
}
#endif
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_cttimeout.h>
@@ -281,6 +288,8 @@ static int icmpv6_timeout_nlattr_to_obj(struct nlattr *tb[],
unsigned int *timeout = data;
struct nf_icmp_net *in = icmpv6_pernet(net);
+ if (!timeout)
+ timeout = icmpv6_get_timeouts(net);
if (tb[CTA_TIMEOUT_ICMPV6_TIMEOUT]) {
*timeout =
ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMPV6_TIMEOUT])) * HZ;
@@ -308,7 +317,7 @@ static const struct nla_policy
icmpv6_timeout_nla_policy[CTA_TIMEOUT_ICMPV6_MAX+1] = {
[CTA_TIMEOUT_ICMPV6_TIMEOUT] = { .type = NLA_U32 },
};
-#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
#ifdef CONFIG_SYSCTL
static struct ctl_table icmpv6_sysctl_table[] = {
@@ -337,7 +346,7 @@ static int icmpv6_kmemdup_sysctl_table(struct nf_proto_net *pn,
return 0;
}
-static int icmpv6_init_net(struct net *net, u_int16_t proto)
+static int icmpv6_init_net(struct net *net)
{
struct nf_icmp_net *in = icmpv6_pernet(net);
struct nf_proto_net *pn = &in->pn;
@@ -354,21 +363,17 @@ static struct nf_proto_net *icmpv6_get_net_proto(struct net *net)
const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 =
{
- .l3proto = PF_INET6,
.l4proto = IPPROTO_ICMPV6,
.pkt_to_tuple = icmpv6_pkt_to_tuple,
.invert_tuple = icmpv6_invert_tuple,
.packet = icmpv6_packet,
- .get_timeouts = icmpv6_get_timeouts,
- .new = icmpv6_new,
- .error = icmpv6_error,
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
.tuple_to_nlattr = icmpv6_tuple_to_nlattr,
.nlattr_tuple_size = icmpv6_nlattr_tuple_size,
.nlattr_to_tuple = icmpv6_nlattr_to_tuple,
.nla_policy = icmpv6_nla_policy,
#endif
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
.ctnl_timeout = {
.nlattr_to_obj = icmpv6_timeout_nlattr_to_obj,
.obj_to_nlattr = icmpv6_timeout_obj_to_nlattr,
@@ -376,7 +381,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 =
.obj_size = sizeof(unsigned int),
.nla_policy = icmpv6_timeout_nla_policy,
},
-#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
.init_net = icmpv6_init_net,
.get_net_proto = icmpv6_get_net_proto,
};
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index fb9a35d16069..3d719d3eb9a3 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -28,6 +28,7 @@
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_timeout.h>
/* FIXME: Examine ipfilter's timeouts and conntrack transitions more
closely. They're more complex. --RR
@@ -150,30 +151,6 @@ static inline struct nf_sctp_net *sctp_pernet(struct net *net)
return &net->ct.nf_ct_proto.sctp;
}
-static bool sctp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
- struct net *net, struct nf_conntrack_tuple *tuple)
-{
- const struct sctphdr *hp;
- struct sctphdr _hdr;
-
- /* Actually only need first 4 bytes to get ports. */
- hp = skb_header_pointer(skb, dataoff, 4, &_hdr);
- if (hp == NULL)
- return false;
-
- tuple->src.u.sctp.port = hp->source;
- tuple->dst.u.sctp.port = hp->dest;
- return true;
-}
-
-static bool sctp_invert_tuple(struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_tuple *orig)
-{
- tuple->src.u.sctp.port = orig->dst.u.sctp.port;
- tuple->dst.u.sctp.port = orig->src.u.sctp.port;
- return true;
-}
-
#ifdef CONFIG_NF_CONNTRACK_PROCFS
/* Print out the private part of the conntrack. */
static void sctp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
@@ -296,17 +273,100 @@ static int sctp_new_state(enum ip_conntrack_dir dir,
return sctp_conntracks[dir][i][cur_state];
}
-static unsigned int *sctp_get_timeouts(struct net *net)
+/* Don't need lock here: this conntrack not in circulation yet */
+static noinline bool
+sctp_new(struct nf_conn *ct, const struct sk_buff *skb,
+ const struct sctphdr *sh, unsigned int dataoff)
{
- return sctp_pernet(net)->timeouts;
+ enum sctp_conntrack new_state;
+ const struct sctp_chunkhdr *sch;
+ struct sctp_chunkhdr _sch;
+ u32 offset, count;
+
+ memset(&ct->proto.sctp, 0, sizeof(ct->proto.sctp));
+ new_state = SCTP_CONNTRACK_MAX;
+ for_each_sctp_chunk(skb, sch, _sch, offset, dataoff, count) {
+ new_state = sctp_new_state(IP_CT_DIR_ORIGINAL,
+ SCTP_CONNTRACK_NONE, sch->type);
+
+ /* Invalid: delete conntrack */
+ if (new_state == SCTP_CONNTRACK_NONE ||
+ new_state == SCTP_CONNTRACK_MAX) {
+ pr_debug("nf_conntrack_sctp: invalid new deleting.\n");
+ return false;
+ }
+
+ /* Copy the vtag into the state info */
+ if (sch->type == SCTP_CID_INIT) {
+ struct sctp_inithdr _inithdr, *ih;
+ /* Sec 8.5.1 (A) */
+ if (sh->vtag)
+ return false;
+
+ ih = skb_header_pointer(skb, offset + sizeof(_sch),
+ sizeof(_inithdr), &_inithdr);
+ if (!ih)
+ return false;
+
+ pr_debug("Setting vtag %x for new conn\n",
+ ih->init_tag);
+
+ ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = ih->init_tag;
+ } else if (sch->type == SCTP_CID_HEARTBEAT) {
+ pr_debug("Setting vtag %x for secondary conntrack\n",
+ sh->vtag);
+ ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] = sh->vtag;
+ } else {
+ /* If it is a shutdown ack OOTB packet, we expect a return
+ shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */
+ pr_debug("Setting vtag %x for new conn OOTB\n",
+ sh->vtag);
+ ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = sh->vtag;
+ }
+
+ ct->proto.sctp.state = new_state;
+ }
+
+ return true;
+}
+
+static bool sctp_error(struct sk_buff *skb,
+ unsigned int dataoff,
+ const struct nf_hook_state *state)
+{
+ const struct sctphdr *sh;
+ const char *logmsg;
+
+ if (skb->len < dataoff + sizeof(struct sctphdr)) {
+ logmsg = "nf_ct_sctp: short packet ";
+ goto out_invalid;
+ }
+ if (state->hook == NF_INET_PRE_ROUTING &&
+ state->net->ct.sysctl_checksum &&
+ skb->ip_summed == CHECKSUM_NONE) {
+ if (!skb_make_writable(skb, dataoff + sizeof(struct sctphdr))) {
+ logmsg = "nf_ct_sctp: failed to read header ";
+ goto out_invalid;
+ }
+ sh = (const struct sctphdr *)(skb->data + dataoff);
+ if (sh->checksum != sctp_compute_cksum(skb, dataoff)) {
+ logmsg = "nf_ct_sctp: bad CRC ";
+ goto out_invalid;
+ }
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ }
+ return false;
+out_invalid:
+ nf_l4proto_log_invalid(skb, state->net, state->pf, IPPROTO_SCTP, "%s", logmsg);
+ return true;
}
/* Returns verdict for packet, or -NF_ACCEPT for invalid. */
static int sctp_packet(struct nf_conn *ct,
- const struct sk_buff *skb,
+ struct sk_buff *skb,
unsigned int dataoff,
enum ip_conntrack_info ctinfo,
- unsigned int *timeouts)
+ const struct nf_hook_state *state)
{
enum sctp_conntrack new_state, old_state;
enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
@@ -315,8 +375,12 @@ static int sctp_packet(struct nf_conn *ct,
const struct sctp_chunkhdr *sch;
struct sctp_chunkhdr _sch;
u_int32_t offset, count;
+ unsigned int *timeouts;
unsigned long map[256 / sizeof(unsigned long)] = { 0 };
+ if (sctp_error(skb, dataoff, state))
+ return -NF_ACCEPT;
+
sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph);
if (sh == NULL)
goto out;
@@ -324,6 +388,17 @@ static int sctp_packet(struct nf_conn *ct,
if (do_basic_checks(ct, skb, dataoff, map) != 0)
goto out;
+ if (!nf_ct_is_confirmed(ct)) {
+ /* If an OOTB packet has any of these chunks discard (Sec 8.4) */
+ if (test_bit(SCTP_CID_ABORT, map) ||
+ test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) ||
+ test_bit(SCTP_CID_COOKIE_ACK, map))
+ return -NF_ACCEPT;
+
+ if (!sctp_new(ct, skb, sh, dataoff))
+ return -NF_ACCEPT;
+ }
+
/* Check the verification tag (Sec 8.5) */
if (!test_bit(SCTP_CID_INIT, map) &&
!test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) &&
@@ -403,6 +478,10 @@ static int sctp_packet(struct nf_conn *ct,
}
spin_unlock_bh(&ct->lock);
+ timeouts = nf_ct_timeout_lookup(ct);
+ if (!timeouts)
+ timeouts = sctp_pernet(nf_ct_net(ct))->timeouts;
+
nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]);
if (old_state == SCTP_CONNTRACK_COOKIE_ECHOED &&
@@ -421,110 +500,6 @@ out:
return -NF_ACCEPT;
}
-/* Called when a new connection for this protocol found. */
-static bool sctp_new(struct nf_conn *ct, const struct sk_buff *skb,
- unsigned int dataoff, unsigned int *timeouts)
-{
- enum sctp_conntrack new_state;
- const struct sctphdr *sh;
- struct sctphdr _sctph;
- const struct sctp_chunkhdr *sch;
- struct sctp_chunkhdr _sch;
- u_int32_t offset, count;
- unsigned long map[256 / sizeof(unsigned long)] = { 0 };
-
- sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph);
- if (sh == NULL)
- return false;
-
- if (do_basic_checks(ct, skb, dataoff, map) != 0)
- return false;
-
- /* If an OOTB packet has any of these chunks discard (Sec 8.4) */
- if (test_bit(SCTP_CID_ABORT, map) ||
- test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) ||
- test_bit(SCTP_CID_COOKIE_ACK, map))
- return false;
-
- memset(&ct->proto.sctp, 0, sizeof(ct->proto.sctp));
- new_state = SCTP_CONNTRACK_MAX;
- for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) {
- /* Don't need lock here: this conntrack not in circulation yet */
- new_state = sctp_new_state(IP_CT_DIR_ORIGINAL,
- SCTP_CONNTRACK_NONE, sch->type);
-
- /* Invalid: delete conntrack */
- if (new_state == SCTP_CONNTRACK_NONE ||
- new_state == SCTP_CONNTRACK_MAX) {
- pr_debug("nf_conntrack_sctp: invalid new deleting.\n");
- return false;
- }
-
- /* Copy the vtag into the state info */
- if (sch->type == SCTP_CID_INIT) {
- struct sctp_inithdr _inithdr, *ih;
- /* Sec 8.5.1 (A) */
- if (sh->vtag)
- return false;
-
- ih = skb_header_pointer(skb, offset + sizeof(_sch),
- sizeof(_inithdr), &_inithdr);
- if (!ih)
- return false;
-
- pr_debug("Setting vtag %x for new conn\n",
- ih->init_tag);
-
- ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = ih->init_tag;
- } else if (sch->type == SCTP_CID_HEARTBEAT) {
- pr_debug("Setting vtag %x for secondary conntrack\n",
- sh->vtag);
- ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] = sh->vtag;
- }
- /* If it is a shutdown ack OOTB packet, we expect a return
- shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */
- else {
- pr_debug("Setting vtag %x for new conn OOTB\n",
- sh->vtag);
- ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = sh->vtag;
- }
-
- ct->proto.sctp.state = new_state;
- }
-
- return true;
-}
-
-static int sctp_error(struct net *net, struct nf_conn *tpl, struct sk_buff *skb,
- unsigned int dataoff,
- u8 pf, unsigned int hooknum)
-{
- const struct sctphdr *sh;
- const char *logmsg;
-
- if (skb->len < dataoff + sizeof(struct sctphdr)) {
- logmsg = "nf_ct_sctp: short packet ";
- goto out_invalid;
- }
- if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
- skb->ip_summed == CHECKSUM_NONE) {
- if (!skb_make_writable(skb, dataoff + sizeof(struct sctphdr))) {
- logmsg = "nf_ct_sctp: failed to read header ";
- goto out_invalid;
- }
- sh = (const struct sctphdr *)(skb->data + dataoff);
- if (sh->checksum != sctp_compute_cksum(skb, dataoff)) {
- logmsg = "nf_ct_sctp: bad CRC ";
- goto out_invalid;
- }
- skb->ip_summed = CHECKSUM_UNNECESSARY;
- }
- return NF_ACCEPT;
-out_invalid:
- nf_l4proto_log_invalid(skb, net, pf, IPPROTO_SCTP, "%s", logmsg);
- return -NF_ACCEPT;
-}
-
static bool sctp_can_early_drop(const struct nf_conn *ct)
{
switch (ct->proto.sctp.state) {
@@ -615,7 +590,7 @@ static int nlattr_to_sctp(struct nlattr *cda[], struct nf_conn *ct)
}
#endif
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_cttimeout.h>
@@ -637,6 +612,8 @@ static int sctp_timeout_nlattr_to_obj(struct nlattr *tb[],
timeouts[i] = ntohl(nla_get_be32(tb[i])) * HZ;
}
}
+
+ timeouts[CTA_TIMEOUT_SCTP_UNSPEC] = timeouts[CTA_TIMEOUT_SCTP_CLOSED];
return 0;
}
@@ -668,7 +645,7 @@ sctp_timeout_nla_policy[CTA_TIMEOUT_SCTP_MAX+1] = {
[CTA_TIMEOUT_SCTP_HEARTBEAT_SENT] = { .type = NLA_U32 },
[CTA_TIMEOUT_SCTP_HEARTBEAT_ACKED] = { .type = NLA_U32 },
};
-#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
#ifdef CONFIG_SYSCTL
@@ -757,7 +734,7 @@ static int sctp_kmemdup_sysctl_table(struct nf_proto_net *pn,
return 0;
}
-static int sctp_init_net(struct net *net, u_int16_t proto)
+static int sctp_init_net(struct net *net)
{
struct nf_sctp_net *sn = sctp_pernet(net);
struct nf_proto_net *pn = &sn->pn;
@@ -767,6 +744,11 @@ static int sctp_init_net(struct net *net, u_int16_t proto)
for (i = 0; i < SCTP_CONNTRACK_MAX; i++)
sn->timeouts[i] = sctp_timeouts[i];
+
+ /* timeouts[0] is unused, init it so ->timeouts[0] contains
+ * 'new' timeout, like udp or icmp.
+ */
+ sn->timeouts[0] = sctp_timeouts[SCTP_CONNTRACK_CLOSED];
}
return sctp_kmemdup_sysctl_table(pn, sn);
@@ -777,18 +759,12 @@ static struct nf_proto_net *sctp_get_net_proto(struct net *net)
return &net->ct.nf_ct_proto.sctp.pn;
}
-const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 = {
- .l3proto = PF_INET,
+const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp = {
.l4proto = IPPROTO_SCTP,
- .pkt_to_tuple = sctp_pkt_to_tuple,
- .invert_tuple = sctp_invert_tuple,
#ifdef CONFIG_NF_CONNTRACK_PROCFS
.print_conntrack = sctp_print_conntrack,
#endif
.packet = sctp_packet,
- .get_timeouts = sctp_get_timeouts,
- .new = sctp_new,
- .error = sctp_error,
.can_early_drop = sctp_can_early_drop,
.me = THIS_MODULE,
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
@@ -800,7 +776,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 = {
.nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
.nla_policy = nf_ct_port_nla_policy,
#endif
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
.ctnl_timeout = {
.nlattr_to_obj = sctp_timeout_nlattr_to_obj,
.obj_to_nlattr = sctp_timeout_obj_to_nlattr,
@@ -808,45 +784,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 = {
.obj_size = sizeof(unsigned int) * SCTP_CONNTRACK_MAX,
.nla_policy = sctp_timeout_nla_policy,
},
-#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
- .init_net = sctp_init_net,
- .get_net_proto = sctp_get_net_proto,
-};
-EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_sctp4);
-
-const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 = {
- .l3proto = PF_INET6,
- .l4proto = IPPROTO_SCTP,
- .pkt_to_tuple = sctp_pkt_to_tuple,
- .invert_tuple = sctp_invert_tuple,
-#ifdef CONFIG_NF_CONNTRACK_PROCFS
- .print_conntrack = sctp_print_conntrack,
-#endif
- .packet = sctp_packet,
- .get_timeouts = sctp_get_timeouts,
- .new = sctp_new,
- .error = sctp_error,
- .can_early_drop = sctp_can_early_drop,
- .me = THIS_MODULE,
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
- .nlattr_size = SCTP_NLATTR_SIZE,
- .to_nlattr = sctp_to_nlattr,
- .from_nlattr = nlattr_to_sctp,
- .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
- .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size,
- .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
- .nla_policy = nf_ct_port_nla_policy,
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
- .ctnl_timeout = {
- .nlattr_to_obj = sctp_timeout_nlattr_to_obj,
- .obj_to_nlattr = sctp_timeout_obj_to_nlattr,
- .nlattr_max = CTA_TIMEOUT_SCTP_MAX,
- .obj_size = sizeof(unsigned int) * SCTP_CONNTRACK_MAX,
- .nla_policy = sctp_timeout_nla_policy,
- },
-#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
-#endif
+#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
.init_net = sctp_init_net,
.get_net_proto = sctp_get_net_proto,
};
-EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_sctp6);
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 8e67910185a0..1bcf9984d45e 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -29,6 +29,7 @@
#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netfilter/nf_conntrack_synproxy.h>
+#include <net/netfilter/nf_conntrack_timeout.h>
#include <net/netfilter/nf_log.h>
#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
@@ -276,31 +277,6 @@ static inline struct nf_tcp_net *tcp_pernet(struct net *net)
return &net->ct.nf_ct_proto.tcp;
}
-static bool tcp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
- struct net *net, struct nf_conntrack_tuple *tuple)
-{
- const struct tcphdr *hp;
- struct tcphdr _hdr;
-
- /* Actually only need first 4 bytes to get ports. */
- hp = skb_header_pointer(skb, dataoff, 4, &_hdr);
- if (hp == NULL)
- return false;
-
- tuple->src.u.tcp.port = hp->source;
- tuple->dst.u.tcp.port = hp->dest;
-
- return true;
-}
-
-static bool tcp_invert_tuple(struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_tuple *orig)
-{
- tuple->src.u.tcp.port = orig->dst.u.tcp.port;
- tuple->dst.u.tcp.port = orig->src.u.tcp.port;
- return true;
-}
-
#ifdef CONFIG_NF_CONNTRACK_PROCFS
/* Print out the private part of the conntrack. */
static void tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
@@ -741,35 +717,26 @@ static const u8 tcp_valid_flags[(TCPHDR_FIN|TCPHDR_SYN|TCPHDR_RST|TCPHDR_ACK|
[TCPHDR_ACK|TCPHDR_URG] = 1,
};
-static void tcp_error_log(const struct sk_buff *skb, struct net *net,
- u8 pf, const char *msg)
+static void tcp_error_log(const struct sk_buff *skb,
+ const struct nf_hook_state *state,
+ const char *msg)
{
- nf_l4proto_log_invalid(skb, net, pf, IPPROTO_TCP, "%s", msg);
+ nf_l4proto_log_invalid(skb, state->net, state->pf, IPPROTO_TCP, "%s", msg);
}
/* Protect conntrack agaist broken packets. Code taken from ipt_unclean.c. */
-static int tcp_error(struct net *net, struct nf_conn *tmpl,
- struct sk_buff *skb,
- unsigned int dataoff,
- u_int8_t pf,
- unsigned int hooknum)
+static bool tcp_error(const struct tcphdr *th,
+ struct sk_buff *skb,
+ unsigned int dataoff,
+ const struct nf_hook_state *state)
{
- const struct tcphdr *th;
- struct tcphdr _tcph;
unsigned int tcplen = skb->len - dataoff;
- u_int8_t tcpflags;
-
- /* Smaller that minimal TCP header? */
- th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
- if (th == NULL) {
- tcp_error_log(skb, net, pf, "short packet");
- return -NF_ACCEPT;
- }
+ u8 tcpflags;
/* Not whole TCP header or malformed packet */
if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) {
- tcp_error_log(skb, net, pf, "truncated packet");
- return -NF_ACCEPT;
+ tcp_error_log(skb, state, "truncated packet");
+ return true;
}
/* Checksum invalid? Ignore.
@@ -777,46 +744,121 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl,
* because the checksum is assumed to be correct.
*/
/* FIXME: Source route IP option packets --RR */
- if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
- nf_checksum(skb, hooknum, dataoff, IPPROTO_TCP, pf)) {
- tcp_error_log(skb, net, pf, "bad checksum");
- return -NF_ACCEPT;
+ if (state->net->ct.sysctl_checksum &&
+ state->hook == NF_INET_PRE_ROUTING &&
+ nf_checksum(skb, state->hook, dataoff, IPPROTO_TCP, state->pf)) {
+ tcp_error_log(skb, state, "bad checksum");
+ return true;
}
/* Check TCP flags. */
tcpflags = (tcp_flag_byte(th) & ~(TCPHDR_ECE|TCPHDR_CWR|TCPHDR_PSH));
if (!tcp_valid_flags[tcpflags]) {
- tcp_error_log(skb, net, pf, "invalid tcp flag combination");
- return -NF_ACCEPT;
+ tcp_error_log(skb, state, "invalid tcp flag combination");
+ return true;
}
- return NF_ACCEPT;
+ return false;
}
-static unsigned int *tcp_get_timeouts(struct net *net)
+static noinline bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
+ unsigned int dataoff,
+ const struct tcphdr *th)
{
- return tcp_pernet(net)->timeouts;
+ enum tcp_conntrack new_state;
+ struct net *net = nf_ct_net(ct);
+ const struct nf_tcp_net *tn = tcp_pernet(net);
+ const struct ip_ct_tcp_state *sender = &ct->proto.tcp.seen[0];
+ const struct ip_ct_tcp_state *receiver = &ct->proto.tcp.seen[1];
+
+ /* Don't need lock here: this conntrack not in circulation yet */
+ new_state = tcp_conntracks[0][get_conntrack_index(th)][TCP_CONNTRACK_NONE];
+
+ /* Invalid: delete conntrack */
+ if (new_state >= TCP_CONNTRACK_MAX) {
+ pr_debug("nf_ct_tcp: invalid new deleting.\n");
+ return false;
+ }
+
+ if (new_state == TCP_CONNTRACK_SYN_SENT) {
+ memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp));
+ /* SYN packet */
+ ct->proto.tcp.seen[0].td_end =
+ segment_seq_plus_len(ntohl(th->seq), skb->len,
+ dataoff, th);
+ ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
+ if (ct->proto.tcp.seen[0].td_maxwin == 0)
+ ct->proto.tcp.seen[0].td_maxwin = 1;
+ ct->proto.tcp.seen[0].td_maxend =
+ ct->proto.tcp.seen[0].td_end;
+
+ tcp_options(skb, dataoff, th, &ct->proto.tcp.seen[0]);
+ } else if (tn->tcp_loose == 0) {
+ /* Don't try to pick up connections. */
+ return false;
+ } else {
+ memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp));
+ /*
+ * We are in the middle of a connection,
+ * its history is lost for us.
+ * Let's try to use the data from the packet.
+ */
+ ct->proto.tcp.seen[0].td_end =
+ segment_seq_plus_len(ntohl(th->seq), skb->len,
+ dataoff, th);
+ ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
+ if (ct->proto.tcp.seen[0].td_maxwin == 0)
+ ct->proto.tcp.seen[0].td_maxwin = 1;
+ ct->proto.tcp.seen[0].td_maxend =
+ ct->proto.tcp.seen[0].td_end +
+ ct->proto.tcp.seen[0].td_maxwin;
+
+ /* We assume SACK and liberal window checking to handle
+ * window scaling */
+ ct->proto.tcp.seen[0].flags =
+ ct->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM |
+ IP_CT_TCP_FLAG_BE_LIBERAL;
+ }
+
+ /* tcp_packet will set them */
+ ct->proto.tcp.last_index = TCP_NONE_SET;
+
+ pr_debug("%s: sender end=%u maxend=%u maxwin=%u scale=%i "
+ "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
+ __func__,
+ sender->td_end, sender->td_maxend, sender->td_maxwin,
+ sender->td_scale,
+ receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
+ receiver->td_scale);
+ return true;
}
/* Returns verdict for packet, or -1 for invalid. */
static int tcp_packet(struct nf_conn *ct,
- const struct sk_buff *skb,
+ struct sk_buff *skb,
unsigned int dataoff,
enum ip_conntrack_info ctinfo,
- unsigned int *timeouts)
+ const struct nf_hook_state *state)
{
struct net *net = nf_ct_net(ct);
struct nf_tcp_net *tn = tcp_pernet(net);
struct nf_conntrack_tuple *tuple;
enum tcp_conntrack new_state, old_state;
+ unsigned int index, *timeouts;
enum ip_conntrack_dir dir;
const struct tcphdr *th;
struct tcphdr _tcph;
unsigned long timeout;
- unsigned int index;
th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
- BUG_ON(th == NULL);
+ if (th == NULL)
+ return -NF_ACCEPT;
+
+ if (tcp_error(th, skb, dataoff, state))
+ return -NF_ACCEPT;
+
+ if (!nf_ct_is_confirmed(ct) && !tcp_new(ct, skb, dataoff, th))
+ return -NF_ACCEPT;
spin_lock_bh(&ct->lock);
old_state = ct->proto.tcp.state;
@@ -1046,6 +1088,10 @@ static int tcp_packet(struct nf_conn *ct,
&& new_state == TCP_CONNTRACK_FIN_WAIT)
ct->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT;
+ timeouts = nf_ct_timeout_lookup(ct);
+ if (!timeouts)
+ timeouts = tn->timeouts;
+
if (ct->proto.tcp.retrans >= tn->tcp_max_retrans &&
timeouts[new_state] > timeouts[TCP_CONNTRACK_RETRANS])
timeout = timeouts[TCP_CONNTRACK_RETRANS];
@@ -1093,82 +1139,6 @@ static int tcp_packet(struct nf_conn *ct,
return NF_ACCEPT;
}
-/* Called when a new connection for this protocol found. */
-static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
- unsigned int dataoff, unsigned int *timeouts)
-{
- enum tcp_conntrack new_state;
- const struct tcphdr *th;
- struct tcphdr _tcph;
- struct net *net = nf_ct_net(ct);
- struct nf_tcp_net *tn = tcp_pernet(net);
- const struct ip_ct_tcp_state *sender = &ct->proto.tcp.seen[0];
- const struct ip_ct_tcp_state *receiver = &ct->proto.tcp.seen[1];
-
- th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
- BUG_ON(th == NULL);
-
- /* Don't need lock here: this conntrack not in circulation yet */
- new_state = tcp_conntracks[0][get_conntrack_index(th)][TCP_CONNTRACK_NONE];
-
- /* Invalid: delete conntrack */
- if (new_state >= TCP_CONNTRACK_MAX) {
- pr_debug("nf_ct_tcp: invalid new deleting.\n");
- return false;
- }
-
- if (new_state == TCP_CONNTRACK_SYN_SENT) {
- memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp));
- /* SYN packet */
- ct->proto.tcp.seen[0].td_end =
- segment_seq_plus_len(ntohl(th->seq), skb->len,
- dataoff, th);
- ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
- if (ct->proto.tcp.seen[0].td_maxwin == 0)
- ct->proto.tcp.seen[0].td_maxwin = 1;
- ct->proto.tcp.seen[0].td_maxend =
- ct->proto.tcp.seen[0].td_end;
-
- tcp_options(skb, dataoff, th, &ct->proto.tcp.seen[0]);
- } else if (tn->tcp_loose == 0) {
- /* Don't try to pick up connections. */
- return false;
- } else {
- memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp));
- /*
- * We are in the middle of a connection,
- * its history is lost for us.
- * Let's try to use the data from the packet.
- */
- ct->proto.tcp.seen[0].td_end =
- segment_seq_plus_len(ntohl(th->seq), skb->len,
- dataoff, th);
- ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
- if (ct->proto.tcp.seen[0].td_maxwin == 0)
- ct->proto.tcp.seen[0].td_maxwin = 1;
- ct->proto.tcp.seen[0].td_maxend =
- ct->proto.tcp.seen[0].td_end +
- ct->proto.tcp.seen[0].td_maxwin;
-
- /* We assume SACK and liberal window checking to handle
- * window scaling */
- ct->proto.tcp.seen[0].flags =
- ct->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM |
- IP_CT_TCP_FLAG_BE_LIBERAL;
- }
-
- /* tcp_packet will set them */
- ct->proto.tcp.last_index = TCP_NONE_SET;
-
- pr_debug("tcp_new: sender end=%u maxend=%u maxwin=%u scale=%i "
- "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
- sender->td_end, sender->td_maxend, sender->td_maxwin,
- sender->td_scale,
- receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
- receiver->td_scale);
- return true;
-}
-
static bool tcp_can_early_drop(const struct nf_conn *ct)
{
switch (ct->proto.tcp.state) {
@@ -1239,8 +1209,8 @@ static const struct nla_policy tcp_nla_policy[CTA_PROTOINFO_TCP_MAX+1] = {
#define TCP_NLATTR_SIZE ( \
NLA_ALIGN(NLA_HDRLEN + 1) + \
NLA_ALIGN(NLA_HDRLEN + 1) + \
- NLA_ALIGN(NLA_HDRLEN + sizeof(sizeof(struct nf_ct_tcp_flags))) + \
- NLA_ALIGN(NLA_HDRLEN + sizeof(sizeof(struct nf_ct_tcp_flags))))
+ NLA_ALIGN(NLA_HDRLEN + sizeof(struct nf_ct_tcp_flags)) + \
+ NLA_ALIGN(NLA_HDRLEN + sizeof(struct nf_ct_tcp_flags)))
static int nlattr_to_tcp(struct nlattr *cda[], struct nf_conn *ct)
{
@@ -1305,7 +1275,7 @@ static unsigned int tcp_nlattr_tuple_size(void)
}
#endif
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_cttimeout.h>
@@ -1313,10 +1283,12 @@ static unsigned int tcp_nlattr_tuple_size(void)
static int tcp_timeout_nlattr_to_obj(struct nlattr *tb[],
struct net *net, void *data)
{
- unsigned int *timeouts = data;
struct nf_tcp_net *tn = tcp_pernet(net);
+ unsigned int *timeouts = data;
int i;
+ if (!timeouts)
+ timeouts = tn->timeouts;
/* set default TCP timeouts. */
for (i=0; i<TCP_CONNTRACK_TIMEOUT_MAX; i++)
timeouts[i] = tn->timeouts[i];
@@ -1325,6 +1297,7 @@ static int tcp_timeout_nlattr_to_obj(struct nlattr *tb[],
timeouts[TCP_CONNTRACK_SYN_SENT] =
ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_SENT]))*HZ;
}
+
if (tb[CTA_TIMEOUT_TCP_SYN_RECV]) {
timeouts[TCP_CONNTRACK_SYN_RECV] =
ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_RECV]))*HZ;
@@ -1365,6 +1338,8 @@ static int tcp_timeout_nlattr_to_obj(struct nlattr *tb[],
timeouts[TCP_CONNTRACK_UNACK] =
ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_UNACK]))*HZ;
}
+
+ timeouts[CTA_TIMEOUT_TCP_UNSPEC] = timeouts[CTA_TIMEOUT_TCP_SYN_SENT];
return 0;
}
@@ -1415,7 +1390,7 @@ static const struct nla_policy tcp_timeout_nla_policy[CTA_TIMEOUT_TCP_MAX+1] = {
[CTA_TIMEOUT_TCP_RETRANS] = { .type = NLA_U32 },
[CTA_TIMEOUT_TCP_UNACK] = { .type = NLA_U32 },
};
-#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
#ifdef CONFIG_SYSCTL
static struct ctl_table tcp_sysctl_table[] = {
@@ -1531,7 +1506,7 @@ static int tcp_kmemdup_sysctl_table(struct nf_proto_net *pn,
return 0;
}
-static int tcp_init_net(struct net *net, u_int16_t proto)
+static int tcp_init_net(struct net *net)
{
struct nf_tcp_net *tn = tcp_pernet(net);
struct nf_proto_net *pn = &tn->pn;
@@ -1542,6 +1517,10 @@ static int tcp_init_net(struct net *net, u_int16_t proto)
for (i = 0; i < TCP_CONNTRACK_TIMEOUT_MAX; i++)
tn->timeouts[i] = tcp_timeouts[i];
+ /* timeouts[0] is unused, make it same as SYN_SENT so
+ * ->timeouts[0] contains 'new' timeout, like udp or icmp.
+ */
+ tn->timeouts[0] = tcp_timeouts[TCP_CONNTRACK_SYN_SENT];
tn->tcp_loose = nf_ct_tcp_loose;
tn->tcp_be_liberal = nf_ct_tcp_be_liberal;
tn->tcp_max_retrans = nf_ct_tcp_max_retrans;
@@ -1555,19 +1534,13 @@ static struct nf_proto_net *tcp_get_net_proto(struct net *net)
return &net->ct.nf_ct_proto.tcp.pn;
}
-const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 =
+const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp =
{
- .l3proto = PF_INET,
.l4proto = IPPROTO_TCP,
- .pkt_to_tuple = tcp_pkt_to_tuple,
- .invert_tuple = tcp_invert_tuple,
#ifdef CONFIG_NF_CONNTRACK_PROCFS
.print_conntrack = tcp_print_conntrack,
#endif
.packet = tcp_packet,
- .get_timeouts = tcp_get_timeouts,
- .new = tcp_new,
- .error = tcp_error,
.can_early_drop = tcp_can_early_drop,
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
.to_nlattr = tcp_to_nlattr,
@@ -1578,45 +1551,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 =
.nlattr_size = TCP_NLATTR_SIZE,
.nla_policy = nf_ct_port_nla_policy,
#endif
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
- .ctnl_timeout = {
- .nlattr_to_obj = tcp_timeout_nlattr_to_obj,
- .obj_to_nlattr = tcp_timeout_obj_to_nlattr,
- .nlattr_max = CTA_TIMEOUT_TCP_MAX,
- .obj_size = sizeof(unsigned int) *
- TCP_CONNTRACK_TIMEOUT_MAX,
- .nla_policy = tcp_timeout_nla_policy,
- },
-#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
- .init_net = tcp_init_net,
- .get_net_proto = tcp_get_net_proto,
-};
-EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_tcp4);
-
-const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 =
-{
- .l3proto = PF_INET6,
- .l4proto = IPPROTO_TCP,
- .pkt_to_tuple = tcp_pkt_to_tuple,
- .invert_tuple = tcp_invert_tuple,
-#ifdef CONFIG_NF_CONNTRACK_PROCFS
- .print_conntrack = tcp_print_conntrack,
-#endif
- .packet = tcp_packet,
- .get_timeouts = tcp_get_timeouts,
- .new = tcp_new,
- .error = tcp_error,
- .can_early_drop = tcp_can_early_drop,
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
- .nlattr_size = TCP_NLATTR_SIZE,
- .to_nlattr = tcp_to_nlattr,
- .from_nlattr = nlattr_to_tcp,
- .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
- .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
- .nlattr_tuple_size = tcp_nlattr_tuple_size,
- .nla_policy = nf_ct_port_nla_policy,
-#endif
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
.ctnl_timeout = {
.nlattr_to_obj = tcp_timeout_nlattr_to_obj,
.obj_to_nlattr = tcp_timeout_obj_to_nlattr,
@@ -1625,8 +1560,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 =
TCP_CONNTRACK_TIMEOUT_MAX,
.nla_policy = tcp_timeout_nla_policy,
},
-#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
.init_net = tcp_init_net,
.get_net_proto = tcp_get_net_proto,
};
-EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_tcp6);
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index fe7243970aa4..a7aa70370913 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -22,6 +22,7 @@
#include <linux/netfilter_ipv6.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_timeout.h>
#include <net/netfilter/nf_log.h>
#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
@@ -36,45 +37,74 @@ static inline struct nf_udp_net *udp_pernet(struct net *net)
return &net->ct.nf_ct_proto.udp;
}
-static bool udp_pkt_to_tuple(const struct sk_buff *skb,
- unsigned int dataoff,
- struct net *net,
- struct nf_conntrack_tuple *tuple)
+static unsigned int *udp_get_timeouts(struct net *net)
{
- const struct udphdr *hp;
- struct udphdr _hdr;
-
- /* Actually only need first 4 bytes to get ports. */
- hp = skb_header_pointer(skb, dataoff, 4, &_hdr);
- if (hp == NULL)
- return false;
-
- tuple->src.u.udp.port = hp->source;
- tuple->dst.u.udp.port = hp->dest;
-
- return true;
+ return udp_pernet(net)->timeouts;
}
-static bool udp_invert_tuple(struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_tuple *orig)
+static void udp_error_log(const struct sk_buff *skb,
+ const struct nf_hook_state *state,
+ const char *msg)
{
- tuple->src.u.udp.port = orig->dst.u.udp.port;
- tuple->dst.u.udp.port = orig->src.u.udp.port;
- return true;
+ nf_l4proto_log_invalid(skb, state->net, state->pf,
+ IPPROTO_UDP, "%s", msg);
}
-static unsigned int *udp_get_timeouts(struct net *net)
+static bool udp_error(struct sk_buff *skb,
+ unsigned int dataoff,
+ const struct nf_hook_state *state)
{
- return udp_pernet(net)->timeouts;
+ unsigned int udplen = skb->len - dataoff;
+ const struct udphdr *hdr;
+ struct udphdr _hdr;
+
+ /* Header is too small? */
+ hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
+ if (!hdr) {
+ udp_error_log(skb, state, "short packet");
+ return true;
+ }
+
+ /* Truncated/malformed packets */
+ if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) {
+ udp_error_log(skb, state, "truncated/malformed packet");
+ return true;
+ }
+
+ /* Packet with no checksum */
+ if (!hdr->check)
+ return false;
+
+ /* Checksum invalid? Ignore.
+ * We skip checking packets on the outgoing path
+ * because the checksum is assumed to be correct.
+ * FIXME: Source route IP option packets --RR */
+ if (state->hook == NF_INET_PRE_ROUTING &&
+ state->net->ct.sysctl_checksum &&
+ nf_checksum(skb, state->hook, dataoff, IPPROTO_UDP, state->pf)) {
+ udp_error_log(skb, state, "bad checksum");
+ return true;
+ }
+
+ return false;
}
/* Returns verdict for packet, and may modify conntracktype */
static int udp_packet(struct nf_conn *ct,
- const struct sk_buff *skb,
+ struct sk_buff *skb,
unsigned int dataoff,
enum ip_conntrack_info ctinfo,
- unsigned int *timeouts)
+ const struct nf_hook_state *state)
{
+ unsigned int *timeouts;
+
+ if (udp_error(skb, dataoff, state))
+ return -NF_ACCEPT;
+
+ timeouts = nf_ct_timeout_lookup(ct);
+ if (!timeouts)
+ timeouts = udp_get_timeouts(nf_ct_net(ct));
+
/* If we've seen traffic both ways, this is some kind of UDP
stream. Extend timeout. */
if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
@@ -90,24 +120,18 @@ static int udp_packet(struct nf_conn *ct,
return NF_ACCEPT;
}
-/* Called when a new connection for this protocol found. */
-static bool udp_new(struct nf_conn *ct, const struct sk_buff *skb,
- unsigned int dataoff, unsigned int *timeouts)
-{
- return true;
-}
-
#ifdef CONFIG_NF_CT_PROTO_UDPLITE
-static void udplite_error_log(const struct sk_buff *skb, struct net *net,
- u8 pf, const char *msg)
+static void udplite_error_log(const struct sk_buff *skb,
+ const struct nf_hook_state *state,
+ const char *msg)
{
- nf_l4proto_log_invalid(skb, net, pf, IPPROTO_UDPLITE, "%s", msg);
+ nf_l4proto_log_invalid(skb, state->net, state->pf,
+ IPPROTO_UDPLITE, "%s", msg);
}
-static int udplite_error(struct net *net, struct nf_conn *tmpl,
- struct sk_buff *skb,
- unsigned int dataoff,
- u8 pf, unsigned int hooknum)
+static bool udplite_error(struct sk_buff *skb,
+ unsigned int dataoff,
+ const struct nf_hook_state *state)
{
unsigned int udplen = skb->len - dataoff;
const struct udphdr *hdr;
@@ -117,82 +141,69 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl,
/* Header is too small? */
hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
if (!hdr) {
- udplite_error_log(skb, net, pf, "short packet");
- return -NF_ACCEPT;
+ udplite_error_log(skb, state, "short packet");
+ return true;
}
cscov = ntohs(hdr->len);
if (cscov == 0) {
cscov = udplen;
} else if (cscov < sizeof(*hdr) || cscov > udplen) {
- udplite_error_log(skb, net, pf, "invalid checksum coverage");
- return -NF_ACCEPT;
+ udplite_error_log(skb, state, "invalid checksum coverage");
+ return true;
}
/* UDPLITE mandates checksums */
if (!hdr->check) {
- udplite_error_log(skb, net, pf, "checksum missing");
- return -NF_ACCEPT;
+ udplite_error_log(skb, state, "checksum missing");
+ return true;
}
/* Checksum invalid? Ignore. */
- if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
- nf_checksum_partial(skb, hooknum, dataoff, cscov, IPPROTO_UDP,
- pf)) {
- udplite_error_log(skb, net, pf, "bad checksum");
- return -NF_ACCEPT;
+ if (state->hook == NF_INET_PRE_ROUTING &&
+ state->net->ct.sysctl_checksum &&
+ nf_checksum_partial(skb, state->hook, dataoff, cscov, IPPROTO_UDP,
+ state->pf)) {
+ udplite_error_log(skb, state, "bad checksum");
+ return true;
}
- return NF_ACCEPT;
-}
-#endif
-
-static void udp_error_log(const struct sk_buff *skb, struct net *net,
- u8 pf, const char *msg)
-{
- nf_l4proto_log_invalid(skb, net, pf, IPPROTO_UDP, "%s", msg);
+ return false;
}
-static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
- unsigned int dataoff,
- u_int8_t pf,
- unsigned int hooknum)
+/* Returns verdict for packet, and may modify conntracktype */
+static int udplite_packet(struct nf_conn *ct,
+ struct sk_buff *skb,
+ unsigned int dataoff,
+ enum ip_conntrack_info ctinfo,
+ const struct nf_hook_state *state)
{
- unsigned int udplen = skb->len - dataoff;
- const struct udphdr *hdr;
- struct udphdr _hdr;
-
- /* Header is too small? */
- hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
- if (hdr == NULL) {
- udp_error_log(skb, net, pf, "short packet");
- return -NF_ACCEPT;
- }
+ unsigned int *timeouts;
- /* Truncated/malformed packets */
- if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) {
- udp_error_log(skb, net, pf, "truncated/malformed packet");
+ if (udplite_error(skb, dataoff, state))
return -NF_ACCEPT;
- }
- /* Packet with no checksum */
- if (!hdr->check)
- return NF_ACCEPT;
+ timeouts = nf_ct_timeout_lookup(ct);
+ if (!timeouts)
+ timeouts = udp_get_timeouts(nf_ct_net(ct));
- /* Checksum invalid? Ignore.
- * We skip checking packets on the outgoing path
- * because the checksum is assumed to be correct.
- * FIXME: Source route IP option packets --RR */
- if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
- nf_checksum(skb, hooknum, dataoff, IPPROTO_UDP, pf)) {
- udp_error_log(skb, net, pf, "bad checksum");
- return -NF_ACCEPT;
+ /* If we've seen traffic both ways, this is some kind of UDP
+ stream. Extend timeout. */
+ if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
+ nf_ct_refresh_acct(ct, ctinfo, skb,
+ timeouts[UDP_CT_REPLIED]);
+ /* Also, more likely to be important, and not a probe */
+ if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status))
+ nf_conntrack_event_cache(IPCT_ASSURED, ct);
+ } else {
+ nf_ct_refresh_acct(ct, ctinfo, skb,
+ timeouts[UDP_CT_UNREPLIED]);
}
-
return NF_ACCEPT;
}
+#endif
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_cttimeout.h>
@@ -203,6 +214,9 @@ static int udp_timeout_nlattr_to_obj(struct nlattr *tb[],
unsigned int *timeouts = data;
struct nf_udp_net *un = udp_pernet(net);
+ if (!timeouts)
+ timeouts = un->timeouts;
+
/* set default timeouts for UDP. */
timeouts[UDP_CT_UNREPLIED] = un->timeouts[UDP_CT_UNREPLIED];
timeouts[UDP_CT_REPLIED] = un->timeouts[UDP_CT_REPLIED];
@@ -239,7 +253,7 @@ udp_timeout_nla_policy[CTA_TIMEOUT_UDP_MAX+1] = {
[CTA_TIMEOUT_UDP_UNREPLIED] = { .type = NLA_U32 },
[CTA_TIMEOUT_UDP_REPLIED] = { .type = NLA_U32 },
};
-#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
#ifdef CONFIG_SYSCTL
static struct ctl_table udp_sysctl_table[] = {
@@ -276,7 +290,7 @@ static int udp_kmemdup_sysctl_table(struct nf_proto_net *pn,
return 0;
}
-static int udp_init_net(struct net *net, u_int16_t proto)
+static int udp_init_net(struct net *net)
{
struct nf_udp_net *un = udp_pernet(net);
struct nf_proto_net *pn = &un->pn;
@@ -296,24 +310,18 @@ static struct nf_proto_net *udp_get_net_proto(struct net *net)
return &net->ct.nf_ct_proto.udp.pn;
}
-const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 =
+const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp =
{
- .l3proto = PF_INET,
.l4proto = IPPROTO_UDP,
.allow_clash = true,
- .pkt_to_tuple = udp_pkt_to_tuple,
- .invert_tuple = udp_invert_tuple,
.packet = udp_packet,
- .get_timeouts = udp_get_timeouts,
- .new = udp_new,
- .error = udp_error,
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
.tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
.nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
.nlattr_tuple_size = nf_ct_port_nlattr_tuple_size,
.nla_policy = nf_ct_port_nla_policy,
#endif
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
.ctnl_timeout = {
.nlattr_to_obj = udp_timeout_nlattr_to_obj,
.obj_to_nlattr = udp_timeout_obj_to_nlattr,
@@ -321,95 +329,24 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 =
.obj_size = sizeof(unsigned int) * CTA_TIMEOUT_UDP_MAX,
.nla_policy = udp_timeout_nla_policy,
},
-#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
.init_net = udp_init_net,
.get_net_proto = udp_get_net_proto,
};
-EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp4);
#ifdef CONFIG_NF_CT_PROTO_UDPLITE
-const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 =
+const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite =
{
- .l3proto = PF_INET,
.l4proto = IPPROTO_UDPLITE,
.allow_clash = true,
- .pkt_to_tuple = udp_pkt_to_tuple,
- .invert_tuple = udp_invert_tuple,
- .packet = udp_packet,
- .get_timeouts = udp_get_timeouts,
- .new = udp_new,
- .error = udplite_error,
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
- .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
- .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
- .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size,
- .nla_policy = nf_ct_port_nla_policy,
-#endif
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
- .ctnl_timeout = {
- .nlattr_to_obj = udp_timeout_nlattr_to_obj,
- .obj_to_nlattr = udp_timeout_obj_to_nlattr,
- .nlattr_max = CTA_TIMEOUT_UDP_MAX,
- .obj_size = sizeof(unsigned int) * CTA_TIMEOUT_UDP_MAX,
- .nla_policy = udp_timeout_nla_policy,
- },
-#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
- .init_net = udp_init_net,
- .get_net_proto = udp_get_net_proto,
-};
-EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udplite4);
-#endif
-
-const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 =
-{
- .l3proto = PF_INET6,
- .l4proto = IPPROTO_UDP,
- .allow_clash = true,
- .pkt_to_tuple = udp_pkt_to_tuple,
- .invert_tuple = udp_invert_tuple,
- .packet = udp_packet,
- .get_timeouts = udp_get_timeouts,
- .new = udp_new,
- .error = udp_error,
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
- .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
- .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
- .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size,
- .nla_policy = nf_ct_port_nla_policy,
-#endif
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
- .ctnl_timeout = {
- .nlattr_to_obj = udp_timeout_nlattr_to_obj,
- .obj_to_nlattr = udp_timeout_obj_to_nlattr,
- .nlattr_max = CTA_TIMEOUT_UDP_MAX,
- .obj_size = sizeof(unsigned int) * CTA_TIMEOUT_UDP_MAX,
- .nla_policy = udp_timeout_nla_policy,
- },
-#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
- .init_net = udp_init_net,
- .get_net_proto = udp_get_net_proto,
-};
-EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp6);
-
-#ifdef CONFIG_NF_CT_PROTO_UDPLITE
-const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 =
-{
- .l3proto = PF_INET6,
- .l4proto = IPPROTO_UDPLITE,
- .allow_clash = true,
- .pkt_to_tuple = udp_pkt_to_tuple,
- .invert_tuple = udp_invert_tuple,
- .packet = udp_packet,
- .get_timeouts = udp_get_timeouts,
- .new = udp_new,
- .error = udplite_error,
+ .packet = udplite_packet,
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
.tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
.nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
.nlattr_tuple_size = nf_ct_port_nlattr_tuple_size,
.nla_policy = nf_ct_port_nla_policy,
#endif
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
.ctnl_timeout = {
.nlattr_to_obj = udp_timeout_nlattr_to_obj,
.obj_to_nlattr = udp_timeout_obj_to_nlattr,
@@ -417,9 +354,8 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 =
.obj_size = sizeof(unsigned int) * CTA_TIMEOUT_UDP_MAX,
.nla_policy = udp_timeout_nla_policy,
},
-#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
.init_net = udp_init_net,
.get_net_proto = udp_get_net_proto,
};
-EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udplite6);
#endif
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index b642c0b2495c..463d17d349c1 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -1,12 +1,4 @@
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- * (C) 2005-2012 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
+// SPDX-License-Identifier: GPL-2.0
#include <linux/types.h>
#include <linux/netfilter.h>
#include <linux/slab.h>
@@ -24,7 +16,6 @@
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
-#include <net/netfilter/nf_conntrack_l3proto.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_expect.h>
#include <net/netfilter/nf_conntrack_helper.h>
@@ -33,15 +24,14 @@
#include <net/netfilter/nf_conntrack_timestamp.h>
#include <linux/rculist_nulls.h>
-MODULE_LICENSE("GPL");
+unsigned int nf_conntrack_net_id __read_mostly;
#ifdef CONFIG_NF_CONNTRACK_PROCFS
void
print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_l3proto *l3proto,
const struct nf_conntrack_l4proto *l4proto)
{
- switch (l3proto->l3proto) {
+ switch (tuple->src.l3num) {
case NFPROTO_IPV4:
seq_printf(s, "src=%pI4 dst=%pI4 ",
&tuple->src.u3.ip, &tuple->dst.u3.ip);
@@ -282,7 +272,6 @@ static int ct_seq_show(struct seq_file *s, void *v)
{
struct nf_conntrack_tuple_hash *hash = v;
struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(hash);
- const struct nf_conntrack_l3proto *l3proto;
const struct nf_conntrack_l4proto *l4proto;
struct net *net = seq_file_net(s);
int ret = 0;
@@ -303,14 +292,12 @@ static int ct_seq_show(struct seq_file *s, void *v)
if (!net_eq(nf_ct_net(ct), net))
goto release;
- l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct));
- WARN_ON(!l3proto);
- l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
+ l4proto = __nf_ct_l4proto_find(nf_ct_protonum(ct));
WARN_ON(!l4proto);
ret = -ENOSPC;
seq_printf(s, "%-8s %u %-8s %u ",
- l3proto_name(l3proto->l3proto), nf_ct_l3num(ct),
+ l3proto_name(nf_ct_l3num(ct)), nf_ct_l3num(ct),
l4proto_name(l4proto->l4proto), nf_ct_protonum(ct));
if (!test_bit(IPS_OFFLOAD_BIT, &ct->status))
@@ -320,7 +307,7 @@ static int ct_seq_show(struct seq_file *s, void *v)
l4proto->print_conntrack(s, ct);
print_tuple(s, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
- l3proto, l4proto);
+ l4proto);
ct_show_zone(s, ct, NF_CT_ZONE_DIR_ORIG);
@@ -333,8 +320,7 @@ static int ct_seq_show(struct seq_file *s, void *v)
if (!(test_bit(IPS_SEEN_REPLY_BIT, &ct->status)))
seq_puts(s, "[UNREPLIED] ");
- print_tuple(s, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
- l3proto, l4proto);
+ print_tuple(s, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, l4proto);
ct_show_zone(s, ct, NF_CT_ZONE_DIR_REPL);
@@ -680,6 +666,8 @@ static void nf_conntrack_pernet_exit(struct list_head *net_exit_list)
static struct pernet_operations nf_conntrack_net_ops = {
.init = nf_conntrack_pernet_init,
.exit_batch = nf_conntrack_pernet_exit,
+ .id = &nf_conntrack_net_id,
+ .size = sizeof(struct nf_conntrack_net),
};
static int __init nf_conntrack_standalone_init(void)
@@ -732,10 +720,3 @@ static void __exit nf_conntrack_standalone_fini(void)
module_init(nf_conntrack_standalone_init);
module_exit(nf_conntrack_standalone_fini);
-
-/* Some modules need us, but don't depend directly on any symbol.
- They should call this. */
-void need_conntrack(void)
-{
-}
-EXPORT_SYMBOL_GPL(need_conntrack);
diff --git a/net/netfilter/nf_conntrack_timeout.c b/net/netfilter/nf_conntrack_timeout.c
index 46aee65f339b..91fbd183da2d 100644
--- a/net/netfilter/nf_conntrack_timeout.c
+++ b/net/netfilter/nf_conntrack_timeout.c
@@ -24,13 +24,30 @@
#include <net/netfilter/nf_conntrack_extend.h>
#include <net/netfilter/nf_conntrack_timeout.h>
-struct ctnl_timeout *
+struct nf_ct_timeout *
(*nf_ct_timeout_find_get_hook)(struct net *net, const char *name) __read_mostly;
EXPORT_SYMBOL_GPL(nf_ct_timeout_find_get_hook);
-void (*nf_ct_timeout_put_hook)(struct ctnl_timeout *timeout) __read_mostly;
+void (*nf_ct_timeout_put_hook)(struct nf_ct_timeout *timeout) __read_mostly;
EXPORT_SYMBOL_GPL(nf_ct_timeout_put_hook);
+static int untimeout(struct nf_conn *ct, void *timeout)
+{
+ struct nf_conn_timeout *timeout_ext = nf_ct_timeout_find(ct);
+
+ if (timeout_ext && (!timeout || timeout_ext->timeout == timeout))
+ RCU_INIT_POINTER(timeout_ext->timeout, NULL);
+
+ /* We are not intended to delete this conntrack. */
+ return 0;
+}
+
+void nf_ct_untimeout(struct net *net, struct nf_ct_timeout *timeout)
+{
+ nf_ct_iterate_cleanup_net(net, untimeout, timeout, 0, 0);
+}
+EXPORT_SYMBOL_GPL(nf_ct_untimeout);
+
static const struct nf_ct_ext_type timeout_extend = {
.len = sizeof(struct nf_conn_timeout),
.align = __alignof__(struct nf_conn_timeout),
diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index eb0d1658ac05..b7a4816add76 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -107,11 +107,12 @@ static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp)
tcp->seen[1].td_maxwin = 0;
}
+#define NF_FLOWTABLE_TCP_PICKUP_TIMEOUT (120 * HZ)
+#define NF_FLOWTABLE_UDP_PICKUP_TIMEOUT (30 * HZ)
+
static void flow_offload_fixup_ct_state(struct nf_conn *ct)
{
const struct nf_conntrack_l4proto *l4proto;
- struct net *net = nf_ct_net(ct);
- unsigned int *timeouts;
unsigned int timeout;
int l4num;
@@ -119,18 +120,14 @@ static void flow_offload_fixup_ct_state(struct nf_conn *ct)
if (l4num == IPPROTO_TCP)
flow_offload_fixup_tcp(&ct->proto.tcp);
- l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), l4num);
+ l4proto = __nf_ct_l4proto_find(l4num);
if (!l4proto)
return;
- timeouts = l4proto->get_timeouts(net);
- if (!timeouts)
- return;
-
if (l4num == IPPROTO_TCP)
- timeout = timeouts[TCP_CONNTRACK_ESTABLISHED];
+ timeout = NF_FLOWTABLE_TCP_PICKUP_TIMEOUT;
else if (l4num == IPPROTO_UDP)
- timeout = timeouts[UDP_CT_REPLIED];
+ timeout = NF_FLOWTABLE_UDP_PICKUP_TIMEOUT;
else
return;
@@ -236,8 +233,8 @@ flow_offload_lookup(struct nf_flowtable *flow_table,
struct flow_offload *flow;
int dir;
- tuplehash = rhashtable_lookup_fast(&flow_table->rhashtable, tuple,
- nf_flow_offload_rhash_params);
+ tuplehash = rhashtable_lookup(&flow_table->rhashtable, tuple,
+ nf_flow_offload_rhash_params);
if (!tuplehash)
return NULL;
@@ -257,20 +254,17 @@ int nf_flow_table_iterate(struct nf_flowtable *flow_table,
struct flow_offload_tuple_rhash *tuplehash;
struct rhashtable_iter hti;
struct flow_offload *flow;
- int err;
-
- err = rhashtable_walk_init(&flow_table->rhashtable, &hti, GFP_KERNEL);
- if (err)
- return err;
+ int err = 0;
+ rhashtable_walk_enter(&flow_table->rhashtable, &hti);
rhashtable_walk_start(&hti);
while ((tuplehash = rhashtable_walk_next(&hti))) {
if (IS_ERR(tuplehash)) {
- err = PTR_ERR(tuplehash);
- if (err != -EAGAIN)
- goto out;
-
+ if (PTR_ERR(tuplehash) != -EAGAIN) {
+ err = PTR_ERR(tuplehash);
+ break;
+ }
continue;
}
if (tuplehash->tuple.dir)
@@ -280,7 +274,6 @@ int nf_flow_table_iterate(struct nf_flowtable *flow_table,
iter(flow, data);
}
-out:
rhashtable_walk_stop(&hti);
rhashtable_walk_exit(&hti);
@@ -293,25 +286,19 @@ static inline bool nf_flow_has_expired(const struct flow_offload *flow)
return (__s32)(flow->timeout - (u32)jiffies) <= 0;
}
-static int nf_flow_offload_gc_step(struct nf_flowtable *flow_table)
+static void nf_flow_offload_gc_step(struct nf_flowtable *flow_table)
{
struct flow_offload_tuple_rhash *tuplehash;
struct rhashtable_iter hti;
struct flow_offload *flow;
- int err;
-
- err = rhashtable_walk_init(&flow_table->rhashtable, &hti, GFP_KERNEL);
- if (err)
- return 0;
+ rhashtable_walk_enter(&flow_table->rhashtable, &hti);
rhashtable_walk_start(&hti);
while ((tuplehash = rhashtable_walk_next(&hti))) {
if (IS_ERR(tuplehash)) {
- err = PTR_ERR(tuplehash);
- if (err != -EAGAIN)
- goto out;
-
+ if (PTR_ERR(tuplehash) != -EAGAIN)
+ break;
continue;
}
if (tuplehash->tuple.dir)
@@ -324,11 +311,8 @@ static int nf_flow_offload_gc_step(struct nf_flowtable *flow_table)
FLOW_OFFLOAD_TEARDOWN)))
flow_offload_del(flow_table, flow);
}
-out:
rhashtable_walk_stop(&hti);
rhashtable_walk_exit(&hti);
-
- return 1;
}
static void nf_flow_offload_work_gc(struct work_struct *work)
@@ -481,14 +465,17 @@ EXPORT_SYMBOL_GPL(nf_flow_table_init);
static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data)
{
struct net_device *dev = data;
+ struct flow_offload_entry *e;
+
+ e = container_of(flow, struct flow_offload_entry, flow);
if (!dev) {
flow_offload_teardown(flow);
return;
}
-
- if (flow->tuplehash[0].tuple.iifidx == dev->ifindex ||
- flow->tuplehash[1].tuple.iifidx == dev->ifindex)
+ if (net_eq(nf_ct_net(e->ct), dev_net(dev)) &&
+ (flow->tuplehash[0].tuple.iifidx == dev->ifindex ||
+ flow->tuplehash[1].tuple.iifidx == dev->ifindex))
flow_offload_dead(flow);
}
@@ -499,7 +486,7 @@ static void nf_flow_table_iterate_cleanup(struct nf_flowtable *flowtable,
flush_delayed_work(&flowtable->gc_work);
}
-void nf_flow_table_cleanup(struct net *net, struct net_device *dev)
+void nf_flow_table_cleanup(struct net_device *dev)
{
struct nf_flowtable *flowtable;
@@ -517,7 +504,7 @@ void nf_flow_table_free(struct nf_flowtable *flow_table)
mutex_unlock(&flowtable_lock);
cancel_delayed_work_sync(&flow_table->gc_work);
nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL);
- WARN_ON(!nf_flow_offload_gc_step(flow_table));
+ nf_flow_offload_gc_step(flow_table);
rhashtable_destroy(&flow_table->rhashtable);
}
EXPORT_SYMBOL_GPL(nf_flow_table_free);
diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
index 15ed91309992..1d291a51cd45 100644
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -254,8 +254,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
if (nf_flow_state_check(flow, ip_hdr(skb)->protocol, skb, thoff))
return NF_ACCEPT;
- if (flow->flags & (FLOW_OFFLOAD_SNAT | FLOW_OFFLOAD_DNAT) &&
- nf_flow_nat_ip(flow, skb, thoff, dir) < 0)
+ if (nf_flow_nat_ip(flow, skb, thoff, dir) < 0)
return NF_DROP;
flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT;
@@ -471,8 +470,7 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
if (skb_try_make_writable(skb, sizeof(*ip6h)))
return NF_DROP;
- if (flow->flags & (FLOW_OFFLOAD_SNAT | FLOW_OFFLOAD_DNAT) &&
- nf_flow_nat_ipv6(flow, skb, dir) < 0)
+ if (nf_flow_nat_ipv6(flow, skb, dir) < 0)
return NF_DROP;
flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT;
diff --git a/net/netfilter/nf_log_common.c b/net/netfilter/nf_log_common.c
index dc61399e30be..a8c5c846aec1 100644
--- a/net/netfilter/nf_log_common.c
+++ b/net/netfilter/nf_log_common.c
@@ -132,9 +132,10 @@ int nf_log_dump_tcp_header(struct nf_log_buf *m, const struct sk_buff *skb,
}
EXPORT_SYMBOL_GPL(nf_log_dump_tcp_header);
-void nf_log_dump_sk_uid_gid(struct nf_log_buf *m, struct sock *sk)
+void nf_log_dump_sk_uid_gid(struct net *net, struct nf_log_buf *m,
+ struct sock *sk)
{
- if (!sk || !sk_fullsock(sk))
+ if (!sk || !sk_fullsock(sk) || !net_eq(net, sock_net(sk)))
return;
read_lock_bh(&sk->sk_callback_lock);
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 46f9df99d276..e2b196054dfc 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -28,7 +28,6 @@
#include <net/netfilter/nf_nat_helper.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_seqadj.h>
-#include <net/netfilter/nf_conntrack_l3proto.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <linux/netfilter/nf_nat.h>
@@ -108,6 +107,7 @@ int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family)
struct flowi fl;
unsigned int hh_len;
struct dst_entry *dst;
+ struct sock *sk = skb->sk;
int err;
err = xfrm_decode_session(skb, &fl, family);
@@ -119,7 +119,10 @@ int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family)
dst = ((struct xfrm_dst *)dst)->route;
dst_hold(dst);
- dst = xfrm_lookup(net, dst, &fl, skb->sk, 0);
+ if (sk && !net_eq(net, sock_net(sk)))
+ sk = NULL;
+
+ dst = xfrm_lookup(net, dst, &fl, sk, 0);
if (IS_ERR(dst))
return PTR_ERR(dst);
@@ -739,12 +742,6 @@ EXPORT_SYMBOL_GPL(nf_nat_l4proto_unregister);
int nf_nat_l3proto_register(const struct nf_nat_l3proto *l3proto)
{
- int err;
-
- err = nf_ct_l3proto_try_module_get(l3proto->l3proto);
- if (err < 0)
- return err;
-
mutex_lock(&nf_nat_proto_mutex);
RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_TCP],
&nf_nat_l4proto_tcp);
@@ -777,7 +774,6 @@ void nf_nat_l3proto_unregister(const struct nf_nat_l3proto *l3proto)
synchronize_rcu();
nf_nat_l3proto_clean(l3proto->l3proto);
- nf_ct_l3proto_module_put(l3proto->l3proto);
}
EXPORT_SYMBOL_GPL(nf_nat_l3proto_unregister);
@@ -1060,7 +1056,7 @@ static int __init nf_nat_init(void)
ret = nf_ct_extend_register(&nat_extend);
if (ret < 0) {
- nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
+ kvfree(nf_nat_bysource);
pr_err("Unable to register extension\n");
return ret;
}
@@ -1098,7 +1094,7 @@ static void __exit nf_nat_cleanup(void)
for (i = 0; i < NFPROTO_NUMPROTO; i++)
kfree(nf_nat_l4protos[i]);
synchronize_net();
- nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
+ kvfree(nf_nat_bysource);
unregister_pernet_subsys(&nat_net_ops);
}
diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c
index 99606baedda4..38793b95d9bc 100644
--- a/net/netfilter/nf_nat_helper.c
+++ b/net/netfilter/nf_nat_helper.c
@@ -37,7 +37,7 @@ static void mangle_contents(struct sk_buff *skb,
{
unsigned char *data;
- BUG_ON(skb_is_nonlinear(skb));
+ SKB_LINEAR_ASSERT(skb);
data = skb_network_header(skb) + dataoff;
/* move post-replacement */
@@ -110,8 +110,6 @@ bool __nf_nat_mangle_tcp_packet(struct sk_buff *skb,
!enlarge_skb(skb, rep_len - match_len))
return false;
- SKB_LINEAR_ASSERT(skb);
-
tcph = (void *)skb->data + protoff;
oldlen = skb->len - protoff;
diff --git a/net/netfilter/nf_nat_redirect.c b/net/netfilter/nf_nat_redirect.c
index adee04af8d43..78a9e6454ff3 100644
--- a/net/netfilter/nf_nat_redirect.c
+++ b/net/netfilter/nf_nat_redirect.c
@@ -52,13 +52,11 @@ nf_nat_redirect_ipv4(struct sk_buff *skb,
newdst = 0;
- rcu_read_lock();
indev = __in_dev_get_rcu(skb->dev);
if (indev && indev->ifa_list) {
ifa = indev->ifa_list;
newdst = ifa->ifa_local;
}
- rcu_read_unlock();
if (!newdst)
return NF_DROP;
@@ -97,7 +95,6 @@ nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range,
struct inet6_ifaddr *ifa;
bool addr = false;
- rcu_read_lock();
idev = __in6_dev_get(skb->dev);
if (idev != NULL) {
read_lock_bh(&idev->lock);
@@ -108,7 +105,6 @@ nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range,
}
read_unlock_bh(&idev->lock);
}
- rcu_read_unlock();
if (!addr)
return NF_DROP;
diff --git a/net/netfilter/nf_osf.c b/net/netfilter/nf_osf.c
deleted file mode 100644
index 5ba5c7bef2f9..000000000000
--- a/net/netfilter/nf_osf.c
+++ /dev/null
@@ -1,218 +0,0 @@
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/module.h>
-#include <linux/kernel.h>
-
-#include <linux/capability.h>
-#include <linux/if.h>
-#include <linux/inetdevice.h>
-#include <linux/ip.h>
-#include <linux/list.h>
-#include <linux/rculist.h>
-#include <linux/skbuff.h>
-#include <linux/slab.h>
-#include <linux/tcp.h>
-
-#include <net/ip.h>
-#include <net/tcp.h>
-
-#include <linux/netfilter/nfnetlink.h>
-#include <linux/netfilter/x_tables.h>
-#include <net/netfilter/nf_log.h>
-#include <linux/netfilter/nf_osf.h>
-
-static inline int nf_osf_ttl(const struct sk_buff *skb,
- const struct nf_osf_info *info,
- unsigned char f_ttl)
-{
- const struct iphdr *ip = ip_hdr(skb);
-
- if (info->flags & NF_OSF_TTL) {
- if (info->ttl == NF_OSF_TTL_TRUE)
- return ip->ttl == f_ttl;
- if (info->ttl == NF_OSF_TTL_NOCHECK)
- return 1;
- else if (ip->ttl <= f_ttl)
- return 1;
- else {
- struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
- int ret = 0;
-
- for_ifa(in_dev) {
- if (inet_ifa_match(ip->saddr, ifa)) {
- ret = (ip->ttl == f_ttl);
- break;
- }
- }
- endfor_ifa(in_dev);
-
- return ret;
- }
- }
-
- return ip->ttl == f_ttl;
-}
-
-bool
-nf_osf_match(const struct sk_buff *skb, u_int8_t family,
- int hooknum, struct net_device *in, struct net_device *out,
- const struct nf_osf_info *info, struct net *net,
- const struct list_head *nf_osf_fingers)
-{
- const unsigned char *optp = NULL, *_optp = NULL;
- unsigned int optsize = 0, check_WSS = 0;
- int fmatch = FMATCH_WRONG, fcount = 0;
- const struct iphdr *ip = ip_hdr(skb);
- const struct nf_osf_user_finger *f;
- unsigned char opts[MAX_IPOPTLEN];
- const struct nf_osf_finger *kf;
- u16 window, totlen, mss = 0;
- const struct tcphdr *tcp;
- struct tcphdr _tcph;
- bool df;
-
- tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), &_tcph);
- if (!tcp)
- return false;
-
- if (!tcp->syn)
- return false;
-
- totlen = ntohs(ip->tot_len);
- df = ntohs(ip->frag_off) & IP_DF;
- window = ntohs(tcp->window);
-
- if (tcp->doff * 4 > sizeof(struct tcphdr)) {
- optsize = tcp->doff * 4 - sizeof(struct tcphdr);
-
- _optp = optp = skb_header_pointer(skb, ip_hdrlen(skb) +
- sizeof(struct tcphdr), optsize, opts);
- }
-
- list_for_each_entry_rcu(kf, &nf_osf_fingers[df], finger_entry) {
- int foptsize, optnum;
-
- f = &kf->finger;
-
- if (!(info->flags & NF_OSF_LOG) && strcmp(info->genre, f->genre))
- continue;
-
- optp = _optp;
- fmatch = FMATCH_WRONG;
-
- if (totlen != f->ss || !nf_osf_ttl(skb, info, f->ttl))
- continue;
-
- /*
- * Should not happen if userspace parser was written correctly.
- */
- if (f->wss.wc >= OSF_WSS_MAX)
- continue;
-
- /* Check options */
-
- foptsize = 0;
- for (optnum = 0; optnum < f->opt_num; ++optnum)
- foptsize += f->opt[optnum].length;
-
- if (foptsize > MAX_IPOPTLEN ||
- optsize > MAX_IPOPTLEN ||
- optsize != foptsize)
- continue;
-
- check_WSS = f->wss.wc;
-
- for (optnum = 0; optnum < f->opt_num; ++optnum) {
- if (f->opt[optnum].kind == (*optp)) {
- __u32 len = f->opt[optnum].length;
- const __u8 *optend = optp + len;
-
- fmatch = FMATCH_OK;
-
- switch (*optp) {
- case OSFOPT_MSS:
- mss = optp[3];
- mss <<= 8;
- mss |= optp[2];
-
- mss = ntohs((__force __be16)mss);
- break;
- case OSFOPT_TS:
- break;
- }
-
- optp = optend;
- } else
- fmatch = FMATCH_OPT_WRONG;
-
- if (fmatch != FMATCH_OK)
- break;
- }
-
- if (fmatch != FMATCH_OPT_WRONG) {
- fmatch = FMATCH_WRONG;
-
- switch (check_WSS) {
- case OSF_WSS_PLAIN:
- if (f->wss.val == 0 || window == f->wss.val)
- fmatch = FMATCH_OK;
- break;
- case OSF_WSS_MSS:
- /*
- * Some smart modems decrease mangle MSS to
- * SMART_MSS_2, so we check standard, decreased
- * and the one provided in the fingerprint MSS
- * values.
- */
-#define SMART_MSS_1 1460
-#define SMART_MSS_2 1448
- if (window == f->wss.val * mss ||
- window == f->wss.val * SMART_MSS_1 ||
- window == f->wss.val * SMART_MSS_2)
- fmatch = FMATCH_OK;
- break;
- case OSF_WSS_MTU:
- if (window == f->wss.val * (mss + 40) ||
- window == f->wss.val * (SMART_MSS_1 + 40) ||
- window == f->wss.val * (SMART_MSS_2 + 40))
- fmatch = FMATCH_OK;
- break;
- case OSF_WSS_MODULO:
- if ((window % f->wss.val) == 0)
- fmatch = FMATCH_OK;
- break;
- }
- }
-
- if (fmatch != FMATCH_OK)
- continue;
-
- fcount++;
-
- if (info->flags & NF_OSF_LOG)
- nf_log_packet(net, family, hooknum, skb,
- in, out, NULL,
- "%s [%s:%s] : %pI4:%d -> %pI4:%d hops=%d\n",
- f->genre, f->version, f->subtype,
- &ip->saddr, ntohs(tcp->source),
- &ip->daddr, ntohs(tcp->dest),
- f->ttl - ip->ttl);
-
- if ((info->flags & NF_OSF_LOG) &&
- info->loglevel == NF_OSF_LOGLEVEL_FIRST)
- break;
- }
-
- if (!fcount && (info->flags & NF_OSF_LOG))
- nf_log_packet(net, family, hooknum, skb, in, out, NULL,
- "Remote OS is not known: %pI4:%u -> %pI4:%u\n",
- &ip->saddr, ntohs(tcp->source),
- &ip->daddr, ntohs(tcp->dest));
-
- if (fcount)
- fmatch = FMATCH_OK;
-
- return fmatch == FMATCH_OK;
-}
-EXPORT_SYMBOL_GPL(nf_osf_match);
-
-MODULE_LICENSE("GPL");
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index f5745e4c6513..42487d01a3ed 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -14,6 +14,7 @@
#include <linux/skbuff.h>
#include <linux/netlink.h>
#include <linux/vmalloc.h>
+#include <linux/rhashtable.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nf_tables.h>
@@ -26,6 +27,8 @@
static LIST_HEAD(nf_tables_expressions);
static LIST_HEAD(nf_tables_objects);
static LIST_HEAD(nf_tables_flowtables);
+static LIST_HEAD(nf_tables_destroy_list);
+static DEFINE_SPINLOCK(nf_tables_destroy_list_lock);
static u64 table_handle;
enum {
@@ -63,6 +66,8 @@ static void nft_validate_state_update(struct net *net, u8 new_validate_state)
net->nft.validate_state = new_validate_state;
}
+static void nf_tables_trans_destroy_work(struct work_struct *w);
+static DECLARE_WORK(trans_destroy_work, nf_tables_trans_destroy_work);
static void nft_ctx_init(struct nft_ctx *ctx,
struct net *net,
@@ -206,6 +211,18 @@ static int nft_delchain(struct nft_ctx *ctx)
return err;
}
+/* either expr ops provide both activate/deactivate, or neither */
+static bool nft_expr_check_ops(const struct nft_expr_ops *ops)
+{
+ if (!ops)
+ return true;
+
+ if (WARN_ON_ONCE((!ops->activate ^ !ops->deactivate)))
+ return false;
+
+ return true;
+}
+
static void nft_rule_expr_activate(const struct nft_ctx *ctx,
struct nft_rule *rule)
{
@@ -297,7 +314,7 @@ static int nft_delrule_by_chain(struct nft_ctx *ctx)
return 0;
}
-static int nft_trans_set_add(struct nft_ctx *ctx, int msg_type,
+static int nft_trans_set_add(const struct nft_ctx *ctx, int msg_type,
struct nft_set *set)
{
struct nft_trans *trans;
@@ -317,7 +334,7 @@ static int nft_trans_set_add(struct nft_ctx *ctx, int msg_type,
return 0;
}
-static int nft_delset(struct nft_ctx *ctx, struct nft_set *set)
+static int nft_delset(const struct nft_ctx *ctx, struct nft_set *set)
{
int err;
@@ -455,20 +472,59 @@ __nf_tables_chain_type_lookup(const struct nlattr *nla, u8 family)
return NULL;
}
+/*
+ * Loading a module requires dropping mutex that guards the
+ * transaction.
+ * We first need to abort any pending transactions as once
+ * mutex is unlocked a different client could start a new
+ * transaction. It must not see any 'future generation'
+ * changes * as these changes will never happen.
+ */
+#ifdef CONFIG_MODULES
+static int __nf_tables_abort(struct net *net);
+
+static void nft_request_module(struct net *net, const char *fmt, ...)
+{
+ char module_name[MODULE_NAME_LEN];
+ va_list args;
+ int ret;
+
+ __nf_tables_abort(net);
+
+ va_start(args, fmt);
+ ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
+ va_end(args);
+ if (WARN(ret >= MODULE_NAME_LEN, "truncated: '%s' (len %d)", module_name, ret))
+ return;
+
+ mutex_unlock(&net->nft.commit_mutex);
+ request_module("%s", module_name);
+ mutex_lock(&net->nft.commit_mutex);
+}
+#endif
+
+static void lockdep_nfnl_nft_mutex_not_held(void)
+{
+#ifdef CONFIG_PROVE_LOCKING
+ WARN_ON_ONCE(lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES));
+#endif
+}
+
static const struct nft_chain_type *
-nf_tables_chain_type_lookup(const struct nlattr *nla, u8 family, bool autoload)
+nf_tables_chain_type_lookup(struct net *net, const struct nlattr *nla,
+ u8 family, bool autoload)
{
const struct nft_chain_type *type;
type = __nf_tables_chain_type_lookup(nla, family);
if (type != NULL)
return type;
+
+ lockdep_nfnl_nft_mutex_not_held();
#ifdef CONFIG_MODULES
if (autoload) {
- nfnl_unlock(NFNL_SUBSYS_NFTABLES);
- request_module("nft-chain-%u-%.*s", family,
- nla_len(nla), (const char *)nla_data(nla));
- nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ nft_request_module(net, "nft-chain-%u-%.*s", family,
+ nla_len(nla), (const char *)nla_data(nla));
type = __nf_tables_chain_type_lookup(nla, family);
if (type != NULL)
return ERR_PTR(-EAGAIN);
@@ -772,6 +828,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk,
struct nft_ctx ctx;
int err;
+ lockdep_assert_held(&net->nft.commit_mutex);
attr = nla[NFTA_TABLE_NAME];
table = nft_table_lookup(net, attr, family, genmask);
if (IS_ERR(table)) {
@@ -964,7 +1021,8 @@ static int nf_tables_deltable(struct net *net, struct sock *nlsk,
static void nf_tables_table_destroy(struct nft_ctx *ctx)
{
- BUG_ON(ctx->table->use > 0);
+ if (WARN_ON(ctx->table->use > 0))
+ return;
rhltable_destroy(&ctx->table->chains_ht);
kfree(ctx->table->name);
@@ -1012,7 +1070,17 @@ nft_chain_lookup_byhandle(const struct nft_table *table, u64 handle, u8 genmask)
return ERR_PTR(-ENOENT);
}
-static struct nft_chain *nft_chain_lookup(struct nft_table *table,
+static bool lockdep_commit_lock_is_held(struct net *net)
+{
+#ifdef CONFIG_PROVE_LOCKING
+ return lockdep_is_held(&net->nft.commit_mutex);
+#else
+ return true;
+#endif
+}
+
+static struct nft_chain *nft_chain_lookup(struct net *net,
+ struct nft_table *table,
const struct nlattr *nla, u8 genmask)
{
char search[NFT_CHAIN_MAXNAMELEN + 1];
@@ -1025,7 +1093,7 @@ static struct nft_chain *nft_chain_lookup(struct nft_table *table,
nla_strlcpy(search, nla, sizeof(search));
WARN_ON(!rcu_read_lock_held() &&
- !lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES));
+ !lockdep_commit_lock_is_held(net));
chain = ERR_PTR(-ENOENT);
rcu_read_lock();
@@ -1265,7 +1333,7 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk,
return PTR_ERR(table);
}
- chain = nft_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask);
+ chain = nft_chain_lookup(net, table, nla[NFTA_CHAIN_NAME], genmask);
if (IS_ERR(chain)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]);
return PTR_ERR(chain);
@@ -1361,7 +1429,8 @@ static void nf_tables_chain_destroy(struct nft_ctx *ctx)
{
struct nft_chain *chain = ctx->chain;
- BUG_ON(chain->use > 0);
+ if (WARN_ON(chain->use > 0))
+ return;
/* no concurrent access possible anymore */
nf_tables_chain_free_chain_rules(chain);
@@ -1391,13 +1460,16 @@ struct nft_chain_hook {
static int nft_chain_parse_hook(struct net *net,
const struct nlattr * const nla[],
struct nft_chain_hook *hook, u8 family,
- bool create)
+ bool autoload)
{
struct nlattr *ha[NFTA_HOOK_MAX + 1];
const struct nft_chain_type *type;
struct net_device *dev;
int err;
+ lockdep_assert_held(&net->nft.commit_mutex);
+ lockdep_nfnl_nft_mutex_not_held();
+
err = nla_parse_nested(ha, NFTA_HOOK_MAX, nla[NFTA_CHAIN_HOOK],
nft_hook_policy, NULL);
if (err < 0)
@@ -1412,8 +1484,8 @@ static int nft_chain_parse_hook(struct net *net,
type = chain_type[family][NFT_CHAIN_T_DEFAULT];
if (nla[NFTA_CHAIN_TYPE]) {
- type = nf_tables_chain_type_lookup(nla[NFTA_CHAIN_TYPE],
- family, create);
+ type = nf_tables_chain_type_lookup(net, nla[NFTA_CHAIN_TYPE],
+ family, autoload);
if (IS_ERR(type))
return PTR_ERR(type);
}
@@ -1480,7 +1552,7 @@ static struct nft_rule **nf_tables_chain_alloc_rules(const struct nft_chain *cha
}
static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
- u8 policy, bool create)
+ u8 policy)
{
const struct nlattr * const *nla = ctx->nla;
struct nft_table *table = ctx->table;
@@ -1498,7 +1570,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
struct nft_chain_hook hook;
struct nf_hook_ops *ops;
- err = nft_chain_parse_hook(net, nla, &hook, family, create);
+ err = nft_chain_parse_hook(net, nla, &hook, family, true);
if (err < 0)
return err;
@@ -1589,8 +1661,7 @@ err1:
return err;
}
-static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
- bool create)
+static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy)
{
const struct nlattr * const *nla = ctx->nla;
struct nft_table *table = ctx->table;
@@ -1607,7 +1678,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
return -EBUSY;
err = nft_chain_parse_hook(ctx->net, nla, &hook, ctx->family,
- create);
+ false);
if (err < 0)
return err;
@@ -1631,7 +1702,8 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
nla[NFTA_CHAIN_NAME]) {
struct nft_chain *chain2;
- chain2 = nft_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask);
+ chain2 = nft_chain_lookup(ctx->net, table,
+ nla[NFTA_CHAIN_NAME], genmask);
if (!IS_ERR(chain2))
return -EEXIST;
}
@@ -1706,9 +1778,8 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
u8 policy = NF_ACCEPT;
struct nft_ctx ctx;
u64 handle = 0;
- bool create;
- create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false;
+ lockdep_assert_held(&net->nft.commit_mutex);
table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask);
if (IS_ERR(table)) {
@@ -1728,7 +1799,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
}
attr = nla[NFTA_CHAIN_HANDLE];
} else {
- chain = nft_chain_lookup(table, attr, genmask);
+ chain = nft_chain_lookup(net, table, attr, genmask);
if (IS_ERR(chain)) {
if (PTR_ERR(chain) != -ENOENT) {
NL_SET_BAD_ATTR(extack, attr);
@@ -1771,10 +1842,10 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
if (nlh->nlmsg_flags & NLM_F_REPLACE)
return -EOPNOTSUPP;
- return nf_tables_updchain(&ctx, genmask, policy, create);
+ return nf_tables_updchain(&ctx, genmask, policy);
}
- return nf_tables_addchain(&ctx, family, genmask, policy, create);
+ return nf_tables_addchain(&ctx, family, genmask, policy);
}
static int nf_tables_delchain(struct net *net, struct sock *nlsk,
@@ -1806,7 +1877,7 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk,
chain = nft_chain_lookup_byhandle(table, handle, genmask);
} else {
attr = nla[NFTA_CHAIN_NAME];
- chain = nft_chain_lookup(table, attr, genmask);
+ chain = nft_chain_lookup(net, table, attr, genmask);
}
if (IS_ERR(chain)) {
NL_SET_BAD_ATTR(extack, attr);
@@ -1854,6 +1925,9 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk,
*/
int nft_register_expr(struct nft_expr_type *type)
{
+ if (!nft_expr_check_ops(type->ops))
+ return -EINVAL;
+
nfnl_lock(NFNL_SUBSYS_NFTABLES);
if (type->family == NFPROTO_UNSPEC)
list_add_tail_rcu(&type->list, &nf_tables_expressions);
@@ -1891,7 +1965,8 @@ static const struct nft_expr_type *__nft_expr_type_get(u8 family,
return NULL;
}
-static const struct nft_expr_type *nft_expr_type_get(u8 family,
+static const struct nft_expr_type *nft_expr_type_get(struct net *net,
+ u8 family,
struct nlattr *nla)
{
const struct nft_expr_type *type;
@@ -1903,19 +1978,16 @@ static const struct nft_expr_type *nft_expr_type_get(u8 family,
if (type != NULL && try_module_get(type->owner))
return type;
+ lockdep_nfnl_nft_mutex_not_held();
#ifdef CONFIG_MODULES
if (type == NULL) {
- nfnl_unlock(NFNL_SUBSYS_NFTABLES);
- request_module("nft-expr-%u-%.*s", family,
- nla_len(nla), (char *)nla_data(nla));
- nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ nft_request_module(net, "nft-expr-%u-%.*s", family,
+ nla_len(nla), (char *)nla_data(nla));
if (__nft_expr_type_get(family, nla))
return ERR_PTR(-EAGAIN);
- nfnl_unlock(NFNL_SUBSYS_NFTABLES);
- request_module("nft-expr-%.*s",
- nla_len(nla), (char *)nla_data(nla));
- nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ nft_request_module(net, "nft-expr-%.*s",
+ nla_len(nla), (char *)nla_data(nla));
if (__nft_expr_type_get(family, nla))
return ERR_PTR(-EAGAIN);
}
@@ -1984,7 +2056,7 @@ static int nf_tables_expr_parse(const struct nft_ctx *ctx,
if (err < 0)
return err;
- type = nft_expr_type_get(ctx->family, tb[NFTA_EXPR_NAME]);
+ type = nft_expr_type_get(ctx->net, ctx->family, tb[NFTA_EXPR_NAME]);
if (IS_ERR(type))
return PTR_ERR(type);
@@ -2003,6 +2075,10 @@ static int nf_tables_expr_parse(const struct nft_ctx *ctx,
err = PTR_ERR(ops);
goto err1;
}
+ if (!nft_expr_check_ops(ops)) {
+ err = -EINVAL;
+ goto err1;
+ }
} else
ops = type->ops;
@@ -2349,7 +2425,7 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk,
return PTR_ERR(table);
}
- chain = nft_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask);
+ chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN], genmask);
if (IS_ERR(chain)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]);
return PTR_ERR(chain);
@@ -2454,8 +2530,6 @@ static int nft_table_validate(struct net *net, const struct nft_table *table)
#define NFT_RULE_MAXEXPRS 128
-static struct nft_expr_info *info;
-
static int nf_tables_newrule(struct net *net, struct sock *nlsk,
struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[],
@@ -2463,6 +2537,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
u8 genmask = nft_genmask_next(net);
+ struct nft_expr_info *info = NULL;
int family = nfmsg->nfgen_family;
struct nft_table *table;
struct nft_chain *chain;
@@ -2474,10 +2549,9 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
struct nlattr *tmp;
unsigned int size, i, n, ulen = 0, usize = 0;
int err, rem;
- bool create;
u64 handle, pos_handle;
- create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false;
+ lockdep_assert_held(&net->nft.commit_mutex);
table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask);
if (IS_ERR(table)) {
@@ -2485,7 +2559,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
return PTR_ERR(table);
}
- chain = nft_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask);
+ chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN], genmask);
if (IS_ERR(chain)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]);
return PTR_ERR(chain);
@@ -2508,7 +2582,8 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
else
return -EOPNOTSUPP;
} else {
- if (!create || nlh->nlmsg_flags & NLM_F_REPLACE)
+ if (!(nlh->nlmsg_flags & NLM_F_CREATE) ||
+ nlh->nlmsg_flags & NLM_F_REPLACE)
return -EINVAL;
handle = nf_tables_alloc_handle(table);
@@ -2533,6 +2608,12 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
n = 0;
size = 0;
if (nla[NFTA_RULE_EXPRESSIONS]) {
+ info = kvmalloc_array(NFT_RULE_MAXEXPRS,
+ sizeof(struct nft_expr_info),
+ GFP_KERNEL);
+ if (!info)
+ return -ENOMEM;
+
nla_for_each_nested(tmp, nla[NFTA_RULE_EXPRESSIONS], rem) {
err = -EINVAL;
if (nla_type(tmp) != NFTA_LIST_ELEM)
@@ -2625,6 +2706,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
list_add_rcu(&rule->list, &chain->rules);
}
}
+ kvfree(info);
chain->use++;
if (net->nft.validate_state == NFT_VALIDATE_DO)
@@ -2638,6 +2720,7 @@ err1:
if (info[i].ops != NULL)
module_put(info[i].ops->type->owner);
}
+ kvfree(info);
return err;
}
@@ -2677,7 +2760,8 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk,
}
if (nla[NFTA_RULE_CHAIN]) {
- chain = nft_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask);
+ chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN],
+ genmask);
if (IS_ERR(chain)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]);
return PTR_ERR(chain);
@@ -2769,11 +2853,11 @@ nft_select_set_ops(const struct nft_ctx *ctx,
const struct nft_set_type *type;
u32 flags = 0;
+ lockdep_assert_held(&ctx->net->nft.commit_mutex);
+ lockdep_nfnl_nft_mutex_not_held();
#ifdef CONFIG_MODULES
if (list_empty(&nf_tables_set_types)) {
- nfnl_unlock(NFNL_SUBSYS_NFTABLES);
- request_module("nft-set");
- nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ nft_request_module(ctx->net, "nft-set");
if (!list_empty(&nf_tables_set_types))
return ERR_PTR(-EAGAIN);
}
@@ -3294,8 +3378,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
struct nft_set *set;
struct nft_ctx ctx;
char *name;
- unsigned int size;
- bool create;
+ u64 size;
u64 timeout;
u32 ktype, dtype, flags, policy, gc_int, objtype;
struct nft_set_desc desc;
@@ -3396,8 +3479,6 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
return err;
}
- create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false;
-
table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, genmask);
if (IS_ERR(table)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]);
@@ -3510,13 +3591,6 @@ static void nft_set_destroy(struct nft_set *set)
kvfree(set);
}
-static void nf_tables_set_destroy(const struct nft_ctx *ctx, struct nft_set *set)
-{
- list_del_rcu(&set->list);
- nf_tables_set_notify(ctx, set, NFT_MSG_DELSET, GFP_ATOMIC);
- nft_set_destroy(set);
-}
-
static int nf_tables_delset(struct net *net, struct sock *nlsk,
struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[],
@@ -3611,17 +3685,38 @@ bind:
}
EXPORT_SYMBOL_GPL(nf_tables_bind_set);
-void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set,
+void nf_tables_rebind_set(const struct nft_ctx *ctx, struct nft_set *set,
struct nft_set_binding *binding)
{
+ if (list_empty(&set->bindings) && nft_set_is_anonymous(set) &&
+ nft_is_active(ctx->net, set))
+ list_add_tail_rcu(&set->list, &ctx->table->sets);
+
+ list_add_tail_rcu(&binding->list, &set->bindings);
+}
+EXPORT_SYMBOL_GPL(nf_tables_rebind_set);
+
+void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set,
+ struct nft_set_binding *binding)
+{
list_del_rcu(&binding->list);
if (list_empty(&set->bindings) && nft_set_is_anonymous(set) &&
nft_is_active(ctx->net, set))
- nf_tables_set_destroy(ctx, set);
+ list_del_rcu(&set->list);
}
EXPORT_SYMBOL_GPL(nf_tables_unbind_set);
+void nf_tables_destroy_set(const struct nft_ctx *ctx, struct nft_set *set)
+{
+ if (list_empty(&set->bindings) && nft_set_is_anonymous(set) &&
+ nft_is_active(ctx->net, set)) {
+ nf_tables_set_notify(ctx, set, NFT_MSG_DELSET, GFP_ATOMIC);
+ nft_set_destroy(set);
+ }
+}
+EXPORT_SYMBOL_GPL(nf_tables_destroy_set);
+
const struct nft_set_ext_type nft_set_ext_types[] = {
[NFT_SET_EXT_KEY] = {
.align = __alignof__(u32),
@@ -3963,7 +4058,6 @@ static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set,
const struct nlattr *attr)
{
struct nlattr *nla[NFTA_SET_ELEM_MAX + 1];
- const struct nft_set_ext *ext;
struct nft_data_desc desc;
struct nft_set_elem elem;
struct sk_buff *skb;
@@ -3997,7 +4091,6 @@ static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set,
return PTR_ERR(priv);
elem.priv = priv;
- ext = nft_set_elem_ext(set, &elem);
err = -ENOMEM;
skb = nlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC);
@@ -4582,6 +4675,7 @@ static int nft_flush_set(const struct nft_ctx *ctx,
}
set->ndeact++;
+ nft_set_elem_deactivate(ctx->net, set, elem);
nft_trans_elem_set(trans) = set;
nft_trans_elem(trans) = *elem;
list_add_tail(&trans->list, &ctx->net->nft.commit_list);
@@ -4818,7 +4912,8 @@ static const struct nft_object_type *__nft_obj_type_get(u32 objtype)
return NULL;
}
-static const struct nft_object_type *nft_obj_type_get(u32 objtype)
+static const struct nft_object_type *
+nft_obj_type_get(struct net *net, u32 objtype)
{
const struct nft_object_type *type;
@@ -4826,11 +4921,10 @@ static const struct nft_object_type *nft_obj_type_get(u32 objtype)
if (type != NULL && try_module_get(type->owner))
return type;
+ lockdep_nfnl_nft_mutex_not_held();
#ifdef CONFIG_MODULES
if (type == NULL) {
- nfnl_unlock(NFNL_SUBSYS_NFTABLES);
- request_module("nft-obj-%u", objtype);
- nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ nft_request_module(net, "nft-obj-%u", objtype);
if (__nft_obj_type_get(objtype))
return ERR_PTR(-EAGAIN);
}
@@ -4882,7 +4976,7 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk,
nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
- type = nft_obj_type_get(objtype);
+ type = nft_obj_type_get(net, objtype);
if (IS_ERR(type))
return PTR_ERR(type);
@@ -5372,7 +5466,8 @@ static const struct nf_flowtable_type *__nft_flowtable_type_get(u8 family)
return NULL;
}
-static const struct nf_flowtable_type *nft_flowtable_type_get(u8 family)
+static const struct nf_flowtable_type *
+nft_flowtable_type_get(struct net *net, u8 family)
{
const struct nf_flowtable_type *type;
@@ -5380,11 +5475,10 @@ static const struct nf_flowtable_type *nft_flowtable_type_get(u8 family)
if (type != NULL && try_module_get(type->owner))
return type;
+ lockdep_nfnl_nft_mutex_not_held();
#ifdef CONFIG_MODULES
if (type == NULL) {
- nfnl_unlock(NFNL_SUBSYS_NFTABLES);
- request_module("nf-flowtable-%u", family);
- nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ nft_request_module(net, "nf-flowtable-%u", family);
if (__nft_flowtable_type_get(family))
return ERR_PTR(-EAGAIN);
}
@@ -5464,7 +5558,7 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
goto err1;
}
- type = nft_flowtable_type_get(family);
+ type = nft_flowtable_type_get(net, family);
if (IS_ERR(type)) {
err = PTR_ERR(type);
goto err2;
@@ -5870,18 +5964,15 @@ static int nf_tables_flowtable_event(struct notifier_block *this,
if (event != NETDEV_UNREGISTER)
return 0;
- net = maybe_get_net(dev_net(dev));
- if (!net)
- return 0;
-
- nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ net = dev_net(dev);
+ mutex_lock(&net->nft.commit_mutex);
list_for_each_entry(table, &net->nft.tables, list) {
list_for_each_entry(flowtable, &table->flowtables, list) {
nft_flowtable_event(event, dev, flowtable);
}
}
- nfnl_unlock(NFNL_SUBSYS_NFTABLES);
- put_net(net);
+ mutex_unlock(&net->nft.commit_mutex);
+
return NOTIFY_DONE;
}
@@ -6138,19 +6229,28 @@ static void nft_commit_release(struct nft_trans *trans)
nf_tables_flowtable_destroy(nft_trans_flowtable(trans));
break;
}
+
+ if (trans->put_net)
+ put_net(trans->ctx.net);
+
kfree(trans);
}
-static void nf_tables_commit_release(struct net *net)
+static void nf_tables_trans_destroy_work(struct work_struct *w)
{
struct nft_trans *trans, *next;
+ LIST_HEAD(head);
+
+ spin_lock(&nf_tables_destroy_list_lock);
+ list_splice_init(&nf_tables_destroy_list, &head);
+ spin_unlock(&nf_tables_destroy_list_lock);
- if (list_empty(&net->nft.commit_list))
+ if (list_empty(&head))
return;
synchronize_rcu();
- list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
+ list_for_each_entry_safe(trans, next, &head, list) {
list_del(&trans->list);
nft_commit_release(trans);
}
@@ -6232,9 +6332,9 @@ static void nf_tables_commit_chain_active(struct net *net, struct nft_chain *cha
next_genbit = nft_gencursor_next(net);
g0 = rcu_dereference_protected(chain->rules_gen_0,
- lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES));
+ lockdep_commit_lock_is_held(net));
g1 = rcu_dereference_protected(chain->rules_gen_1,
- lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES));
+ lockdep_commit_lock_is_held(net));
/* No changes to this chain? */
if (chain->rules_next == NULL) {
@@ -6281,6 +6381,37 @@ static void nft_chain_del(struct nft_chain *chain)
list_del_rcu(&chain->list);
}
+static void nf_tables_commit_release(struct net *net)
+{
+ struct nft_trans *trans;
+
+ /* all side effects have to be made visible.
+ * For example, if a chain named 'foo' has been deleted, a
+ * new transaction must not find it anymore.
+ *
+ * Memory reclaim happens asynchronously from work queue
+ * to prevent expensive synchronize_rcu() in commit phase.
+ */
+ if (list_empty(&net->nft.commit_list)) {
+ mutex_unlock(&net->nft.commit_mutex);
+ return;
+ }
+
+ trans = list_last_entry(&net->nft.commit_list,
+ struct nft_trans, list);
+ get_net(trans->ctx.net);
+ WARN_ON_ONCE(trans->put_net);
+
+ trans->put_net = true;
+ spin_lock(&nf_tables_destroy_list_lock);
+ list_splice_tail_init(&net->nft.commit_list, &nf_tables_destroy_list);
+ spin_unlock(&nf_tables_destroy_list_lock);
+
+ mutex_unlock(&net->nft.commit_mutex);
+
+ schedule_work(&trans_destroy_work);
+}
+
static int nf_tables_commit(struct net *net, struct sk_buff *skb)
{
struct nft_trans *trans, *next;
@@ -6442,8 +6573,8 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
}
}
- nf_tables_commit_release(net);
nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN);
+ nf_tables_commit_release(net);
return 0;
}
@@ -6595,12 +6726,25 @@ static void nf_tables_cleanup(struct net *net)
static int nf_tables_abort(struct net *net, struct sk_buff *skb)
{
- return __nf_tables_abort(net);
+ int ret = __nf_tables_abort(net);
+
+ mutex_unlock(&net->nft.commit_mutex);
+
+ return ret;
}
static bool nf_tables_valid_genid(struct net *net, u32 genid)
{
- return net->nft.base_seq == genid;
+ bool genid_ok;
+
+ mutex_lock(&net->nft.commit_mutex);
+
+ genid_ok = genid == 0 || net->nft.base_seq == genid;
+ if (!genid_ok)
+ mutex_unlock(&net->nft.commit_mutex);
+
+ /* else, commit mutex has to be released by commit or abort function */
+ return genid_ok;
}
static const struct nfnetlink_subsystem nf_tables_subsys = {
@@ -6612,6 +6756,7 @@ static const struct nfnetlink_subsystem nf_tables_subsys = {
.abort = nf_tables_abort,
.cleanup = nf_tables_cleanup,
.valid_genid = nf_tables_valid_genid,
+ .owner = THIS_MODULE,
};
int nft_chain_validate_dependency(const struct nft_chain *chain,
@@ -6931,8 +7076,8 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data,
case NFT_GOTO:
if (!tb[NFTA_VERDICT_CHAIN])
return -EINVAL;
- chain = nft_chain_lookup(ctx->table, tb[NFTA_VERDICT_CHAIN],
- genmask);
+ chain = nft_chain_lookup(ctx->net, ctx->table,
+ tb[NFTA_VERDICT_CHAIN], genmask);
if (IS_ERR(chain))
return PTR_ERR(chain);
if (nft_is_base_chain(chain))
@@ -7100,7 +7245,8 @@ int __nft_release_basechain(struct nft_ctx *ctx)
{
struct nft_rule *rule, *nr;
- BUG_ON(!nft_is_base_chain(ctx->chain));
+ if (WARN_ON(!nft_is_base_chain(ctx->chain)))
+ return 0;
nf_tables_unregister_hook(ctx->net, ctx->chain->table, ctx->chain);
list_for_each_entry_safe(rule, nr, &ctx->chain->rules, list) {
@@ -7134,9 +7280,6 @@ static void __nft_release_tables(struct net *net)
list_for_each_entry(chain, &table->chains, list)
nf_tables_unregister_hook(net, table, chain);
- list_for_each_entry(flowtable, &table->flowtables, list)
- nf_unregister_net_hooks(net, flowtable->ops,
- flowtable->ops_len);
/* No packets are walking on these chains anymore. */
ctx.table = table;
list_for_each_entry(chain, &table->chains, list) {
@@ -7177,6 +7320,7 @@ static int __net_init nf_tables_init_net(struct net *net)
{
INIT_LIST_HEAD(&net->nft.tables);
INIT_LIST_HEAD(&net->nft.commit_list);
+ mutex_init(&net->nft.commit_mutex);
net->nft.base_seq = 1;
net->nft.validate_state = NFT_VALIDATE_SKIP;
@@ -7185,11 +7329,11 @@ static int __net_init nf_tables_init_net(struct net *net)
static void __net_exit nf_tables_exit_net(struct net *net)
{
- nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ mutex_lock(&net->nft.commit_mutex);
if (!list_empty(&net->nft.commit_list))
__nf_tables_abort(net);
__nft_release_tables(net);
- nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+ mutex_unlock(&net->nft.commit_mutex);
WARN_ON_ONCE(!list_empty(&net->nft.tables));
}
@@ -7202,31 +7346,37 @@ static int __init nf_tables_module_init(void)
{
int err;
- nft_chain_filter_init();
+ spin_lock_init(&nf_tables_destroy_list_lock);
+ err = register_pernet_subsys(&nf_tables_net_ops);
+ if (err < 0)
+ return err;
- info = kmalloc_array(NFT_RULE_MAXEXPRS, sizeof(struct nft_expr_info),
- GFP_KERNEL);
- if (info == NULL) {
- err = -ENOMEM;
+ err = nft_chain_filter_init();
+ if (err < 0)
goto err1;
- }
err = nf_tables_core_module_init();
if (err < 0)
goto err2;
- err = nfnetlink_subsys_register(&nf_tables_subsys);
+ err = register_netdevice_notifier(&nf_tables_flowtable_notifier);
if (err < 0)
goto err3;
- register_netdevice_notifier(&nf_tables_flowtable_notifier);
+ /* must be last */
+ err = nfnetlink_subsys_register(&nf_tables_subsys);
+ if (err < 0)
+ goto err4;
- return register_pernet_subsys(&nf_tables_net_ops);
+ return err;
+err4:
+ unregister_netdevice_notifier(&nf_tables_flowtable_notifier);
err3:
nf_tables_core_module_exit();
err2:
- kfree(info);
+ nft_chain_filter_fini();
err1:
+ unregister_pernet_subsys(&nf_tables_net_ops);
return err;
}
@@ -7236,9 +7386,9 @@ static void __exit nf_tables_module_exit(void)
unregister_netdevice_notifier(&nf_tables_flowtable_notifier);
nft_chain_filter_fini();
unregister_pernet_subsys(&nf_tables_net_ops);
+ cancel_work_sync(&trans_destroy_work);
rcu_barrier();
nf_tables_core_module_exit();
- kfree(info);
}
module_init(nf_tables_module_init);
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index 8de912ca53d3..3fbce3b9c5ec 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -120,6 +120,20 @@ struct nft_jumpstack {
struct nft_rule *const *rules;
};
+static void expr_call_ops_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ struct nft_pktinfo *pkt)
+{
+ unsigned long e = (unsigned long)expr->ops->eval;
+
+ if (e == (unsigned long)nft_meta_get_eval)
+ nft_meta_get_eval(expr, regs, pkt);
+ else if (e == (unsigned long)nft_lookup_eval)
+ nft_lookup_eval(expr, regs, pkt);
+ else
+ expr->ops->eval(expr, regs, pkt);
+}
+
unsigned int
nft_do_chain(struct nft_pktinfo *pkt, void *priv)
{
@@ -153,7 +167,7 @@ next_rule:
nft_cmp_fast_eval(expr, &regs);
else if (expr->ops != &nft_payload_fast_ops ||
!nft_payload_fast_eval(expr, &regs, pkt))
- expr->ops->eval(expr, &regs, pkt);
+ expr_call_ops_eval(expr, &regs, pkt);
if (regs.verdict.code != NFT_CONTINUE)
break;
@@ -235,12 +249,24 @@ static struct nft_expr_type *nft_basic_types[] = {
&nft_exthdr_type,
};
+static struct nft_object_type *nft_basic_objects[] = {
+#ifdef CONFIG_NETWORK_SECMARK
+ &nft_secmark_obj_type,
+#endif
+};
+
int __init nf_tables_core_module_init(void)
{
- int err, i;
+ int err, i, j = 0;
+
+ for (i = 0; i < ARRAY_SIZE(nft_basic_objects); i++) {
+ err = nft_register_obj(nft_basic_objects[i]);
+ if (err)
+ goto err;
+ }
- for (i = 0; i < ARRAY_SIZE(nft_basic_types); i++) {
- err = nft_register_expr(nft_basic_types[i]);
+ for (j = 0; j < ARRAY_SIZE(nft_basic_types); j++) {
+ err = nft_register_expr(nft_basic_types[j]);
if (err)
goto err;
}
@@ -248,8 +274,12 @@ int __init nf_tables_core_module_init(void)
return 0;
err:
+ while (j-- > 0)
+ nft_unregister_expr(nft_basic_types[j]);
+
while (i-- > 0)
- nft_unregister_expr(nft_basic_types[i]);
+ nft_unregister_obj(nft_basic_objects[i]);
+
return err;
}
@@ -260,4 +290,8 @@ void nf_tables_core_module_exit(void)
i = ARRAY_SIZE(nft_basic_types);
while (i-- > 0)
nft_unregister_expr(nft_basic_types[i]);
+
+ i = ARRAY_SIZE(nft_basic_objects);
+ while (i-- > 0)
+ nft_unregister_obj(nft_basic_objects[i]);
}
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index e1b6be29848d..916913454624 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -331,18 +331,27 @@ replay:
}
}
- if (!ss->commit || !ss->abort) {
+ if (!ss->valid_genid || !ss->commit || !ss->abort) {
nfnl_unlock(subsys_id);
netlink_ack(oskb, nlh, -EOPNOTSUPP, NULL);
return kfree_skb(skb);
}
- if (genid && ss->valid_genid && !ss->valid_genid(net, genid)) {
+ if (!try_module_get(ss->owner)) {
+ nfnl_unlock(subsys_id);
+ netlink_ack(oskb, nlh, -EOPNOTSUPP, NULL);
+ return kfree_skb(skb);
+ }
+
+ if (!ss->valid_genid(net, genid)) {
+ module_put(ss->owner);
nfnl_unlock(subsys_id);
netlink_ack(oskb, nlh, -ERESTART, NULL);
return kfree_skb(skb);
}
+ nfnl_unlock(subsys_id);
+
while (skb->len >= nlmsg_total_size(0)) {
int msglen, type;
@@ -464,14 +473,10 @@ ack:
}
done:
if (status & NFNL_BATCH_REPLAY) {
- const struct nfnetlink_subsystem *ss2;
-
- ss2 = nfnl_dereference_protected(subsys_id);
- if (ss2 == ss)
- ss->abort(net, oskb);
+ ss->abort(net, oskb);
nfnl_err_reset(&err_list);
- nfnl_unlock(subsys_id);
kfree_skb(skb);
+ module_put(ss->owner);
goto replay;
} else if (status == NFNL_BATCH_DONE) {
err = ss->commit(net, oskb);
@@ -489,8 +494,8 @@ done:
ss->cleanup(net);
nfnl_err_deliver(&err_list, oskb);
- nfnl_unlock(subsys_id);
kfree_skb(skb);
+ module_put(ss->owner);
}
static const struct nla_policy nfnl_batch_policy[NFNL_BATCH_MAX + 1] = {
diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c
index a0e5adf0b3b6..8fa8bf7c48e6 100644
--- a/net/netfilter/nfnetlink_acct.c
+++ b/net/netfilter/nfnetlink_acct.c
@@ -238,29 +238,33 @@ static const struct nla_policy filter_policy[NFACCT_FILTER_MAX + 1] = {
[NFACCT_FILTER_VALUE] = { .type = NLA_U32 },
};
-static struct nfacct_filter *
-nfacct_filter_alloc(const struct nlattr * const attr)
+static int nfnl_acct_start(struct netlink_callback *cb)
{
- struct nfacct_filter *filter;
+ const struct nlattr *const attr = cb->data;
struct nlattr *tb[NFACCT_FILTER_MAX + 1];
+ struct nfacct_filter *filter;
int err;
+ if (!attr)
+ return 0;
+
err = nla_parse_nested(tb, NFACCT_FILTER_MAX, attr, filter_policy,
NULL);
if (err < 0)
- return ERR_PTR(err);
+ return err;
if (!tb[NFACCT_FILTER_MASK] || !tb[NFACCT_FILTER_VALUE])
- return ERR_PTR(-EINVAL);
+ return -EINVAL;
filter = kzalloc(sizeof(struct nfacct_filter), GFP_KERNEL);
if (!filter)
- return ERR_PTR(-ENOMEM);
+ return -ENOMEM;
filter->mask = ntohl(nla_get_be32(tb[NFACCT_FILTER_MASK]));
filter->value = ntohl(nla_get_be32(tb[NFACCT_FILTER_VALUE]));
+ cb->data = filter;
- return filter;
+ return 0;
}
static int nfnl_acct_get(struct net *net, struct sock *nfnl,
@@ -275,18 +279,11 @@ static int nfnl_acct_get(struct net *net, struct sock *nfnl,
if (nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.dump = nfnl_acct_dump,
+ .start = nfnl_acct_start,
.done = nfnl_acct_done,
+ .data = (void *)tb[NFACCT_FILTER],
};
- if (tb[NFACCT_FILTER]) {
- struct nfacct_filter *filter;
-
- filter = nfacct_filter_alloc(tb[NFACCT_FILTER]);
- if (IS_ERR(filter))
- return PTR_ERR(filter);
-
- c.data = filter;
- }
return netlink_dump_start(nfnl, skb, nlh, &c);
}
diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c
index 9ee5fa551fa6..e7a50af1b3d6 100644
--- a/net/netfilter/nfnetlink_cttimeout.c
+++ b/net/netfilter/nfnetlink_cttimeout.c
@@ -26,7 +26,6 @@
#include <net/sock.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
-#include <net/netfilter/nf_conntrack_l3proto.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_tuple.h>
#include <net/netfilter/nf_conntrack_timeout.h>
@@ -47,16 +46,13 @@ static const struct nla_policy cttimeout_nla_policy[CTA_TIMEOUT_MAX+1] = {
};
static int
-ctnl_timeout_parse_policy(void *timeouts,
+ctnl_timeout_parse_policy(void *timeout,
const struct nf_conntrack_l4proto *l4proto,
struct net *net, const struct nlattr *attr)
{
struct nlattr **tb;
int ret = 0;
- if (!l4proto->ctnl_timeout.nlattr_to_obj)
- return 0;
-
tb = kcalloc(l4proto->ctnl_timeout.nlattr_max + 1, sizeof(*tb),
GFP_KERNEL);
@@ -68,7 +64,7 @@ ctnl_timeout_parse_policy(void *timeouts,
if (ret < 0)
goto err;
- ret = l4proto->ctnl_timeout.nlattr_to_obj(tb, net, timeouts);
+ ret = l4proto->ctnl_timeout.nlattr_to_obj(tb, net, timeout);
err:
kfree(tb);
@@ -114,19 +110,19 @@ static int cttimeout_new_timeout(struct net *net, struct sock *ctnl,
/* You cannot replace one timeout policy by another of
* different kind, sorry.
*/
- if (matching->l3num != l3num ||
- matching->l4proto->l4proto != l4num)
+ if (matching->timeout.l3num != l3num ||
+ matching->timeout.l4proto->l4proto != l4num)
return -EINVAL;
- return ctnl_timeout_parse_policy(&matching->data,
- matching->l4proto, net,
- cda[CTA_TIMEOUT_DATA]);
+ return ctnl_timeout_parse_policy(&matching->timeout.data,
+ matching->timeout.l4proto,
+ net, cda[CTA_TIMEOUT_DATA]);
}
return -EBUSY;
}
- l4proto = nf_ct_l4proto_find_get(l3num, l4num);
+ l4proto = nf_ct_l4proto_find_get(l4num);
/* This protocol is not supportted, skip. */
if (l4proto->l4proto != l4num) {
@@ -141,14 +137,14 @@ static int cttimeout_new_timeout(struct net *net, struct sock *ctnl,
goto err_proto_put;
}
- ret = ctnl_timeout_parse_policy(&timeout->data, l4proto, net,
+ ret = ctnl_timeout_parse_policy(&timeout->timeout.data, l4proto, net,
cda[CTA_TIMEOUT_DATA]);
if (ret < 0)
goto err;
strcpy(timeout->name, nla_data(cda[CTA_TIMEOUT_NAME]));
- timeout->l3num = l3num;
- timeout->l4proto = l4proto;
+ timeout->timeout.l3num = l3num;
+ timeout->timeout.l4proto = l4proto;
refcount_set(&timeout->refcnt, 1);
list_add_tail_rcu(&timeout->head, &net->nfct_timeout_list);
@@ -167,7 +163,9 @@ ctnl_timeout_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
struct nlmsghdr *nlh;
struct nfgenmsg *nfmsg;
unsigned int flags = portid ? NLM_F_MULTI : 0;
- const struct nf_conntrack_l4proto *l4proto = timeout->l4proto;
+ const struct nf_conntrack_l4proto *l4proto = timeout->timeout.l4proto;
+ struct nlattr *nest_parms;
+ int ret;
event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_TIMEOUT, event);
nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
@@ -180,27 +178,22 @@ ctnl_timeout_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
nfmsg->res_id = 0;
if (nla_put_string(skb, CTA_TIMEOUT_NAME, timeout->name) ||
- nla_put_be16(skb, CTA_TIMEOUT_L3PROTO, htons(timeout->l3num)) ||
- nla_put_u8(skb, CTA_TIMEOUT_L4PROTO, timeout->l4proto->l4proto) ||
+ nla_put_be16(skb, CTA_TIMEOUT_L3PROTO,
+ htons(timeout->timeout.l3num)) ||
+ nla_put_u8(skb, CTA_TIMEOUT_L4PROTO, l4proto->l4proto) ||
nla_put_be32(skb, CTA_TIMEOUT_USE,
htonl(refcount_read(&timeout->refcnt))))
goto nla_put_failure;
- if (likely(l4proto->ctnl_timeout.obj_to_nlattr)) {
- struct nlattr *nest_parms;
- int ret;
+ nest_parms = nla_nest_start(skb, CTA_TIMEOUT_DATA | NLA_F_NESTED);
+ if (!nest_parms)
+ goto nla_put_failure;
- nest_parms = nla_nest_start(skb,
- CTA_TIMEOUT_DATA | NLA_F_NESTED);
- if (!nest_parms)
- goto nla_put_failure;
+ ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, &timeout->timeout.data);
+ if (ret < 0)
+ goto nla_put_failure;
- ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, &timeout->data);
- if (ret < 0)
- goto nla_put_failure;
-
- nla_nest_end(skb, nest_parms);
- }
+ nla_nest_end(skb, nest_parms);
nlmsg_end(skb, nlh);
return skb->len;
@@ -298,22 +291,6 @@ static int cttimeout_get_timeout(struct net *net, struct sock *ctnl,
return ret;
}
-static int untimeout(struct nf_conn *ct, void *timeout)
-{
- struct nf_conn_timeout *timeout_ext = nf_ct_timeout_find(ct);
-
- if (timeout_ext && (!timeout || timeout_ext->timeout == timeout))
- RCU_INIT_POINTER(timeout_ext->timeout, NULL);
-
- /* We are not intended to delete this conntrack. */
- return 0;
-}
-
-static void ctnl_untimeout(struct net *net, struct ctnl_timeout *timeout)
-{
- nf_ct_iterate_cleanup_net(net, untimeout, timeout, 0, 0);
-}
-
/* try to delete object, fail if it is still in use. */
static int ctnl_timeout_try_del(struct net *net, struct ctnl_timeout *timeout)
{
@@ -325,8 +302,8 @@ static int ctnl_timeout_try_del(struct net *net, struct ctnl_timeout *timeout)
if (refcount_dec_if_one(&timeout->refcnt)) {
/* We are protected by nfnl mutex. */
list_del_rcu(&timeout->head);
- nf_ct_l4proto_put(timeout->l4proto);
- ctnl_untimeout(net, timeout);
+ nf_ct_l4proto_put(timeout->timeout.l4proto);
+ nf_ct_untimeout(net, &timeout->timeout);
kfree_rcu(timeout, rcu_head);
} else {
ret = -EBUSY;
@@ -373,8 +350,6 @@ static int cttimeout_default_set(struct net *net, struct sock *ctnl,
struct netlink_ext_ack *extack)
{
const struct nf_conntrack_l4proto *l4proto;
- unsigned int *timeouts;
- __u16 l3num;
__u8 l4num;
int ret;
@@ -383,9 +358,8 @@ static int cttimeout_default_set(struct net *net, struct sock *ctnl,
!cda[CTA_TIMEOUT_DATA])
return -EINVAL;
- l3num = ntohs(nla_get_be16(cda[CTA_TIMEOUT_L3PROTO]));
l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]);
- l4proto = nf_ct_l4proto_find_get(l3num, l4num);
+ l4proto = nf_ct_l4proto_find_get(l4num);
/* This protocol is not supported, skip. */
if (l4proto->l4proto != l4num) {
@@ -393,9 +367,7 @@ static int cttimeout_default_set(struct net *net, struct sock *ctnl,
goto err;
}
- timeouts = l4proto->get_timeouts(net);
-
- ret = ctnl_timeout_parse_policy(timeouts, l4proto, net,
+ ret = ctnl_timeout_parse_policy(NULL, l4proto, net,
cda[CTA_TIMEOUT_DATA]);
if (ret < 0)
goto err;
@@ -409,12 +381,14 @@ err:
static int
cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid,
- u32 seq, u32 type, int event,
+ u32 seq, u32 type, int event, u16 l3num,
const struct nf_conntrack_l4proto *l4proto)
{
struct nlmsghdr *nlh;
struct nfgenmsg *nfmsg;
unsigned int flags = portid ? NLM_F_MULTI : 0;
+ struct nlattr *nest_parms;
+ int ret;
event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_TIMEOUT, event);
nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
@@ -426,26 +400,19 @@ cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid,
nfmsg->version = NFNETLINK_V0;
nfmsg->res_id = 0;
- if (nla_put_be16(skb, CTA_TIMEOUT_L3PROTO, htons(l4proto->l3proto)) ||
+ if (nla_put_be16(skb, CTA_TIMEOUT_L3PROTO, htons(l3num)) ||
nla_put_u8(skb, CTA_TIMEOUT_L4PROTO, l4proto->l4proto))
goto nla_put_failure;
- if (likely(l4proto->ctnl_timeout.obj_to_nlattr)) {
- struct nlattr *nest_parms;
- unsigned int *timeouts = l4proto->get_timeouts(net);
- int ret;
-
- nest_parms = nla_nest_start(skb,
- CTA_TIMEOUT_DATA | NLA_F_NESTED);
- if (!nest_parms)
- goto nla_put_failure;
+ nest_parms = nla_nest_start(skb, CTA_TIMEOUT_DATA | NLA_F_NESTED);
+ if (!nest_parms)
+ goto nla_put_failure;
- ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, timeouts);
- if (ret < 0)
- goto nla_put_failure;
+ ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, NULL);
+ if (ret < 0)
+ goto nla_put_failure;
- nla_nest_end(skb, nest_parms);
- }
+ nla_nest_end(skb, nest_parms);
nlmsg_end(skb, nlh);
return skb->len;
@@ -473,7 +440,7 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl,
l3num = ntohs(nla_get_be16(cda[CTA_TIMEOUT_L3PROTO]));
l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]);
- l4proto = nf_ct_l4proto_find_get(l3num, l4num);
+ l4proto = nf_ct_l4proto_find_get(l4num);
/* This protocol is not supported, skip. */
if (l4proto->l4proto != l4num) {
@@ -491,6 +458,7 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl,
nlh->nlmsg_seq,
NFNL_MSG_TYPE(nlh->nlmsg_type),
IPCTNL_MSG_TIMEOUT_DEFAULT_SET,
+ l3num,
l4proto);
if (ret <= 0) {
kfree_skb(skb2);
@@ -508,9 +476,8 @@ err:
return err;
}
-#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
-static struct ctnl_timeout *
-ctnl_timeout_find_get(struct net *net, const char *name)
+static struct nf_ct_timeout *ctnl_timeout_find_get(struct net *net,
+ const char *name)
{
struct ctnl_timeout *timeout, *matching = NULL;
@@ -529,17 +496,19 @@ ctnl_timeout_find_get(struct net *net, const char *name)
break;
}
err:
- return matching;
+ return matching ? &matching->timeout : NULL;
}
-static void ctnl_timeout_put(struct ctnl_timeout *timeout)
+static void ctnl_timeout_put(struct nf_ct_timeout *t)
{
+ struct ctnl_timeout *timeout =
+ container_of(t, struct ctnl_timeout, timeout);
+
if (refcount_dec_and_test(&timeout->refcnt))
kfree_rcu(timeout, rcu_head);
module_put(THIS_MODULE);
}
-#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
static const struct nfnl_callback cttimeout_cb[IPCTNL_MSG_TIMEOUT_MAX] = {
[IPCTNL_MSG_TIMEOUT_NEW] = { .call = cttimeout_new_timeout,
@@ -580,11 +549,11 @@ static void __net_exit cttimeout_net_exit(struct net *net)
struct ctnl_timeout *cur, *tmp;
nf_ct_unconfirmed_destroy(net);
- ctnl_untimeout(net, NULL);
+ nf_ct_untimeout(net, NULL);
list_for_each_entry_safe(cur, tmp, &net->nfct_timeout_list, head) {
list_del_rcu(&cur->head);
- nf_ct_l4proto_put(cur->l4proto);
+ nf_ct_l4proto_put(cur->timeout.l4proto);
if (refcount_dec_and_test(&cur->refcnt))
kfree_rcu(cur, rcu_head);
@@ -610,10 +579,8 @@ static int __init cttimeout_init(void)
"nfnetlink.\n");
goto err_out;
}
-#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
RCU_INIT_POINTER(nf_ct_timeout_find_get_hook, ctnl_timeout_find_get);
RCU_INIT_POINTER(nf_ct_timeout_put_hook, ctnl_timeout_put);
-#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
return 0;
err_out:
@@ -626,11 +593,9 @@ static void __exit cttimeout_exit(void)
nfnetlink_subsys_unregister(&cttimeout_subsys);
unregister_pernet_subsys(&cttimeout_ops);
-#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
RCU_INIT_POINTER(nf_ct_timeout_find_get_hook, NULL);
RCU_INIT_POINTER(nf_ct_timeout_put_hook, NULL);
synchronize_rcu();
-#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
}
module_init(cttimeout_init);
diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c
new file mode 100644
index 000000000000..6f41dd74729d
--- /dev/null
+++ b/net/netfilter/nfnetlink_osf.c
@@ -0,0 +1,432 @@
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <linux/capability.h>
+#include <linux/if.h>
+#include <linux/inetdevice.h>
+#include <linux/ip.h>
+#include <linux/list.h>
+#include <linux/rculist.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/tcp.h>
+
+#include <net/ip.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/x_tables.h>
+#include <net/netfilter/nf_log.h>
+#include <linux/netfilter/nfnetlink_osf.h>
+
+/*
+ * Indexed by dont-fragment bit.
+ * It is the only constant value in the fingerprint.
+ */
+struct list_head nf_osf_fingers[2];
+EXPORT_SYMBOL_GPL(nf_osf_fingers);
+
+static inline int nf_osf_ttl(const struct sk_buff *skb,
+ int ttl_check, unsigned char f_ttl)
+{
+ struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
+ const struct iphdr *ip = ip_hdr(skb);
+ int ret = 0;
+
+ if (ttl_check == NF_OSF_TTL_TRUE)
+ return ip->ttl == f_ttl;
+ if (ttl_check == NF_OSF_TTL_NOCHECK)
+ return 1;
+ else if (ip->ttl <= f_ttl)
+ return 1;
+
+ for_ifa(in_dev) {
+ if (inet_ifa_match(ip->saddr, ifa)) {
+ ret = (ip->ttl == f_ttl);
+ break;
+ }
+ }
+
+ endfor_ifa(in_dev);
+
+ return ret;
+}
+
+struct nf_osf_hdr_ctx {
+ bool df;
+ u16 window;
+ u16 totlen;
+ const unsigned char *optp;
+ unsigned int optsize;
+};
+
+static bool nf_osf_match_one(const struct sk_buff *skb,
+ const struct nf_osf_user_finger *f,
+ int ttl_check,
+ struct nf_osf_hdr_ctx *ctx)
+{
+ unsigned int check_WSS = 0;
+ int fmatch = FMATCH_WRONG;
+ int foptsize, optnum;
+ u16 mss = 0;
+
+ if (ctx->totlen != f->ss || !nf_osf_ttl(skb, ttl_check, f->ttl))
+ return false;
+
+ /*
+ * Should not happen if userspace parser was written correctly.
+ */
+ if (f->wss.wc >= OSF_WSS_MAX)
+ return false;
+
+ /* Check options */
+
+ foptsize = 0;
+ for (optnum = 0; optnum < f->opt_num; ++optnum)
+ foptsize += f->opt[optnum].length;
+
+ if (foptsize > MAX_IPOPTLEN ||
+ ctx->optsize > MAX_IPOPTLEN ||
+ ctx->optsize != foptsize)
+ return false;
+
+ check_WSS = f->wss.wc;
+
+ for (optnum = 0; optnum < f->opt_num; ++optnum) {
+ if (f->opt[optnum].kind == *ctx->optp) {
+ __u32 len = f->opt[optnum].length;
+ const __u8 *optend = ctx->optp + len;
+
+ fmatch = FMATCH_OK;
+
+ switch (*ctx->optp) {
+ case OSFOPT_MSS:
+ mss = ctx->optp[3];
+ mss <<= 8;
+ mss |= ctx->optp[2];
+
+ mss = ntohs((__force __be16)mss);
+ break;
+ case OSFOPT_TS:
+ break;
+ }
+
+ ctx->optp = optend;
+ } else
+ fmatch = FMATCH_OPT_WRONG;
+
+ if (fmatch != FMATCH_OK)
+ break;
+ }
+
+ if (fmatch != FMATCH_OPT_WRONG) {
+ fmatch = FMATCH_WRONG;
+
+ switch (check_WSS) {
+ case OSF_WSS_PLAIN:
+ if (f->wss.val == 0 || ctx->window == f->wss.val)
+ fmatch = FMATCH_OK;
+ break;
+ case OSF_WSS_MSS:
+ /*
+ * Some smart modems decrease mangle MSS to
+ * SMART_MSS_2, so we check standard, decreased
+ * and the one provided in the fingerprint MSS
+ * values.
+ */
+#define SMART_MSS_1 1460
+#define SMART_MSS_2 1448
+ if (ctx->window == f->wss.val * mss ||
+ ctx->window == f->wss.val * SMART_MSS_1 ||
+ ctx->window == f->wss.val * SMART_MSS_2)
+ fmatch = FMATCH_OK;
+ break;
+ case OSF_WSS_MTU:
+ if (ctx->window == f->wss.val * (mss + 40) ||
+ ctx->window == f->wss.val * (SMART_MSS_1 + 40) ||
+ ctx->window == f->wss.val * (SMART_MSS_2 + 40))
+ fmatch = FMATCH_OK;
+ break;
+ case OSF_WSS_MODULO:
+ if ((ctx->window % f->wss.val) == 0)
+ fmatch = FMATCH_OK;
+ break;
+ }
+ }
+
+ return fmatch == FMATCH_OK;
+}
+
+static const struct tcphdr *nf_osf_hdr_ctx_init(struct nf_osf_hdr_ctx *ctx,
+ const struct sk_buff *skb,
+ const struct iphdr *ip,
+ unsigned char *opts)
+{
+ const struct tcphdr *tcp;
+ struct tcphdr _tcph;
+
+ tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), &_tcph);
+ if (!tcp)
+ return NULL;
+
+ if (!tcp->syn)
+ return NULL;
+
+ ctx->totlen = ntohs(ip->tot_len);
+ ctx->df = ntohs(ip->frag_off) & IP_DF;
+ ctx->window = ntohs(tcp->window);
+
+ if (tcp->doff * 4 > sizeof(struct tcphdr)) {
+ ctx->optsize = tcp->doff * 4 - sizeof(struct tcphdr);
+
+ ctx->optp = skb_header_pointer(skb, ip_hdrlen(skb) +
+ sizeof(struct tcphdr), ctx->optsize, opts);
+ }
+
+ return tcp;
+}
+
+bool
+nf_osf_match(const struct sk_buff *skb, u_int8_t family,
+ int hooknum, struct net_device *in, struct net_device *out,
+ const struct nf_osf_info *info, struct net *net,
+ const struct list_head *nf_osf_fingers)
+{
+ const struct iphdr *ip = ip_hdr(skb);
+ const struct nf_osf_user_finger *f;
+ unsigned char opts[MAX_IPOPTLEN];
+ const struct nf_osf_finger *kf;
+ int fcount = 0, ttl_check;
+ int fmatch = FMATCH_WRONG;
+ struct nf_osf_hdr_ctx ctx;
+ const struct tcphdr *tcp;
+
+ memset(&ctx, 0, sizeof(ctx));
+
+ tcp = nf_osf_hdr_ctx_init(&ctx, skb, ip, opts);
+ if (!tcp)
+ return false;
+
+ ttl_check = (info->flags & NF_OSF_TTL) ? info->ttl : 0;
+
+ list_for_each_entry_rcu(kf, &nf_osf_fingers[ctx.df], finger_entry) {
+
+ f = &kf->finger;
+
+ if (!(info->flags & NF_OSF_LOG) && strcmp(info->genre, f->genre))
+ continue;
+
+ if (!nf_osf_match_one(skb, f, ttl_check, &ctx))
+ continue;
+
+ fmatch = FMATCH_OK;
+
+ fcount++;
+
+ if (info->flags & NF_OSF_LOG)
+ nf_log_packet(net, family, hooknum, skb,
+ in, out, NULL,
+ "%s [%s:%s] : %pI4:%d -> %pI4:%d hops=%d\n",
+ f->genre, f->version, f->subtype,
+ &ip->saddr, ntohs(tcp->source),
+ &ip->daddr, ntohs(tcp->dest),
+ f->ttl - ip->ttl);
+
+ if ((info->flags & NF_OSF_LOG) &&
+ info->loglevel == NF_OSF_LOGLEVEL_FIRST)
+ break;
+ }
+
+ if (!fcount && (info->flags & NF_OSF_LOG))
+ nf_log_packet(net, family, hooknum, skb, in, out, NULL,
+ "Remote OS is not known: %pI4:%u -> %pI4:%u\n",
+ &ip->saddr, ntohs(tcp->source),
+ &ip->daddr, ntohs(tcp->dest));
+
+ if (fcount)
+ fmatch = FMATCH_OK;
+
+ return fmatch == FMATCH_OK;
+}
+EXPORT_SYMBOL_GPL(nf_osf_match);
+
+const char *nf_osf_find(const struct sk_buff *skb,
+ const struct list_head *nf_osf_fingers,
+ const int ttl_check)
+{
+ const struct iphdr *ip = ip_hdr(skb);
+ const struct nf_osf_user_finger *f;
+ unsigned char opts[MAX_IPOPTLEN];
+ const struct nf_osf_finger *kf;
+ struct nf_osf_hdr_ctx ctx;
+ const struct tcphdr *tcp;
+ const char *genre = NULL;
+
+ memset(&ctx, 0, sizeof(ctx));
+
+ tcp = nf_osf_hdr_ctx_init(&ctx, skb, ip, opts);
+ if (!tcp)
+ return NULL;
+
+ list_for_each_entry_rcu(kf, &nf_osf_fingers[ctx.df], finger_entry) {
+ f = &kf->finger;
+ if (!nf_osf_match_one(skb, f, ttl_check, &ctx))
+ continue;
+
+ genre = f->genre;
+ break;
+ }
+
+ return genre;
+}
+EXPORT_SYMBOL_GPL(nf_osf_find);
+
+static const struct nla_policy nfnl_osf_policy[OSF_ATTR_MAX + 1] = {
+ [OSF_ATTR_FINGER] = { .len = sizeof(struct nf_osf_user_finger) },
+};
+
+static int nfnl_osf_add_callback(struct net *net, struct sock *ctnl,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
+ const struct nlattr * const osf_attrs[],
+ struct netlink_ext_ack *extack)
+{
+ struct nf_osf_user_finger *f;
+ struct nf_osf_finger *kf = NULL, *sf;
+ int err = 0;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (!osf_attrs[OSF_ATTR_FINGER])
+ return -EINVAL;
+
+ if (!(nlh->nlmsg_flags & NLM_F_CREATE))
+ return -EINVAL;
+
+ f = nla_data(osf_attrs[OSF_ATTR_FINGER]);
+
+ kf = kmalloc(sizeof(struct nf_osf_finger), GFP_KERNEL);
+ if (!kf)
+ return -ENOMEM;
+
+ memcpy(&kf->finger, f, sizeof(struct nf_osf_user_finger));
+
+ list_for_each_entry(sf, &nf_osf_fingers[!!f->df], finger_entry) {
+ if (memcmp(&sf->finger, f, sizeof(struct nf_osf_user_finger)))
+ continue;
+
+ kfree(kf);
+ kf = NULL;
+
+ if (nlh->nlmsg_flags & NLM_F_EXCL)
+ err = -EEXIST;
+ break;
+ }
+
+ /*
+ * We are protected by nfnl mutex.
+ */
+ if (kf)
+ list_add_tail_rcu(&kf->finger_entry, &nf_osf_fingers[!!f->df]);
+
+ return err;
+}
+
+static int nfnl_osf_remove_callback(struct net *net, struct sock *ctnl,
+ struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const osf_attrs[],
+ struct netlink_ext_ack *extack)
+{
+ struct nf_osf_user_finger *f;
+ struct nf_osf_finger *sf;
+ int err = -ENOENT;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (!osf_attrs[OSF_ATTR_FINGER])
+ return -EINVAL;
+
+ f = nla_data(osf_attrs[OSF_ATTR_FINGER]);
+
+ list_for_each_entry(sf, &nf_osf_fingers[!!f->df], finger_entry) {
+ if (memcmp(&sf->finger, f, sizeof(struct nf_osf_user_finger)))
+ continue;
+
+ /*
+ * We are protected by nfnl mutex.
+ */
+ list_del_rcu(&sf->finger_entry);
+ kfree_rcu(sf, rcu_head);
+
+ err = 0;
+ break;
+ }
+
+ return err;
+}
+
+static const struct nfnl_callback nfnl_osf_callbacks[OSF_MSG_MAX] = {
+ [OSF_MSG_ADD] = {
+ .call = nfnl_osf_add_callback,
+ .attr_count = OSF_ATTR_MAX,
+ .policy = nfnl_osf_policy,
+ },
+ [OSF_MSG_REMOVE] = {
+ .call = nfnl_osf_remove_callback,
+ .attr_count = OSF_ATTR_MAX,
+ .policy = nfnl_osf_policy,
+ },
+};
+
+static const struct nfnetlink_subsystem nfnl_osf_subsys = {
+ .name = "osf",
+ .subsys_id = NFNL_SUBSYS_OSF,
+ .cb_count = OSF_MSG_MAX,
+ .cb = nfnl_osf_callbacks,
+};
+
+static int __init nfnl_osf_init(void)
+{
+ int err = -EINVAL;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(nf_osf_fingers); ++i)
+ INIT_LIST_HEAD(&nf_osf_fingers[i]);
+
+ err = nfnetlink_subsys_register(&nfnl_osf_subsys);
+ if (err < 0) {
+ pr_err("Failed to register OSF nsfnetlink helper (%d)\n", err);
+ goto err_out_exit;
+ }
+ return 0;
+
+err_out_exit:
+ return err;
+}
+
+static void __exit nfnl_osf_fini(void)
+{
+ struct nf_osf_finger *f;
+ int i;
+
+ nfnetlink_subsys_unregister(&nfnl_osf_subsys);
+
+ rcu_read_lock();
+ for (i = 0; i < ARRAY_SIZE(nf_osf_fingers); ++i) {
+ list_for_each_entry_rcu(f, &nf_osf_fingers[i], finger_entry) {
+ list_del_rcu(&f->finger_entry);
+ kfree_rcu(f, rcu_head);
+ }
+ }
+ rcu_read_unlock();
+
+ rcu_barrier();
+}
+
+module_init(nfnl_osf_init);
+module_exit(nfnl_osf_fini);
+
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index ea4ba551abb2..43041f087eb3 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -233,6 +233,7 @@ static void nfqnl_reinject(struct nf_queue_entry *entry, unsigned int verdict)
int err;
if (verdict == NF_ACCEPT ||
+ verdict == NF_REPEAT ||
verdict == NF_STOP) {
rcu_read_lock();
ct_hook = rcu_dereference(nf_ct_hook);
@@ -764,7 +765,7 @@ __nfqnl_enqueue_packet_gso(struct net *net, struct nfqnl_instance *queue,
return ret;
}
- skb->next = NULL;
+ skb_mark_not_on_list(skb);
entry_seg = nf_queue_entry_dup(entry);
if (entry_seg) {
diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c
index d21834bed805..3fd540b2c6ba 100644
--- a/net/netfilter/nft_chain_filter.c
+++ b/net/netfilter/nft_chain_filter.c
@@ -293,6 +293,13 @@ static void nft_netdev_event(unsigned long event, struct net_device *dev,
if (strcmp(basechain->dev_name, dev->name) != 0)
return;
+ /* UNREGISTER events are also happpening on netns exit.
+ *
+ * Altough nf_tables core releases all tables/chains, only
+ * this event handler provides guarantee that
+ * basechain.ops->dev is still accessible, so we cannot
+ * skip exiting net namespaces.
+ */
__nft_release_basechain(ctx);
break;
case NETDEV_CHANGENAME:
@@ -318,11 +325,7 @@ static int nf_tables_netdev_event(struct notifier_block *this,
event != NETDEV_CHANGENAME)
return NOTIFY_DONE;
- ctx.net = maybe_get_net(ctx.net);
- if (!ctx.net)
- return NOTIFY_DONE;
-
- nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ mutex_lock(&ctx.net->nft.commit_mutex);
list_for_each_entry(table, &ctx.net->nft.tables, list) {
if (table->family != NFPROTO_NETDEV)
continue;
@@ -337,8 +340,7 @@ static int nf_tables_netdev_event(struct notifier_block *this,
nft_netdev_event(event, dev, &ctx);
}
}
- nfnl_unlock(NFNL_SUBSYS_NFTABLES);
- put_net(ctx.net);
+ mutex_unlock(&ctx.net->nft.commit_mutex);
return NOTIFY_DONE;
}
@@ -392,7 +394,7 @@ int __init nft_chain_filter_init(void)
return 0;
}
-void __exit nft_chain_filter_fini(void)
+void nft_chain_filter_fini(void)
{
nft_chain_filter_bridge_fini();
nft_chain_filter_inet_fini();
diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c
index fa90a8402845..79d48c1d06f4 100644
--- a/net/netfilter/nft_cmp.c
+++ b/net/netfilter/nft_cmp.c
@@ -79,7 +79,8 @@ static int nft_cmp_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
err = nft_data_init(NULL, &priv->data, sizeof(priv->data), &desc,
tb[NFTA_CMP_DATA]);
- BUG_ON(err < 0);
+ if (err < 0)
+ return err;
priv->sreg = nft_parse_register(tb[NFTA_CMP_SREG]);
err = nft_validate_register_load(priv->sreg, desc.len);
@@ -129,7 +130,8 @@ static int nft_cmp_fast_init(const struct nft_ctx *ctx,
err = nft_data_init(NULL, &data, sizeof(data), &desc,
tb[NFTA_CMP_DATA]);
- BUG_ON(err < 0);
+ if (err < 0)
+ return err;
priv->sreg = nft_parse_register(tb[NFTA_CMP_SREG]);
err = nft_validate_register_load(priv->sreg, desc.len);
diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index 32535eea51b2..768292eac2a4 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -290,6 +290,24 @@ nft_target_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
module_put(target->me);
}
+static int nft_extension_dump_info(struct sk_buff *skb, int attr,
+ const void *info,
+ unsigned int size, unsigned int user_size)
+{
+ unsigned int info_size, aligned_size = XT_ALIGN(size);
+ struct nlattr *nla;
+
+ nla = nla_reserve(skb, attr, aligned_size);
+ if (!nla)
+ return -1;
+
+ info_size = user_size ? : size;
+ memcpy(nla_data(nla), info, info_size);
+ memset(nla_data(nla) + info_size, 0, aligned_size - info_size);
+
+ return 0;
+}
+
static int nft_target_dump(struct sk_buff *skb, const struct nft_expr *expr)
{
const struct xt_target *target = expr->ops->data;
@@ -297,7 +315,8 @@ static int nft_target_dump(struct sk_buff *skb, const struct nft_expr *expr)
if (nla_put_string(skb, NFTA_TARGET_NAME, target->name) ||
nla_put_be32(skb, NFTA_TARGET_REV, htonl(target->revision)) ||
- nla_put(skb, NFTA_TARGET_INFO, XT_ALIGN(target->targetsize), info))
+ nft_extension_dump_info(skb, NFTA_TARGET_INFO, info,
+ target->targetsize, target->usersize))
goto nla_put_failure;
return 0;
@@ -532,7 +551,8 @@ static int __nft_match_dump(struct sk_buff *skb, const struct nft_expr *expr,
if (nla_put_string(skb, NFTA_MATCH_NAME, match->name) ||
nla_put_be32(skb, NFTA_MATCH_REV, htonl(match->revision)) ||
- nla_put(skb, NFTA_MATCH_INFO, XT_ALIGN(match->matchsize), info))
+ nft_extension_dump_info(skb, NFTA_MATCH_INFO, info,
+ match->matchsize, match->usersize))
goto nla_put_failure;
return 0;
diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c
index a832c59f0a9c..b90d96ba4a12 100644
--- a/net/netfilter/nft_connlimit.c
+++ b/net/netfilter/nft_connlimit.c
@@ -14,10 +14,9 @@
#include <net/netfilter/nf_conntrack_zones.h>
struct nft_connlimit {
- spinlock_t lock;
- struct hlist_head hhead;
- u32 limit;
- bool invert;
+ struct nf_conncount_list list;
+ u32 limit;
+ bool invert;
};
static inline void nft_connlimit_do_eval(struct nft_connlimit *priv,
@@ -45,21 +44,19 @@ static inline void nft_connlimit_do_eval(struct nft_connlimit *priv,
return;
}
- spin_lock_bh(&priv->lock);
- count = nf_conncount_lookup(nft_net(pkt), &priv->hhead, tuple_ptr, zone,
- &addit);
+ nf_conncount_lookup(nft_net(pkt), &priv->list, tuple_ptr, zone,
+ &addit);
+ count = priv->list.count;
if (!addit)
goto out;
- if (!nf_conncount_add(&priv->hhead, tuple_ptr, zone)) {
+ if (nf_conncount_add(&priv->list, tuple_ptr, zone) == NF_CONNCOUNT_ERR) {
regs->verdict.code = NF_DROP;
- spin_unlock_bh(&priv->lock);
return;
}
count++;
out:
- spin_unlock_bh(&priv->lock);
if ((count > priv->limit) ^ priv->invert) {
regs->verdict.code = NFT_BREAK;
@@ -87,8 +84,7 @@ static int nft_connlimit_do_init(const struct nft_ctx *ctx,
invert = true;
}
- spin_lock_init(&priv->lock);
- INIT_HLIST_HEAD(&priv->hhead);
+ nf_conncount_list_init(&priv->list);
priv->limit = limit;
priv->invert = invert;
@@ -99,7 +95,7 @@ static void nft_connlimit_do_destroy(const struct nft_ctx *ctx,
struct nft_connlimit *priv)
{
nf_ct_netns_put(ctx->net, ctx->family);
- nf_conncount_cache_free(&priv->hhead);
+ nf_conncount_cache_free(&priv->list);
}
static int nft_connlimit_do_dump(struct sk_buff *skb,
@@ -212,8 +208,7 @@ static int nft_connlimit_clone(struct nft_expr *dst, const struct nft_expr *src)
struct nft_connlimit *priv_dst = nft_expr_priv(dst);
struct nft_connlimit *priv_src = nft_expr_priv(src);
- spin_lock_init(&priv_dst->lock);
- INIT_HLIST_HEAD(&priv_dst->hhead);
+ nf_conncount_list_init(&priv_dst->list);
priv_dst->limit = priv_src->limit;
priv_dst->invert = priv_src->invert;
@@ -225,21 +220,14 @@ static void nft_connlimit_destroy_clone(const struct nft_ctx *ctx,
{
struct nft_connlimit *priv = nft_expr_priv(expr);
- nf_conncount_cache_free(&priv->hhead);
+ nf_conncount_cache_free(&priv->list);
}
static bool nft_connlimit_gc(struct net *net, const struct nft_expr *expr)
{
struct nft_connlimit *priv = nft_expr_priv(expr);
- bool addit, ret;
- spin_lock_bh(&priv->lock);
- nf_conncount_lookup(net, &priv->hhead, NULL, &nf_ct_zone_dflt, &addit);
-
- ret = hlist_empty(&priv->hhead);
- spin_unlock_bh(&priv->lock);
-
- return ret;
+ return nf_conncount_gc_list(net, &priv->list);
}
static struct nft_expr_type nft_connlimit_type;
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index 1435ffc5f57e..586627c361df 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -22,6 +22,8 @@
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_conntrack_labels.h>
+#include <net/netfilter/nf_conntrack_timeout.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
struct nft_ct {
enum nft_ct_keys key:8;
@@ -277,7 +279,7 @@ static void nft_ct_set_eval(const struct nft_expr *expr,
{
const struct nft_ct *priv = nft_expr_priv(expr);
struct sk_buff *skb = pkt->skb;
-#ifdef CONFIG_NF_CONNTRACK_MARK
+#if defined(CONFIG_NF_CONNTRACK_MARK) || defined(CONFIG_NF_CONNTRACK_SECMARK)
u32 value = regs->data[priv->sreg];
#endif
enum ip_conntrack_info ctinfo;
@@ -296,6 +298,14 @@ static void nft_ct_set_eval(const struct nft_expr *expr,
}
break;
#endif
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+ case NFT_CT_SECMARK:
+ if (ct->secmark != value) {
+ ct->secmark = value;
+ nf_conntrack_event_cache(IPCT_SECMARK, ct);
+ }
+ break;
+#endif
#ifdef CONFIG_NF_CONNTRACK_LABELS
case NFT_CT_LABELS:
nf_connlabels_replace(ct,
@@ -563,6 +573,13 @@ static int nft_ct_set_init(const struct nft_ctx *ctx,
len = sizeof(u32);
break;
#endif
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+ case NFT_CT_SECMARK:
+ if (tb[NFTA_CT_DIRECTION])
+ return -EINVAL;
+ len = sizeof(u32);
+ break;
+#endif
default:
return -EOPNOTSUPP;
}
@@ -765,6 +782,191 @@ static struct nft_expr_type nft_notrack_type __read_mostly = {
.owner = THIS_MODULE,
};
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
+static int
+nft_ct_timeout_parse_policy(void *timeouts,
+ const struct nf_conntrack_l4proto *l4proto,
+ struct net *net, const struct nlattr *attr)
+{
+ struct nlattr **tb;
+ int ret = 0;
+
+ tb = kcalloc(l4proto->ctnl_timeout.nlattr_max + 1, sizeof(*tb),
+ GFP_KERNEL);
+
+ if (!tb)
+ return -ENOMEM;
+
+ ret = nla_parse_nested(tb, l4proto->ctnl_timeout.nlattr_max,
+ attr, l4proto->ctnl_timeout.nla_policy,
+ NULL);
+ if (ret < 0)
+ goto err;
+
+ ret = l4proto->ctnl_timeout.nlattr_to_obj(tb, net, timeouts);
+
+err:
+ kfree(tb);
+ return ret;
+}
+
+struct nft_ct_timeout_obj {
+ struct nf_ct_timeout *timeout;
+ u8 l4proto;
+};
+
+static void nft_ct_timeout_obj_eval(struct nft_object *obj,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_ct_timeout_obj *priv = nft_obj_data(obj);
+ struct nf_conn *ct = (struct nf_conn *)skb_nfct(pkt->skb);
+ struct nf_conn_timeout *timeout;
+ const unsigned int *values;
+
+ if (priv->l4proto != pkt->tprot)
+ return;
+
+ if (!ct || nf_ct_is_template(ct) || nf_ct_is_confirmed(ct))
+ return;
+
+ timeout = nf_ct_timeout_find(ct);
+ if (!timeout) {
+ timeout = nf_ct_timeout_ext_add(ct, priv->timeout, GFP_ATOMIC);
+ if (!timeout) {
+ regs->verdict.code = NF_DROP;
+ return;
+ }
+ }
+
+ rcu_assign_pointer(timeout->timeout, priv->timeout);
+
+ /* adjust the timeout as per 'new' state. ct is unconfirmed,
+ * so the current timestamp must not be added.
+ */
+ values = nf_ct_timeout_data(timeout);
+ if (values)
+ nf_ct_refresh(ct, pkt->skb, values[0]);
+}
+
+static int nft_ct_timeout_obj_init(const struct nft_ctx *ctx,
+ const struct nlattr * const tb[],
+ struct nft_object *obj)
+{
+ struct nft_ct_timeout_obj *priv = nft_obj_data(obj);
+ const struct nf_conntrack_l4proto *l4proto;
+ struct nf_ct_timeout *timeout;
+ int l3num = ctx->family;
+ __u8 l4num;
+ int ret;
+
+ if (!tb[NFTA_CT_TIMEOUT_L4PROTO] ||
+ !tb[NFTA_CT_TIMEOUT_DATA])
+ return -EINVAL;
+
+ if (tb[NFTA_CT_TIMEOUT_L3PROTO])
+ l3num = ntohs(nla_get_be16(tb[NFTA_CT_TIMEOUT_L3PROTO]));
+
+ l4num = nla_get_u8(tb[NFTA_CT_TIMEOUT_L4PROTO]);
+ priv->l4proto = l4num;
+
+ l4proto = nf_ct_l4proto_find_get(l4num);
+
+ if (l4proto->l4proto != l4num) {
+ ret = -EOPNOTSUPP;
+ goto err_proto_put;
+ }
+
+ timeout = kzalloc(sizeof(struct nf_ct_timeout) +
+ l4proto->ctnl_timeout.obj_size, GFP_KERNEL);
+ if (timeout == NULL) {
+ ret = -ENOMEM;
+ goto err_proto_put;
+ }
+
+ ret = nft_ct_timeout_parse_policy(&timeout->data, l4proto, ctx->net,
+ tb[NFTA_CT_TIMEOUT_DATA]);
+ if (ret < 0)
+ goto err_free_timeout;
+
+ timeout->l3num = l3num;
+ timeout->l4proto = l4proto;
+
+ ret = nf_ct_netns_get(ctx->net, ctx->family);
+ if (ret < 0)
+ goto err_free_timeout;
+
+ priv->timeout = timeout;
+ return 0;
+
+err_free_timeout:
+ kfree(timeout);
+err_proto_put:
+ nf_ct_l4proto_put(l4proto);
+ return ret;
+}
+
+static void nft_ct_timeout_obj_destroy(const struct nft_ctx *ctx,
+ struct nft_object *obj)
+{
+ struct nft_ct_timeout_obj *priv = nft_obj_data(obj);
+ struct nf_ct_timeout *timeout = priv->timeout;
+
+ nf_ct_untimeout(ctx->net, timeout);
+ nf_ct_l4proto_put(timeout->l4proto);
+ nf_ct_netns_put(ctx->net, ctx->family);
+ kfree(priv->timeout);
+}
+
+static int nft_ct_timeout_obj_dump(struct sk_buff *skb,
+ struct nft_object *obj, bool reset)
+{
+ const struct nft_ct_timeout_obj *priv = nft_obj_data(obj);
+ const struct nf_ct_timeout *timeout = priv->timeout;
+ struct nlattr *nest_params;
+ int ret;
+
+ if (nla_put_u8(skb, NFTA_CT_TIMEOUT_L4PROTO, timeout->l4proto->l4proto) ||
+ nla_put_be16(skb, NFTA_CT_TIMEOUT_L3PROTO, htons(timeout->l3num)))
+ return -1;
+
+ nest_params = nla_nest_start(skb, NFTA_CT_TIMEOUT_DATA | NLA_F_NESTED);
+ if (!nest_params)
+ return -1;
+
+ ret = timeout->l4proto->ctnl_timeout.obj_to_nlattr(skb, &timeout->data);
+ if (ret < 0)
+ return -1;
+ nla_nest_end(skb, nest_params);
+ return 0;
+}
+
+static const struct nla_policy nft_ct_timeout_policy[NFTA_CT_TIMEOUT_MAX + 1] = {
+ [NFTA_CT_TIMEOUT_L3PROTO] = {.type = NLA_U16 },
+ [NFTA_CT_TIMEOUT_L4PROTO] = {.type = NLA_U8 },
+ [NFTA_CT_TIMEOUT_DATA] = {.type = NLA_NESTED },
+};
+
+static struct nft_object_type nft_ct_timeout_obj_type;
+
+static const struct nft_object_ops nft_ct_timeout_obj_ops = {
+ .type = &nft_ct_timeout_obj_type,
+ .size = sizeof(struct nft_ct_timeout_obj),
+ .eval = nft_ct_timeout_obj_eval,
+ .init = nft_ct_timeout_obj_init,
+ .destroy = nft_ct_timeout_obj_destroy,
+ .dump = nft_ct_timeout_obj_dump,
+};
+
+static struct nft_object_type nft_ct_timeout_obj_type __read_mostly = {
+ .type = NFT_OBJECT_CT_TIMEOUT,
+ .ops = &nft_ct_timeout_obj_ops,
+ .maxattr = NFTA_CT_TIMEOUT_MAX,
+ .policy = nft_ct_timeout_policy,
+ .owner = THIS_MODULE,
+};
+#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
+
static int nft_ct_helper_obj_init(const struct nft_ctx *ctx,
const struct nlattr * const tb[],
struct nft_object *obj)
@@ -773,6 +975,7 @@ static int nft_ct_helper_obj_init(const struct nft_ctx *ctx,
struct nf_conntrack_helper *help4, *help6;
char name[NF_CT_HELPER_NAME_LEN];
int family = ctx->family;
+ int err;
if (!tb[NFTA_CT_HELPER_NAME] || !tb[NFTA_CT_HELPER_L4PROTO])
return -EINVAL;
@@ -823,7 +1026,18 @@ static int nft_ct_helper_obj_init(const struct nft_ctx *ctx,
priv->helper4 = help4;
priv->helper6 = help6;
+ err = nf_ct_netns_get(ctx->net, ctx->family);
+ if (err < 0)
+ goto err_put_helper;
+
return 0;
+
+err_put_helper:
+ if (priv->helper4)
+ nf_conntrack_helper_put(priv->helper4);
+ if (priv->helper6)
+ nf_conntrack_helper_put(priv->helper6);
+ return err;
}
static void nft_ct_helper_obj_destroy(const struct nft_ctx *ctx,
@@ -835,6 +1049,8 @@ static void nft_ct_helper_obj_destroy(const struct nft_ctx *ctx,
nf_conntrack_helper_put(priv->helper4);
if (priv->helper6)
nf_conntrack_helper_put(priv->helper6);
+
+ nf_ct_netns_put(ctx->net, ctx->family);
}
static void nft_ct_helper_obj_eval(struct nft_object *obj,
@@ -870,7 +1086,7 @@ static void nft_ct_helper_obj_eval(struct nft_object *obj,
if (test_bit(IPS_HELPER_BIT, &ct->status))
return;
- help = nf_ct_helper_ext_add(ct, to_assign, GFP_ATOMIC);
+ help = nf_ct_helper_ext_add(ct, GFP_ATOMIC);
if (help) {
rcu_assign_pointer(help->helper, to_assign);
set_bit(IPS_HELPER_BIT, &ct->status);
@@ -949,9 +1165,17 @@ static int __init nft_ct_module_init(void)
err = nft_register_obj(&nft_ct_helper_obj_type);
if (err < 0)
goto err2;
-
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
+ err = nft_register_obj(&nft_ct_timeout_obj_type);
+ if (err < 0)
+ goto err3;
+#endif
return 0;
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
+err3:
+ nft_unregister_obj(&nft_ct_helper_obj_type);
+#endif
err2:
nft_unregister_expr(&nft_notrack_type);
err1:
@@ -961,6 +1185,9 @@ err1:
static void __exit nft_ct_module_exit(void)
{
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
+ nft_unregister_obj(&nft_ct_timeout_obj_type);
+#endif
nft_unregister_obj(&nft_ct_helper_obj_type);
nft_unregister_expr(&nft_notrack_type);
nft_unregister_expr(&nft_ct_type);
@@ -974,3 +1201,4 @@ MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
MODULE_ALIAS_NFT_EXPR("ct");
MODULE_ALIAS_NFT_EXPR("notrack");
MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_HELPER);
+MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_TIMEOUT);
diff --git a/net/netfilter/nft_dup_netdev.c b/net/netfilter/nft_dup_netdev.c
index 2cc1e0ef56e8..15cc62b293d6 100644
--- a/net/netfilter/nft_dup_netdev.c
+++ b/net/netfilter/nft_dup_netdev.c
@@ -46,8 +46,6 @@ static int nft_dup_netdev_init(const struct nft_ctx *ctx,
return nft_validate_register_load(priv->sreg_dev, sizeof(int));
}
-static const struct nft_expr_ops nft_dup_netdev_ingress_ops;
-
static int nft_dup_netdev_dump(struct sk_buff *skb, const struct nft_expr *expr)
{
struct nft_dup_netdev *priv = nft_expr_priv(expr);
diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index 27d7e4598ab6..07d4efd3d851 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -118,6 +118,8 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
u64 timeout;
int err;
+ lockdep_assert_held(&ctx->net->nft.commit_mutex);
+
if (tb[NFTA_DYNSET_SET_NAME] == NULL ||
tb[NFTA_DYNSET_OP] == NULL ||
tb[NFTA_DYNSET_SREG_KEY] == NULL)
@@ -185,8 +187,6 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
if (tb[NFTA_DYNSET_EXPR] != NULL) {
if (!(set->flags & NFT_SET_EVAL))
return -EINVAL;
- if (!nft_set_is_anonymous(set))
- return -EOPNOTSUPP;
priv->expr = nft_expr_init(ctx, tb[NFTA_DYNSET_EXPR]);
if (IS_ERR(priv->expr))
@@ -235,14 +235,31 @@ err1:
return err;
}
+static void nft_dynset_activate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ struct nft_dynset *priv = nft_expr_priv(expr);
+
+ nf_tables_rebind_set(ctx, priv->set, &priv->binding);
+}
+
+static void nft_dynset_deactivate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ struct nft_dynset *priv = nft_expr_priv(expr);
+
+ nf_tables_unbind_set(ctx, priv->set, &priv->binding);
+}
+
static void nft_dynset_destroy(const struct nft_ctx *ctx,
const struct nft_expr *expr)
{
struct nft_dynset *priv = nft_expr_priv(expr);
- nf_tables_unbind_set(ctx, priv->set, &priv->binding);
if (priv->expr != NULL)
nft_expr_destroy(ctx, priv->expr);
+
+ nf_tables_destroy_set(ctx, priv->set);
}
static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr)
@@ -279,6 +296,8 @@ static const struct nft_expr_ops nft_dynset_ops = {
.eval = nft_dynset_eval,
.init = nft_dynset_init,
.destroy = nft_dynset_destroy,
+ .activate = nft_dynset_activate,
+ .deactivate = nft_dynset_deactivate,
.dump = nft_dynset_dump,
};
diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
index d6bab8c3cbb0..e82d9a966c45 100644
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -201,7 +201,7 @@ static int flow_offload_netdev_event(struct notifier_block *this,
if (event != NETDEV_DOWN)
return NOTIFY_DONE;
- nf_flow_table_cleanup(dev_net(dev), dev);
+ nf_flow_table_cleanup(dev);
return NOTIFY_DONE;
}
diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c
index 8abb9891cdf2..d7694e7255a0 100644
--- a/net/netfilter/nft_fwd_netdev.c
+++ b/net/netfilter/nft_fwd_netdev.c
@@ -53,8 +53,6 @@ static int nft_fwd_netdev_init(const struct nft_ctx *ctx,
return nft_validate_register_load(priv->sreg_dev, sizeof(int));
}
-static const struct nft_expr_ops nft_fwd_netdev_ingress_ops;
-
static int nft_fwd_netdev_dump(struct sk_buff *skb, const struct nft_expr *expr)
{
struct nft_fwd_netdev *priv = nft_expr_priv(expr);
@@ -169,8 +167,6 @@ static int nft_fwd_neigh_init(const struct nft_ctx *ctx,
return nft_validate_register_load(priv->sreg_addr, addr_len);
}
-static const struct nft_expr_ops nft_fwd_netdev_ingress_ops;
-
static int nft_fwd_neigh_dump(struct sk_buff *skb, const struct nft_expr *expr)
{
struct nft_fwd_neigh *priv = nft_expr_priv(expr);
diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c
index c2a1d84cdfc4..227b2b15a19c 100644
--- a/net/netfilter/nft_lookup.c
+++ b/net/netfilter/nft_lookup.c
@@ -26,9 +26,9 @@ struct nft_lookup {
struct nft_set_binding binding;
};
-static void nft_lookup_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
+void nft_lookup_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
{
const struct nft_lookup *priv = nft_expr_priv(expr);
const struct nft_set *set = priv->set;
@@ -121,12 +121,28 @@ static int nft_lookup_init(const struct nft_ctx *ctx,
return 0;
}
+static void nft_lookup_activate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ struct nft_lookup *priv = nft_expr_priv(expr);
+
+ nf_tables_rebind_set(ctx, priv->set, &priv->binding);
+}
+
+static void nft_lookup_deactivate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ struct nft_lookup *priv = nft_expr_priv(expr);
+
+ nf_tables_unbind_set(ctx, priv->set, &priv->binding);
+}
+
static void nft_lookup_destroy(const struct nft_ctx *ctx,
const struct nft_expr *expr)
{
struct nft_lookup *priv = nft_expr_priv(expr);
- nf_tables_unbind_set(ctx, priv->set, &priv->binding);
+ nf_tables_destroy_set(ctx, priv->set);
}
static int nft_lookup_dump(struct sk_buff *skb, const struct nft_expr *expr)
@@ -209,6 +225,8 @@ static const struct nft_expr_ops nft_lookup_ops = {
.size = NFT_EXPR_SIZE(sizeof(struct nft_lookup)),
.eval = nft_lookup_eval,
.init = nft_lookup_init,
+ .activate = nft_lookup_activate,
+ .deactivate = nft_lookup_deactivate,
.destroy = nft_lookup_destroy,
.dump = nft_lookup_dump,
.validate = nft_lookup_validate,
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index 1105a23bda5e..6180626c3f80 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -41,9 +41,9 @@ static DEFINE_PER_CPU(struct rnd_state, nft_prandom_state);
#include "../bridge/br_private.h"
#endif
-static void nft_meta_get_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
+void nft_meta_get_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
{
const struct nft_meta *priv = nft_expr_priv(expr);
const struct sk_buff *skb = pkt->skb;
@@ -107,7 +107,8 @@ static void nft_meta_get_eval(const struct nft_expr *expr,
break;
case NFT_META_SKUID:
sk = skb_to_full_sk(skb);
- if (!sk || !sk_fullsock(sk))
+ if (!sk || !sk_fullsock(sk) ||
+ !net_eq(nft_net(pkt), sock_net(sk)))
goto err;
read_lock_bh(&sk->sk_callback_lock);
@@ -123,7 +124,8 @@ static void nft_meta_get_eval(const struct nft_expr *expr,
break;
case NFT_META_SKGID:
sk = skb_to_full_sk(skb);
- if (!sk || !sk_fullsock(sk))
+ if (!sk || !sk_fullsock(sk) ||
+ !net_eq(nft_net(pkt), sock_net(sk)))
goto err;
read_lock_bh(&sk->sk_callback_lock);
@@ -214,7 +216,8 @@ static void nft_meta_get_eval(const struct nft_expr *expr,
#ifdef CONFIG_CGROUP_NET_CLASSID
case NFT_META_CGROUP:
sk = skb_to_full_sk(skb);
- if (!sk || !sk_fullsock(sk))
+ if (!sk || !sk_fullsock(sk) ||
+ !net_eq(nft_net(pkt), sock_net(sk)))
goto err;
*dest = sock_cgroup_classid(&sk->sk_cgrp_data);
break;
@@ -281,6 +284,11 @@ static void nft_meta_set_eval(const struct nft_expr *expr,
skb->nf_trace = !!value8;
break;
+#ifdef CONFIG_NETWORK_SECMARK
+ case NFT_META_SECMARK:
+ skb->secmark = value;
+ break;
+#endif
default:
WARN_ON(1);
}
@@ -433,6 +441,9 @@ static int nft_meta_set_init(const struct nft_ctx *ctx,
switch (priv->key) {
case NFT_META_MARK:
case NFT_META_PRIORITY:
+#ifdef CONFIG_NETWORK_SECMARK
+ case NFT_META_SECMARK:
+#endif
len = sizeof(u32);
break;
case NFT_META_NFTRACE:
@@ -540,3 +551,111 @@ struct nft_expr_type nft_meta_type __read_mostly = {
.maxattr = NFTA_META_MAX,
.owner = THIS_MODULE,
};
+
+#ifdef CONFIG_NETWORK_SECMARK
+struct nft_secmark {
+ u32 secid;
+ char *ctx;
+};
+
+static const struct nla_policy nft_secmark_policy[NFTA_SECMARK_MAX + 1] = {
+ [NFTA_SECMARK_CTX] = { .type = NLA_STRING, .len = NFT_SECMARK_CTX_MAXLEN },
+};
+
+static int nft_secmark_compute_secid(struct nft_secmark *priv)
+{
+ u32 tmp_secid = 0;
+ int err;
+
+ err = security_secctx_to_secid(priv->ctx, strlen(priv->ctx), &tmp_secid);
+ if (err)
+ return err;
+
+ if (!tmp_secid)
+ return -ENOENT;
+
+ err = security_secmark_relabel_packet(tmp_secid);
+ if (err)
+ return err;
+
+ priv->secid = tmp_secid;
+ return 0;
+}
+
+static void nft_secmark_obj_eval(struct nft_object *obj, struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_secmark *priv = nft_obj_data(obj);
+ struct sk_buff *skb = pkt->skb;
+
+ skb->secmark = priv->secid;
+}
+
+static int nft_secmark_obj_init(const struct nft_ctx *ctx,
+ const struct nlattr * const tb[],
+ struct nft_object *obj)
+{
+ struct nft_secmark *priv = nft_obj_data(obj);
+ int err;
+
+ if (tb[NFTA_SECMARK_CTX] == NULL)
+ return -EINVAL;
+
+ priv->ctx = nla_strdup(tb[NFTA_SECMARK_CTX], GFP_KERNEL);
+ if (!priv->ctx)
+ return -ENOMEM;
+
+ err = nft_secmark_compute_secid(priv);
+ if (err) {
+ kfree(priv->ctx);
+ return err;
+ }
+
+ security_secmark_refcount_inc();
+
+ return 0;
+}
+
+static int nft_secmark_obj_dump(struct sk_buff *skb, struct nft_object *obj,
+ bool reset)
+{
+ struct nft_secmark *priv = nft_obj_data(obj);
+ int err;
+
+ if (nla_put_string(skb, NFTA_SECMARK_CTX, priv->ctx))
+ return -1;
+
+ if (reset) {
+ err = nft_secmark_compute_secid(priv);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static void nft_secmark_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj)
+{
+ struct nft_secmark *priv = nft_obj_data(obj);
+
+ security_secmark_refcount_dec();
+
+ kfree(priv->ctx);
+}
+
+static const struct nft_object_ops nft_secmark_obj_ops = {
+ .type = &nft_secmark_obj_type,
+ .size = sizeof(struct nft_secmark),
+ .init = nft_secmark_obj_init,
+ .eval = nft_secmark_obj_eval,
+ .dump = nft_secmark_obj_dump,
+ .destroy = nft_secmark_obj_destroy,
+};
+struct nft_object_type nft_secmark_obj_type __read_mostly = {
+ .type = NFT_OBJECT_SECMARK,
+ .ops = &nft_secmark_obj_ops,
+ .maxattr = NFTA_SECMARK_MAX,
+ .policy = nft_secmark_policy,
+ .owner = THIS_MODULE,
+};
+#endif /* CONFIG_NETWORK_SECMARK */
diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c
index 1f4d0854cf70..649d1700ec5b 100644
--- a/net/netfilter/nft_numgen.c
+++ b/net/netfilter/nft_numgen.c
@@ -237,10 +237,8 @@ static int nft_ng_random_map_init(const struct nft_ctx *ctx,
priv->map = nft_set_lookup_global(ctx->net, ctx->table,
tb[NFTA_NG_SET_NAME],
tb[NFTA_NG_SET_ID], genmask);
- if (IS_ERR(priv->map))
- return PTR_ERR(priv->map);
- return 0;
+ return PTR_ERR_OR_ZERO(priv->map);
}
static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr)
diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c
index cdf348f751ec..a3185ca2a3a9 100644
--- a/net/netfilter/nft_objref.c
+++ b/net/netfilter/nft_objref.c
@@ -155,12 +155,28 @@ nla_put_failure:
return -1;
}
+static void nft_objref_map_activate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ struct nft_objref_map *priv = nft_expr_priv(expr);
+
+ nf_tables_rebind_set(ctx, priv->set, &priv->binding);
+}
+
+static void nft_objref_map_deactivate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ struct nft_objref_map *priv = nft_expr_priv(expr);
+
+ nf_tables_unbind_set(ctx, priv->set, &priv->binding);
+}
+
static void nft_objref_map_destroy(const struct nft_ctx *ctx,
const struct nft_expr *expr)
{
struct nft_objref_map *priv = nft_expr_priv(expr);
- nf_tables_unbind_set(ctx, priv->set, &priv->binding);
+ nf_tables_destroy_set(ctx, priv->set);
}
static struct nft_expr_type nft_objref_type;
@@ -169,6 +185,8 @@ static const struct nft_expr_ops nft_objref_map_ops = {
.size = NFT_EXPR_SIZE(sizeof(struct nft_objref_map)),
.eval = nft_objref_map_eval,
.init = nft_objref_map_init,
+ .activate = nft_objref_map_activate,
+ .deactivate = nft_objref_map_deactivate,
.destroy = nft_objref_map_destroy,
.dump = nft_objref_map_dump,
};
diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c
new file mode 100644
index 000000000000..ca5e5d8c5ef8
--- /dev/null
+++ b/net/netfilter/nft_osf.c
@@ -0,0 +1,127 @@
+#include <net/ip.h>
+#include <net/tcp.h>
+
+#include <net/netfilter/nf_tables.h>
+#include <linux/netfilter/nfnetlink_osf.h>
+
+struct nft_osf {
+ enum nft_registers dreg:8;
+ u8 ttl;
+};
+
+static const struct nla_policy nft_osf_policy[NFTA_OSF_MAX + 1] = {
+ [NFTA_OSF_DREG] = { .type = NLA_U32 },
+ [NFTA_OSF_TTL] = { .type = NLA_U8 },
+};
+
+static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_osf *priv = nft_expr_priv(expr);
+ u32 *dest = &regs->data[priv->dreg];
+ struct sk_buff *skb = pkt->skb;
+ const struct tcphdr *tcp;
+ struct tcphdr _tcph;
+ const char *os_name;
+
+ tcp = skb_header_pointer(skb, ip_hdrlen(skb),
+ sizeof(struct tcphdr), &_tcph);
+ if (!tcp) {
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
+ if (!tcp->syn) {
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
+
+ os_name = nf_osf_find(skb, nf_osf_fingers, priv->ttl);
+ if (!os_name)
+ strncpy((char *)dest, "unknown", NFT_OSF_MAXGENRELEN);
+ else
+ strncpy((char *)dest, os_name, NFT_OSF_MAXGENRELEN);
+}
+
+static int nft_osf_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_osf *priv = nft_expr_priv(expr);
+ int err;
+ u8 ttl;
+
+ if (nla_get_u8(tb[NFTA_OSF_TTL])) {
+ ttl = nla_get_u8(tb[NFTA_OSF_TTL]);
+ if (ttl > 2)
+ return -EINVAL;
+ priv->ttl = ttl;
+ }
+
+ priv->dreg = nft_parse_register(tb[NFTA_OSF_DREG]);
+ err = nft_validate_register_store(ctx, priv->dreg, NULL,
+ NFT_DATA_VALUE, NFT_OSF_MAXGENRELEN);
+ if (err < 0)
+ return err;
+
+ return 0;
+}
+
+static int nft_osf_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_osf *priv = nft_expr_priv(expr);
+
+ if (nla_put_u8(skb, NFTA_OSF_TTL, priv->ttl))
+ goto nla_put_failure;
+
+ if (nft_dump_register(skb, NFTA_OSF_DREG, priv->dreg))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static int nft_osf_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nft_data **data)
+{
+ return nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_FORWARD));
+}
+
+static struct nft_expr_type nft_osf_type;
+static const struct nft_expr_ops nft_osf_op = {
+ .eval = nft_osf_eval,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_osf)),
+ .init = nft_osf_init,
+ .dump = nft_osf_dump,
+ .type = &nft_osf_type,
+ .validate = nft_osf_validate,
+};
+
+static struct nft_expr_type nft_osf_type __read_mostly = {
+ .ops = &nft_osf_op,
+ .name = "osf",
+ .owner = THIS_MODULE,
+ .policy = nft_osf_policy,
+ .maxattr = NFTA_OSF_MAX,
+};
+
+static int __init nft_osf_module_init(void)
+{
+ return nft_register_expr(&nft_osf_type);
+}
+
+static void __exit nft_osf_module_exit(void)
+{
+ return nft_unregister_expr(&nft_osf_type);
+}
+
+module_init(nft_osf_module_init);
+module_exit(nft_osf_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Fernando Fernandez <ffmancera@riseup.net>");
+MODULE_ALIAS_NFT_EXPR("osf");
diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c
index 29f5bd2377b0..b48e58cceeb7 100644
--- a/net/netfilter/nft_reject.c
+++ b/net/netfilter/nft_reject.c
@@ -94,7 +94,8 @@ static u8 icmp_code_v4[NFT_REJECT_ICMPX_MAX + 1] = {
int nft_reject_icmp_code(u8 code)
{
- BUG_ON(code > NFT_REJECT_ICMPX_MAX);
+ if (WARN_ON_ONCE(code > NFT_REJECT_ICMPX_MAX))
+ return ICMP_NET_UNREACH;
return icmp_code_v4[code];
}
@@ -111,7 +112,8 @@ static u8 icmp_code_v6[NFT_REJECT_ICMPX_MAX + 1] = {
int nft_reject_icmpv6_code(u8 code)
{
- BUG_ON(code > NFT_REJECT_ICMPX_MAX);
+ if (WARN_ON_ONCE(code > NFT_REJECT_ICMPX_MAX))
+ return ICMPV6_NOROUTE;
return icmp_code_v6[code];
}
diff --git a/net/netfilter/nft_rt.c b/net/netfilter/nft_rt.c
index 76dba9f6b6f6..f35fa33913ae 100644
--- a/net/netfilter/nft_rt.c
+++ b/net/netfilter/nft_rt.c
@@ -90,6 +90,11 @@ static void nft_rt_get_eval(const struct nft_expr *expr,
case NFT_RT_TCPMSS:
nft_reg_store16(dest, get_tcpmss(pkt, dst));
break;
+#ifdef CONFIG_XFRM
+ case NFT_RT_XFRM:
+ nft_reg_store8(dest, !!dst->xfrm);
+ break;
+#endif
default:
WARN_ON(1);
goto err;
@@ -130,6 +135,11 @@ static int nft_rt_get_init(const struct nft_ctx *ctx,
case NFT_RT_TCPMSS:
len = sizeof(u16);
break;
+#ifdef CONFIG_XFRM
+ case NFT_RT_XFRM:
+ len = sizeof(u8);
+ break;
+#endif
default:
return -EOPNOTSUPP;
}
@@ -164,6 +174,7 @@ static int nft_rt_validate(const struct nft_ctx *ctx, const struct nft_expr *exp
case NFT_RT_NEXTHOP4:
case NFT_RT_NEXTHOP6:
case NFT_RT_CLASSID:
+ case NFT_RT_XFRM:
return 0;
case NFT_RT_TCPMSS:
hooks = (1 << NF_INET_FORWARD) |
diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c
index 128bc16f52dd..f866bd41e5d2 100644
--- a/net/netfilter/nft_set_bitmap.c
+++ b/net/netfilter/nft_set_bitmap.c
@@ -248,13 +248,13 @@ static inline u32 nft_bitmap_size(u32 klen)
return ((2 << ((klen * BITS_PER_BYTE) - 1)) / BITS_PER_BYTE) << 1;
}
-static inline u32 nft_bitmap_total_size(u32 klen)
+static inline u64 nft_bitmap_total_size(u32 klen)
{
return sizeof(struct nft_bitmap) + nft_bitmap_size(klen);
}
-static unsigned int nft_bitmap_privsize(const struct nlattr * const nla[],
- const struct nft_set_desc *desc)
+static u64 nft_bitmap_privsize(const struct nlattr * const nla[],
+ const struct nft_set_desc *desc)
{
u32 klen = ntohl(nla_get_be32(nla[NFTA_SET_KEY_LEN]));
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index 90c3e7e6cacb..339a9dd1c832 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -88,7 +88,7 @@ static bool nft_rhash_lookup(const struct net *net, const struct nft_set *set,
.key = key,
};
- he = rhashtable_lookup_fast(&priv->ht, &arg, nft_rhash_params);
+ he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params);
if (he != NULL)
*ext = &he->ext;
@@ -106,7 +106,7 @@ static void *nft_rhash_get(const struct net *net, const struct nft_set *set,
.key = elem->key.val.data,
};
- he = rhashtable_lookup_fast(&priv->ht, &arg, nft_rhash_params);
+ he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params);
if (he != NULL)
return he;
@@ -129,7 +129,7 @@ static bool nft_rhash_update(struct nft_set *set, const u32 *key,
.key = key,
};
- he = rhashtable_lookup_fast(&priv->ht, &arg, nft_rhash_params);
+ he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params);
if (he != NULL)
goto out;
@@ -217,7 +217,7 @@ static void *nft_rhash_deactivate(const struct net *net,
};
rcu_read_lock();
- he = rhashtable_lookup_fast(&priv->ht, &arg, nft_rhash_params);
+ he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params);
if (he != NULL &&
!nft_rhash_flush(net, set, he))
he = NULL;
@@ -244,21 +244,15 @@ static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set,
struct nft_rhash_elem *he;
struct rhashtable_iter hti;
struct nft_set_elem elem;
- int err;
-
- err = rhashtable_walk_init(&priv->ht, &hti, GFP_ATOMIC);
- iter->err = err;
- if (err)
- return;
+ rhashtable_walk_enter(&priv->ht, &hti);
rhashtable_walk_start(&hti);
while ((he = rhashtable_walk_next(&hti))) {
if (IS_ERR(he)) {
- err = PTR_ERR(he);
- if (err != -EAGAIN) {
- iter->err = err;
- goto out;
+ if (PTR_ERR(he) != -EAGAIN) {
+ iter->err = PTR_ERR(he);
+ break;
}
continue;
@@ -275,13 +269,11 @@ static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set,
iter->err = iter->fn(ctx, set, iter, &elem);
if (iter->err < 0)
- goto out;
+ break;
cont:
iter->count++;
}
-
-out:
rhashtable_walk_stop(&hti);
rhashtable_walk_exit(&hti);
}
@@ -293,21 +285,17 @@ static void nft_rhash_gc(struct work_struct *work)
struct nft_rhash *priv;
struct nft_set_gc_batch *gcb = NULL;
struct rhashtable_iter hti;
- int err;
priv = container_of(work, struct nft_rhash, gc_work.work);
set = nft_set_container_of(priv);
- err = rhashtable_walk_init(&priv->ht, &hti, GFP_KERNEL);
- if (err)
- goto schedule;
-
+ rhashtable_walk_enter(&priv->ht, &hti);
rhashtable_walk_start(&hti);
while ((he = rhashtable_walk_next(&hti))) {
if (IS_ERR(he)) {
if (PTR_ERR(he) != -EAGAIN)
- goto out;
+ break;
continue;
}
@@ -326,23 +314,21 @@ gc:
gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
if (gcb == NULL)
- goto out;
+ break;
rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params);
atomic_dec(&set->nelems);
nft_set_gc_batch_add(gcb, he);
}
-out:
rhashtable_walk_stop(&hti);
rhashtable_walk_exit(&hti);
nft_set_gc_batch_complete(gcb);
-schedule:
queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
nft_set_gc_interval(set));
}
-static unsigned int nft_rhash_privsize(const struct nlattr * const nla[],
- const struct nft_set_desc *desc)
+static u64 nft_rhash_privsize(const struct nlattr * const nla[],
+ const struct nft_set_desc *desc)
{
return sizeof(struct nft_rhash);
}
@@ -585,8 +571,8 @@ cont:
}
}
-static unsigned int nft_hash_privsize(const struct nlattr * const nla[],
- const struct nft_set_desc *desc)
+static u64 nft_hash_privsize(const struct nlattr * const nla[],
+ const struct nft_set_desc *desc)
{
return sizeof(struct nft_hash) +
nft_hash_buckets(desc->size) * sizeof(struct hlist_head);
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index 9873d734b494..fa61208371f8 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -135,9 +135,12 @@ static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set,
d = memcmp(this, key, set->klen);
if (d < 0) {
parent = rcu_dereference_raw(parent->rb_left);
- interval = rbe;
+ if (!(flags & NFT_SET_ELEM_INTERVAL_END))
+ interval = rbe;
} else if (d > 0) {
parent = rcu_dereference_raw(parent->rb_right);
+ if (flags & NFT_SET_ELEM_INTERVAL_END)
+ interval = rbe;
} else {
if (!nft_set_elem_active(&rbe->ext, genmask))
parent = rcu_dereference_raw(parent->rb_left);
@@ -154,7 +157,10 @@ static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set,
if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
nft_set_elem_active(&interval->ext, genmask) &&
- !nft_rbtree_interval_end(interval)) {
+ ((!nft_rbtree_interval_end(interval) &&
+ !(flags & NFT_SET_ELEM_INTERVAL_END)) ||
+ (nft_rbtree_interval_end(interval) &&
+ (flags & NFT_SET_ELEM_INTERVAL_END)))) {
*elem = interval;
return true;
}
@@ -355,12 +361,11 @@ cont:
static void nft_rbtree_gc(struct work_struct *work)
{
+ struct nft_rbtree_elem *rbe, *rbe_end = NULL, *rbe_prev = NULL;
struct nft_set_gc_batch *gcb = NULL;
- struct rb_node *node, *prev = NULL;
- struct nft_rbtree_elem *rbe;
struct nft_rbtree *priv;
+ struct rb_node *node;
struct nft_set *set;
- int i;
priv = container_of(work, struct nft_rbtree, gc_work.work);
set = nft_set_container_of(priv);
@@ -371,7 +376,7 @@ static void nft_rbtree_gc(struct work_struct *work)
rbe = rb_entry(node, struct nft_rbtree_elem, node);
if (nft_rbtree_interval_end(rbe)) {
- prev = node;
+ rbe_end = rbe;
continue;
}
if (!nft_set_elem_expired(&rbe->ext))
@@ -379,29 +384,30 @@ static void nft_rbtree_gc(struct work_struct *work)
if (nft_set_elem_mark_busy(&rbe->ext))
continue;
+ if (rbe_prev) {
+ rb_erase(&rbe_prev->node, &priv->root);
+ rbe_prev = NULL;
+ }
gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
if (!gcb)
break;
atomic_dec(&set->nelems);
nft_set_gc_batch_add(gcb, rbe);
+ rbe_prev = rbe;
- if (prev) {
- rbe = rb_entry(prev, struct nft_rbtree_elem, node);
+ if (rbe_end) {
atomic_dec(&set->nelems);
- nft_set_gc_batch_add(gcb, rbe);
- prev = NULL;
+ nft_set_gc_batch_add(gcb, rbe_end);
+ rb_erase(&rbe_end->node, &priv->root);
+ rbe_end = NULL;
}
node = rb_next(node);
if (!node)
break;
}
- if (gcb) {
- for (i = 0; i < gcb->head.cnt; i++) {
- rbe = gcb->elems[i];
- rb_erase(&rbe->node, &priv->root);
- }
- }
+ if (rbe_prev)
+ rb_erase(&rbe_prev->node, &priv->root);
write_seqcount_end(&priv->count);
write_unlock_bh(&priv->lock);
@@ -411,8 +417,8 @@ static void nft_rbtree_gc(struct work_struct *work)
nft_set_gc_interval(set));
}
-static unsigned int nft_rbtree_privsize(const struct nlattr * const nla[],
- const struct nft_set_desc *desc)
+static u64 nft_rbtree_privsize(const struct nlattr * const nla[],
+ const struct nft_set_desc *desc)
{
return sizeof(struct nft_rbtree);
}
diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c
index 74e1b3bd6954..d7f3776dfd71 100644
--- a/net/netfilter/nft_socket.c
+++ b/net/netfilter/nft_socket.c
@@ -23,12 +23,15 @@ static void nft_socket_eval(const struct nft_expr *expr,
struct sock *sk = skb->sk;
u32 *dest = &regs->data[priv->dreg];
+ if (sk && !net_eq(nft_net(pkt), sock_net(sk)))
+ sk = NULL;
+
if (!sk)
switch(nft_pf(pkt)) {
case NFPROTO_IPV4:
sk = nf_sk_lookup_slow_v4(nft_net(pkt), skb, nft_in(pkt));
break;
-#if IS_ENABLED(CONFIG_NF_SOCKET_IPV6)
+#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
case NFPROTO_IPV6:
sk = nf_sk_lookup_slow_v6(nft_net(pkt), skb, nft_in(pkt));
break;
@@ -39,8 +42,8 @@ static void nft_socket_eval(const struct nft_expr *expr,
return;
}
- if(!sk) {
- nft_reg_store8(dest, 0);
+ if (!sk) {
+ regs->verdict.code = NFT_BREAK;
return;
}
@@ -51,6 +54,14 @@ static void nft_socket_eval(const struct nft_expr *expr,
case NFT_SOCKET_TRANSPARENT:
nft_reg_store8(dest, inet_sk_transparent(sk));
break;
+ case NFT_SOCKET_MARK:
+ if (sk_fullsock(sk)) {
+ *dest = sk->sk_mark;
+ } else {
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
+ break;
default:
WARN_ON(1);
regs->verdict.code = NFT_BREAK;
@@ -74,7 +85,7 @@ static int nft_socket_init(const struct nft_ctx *ctx,
switch(ctx->family) {
case NFPROTO_IPV4:
-#if IS_ENABLED(CONFIG_NF_SOCKET_IPV6)
+#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
case NFPROTO_IPV6:
#endif
case NFPROTO_INET:
@@ -88,6 +99,9 @@ static int nft_socket_init(const struct nft_ctx *ctx,
case NFT_SOCKET_TRANSPARENT:
len = sizeof(u8);
break;
+ case NFT_SOCKET_MARK:
+ len = sizeof(u32);
+ break;
default:
return -EOPNOTSUPP;
}
diff --git a/net/netfilter/nft_tproxy.c b/net/netfilter/nft_tproxy.c
new file mode 100644
index 000000000000..f92a82c73880
--- /dev/null
+++ b/net/netfilter/nft_tproxy.c
@@ -0,0 +1,318 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/module.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tproxy.h>
+#include <net/inet_sock.h>
+#include <net/tcp.h>
+#include <linux/if_ether.h>
+#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
+#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+#endif
+
+struct nft_tproxy {
+ enum nft_registers sreg_addr:8;
+ enum nft_registers sreg_port:8;
+ u8 family;
+};
+
+static void nft_tproxy_eval_v4(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_tproxy *priv = nft_expr_priv(expr);
+ struct sk_buff *skb = pkt->skb;
+ const struct iphdr *iph = ip_hdr(skb);
+ struct udphdr _hdr, *hp;
+ __be32 taddr = 0;
+ __be16 tport = 0;
+ struct sock *sk;
+
+ hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr);
+ if (!hp) {
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
+
+ /* check if there's an ongoing connection on the packet addresses, this
+ * happens if the redirect already happened and the current packet
+ * belongs to an already established connection
+ */
+ sk = nf_tproxy_get_sock_v4(nft_net(pkt), skb, iph->protocol,
+ iph->saddr, iph->daddr,
+ hp->source, hp->dest,
+ skb->dev, NF_TPROXY_LOOKUP_ESTABLISHED);
+
+ if (priv->sreg_addr)
+ taddr = regs->data[priv->sreg_addr];
+ taddr = nf_tproxy_laddr4(skb, taddr, iph->daddr);
+
+ if (priv->sreg_port)
+ tport = regs->data[priv->sreg_port];
+ if (!tport)
+ tport = hp->dest;
+
+ /* UDP has no TCP_TIME_WAIT state, so we never enter here */
+ if (sk && sk->sk_state == TCP_TIME_WAIT) {
+ /* reopening a TIME_WAIT connection needs special handling */
+ sk = nf_tproxy_handle_time_wait4(nft_net(pkt), skb, taddr, tport, sk);
+ } else if (!sk) {
+ /* no, there's no established connection, check if
+ * there's a listener on the redirected addr/port
+ */
+ sk = nf_tproxy_get_sock_v4(nft_net(pkt), skb, iph->protocol,
+ iph->saddr, taddr,
+ hp->source, tport,
+ skb->dev, NF_TPROXY_LOOKUP_LISTENER);
+ }
+
+ if (sk && nf_tproxy_sk_is_transparent(sk))
+ nf_tproxy_assign_sock(skb, sk);
+ else
+ regs->verdict.code = NFT_BREAK;
+}
+
+#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
+static void nft_tproxy_eval_v6(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_tproxy *priv = nft_expr_priv(expr);
+ struct sk_buff *skb = pkt->skb;
+ const struct ipv6hdr *iph = ipv6_hdr(skb);
+ struct in6_addr taddr;
+ int thoff = pkt->xt.thoff;
+ struct udphdr _hdr, *hp;
+ __be16 tport = 0;
+ struct sock *sk;
+ int l4proto;
+
+ memset(&taddr, 0, sizeof(taddr));
+
+ if (!pkt->tprot_set) {
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
+ l4proto = pkt->tprot;
+
+ hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr);
+ if (hp == NULL) {
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
+
+ /* check if there's an ongoing connection on the packet addresses, this
+ * happens if the redirect already happened and the current packet
+ * belongs to an already established connection
+ */
+ sk = nf_tproxy_get_sock_v6(nft_net(pkt), skb, thoff, l4proto,
+ &iph->saddr, &iph->daddr,
+ hp->source, hp->dest,
+ nft_in(pkt), NF_TPROXY_LOOKUP_ESTABLISHED);
+
+ if (priv->sreg_addr)
+ memcpy(&taddr, &regs->data[priv->sreg_addr], sizeof(taddr));
+ taddr = *nf_tproxy_laddr6(skb, &taddr, &iph->daddr);
+
+ if (priv->sreg_port)
+ tport = regs->data[priv->sreg_port];
+ if (!tport)
+ tport = hp->dest;
+
+ /* UDP has no TCP_TIME_WAIT state, so we never enter here */
+ if (sk && sk->sk_state == TCP_TIME_WAIT) {
+ /* reopening a TIME_WAIT connection needs special handling */
+ sk = nf_tproxy_handle_time_wait6(skb, l4proto, thoff,
+ nft_net(pkt),
+ &taddr,
+ tport,
+ sk);
+ } else if (!sk) {
+ /* no there's no established connection, check if
+ * there's a listener on the redirected addr/port
+ */
+ sk = nf_tproxy_get_sock_v6(nft_net(pkt), skb, thoff,
+ l4proto, &iph->saddr, &taddr,
+ hp->source, tport,
+ nft_in(pkt), NF_TPROXY_LOOKUP_LISTENER);
+ }
+
+ /* NOTE: assign_sock consumes our sk reference */
+ if (sk && nf_tproxy_sk_is_transparent(sk))
+ nf_tproxy_assign_sock(skb, sk);
+ else
+ regs->verdict.code = NFT_BREAK;
+}
+#endif
+
+static void nft_tproxy_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_tproxy *priv = nft_expr_priv(expr);
+
+ switch (nft_pf(pkt)) {
+ case NFPROTO_IPV4:
+ switch (priv->family) {
+ case NFPROTO_IPV4:
+ case NFPROTO_UNSPEC:
+ nft_tproxy_eval_v4(expr, regs, pkt);
+ return;
+ }
+ break;
+#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
+ case NFPROTO_IPV6:
+ switch (priv->family) {
+ case NFPROTO_IPV6:
+ case NFPROTO_UNSPEC:
+ nft_tproxy_eval_v6(expr, regs, pkt);
+ return;
+ }
+#endif
+ }
+ regs->verdict.code = NFT_BREAK;
+}
+
+static const struct nla_policy nft_tproxy_policy[NFTA_TPROXY_MAX + 1] = {
+ [NFTA_TPROXY_FAMILY] = { .type = NLA_U32 },
+ [NFTA_TPROXY_REG_ADDR] = { .type = NLA_U32 },
+ [NFTA_TPROXY_REG_PORT] = { .type = NLA_U32 },
+};
+
+static int nft_tproxy_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_tproxy *priv = nft_expr_priv(expr);
+ unsigned int alen = 0;
+ int err;
+
+ if (!tb[NFTA_TPROXY_FAMILY] ||
+ (!tb[NFTA_TPROXY_REG_ADDR] && !tb[NFTA_TPROXY_REG_PORT]))
+ return -EINVAL;
+
+ priv->family = ntohl(nla_get_be32(tb[NFTA_TPROXY_FAMILY]));
+
+ switch (ctx->family) {
+ case NFPROTO_IPV4:
+ if (priv->family != NFPROTO_IPV4)
+ return -EINVAL;
+ break;
+#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
+ case NFPROTO_IPV6:
+ if (priv->family != NFPROTO_IPV6)
+ return -EINVAL;
+ break;
+#endif
+ case NFPROTO_INET:
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ /* Address is specified but the rule family is not set accordingly */
+ if (priv->family == NFPROTO_UNSPEC && tb[NFTA_TPROXY_REG_ADDR])
+ return -EINVAL;
+
+ switch (priv->family) {
+ case NFPROTO_IPV4:
+ alen = FIELD_SIZEOF(union nf_inet_addr, in);
+ err = nf_defrag_ipv4_enable(ctx->net);
+ if (err)
+ return err;
+ break;
+#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
+ case NFPROTO_IPV6:
+ alen = FIELD_SIZEOF(union nf_inet_addr, in6);
+ err = nf_defrag_ipv6_enable(ctx->net);
+ if (err)
+ return err;
+ break;
+#endif
+ case NFPROTO_UNSPEC:
+ /* No address is specified here */
+ err = nf_defrag_ipv4_enable(ctx->net);
+ if (err)
+ return err;
+#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
+ err = nf_defrag_ipv6_enable(ctx->net);
+ if (err)
+ return err;
+#endif
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ if (tb[NFTA_TPROXY_REG_ADDR]) {
+ priv->sreg_addr = nft_parse_register(tb[NFTA_TPROXY_REG_ADDR]);
+ err = nft_validate_register_load(priv->sreg_addr, alen);
+ if (err < 0)
+ return err;
+ }
+
+ if (tb[NFTA_TPROXY_REG_PORT]) {
+ priv->sreg_port = nft_parse_register(tb[NFTA_TPROXY_REG_PORT]);
+ err = nft_validate_register_load(priv->sreg_port, sizeof(u16));
+ if (err < 0)
+ return err;
+ }
+
+ return 0;
+}
+
+static int nft_tproxy_dump(struct sk_buff *skb,
+ const struct nft_expr *expr)
+{
+ const struct nft_tproxy *priv = nft_expr_priv(expr);
+
+ if (nla_put_be32(skb, NFTA_TPROXY_FAMILY, htonl(priv->family)))
+ return -1;
+
+ if (priv->sreg_addr &&
+ nft_dump_register(skb, NFTA_TPROXY_REG_ADDR, priv->sreg_addr))
+ return -1;
+
+ if (priv->sreg_port &&
+ nft_dump_register(skb, NFTA_TPROXY_REG_PORT, priv->sreg_port))
+ return -1;
+
+ return 0;
+}
+
+static struct nft_expr_type nft_tproxy_type;
+static const struct nft_expr_ops nft_tproxy_ops = {
+ .type = &nft_tproxy_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_tproxy)),
+ .eval = nft_tproxy_eval,
+ .init = nft_tproxy_init,
+ .dump = nft_tproxy_dump,
+};
+
+static struct nft_expr_type nft_tproxy_type __read_mostly = {
+ .name = "tproxy",
+ .ops = &nft_tproxy_ops,
+ .policy = nft_tproxy_policy,
+ .maxattr = NFTA_TPROXY_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_tproxy_module_init(void)
+{
+ return nft_register_expr(&nft_tproxy_type);
+}
+
+static void __exit nft_tproxy_module_exit(void)
+{
+ nft_unregister_expr(&nft_tproxy_type);
+}
+
+module_init(nft_tproxy_module_init);
+module_exit(nft_tproxy_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Máté Eckl");
+MODULE_DESCRIPTION("nf_tables tproxy support module");
+MODULE_ALIAS_NFT_EXPR("tproxy");
diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c
new file mode 100644
index 000000000000..3a15f219e4e7
--- /dev/null
+++ b/net/netfilter/nft_tunnel.c
@@ -0,0 +1,566 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/seqlock.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/dst_metadata.h>
+#include <net/ip_tunnels.h>
+#include <net/vxlan.h>
+#include <net/erspan.h>
+
+struct nft_tunnel {
+ enum nft_tunnel_keys key:8;
+ enum nft_registers dreg:8;
+};
+
+static void nft_tunnel_get_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_tunnel *priv = nft_expr_priv(expr);
+ u32 *dest = &regs->data[priv->dreg];
+ struct ip_tunnel_info *tun_info;
+
+ tun_info = skb_tunnel_info(pkt->skb);
+
+ switch (priv->key) {
+ case NFT_TUNNEL_PATH:
+ nft_reg_store8(dest, !!tun_info);
+ break;
+ case NFT_TUNNEL_ID:
+ if (!tun_info) {
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
+ *dest = ntohl(tunnel_id_to_key32(tun_info->key.tun_id));
+ break;
+ default:
+ WARN_ON(1);
+ regs->verdict.code = NFT_BREAK;
+ }
+}
+
+static const struct nla_policy nft_tunnel_policy[NFTA_TUNNEL_MAX + 1] = {
+ [NFTA_TUNNEL_KEY] = { .type = NLA_U32 },
+ [NFTA_TUNNEL_DREG] = { .type = NLA_U32 },
+};
+
+static int nft_tunnel_get_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_tunnel *priv = nft_expr_priv(expr);
+ u32 len;
+
+ if (!tb[NFTA_TUNNEL_KEY] &&
+ !tb[NFTA_TUNNEL_DREG])
+ return -EINVAL;
+
+ priv->key = ntohl(nla_get_be32(tb[NFTA_TUNNEL_KEY]));
+ switch (priv->key) {
+ case NFT_TUNNEL_PATH:
+ len = sizeof(u8);
+ break;
+ case NFT_TUNNEL_ID:
+ len = sizeof(u32);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ priv->dreg = nft_parse_register(tb[NFTA_TUNNEL_DREG]);
+
+ return nft_validate_register_store(ctx, priv->dreg, NULL,
+ NFT_DATA_VALUE, len);
+}
+
+static int nft_tunnel_get_dump(struct sk_buff *skb,
+ const struct nft_expr *expr)
+{
+ const struct nft_tunnel *priv = nft_expr_priv(expr);
+
+ if (nla_put_be32(skb, NFTA_TUNNEL_KEY, htonl(priv->key)))
+ goto nla_put_failure;
+ if (nft_dump_register(skb, NFTA_TUNNEL_DREG, priv->dreg))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static struct nft_expr_type nft_tunnel_type;
+static const struct nft_expr_ops nft_tunnel_get_ops = {
+ .type = &nft_tunnel_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_tunnel)),
+ .eval = nft_tunnel_get_eval,
+ .init = nft_tunnel_get_init,
+ .dump = nft_tunnel_get_dump,
+};
+
+static struct nft_expr_type nft_tunnel_type __read_mostly = {
+ .name = "tunnel",
+ .ops = &nft_tunnel_get_ops,
+ .policy = nft_tunnel_policy,
+ .maxattr = NFTA_TUNNEL_MAX,
+ .owner = THIS_MODULE,
+};
+
+struct nft_tunnel_opts {
+ union {
+ struct vxlan_metadata vxlan;
+ struct erspan_metadata erspan;
+ } u;
+ u32 len;
+ __be16 flags;
+};
+
+struct nft_tunnel_obj {
+ struct metadata_dst *md;
+ struct nft_tunnel_opts opts;
+};
+
+static const struct nla_policy nft_tunnel_ip_policy[NFTA_TUNNEL_KEY_IP_MAX + 1] = {
+ [NFTA_TUNNEL_KEY_IP_SRC] = { .type = NLA_U32 },
+ [NFTA_TUNNEL_KEY_IP_DST] = { .type = NLA_U32 },
+};
+
+static int nft_tunnel_obj_ip_init(const struct nft_ctx *ctx,
+ const struct nlattr *attr,
+ struct ip_tunnel_info *info)
+{
+ struct nlattr *tb[NFTA_TUNNEL_KEY_IP_MAX + 1];
+ int err;
+
+ err = nla_parse_nested(tb, NFTA_TUNNEL_KEY_IP_MAX, attr,
+ nft_tunnel_ip_policy, NULL);
+ if (err < 0)
+ return err;
+
+ if (!tb[NFTA_TUNNEL_KEY_IP_DST])
+ return -EINVAL;
+
+ if (tb[NFTA_TUNNEL_KEY_IP_SRC])
+ info->key.u.ipv4.src = nla_get_be32(tb[NFTA_TUNNEL_KEY_IP_SRC]);
+ if (tb[NFTA_TUNNEL_KEY_IP_DST])
+ info->key.u.ipv4.dst = nla_get_be32(tb[NFTA_TUNNEL_KEY_IP_DST]);
+
+ return 0;
+}
+
+static const struct nla_policy nft_tunnel_ip6_policy[NFTA_TUNNEL_KEY_IP6_MAX + 1] = {
+ [NFTA_TUNNEL_KEY_IP6_SRC] = { .len = sizeof(struct in6_addr), },
+ [NFTA_TUNNEL_KEY_IP6_DST] = { .len = sizeof(struct in6_addr), },
+ [NFTA_TUNNEL_KEY_IP6_FLOWLABEL] = { .type = NLA_U32, }
+};
+
+static int nft_tunnel_obj_ip6_init(const struct nft_ctx *ctx,
+ const struct nlattr *attr,
+ struct ip_tunnel_info *info)
+{
+ struct nlattr *tb[NFTA_TUNNEL_KEY_IP6_MAX + 1];
+ int err;
+
+ err = nla_parse_nested(tb, NFTA_TUNNEL_KEY_IP6_MAX, attr,
+ nft_tunnel_ip6_policy, NULL);
+ if (err < 0)
+ return err;
+
+ if (!tb[NFTA_TUNNEL_KEY_IP6_DST])
+ return -EINVAL;
+
+ if (tb[NFTA_TUNNEL_KEY_IP6_SRC]) {
+ memcpy(&info->key.u.ipv6.src,
+ nla_data(tb[NFTA_TUNNEL_KEY_IP6_SRC]),
+ sizeof(struct in6_addr));
+ }
+ if (tb[NFTA_TUNNEL_KEY_IP6_DST]) {
+ memcpy(&info->key.u.ipv6.dst,
+ nla_data(tb[NFTA_TUNNEL_KEY_IP6_DST]),
+ sizeof(struct in6_addr));
+ }
+ if (tb[NFTA_TUNNEL_KEY_IP6_FLOWLABEL])
+ info->key.label = nla_get_be32(tb[NFTA_TUNNEL_KEY_IP6_FLOWLABEL]);
+
+ info->mode |= IP_TUNNEL_INFO_IPV6;
+
+ return 0;
+}
+
+static const struct nla_policy nft_tunnel_opts_vxlan_policy[NFTA_TUNNEL_KEY_VXLAN_MAX + 1] = {
+ [NFTA_TUNNEL_KEY_VXLAN_GBP] = { .type = NLA_U32 },
+};
+
+static int nft_tunnel_obj_vxlan_init(const struct nlattr *attr,
+ struct nft_tunnel_opts *opts)
+{
+ struct nlattr *tb[NFTA_TUNNEL_KEY_VXLAN_MAX + 1];
+ int err;
+
+ err = nla_parse_nested(tb, NFTA_TUNNEL_KEY_VXLAN_MAX, attr,
+ nft_tunnel_opts_vxlan_policy, NULL);
+ if (err < 0)
+ return err;
+
+ if (!tb[NFTA_TUNNEL_KEY_VXLAN_GBP])
+ return -EINVAL;
+
+ opts->u.vxlan.gbp = ntohl(nla_get_be32(tb[NFTA_TUNNEL_KEY_VXLAN_GBP]));
+
+ opts->len = sizeof(struct vxlan_metadata);
+ opts->flags = TUNNEL_VXLAN_OPT;
+
+ return 0;
+}
+
+static const struct nla_policy nft_tunnel_opts_erspan_policy[NFTA_TUNNEL_KEY_ERSPAN_MAX + 1] = {
+ [NFTA_TUNNEL_KEY_ERSPAN_V1_INDEX] = { .type = NLA_U32 },
+ [NFTA_TUNNEL_KEY_ERSPAN_V2_DIR] = { .type = NLA_U8 },
+ [NFTA_TUNNEL_KEY_ERSPAN_V2_HWID] = { .type = NLA_U8 },
+};
+
+static int nft_tunnel_obj_erspan_init(const struct nlattr *attr,
+ struct nft_tunnel_opts *opts)
+{
+ struct nlattr *tb[NFTA_TUNNEL_KEY_ERSPAN_MAX + 1];
+ uint8_t hwid, dir;
+ int err, version;
+
+ err = nla_parse_nested(tb, NFTA_TUNNEL_KEY_ERSPAN_MAX, attr,
+ nft_tunnel_opts_erspan_policy, NULL);
+ if (err < 0)
+ return err;
+
+ version = ntohl(nla_get_be32(tb[NFTA_TUNNEL_KEY_ERSPAN_VERSION]));
+ switch (version) {
+ case ERSPAN_VERSION:
+ if (!tb[NFTA_TUNNEL_KEY_ERSPAN_V1_INDEX])
+ return -EINVAL;
+
+ opts->u.erspan.u.index =
+ nla_get_be32(tb[NFTA_TUNNEL_KEY_ERSPAN_V1_INDEX]);
+ break;
+ case ERSPAN_VERSION2:
+ if (!tb[NFTA_TUNNEL_KEY_ERSPAN_V2_DIR] ||
+ !tb[NFTA_TUNNEL_KEY_ERSPAN_V2_HWID])
+ return -EINVAL;
+
+ hwid = nla_get_u8(tb[NFTA_TUNNEL_KEY_ERSPAN_V2_HWID]);
+ dir = nla_get_u8(tb[NFTA_TUNNEL_KEY_ERSPAN_V2_DIR]);
+
+ set_hwid(&opts->u.erspan.u.md2, hwid);
+ opts->u.erspan.u.md2.dir = dir;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+ opts->u.erspan.version = version;
+
+ opts->len = sizeof(struct erspan_metadata);
+ opts->flags = TUNNEL_ERSPAN_OPT;
+
+ return 0;
+}
+
+static const struct nla_policy nft_tunnel_opts_policy[NFTA_TUNNEL_KEY_OPTS_MAX + 1] = {
+ [NFTA_TUNNEL_KEY_OPTS_VXLAN] = { .type = NLA_NESTED, },
+ [NFTA_TUNNEL_KEY_OPTS_ERSPAN] = { .type = NLA_NESTED, },
+};
+
+static int nft_tunnel_obj_opts_init(const struct nft_ctx *ctx,
+ const struct nlattr *attr,
+ struct ip_tunnel_info *info,
+ struct nft_tunnel_opts *opts)
+{
+ struct nlattr *tb[NFTA_TUNNEL_KEY_OPTS_MAX + 1];
+ int err;
+
+ err = nla_parse_nested(tb, NFTA_TUNNEL_KEY_OPTS_MAX, attr,
+ nft_tunnel_opts_policy, NULL);
+ if (err < 0)
+ return err;
+
+ if (tb[NFTA_TUNNEL_KEY_OPTS_VXLAN]) {
+ err = nft_tunnel_obj_vxlan_init(tb[NFTA_TUNNEL_KEY_OPTS_VXLAN],
+ opts);
+ } else if (tb[NFTA_TUNNEL_KEY_OPTS_ERSPAN]) {
+ err = nft_tunnel_obj_erspan_init(tb[NFTA_TUNNEL_KEY_OPTS_ERSPAN],
+ opts);
+ } else {
+ return -EOPNOTSUPP;
+ }
+
+ return err;
+}
+
+static const struct nla_policy nft_tunnel_key_policy[NFTA_TUNNEL_KEY_MAX + 1] = {
+ [NFTA_TUNNEL_KEY_IP] = { .type = NLA_NESTED, },
+ [NFTA_TUNNEL_KEY_IP6] = { .type = NLA_NESTED, },
+ [NFTA_TUNNEL_KEY_ID] = { .type = NLA_U32, },
+ [NFTA_TUNNEL_KEY_FLAGS] = { .type = NLA_U32, },
+ [NFTA_TUNNEL_KEY_TOS] = { .type = NLA_U8, },
+ [NFTA_TUNNEL_KEY_TTL] = { .type = NLA_U8, },
+ [NFTA_TUNNEL_KEY_OPTS] = { .type = NLA_NESTED, },
+};
+
+static int nft_tunnel_obj_init(const struct nft_ctx *ctx,
+ const struct nlattr * const tb[],
+ struct nft_object *obj)
+{
+ struct nft_tunnel_obj *priv = nft_obj_data(obj);
+ struct ip_tunnel_info info;
+ struct metadata_dst *md;
+ int err;
+
+ if (!tb[NFTA_TUNNEL_KEY_ID])
+ return -EINVAL;
+
+ memset(&info, 0, sizeof(info));
+ info.mode = IP_TUNNEL_INFO_TX;
+ info.key.tun_id = key32_to_tunnel_id(nla_get_be32(tb[NFTA_TUNNEL_KEY_ID]));
+ info.key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE;
+
+ if (tb[NFTA_TUNNEL_KEY_IP]) {
+ err = nft_tunnel_obj_ip_init(ctx, tb[NFTA_TUNNEL_KEY_IP], &info);
+ if (err < 0)
+ return err;
+ } else if (tb[NFTA_TUNNEL_KEY_IP6]) {
+ err = nft_tunnel_obj_ip6_init(ctx, tb[NFTA_TUNNEL_KEY_IP6], &info);
+ if (err < 0)
+ return err;
+ } else {
+ return -EINVAL;
+ }
+
+ if (tb[NFTA_TUNNEL_KEY_SPORT]) {
+ info.key.tp_src = nla_get_be16(tb[NFTA_TUNNEL_KEY_SPORT]);
+ }
+ if (tb[NFTA_TUNNEL_KEY_DPORT]) {
+ info.key.tp_dst = nla_get_be16(tb[NFTA_TUNNEL_KEY_DPORT]);
+ }
+
+ if (tb[NFTA_TUNNEL_KEY_FLAGS]) {
+ u32 tun_flags;
+
+ tun_flags = ntohl(nla_get_be32(tb[NFTA_TUNNEL_KEY_FLAGS]));
+ if (tun_flags & ~NFT_TUNNEL_F_MASK)
+ return -EOPNOTSUPP;
+
+ if (tun_flags & NFT_TUNNEL_F_ZERO_CSUM_TX)
+ info.key.tun_flags &= ~TUNNEL_CSUM;
+ if (tun_flags & NFT_TUNNEL_F_DONT_FRAGMENT)
+ info.key.tun_flags |= TUNNEL_DONT_FRAGMENT;
+ if (tun_flags & NFT_TUNNEL_F_SEQ_NUMBER)
+ info.key.tun_flags |= TUNNEL_SEQ;
+ }
+ if (tb[NFTA_TUNNEL_KEY_TOS])
+ info.key.tos = nla_get_u8(tb[NFTA_TUNNEL_KEY_TOS]);
+ if (tb[NFTA_TUNNEL_KEY_TTL])
+ info.key.ttl = nla_get_u8(tb[NFTA_TUNNEL_KEY_TTL]);
+ else
+ info.key.ttl = U8_MAX;
+
+ if (tb[NFTA_TUNNEL_KEY_OPTS]) {
+ err = nft_tunnel_obj_opts_init(ctx, tb[NFTA_TUNNEL_KEY_OPTS],
+ &info, &priv->opts);
+ if (err < 0)
+ return err;
+ }
+
+ md = metadata_dst_alloc(priv->opts.len, METADATA_IP_TUNNEL, GFP_KERNEL);
+ if (!md)
+ return -ENOMEM;
+
+ memcpy(&md->u.tun_info, &info, sizeof(info));
+ ip_tunnel_info_opts_set(&md->u.tun_info, &priv->opts.u, priv->opts.len,
+ priv->opts.flags);
+ priv->md = md;
+
+ return 0;
+}
+
+static inline void nft_tunnel_obj_eval(struct nft_object *obj,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_tunnel_obj *priv = nft_obj_data(obj);
+ struct sk_buff *skb = pkt->skb;
+
+ skb_dst_drop(skb);
+ dst_hold((struct dst_entry *) priv->md);
+ skb_dst_set(skb, (struct dst_entry *) priv->md);
+}
+
+static int nft_tunnel_ip_dump(struct sk_buff *skb, struct ip_tunnel_info *info)
+{
+ struct nlattr *nest;
+
+ if (info->mode & IP_TUNNEL_INFO_IPV6) {
+ nest = nla_nest_start(skb, NFTA_TUNNEL_KEY_IP6);
+ if (!nest)
+ return -1;
+
+ if (nla_put_in6_addr(skb, NFTA_TUNNEL_KEY_IP6_SRC, &info->key.u.ipv6.src) < 0 ||
+ nla_put_in6_addr(skb, NFTA_TUNNEL_KEY_IP6_DST, &info->key.u.ipv6.dst) < 0 ||
+ nla_put_be32(skb, NFTA_TUNNEL_KEY_IP6_FLOWLABEL, info->key.label))
+ return -1;
+
+ nla_nest_end(skb, nest);
+ } else {
+ nest = nla_nest_start(skb, NFTA_TUNNEL_KEY_IP);
+ if (!nest)
+ return -1;
+
+ if (nla_put_in_addr(skb, NFTA_TUNNEL_KEY_IP_SRC, info->key.u.ipv4.src) < 0 ||
+ nla_put_in_addr(skb, NFTA_TUNNEL_KEY_IP_DST, info->key.u.ipv4.dst) < 0)
+ return -1;
+
+ nla_nest_end(skb, nest);
+ }
+
+ return 0;
+}
+
+static int nft_tunnel_opts_dump(struct sk_buff *skb,
+ struct nft_tunnel_obj *priv)
+{
+ struct nft_tunnel_opts *opts = &priv->opts;
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, NFTA_TUNNEL_KEY_OPTS);
+ if (!nest)
+ return -1;
+
+ if (opts->flags & TUNNEL_VXLAN_OPT) {
+ if (nla_put_be32(skb, NFTA_TUNNEL_KEY_VXLAN_GBP,
+ htonl(opts->u.vxlan.gbp)))
+ return -1;
+ } else if (opts->flags & TUNNEL_ERSPAN_OPT) {
+ switch (opts->u.erspan.version) {
+ case ERSPAN_VERSION:
+ if (nla_put_be32(skb, NFTA_TUNNEL_KEY_ERSPAN_V1_INDEX,
+ opts->u.erspan.u.index))
+ return -1;
+ break;
+ case ERSPAN_VERSION2:
+ if (nla_put_u8(skb, NFTA_TUNNEL_KEY_ERSPAN_V2_HWID,
+ get_hwid(&opts->u.erspan.u.md2)) ||
+ nla_put_u8(skb, NFTA_TUNNEL_KEY_ERSPAN_V2_DIR,
+ opts->u.erspan.u.md2.dir))
+ return -1;
+ break;
+ }
+ }
+ nla_nest_end(skb, nest);
+
+ return 0;
+}
+
+static int nft_tunnel_ports_dump(struct sk_buff *skb,
+ struct ip_tunnel_info *info)
+{
+ if (nla_put_be16(skb, NFTA_TUNNEL_KEY_SPORT, htons(info->key.tp_src)) < 0 ||
+ nla_put_be16(skb, NFTA_TUNNEL_KEY_DPORT, htons(info->key.tp_dst)) < 0)
+ return -1;
+
+ return 0;
+}
+
+static int nft_tunnel_flags_dump(struct sk_buff *skb,
+ struct ip_tunnel_info *info)
+{
+ u32 flags = 0;
+
+ if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT)
+ flags |= NFT_TUNNEL_F_DONT_FRAGMENT;
+ if (!(info->key.tun_flags & TUNNEL_CSUM))
+ flags |= NFT_TUNNEL_F_ZERO_CSUM_TX;
+ if (info->key.tun_flags & TUNNEL_SEQ)
+ flags |= NFT_TUNNEL_F_SEQ_NUMBER;
+
+ if (nla_put_be32(skb, NFTA_TUNNEL_KEY_FLAGS, htonl(flags)) < 0)
+ return -1;
+
+ return 0;
+}
+
+static int nft_tunnel_obj_dump(struct sk_buff *skb,
+ struct nft_object *obj, bool reset)
+{
+ struct nft_tunnel_obj *priv = nft_obj_data(obj);
+ struct ip_tunnel_info *info = &priv->md->u.tun_info;
+
+ if (nla_put_be32(skb, NFTA_TUNNEL_KEY_ID,
+ tunnel_id_to_key32(info->key.tun_id)) ||
+ nft_tunnel_ip_dump(skb, info) < 0 ||
+ nft_tunnel_ports_dump(skb, info) < 0 ||
+ nft_tunnel_flags_dump(skb, info) < 0 ||
+ nla_put_u8(skb, NFTA_TUNNEL_KEY_TOS, info->key.tos) ||
+ nla_put_u8(skb, NFTA_TUNNEL_KEY_TTL, info->key.ttl) ||
+ nft_tunnel_opts_dump(skb, priv) < 0)
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static void nft_tunnel_obj_destroy(const struct nft_ctx *ctx,
+ struct nft_object *obj)
+{
+ struct nft_tunnel_obj *priv = nft_obj_data(obj);
+
+ metadata_dst_free(priv->md);
+}
+
+static struct nft_object_type nft_tunnel_obj_type;
+static const struct nft_object_ops nft_tunnel_obj_ops = {
+ .type = &nft_tunnel_obj_type,
+ .size = sizeof(struct nft_tunnel_obj),
+ .eval = nft_tunnel_obj_eval,
+ .init = nft_tunnel_obj_init,
+ .destroy = nft_tunnel_obj_destroy,
+ .dump = nft_tunnel_obj_dump,
+};
+
+static struct nft_object_type nft_tunnel_obj_type __read_mostly = {
+ .type = NFT_OBJECT_TUNNEL,
+ .ops = &nft_tunnel_obj_ops,
+ .maxattr = NFTA_TUNNEL_KEY_MAX,
+ .policy = nft_tunnel_key_policy,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_tunnel_module_init(void)
+{
+ int err;
+
+ err = nft_register_expr(&nft_tunnel_type);
+ if (err < 0)
+ return err;
+
+ err = nft_register_obj(&nft_tunnel_obj_type);
+ if (err < 0)
+ nft_unregister_expr(&nft_tunnel_type);
+
+ return err;
+}
+
+static void __exit nft_tunnel_module_exit(void)
+{
+ nft_unregister_obj(&nft_tunnel_obj_type);
+ nft_unregister_expr(&nft_tunnel_type);
+}
+
+module_init(nft_tunnel_module_init);
+module_exit(nft_tunnel_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_ALIAS_NFT_EXPR("tunnel");
+MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_TUNNEL);
diff --git a/net/netfilter/nft_xfrm.c b/net/netfilter/nft_xfrm.c
new file mode 100644
index 000000000000..5322609f7662
--- /dev/null
+++ b/net/netfilter/nft_xfrm.c
@@ -0,0 +1,294 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Generic part shared by ipv4 and ipv6 backends.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+#include <linux/in.h>
+#include <net/xfrm.h>
+
+static const struct nla_policy nft_xfrm_policy[NFTA_XFRM_MAX + 1] = {
+ [NFTA_XFRM_KEY] = { .type = NLA_U32 },
+ [NFTA_XFRM_DIR] = { .type = NLA_U8 },
+ [NFTA_XFRM_SPNUM] = { .type = NLA_U32 },
+ [NFTA_XFRM_DREG] = { .type = NLA_U32 },
+};
+
+struct nft_xfrm {
+ enum nft_xfrm_keys key:8;
+ enum nft_registers dreg:8;
+ u8 dir;
+ u8 spnum;
+};
+
+static int nft_xfrm_get_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_xfrm *priv = nft_expr_priv(expr);
+ unsigned int len = 0;
+ u32 spnum = 0;
+ u8 dir;
+
+ if (!tb[NFTA_XFRM_KEY] || !tb[NFTA_XFRM_DIR] || !tb[NFTA_XFRM_DREG])
+ return -EINVAL;
+
+ switch (ctx->family) {
+ case NFPROTO_IPV4:
+ case NFPROTO_IPV6:
+ case NFPROTO_INET:
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ priv->key = ntohl(nla_get_u32(tb[NFTA_XFRM_KEY]));
+ switch (priv->key) {
+ case NFT_XFRM_KEY_REQID:
+ case NFT_XFRM_KEY_SPI:
+ len = sizeof(u32);
+ break;
+ case NFT_XFRM_KEY_DADDR_IP4:
+ case NFT_XFRM_KEY_SADDR_IP4:
+ len = sizeof(struct in_addr);
+ break;
+ case NFT_XFRM_KEY_DADDR_IP6:
+ case NFT_XFRM_KEY_SADDR_IP6:
+ len = sizeof(struct in6_addr);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ dir = nla_get_u8(tb[NFTA_XFRM_DIR]);
+ switch (dir) {
+ case XFRM_POLICY_IN:
+ case XFRM_POLICY_OUT:
+ priv->dir = dir;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (tb[NFTA_XFRM_SPNUM])
+ spnum = ntohl(nla_get_be32(tb[NFTA_XFRM_SPNUM]));
+
+ if (spnum >= XFRM_MAX_DEPTH)
+ return -ERANGE;
+
+ priv->spnum = spnum;
+
+ priv->dreg = nft_parse_register(tb[NFTA_XFRM_DREG]);
+ return nft_validate_register_store(ctx, priv->dreg, NULL,
+ NFT_DATA_VALUE, len);
+}
+
+/* Return true if key asks for daddr/saddr and current
+ * state does have a valid address (BEET, TUNNEL).
+ */
+static bool xfrm_state_addr_ok(enum nft_xfrm_keys k, u8 family, u8 mode)
+{
+ switch (k) {
+ case NFT_XFRM_KEY_DADDR_IP4:
+ case NFT_XFRM_KEY_SADDR_IP4:
+ if (family == NFPROTO_IPV4)
+ break;
+ return false;
+ case NFT_XFRM_KEY_DADDR_IP6:
+ case NFT_XFRM_KEY_SADDR_IP6:
+ if (family == NFPROTO_IPV6)
+ break;
+ return false;
+ default:
+ return true;
+ }
+
+ return mode == XFRM_MODE_BEET || mode == XFRM_MODE_TUNNEL;
+}
+
+static void nft_xfrm_state_get_key(const struct nft_xfrm *priv,
+ struct nft_regs *regs,
+ const struct xfrm_state *state)
+{
+ u32 *dest = &regs->data[priv->dreg];
+
+ if (!xfrm_state_addr_ok(priv->key,
+ state->props.family,
+ state->props.mode)) {
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
+
+ switch (priv->key) {
+ case NFT_XFRM_KEY_UNSPEC:
+ case __NFT_XFRM_KEY_MAX:
+ WARN_ON_ONCE(1);
+ break;
+ case NFT_XFRM_KEY_DADDR_IP4:
+ *dest = state->id.daddr.a4;
+ return;
+ case NFT_XFRM_KEY_DADDR_IP6:
+ memcpy(dest, &state->id.daddr.in6, sizeof(struct in6_addr));
+ return;
+ case NFT_XFRM_KEY_SADDR_IP4:
+ *dest = state->props.saddr.a4;
+ return;
+ case NFT_XFRM_KEY_SADDR_IP6:
+ memcpy(dest, &state->props.saddr.in6, sizeof(struct in6_addr));
+ return;
+ case NFT_XFRM_KEY_REQID:
+ *dest = state->props.reqid;
+ return;
+ case NFT_XFRM_KEY_SPI:
+ *dest = state->id.spi;
+ return;
+ }
+
+ regs->verdict.code = NFT_BREAK;
+}
+
+static void nft_xfrm_get_eval_in(const struct nft_xfrm *priv,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct sec_path *sp = pkt->skb->sp;
+ const struct xfrm_state *state;
+
+ if (sp == NULL || sp->len <= priv->spnum) {
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
+
+ state = sp->xvec[priv->spnum];
+ nft_xfrm_state_get_key(priv, regs, state);
+}
+
+static void nft_xfrm_get_eval_out(const struct nft_xfrm *priv,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct dst_entry *dst = skb_dst(pkt->skb);
+ int i;
+
+ for (i = 0; dst && dst->xfrm;
+ dst = ((const struct xfrm_dst *)dst)->child, i++) {
+ if (i < priv->spnum)
+ continue;
+
+ nft_xfrm_state_get_key(priv, regs, dst->xfrm);
+ return;
+ }
+
+ regs->verdict.code = NFT_BREAK;
+}
+
+static void nft_xfrm_get_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_xfrm *priv = nft_expr_priv(expr);
+
+ switch (priv->dir) {
+ case XFRM_POLICY_IN:
+ nft_xfrm_get_eval_in(priv, regs, pkt);
+ break;
+ case XFRM_POLICY_OUT:
+ nft_xfrm_get_eval_out(priv, regs, pkt);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ regs->verdict.code = NFT_BREAK;
+ break;
+ }
+}
+
+static int nft_xfrm_get_dump(struct sk_buff *skb,
+ const struct nft_expr *expr)
+{
+ const struct nft_xfrm *priv = nft_expr_priv(expr);
+
+ if (nft_dump_register(skb, NFTA_XFRM_DREG, priv->dreg))
+ return -1;
+
+ if (nla_put_be32(skb, NFTA_XFRM_KEY, htonl(priv->key)))
+ return -1;
+ if (nla_put_u8(skb, NFTA_XFRM_DIR, priv->dir))
+ return -1;
+ if (nla_put_be32(skb, NFTA_XFRM_SPNUM, htonl(priv->spnum)))
+ return -1;
+
+ return 0;
+}
+
+static int nft_xfrm_validate(const struct nft_ctx *ctx, const struct nft_expr *expr,
+ const struct nft_data **data)
+{
+ const struct nft_xfrm *priv = nft_expr_priv(expr);
+ unsigned int hooks;
+
+ switch (priv->dir) {
+ case XFRM_POLICY_IN:
+ hooks = (1 << NF_INET_FORWARD) |
+ (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_PRE_ROUTING);
+ break;
+ case XFRM_POLICY_OUT:
+ hooks = (1 << NF_INET_FORWARD) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_POST_ROUTING);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return -EINVAL;
+ }
+
+ return nft_chain_validate_hooks(ctx->chain, hooks);
+}
+
+
+static struct nft_expr_type nft_xfrm_type;
+static const struct nft_expr_ops nft_xfrm_get_ops = {
+ .type = &nft_xfrm_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_xfrm)),
+ .eval = nft_xfrm_get_eval,
+ .init = nft_xfrm_get_init,
+ .dump = nft_xfrm_get_dump,
+ .validate = nft_xfrm_validate,
+};
+
+static struct nft_expr_type nft_xfrm_type __read_mostly = {
+ .name = "xfrm",
+ .ops = &nft_xfrm_get_ops,
+ .policy = nft_xfrm_policy,
+ .maxattr = NFTA_XFRM_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_xfrm_module_init(void)
+{
+ return nft_register_expr(&nft_xfrm_type);
+}
+
+static void __exit nft_xfrm_module_exit(void)
+{
+ nft_unregister_expr(&nft_xfrm_type);
+}
+
+module_init(nft_xfrm_module_init);
+module_exit(nft_xfrm_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("nf_tables: xfrm/IPSec matching");
+MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
+MODULE_AUTHOR("Máté Eckl <ecklm94@gmail.com>");
+MODULE_ALIAS_NFT_EXPR("xfrm");
diff --git a/net/netfilter/utils.c b/net/netfilter/utils.c
index 0b660c568156..e8da9a9bba73 100644
--- a/net/netfilter/utils.c
+++ b/net/netfilter/utils.c
@@ -1,14 +1,128 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_ipv6.h>
#include <net/netfilter/nf_queue.h>
+#include <net/ip6_checksum.h>
+
+#ifdef CONFIG_INET
+__sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook,
+ unsigned int dataoff, u8 protocol)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ __sum16 csum = 0;
+
+ switch (skb->ip_summed) {
+ case CHECKSUM_COMPLETE:
+ if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN)
+ break;
+ if ((protocol == 0 && !csum_fold(skb->csum)) ||
+ !csum_tcpudp_magic(iph->saddr, iph->daddr,
+ skb->len - dataoff, protocol,
+ skb->csum)) {
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ break;
+ }
+ /* fall through */
+ case CHECKSUM_NONE:
+ if (protocol == 0)
+ skb->csum = 0;
+ else
+ skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
+ skb->len - dataoff,
+ protocol, 0);
+ csum = __skb_checksum_complete(skb);
+ }
+ return csum;
+}
+EXPORT_SYMBOL(nf_ip_checksum);
+#endif
+
+static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook,
+ unsigned int dataoff, unsigned int len,
+ u8 protocol)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ __sum16 csum = 0;
+
+ switch (skb->ip_summed) {
+ case CHECKSUM_COMPLETE:
+ if (len == skb->len - dataoff)
+ return nf_ip_checksum(skb, hook, dataoff, protocol);
+ /* fall through */
+ case CHECKSUM_NONE:
+ skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, protocol,
+ skb->len - dataoff, 0);
+ skb->ip_summed = CHECKSUM_NONE;
+ return __skb_checksum_complete_head(skb, dataoff + len);
+ }
+ return csum;
+}
+
+__sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook,
+ unsigned int dataoff, u8 protocol)
+{
+ const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+ __sum16 csum = 0;
+
+ switch (skb->ip_summed) {
+ case CHECKSUM_COMPLETE:
+ if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN)
+ break;
+ if (!csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
+ skb->len - dataoff, protocol,
+ csum_sub(skb->csum,
+ skb_checksum(skb, 0,
+ dataoff, 0)))) {
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ break;
+ }
+ /* fall through */
+ case CHECKSUM_NONE:
+ skb->csum = ~csum_unfold(
+ csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
+ skb->len - dataoff,
+ protocol,
+ csum_sub(0,
+ skb_checksum(skb, 0,
+ dataoff, 0))));
+ csum = __skb_checksum_complete(skb);
+ }
+ return csum;
+}
+EXPORT_SYMBOL(nf_ip6_checksum);
+
+static __sum16 nf_ip6_checksum_partial(struct sk_buff *skb, unsigned int hook,
+ unsigned int dataoff, unsigned int len,
+ u8 protocol)
+{
+ const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+ __wsum hsum;
+ __sum16 csum = 0;
+
+ switch (skb->ip_summed) {
+ case CHECKSUM_COMPLETE:
+ if (len == skb->len - dataoff)
+ return nf_ip6_checksum(skb, hook, dataoff, protocol);
+ /* fall through */
+ case CHECKSUM_NONE:
+ hsum = skb_checksum(skb, 0, dataoff, 0);
+ skb->csum = ~csum_unfold(csum_ipv6_magic(&ip6h->saddr,
+ &ip6h->daddr,
+ skb->len - dataoff,
+ protocol,
+ csum_sub(0, hsum)));
+ skb->ip_summed = CHECKSUM_NONE;
+ return __skb_checksum_complete_head(skb, dataoff + len);
+ }
+ return csum;
+};
__sum16 nf_checksum(struct sk_buff *skb, unsigned int hook,
- unsigned int dataoff, u_int8_t protocol,
+ unsigned int dataoff, u8 protocol,
unsigned short family)
{
- const struct nf_ipv6_ops *v6ops;
__sum16 csum = 0;
switch (family) {
@@ -16,9 +130,7 @@ __sum16 nf_checksum(struct sk_buff *skb, unsigned int hook,
csum = nf_ip_checksum(skb, hook, dataoff, protocol);
break;
case AF_INET6:
- v6ops = rcu_dereference(nf_ipv6_ops);
- if (v6ops)
- csum = v6ops->checksum(skb, hook, dataoff, protocol);
+ csum = nf_ip6_checksum(skb, hook, dataoff, protocol);
break;
}
@@ -28,9 +140,8 @@ EXPORT_SYMBOL_GPL(nf_checksum);
__sum16 nf_checksum_partial(struct sk_buff *skb, unsigned int hook,
unsigned int dataoff, unsigned int len,
- u_int8_t protocol, unsigned short family)
+ u8 protocol, unsigned short family)
{
- const struct nf_ipv6_ops *v6ops;
__sum16 csum = 0;
switch (family) {
@@ -39,10 +150,8 @@ __sum16 nf_checksum_partial(struct sk_buff *skb, unsigned int hook,
protocol);
break;
case AF_INET6:
- v6ops = rcu_dereference(nf_ipv6_ops);
- if (v6ops)
- csum = v6ops->checksum_partial(skb, hook, dataoff, len,
- protocol);
+ csum = nf_ip6_checksum_partial(skb, hook, dataoff, len,
+ protocol);
break;
}
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index d0d8397c9588..aecadd471e1d 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -1178,12 +1178,7 @@ struct xt_table_info *xt_alloc_table_info(unsigned int size)
if (sz < sizeof(*info) || sz >= XT_MAX_TABLE_SIZE)
return NULL;
- /* __GFP_NORETRY is not fully supported by kvmalloc but it should
- * work reasonably well if sz is too large and bail out rather
- * than shoot all processes down before realizing there is nothing
- * more to reclaim.
- */
- info = kvmalloc(sz, GFP_KERNEL | __GFP_NORETRY);
+ info = kvmalloc(sz, GFP_KERNEL_ACCOUNT);
if (!info)
return NULL;
diff --git a/net/netfilter/xt_AUDIT.c b/net/netfilter/xt_AUDIT.c
index f368ee6741db..af883f1b64f9 100644
--- a/net/netfilter/xt_AUDIT.c
+++ b/net/netfilter/xt_AUDIT.c
@@ -72,7 +72,7 @@ audit_tg(struct sk_buff *skb, const struct xt_action_param *par)
struct audit_buffer *ab;
int fam = -1;
- if (audit_enabled == 0)
+ if (audit_enabled == AUDIT_OFF)
goto errout;
ab = audit_log_start(NULL, GFP_ATOMIC, AUDIT_NETFILTER_PKT);
if (ab == NULL)
diff --git a/net/netfilter/xt_CHECKSUM.c b/net/netfilter/xt_CHECKSUM.c
index 9f4151ec3e06..6c7aa6a0a0d2 100644
--- a/net/netfilter/xt_CHECKSUM.c
+++ b/net/netfilter/xt_CHECKSUM.c
@@ -16,6 +16,9 @@
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter/xt_CHECKSUM.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Michael S. Tsirkin <mst@redhat.com>");
MODULE_DESCRIPTION("Xtables: checksum modification");
@@ -25,7 +28,7 @@ MODULE_ALIAS("ip6t_CHECKSUM");
static unsigned int
checksum_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
- if (skb->ip_summed == CHECKSUM_PARTIAL)
+ if (skb->ip_summed == CHECKSUM_PARTIAL && !skb_is_gso(skb))
skb_checksum_help(skb);
return XT_CONTINUE;
@@ -34,6 +37,8 @@ checksum_tg(struct sk_buff *skb, const struct xt_action_param *par)
static int checksum_tg_check(const struct xt_tgchk_param *par)
{
const struct xt_CHECKSUM_info *einfo = par->targinfo;
+ const struct ip6t_ip6 *i6 = par->entryinfo;
+ const struct ipt_ip *i4 = par->entryinfo;
if (einfo->operation & ~XT_CHECKSUM_OP_FILL) {
pr_info_ratelimited("unsupported CHECKSUM operation %x\n",
@@ -43,6 +48,21 @@ static int checksum_tg_check(const struct xt_tgchk_param *par)
if (!einfo->operation)
return -EINVAL;
+ switch (par->family) {
+ case NFPROTO_IPV4:
+ if (i4->proto == IPPROTO_UDP &&
+ (i4->invflags & XT_INV_PROTO) == 0)
+ return 0;
+ break;
+ case NFPROTO_IPV6:
+ if ((i6->flags & IP6T_F_PROTO) &&
+ i6->proto == IPPROTO_UDP &&
+ (i6->invflags & XT_INV_PROTO) == 0)
+ return 0;
+ break;
+ }
+
+ pr_warn_once("CHECKSUM should be avoided. If really needed, restrict with \"-p udp\" and only use in OUTPUT\n");
return 0;
}
diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
index 03b9a50ec93b..2c7a4b80206f 100644
--- a/net/netfilter/xt_CT.c
+++ b/net/netfilter/xt_CT.c
@@ -93,7 +93,7 @@ xt_ct_set_helper(struct nf_conn *ct, const char *helper_name,
return -ENOENT;
}
- help = nf_ct_helper_ext_add(ct, helper, GFP_KERNEL);
+ help = nf_ct_helper_ext_add(ct, GFP_KERNEL);
if (help == NULL) {
nf_conntrack_helper_put(helper);
return -ENOMEM;
@@ -104,7 +104,7 @@ xt_ct_set_helper(struct nf_conn *ct, const char *helper_name,
}
#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
-static void __xt_ct_tg_timeout_put(struct ctnl_timeout *timeout)
+static void __xt_ct_tg_timeout_put(struct nf_ct_timeout *timeout)
{
typeof(nf_ct_timeout_put_hook) timeout_put;
@@ -121,7 +121,7 @@ xt_ct_set_timeout(struct nf_conn *ct, const struct xt_tgchk_param *par,
#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
typeof(nf_ct_timeout_find_get_hook) timeout_find_get;
const struct nf_conntrack_l4proto *l4proto;
- struct ctnl_timeout *timeout;
+ struct nf_ct_timeout *timeout;
struct nf_conn_timeout *timeout_ext;
const char *errmsg = NULL;
int ret = 0;
@@ -159,7 +159,7 @@ xt_ct_set_timeout(struct nf_conn *ct, const struct xt_tgchk_param *par,
/* Make sure the timeout policy matches any existing protocol tracker,
* otherwise default to generic.
*/
- l4proto = __nf_ct_l4proto_find(par->family, proto);
+ l4proto = __nf_ct_l4proto_find(proto);
if (timeout->l4proto->l4proto != l4proto->l4proto) {
ret = -EINVAL;
pr_info_ratelimited("Timeout policy `%s' can only be used by L%d protocol number %d\n",
diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c
index 5ee859193783..c6acfc2d9c84 100644
--- a/net/netfilter/xt_IDLETIMER.c
+++ b/net/netfilter/xt_IDLETIMER.c
@@ -68,8 +68,6 @@ struct idletimer_tg *__idletimer_tg_find_by_label(const char *label)
{
struct idletimer_tg *entry;
- BUG_ON(!label);
-
list_for_each_entry(entry, &idletimer_tg_list, entry) {
if (!strcmp(label, entry->attr.attr.name))
return entry;
@@ -172,8 +170,6 @@ static unsigned int idletimer_tg_target(struct sk_buff *skb,
pr_debug("resetting timer %s, timeout period %u\n",
info->label, info->timeout);
- BUG_ON(!info->timer);
-
mod_timer(&info->timer->timer,
msecs_to_jiffies(info->timeout * 1000) + jiffies);
diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c
index 4ad5fe27e08b..f16202d26c20 100644
--- a/net/netfilter/xt_SECMARK.c
+++ b/net/netfilter/xt_SECMARK.c
@@ -35,8 +35,6 @@ secmark_tg(struct sk_buff *skb, const struct xt_action_param *par)
u32 secmark = 0;
const struct xt_secmark_target_info *info = par->targinfo;
- BUG_ON(info->mode != mode);
-
switch (mode) {
case SECMARK_MODE_SEL:
secmark = info->secid;
diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c
index 475957cfcf50..1dae02a97ee3 100644
--- a/net/netfilter/xt_TEE.c
+++ b/net/netfilter/xt_TEE.c
@@ -14,6 +14,8 @@
#include <linux/skbuff.h>
#include <linux/route.h>
#include <linux/netfilter/x_tables.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
#include <net/route.h>
#include <net/netfilter/ipv4/nf_dup_ipv4.h>
#include <net/netfilter/ipv6/nf_dup_ipv6.h>
@@ -25,8 +27,15 @@ struct xt_tee_priv {
int oif;
};
+static unsigned int tee_net_id __read_mostly;
static const union nf_inet_addr tee_zero_address;
+struct tee_net {
+ struct list_head priv_list;
+ /* lock protects the priv_list */
+ struct mutex lock;
+};
+
static unsigned int
tee_tg4(struct sk_buff *skb, const struct xt_action_param *par)
{
@@ -38,7 +47,7 @@ tee_tg4(struct sk_buff *skb, const struct xt_action_param *par)
return XT_CONTINUE;
}
-#if IS_ENABLED(CONFIG_IPV6)
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
static unsigned int
tee_tg6(struct sk_buff *skb, const struct xt_action_param *par)
{
@@ -51,17 +60,16 @@ tee_tg6(struct sk_buff *skb, const struct xt_action_param *par)
}
#endif
-static DEFINE_MUTEX(priv_list_mutex);
-static LIST_HEAD(priv_list);
-
static int tee_netdev_event(struct notifier_block *this, unsigned long event,
void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct net *net = dev_net(dev);
+ struct tee_net *tn = net_generic(net, tee_net_id);
struct xt_tee_priv *priv;
- mutex_lock(&priv_list_mutex);
- list_for_each_entry(priv, &priv_list, list) {
+ mutex_lock(&tn->lock);
+ list_for_each_entry(priv, &tn->priv_list, list) {
switch (event) {
case NETDEV_REGISTER:
if (!strcmp(dev->name, priv->tginfo->oif))
@@ -79,13 +87,14 @@ static int tee_netdev_event(struct notifier_block *this, unsigned long event,
break;
}
}
- mutex_unlock(&priv_list_mutex);
+ mutex_unlock(&tn->lock);
return NOTIFY_DONE;
}
static int tee_tg_check(const struct xt_tgchk_param *par)
{
+ struct tee_net *tn = net_generic(par->net, tee_net_id);
struct xt_tee_tginfo *info = par->targinfo;
struct xt_tee_priv *priv;
@@ -95,6 +104,8 @@ static int tee_tg_check(const struct xt_tgchk_param *par)
return -EINVAL;
if (info->oif[0]) {
+ struct net_device *dev;
+
if (info->oif[sizeof(info->oif)-1] != '\0')
return -EINVAL;
@@ -106,9 +117,14 @@ static int tee_tg_check(const struct xt_tgchk_param *par)
priv->oif = -1;
info->priv = priv;
- mutex_lock(&priv_list_mutex);
- list_add(&priv->list, &priv_list);
- mutex_unlock(&priv_list_mutex);
+ dev = dev_get_by_name(par->net, info->oif);
+ if (dev) {
+ priv->oif = dev->ifindex;
+ dev_put(dev);
+ }
+ mutex_lock(&tn->lock);
+ list_add(&priv->list, &tn->priv_list);
+ mutex_unlock(&tn->lock);
} else
info->priv = NULL;
@@ -118,12 +134,13 @@ static int tee_tg_check(const struct xt_tgchk_param *par)
static void tee_tg_destroy(const struct xt_tgdtor_param *par)
{
+ struct tee_net *tn = net_generic(par->net, tee_net_id);
struct xt_tee_tginfo *info = par->targinfo;
if (info->priv) {
- mutex_lock(&priv_list_mutex);
+ mutex_lock(&tn->lock);
list_del(&info->priv->list);
- mutex_unlock(&priv_list_mutex);
+ mutex_unlock(&tn->lock);
kfree(info->priv);
}
static_key_slow_dec(&xt_tee_enabled);
@@ -141,7 +158,7 @@ static struct xt_target tee_tg_reg[] __read_mostly = {
.destroy = tee_tg_destroy,
.me = THIS_MODULE,
},
-#if IS_ENABLED(CONFIG_IPV6)
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
{
.name = "TEE",
.revision = 1,
@@ -156,6 +173,21 @@ static struct xt_target tee_tg_reg[] __read_mostly = {
#endif
};
+static int __net_init tee_net_init(struct net *net)
+{
+ struct tee_net *tn = net_generic(net, tee_net_id);
+
+ INIT_LIST_HEAD(&tn->priv_list);
+ mutex_init(&tn->lock);
+ return 0;
+}
+
+static struct pernet_operations tee_net_ops = {
+ .init = tee_net_init,
+ .id = &tee_net_id,
+ .size = sizeof(struct tee_net),
+};
+
static struct notifier_block tee_netdev_notifier = {
.notifier_call = tee_netdev_event,
};
@@ -164,22 +196,32 @@ static int __init tee_tg_init(void)
{
int ret;
- ret = xt_register_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg));
- if (ret)
+ ret = register_pernet_subsys(&tee_net_ops);
+ if (ret < 0)
return ret;
+
+ ret = xt_register_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg));
+ if (ret < 0)
+ goto cleanup_subsys;
+
ret = register_netdevice_notifier(&tee_netdev_notifier);
- if (ret) {
- xt_unregister_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg));
- return ret;
- }
+ if (ret < 0)
+ goto unregister_targets;
return 0;
+
+unregister_targets:
+ xt_unregister_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg));
+cleanup_subsys:
+ unregister_pernet_subsys(&tee_net_ops);
+ return ret;
}
static void __exit tee_tg_exit(void)
{
unregister_netdevice_notifier(&tee_netdev_notifier);
xt_unregister_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg));
+ unregister_pernet_subsys(&tee_net_ops);
}
module_init(tee_tg_init);
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index d76550a8b642..ad7420cdc439 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -36,15 +36,6 @@
#include <net/netfilter/nf_tproxy.h>
#include <linux/netfilter/xt_TPROXY.h>
-/* assign a socket to the skb -- consumes sk */
-static void
-nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk)
-{
- skb_orphan(skb);
- skb->sk = sk;
- skb->destructor = sock_edemux;
-}
-
static unsigned int
tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport,
u_int32_t mark_mask, u_int32_t mark_value)
diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c
index 7df2dece57d3..5cb1ecb29ea4 100644
--- a/net/netfilter/xt_cgroup.c
+++ b/net/netfilter/xt_cgroup.c
@@ -68,12 +68,45 @@ static int cgroup_mt_check_v1(const struct xt_mtchk_param *par)
return 0;
}
+static int cgroup_mt_check_v2(const struct xt_mtchk_param *par)
+{
+ struct xt_cgroup_info_v2 *info = par->matchinfo;
+ struct cgroup *cgrp;
+
+ if ((info->invert_path & ~1) || (info->invert_classid & ~1))
+ return -EINVAL;
+
+ if (!info->has_path && !info->has_classid) {
+ pr_info("xt_cgroup: no path or classid specified\n");
+ return -EINVAL;
+ }
+
+ if (info->has_path && info->has_classid) {
+ pr_info_ratelimited("path and classid specified\n");
+ return -EINVAL;
+ }
+
+ info->priv = NULL;
+ if (info->has_path) {
+ cgrp = cgroup_get_from_path(info->path);
+ if (IS_ERR(cgrp)) {
+ pr_info_ratelimited("invalid path, errno=%ld\n",
+ PTR_ERR(cgrp));
+ return -EINVAL;
+ }
+ info->priv = cgrp;
+ }
+
+ return 0;
+}
+
static bool
cgroup_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct xt_cgroup_info_v0 *info = par->matchinfo;
+ struct sock *sk = skb->sk;
- if (skb->sk == NULL || !sk_fullsock(skb->sk))
+ if (!sk || !sk_fullsock(sk) || !net_eq(xt_net(par), sock_net(sk)))
return false;
return (info->id == sock_cgroup_classid(&skb->sk->sk_cgrp_data)) ^
@@ -85,8 +118,27 @@ static bool cgroup_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
const struct xt_cgroup_info_v1 *info = par->matchinfo;
struct sock_cgroup_data *skcd = &skb->sk->sk_cgrp_data;
struct cgroup *ancestor = info->priv;
+ struct sock *sk = skb->sk;
+
+ if (!sk || !sk_fullsock(sk) || !net_eq(xt_net(par), sock_net(sk)))
+ return false;
+
+ if (ancestor)
+ return cgroup_is_descendant(sock_cgroup_ptr(skcd), ancestor) ^
+ info->invert_path;
+ else
+ return (info->classid == sock_cgroup_classid(skcd)) ^
+ info->invert_classid;
+}
+
+static bool cgroup_mt_v2(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_cgroup_info_v2 *info = par->matchinfo;
+ struct sock_cgroup_data *skcd = &skb->sk->sk_cgrp_data;
+ struct cgroup *ancestor = info->priv;
+ struct sock *sk = skb->sk;
- if (!skb->sk || !sk_fullsock(skb->sk))
+ if (!sk || !sk_fullsock(sk) || !net_eq(xt_net(par), sock_net(sk)))
return false;
if (ancestor)
@@ -105,6 +157,14 @@ static void cgroup_mt_destroy_v1(const struct xt_mtdtor_param *par)
cgroup_put(info->priv);
}
+static void cgroup_mt_destroy_v2(const struct xt_mtdtor_param *par)
+{
+ struct xt_cgroup_info_v2 *info = par->matchinfo;
+
+ if (info->priv)
+ cgroup_put(info->priv);
+}
+
static struct xt_match cgroup_mt_reg[] __read_mostly = {
{
.name = "cgroup",
@@ -132,6 +192,20 @@ static struct xt_match cgroup_mt_reg[] __read_mostly = {
(1 << NF_INET_POST_ROUTING) |
(1 << NF_INET_LOCAL_IN),
},
+ {
+ .name = "cgroup",
+ .revision = 2,
+ .family = NFPROTO_UNSPEC,
+ .checkentry = cgroup_mt_check_v2,
+ .match = cgroup_mt_v2,
+ .matchsize = sizeof(struct xt_cgroup_info_v2),
+ .usersize = offsetof(struct xt_cgroup_info_v2, priv),
+ .destroy = cgroup_mt_destroy_v2,
+ .me = THIS_MODULE,
+ .hooks = (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_POST_ROUTING) |
+ (1 << NF_INET_LOCAL_IN),
+ },
};
static int __init cgroup_mt_init(void)
diff --git a/net/netfilter/xt_cluster.c b/net/netfilter/xt_cluster.c
index dfbdbb2fc0ed..51d0c257e7a5 100644
--- a/net/netfilter/xt_cluster.c
+++ b/net/netfilter/xt_cluster.c
@@ -125,6 +125,7 @@ xt_cluster_mt(const struct sk_buff *skb, struct xt_action_param *par)
static int xt_cluster_mt_checkentry(const struct xt_mtchk_param *par)
{
struct xt_cluster_match_info *info = par->matchinfo;
+ int ret;
if (info->total_nodes > XT_CLUSTER_NODES_MAX) {
pr_info_ratelimited("you have exceeded the maximum number of cluster nodes (%u > %u)\n",
@@ -135,7 +136,17 @@ static int xt_cluster_mt_checkentry(const struct xt_mtchk_param *par)
pr_info_ratelimited("node mask cannot exceed total number of nodes\n");
return -EDOM;
}
- return 0;
+
+ ret = nf_ct_netns_get(par->net, par->family);
+ if (ret < 0)
+ pr_info_ratelimited("cannot load conntrack support for proto=%u\n",
+ par->family);
+ return ret;
+}
+
+static void xt_cluster_mt_destroy(const struct xt_mtdtor_param *par)
+{
+ nf_ct_netns_put(par->net, par->family);
}
static struct xt_match xt_cluster_match __read_mostly = {
@@ -144,6 +155,7 @@ static struct xt_match xt_cluster_match __read_mostly = {
.match = xt_cluster_mt,
.checkentry = xt_cluster_mt_checkentry,
.matchsize = sizeof(struct xt_cluster_match_info),
+ .destroy = xt_cluster_mt_destroy,
.me = THIS_MODULE,
};
diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c
index 6275106ccf50..bc6c8ab0fa62 100644
--- a/net/netfilter/xt_connlimit.c
+++ b/net/netfilter/xt_connlimit.c
@@ -93,10 +93,8 @@ static int connlimit_mt_check(const struct xt_mtchk_param *par)
/* init private data */
info->data = nf_conncount_init(par->net, par->family, keylen);
- if (IS_ERR(info->data))
- return PTR_ERR(info->data);
- return 0;
+ return PTR_ERR_OR_ZERO(info->data);
}
static void connlimit_mt_destroy(const struct xt_mtdtor_param *par)
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 9b16402f29af..3e7d259e5d8d 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -1057,7 +1057,7 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = {
static void *dl_seq_start(struct seq_file *s, loff_t *pos)
__acquires(htable->lock)
{
- struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->private));
+ struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file));
unsigned int *bucket;
spin_lock_bh(&htable->lock);
@@ -1074,7 +1074,7 @@ static void *dl_seq_start(struct seq_file *s, loff_t *pos)
static void *dl_seq_next(struct seq_file *s, void *v, loff_t *pos)
{
- struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->private));
+ struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file));
unsigned int *bucket = v;
*pos = ++(*bucket);
@@ -1088,7 +1088,7 @@ static void *dl_seq_next(struct seq_file *s, void *v, loff_t *pos)
static void dl_seq_stop(struct seq_file *s, void *v)
__releases(htable->lock)
{
- struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->private));
+ struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file));
unsigned int *bucket = v;
if (!IS_ERR(bucket))
@@ -1130,7 +1130,7 @@ static void dl_seq_print(struct dsthash_ent *ent, u_int8_t family,
static int dl_seq_real_show_v2(struct dsthash_ent *ent, u_int8_t family,
struct seq_file *s)
{
- struct xt_hashlimit_htable *ht = PDE_DATA(file_inode(s->private));
+ struct xt_hashlimit_htable *ht = PDE_DATA(file_inode(s->file));
spin_lock(&ent->lock);
/* recalculate to show accurate numbers */
@@ -1145,7 +1145,7 @@ static int dl_seq_real_show_v2(struct dsthash_ent *ent, u_int8_t family,
static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family,
struct seq_file *s)
{
- struct xt_hashlimit_htable *ht = PDE_DATA(file_inode(s->private));
+ struct xt_hashlimit_htable *ht = PDE_DATA(file_inode(s->file));
spin_lock(&ent->lock);
/* recalculate to show accurate numbers */
@@ -1160,7 +1160,7 @@ static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family,
static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
struct seq_file *s)
{
- struct xt_hashlimit_htable *ht = PDE_DATA(file_inode(s->private));
+ struct xt_hashlimit_htable *ht = PDE_DATA(file_inode(s->file));
spin_lock(&ent->lock);
/* recalculate to show accurate numbers */
@@ -1174,7 +1174,7 @@ static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
static int dl_seq_show_v2(struct seq_file *s, void *v)
{
- struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->private));
+ struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file));
unsigned int *bucket = (unsigned int *)v;
struct dsthash_ent *ent;
@@ -1188,7 +1188,7 @@ static int dl_seq_show_v2(struct seq_file *s, void *v)
static int dl_seq_show_v1(struct seq_file *s, void *v)
{
- struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->private));
+ struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file));
unsigned int *bucket = v;
struct dsthash_ent *ent;
@@ -1202,7 +1202,7 @@ static int dl_seq_show_v1(struct seq_file *s, void *v)
static int dl_seq_show(struct seq_file *s, void *v)
{
- struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->private));
+ struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file));
unsigned int *bucket = v;
struct dsthash_ent *ent;
diff --git a/net/netfilter/xt_nat.c b/net/netfilter/xt_nat.c
index 8af9707f8789..ac91170fc8c8 100644
--- a/net/netfilter/xt_nat.c
+++ b/net/netfilter/xt_nat.c
@@ -216,6 +216,8 @@ static struct xt_target xt_nat_target_reg[] __read_mostly = {
{
.name = "DNAT",
.revision = 2,
+ .checkentry = xt_nat_checkentry,
+ .destroy = xt_nat_destroy,
.target = xt_dnat_target_v2,
.targetsize = sizeof(struct nf_nat_range2),
.table = "nat",
diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c
index 9cfef73b4107..7a103553d10d 100644
--- a/net/netfilter/xt_osf.c
+++ b/net/netfilter/xt_osf.c
@@ -37,129 +37,11 @@
#include <net/netfilter/nf_log.h>
#include <linux/netfilter/xt_osf.h>
-/*
- * Indexed by dont-fragment bit.
- * It is the only constant value in the fingerprint.
- */
-static struct list_head xt_osf_fingers[2];
-
-static const struct nla_policy xt_osf_policy[OSF_ATTR_MAX + 1] = {
- [OSF_ATTR_FINGER] = { .len = sizeof(struct xt_osf_user_finger) },
-};
-
-static int xt_osf_add_callback(struct net *net, struct sock *ctnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const osf_attrs[],
- struct netlink_ext_ack *extack)
-{
- struct xt_osf_user_finger *f;
- struct xt_osf_finger *kf = NULL, *sf;
- int err = 0;
-
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
-
- if (!osf_attrs[OSF_ATTR_FINGER])
- return -EINVAL;
-
- if (!(nlh->nlmsg_flags & NLM_F_CREATE))
- return -EINVAL;
-
- f = nla_data(osf_attrs[OSF_ATTR_FINGER]);
-
- kf = kmalloc(sizeof(struct xt_osf_finger), GFP_KERNEL);
- if (!kf)
- return -ENOMEM;
-
- memcpy(&kf->finger, f, sizeof(struct xt_osf_user_finger));
-
- list_for_each_entry(sf, &xt_osf_fingers[!!f->df], finger_entry) {
- if (memcmp(&sf->finger, f, sizeof(struct xt_osf_user_finger)))
- continue;
-
- kfree(kf);
- kf = NULL;
-
- if (nlh->nlmsg_flags & NLM_F_EXCL)
- err = -EEXIST;
- break;
- }
-
- /*
- * We are protected by nfnl mutex.
- */
- if (kf)
- list_add_tail_rcu(&kf->finger_entry, &xt_osf_fingers[!!f->df]);
-
- return err;
-}
-
-static int xt_osf_remove_callback(struct net *net, struct sock *ctnl,
- struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const osf_attrs[],
- struct netlink_ext_ack *extack)
-{
- struct xt_osf_user_finger *f;
- struct xt_osf_finger *sf;
- int err = -ENOENT;
-
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
-
- if (!osf_attrs[OSF_ATTR_FINGER])
- return -EINVAL;
-
- f = nla_data(osf_attrs[OSF_ATTR_FINGER]);
-
- list_for_each_entry(sf, &xt_osf_fingers[!!f->df], finger_entry) {
- if (memcmp(&sf->finger, f, sizeof(struct xt_osf_user_finger)))
- continue;
-
- /*
- * We are protected by nfnl mutex.
- */
- list_del_rcu(&sf->finger_entry);
- kfree_rcu(sf, rcu_head);
-
- err = 0;
- break;
- }
-
- return err;
-}
-
-static const struct nfnl_callback xt_osf_nfnetlink_callbacks[OSF_MSG_MAX] = {
- [OSF_MSG_ADD] = {
- .call = xt_osf_add_callback,
- .attr_count = OSF_ATTR_MAX,
- .policy = xt_osf_policy,
- },
- [OSF_MSG_REMOVE] = {
- .call = xt_osf_remove_callback,
- .attr_count = OSF_ATTR_MAX,
- .policy = xt_osf_policy,
- },
-};
-
-static const struct nfnetlink_subsystem xt_osf_nfnetlink = {
- .name = "osf",
- .subsys_id = NFNL_SUBSYS_OSF,
- .cb_count = OSF_MSG_MAX,
- .cb = xt_osf_nfnetlink_callbacks,
-};
-
static bool
xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p)
{
- const struct xt_osf_info *info = p->matchinfo;
- struct net *net = xt_net(p);
-
- if (!info)
- return false;
-
return nf_osf_match(skb, xt_family(p), xt_hooknum(p), xt_in(p),
- xt_out(p), info, net, xt_osf_fingers);
+ xt_out(p), p->matchinfo, xt_net(p), nf_osf_fingers);
}
static struct xt_match xt_osf_match = {
@@ -177,52 +59,21 @@ static struct xt_match xt_osf_match = {
static int __init xt_osf_init(void)
{
- int err = -EINVAL;
- int i;
-
- for (i=0; i<ARRAY_SIZE(xt_osf_fingers); ++i)
- INIT_LIST_HEAD(&xt_osf_fingers[i]);
-
- err = nfnetlink_subsys_register(&xt_osf_nfnetlink);
- if (err < 0) {
- pr_err("Failed to register OSF nsfnetlink helper (%d)\n", err);
- goto err_out_exit;
- }
+ int err;
err = xt_register_match(&xt_osf_match);
if (err) {
pr_err("Failed to register OS fingerprint "
"matching module (%d)\n", err);
- goto err_out_remove;
+ return err;
}
return 0;
-
-err_out_remove:
- nfnetlink_subsys_unregister(&xt_osf_nfnetlink);
-err_out_exit:
- return err;
}
static void __exit xt_osf_fini(void)
{
- struct xt_osf_finger *f;
- int i;
-
- nfnetlink_subsys_unregister(&xt_osf_nfnetlink);
xt_unregister_match(&xt_osf_match);
-
- rcu_read_lock();
- for (i=0; i<ARRAY_SIZE(xt_osf_fingers); ++i) {
-
- list_for_each_entry_rcu(f, &xt_osf_fingers[i], finger_entry) {
- list_del_rcu(&f->finger_entry);
- kfree_rcu(f, rcu_head);
- }
- }
- rcu_read_unlock();
-
- rcu_barrier();
}
module_init(xt_osf_init);
diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c
index 3d705c688a27..46686fb73784 100644
--- a/net/netfilter/xt_owner.c
+++ b/net/netfilter/xt_owner.c
@@ -67,7 +67,7 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par)
struct sock *sk = skb_to_full_sk(skb);
struct net *net = xt_net(par);
- if (sk == NULL || sk->sk_socket == NULL)
+ if (!sk || !sk->sk_socket || !net_eq(net, sock_net(sk)))
return (info->match ^ info->invert) == 0;
else if (info->match & info->invert & XT_OWNER_SOCKET)
/*
diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c
index 07085c22b19c..f44de4bc2100 100644
--- a/net/netfilter/xt_recent.c
+++ b/net/netfilter/xt_recent.c
@@ -265,7 +265,8 @@ recent_mt(const struct sk_buff *skb, struct xt_action_param *par)
}
/* use TTL as seen before forwarding */
- if (xt_out(par) != NULL && skb->sk == NULL)
+ if (xt_out(par) != NULL &&
+ (!skb->sk || !net_eq(net, sock_net(skb->sk))))
ttl++;
spin_lock_bh(&recent_lock);
diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
index 5c0779c4fa3c..ada144e5645b 100644
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -56,8 +56,12 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
struct sk_buff *pskb = (struct sk_buff *)skb;
struct sock *sk = skb->sk;
+ if (sk && !net_eq(xt_net(par), sock_net(sk)))
+ sk = NULL;
+
if (!sk)
sk = nf_sk_lookup_slow_v4(xt_net(par), skb, xt_in(par));
+
if (sk) {
bool wildcard;
bool transparent = true;
@@ -113,8 +117,12 @@ socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par)
struct sk_buff *pskb = (struct sk_buff *)skb;
struct sock *sk = skb->sk;
+ if (sk && !net_eq(xt_net(par), sock_net(sk)))
+ sk = NULL;
+
if (!sk)
sk = nf_sk_lookup_slow_v6(xt_net(par), skb, xt_in(par));
+
if (sk) {
bool wildcard;
bool transparent = true;
diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c
index c070dfc0190a..c92894c3e40a 100644
--- a/net/netlabel/netlabel_unlabeled.c
+++ b/net/netlabel/netlabel_unlabeled.c
@@ -781,7 +781,8 @@ static int netlbl_unlabel_addrinfo_get(struct genl_info *info,
{
u32 addr_len;
- if (info->attrs[NLBL_UNLABEL_A_IPV4ADDR]) {
+ if (info->attrs[NLBL_UNLABEL_A_IPV4ADDR] &&
+ info->attrs[NLBL_UNLABEL_A_IPV4MASK]) {
addr_len = nla_len(info->attrs[NLBL_UNLABEL_A_IPV4ADDR]);
if (addr_len != sizeof(struct in_addr) &&
addr_len != nla_len(info->attrs[NLBL_UNLABEL_A_IPV4MASK]))
diff --git a/net/netlabel/netlabel_user.c b/net/netlabel/netlabel_user.c
index 2f328af91a52..4676f5bb16ae 100644
--- a/net/netlabel/netlabel_user.c
+++ b/net/netlabel/netlabel_user.c
@@ -101,7 +101,7 @@ struct audit_buffer *netlbl_audit_start_common(int type,
char *secctx;
u32 secctx_len;
- if (audit_enabled == 0)
+ if (audit_enabled == AUDIT_OFF)
return NULL;
audit_buf = audit_log_start(audit_context(), GFP_ATOMIC, type);
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 56704d95f82d..6bb9f3cde0b0 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -574,11 +574,6 @@ static int netlink_insert(struct sock *sk, u32 portid)
if (nlk_sk(sk)->bound)
goto err;
- err = -ENOMEM;
- if (BITS_PER_LONG > 32 &&
- unlikely(atomic_read(&table->hash.nelems) >= UINT_MAX))
- goto err;
-
nlk_sk(sk)->portid = portid;
sock_hold(sk);
@@ -993,7 +988,7 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr,
struct netlink_sock *nlk = nlk_sk(sk);
struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr;
int err = 0;
- long unsigned int groups = nladdr->nl_groups;
+ unsigned long groups = nladdr->nl_groups;
bool bound;
if (addr_len < sizeof(struct sockaddr_nl))
@@ -1011,9 +1006,7 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr,
return err;
}
- if (nlk->ngroups == 0)
- groups = 0;
- else if (nlk->ngroups < 8*sizeof(groups))
+ if (nlk->ngroups < BITS_PER_LONG)
groups &= (1UL << nlk->ngroups) - 1;
bound = nlk->bound;
@@ -1713,6 +1706,13 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
nlk->flags &= ~NETLINK_F_EXT_ACK;
err = 0;
break;
+ case NETLINK_DUMP_STRICT_CHK:
+ if (val)
+ nlk->flags |= NETLINK_F_STRICT_CHK;
+ else
+ nlk->flags &= ~NETLINK_F_STRICT_CHK;
+ err = 0;
+ break;
default:
err = -ENOPROTOOPT;
}
@@ -1806,6 +1806,15 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname,
return -EFAULT;
err = 0;
break;
+ case NETLINK_DUMP_STRICT_CHK:
+ if (len < sizeof(int))
+ return -EINVAL;
+ len = sizeof(int);
+ val = nlk->flags & NETLINK_F_STRICT_CHK ? 1 : 0;
+ if (put_user(len, optlen) || put_user(val, optval))
+ return -EFAULT;
+ err = 0;
+ break;
default:
err = -ENOPROTOOPT;
}
@@ -2178,6 +2187,7 @@ EXPORT_SYMBOL(__nlmsg_put);
static int netlink_dump(struct sock *sk)
{
struct netlink_sock *nlk = nlk_sk(sk);
+ struct netlink_ext_ack extack = {};
struct netlink_callback *cb;
struct sk_buff *skb = NULL;
struct nlmsghdr *nlh;
@@ -2229,8 +2239,11 @@ static int netlink_dump(struct sock *sk)
skb_reserve(skb, skb_tailroom(skb) - alloc_size);
netlink_skb_set_owner_r(skb, sk);
- if (nlk->dump_done_errno > 0)
+ if (nlk->dump_done_errno > 0) {
+ cb->extack = &extack;
nlk->dump_done_errno = cb->dump(skb, cb);
+ cb->extack = NULL;
+ }
if (nlk->dump_done_errno > 0 ||
skb_tailroom(skb) < nlmsg_total_size(sizeof(nlk->dump_done_errno))) {
@@ -2244,7 +2257,8 @@ static int netlink_dump(struct sock *sk)
}
nlh = nlmsg_put_answer(skb, cb, NLMSG_DONE,
- sizeof(nlk->dump_done_errno), NLM_F_MULTI);
+ sizeof(nlk->dump_done_errno),
+ NLM_F_MULTI | cb->answer_flags);
if (WARN_ON(!nlh))
goto errout_skb;
@@ -2253,6 +2267,12 @@ static int netlink_dump(struct sock *sk)
memcpy(nlmsg_data(nlh), &nlk->dump_done_errno,
sizeof(nlk->dump_done_errno));
+ if (extack._msg && nlk->flags & NETLINK_F_EXT_ACK) {
+ nlh->nlmsg_flags |= NLM_F_ACK_TLVS;
+ if (!nla_put_string(skb, NLMSGERR_ATTR_MSG, extack._msg))
+ nlmsg_end(skb, nlh);
+ }
+
if (sk_filter(sk, skb))
kfree_skb(skb);
else
@@ -2279,9 +2299,9 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
const struct nlmsghdr *nlh,
struct netlink_dump_control *control)
{
+ struct netlink_sock *nlk, *nlk2;
struct netlink_callback *cb;
struct sock *sk;
- struct netlink_sock *nlk;
int ret;
refcount_inc(&skb->users);
@@ -2307,7 +2327,6 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
cb = &nlk->cb;
memset(cb, 0, sizeof(*cb));
- cb->start = control->start;
cb->dump = control->dump;
cb->done = control->done;
cb->nlh = nlh;
@@ -2316,8 +2335,11 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
cb->min_dump_alloc = control->min_dump_alloc;
cb->skb = skb;
- if (cb->start) {
- ret = cb->start(cb);
+ nlk2 = nlk_sk(NETLINK_CB(skb).sk);
+ cb->strict_check = !!(nlk2->flags & NETLINK_F_STRICT_CHK);
+
+ if (control->start) {
+ ret = control->start(cb);
if (ret)
goto error_put;
}
diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h
index 962de7b3c023..5f454c8de6a4 100644
--- a/net/netlink/af_netlink.h
+++ b/net/netlink/af_netlink.h
@@ -15,6 +15,7 @@
#define NETLINK_F_LISTEN_ALL_NSID 0x10
#define NETLINK_F_CAP_ACK 0x20
#define NETLINK_F_EXT_ACK 0x40
+#define NETLINK_F_STRICT_CHK 0x80
#define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8)
#define NLGRPLONGS(x) (NLGRPSZ(x)/sizeof(unsigned long))
diff --git a/net/nfc/hci/core.c b/net/nfc/hci/core.c
index ac8030c4bcf8..19cb2e473ea6 100644
--- a/net/nfc/hci/core.c
+++ b/net/nfc/hci/core.c
@@ -209,6 +209,11 @@ void nfc_hci_cmd_received(struct nfc_hci_dev *hdev, u8 pipe, u8 cmd,
}
create_info = (struct hci_create_pipe_resp *)skb->data;
+ if (create_info->pipe >= NFC_HCI_MAX_PIPES) {
+ status = NFC_HCI_ANY_E_NOK;
+ goto exit;
+ }
+
/* Save the new created pipe and bind with local gate,
* the description for skb->data[3] is destination gate id
* but since we received this cmd from host controller, we
@@ -232,6 +237,11 @@ void nfc_hci_cmd_received(struct nfc_hci_dev *hdev, u8 pipe, u8 cmd,
}
delete_info = (struct hci_delete_pipe_noti *)skb->data;
+ if (delete_info->pipe >= NFC_HCI_MAX_PIPES) {
+ status = NFC_HCI_ANY_E_NOK;
+ goto exit;
+ }
+
hdev->pipes[delete_info->pipe].gate = NFC_HCI_INVALID_GATE;
hdev->pipes[delete_info->pipe].dest_host = NFC_HCI_INVALID_HOST;
break;
diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c
index ea0c0c6f1874..ae296273ce3d 100644
--- a/net/nfc/llcp_sock.c
+++ b/net/nfc/llcp_sock.c
@@ -556,7 +556,7 @@ static __poll_t llcp_sock_poll(struct file *file, struct socket *sock,
pr_debug("%p\n", sk);
- sock_poll_wait(file, sk_sleep(sk), wait);
+ sock_poll_wait(file, sock, wait);
if (sk->sk_state == LLCP_LISTEN)
return llcp_accept_poll(sk);
diff --git a/net/nfc/nci/uart.c b/net/nfc/nci/uart.c
index a66f102c6c01..78fe622eba65 100644
--- a/net/nfc/nci/uart.c
+++ b/net/nfc/nci/uart.c
@@ -192,10 +192,8 @@ static void nci_uart_tty_close(struct tty_struct *tty)
if (!nu)
return;
- if (nu->tx_skb)
- kfree_skb(nu->tx_skb);
- if (nu->rx_skb)
- kfree_skb(nu->rx_skb);
+ kfree_skb(nu->tx_skb);
+ kfree_skb(nu->rx_skb);
skb_queue_purge(&nu->tx_q);
@@ -465,6 +463,7 @@ static struct tty_ldisc_ops nci_uart_ldisc = {
.receive_buf = nci_uart_tty_receive,
.write_wakeup = nci_uart_tty_wakeup,
.ioctl = nci_uart_tty_ioctl,
+ .compat_ioctl = nci_uart_tty_ioctl,
};
static int __init nci_uart_init(void)
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 30a5df27116e..85ae53d8fd09 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -1057,6 +1057,28 @@ static int sample(struct datapath *dp, struct sk_buff *skb,
clone_flow_key);
}
+/* When 'last' is true, clone() should always consume the 'skb'.
+ * Otherwise, clone() should keep 'skb' intact regardless what
+ * actions are executed within clone().
+ */
+static int clone(struct datapath *dp, struct sk_buff *skb,
+ struct sw_flow_key *key, const struct nlattr *attr,
+ bool last)
+{
+ struct nlattr *actions;
+ struct nlattr *clone_arg;
+ int rem = nla_len(attr);
+ bool dont_clone_flow_key;
+
+ /* The first action is always 'OVS_CLONE_ATTR_ARG'. */
+ clone_arg = nla_data(attr);
+ dont_clone_flow_key = nla_get_u32(clone_arg);
+ actions = nla_next(clone_arg, &rem);
+
+ return clone_execute(dp, skb, key, 0, actions, rem, last,
+ !dont_clone_flow_key);
+}
+
static void execute_hash(struct sk_buff *skb, struct sw_flow_key *key,
const struct nlattr *attr)
{
@@ -1336,6 +1358,17 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
consume_skb(skb);
return 0;
}
+ break;
+
+ case OVS_ACTION_ATTR_CLONE: {
+ bool last = nla_is_last(a, rem);
+
+ err = clone(dp, skb, key, a, last);
+ if (last)
+ return err;
+
+ break;
+ }
}
if (unlikely(err)) {
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 284aca2a252d..6bec37ab4472 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -26,6 +26,7 @@
#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+#include <net/ipv6_frag.h>
#ifdef CONFIG_NF_NAT_NEEDED
#include <linux/netfilter/nf_nat.h>
@@ -607,23 +608,12 @@ static struct nf_conn *
ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone,
u8 l3num, struct sk_buff *skb, bool natted)
{
- const struct nf_conntrack_l3proto *l3proto;
- const struct nf_conntrack_l4proto *l4proto;
struct nf_conntrack_tuple tuple;
struct nf_conntrack_tuple_hash *h;
struct nf_conn *ct;
- unsigned int dataoff;
- u8 protonum;
- l3proto = __nf_ct_l3proto_find(l3num);
- if (l3proto->get_l4proto(skb, skb_network_offset(skb), &dataoff,
- &protonum) <= 0) {
- pr_debug("ovs_ct_find_existing: Can't get protonum\n");
- return NULL;
- }
- l4proto = __nf_ct_l4proto_find(l3num, protonum);
- if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num,
- protonum, net, &tuple, l3proto, l4proto)) {
+ if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), l3num,
+ net, &tuple)) {
pr_debug("ovs_ct_find_existing: Can't get tuple\n");
return NULL;
}
@@ -632,7 +622,7 @@ ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone,
if (natted) {
struct nf_conntrack_tuple inverse;
- if (!nf_ct_invert_tuple(&inverse, &tuple, l3proto, l4proto)) {
+ if (!nf_ct_invert_tuplepr(&inverse, &tuple)) {
pr_debug("ovs_ct_find_existing: Inversion failed!\n");
return NULL;
}
@@ -943,6 +933,11 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
struct nf_conn *ct;
if (!cached) {
+ struct nf_hook_state state = {
+ .hook = NF_INET_PRE_ROUTING,
+ .pf = info->family,
+ .net = net,
+ };
struct nf_conn *tmpl = info->ct;
int err;
@@ -954,8 +949,7 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
nf_ct_set(skb, tmpl, IP_CT_NEW);
}
- err = nf_conntrack_in(net, info->family,
- NF_INET_PRE_ROUTING, skb);
+ err = nf_conntrack_in(skb, &state);
if (err != NF_ACCEPT)
return -ENOENT;
@@ -1314,7 +1308,7 @@ static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name,
return -EINVAL;
}
- help = nf_ct_helper_ext_add(info->ct, helper, GFP_KERNEL);
+ help = nf_ct_helper_ext_add(info->ct, GFP_KERNEL);
if (!help) {
nf_conntrack_helper_put(helper);
return -ENOMEM;
@@ -1322,6 +1316,10 @@ static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name,
rcu_assign_pointer(help->helper, helper);
info->helper = helper;
+
+ if (info->nat)
+ request_module("ip_nat_%s", name);
+
return 0;
}
@@ -1634,10 +1632,6 @@ int ovs_ct_copy_action(struct net *net, const struct nlattr *attr,
OVS_NLERR(log, "Failed to allocate conntrack template");
return -ENOMEM;
}
-
- __set_bit(IPS_CONFIRMED_BIT, &ct_info.ct->status);
- nf_conntrack_get(&ct_info.ct->ct_general);
-
if (helper) {
err = ovs_ct_add_helper(&ct_info, helper, key, log);
if (err)
@@ -1649,6 +1643,8 @@ int ovs_ct_copy_action(struct net *net, const struct nlattr *attr,
if (err)
goto err_free_ct;
+ __set_bit(IPS_CONFIRMED_BIT, &ct_info.ct->status);
+ nf_conntrack_get(&ct_info.ct->ct_general);
return 0;
err_free_ct:
__ovs_ct_free_action(&ct_info);
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 0f5ce77460d4..6679e96ab1dc 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -1182,14 +1182,14 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info)
ovs_header->dp_ifindex,
reply, info->snd_portid,
info->snd_seq, 0,
- OVS_FLOW_CMD_NEW,
+ OVS_FLOW_CMD_SET,
ufid_flags);
BUG_ON(error < 0);
}
} else {
/* Could not alloc without acts before locking. */
reply = ovs_flow_cmd_build_info(flow, ovs_header->dp_ifindex,
- info, OVS_FLOW_CMD_NEW, false,
+ info, OVS_FLOW_CMD_SET, false,
ufid_flags);
if (IS_ERR(reply)) {
@@ -1265,7 +1265,7 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info)
}
reply = ovs_flow_cmd_build_info(flow, ovs_header->dp_ifindex, info,
- OVS_FLOW_CMD_NEW, true, ufid_flags);
+ OVS_FLOW_CMD_GET, true, ufid_flags);
if (IS_ERR(reply)) {
err = PTR_ERR(reply);
goto unlock;
@@ -1389,7 +1389,7 @@ static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
if (ovs_flow_cmd_fill_info(flow, ovs_header->dp_ifindex, skb,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI,
- OVS_FLOW_CMD_NEW, ufid_flags) < 0)
+ OVS_FLOW_CMD_GET, ufid_flags) < 0)
break;
cb->args[0] = bucket;
@@ -1730,7 +1730,7 @@ static int ovs_dp_cmd_set(struct sk_buff *skb, struct genl_info *info)
ovs_dp_change(dp, info->attrs);
err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid,
- info->snd_seq, 0, OVS_DP_CMD_NEW);
+ info->snd_seq, 0, OVS_DP_CMD_SET);
BUG_ON(err < 0);
ovs_unlock();
@@ -1761,7 +1761,7 @@ static int ovs_dp_cmd_get(struct sk_buff *skb, struct genl_info *info)
goto err_unlock_free;
}
err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid,
- info->snd_seq, 0, OVS_DP_CMD_NEW);
+ info->snd_seq, 0, OVS_DP_CMD_GET);
BUG_ON(err < 0);
ovs_unlock();
@@ -1785,7 +1785,7 @@ static int ovs_dp_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
if (i >= skip &&
ovs_dp_cmd_fill_info(dp, skb, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI,
- OVS_DP_CMD_NEW) < 0)
+ OVS_DP_CMD_GET) < 0)
break;
i++;
}
@@ -2101,7 +2101,7 @@ static int ovs_vport_cmd_set(struct sk_buff *skb, struct genl_info *info)
err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info),
info->snd_portid, info->snd_seq, 0,
- OVS_VPORT_CMD_NEW);
+ OVS_VPORT_CMD_SET);
BUG_ON(err < 0);
ovs_unlock();
@@ -2182,7 +2182,7 @@ static int ovs_vport_cmd_get(struct sk_buff *skb, struct genl_info *info)
goto exit_unlock_free;
err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info),
info->snd_portid, info->snd_seq, 0,
- OVS_VPORT_CMD_NEW);
+ OVS_VPORT_CMD_GET);
BUG_ON(err < 0);
rcu_read_unlock();
@@ -2218,7 +2218,7 @@ static int ovs_vport_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
NLM_F_MULTI,
- OVS_VPORT_CMD_NEW) < 0)
+ OVS_VPORT_CMD_GET) < 0)
goto out;
j++;
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 56b8e7167790..35966da84769 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -254,21 +254,18 @@ static bool icmphdr_ok(struct sk_buff *skb)
static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key)
{
+ unsigned short frag_off;
+ unsigned int payload_ofs = 0;
unsigned int nh_ofs = skb_network_offset(skb);
unsigned int nh_len;
- int payload_ofs;
struct ipv6hdr *nh;
- uint8_t nexthdr;
- __be16 frag_off;
- int err;
+ int err, nexthdr, flags = 0;
err = check_header(skb, nh_ofs + sizeof(*nh));
if (unlikely(err))
return err;
nh = ipv6_hdr(skb);
- nexthdr = nh->nexthdr;
- payload_ofs = (u8 *)(nh + 1) - skb->data;
key->ip.proto = NEXTHDR_NONE;
key->ip.tos = ipv6_get_dsfield(nh);
@@ -277,10 +274,9 @@ static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key)
key->ipv6.addr.src = nh->saddr;
key->ipv6.addr.dst = nh->daddr;
- payload_ofs = ipv6_skip_exthdr(skb, payload_ofs, &nexthdr, &frag_off);
-
- if (frag_off) {
- if (frag_off & htons(~0x7))
+ nexthdr = ipv6_find_hdr(skb, &payload_ofs, -1, &frag_off, &flags);
+ if (flags & IP6_FH_F_FRAG) {
+ if (frag_off)
key->ip.frag = OVS_FRAG_TYPE_LATER;
else
key->ip.frag = OVS_FRAG_TYPE_FIRST;
@@ -288,11 +284,11 @@ static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key)
key->ip.frag = OVS_FRAG_TYPE_NONE;
}
- /* Delayed handling of error in ipv6_skip_exthdr() as it
- * always sets frag_off to a valid value which may be
+ /* Delayed handling of error in ipv6_find_hdr() as it
+ * always sets flags and frag_off to a valid value which may be
* used to set key->ip.frag above.
*/
- if (unlikely(payload_ofs < 0))
+ if (unlikely(nexthdr < 0))
return -EPROTO;
nh_len = payload_ofs - nh_ofs;
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index 492ab0c36f7c..a70097ecf33c 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -2460,6 +2460,40 @@ static int validate_and_copy_sample(struct net *net, const struct nlattr *attr,
return 0;
}
+static int validate_and_copy_clone(struct net *net,
+ const struct nlattr *attr,
+ const struct sw_flow_key *key,
+ struct sw_flow_actions **sfa,
+ __be16 eth_type, __be16 vlan_tci,
+ bool log, bool last)
+{
+ int start, err;
+ u32 exec;
+
+ if (nla_len(attr) && nla_len(attr) < NLA_HDRLEN)
+ return -EINVAL;
+
+ start = add_nested_action_start(sfa, OVS_ACTION_ATTR_CLONE, log);
+ if (start < 0)
+ return start;
+
+ exec = last || !actions_may_change_flow(attr);
+
+ err = ovs_nla_add_action(sfa, OVS_CLONE_ATTR_EXEC, &exec,
+ sizeof(exec), log);
+ if (err)
+ return err;
+
+ err = __ovs_nla_copy_actions(net, attr, key, sfa,
+ eth_type, vlan_tci, log);
+ if (err)
+ return err;
+
+ add_nested_action_end(*sfa, start);
+
+ return 0;
+}
+
void ovs_match_init(struct sw_flow_match *match,
struct sw_flow_key *key,
bool reset_key,
@@ -2516,7 +2550,9 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
struct ovs_tunnel_info *ovs_tun;
struct nlattr *a;
int err = 0, start, opts_type;
+ __be16 dst_opt_type;
+ dst_opt_type = 0;
ovs_match_init(&match, &key, true, NULL);
opts_type = ip_tun_from_nlattr(nla_data(attr), &match, false, log);
if (opts_type < 0)
@@ -2528,10 +2564,13 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
err = validate_geneve_opts(&key);
if (err < 0)
return err;
+ dst_opt_type = TUNNEL_GENEVE_OPT;
break;
case OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS:
+ dst_opt_type = TUNNEL_VXLAN_OPT;
break;
case OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS:
+ dst_opt_type = TUNNEL_ERSPAN_OPT;
break;
}
}
@@ -2574,7 +2613,7 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
*/
ip_tunnel_info_opts_set(tun_info,
TUN_METADATA_OPTS(&key, key.tun_opts_len),
- key.tun_opts_len);
+ key.tun_opts_len, dst_opt_type);
add_nested_action_end(*sfa, start);
return err;
@@ -2844,6 +2883,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
[OVS_ACTION_ATTR_PUSH_NSH] = (u32)-1,
[OVS_ACTION_ATTR_POP_NSH] = 0,
[OVS_ACTION_ATTR_METER] = sizeof(u32),
+ [OVS_ACTION_ATTR_CLONE] = (u32)-1,
};
const struct ovs_action_push_vlan *vlan;
int type = nla_type(a);
@@ -3033,6 +3073,18 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
/* Non-existent meters are simply ignored. */
break;
+ case OVS_ACTION_ATTR_CLONE: {
+ bool last = nla_is_last(a, rem);
+
+ err = validate_and_copy_clone(net, a, key, sfa,
+ eth_type, vlan_tci,
+ log, last);
+ if (err)
+ return err;
+ skip_copy = true;
+ break;
+ }
+
default:
OVS_NLERR(log, "Unknown Action type %d", type);
return -EINVAL;
@@ -3111,6 +3163,26 @@ out:
return err;
}
+static int clone_action_to_attr(const struct nlattr *attr,
+ struct sk_buff *skb)
+{
+ struct nlattr *start;
+ int err = 0, rem = nla_len(attr);
+
+ start = nla_nest_start(skb, OVS_ACTION_ATTR_CLONE);
+ if (!start)
+ return -EMSGSIZE;
+
+ err = ovs_nla_put_actions(nla_data(attr), rem, skb);
+
+ if (err)
+ nla_nest_cancel(skb, start);
+ else
+ nla_nest_end(skb, start);
+
+ return err;
+}
+
static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb)
{
const struct nlattr *ovs_key = nla_data(a);
@@ -3199,6 +3271,12 @@ int ovs_nla_put_actions(const struct nlattr *attr, int len, struct sk_buff *skb)
return err;
break;
+ case OVS_ACTION_ATTR_CLONE:
+ err = clone_action_to_attr(a, skb);
+ if (err)
+ return err;
+ break;
+
default:
if (nla_put(skb, type, nla_len(a), nla_data(a)))
return -EMSGSIZE;
diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c
index bb95c43aae76..26f71cbf7527 100644
--- a/net/openvswitch/vport-internal_dev.c
+++ b/net/openvswitch/vport-internal_dev.c
@@ -43,7 +43,8 @@ static struct internal_dev *internal_dev_priv(struct net_device *netdev)
}
/* Called with rcu_read_lock_bh. */
-static int internal_dev_xmit(struct sk_buff *skb, struct net_device *netdev)
+static netdev_tx_t
+internal_dev_xmit(struct sk_buff *skb, struct net_device *netdev)
{
int len, err;
@@ -62,7 +63,7 @@ static int internal_dev_xmit(struct sk_buff *skb, struct net_device *netdev)
} else {
netdev->stats.tx_errors++;
}
- return 0;
+ return NETDEV_TX_OK;
}
static int internal_dev_open(struct net_device *netdev)
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index e6445d8f3f57..ec3095f13aae 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -275,9 +275,10 @@ static bool packet_use_direct_xmit(const struct packet_sock *po)
return po->xmit == packet_direct_xmit;
}
-static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
+static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb,
+ struct net_device *sb_dev)
{
- return (u16) raw_smp_processor_id() % dev->real_num_tx_queues;
+ return dev_pick_tx_cpu_id(dev, skb, sb_dev, NULL);
}
static u16 packet_pick_tx_queue(struct sk_buff *skb)
@@ -291,7 +292,7 @@ static u16 packet_pick_tx_queue(struct sk_buff *skb)
__packet_pick_tx_queue);
queue_index = netdev_cap_txqueue(dev, queue_index);
} else {
- queue_index = __packet_pick_tx_queue(dev, skb);
+ queue_index = __packet_pick_tx_queue(dev, skb, NULL);
}
return queue_index;
@@ -1581,7 +1582,7 @@ static int fanout_set_data(struct packet_sock *po, char __user *data,
return fanout_set_data_ebpf(po, data, len);
default:
return -EINVAL;
- };
+ }
}
static void fanout_release_data(struct packet_fanout *f)
@@ -1590,7 +1591,7 @@ static void fanout_release_data(struct packet_fanout *f)
case PACKET_FANOUT_CBPF:
case PACKET_FANOUT_EBPF:
__fanout_set_data_bpf(f, NULL);
- };
+ }
}
static bool __fanout_id_is_free(struct sock *sk, u16 candidate_id)
@@ -1951,7 +1952,7 @@ retry:
goto out_unlock;
}
- sockc.tsflags = sk->sk_tsflags;
+ sockcm_init(&sockc, sk);
if (msg->msg_controllen) {
err = sock_cmsg_send(sk, msg, &sockc);
if (unlikely(err))
@@ -1962,6 +1963,7 @@ retry:
skb->dev = dev;
skb->priority = sk->sk_priority;
skb->mark = sk->sk_mark;
+ skb->tstamp = sockc.transmit_time;
sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags);
@@ -2457,6 +2459,7 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
skb->dev = dev;
skb->priority = po->sk.sk_priority;
skb->mark = po->sk.sk_mark;
+ skb->tstamp = sockc->transmit_time;
sock_tx_timestamp(&po->sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags);
skb_shinfo(skb)->destructor_arg = ph.raw;
@@ -2633,7 +2636,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
if (unlikely(!(dev->flags & IFF_UP)))
goto out_put;
- sockc.tsflags = po->sk.sk_tsflags;
+ sockcm_init(&sockc, &po->sk);
if (msg->msg_controllen) {
err = sock_cmsg_send(&po->sk, msg, &sockc);
if (unlikely(err))
@@ -2712,10 +2715,12 @@ tpacket_error:
}
}
- if (po->has_vnet_hdr && virtio_net_hdr_to_skb(skb, vnet_hdr,
- vio_le())) {
- tp_len = -EINVAL;
- goto tpacket_error;
+ if (po->has_vnet_hdr) {
+ if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) {
+ tp_len = -EINVAL;
+ goto tpacket_error;
+ }
+ virtio_net_hdr_set_proto(skb, vnet_hdr);
}
skb->destructor = tpacket_destruct_skb;
@@ -2829,7 +2834,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
if (unlikely(!(dev->flags & IFF_UP)))
goto out_unlock;
- sockc.tsflags = sk->sk_tsflags;
+ sockcm_init(&sockc, sk);
sockc.mark = sk->sk_mark;
if (msg->msg_controllen) {
err = sock_cmsg_send(sk, msg, &sockc);
@@ -2905,12 +2910,14 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
skb->dev = dev;
skb->priority = sk->sk_priority;
skb->mark = sockc.mark;
+ skb->tstamp = sockc.transmit_time;
if (has_vnet_hdr) {
err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
if (err)
goto out_free;
len += sizeof(vnet_hdr);
+ virtio_net_hdr_set_proto(skb, &vnet_hdr);
}
skb_probe_transport_header(skb, reserve);
@@ -3801,6 +3808,20 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
return fanout_set_data(po, optval, optlen);
}
+ case PACKET_IGNORE_OUTGOING:
+ {
+ int val;
+
+ if (optlen != sizeof(val))
+ return -EINVAL;
+ if (copy_from_user(&val, optval, sizeof(val)))
+ return -EFAULT;
+ if (val < 0 || val > 1)
+ return -EINVAL;
+
+ po->prot_hook.ignore_outgoing = !!val;
+ return 0;
+ }
case PACKET_TX_HAS_OFF:
{
unsigned int val;
@@ -3924,6 +3945,9 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
((u32)po->fanout->flags << 24)) :
0);
break;
+ case PACKET_IGNORE_OUTGOING:
+ val = po->prot_hook.ignore_outgoing;
+ break;
case PACKET_ROLLOVER_STATS:
if (!po->rollover)
return -EINVAL;
diff --git a/net/rds/Kconfig b/net/rds/Kconfig
index bffde4b46c5d..b9092111bc45 100644
--- a/net/rds/Kconfig
+++ b/net/rds/Kconfig
@@ -1,6 +1,6 @@
config RDS
- tristate "The RDS Protocol"
+ tristate "The Reliable Datagram Sockets Protocol"
depends on INET
---help---
The RDS (Reliable Datagram Sockets) protocol provides reliable,
@@ -16,6 +16,7 @@ config RDS_RDMA
config RDS_TCP
tristate "RDS over TCP"
depends on RDS
+ depends on IPV6 || !IPV6
---help---
Allow RDS to use TCP as a transport.
This transport does not support RDMA operations.
@@ -24,4 +25,3 @@ config RDS_DEBUG
bool "RDS debugging messages"
depends on RDS
default n
-
diff --git a/net/rds/Makefile b/net/rds/Makefile
index b5d568bd479c..e647f9de104a 100644
--- a/net/rds/Makefile
+++ b/net/rds/Makefile
@@ -15,4 +15,3 @@ rds_tcp-y := tcp.o tcp_connect.o tcp_listen.o tcp_recv.o \
tcp_send.o tcp_stats.o
ccflags-$(CONFIG_RDS_DEBUG) := -DRDS_DEBUG
-
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index ab751a150f70..65387e1e6964 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006 Oracle. All rights reserved.
+ * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -35,6 +35,7 @@
#include <linux/kernel.h>
#include <linux/gfp.h>
#include <linux/in.h>
+#include <linux/ipv6.h>
#include <linux/poll.h>
#include <net/sock.h>
@@ -113,26 +114,82 @@ void rds_wake_sk_sleep(struct rds_sock *rs)
static int rds_getname(struct socket *sock, struct sockaddr *uaddr,
int peer)
{
- struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
struct rds_sock *rs = rds_sk_to_rs(sock->sk);
-
- memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+ struct sockaddr_in6 *sin6;
+ struct sockaddr_in *sin;
+ int uaddr_len;
/* racey, don't care */
if (peer) {
- if (!rs->rs_conn_addr)
+ if (ipv6_addr_any(&rs->rs_conn_addr))
return -ENOTCONN;
- sin->sin_port = rs->rs_conn_port;
- sin->sin_addr.s_addr = rs->rs_conn_addr;
+ if (ipv6_addr_v4mapped(&rs->rs_conn_addr)) {
+ sin = (struct sockaddr_in *)uaddr;
+ memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+ sin->sin_family = AF_INET;
+ sin->sin_port = rs->rs_conn_port;
+ sin->sin_addr.s_addr = rs->rs_conn_addr_v4;
+ uaddr_len = sizeof(*sin);
+ } else {
+ sin6 = (struct sockaddr_in6 *)uaddr;
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_port = rs->rs_conn_port;
+ sin6->sin6_addr = rs->rs_conn_addr;
+ sin6->sin6_flowinfo = 0;
+ /* scope_id is the same as in the bound address. */
+ sin6->sin6_scope_id = rs->rs_bound_scope_id;
+ uaddr_len = sizeof(*sin6);
+ }
} else {
- sin->sin_port = rs->rs_bound_port;
- sin->sin_addr.s_addr = rs->rs_bound_addr;
+ /* If socket is not yet bound and the socket is connected,
+ * set the return address family to be the same as the
+ * connected address, but with 0 address value. If it is not
+ * connected, set the family to be AF_UNSPEC (value 0) and
+ * the address size to be that of an IPv4 address.
+ */
+ if (ipv6_addr_any(&rs->rs_bound_addr)) {
+ if (ipv6_addr_any(&rs->rs_conn_addr)) {
+ sin = (struct sockaddr_in *)uaddr;
+ memset(sin, 0, sizeof(*sin));
+ sin->sin_family = AF_UNSPEC;
+ return sizeof(*sin);
+ }
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (!(ipv6_addr_type(&rs->rs_conn_addr) &
+ IPV6_ADDR_MAPPED)) {
+ sin6 = (struct sockaddr_in6 *)uaddr;
+ memset(sin6, 0, sizeof(*sin6));
+ sin6->sin6_family = AF_INET6;
+ return sizeof(*sin6);
+ }
+#endif
+
+ sin = (struct sockaddr_in *)uaddr;
+ memset(sin, 0, sizeof(*sin));
+ sin->sin_family = AF_INET;
+ return sizeof(*sin);
+ }
+ if (ipv6_addr_v4mapped(&rs->rs_bound_addr)) {
+ sin = (struct sockaddr_in *)uaddr;
+ memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+ sin->sin_family = AF_INET;
+ sin->sin_port = rs->rs_bound_port;
+ sin->sin_addr.s_addr = rs->rs_bound_addr_v4;
+ uaddr_len = sizeof(*sin);
+ } else {
+ sin6 = (struct sockaddr_in6 *)uaddr;
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_port = rs->rs_bound_port;
+ sin6->sin6_addr = rs->rs_bound_addr;
+ sin6->sin6_flowinfo = 0;
+ sin6->sin6_scope_id = rs->rs_bound_scope_id;
+ uaddr_len = sizeof(*sin6);
+ }
}
- sin->sin_family = AF_INET;
-
- return sizeof(*sin);
+ return uaddr_len;
}
/*
@@ -203,11 +260,12 @@ static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval,
int len)
{
+ struct sockaddr_in6 sin6;
struct sockaddr_in sin;
int ret = 0;
/* racing with another thread binding seems ok here */
- if (rs->rs_bound_addr == 0) {
+ if (ipv6_addr_any(&rs->rs_bound_addr)) {
ret = -ENOTCONN; /* XXX not a great errno */
goto out;
}
@@ -215,14 +273,23 @@ static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval,
if (len < sizeof(struct sockaddr_in)) {
ret = -EINVAL;
goto out;
+ } else if (len < sizeof(struct sockaddr_in6)) {
+ /* Assume IPv4 */
+ if (copy_from_user(&sin, optval, sizeof(struct sockaddr_in))) {
+ ret = -EFAULT;
+ goto out;
+ }
+ ipv6_addr_set_v4mapped(sin.sin_addr.s_addr, &sin6.sin6_addr);
+ sin6.sin6_port = sin.sin_port;
+ } else {
+ if (copy_from_user(&sin6, optval,
+ sizeof(struct sockaddr_in6))) {
+ ret = -EFAULT;
+ goto out;
+ }
}
- if (copy_from_user(&sin, optval, sizeof(sin))) {
- ret = -EFAULT;
- goto out;
- }
-
- rds_send_drop_to(rs, &sin);
+ rds_send_drop_to(rs, &sin6);
out:
return ret;
}
@@ -435,31 +502,91 @@ static int rds_connect(struct socket *sock, struct sockaddr *uaddr,
int addr_len, int flags)
{
struct sock *sk = sock->sk;
- struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
+ struct sockaddr_in *sin;
struct rds_sock *rs = rds_sk_to_rs(sk);
int ret = 0;
lock_sock(sk);
- if (addr_len != sizeof(struct sockaddr_in)) {
- ret = -EINVAL;
- goto out;
- }
+ switch (uaddr->sa_family) {
+ case AF_INET:
+ sin = (struct sockaddr_in *)uaddr;
+ if (addr_len < sizeof(struct sockaddr_in)) {
+ ret = -EINVAL;
+ break;
+ }
+ if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) {
+ ret = -EDESTADDRREQ;
+ break;
+ }
+ if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) ||
+ sin->sin_addr.s_addr == htonl(INADDR_BROADCAST)) {
+ ret = -EINVAL;
+ break;
+ }
+ ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &rs->rs_conn_addr);
+ rs->rs_conn_port = sin->sin_port;
+ break;
- if (sin->sin_family != AF_INET) {
- ret = -EAFNOSUPPORT;
- goto out;
- }
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6: {
+ struct sockaddr_in6 *sin6;
+ int addr_type;
- if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) {
- ret = -EDESTADDRREQ;
- goto out;
+ sin6 = (struct sockaddr_in6 *)uaddr;
+ if (addr_len < sizeof(struct sockaddr_in6)) {
+ ret = -EINVAL;
+ break;
+ }
+ addr_type = ipv6_addr_type(&sin6->sin6_addr);
+ if (!(addr_type & IPV6_ADDR_UNICAST)) {
+ __be32 addr4;
+
+ if (!(addr_type & IPV6_ADDR_MAPPED)) {
+ ret = -EPROTOTYPE;
+ break;
+ }
+
+ /* It is a mapped address. Need to do some sanity
+ * checks.
+ */
+ addr4 = sin6->sin6_addr.s6_addr32[3];
+ if (addr4 == htonl(INADDR_ANY) ||
+ addr4 == htonl(INADDR_BROADCAST) ||
+ IN_MULTICAST(ntohl(addr4))) {
+ ret = -EPROTOTYPE;
+ break;
+ }
+ }
+
+ if (addr_type & IPV6_ADDR_LINKLOCAL) {
+ /* If socket is arleady bound to a link local address,
+ * the peer address must be on the same link.
+ */
+ if (sin6->sin6_scope_id == 0 ||
+ (!ipv6_addr_any(&rs->rs_bound_addr) &&
+ rs->rs_bound_scope_id &&
+ sin6->sin6_scope_id != rs->rs_bound_scope_id)) {
+ ret = -EINVAL;
+ break;
+ }
+ /* Remember the connected address scope ID. It will
+ * be checked against the binding local address when
+ * the socket is bound.
+ */
+ rs->rs_bound_scope_id = sin6->sin6_scope_id;
+ }
+ rs->rs_conn_addr = sin6->sin6_addr;
+ rs->rs_conn_port = sin6->sin6_port;
+ break;
}
+#endif
- rs->rs_conn_addr = sin->sin_addr.s_addr;
- rs->rs_conn_port = sin->sin_port;
+ default:
+ ret = -EAFNOSUPPORT;
+ break;
+ }
-out:
release_sock(sk);
return ret;
}
@@ -578,8 +705,10 @@ static void rds_sock_inc_info(struct socket *sock, unsigned int len,
list_for_each_entry(inc, &rs->rs_recv_queue, i_item) {
total++;
if (total <= len)
- rds_inc_info_copy(inc, iter, inc->i_saddr,
- rs->rs_bound_addr, 1);
+ rds_inc_info_copy(inc, iter,
+ inc->i_saddr.s6_addr32[3],
+ rs->rs_bound_addr_v4,
+ 1);
}
read_unlock(&rs->rs_recv_lock);
@@ -608,8 +737,8 @@ static void rds_sock_info(struct socket *sock, unsigned int len,
list_for_each_entry(rs, &rds_sock_list, rs_item) {
sinfo.sndbuf = rds_sk_sndbuf(rs);
sinfo.rcvbuf = rds_sk_rcvbuf(rs);
- sinfo.bound_addr = rs->rs_bound_addr;
- sinfo.connected_addr = rs->rs_conn_addr;
+ sinfo.bound_addr = rs->rs_bound_addr_v4;
+ sinfo.connected_addr = rs->rs_conn_addr_v4;
sinfo.bound_port = rs->rs_bound_port;
sinfo.connected_port = rs->rs_conn_port;
sinfo.inum = sock_i_ino(rds_rs_to_sk(rs));
diff --git a/net/rds/bind.c b/net/rds/bind.c
index 5aa3a64aa4f0..762d2c6788a3 100644
--- a/net/rds/bind.c
+++ b/net/rds/bind.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006 Oracle. All rights reserved.
+ * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -33,6 +33,7 @@
#include <linux/kernel.h>
#include <net/sock.h>
#include <linux/in.h>
+#include <linux/ipv6.h>
#include <linux/if_arp.h>
#include <linux/jhash.h>
#include <linux/ratelimit.h>
@@ -42,42 +43,60 @@ static struct rhashtable bind_hash_table;
static const struct rhashtable_params ht_parms = {
.nelem_hint = 768,
- .key_len = sizeof(u64),
+ .key_len = RDS_BOUND_KEY_LEN,
.key_offset = offsetof(struct rds_sock, rs_bound_key),
.head_offset = offsetof(struct rds_sock, rs_bound_node),
.max_size = 16384,
.min_size = 1024,
};
+/* Create a key for the bind hash table manipulation. Port is in network byte
+ * order.
+ */
+static inline void __rds_create_bind_key(u8 *key, const struct in6_addr *addr,
+ __be16 port, __u32 scope_id)
+{
+ memcpy(key, addr, sizeof(*addr));
+ key += sizeof(*addr);
+ memcpy(key, &port, sizeof(port));
+ key += sizeof(port);
+ memcpy(key, &scope_id, sizeof(scope_id));
+}
+
/*
* Return the rds_sock bound at the given local address.
*
* The rx path can race with rds_release. We notice if rds_release() has
* marked this socket and don't return a rs ref to the rx path.
*/
-struct rds_sock *rds_find_bound(__be32 addr, __be16 port)
+struct rds_sock *rds_find_bound(const struct in6_addr *addr, __be16 port,
+ __u32 scope_id)
{
- u64 key = ((u64)addr << 32) | port;
+ u8 key[RDS_BOUND_KEY_LEN];
struct rds_sock *rs;
- rs = rhashtable_lookup_fast(&bind_hash_table, &key, ht_parms);
+ __rds_create_bind_key(key, addr, port, scope_id);
+ rcu_read_lock();
+ rs = rhashtable_lookup(&bind_hash_table, key, ht_parms);
if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD))
rds_sock_addref(rs);
else
rs = NULL;
+ rcu_read_unlock();
- rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr,
- ntohs(port));
+ rdsdebug("returning rs %p for %pI6c:%u\n", rs, addr,
+ ntohs(port));
return rs;
}
/* returns -ve errno or +ve port */
-static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
+static int rds_add_bound(struct rds_sock *rs, const struct in6_addr *addr,
+ __be16 *port, __u32 scope_id)
{
int ret = -EADDRINUSE;
u16 rover, last;
- u64 key;
+ u8 key[RDS_BOUND_KEY_LEN];
if (*port != 0) {
rover = be16_to_cpu(*port);
@@ -95,12 +114,13 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
if (rover == RDS_FLAG_PROBE_PORT)
continue;
- key = ((u64)addr << 32) | cpu_to_be16(rover);
- if (rhashtable_lookup_fast(&bind_hash_table, &key, ht_parms))
+ __rds_create_bind_key(key, addr, cpu_to_be16(rover),
+ scope_id);
+ if (rhashtable_lookup_fast(&bind_hash_table, key, ht_parms))
continue;
- rs->rs_bound_key = key;
- rs->rs_bound_addr = addr;
+ memcpy(rs->rs_bound_key, key, sizeof(rs->rs_bound_key));
+ rs->rs_bound_addr = *addr;
net_get_random_once(&rs->rs_hash_initval,
sizeof(rs->rs_hash_initval));
rs->rs_bound_port = cpu_to_be16(rover);
@@ -109,12 +129,13 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
if (!rhashtable_insert_fast(&bind_hash_table,
&rs->rs_bound_node, ht_parms)) {
*port = rs->rs_bound_port;
+ rs->rs_bound_scope_id = scope_id;
ret = 0;
- rdsdebug("rs %p binding to %pI4:%d\n",
- rs, &addr, (int)ntohs(*port));
+ rdsdebug("rs %p binding to %pI6c:%d\n",
+ rs, addr, (int)ntohs(*port));
break;
} else {
- rs->rs_bound_addr = 0;
+ rs->rs_bound_addr = in6addr_any;
rds_sock_put(rs);
ret = -ENOMEM;
break;
@@ -127,44 +148,104 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
void rds_remove_bound(struct rds_sock *rs)
{
- if (!rs->rs_bound_addr)
+ if (ipv6_addr_any(&rs->rs_bound_addr))
return;
- rdsdebug("rs %p unbinding from %pI4:%d\n",
+ rdsdebug("rs %p unbinding from %pI6c:%d\n",
rs, &rs->rs_bound_addr,
ntohs(rs->rs_bound_port));
rhashtable_remove_fast(&bind_hash_table, &rs->rs_bound_node, ht_parms);
rds_sock_put(rs);
- rs->rs_bound_addr = 0;
+ rs->rs_bound_addr = in6addr_any;
}
int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
struct sock *sk = sock->sk;
- struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
struct rds_sock *rs = rds_sk_to_rs(sk);
+ struct in6_addr v6addr, *binding_addr;
struct rds_transport *trans;
+ __u32 scope_id = 0;
int ret = 0;
+ __be16 port;
+
+ /* We allow an RDS socket to be bound to either IPv4 or IPv6
+ * address.
+ */
+ if (uaddr->sa_family == AF_INET) {
+ struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
+
+ if (addr_len < sizeof(struct sockaddr_in) ||
+ sin->sin_addr.s_addr == htonl(INADDR_ANY) ||
+ sin->sin_addr.s_addr == htonl(INADDR_BROADCAST) ||
+ IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
+ return -EINVAL;
+ ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &v6addr);
+ binding_addr = &v6addr;
+ port = sin->sin_port;
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (uaddr->sa_family == AF_INET6) {
+ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)uaddr;
+ int addr_type;
+
+ if (addr_len < sizeof(struct sockaddr_in6))
+ return -EINVAL;
+ addr_type = ipv6_addr_type(&sin6->sin6_addr);
+ if (!(addr_type & IPV6_ADDR_UNICAST)) {
+ __be32 addr4;
+ if (!(addr_type & IPV6_ADDR_MAPPED))
+ return -EINVAL;
+
+ /* It is a mapped address. Need to do some sanity
+ * checks.
+ */
+ addr4 = sin6->sin6_addr.s6_addr32[3];
+ if (addr4 == htonl(INADDR_ANY) ||
+ addr4 == htonl(INADDR_BROADCAST) ||
+ IN_MULTICAST(ntohl(addr4)))
+ return -EINVAL;
+ }
+ /* The scope ID must be specified for link local address. */
+ if (addr_type & IPV6_ADDR_LINKLOCAL) {
+ if (sin6->sin6_scope_id == 0)
+ return -EINVAL;
+ scope_id = sin6->sin6_scope_id;
+ }
+ binding_addr = &sin6->sin6_addr;
+ port = sin6->sin6_port;
+#endif
+ } else {
+ return -EINVAL;
+ }
lock_sock(sk);
- if (addr_len != sizeof(struct sockaddr_in) ||
- sin->sin_family != AF_INET ||
- rs->rs_bound_addr ||
- sin->sin_addr.s_addr == htonl(INADDR_ANY)) {
+ /* RDS socket does not allow re-binding. */
+ if (!ipv6_addr_any(&rs->rs_bound_addr)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ /* Socket is connected. The binding address should have the same
+ * scope ID as the connected address, except the case when one is
+ * non-link local address (scope_id is 0).
+ */
+ if (!ipv6_addr_any(&rs->rs_conn_addr) && scope_id &&
+ rs->rs_bound_scope_id &&
+ scope_id != rs->rs_bound_scope_id) {
ret = -EINVAL;
goto out;
}
- ret = rds_add_bound(rs, sin->sin_addr.s_addr, &sin->sin_port);
+ sock_set_flag(sk, SOCK_RCU_FREE);
+ ret = rds_add_bound(rs, binding_addr, &port, scope_id);
if (ret)
goto out;
if (rs->rs_transport) { /* previously bound */
trans = rs->rs_transport;
if (trans->laddr_check(sock_net(sock->sk),
- sin->sin_addr.s_addr) != 0) {
+ binding_addr, scope_id) != 0) {
ret = -ENOPROTOOPT;
rds_remove_bound(rs);
} else {
@@ -172,13 +253,13 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
}
goto out;
}
- trans = rds_trans_get_preferred(sock_net(sock->sk),
- sin->sin_addr.s_addr);
+ trans = rds_trans_get_preferred(sock_net(sock->sk), binding_addr,
+ scope_id);
if (!trans) {
ret = -EADDRNOTAVAIL;
rds_remove_bound(rs);
- pr_info_ratelimited("RDS: %s could not find a transport for %pI4, load rds_tcp or rds_rdma?\n",
- __func__, &sin->sin_addr.s_addr);
+ pr_info_ratelimited("RDS: %s could not find a transport for %pI6c, load rds_tcp or rds_rdma?\n",
+ __func__, binding_addr);
goto out;
}
diff --git a/net/rds/cong.c b/net/rds/cong.c
index 63da9d2f142d..ccdff09a79c8 100644
--- a/net/rds/cong.c
+++ b/net/rds/cong.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2007 Oracle. All rights reserved.
+ * Copyright (c) 2007, 2017 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -101,7 +101,7 @@ static DEFINE_RWLOCK(rds_cong_monitor_lock);
static DEFINE_SPINLOCK(rds_cong_lock);
static struct rb_root rds_cong_tree = RB_ROOT;
-static struct rds_cong_map *rds_cong_tree_walk(__be32 addr,
+static struct rds_cong_map *rds_cong_tree_walk(const struct in6_addr *addr,
struct rds_cong_map *insert)
{
struct rb_node **p = &rds_cong_tree.rb_node;
@@ -109,12 +109,15 @@ static struct rds_cong_map *rds_cong_tree_walk(__be32 addr,
struct rds_cong_map *map;
while (*p) {
+ int diff;
+
parent = *p;
map = rb_entry(parent, struct rds_cong_map, m_rb_node);
- if (addr < map->m_addr)
+ diff = rds_addr_cmp(addr, &map->m_addr);
+ if (diff < 0)
p = &(*p)->rb_left;
- else if (addr > map->m_addr)
+ else if (diff > 0)
p = &(*p)->rb_right;
else
return map;
@@ -132,7 +135,7 @@ static struct rds_cong_map *rds_cong_tree_walk(__be32 addr,
* these bitmaps in the process getting pointers to them. The bitmaps are only
* ever freed as the module is removed after all connections have been freed.
*/
-static struct rds_cong_map *rds_cong_from_addr(__be32 addr)
+static struct rds_cong_map *rds_cong_from_addr(const struct in6_addr *addr)
{
struct rds_cong_map *map;
struct rds_cong_map *ret = NULL;
@@ -144,7 +147,7 @@ static struct rds_cong_map *rds_cong_from_addr(__be32 addr)
if (!map)
return NULL;
- map->m_addr = addr;
+ map->m_addr = *addr;
init_waitqueue_head(&map->m_waitq);
INIT_LIST_HEAD(&map->m_conn_list);
@@ -171,7 +174,7 @@ out:
kfree(map);
}
- rdsdebug("map %p for addr %x\n", ret, be32_to_cpu(addr));
+ rdsdebug("map %p for addr %pI6c\n", ret, addr);
return ret;
}
@@ -202,8 +205,8 @@ void rds_cong_remove_conn(struct rds_connection *conn)
int rds_cong_get_maps(struct rds_connection *conn)
{
- conn->c_lcong = rds_cong_from_addr(conn->c_laddr);
- conn->c_fcong = rds_cong_from_addr(conn->c_faddr);
+ conn->c_lcong = rds_cong_from_addr(&conn->c_laddr);
+ conn->c_fcong = rds_cong_from_addr(&conn->c_faddr);
if (!(conn->c_lcong && conn->c_fcong))
return -ENOMEM;
@@ -353,7 +356,7 @@ void rds_cong_remove_socket(struct rds_sock *rs)
/* update congestion map for now-closed port */
spin_lock_irqsave(&rds_cong_lock, flags);
- map = rds_cong_tree_walk(rs->rs_bound_addr, NULL);
+ map = rds_cong_tree_walk(&rs->rs_bound_addr, NULL);
spin_unlock_irqrestore(&rds_cong_lock, flags);
if (map && rds_cong_test_bit(map, rs->rs_bound_port)) {
diff --git a/net/rds/connection.c b/net/rds/connection.c
index cfb05953b0e5..3bd2f4a5a30d 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006 Oracle. All rights reserved.
+ * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -34,7 +34,9 @@
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/export.h>
-#include <net/inet_hashtables.h>
+#include <net/ipv6.h>
+#include <net/inet6_hashtables.h>
+#include <net/addrconf.h>
#include "rds.h"
#include "loop.h"
@@ -49,18 +51,25 @@ static unsigned long rds_conn_count;
static struct hlist_head rds_conn_hash[RDS_CONNECTION_HASH_ENTRIES];
static struct kmem_cache *rds_conn_slab;
-static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr)
+static struct hlist_head *rds_conn_bucket(const struct in6_addr *laddr,
+ const struct in6_addr *faddr)
{
+ static u32 rds6_hash_secret __read_mostly;
static u32 rds_hash_secret __read_mostly;
- unsigned long hash;
+ u32 lhash, fhash, hash;
net_get_random_once(&rds_hash_secret, sizeof(rds_hash_secret));
+ net_get_random_once(&rds6_hash_secret, sizeof(rds6_hash_secret));
+
+ lhash = (__force u32)laddr->s6_addr32[3];
+#if IS_ENABLED(CONFIG_IPV6)
+ fhash = __ipv6_addr_jhash(faddr, rds6_hash_secret);
+#else
+ fhash = (__force u32)faddr->s6_addr32[3];
+#endif
+ hash = __inet_ehashfn(lhash, 0, fhash, 0, rds_hash_secret);
- /* Pass NULL, don't need struct net for hash */
- hash = __inet_ehashfn(be32_to_cpu(laddr), 0,
- be32_to_cpu(faddr), 0,
- rds_hash_secret);
return &rds_conn_hash[hash & RDS_CONNECTION_HASH_MASK];
}
@@ -72,20 +81,25 @@ static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr)
/* rcu read lock must be held or the connection spinlock */
static struct rds_connection *rds_conn_lookup(struct net *net,
struct hlist_head *head,
- __be32 laddr, __be32 faddr,
- struct rds_transport *trans)
+ const struct in6_addr *laddr,
+ const struct in6_addr *faddr,
+ struct rds_transport *trans,
+ int dev_if)
{
struct rds_connection *conn, *ret = NULL;
hlist_for_each_entry_rcu(conn, head, c_hash_node) {
- if (conn->c_faddr == faddr && conn->c_laddr == laddr &&
- conn->c_trans == trans && net == rds_conn_net(conn)) {
+ if (ipv6_addr_equal(&conn->c_faddr, faddr) &&
+ ipv6_addr_equal(&conn->c_laddr, laddr) &&
+ conn->c_trans == trans &&
+ net == rds_conn_net(conn) &&
+ conn->c_dev_if == dev_if) {
ret = conn;
break;
}
}
- rdsdebug("returning conn %p for %pI4 -> %pI4\n", ret,
- &laddr, &faddr);
+ rdsdebug("returning conn %p for %pI6c -> %pI6c\n", ret,
+ laddr, faddr);
return ret;
}
@@ -99,8 +113,8 @@ static void rds_conn_path_reset(struct rds_conn_path *cp)
{
struct rds_connection *conn = cp->cp_conn;
- rdsdebug("connection %pI4 to %pI4 reset\n",
- &conn->c_laddr, &conn->c_faddr);
+ rdsdebug("connection %pI6c to %pI6c reset\n",
+ &conn->c_laddr, &conn->c_faddr);
rds_stats_inc(s_conn_reset);
rds_send_path_reset(cp);
@@ -142,9 +156,12 @@ static void __rds_conn_path_init(struct rds_connection *conn,
* are torn down as the module is removed, if ever.
*/
static struct rds_connection *__rds_conn_create(struct net *net,
- __be32 laddr, __be32 faddr,
- struct rds_transport *trans, gfp_t gfp,
- int is_outgoing)
+ const struct in6_addr *laddr,
+ const struct in6_addr *faddr,
+ struct rds_transport *trans,
+ gfp_t gfp,
+ int is_outgoing,
+ int dev_if)
{
struct rds_connection *conn, *parent = NULL;
struct hlist_head *head = rds_conn_bucket(laddr, faddr);
@@ -154,9 +171,12 @@ static struct rds_connection *__rds_conn_create(struct net *net,
int npaths = (trans->t_mp_capable ? RDS_MPATH_WORKERS : 1);
rcu_read_lock();
- conn = rds_conn_lookup(net, head, laddr, faddr, trans);
- if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport &&
- laddr == faddr && !is_outgoing) {
+ conn = rds_conn_lookup(net, head, laddr, faddr, trans, dev_if);
+ if (conn &&
+ conn->c_loopback &&
+ conn->c_trans != &rds_loop_transport &&
+ ipv6_addr_equal(laddr, faddr) &&
+ !is_outgoing) {
/* This is a looped back IB connection, and we're
* called by the code handling the incoming connect.
* We need a second connection object into which we
@@ -181,8 +201,22 @@ static struct rds_connection *__rds_conn_create(struct net *net,
}
INIT_HLIST_NODE(&conn->c_hash_node);
- conn->c_laddr = laddr;
- conn->c_faddr = faddr;
+ conn->c_laddr = *laddr;
+ conn->c_isv6 = !ipv6_addr_v4mapped(laddr);
+ conn->c_faddr = *faddr;
+ conn->c_dev_if = dev_if;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ /* If the local address is link local, set c_bound_if to be the
+ * index used for this connection. Otherwise, set it to 0 as
+ * the socket is not bound to an interface. c_bound_if is used
+ * to look up a socket when a packet is received
+ */
+ if (ipv6_addr_type(laddr) & IPV6_ADDR_LINKLOCAL)
+ conn->c_bound_if = dev_if;
+ else
+#endif
+ conn->c_bound_if = 0;
rds_conn_net_set(conn, net);
@@ -199,7 +233,7 @@ static struct rds_connection *__rds_conn_create(struct net *net,
* can bind to the destination address then we'd rather the messages
* flow through loopback rather than either transport.
*/
- loop_trans = rds_trans_get_preferred(net, faddr);
+ loop_trans = rds_trans_get_preferred(net, faddr, conn->c_dev_if);
if (loop_trans) {
rds_trans_put(loop_trans);
conn->c_loopback = 1;
@@ -233,10 +267,10 @@ static struct rds_connection *__rds_conn_create(struct net *net,
goto out;
}
- rdsdebug("allocated conn %p for %pI4 -> %pI4 over %s %s\n",
- conn, &laddr, &faddr,
- strnlen(trans->t_name, sizeof(trans->t_name)) ? trans->t_name :
- "[unknown]", is_outgoing ? "(outgoing)" : "");
+ rdsdebug("allocated conn %p for %pI6c -> %pI6c over %s %s\n",
+ conn, laddr, faddr,
+ strnlen(trans->t_name, sizeof(trans->t_name)) ?
+ trans->t_name : "[unknown]", is_outgoing ? "(outgoing)" : "");
/*
* Since we ran without holding the conn lock, someone could
@@ -262,7 +296,8 @@ static struct rds_connection *__rds_conn_create(struct net *net,
/* Creating normal conn */
struct rds_connection *found;
- found = rds_conn_lookup(net, head, laddr, faddr, trans);
+ found = rds_conn_lookup(net, head, laddr, faddr, trans,
+ dev_if);
if (found) {
struct rds_conn_path *cp;
int i;
@@ -295,18 +330,22 @@ out:
}
struct rds_connection *rds_conn_create(struct net *net,
- __be32 laddr, __be32 faddr,
- struct rds_transport *trans, gfp_t gfp)
+ const struct in6_addr *laddr,
+ const struct in6_addr *faddr,
+ struct rds_transport *trans, gfp_t gfp,
+ int dev_if)
{
- return __rds_conn_create(net, laddr, faddr, trans, gfp, 0);
+ return __rds_conn_create(net, laddr, faddr, trans, gfp, 0, dev_if);
}
EXPORT_SYMBOL_GPL(rds_conn_create);
struct rds_connection *rds_conn_create_outgoing(struct net *net,
- __be32 laddr, __be32 faddr,
- struct rds_transport *trans, gfp_t gfp)
+ const struct in6_addr *laddr,
+ const struct in6_addr *faddr,
+ struct rds_transport *trans,
+ gfp_t gfp, int dev_if)
{
- return __rds_conn_create(net, laddr, faddr, trans, gfp, 1);
+ return __rds_conn_create(net, laddr, faddr, trans, gfp, 1, dev_if);
}
EXPORT_SYMBOL_GPL(rds_conn_create_outgoing);
@@ -464,10 +503,23 @@ void rds_conn_destroy(struct rds_connection *conn)
}
EXPORT_SYMBOL_GPL(rds_conn_destroy);
-static void rds_conn_message_info(struct socket *sock, unsigned int len,
- struct rds_info_iterator *iter,
- struct rds_info_lengths *lens,
- int want_send)
+static void __rds_inc_msg_cp(struct rds_incoming *inc,
+ struct rds_info_iterator *iter,
+ void *saddr, void *daddr, int flip, bool isv6)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ if (isv6)
+ rds6_inc_info_copy(inc, iter, saddr, daddr, flip);
+ else
+#endif
+ rds_inc_info_copy(inc, iter, *(__be32 *)saddr,
+ *(__be32 *)daddr, flip);
+}
+
+static void rds_conn_message_info_cmn(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens,
+ int want_send, bool isv6)
{
struct hlist_head *head;
struct list_head *list;
@@ -478,7 +530,10 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len,
size_t i;
int j;
- len /= sizeof(struct rds_info_message);
+ if (isv6)
+ len /= sizeof(struct rds6_info_message);
+ else
+ len /= sizeof(struct rds_info_message);
rcu_read_lock();
@@ -488,6 +543,9 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len,
struct rds_conn_path *cp;
int npaths;
+ if (!isv6 && conn->c_isv6)
+ continue;
+
npaths = (conn->c_trans->t_mp_capable ?
RDS_MPATH_WORKERS : 1);
@@ -504,11 +562,11 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len,
list_for_each_entry(rm, list, m_conn_item) {
total++;
if (total <= len)
- rds_inc_info_copy(&rm->m_inc,
- iter,
- conn->c_laddr,
- conn->c_faddr,
- 0);
+ __rds_inc_msg_cp(&rm->m_inc,
+ iter,
+ &conn->c_laddr,
+ &conn->c_faddr,
+ 0, isv6);
}
spin_unlock_irqrestore(&cp->cp_lock, flags);
@@ -518,9 +576,30 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len,
rcu_read_unlock();
lens->nr = total;
- lens->each = sizeof(struct rds_info_message);
+ if (isv6)
+ lens->each = sizeof(struct rds6_info_message);
+ else
+ lens->each = sizeof(struct rds_info_message);
}
+static void rds_conn_message_info(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens,
+ int want_send)
+{
+ rds_conn_message_info_cmn(sock, len, iter, lens, want_send, false);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static void rds6_conn_message_info(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens,
+ int want_send)
+{
+ rds_conn_message_info_cmn(sock, len, iter, lens, want_send, true);
+}
+#endif
+
static void rds_conn_message_info_send(struct socket *sock, unsigned int len,
struct rds_info_iterator *iter,
struct rds_info_lengths *lens)
@@ -528,6 +607,15 @@ static void rds_conn_message_info_send(struct socket *sock, unsigned int len,
rds_conn_message_info(sock, len, iter, lens, 1);
}
+#if IS_ENABLED(CONFIG_IPV6)
+static void rds6_conn_message_info_send(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens)
+{
+ rds6_conn_message_info(sock, len, iter, lens, 1);
+}
+#endif
+
static void rds_conn_message_info_retrans(struct socket *sock,
unsigned int len,
struct rds_info_iterator *iter,
@@ -536,6 +624,16 @@ static void rds_conn_message_info_retrans(struct socket *sock,
rds_conn_message_info(sock, len, iter, lens, 0);
}
+#if IS_ENABLED(CONFIG_IPV6)
+static void rds6_conn_message_info_retrans(struct socket *sock,
+ unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens)
+{
+ rds6_conn_message_info(sock, len, iter, lens, 0);
+}
+#endif
+
void rds_for_each_conn_info(struct socket *sock, unsigned int len,
struct rds_info_iterator *iter,
struct rds_info_lengths *lens,
@@ -584,7 +682,6 @@ static void rds_walk_conn_path_info(struct socket *sock, unsigned int len,
struct hlist_head *head;
struct rds_connection *conn;
size_t i;
- int j;
rcu_read_lock();
@@ -595,17 +692,20 @@ static void rds_walk_conn_path_info(struct socket *sock, unsigned int len,
i++, head++) {
hlist_for_each_entry_rcu(conn, head, c_hash_node) {
struct rds_conn_path *cp;
- int npaths;
- npaths = (conn->c_trans->t_mp_capable ?
- RDS_MPATH_WORKERS : 1);
- for (j = 0; j < npaths; j++) {
- cp = &conn->c_path[j];
+ /* XXX We only copy the information from the first
+ * path for now. The problem is that if there are
+ * more than one underlying paths, we cannot report
+ * information of all of them using the existing
+ * API. For example, there is only one next_tx_seq,
+ * which path's next_tx_seq should we report? It is
+ * a bug in the design of MPRDS.
+ */
+ cp = conn->c_path;
- /* XXX no cp_lock usage.. */
- if (!visitor(cp, buffer))
- continue;
- }
+ /* XXX no cp_lock usage.. */
+ if (!visitor(cp, buffer))
+ continue;
/* We copy as much as we can fit in the buffer,
* but we count all items so that the caller
@@ -624,12 +724,16 @@ static void rds_walk_conn_path_info(struct socket *sock, unsigned int len,
static int rds_conn_info_visitor(struct rds_conn_path *cp, void *buffer)
{
struct rds_info_connection *cinfo = buffer;
+ struct rds_connection *conn = cp->cp_conn;
+
+ if (conn->c_isv6)
+ return 0;
cinfo->next_tx_seq = cp->cp_next_tx_seq;
cinfo->next_rx_seq = cp->cp_next_rx_seq;
- cinfo->laddr = cp->cp_conn->c_laddr;
- cinfo->faddr = cp->cp_conn->c_faddr;
- strncpy(cinfo->transport, cp->cp_conn->c_trans->t_name,
+ cinfo->laddr = conn->c_laddr.s6_addr32[3];
+ cinfo->faddr = conn->c_faddr.s6_addr32[3];
+ strncpy(cinfo->transport, conn->c_trans->t_name,
sizeof(cinfo->transport));
cinfo->flags = 0;
@@ -645,6 +749,36 @@ static int rds_conn_info_visitor(struct rds_conn_path *cp, void *buffer)
return 1;
}
+#if IS_ENABLED(CONFIG_IPV6)
+static int rds6_conn_info_visitor(struct rds_conn_path *cp, void *buffer)
+{
+ struct rds6_info_connection *cinfo6 = buffer;
+ struct rds_connection *conn = cp->cp_conn;
+
+ cinfo6->next_tx_seq = cp->cp_next_tx_seq;
+ cinfo6->next_rx_seq = cp->cp_next_rx_seq;
+ cinfo6->laddr = conn->c_laddr;
+ cinfo6->faddr = conn->c_faddr;
+ strncpy(cinfo6->transport, conn->c_trans->t_name,
+ sizeof(cinfo6->transport));
+ cinfo6->flags = 0;
+
+ rds_conn_info_set(cinfo6->flags, test_bit(RDS_IN_XMIT, &cp->cp_flags),
+ SENDING);
+ /* XXX Future: return the state rather than these funky bits */
+ rds_conn_info_set(cinfo6->flags,
+ atomic_read(&cp->cp_state) == RDS_CONN_CONNECTING,
+ CONNECTING);
+ rds_conn_info_set(cinfo6->flags,
+ atomic_read(&cp->cp_state) == RDS_CONN_UP,
+ CONNECTED);
+ /* Just return 1 as there is no error case. This is a helper function
+ * for rds_walk_conn_path_info() and it wants a return value.
+ */
+ return 1;
+}
+#endif
+
static void rds_conn_info(struct socket *sock, unsigned int len,
struct rds_info_iterator *iter,
struct rds_info_lengths *lens)
@@ -657,6 +791,20 @@ static void rds_conn_info(struct socket *sock, unsigned int len,
sizeof(struct rds_info_connection));
}
+#if IS_ENABLED(CONFIG_IPV6)
+static void rds6_conn_info(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens)
+{
+ u64 buffer[(sizeof(struct rds6_info_connection) + 7) / 8];
+
+ rds_walk_conn_path_info(sock, len, iter, lens,
+ rds6_conn_info_visitor,
+ buffer,
+ sizeof(struct rds6_info_connection));
+}
+#endif
+
int rds_conn_init(void)
{
int ret;
@@ -678,7 +826,13 @@ int rds_conn_init(void)
rds_conn_message_info_send);
rds_info_register_func(RDS_INFO_RETRANS_MESSAGES,
rds_conn_message_info_retrans);
-
+#if IS_ENABLED(CONFIG_IPV6)
+ rds_info_register_func(RDS6_INFO_CONNECTIONS, rds6_conn_info);
+ rds_info_register_func(RDS6_INFO_SEND_MESSAGES,
+ rds6_conn_message_info_send);
+ rds_info_register_func(RDS6_INFO_RETRANS_MESSAGES,
+ rds6_conn_message_info_retrans);
+#endif
return 0;
}
@@ -696,6 +850,13 @@ void rds_conn_exit(void)
rds_conn_message_info_send);
rds_info_deregister_func(RDS_INFO_RETRANS_MESSAGES,
rds_conn_message_info_retrans);
+#if IS_ENABLED(CONFIG_IPV6)
+ rds_info_deregister_func(RDS6_INFO_CONNECTIONS, rds6_conn_info);
+ rds_info_deregister_func(RDS6_INFO_SEND_MESSAGES,
+ rds6_conn_message_info_send);
+ rds_info_deregister_func(RDS6_INFO_RETRANS_MESSAGES,
+ rds6_conn_message_info_retrans);
+#endif
}
/*
diff --git a/net/rds/ib.c b/net/rds/ib.c
index b6ad38e48f62..eba75c1ba359 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006 Oracle. All rights reserved.
+ * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -39,6 +39,7 @@
#include <linux/delay.h>
#include <linux/slab.h>
#include <linux/module.h>
+#include <net/addrconf.h>
#include "rds_single_path.h"
#include "rds.h"
@@ -143,7 +144,7 @@ static void rds_ib_add_one(struct ib_device *device)
INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free);
rds_ibdev->max_wrs = device->attrs.max_qp_wr;
- rds_ibdev->max_sge = min(device->attrs.max_sge, RDS_IB_MAX_SGE);
+ rds_ibdev->max_sge = min(device->attrs.max_send_sge, RDS_IB_MAX_SGE);
has_fr = (device->attrs.device_cap_flags &
IB_DEVICE_MEM_MGT_EXTENSIONS);
@@ -295,9 +296,11 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn,
/* We will only ever look at IB transports */
if (conn->c_trans != &rds_ib_transport)
return 0;
+ if (conn->c_isv6)
+ return 0;
- iinfo->src_addr = conn->c_laddr;
- iinfo->dst_addr = conn->c_faddr;
+ iinfo->src_addr = conn->c_laddr.s6_addr32[3];
+ iinfo->dst_addr = conn->c_faddr.s6_addr32[3];
memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid));
memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid));
@@ -318,6 +321,40 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn,
return 1;
}
+#if IS_ENABLED(CONFIG_IPV6)
+/* IPv6 version of rds_ib_conn_info_visitor(). */
+static int rds6_ib_conn_info_visitor(struct rds_connection *conn,
+ void *buffer)
+{
+ struct rds6_info_rdma_connection *iinfo6 = buffer;
+ struct rds_ib_connection *ic;
+
+ /* We will only ever look at IB transports */
+ if (conn->c_trans != &rds_ib_transport)
+ return 0;
+
+ iinfo6->src_addr = conn->c_laddr;
+ iinfo6->dst_addr = conn->c_faddr;
+
+ memset(&iinfo6->src_gid, 0, sizeof(iinfo6->src_gid));
+ memset(&iinfo6->dst_gid, 0, sizeof(iinfo6->dst_gid));
+
+ if (rds_conn_state(conn) == RDS_CONN_UP) {
+ struct rds_ib_device *rds_ibdev;
+
+ ic = conn->c_transport_data;
+ rdma_read_gids(ic->i_cm_id, (union ib_gid *)&iinfo6->src_gid,
+ (union ib_gid *)&iinfo6->dst_gid);
+ rds_ibdev = ic->rds_ibdev;
+ iinfo6->max_send_wr = ic->i_send_ring.w_nr;
+ iinfo6->max_recv_wr = ic->i_recv_ring.w_nr;
+ iinfo6->max_send_sge = rds_ibdev->max_sge;
+ rds6_ib_get_mr_info(rds_ibdev, iinfo6);
+ }
+ return 1;
+}
+#endif
+
static void rds_ib_ic_info(struct socket *sock, unsigned int len,
struct rds_info_iterator *iter,
struct rds_info_lengths *lens)
@@ -330,6 +367,20 @@ static void rds_ib_ic_info(struct socket *sock, unsigned int len,
sizeof(struct rds_info_rdma_connection));
}
+#if IS_ENABLED(CONFIG_IPV6)
+/* IPv6 version of rds_ib_ic_info(). */
+static void rds6_ib_ic_info(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens)
+{
+ u64 buffer[(sizeof(struct rds6_info_rdma_connection) + 7) / 8];
+
+ rds_for_each_conn_info(sock, len, iter, lens,
+ rds6_ib_conn_info_visitor,
+ buffer,
+ sizeof(struct rds6_info_rdma_connection));
+}
+#endif
/*
* Early RDS/IB was built to only bind to an address if there is an IPoIB
@@ -341,12 +392,19 @@ static void rds_ib_ic_info(struct socket *sock, unsigned int len,
* allowed to influence which paths have priority. We could call userspace
* asserting this policy "routing".
*/
-static int rds_ib_laddr_check(struct net *net, __be32 addr)
+static int rds_ib_laddr_check(struct net *net, const struct in6_addr *addr,
+ __u32 scope_id)
{
int ret;
struct rdma_cm_id *cm_id;
+#if IS_ENABLED(CONFIG_IPV6)
+ struct sockaddr_in6 sin6;
+#endif
struct sockaddr_in sin;
+ struct sockaddr *sa;
+ bool isv4;
+ isv4 = ipv6_addr_v4mapped(addr);
/* Create a CMA ID and try to bind it. This catches both
* IB and iWARP capable NICs.
*/
@@ -355,22 +413,66 @@ static int rds_ib_laddr_check(struct net *net, __be32 addr)
if (IS_ERR(cm_id))
return PTR_ERR(cm_id);
- memset(&sin, 0, sizeof(sin));
- sin.sin_family = AF_INET;
- sin.sin_addr.s_addr = addr;
+ if (isv4) {
+ memset(&sin, 0, sizeof(sin));
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = addr->s6_addr32[3];
+ sa = (struct sockaddr *)&sin;
+ } else {
+#if IS_ENABLED(CONFIG_IPV6)
+ memset(&sin6, 0, sizeof(sin6));
+ sin6.sin6_family = AF_INET6;
+ sin6.sin6_addr = *addr;
+ sin6.sin6_scope_id = scope_id;
+ sa = (struct sockaddr *)&sin6;
+
+ /* XXX Do a special IPv6 link local address check here. The
+ * reason is that rdma_bind_addr() always succeeds with IPv6
+ * link local address regardless it is indeed configured in a
+ * system.
+ */
+ if (ipv6_addr_type(addr) & IPV6_ADDR_LINKLOCAL) {
+ struct net_device *dev;
+
+ if (scope_id == 0) {
+ ret = -EADDRNOTAVAIL;
+ goto out;
+ }
+
+ /* Use init_net for now as RDS is not network
+ * name space aware.
+ */
+ dev = dev_get_by_index(&init_net, scope_id);
+ if (!dev) {
+ ret = -EADDRNOTAVAIL;
+ goto out;
+ }
+ if (!ipv6_chk_addr(&init_net, addr, dev, 1)) {
+ dev_put(dev);
+ ret = -EADDRNOTAVAIL;
+ goto out;
+ }
+ dev_put(dev);
+ }
+#else
+ ret = -EADDRNOTAVAIL;
+ goto out;
+#endif
+ }
/* rdma_bind_addr will only succeed for IB & iWARP devices */
- ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
+ ret = rdma_bind_addr(cm_id, sa);
/* due to this, we will claim to support iWARP devices unless we
check node_type. */
if (ret || !cm_id->device ||
cm_id->device->node_type != RDMA_NODE_IB_CA)
ret = -EADDRNOTAVAIL;
- rdsdebug("addr %pI4 ret %d node type %d\n",
- &addr, ret,
- cm_id->device ? cm_id->device->node_type : -1);
+ rdsdebug("addr %pI6c%%%u ret %d node type %d\n",
+ addr, scope_id, ret,
+ cm_id->device ? cm_id->device->node_type : -1);
+out:
rdma_destroy_id(cm_id);
return ret;
@@ -401,6 +503,9 @@ void rds_ib_exit(void)
rds_ib_set_unloading();
synchronize_rcu();
rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
+#if IS_ENABLED(CONFIG_IPV6)
+ rds_info_deregister_func(RDS6_INFO_IB_CONNECTIONS, rds6_ib_ic_info);
+#endif
rds_ib_unregister_client();
rds_ib_destroy_nodev_conns();
rds_ib_sysctl_exit();
@@ -462,6 +567,9 @@ int rds_ib_init(void)
rds_trans_register(&rds_ib_transport);
rds_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
+#if IS_ENABLED(CONFIG_IPV6)
+ rds_info_register_func(RDS6_INFO_IB_CONNECTIONS, rds6_ib_ic_info);
+#endif
goto out;
@@ -476,4 +584,3 @@ out:
}
MODULE_LICENSE("GPL");
-
diff --git a/net/rds/ib.h b/net/rds/ib.h
index a6f4d7d68e95..71ff356ee702 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -57,16 +57,44 @@ struct rds_ib_refill_cache {
struct list_head *ready;
};
+/* This is the common structure for the IB private data exchange in setting up
+ * an RDS connection. The exchange is different for IPv4 and IPv6 connections.
+ * The reason is that the address size is different and the addresses
+ * exchanged are in the beginning of the structure. Hence it is not possible
+ * for interoperability if same structure is used.
+ */
+struct rds_ib_conn_priv_cmn {
+ u8 ricpc_protocol_major;
+ u8 ricpc_protocol_minor;
+ __be16 ricpc_protocol_minor_mask; /* bitmask */
+ __be32 ricpc_reserved1;
+ __be64 ricpc_ack_seq;
+ __be32 ricpc_credit; /* non-zero enables flow ctl */
+};
+
struct rds_ib_connect_private {
/* Add new fields at the end, and don't permute existing fields. */
- __be32 dp_saddr;
- __be32 dp_daddr;
- u8 dp_protocol_major;
- u8 dp_protocol_minor;
- __be16 dp_protocol_minor_mask; /* bitmask */
- __be32 dp_reserved1;
- __be64 dp_ack_seq;
- __be32 dp_credit; /* non-zero enables flow ctl */
+ __be32 dp_saddr;
+ __be32 dp_daddr;
+ struct rds_ib_conn_priv_cmn dp_cmn;
+};
+
+struct rds6_ib_connect_private {
+ /* Add new fields at the end, and don't permute existing fields. */
+ struct in6_addr dp_saddr;
+ struct in6_addr dp_daddr;
+ struct rds_ib_conn_priv_cmn dp_cmn;
+};
+
+#define dp_protocol_major dp_cmn.ricpc_protocol_major
+#define dp_protocol_minor dp_cmn.ricpc_protocol_minor
+#define dp_protocol_minor_mask dp_cmn.ricpc_protocol_minor_mask
+#define dp_ack_seq dp_cmn.ricpc_ack_seq
+#define dp_credit dp_cmn.ricpc_credit
+
+union rds_ib_conn_priv {
+ struct rds_ib_connect_private ricp_v4;
+ struct rds6_ib_connect_private ricp_v6;
};
struct rds_ib_send_work {
@@ -351,8 +379,8 @@ void rds_ib_listen_stop(void);
__printf(2, 3)
void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...);
int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
- struct rdma_cm_event *event);
-int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id);
+ struct rdma_cm_event *event, bool isv6);
+int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6);
void rds_ib_cm_connect_complete(struct rds_connection *conn,
struct rdma_cm_event *event);
@@ -361,7 +389,8 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn,
__rds_ib_conn_error(conn, KERN_WARNING "RDS/IB: " fmt)
/* ib_rdma.c */
-int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr);
+int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev,
+ struct in6_addr *ipaddr);
void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
void rds_ib_destroy_nodev_conns(void);
@@ -371,7 +400,7 @@ void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc);
int rds_ib_recv_init(void);
void rds_ib_recv_exit(void);
int rds_ib_recv_path(struct rds_conn_path *conn);
-int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic);
+int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic, gfp_t gfp);
void rds_ib_recv_free_caches(struct rds_ib_connection *ic);
void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp);
void rds_ib_inc_free(struct rds_incoming *inc);
@@ -414,7 +443,7 @@ int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted,
int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op);
/* ib_stats.c */
-DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats);
+DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_ib_statistics, rds_ib_stats);
#define rds_ib_stats_inc(member) rds_stats_inc_which(rds_ib_stats, member)
#define rds_ib_stats_add(member, count) \
rds_stats_add_which(rds_ib_stats, member, count)
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index f1684ae6abfd..bfbb31f0c7fd 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006 Oracle. All rights reserved.
+ * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -35,6 +35,7 @@
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/ratelimit.h>
+#include <net/addrconf.h>
#include "rds_single_path.h"
#include "rds.h"
@@ -95,25 +96,45 @@ rds_ib_tune_rnr(struct rds_ib_connection *ic, struct ib_qp_attr *attr)
*/
void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event)
{
- const struct rds_ib_connect_private *dp = NULL;
struct rds_ib_connection *ic = conn->c_transport_data;
+ const union rds_ib_conn_priv *dp = NULL;
struct ib_qp_attr qp_attr;
+ __be64 ack_seq = 0;
+ __be32 credit = 0;
+ u8 major = 0;
+ u8 minor = 0;
int err;
- if (event->param.conn.private_data_len >= sizeof(*dp)) {
- dp = event->param.conn.private_data;
-
- /* make sure it isn't empty data */
- if (dp->dp_protocol_major) {
- rds_ib_set_protocol(conn,
- RDS_PROTOCOL(dp->dp_protocol_major,
- dp->dp_protocol_minor));
- rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
+ dp = event->param.conn.private_data;
+ if (conn->c_isv6) {
+ if (event->param.conn.private_data_len >=
+ sizeof(struct rds6_ib_connect_private)) {
+ major = dp->ricp_v6.dp_protocol_major;
+ minor = dp->ricp_v6.dp_protocol_minor;
+ credit = dp->ricp_v6.dp_credit;
+ /* dp structure start is not guaranteed to be 8 bytes
+ * aligned. Since dp_ack_seq is 64-bit extended load
+ * operations can be used so go through get_unaligned
+ * to avoid unaligned errors.
+ */
+ ack_seq = get_unaligned(&dp->ricp_v6.dp_ack_seq);
}
+ } else if (event->param.conn.private_data_len >=
+ sizeof(struct rds_ib_connect_private)) {
+ major = dp->ricp_v4.dp_protocol_major;
+ minor = dp->ricp_v4.dp_protocol_minor;
+ credit = dp->ricp_v4.dp_credit;
+ ack_seq = get_unaligned(&dp->ricp_v4.dp_ack_seq);
+ }
+
+ /* make sure it isn't empty data */
+ if (major) {
+ rds_ib_set_protocol(conn, RDS_PROTOCOL(major, minor));
+ rds_ib_set_flow_control(conn, be32_to_cpu(credit));
}
if (conn->c_version < RDS_PROTOCOL(3, 1)) {
- pr_notice("RDS/IB: Connection <%pI4,%pI4> version %u.%u no longer supported\n",
+ pr_notice("RDS/IB: Connection <%pI6c,%pI6c> version %u.%u no longer supported\n",
&conn->c_laddr, &conn->c_faddr,
RDS_PROTOCOL_MAJOR(conn->c_version),
RDS_PROTOCOL_MINOR(conn->c_version));
@@ -121,7 +142,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
rds_conn_destroy(conn);
return;
} else {
- pr_notice("RDS/IB: %s conn connected <%pI4,%pI4> version %u.%u%s\n",
+ pr_notice("RDS/IB: %s conn connected <%pI6c,%pI6c> version %u.%u%s\n",
ic->i_active_side ? "Active" : "Passive",
&conn->c_laddr, &conn->c_faddr,
RDS_PROTOCOL_MAJOR(conn->c_version),
@@ -150,7 +171,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err);
/* update ib_device with this local ipaddr */
- err = rds_ib_update_ipaddr(ic->rds_ibdev, conn->c_laddr);
+ err = rds_ib_update_ipaddr(ic->rds_ibdev, &conn->c_laddr);
if (err)
printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n",
err);
@@ -158,14 +179,8 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
/* If the peer gave us the last packet it saw, process this as if
* we had received a regular ACK. */
if (dp) {
- /* dp structure start is not guaranteed to be 8 bytes aligned.
- * Since dp_ack_seq is 64-bit extended load operations can be
- * used so go through get_unaligned to avoid unaligned errors.
- */
- __be64 dp_ack_seq = get_unaligned(&dp->dp_ack_seq);
-
- if (dp_ack_seq)
- rds_send_drop_acked(conn, be64_to_cpu(dp_ack_seq),
+ if (ack_seq)
+ rds_send_drop_acked(conn, be64_to_cpu(ack_seq),
NULL);
}
@@ -173,11 +188,12 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
}
static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
- struct rdma_conn_param *conn_param,
- struct rds_ib_connect_private *dp,
- u32 protocol_version,
- u32 max_responder_resources,
- u32 max_initiator_depth)
+ struct rdma_conn_param *conn_param,
+ union rds_ib_conn_priv *dp,
+ u32 protocol_version,
+ u32 max_responder_resources,
+ u32 max_initiator_depth,
+ bool isv6)
{
struct rds_ib_connection *ic = conn->c_transport_data;
struct rds_ib_device *rds_ibdev = ic->rds_ibdev;
@@ -193,24 +209,49 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
if (dp) {
memset(dp, 0, sizeof(*dp));
- dp->dp_saddr = conn->c_laddr;
- dp->dp_daddr = conn->c_faddr;
- dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version);
- dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version);
- dp->dp_protocol_minor_mask = cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS);
- dp->dp_ack_seq = cpu_to_be64(rds_ib_piggyb_ack(ic));
+ if (isv6) {
+ dp->ricp_v6.dp_saddr = conn->c_laddr;
+ dp->ricp_v6.dp_daddr = conn->c_faddr;
+ dp->ricp_v6.dp_protocol_major =
+ RDS_PROTOCOL_MAJOR(protocol_version);
+ dp->ricp_v6.dp_protocol_minor =
+ RDS_PROTOCOL_MINOR(protocol_version);
+ dp->ricp_v6.dp_protocol_minor_mask =
+ cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS);
+ dp->ricp_v6.dp_ack_seq =
+ cpu_to_be64(rds_ib_piggyb_ack(ic));
+
+ conn_param->private_data = &dp->ricp_v6;
+ conn_param->private_data_len = sizeof(dp->ricp_v6);
+ } else {
+ dp->ricp_v4.dp_saddr = conn->c_laddr.s6_addr32[3];
+ dp->ricp_v4.dp_daddr = conn->c_faddr.s6_addr32[3];
+ dp->ricp_v4.dp_protocol_major =
+ RDS_PROTOCOL_MAJOR(protocol_version);
+ dp->ricp_v4.dp_protocol_minor =
+ RDS_PROTOCOL_MINOR(protocol_version);
+ dp->ricp_v4.dp_protocol_minor_mask =
+ cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS);
+ dp->ricp_v4.dp_ack_seq =
+ cpu_to_be64(rds_ib_piggyb_ack(ic));
+
+ conn_param->private_data = &dp->ricp_v4;
+ conn_param->private_data_len = sizeof(dp->ricp_v4);
+ }
/* Advertise flow control */
if (ic->i_flowctl) {
unsigned int credits;
- credits = IB_GET_POST_CREDITS(atomic_read(&ic->i_credits));
- dp->dp_credit = cpu_to_be32(credits);
- atomic_sub(IB_SET_POST_CREDITS(credits), &ic->i_credits);
+ credits = IB_GET_POST_CREDITS
+ (atomic_read(&ic->i_credits));
+ if (isv6)
+ dp->ricp_v6.dp_credit = cpu_to_be32(credits);
+ else
+ dp->ricp_v4.dp_credit = cpu_to_be32(credits);
+ atomic_sub(IB_SET_POST_CREDITS(credits),
+ &ic->i_credits);
}
-
- conn_param->private_data = dp;
- conn_param->private_data_len = sizeof(*dp);
}
}
@@ -349,7 +390,7 @@ static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
break;
default:
rdsdebug("Fatal QP Event %u (%s) "
- "- connection %pI4->%pI4, reconnecting\n",
+ "- connection %pI6c->%pI6c, reconnecting\n",
event->event, ib_event_msg(event->event),
&conn->c_laddr, &conn->c_faddr);
rds_conn_drop(conn);
@@ -580,11 +621,13 @@ out:
return ret;
}
-static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event)
+static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event, bool isv6)
{
- const struct rds_ib_connect_private *dp = event->param.conn.private_data;
- u16 common;
+ const union rds_ib_conn_priv *dp = event->param.conn.private_data;
+ u8 data_len, major, minor;
u32 version = 0;
+ __be16 mask;
+ u16 common;
/*
* rdma_cm private data is odd - when there is any private data in the
@@ -603,51 +646,140 @@ static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event)
return 0;
}
+ if (isv6) {
+ data_len = sizeof(struct rds6_ib_connect_private);
+ major = dp->ricp_v6.dp_protocol_major;
+ minor = dp->ricp_v6.dp_protocol_minor;
+ mask = dp->ricp_v6.dp_protocol_minor_mask;
+ } else {
+ data_len = sizeof(struct rds_ib_connect_private);
+ major = dp->ricp_v4.dp_protocol_major;
+ minor = dp->ricp_v4.dp_protocol_minor;
+ mask = dp->ricp_v4.dp_protocol_minor_mask;
+ }
+
/* Even if len is crap *now* I still want to check it. -ASG */
- if (event->param.conn.private_data_len < sizeof (*dp) ||
- dp->dp_protocol_major == 0)
+ if (event->param.conn.private_data_len < data_len || major == 0)
return RDS_PROTOCOL_3_0;
- common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IB_SUPPORTED_PROTOCOLS;
- if (dp->dp_protocol_major == 3 && common) {
+ common = be16_to_cpu(mask) & RDS_IB_SUPPORTED_PROTOCOLS;
+ if (major == 3 && common) {
version = RDS_PROTOCOL_3_0;
while ((common >>= 1) != 0)
version++;
- } else
- printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI4 using incompatible protocol version %u.%u\n",
- &dp->dp_saddr,
- dp->dp_protocol_major,
- dp->dp_protocol_minor);
+ } else {
+ if (isv6)
+ printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI6c using incompatible protocol version %u.%u\n",
+ &dp->ricp_v6.dp_saddr, major, minor);
+ else
+ printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI4 using incompatible protocol version %u.%u\n",
+ &dp->ricp_v4.dp_saddr, major, minor);
+ }
return version;
}
+#if IS_ENABLED(CONFIG_IPV6)
+/* Given an IPv6 address, find the net_device which hosts that address and
+ * return its index. This is used by the rds_ib_cm_handle_connect() code to
+ * find the interface index of where an incoming request comes from when
+ * the request is using a link local address.
+ *
+ * Note one problem in this search. It is possible that two interfaces have
+ * the same link local address. Unfortunately, this cannot be solved unless
+ * the underlying layer gives us the interface which an incoming RDMA connect
+ * request comes from.
+ */
+static u32 __rds_find_ifindex(struct net *net, const struct in6_addr *addr)
+{
+ struct net_device *dev;
+ int idx = 0;
+
+ rcu_read_lock();
+ for_each_netdev_rcu(net, dev) {
+ if (ipv6_chk_addr(net, addr, dev, 1)) {
+ idx = dev->ifindex;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return idx;
+}
+#endif
+
int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
- struct rdma_cm_event *event)
+ struct rdma_cm_event *event, bool isv6)
{
__be64 lguid = cm_id->route.path_rec->sgid.global.interface_id;
__be64 fguid = cm_id->route.path_rec->dgid.global.interface_id;
- const struct rds_ib_connect_private *dp = event->param.conn.private_data;
- struct rds_ib_connect_private dp_rep;
+ const struct rds_ib_conn_priv_cmn *dp_cmn;
struct rds_connection *conn = NULL;
struct rds_ib_connection *ic = NULL;
struct rdma_conn_param conn_param;
+ const union rds_ib_conn_priv *dp;
+ union rds_ib_conn_priv dp_rep;
+ struct in6_addr s_mapped_addr;
+ struct in6_addr d_mapped_addr;
+ const struct in6_addr *saddr6;
+ const struct in6_addr *daddr6;
+ int destroy = 1;
+ u32 ifindex = 0;
u32 version;
- int err = 1, destroy = 1;
+ int err = 1;
/* Check whether the remote protocol version matches ours. */
- version = rds_ib_protocol_compatible(event);
+ version = rds_ib_protocol_compatible(event, isv6);
if (!version)
goto out;
- rdsdebug("saddr %pI4 daddr %pI4 RDSv%u.%u lguid 0x%llx fguid "
- "0x%llx\n", &dp->dp_saddr, &dp->dp_daddr,
+ dp = event->param.conn.private_data;
+ if (isv6) {
+#if IS_ENABLED(CONFIG_IPV6)
+ dp_cmn = &dp->ricp_v6.dp_cmn;
+ saddr6 = &dp->ricp_v6.dp_saddr;
+ daddr6 = &dp->ricp_v6.dp_daddr;
+ /* If either address is link local, need to find the
+ * interface index in order to create a proper RDS
+ * connection.
+ */
+ if (ipv6_addr_type(daddr6) & IPV6_ADDR_LINKLOCAL) {
+ /* Using init_net for now .. */
+ ifindex = __rds_find_ifindex(&init_net, daddr6);
+ /* No index found... Need to bail out. */
+ if (ifindex == 0) {
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+ } else if (ipv6_addr_type(saddr6) & IPV6_ADDR_LINKLOCAL) {
+ /* Use our address to find the correct index. */
+ ifindex = __rds_find_ifindex(&init_net, daddr6);
+ /* No index found... Need to bail out. */
+ if (ifindex == 0) {
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+ }
+#else
+ err = -EOPNOTSUPP;
+ goto out;
+#endif
+ } else {
+ dp_cmn = &dp->ricp_v4.dp_cmn;
+ ipv6_addr_set_v4mapped(dp->ricp_v4.dp_saddr, &s_mapped_addr);
+ ipv6_addr_set_v4mapped(dp->ricp_v4.dp_daddr, &d_mapped_addr);
+ saddr6 = &s_mapped_addr;
+ daddr6 = &d_mapped_addr;
+ }
+
+ rdsdebug("saddr %pI6c daddr %pI6c RDSv%u.%u lguid 0x%llx fguid "
+ "0x%llx\n", saddr6, daddr6,
RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version),
(unsigned long long)be64_to_cpu(lguid),
(unsigned long long)be64_to_cpu(fguid));
/* RDS/IB is not currently netns aware, thus init_net */
- conn = rds_conn_create(&init_net, dp->dp_daddr, dp->dp_saddr,
- &rds_ib_transport, GFP_KERNEL);
+ conn = rds_conn_create(&init_net, daddr6, saddr6,
+ &rds_ib_transport, GFP_KERNEL, ifindex);
if (IS_ERR(conn)) {
rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn));
conn = NULL;
@@ -678,12 +810,13 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
ic = conn->c_transport_data;
rds_ib_set_protocol(conn, version);
- rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
+ rds_ib_set_flow_control(conn, be32_to_cpu(dp_cmn->ricpc_credit));
/* If the peer gave us the last packet it saw, process this as if
* we had received a regular ACK. */
- if (dp->dp_ack_seq)
- rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
+ if (dp_cmn->ricpc_ack_seq)
+ rds_send_drop_acked(conn, be64_to_cpu(dp_cmn->ricpc_ack_seq),
+ NULL);
BUG_ON(cm_id->context);
BUG_ON(ic->i_cm_id);
@@ -702,8 +835,8 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
}
rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version,
- event->param.conn.responder_resources,
- event->param.conn.initiator_depth);
+ event->param.conn.responder_resources,
+ event->param.conn.initiator_depth, isv6);
/* rdma_accept() calls rdma_reject() internally if it fails */
if (rdma_accept(cm_id, &conn_param))
@@ -718,12 +851,12 @@ out:
}
-int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id)
+int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6)
{
struct rds_connection *conn = cm_id->context;
struct rds_ib_connection *ic = conn->c_transport_data;
struct rdma_conn_param conn_param;
- struct rds_ib_connect_private dp;
+ union rds_ib_conn_priv dp;
int ret;
/* If the peer doesn't do protocol negotiation, we must
@@ -738,7 +871,7 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id)
}
rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION,
- UINT_MAX, UINT_MAX);
+ UINT_MAX, UINT_MAX, isv6);
ret = rdma_connect(cm_id, &conn_param);
if (ret)
rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret);
@@ -758,13 +891,22 @@ out:
int rds_ib_conn_path_connect(struct rds_conn_path *cp)
{
struct rds_connection *conn = cp->cp_conn;
- struct rds_ib_connection *ic = conn->c_transport_data;
- struct sockaddr_in src, dest;
+ struct sockaddr_storage src, dest;
+ rdma_cm_event_handler handler;
+ struct rds_ib_connection *ic;
int ret;
+ ic = conn->c_transport_data;
+
/* XXX I wonder what affect the port space has */
/* delegate cm event handler to rdma_transport */
- ic->i_cm_id = rdma_create_id(&init_net, rds_rdma_cm_event_handler, conn,
+#if IS_ENABLED(CONFIG_IPV6)
+ if (conn->c_isv6)
+ handler = rds6_rdma_cm_event_handler;
+ else
+#endif
+ handler = rds_rdma_cm_event_handler;
+ ic->i_cm_id = rdma_create_id(&init_net, handler, conn,
RDMA_PS_TCP, IB_QPT_RC);
if (IS_ERR(ic->i_cm_id)) {
ret = PTR_ERR(ic->i_cm_id);
@@ -775,13 +917,33 @@ int rds_ib_conn_path_connect(struct rds_conn_path *cp)
rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn);
- src.sin_family = AF_INET;
- src.sin_addr.s_addr = (__force u32)conn->c_laddr;
- src.sin_port = (__force u16)htons(0);
+ if (ipv6_addr_v4mapped(&conn->c_faddr)) {
+ struct sockaddr_in *sin;
+
+ sin = (struct sockaddr_in *)&src;
+ sin->sin_family = AF_INET;
+ sin->sin_addr.s_addr = conn->c_laddr.s6_addr32[3];
+ sin->sin_port = 0;
- dest.sin_family = AF_INET;
- dest.sin_addr.s_addr = (__force u32)conn->c_faddr;
- dest.sin_port = (__force u16)htons(RDS_PORT);
+ sin = (struct sockaddr_in *)&dest;
+ sin->sin_family = AF_INET;
+ sin->sin_addr.s_addr = conn->c_faddr.s6_addr32[3];
+ sin->sin_port = htons(RDS_PORT);
+ } else {
+ struct sockaddr_in6 *sin6;
+
+ sin6 = (struct sockaddr_in6 *)&src;
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_addr = conn->c_laddr;
+ sin6->sin6_port = 0;
+ sin6->sin6_scope_id = conn->c_dev_if;
+
+ sin6 = (struct sockaddr_in6 *)&dest;
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_addr = conn->c_faddr;
+ sin6->sin6_port = htons(RDS_CM_PORT);
+ sin6->sin6_scope_id = conn->c_dev_if;
+ }
ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src,
(struct sockaddr *)&dest,
@@ -949,7 +1111,7 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
if (!ic)
return -ENOMEM;
- ret = rds_ib_recv_alloc_caches(ic);
+ ret = rds_ib_recv_alloc_caches(ic, gfp);
if (ret) {
kfree(ic);
return ret;
diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c
index d152e48ea371..6431a023ac89 100644
--- a/net/rds/ib_frmr.c
+++ b/net/rds/ib_frmr.c
@@ -61,6 +61,7 @@ static struct rds_ib_mr *rds_ib_alloc_frmr(struct rds_ib_device *rds_ibdev,
pool->fmr_attr.max_pages);
if (IS_ERR(frmr->mr)) {
pr_warn("RDS/IB: %s failed to allocate MR", __func__);
+ err = PTR_ERR(frmr->mr);
goto out_no_cigar;
}
@@ -102,7 +103,6 @@ static void rds_ib_free_frmr(struct rds_ib_mr *ibmr, bool drop)
static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr)
{
struct rds_ib_frmr *frmr = &ibmr->u.frmr;
- struct ib_send_wr *failed_wr;
struct ib_reg_wr reg_wr;
int ret, off = 0;
@@ -135,9 +135,7 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr)
IB_ACCESS_REMOTE_WRITE;
reg_wr.wr.send_flags = IB_SEND_SIGNALED;
- failed_wr = &reg_wr.wr;
- ret = ib_post_send(ibmr->ic->i_cm_id->qp, &reg_wr.wr, &failed_wr);
- WARN_ON(failed_wr != &reg_wr.wr);
+ ret = ib_post_send(ibmr->ic->i_cm_id->qp, &reg_wr.wr, NULL);
if (unlikely(ret)) {
/* Failure here can be because of -ENOMEM as well */
frmr->fr_state = FRMR_IS_STALE;
@@ -230,7 +228,7 @@ out_unmap:
static int rds_ib_post_inv(struct rds_ib_mr *ibmr)
{
- struct ib_send_wr *s_wr, *failed_wr;
+ struct ib_send_wr *s_wr;
struct rds_ib_frmr *frmr = &ibmr->u.frmr;
struct rdma_cm_id *i_cm_id = ibmr->ic->i_cm_id;
int ret = -EINVAL;
@@ -255,9 +253,7 @@ static int rds_ib_post_inv(struct rds_ib_mr *ibmr)
s_wr->ex.invalidate_rkey = frmr->mr->rkey;
s_wr->send_flags = IB_SEND_SIGNALED;
- failed_wr = s_wr;
- ret = ib_post_send(i_cm_id->qp, s_wr, &failed_wr);
- WARN_ON(failed_wr != s_wr);
+ ret = ib_post_send(i_cm_id->qp, s_wr, NULL);
if (unlikely(ret)) {
frmr->fr_state = FRMR_IS_STALE;
frmr->fr_inv = false;
diff --git a/net/rds/ib_mr.h b/net/rds/ib_mr.h
index 655f01d427fe..5da12c248431 100644
--- a/net/rds/ib_mr.h
+++ b/net/rds/ib_mr.h
@@ -113,6 +113,8 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_dev,
int npages);
void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev,
struct rds_info_rdma_connection *iinfo);
+void rds6_ib_get_mr_info(struct rds_ib_device *rds_ibdev,
+ struct rds6_info_rdma_connection *iinfo6);
void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *);
void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
struct rds_sock *rs, u32 *key_ret,
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index 2e49a40a5e11..63c8d107adcf 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006 Oracle. All rights reserved.
+ * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -100,18 +100,19 @@ static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
kfree_rcu(to_free, rcu);
}
-int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
+int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev,
+ struct in6_addr *ipaddr)
{
struct rds_ib_device *rds_ibdev_old;
- rds_ibdev_old = rds_ib_get_device(ipaddr);
+ rds_ibdev_old = rds_ib_get_device(ipaddr->s6_addr32[3]);
if (!rds_ibdev_old)
- return rds_ib_add_ipaddr(rds_ibdev, ipaddr);
+ return rds_ib_add_ipaddr(rds_ibdev, ipaddr->s6_addr32[3]);
if (rds_ibdev_old != rds_ibdev) {
- rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr);
+ rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr->s6_addr32[3]);
rds_ib_dev_put(rds_ibdev_old);
- return rds_ib_add_ipaddr(rds_ibdev, ipaddr);
+ return rds_ib_add_ipaddr(rds_ibdev, ipaddr->s6_addr32[3]);
}
rds_ib_dev_put(rds_ibdev_old);
@@ -179,6 +180,17 @@ void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_co
iinfo->rdma_mr_size = pool_1m->fmr_attr.max_pages;
}
+#if IS_ENABLED(CONFIG_IPV6)
+void rds6_ib_get_mr_info(struct rds_ib_device *rds_ibdev,
+ struct rds6_info_rdma_connection *iinfo6)
+{
+ struct rds_ib_mr_pool *pool_1m = rds_ibdev->mr_1m_pool;
+
+ iinfo6->rdma_mr_max = pool_1m->max_items;
+ iinfo6->rdma_mr_size = pool_1m->fmr_attr.max_pages;
+}
+#endif
+
struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *pool)
{
struct rds_ib_mr *ibmr = NULL;
@@ -545,7 +557,7 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
struct rds_ib_connection *ic = NULL;
int ret;
- rds_ibdev = rds_ib_get_device(rs->rs_bound_addr);
+ rds_ibdev = rds_ib_get_device(rs->rs_bound_addr.s6_addr32[3]);
if (!rds_ibdev) {
ret = -ENODEV;
goto out;
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index b4e421aa9727..2f16146e4ec9 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006 Oracle. All rights reserved.
+ * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -98,12 +98,12 @@ static void rds_ib_cache_xfer_to_ready(struct rds_ib_refill_cache *cache)
}
}
-static int rds_ib_recv_alloc_cache(struct rds_ib_refill_cache *cache)
+static int rds_ib_recv_alloc_cache(struct rds_ib_refill_cache *cache, gfp_t gfp)
{
struct rds_ib_cache_head *head;
int cpu;
- cache->percpu = alloc_percpu(struct rds_ib_cache_head);
+ cache->percpu = alloc_percpu_gfp(struct rds_ib_cache_head, gfp);
if (!cache->percpu)
return -ENOMEM;
@@ -118,13 +118,13 @@ static int rds_ib_recv_alloc_cache(struct rds_ib_refill_cache *cache)
return 0;
}
-int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic)
+int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic, gfp_t gfp)
{
int ret;
- ret = rds_ib_recv_alloc_cache(&ic->i_cache_incs);
+ ret = rds_ib_recv_alloc_cache(&ic->i_cache_incs, gfp);
if (!ret) {
- ret = rds_ib_recv_alloc_cache(&ic->i_cache_frags);
+ ret = rds_ib_recv_alloc_cache(&ic->i_cache_frags, gfp);
if (ret)
free_percpu(ic->i_cache_incs.percpu);
}
@@ -266,7 +266,7 @@ static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *i
rds_ib_stats_inc(s_ib_rx_total_incs);
}
INIT_LIST_HEAD(&ibinc->ii_frags);
- rds_inc_init(&ibinc->ii_inc, ic->conn, ic->conn->c_faddr);
+ rds_inc_init(&ibinc->ii_inc, ic->conn, &ic->conn->c_faddr);
return ibinc;
}
@@ -376,14 +376,11 @@ static void release_refill(struct rds_connection *conn)
* This tries to allocate and post unused work requests after making sure that
* they have all the allocations they need to queue received fragments into
* sockets.
- *
- * -1 is returned if posting fails due to temporary resource exhaustion.
*/
void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp)
{
struct rds_ib_connection *ic = conn->c_transport_data;
struct rds_ib_recv_work *recv;
- struct ib_recv_wr *failed_wr;
unsigned int posted = 0;
int ret = 0;
bool can_wait = !!(gfp & __GFP_DIRECT_RECLAIM);
@@ -417,10 +414,10 @@ void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp)
&recv->r_frag->f_sg));
/* XXX when can this fail? */
- ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr);
+ ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, NULL);
if (ret) {
rds_ib_conn_error(conn, "recv post on "
- "%pI4 returned %d, disconnecting and "
+ "%pI6c returned %d, disconnecting and "
"reconnecting\n", &conn->c_faddr,
ret);
break;
@@ -650,7 +647,6 @@ static u64 rds_ib_get_ack(struct rds_ib_connection *ic)
static void rds_ib_send_ack(struct rds_ib_connection *ic, unsigned int adv_credits)
{
struct rds_header *hdr = ic->i_ack;
- struct ib_send_wr *failed_wr;
u64 seq;
int ret;
@@ -663,7 +659,7 @@ static void rds_ib_send_ack(struct rds_ib_connection *ic, unsigned int adv_credi
rds_message_make_checksum(hdr);
ic->i_ack_queued = jiffies;
- ret = ib_post_send(ic->i_cm_id->qp, &ic->i_ack_wr, &failed_wr);
+ ret = ib_post_send(ic->i_cm_id->qp, &ic->i_ack_wr, NULL);
if (unlikely(ret)) {
/* Failed to send. Release the WR, and
* force another ACK.
@@ -850,7 +846,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
if (data_len < sizeof(struct rds_header)) {
rds_ib_conn_error(conn, "incoming message "
- "from %pI4 didn't include a "
+ "from %pI6c didn't include a "
"header, disconnecting and "
"reconnecting\n",
&conn->c_faddr);
@@ -863,7 +859,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
/* Validate the checksum. */
if (!rds_message_verify_checksum(ihdr)) {
rds_ib_conn_error(conn, "incoming message "
- "from %pI4 has corrupted header - "
+ "from %pI6c has corrupted header - "
"forcing a reconnect\n",
&conn->c_faddr);
rds_stats_inc(s_recv_drop_bad_checksum);
@@ -943,10 +939,10 @@ static void rds_ib_process_recv(struct rds_connection *conn,
ic->i_recv_data_rem = 0;
ic->i_ibinc = NULL;
- if (ibinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP)
+ if (ibinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP) {
rds_ib_cong_recv(conn, ibinc);
- else {
- rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr,
+ } else {
+ rds_recv_incoming(conn, &conn->c_faddr, &conn->c_laddr,
&ibinc->ii_inc, GFP_ATOMIC);
state->ack_next = be64_to_cpu(hdr->h_sequence);
state->ack_next_valid = 1;
@@ -990,7 +986,7 @@ void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic,
} else {
/* We expect errors as the qp is drained during shutdown */
if (rds_conn_up(conn) || rds_conn_connecting(conn))
- rds_ib_conn_error(conn, "recv completion on <%pI4,%pI4> had status %u (%s), disconnecting and reconnecting\n",
+ rds_ib_conn_error(conn, "recv completion on <%pI6c,%pI6c> had status %u (%s), disconnecting and reconnecting\n",
&conn->c_laddr, &conn->c_faddr,
wc->status,
ib_wc_status_msg(wc->status));
@@ -1025,7 +1021,6 @@ int rds_ib_recv_path(struct rds_conn_path *cp)
{
struct rds_connection *conn = cp->cp_conn;
struct rds_ib_connection *ic = conn->c_transport_data;
- int ret = 0;
rdsdebug("conn %p\n", conn);
if (rds_conn_up(conn)) {
@@ -1034,7 +1029,7 @@ int rds_ib_recv_path(struct rds_conn_path *cp)
rds_ib_stats_inc(s_ib_rx_refill_from_thread);
}
- return ret;
+ return 0;
}
int rds_ib_recv_init(void)
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index 8557a1cae041..2dcb555e6350 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006 Oracle. All rights reserved.
+ * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -305,7 +305,7 @@ void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
/* We expect errors as the qp is drained during shutdown */
if (wc->status != IB_WC_SUCCESS && rds_conn_up(conn)) {
- rds_ib_conn_error(conn, "send completion on <%pI4,%pI4> had status %u (%s), disconnecting and reconnecting\n",
+ rds_ib_conn_error(conn, "send completion on <%pI6c,%pI6c> had status %u (%s), disconnecting and reconnecting\n",
&conn->c_laddr, &conn->c_faddr, wc->status,
ib_wc_status_msg(wc->status));
}
@@ -492,7 +492,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
struct rds_ib_send_work *send = NULL;
struct rds_ib_send_work *first;
struct rds_ib_send_work *prev;
- struct ib_send_wr *failed_wr;
+ const struct ib_send_wr *failed_wr;
struct scatterlist *scat;
u32 pos;
u32 i;
@@ -730,7 +730,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
first, &first->s_wr, ret, failed_wr);
BUG_ON(failed_wr != &first->s_wr);
if (ret) {
- printk(KERN_WARNING "RDS/IB: ib_post_send to %pI4 "
+ printk(KERN_WARNING "RDS/IB: ib_post_send to %pI6c "
"returned %d\n", &conn->c_faddr, ret);
rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
rds_ib_sub_signaled(ic, nr_sig);
@@ -758,15 +758,12 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
{
struct rds_ib_connection *ic = conn->c_transport_data;
struct rds_ib_send_work *send = NULL;
- struct ib_send_wr *failed_wr;
- struct rds_ib_device *rds_ibdev;
+ const struct ib_send_wr *failed_wr;
u32 pos;
u32 work_alloc;
int ret;
int nr_sig = 0;
- rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
-
work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, 1, &pos);
if (work_alloc != 1) {
rds_ib_stats_inc(s_ib_tx_ring_full);
@@ -827,7 +824,7 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
send, &send->s_atomic_wr, ret, failed_wr);
BUG_ON(failed_wr != &send->s_atomic_wr.wr);
if (ret) {
- printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %pI4 "
+ printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %pI6c "
"returned %d\n", &conn->c_faddr, ret);
rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
rds_ib_sub_signaled(ic, nr_sig);
@@ -849,7 +846,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
struct rds_ib_send_work *send = NULL;
struct rds_ib_send_work *first;
struct rds_ib_send_work *prev;
- struct ib_send_wr *failed_wr;
+ const struct ib_send_wr *failed_wr;
struct scatterlist *scat;
unsigned long len;
u64 remote_addr = op->op_remote_addr;
@@ -967,7 +964,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
first, &first->s_rdma_wr.wr, ret, failed_wr);
BUG_ON(failed_wr != &first->s_rdma_wr.wr);
if (ret) {
- printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI4 "
+ printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI6c "
"returned %d\n", &conn->c_faddr, ret);
rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
rds_ib_sub_signaled(ic, nr_sig);
diff --git a/net/rds/loop.c b/net/rds/loop.c
index feea1f96ee2a..1d73ad79c847 100644
--- a/net/rds/loop.c
+++ b/net/rds/loop.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006 Oracle. All rights reserved.
+ * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -35,6 +35,7 @@
#include <linux/in.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
+#include <linux/ipv6.h>
#include "rds_single_path.h"
#include "rds.h"
@@ -88,11 +89,11 @@ static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm,
BUG_ON(hdr_off || sg || off);
- rds_inc_init(&rm->m_inc, conn, conn->c_laddr);
+ rds_inc_init(&rm->m_inc, conn, &conn->c_laddr);
/* For the embedded inc. Matching put is in loop_inc_free() */
rds_message_addref(rm);
- rds_recv_incoming(conn, conn->c_laddr, conn->c_faddr, &rm->m_inc,
+ rds_recv_incoming(conn, &conn->c_laddr, &conn->c_faddr, &rm->m_inc,
GFP_KERNEL);
rds_send_drop_acked(conn, be64_to_cpu(rm->m_inc.i_hdr.h_sequence),
diff --git a/net/rds/message.c b/net/rds/message.c
index a35f76971984..4b00b1152a5f 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -514,4 +514,3 @@ void rds_message_unmapped(struct rds_message *rm)
wake_up_interruptible(&rm->m_flush_wait);
}
EXPORT_SYMBOL_GPL(rds_message_unmapped);
-
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index 80920e47f2c7..98237feb607a 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2007 Oracle. All rights reserved.
+ * Copyright (c) 2007, 2017 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -184,7 +184,7 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
long i;
int ret;
- if (rs->rs_bound_addr == 0 || !rs->rs_transport) {
+ if (ipv6_addr_any(&rs->rs_bound_addr) || !rs->rs_transport) {
ret = -ENOTCONN; /* XXX not a great errno */
goto out;
}
@@ -576,7 +576,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
args = CMSG_DATA(cmsg);
- if (rs->rs_bound_addr == 0) {
+ if (ipv6_addr_any(&rs->rs_bound_addr)) {
ret = -ENOTCONN; /* XXX not a great errno */
goto out_ret;
}
diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c
index fc59821f0a27..6b0f57c83a2a 100644
--- a/net/rds/rdma_transport.c
+++ b/net/rds/rdma_transport.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2009 Oracle. All rights reserved.
+ * Copyright (c) 2009, 2018 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -37,10 +37,15 @@
#include "rdma_transport.h"
#include "ib.h"
+/* Global IPv4 and IPv6 RDS RDMA listener cm_id */
static struct rdma_cm_id *rds_rdma_listen_id;
+#if IS_ENABLED(CONFIG_IPV6)
+static struct rdma_cm_id *rds6_rdma_listen_id;
+#endif
-int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
- struct rdma_cm_event *event)
+static int rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id,
+ struct rdma_cm_event *event,
+ bool isv6)
{
/* this can be null in the listening path */
struct rds_connection *conn = cm_id->context;
@@ -72,7 +77,7 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
switch (event->event) {
case RDMA_CM_EVENT_CONNECT_REQUEST:
- ret = trans->cm_handle_connect(cm_id, event);
+ ret = trans->cm_handle_connect(cm_id, event, isv6);
break;
case RDMA_CM_EVENT_ADDR_RESOLVED:
@@ -90,7 +95,7 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
ibic = conn->c_transport_data;
if (ibic && ibic->i_cm_id == cm_id)
- ret = trans->cm_initiate_connect(cm_id);
+ ret = trans->cm_initiate_connect(cm_id, isv6);
else
rds_conn_drop(conn);
}
@@ -116,14 +121,14 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
case RDMA_CM_EVENT_DISCONNECTED:
rdsdebug("DISCONNECT event - dropping connection "
- "%pI4->%pI4\n", &conn->c_laddr,
+ "%pI6c->%pI6c\n", &conn->c_laddr,
&conn->c_faddr);
rds_conn_drop(conn);
break;
case RDMA_CM_EVENT_TIMEWAIT_EXIT:
if (conn) {
- pr_info("RDS: RDMA_CM_EVENT_TIMEWAIT_EXIT event: dropping connection %pI4->%pI4\n",
+ pr_info("RDS: RDMA_CM_EVENT_TIMEWAIT_EXIT event: dropping connection %pI6c->%pI6c\n",
&conn->c_laddr, &conn->c_faddr);
rds_conn_drop(conn);
}
@@ -146,13 +151,28 @@ out:
return ret;
}
-static int rds_rdma_listen_init(void)
+int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
+ struct rdma_cm_event *event)
+{
+ return rds_rdma_cm_event_handler_cmn(cm_id, event, false);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+int rds6_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
+ struct rdma_cm_event *event)
+{
+ return rds_rdma_cm_event_handler_cmn(cm_id, event, true);
+}
+#endif
+
+static int rds_rdma_listen_init_common(rdma_cm_event_handler handler,
+ struct sockaddr *sa,
+ struct rdma_cm_id **ret_cm_id)
{
- struct sockaddr_in sin;
struct rdma_cm_id *cm_id;
int ret;
- cm_id = rdma_create_id(&init_net, rds_rdma_cm_event_handler, NULL,
+ cm_id = rdma_create_id(&init_net, handler, NULL,
RDMA_PS_TCP, IB_QPT_RC);
if (IS_ERR(cm_id)) {
ret = PTR_ERR(cm_id);
@@ -161,15 +181,11 @@ static int rds_rdma_listen_init(void)
return ret;
}
- sin.sin_family = AF_INET;
- sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY);
- sin.sin_port = (__force u16)htons(RDS_PORT);
-
/*
* XXX I bet this binds the cm_id to a device. If we want to support
* fail-over we'll have to take this into consideration.
*/
- ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
+ ret = rdma_bind_addr(cm_id, sa);
if (ret) {
printk(KERN_ERR "RDS/RDMA: failed to setup listener, "
"rdma_bind_addr() returned %d\n", ret);
@@ -185,7 +201,7 @@ static int rds_rdma_listen_init(void)
rdsdebug("cm %p listening on port %u\n", cm_id, RDS_PORT);
- rds_rdma_listen_id = cm_id;
+ *ret_cm_id = cm_id;
cm_id = NULL;
out:
if (cm_id)
@@ -193,6 +209,45 @@ out:
return ret;
}
+/* Initialize the RDS RDMA listeners. We create two listeners for
+ * compatibility reason. The one on RDS_PORT is used for IPv4
+ * requests only. The one on RDS_CM_PORT is used for IPv6 requests
+ * only. So only IPv6 enabled RDS module will communicate using this
+ * port.
+ */
+static int rds_rdma_listen_init(void)
+{
+ int ret;
+#if IS_ENABLED(CONFIG_IPV6)
+ struct sockaddr_in6 sin6;
+#endif
+ struct sockaddr_in sin;
+
+ sin.sin_family = PF_INET;
+ sin.sin_addr.s_addr = htonl(INADDR_ANY);
+ sin.sin_port = htons(RDS_PORT);
+ ret = rds_rdma_listen_init_common(rds_rdma_cm_event_handler,
+ (struct sockaddr *)&sin,
+ &rds_rdma_listen_id);
+ if (ret != 0)
+ return ret;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ sin6.sin6_family = PF_INET6;
+ sin6.sin6_addr = in6addr_any;
+ sin6.sin6_port = htons(RDS_CM_PORT);
+ sin6.sin6_scope_id = 0;
+ sin6.sin6_flowinfo = 0;
+ ret = rds_rdma_listen_init_common(rds6_rdma_cm_event_handler,
+ (struct sockaddr *)&sin6,
+ &rds6_rdma_listen_id);
+ /* Keep going even when IPv6 is not enabled in the system. */
+ if (ret != 0)
+ rdsdebug("Cannot set up IPv6 RDMA listener\n");
+#endif
+ return 0;
+}
+
static void rds_rdma_listen_stop(void)
{
if (rds_rdma_listen_id) {
@@ -200,6 +255,13 @@ static void rds_rdma_listen_stop(void)
rdma_destroy_id(rds_rdma_listen_id);
rds_rdma_listen_id = NULL;
}
+#if IS_ENABLED(CONFIG_IPV6)
+ if (rds6_rdma_listen_id) {
+ rdsdebug("cm %p\n", rds6_rdma_listen_id);
+ rdma_destroy_id(rds6_rdma_listen_id);
+ rds6_rdma_listen_id = NULL;
+ }
+#endif
}
static int rds_rdma_init(void)
@@ -229,4 +291,3 @@ module_exit(rds_rdma_exit);
MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>");
MODULE_DESCRIPTION("RDS: IB transport");
MODULE_LICENSE("Dual BSD/GPL");
-
diff --git a/net/rds/rdma_transport.h b/net/rds/rdma_transport.h
index d309c4430124..200d3134aaae 100644
--- a/net/rds/rdma_transport.h
+++ b/net/rds/rdma_transport.h
@@ -6,11 +6,16 @@
#include <rdma/rdma_cm.h>
#include "rds.h"
+/* RDMA_CM also uses 16385 as the listener port. */
+#define RDS_CM_PORT 16385
+
#define RDS_RDMA_RESOLVE_TIMEOUT_MS 5000
int rds_rdma_conn_connect(struct rds_connection *conn);
int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
struct rdma_cm_event *event);
+int rds6_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
+ struct rdma_cm_event *event);
/* from ib.c */
extern struct rds_transport rds_ib_transport;
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 60b3b787fbdb..6bfaf05b63b2 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -10,6 +10,7 @@
#include <linux/rds.h>
#include <linux/rhashtable.h>
#include <linux/refcount.h>
+#include <linux/in6.h>
#include "info.h"
@@ -23,11 +24,13 @@
#define RDS_PROTOCOL_MINOR(v) ((v) & 255)
#define RDS_PROTOCOL(maj, min) (((maj) << 8) | min)
-/*
- * XXX randomly chosen, but at least seems to be unused:
- * # 18464-18768 Unassigned
- * We should do better. We want a reserved port to discourage unpriv'ed
- * userspace from listening.
+/* The following ports, 16385, 18634, 18635, are registered with IANA as
+ * the ports to be used for RDS over TCP and UDP. Currently, only RDS over
+ * TCP and RDS over IB/RDMA are implemented. 18634 is the historical value
+ * used for the RDMA_CM listener port. RDS/TCP uses port 16385. After
+ * IPv6 work, RDMA_CM also uses 16385 as the listener port. 18634 is kept
+ * to ensure compatibility with older RDS modules. Those ports are defined
+ * in each transport's header file.
*/
#define RDS_PORT 18634
@@ -61,7 +64,7 @@ void rdsdebug(char *fmt, ...)
struct rds_cong_map {
struct rb_node m_rb_node;
- __be32 m_addr;
+ struct in6_addr m_addr;
wait_queue_head_t m_waitq;
struct list_head m_conn_list;
unsigned long m_page_addrs[RDS_CONG_MAP_PAGES];
@@ -136,11 +139,14 @@ struct rds_conn_path {
/* One rds_connection per RDS address pair */
struct rds_connection {
struct hlist_node c_hash_node;
- __be32 c_laddr;
- __be32 c_faddr;
+ struct in6_addr c_laddr;
+ struct in6_addr c_faddr;
+ int c_dev_if; /* ifindex used for this conn */
+ int c_bound_if; /* ifindex of c_laddr */
unsigned int c_loopback:1,
+ c_isv6:1,
c_ping_triggered:1,
- c_pad_to_32:30;
+ c_pad_to_32:29;
int c_npaths;
struct rds_connection *c_passive;
struct rds_transport *c_trans;
@@ -269,10 +275,10 @@ struct rds_incoming {
struct rds_conn_path *i_conn_path;
struct rds_header i_hdr;
unsigned long i_rx_jiffies;
- __be32 i_saddr;
+ struct in6_addr i_saddr;
rds_rdma_cookie_t i_rdma_cookie;
- struct timeval i_rx_tstamp;
+ ktime_t i_rx_tstamp;
u64 i_rx_lat_trace[RDS_RX_MAX_TRACES];
};
@@ -386,7 +392,7 @@ struct rds_message {
struct list_head m_conn_item;
struct rds_incoming m_inc;
u64 m_ack_seq;
- __be32 m_daddr;
+ struct in6_addr m_daddr;
unsigned long m_flags;
/* Never access m_rs without holding m_rs_lock.
@@ -521,7 +527,8 @@ struct rds_transport {
t_mp_capable:1;
unsigned int t_type;
- int (*laddr_check)(struct net *net, __be32 addr);
+ int (*laddr_check)(struct net *net, const struct in6_addr *addr,
+ __u32 scope_id);
int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp);
void (*conn_free)(void *data);
int (*conn_path_connect)(struct rds_conn_path *cp);
@@ -537,8 +544,8 @@ struct rds_transport {
void (*inc_free)(struct rds_incoming *inc);
int (*cm_handle_connect)(struct rdma_cm_id *cm_id,
- struct rdma_cm_event *event);
- int (*cm_initiate_connect)(struct rdma_cm_id *cm_id);
+ struct rdma_cm_event *event, bool isv6);
+ int (*cm_initiate_connect)(struct rdma_cm_id *cm_id, bool isv6);
void (*cm_connect_complete)(struct rds_connection *conn,
struct rdma_cm_event *event);
@@ -554,6 +561,12 @@ struct rds_transport {
bool (*t_unloading)(struct rds_connection *conn);
};
+/* Bind hash table key length. It is the sum of the size of a struct
+ * in6_addr, a scope_id and a port.
+ */
+#define RDS_BOUND_KEY_LEN \
+ (sizeof(struct in6_addr) + sizeof(__u32) + sizeof(__be16))
+
struct rds_sock {
struct sock rs_sk;
@@ -565,10 +578,14 @@ struct rds_sock {
* support.
*/
struct rhash_head rs_bound_node;
- u64 rs_bound_key;
- __be32 rs_bound_addr;
- __be32 rs_conn_addr;
- __be16 rs_bound_port;
+ u8 rs_bound_key[RDS_BOUND_KEY_LEN];
+ struct sockaddr_in6 rs_bound_sin6;
+#define rs_bound_addr rs_bound_sin6.sin6_addr
+#define rs_bound_addr_v4 rs_bound_sin6.sin6_addr.s6_addr32[3]
+#define rs_bound_port rs_bound_sin6.sin6_port
+#define rs_bound_scope_id rs_bound_sin6.sin6_scope_id
+ struct in6_addr rs_conn_addr;
+#define rs_conn_addr_v4 rs_conn_addr.s6_addr32[3]
__be16 rs_conn_port;
struct rds_transport *rs_transport;
@@ -704,7 +721,8 @@ extern wait_queue_head_t rds_poll_waitq;
/* bind.c */
int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len);
void rds_remove_bound(struct rds_sock *rs);
-struct rds_sock *rds_find_bound(__be32 addr, __be16 port);
+struct rds_sock *rds_find_bound(const struct in6_addr *addr, __be16 port,
+ __u32 scope_id);
int rds_bind_lock_init(void);
void rds_bind_lock_destroy(void);
@@ -723,16 +741,20 @@ void rds_cong_remove_socket(struct rds_sock *);
void rds_cong_exit(void);
struct rds_message *rds_cong_update_alloc(struct rds_connection *conn);
-/* conn.c */
+/* connection.c */
extern u32 rds_gen_num;
int rds_conn_init(void);
void rds_conn_exit(void);
struct rds_connection *rds_conn_create(struct net *net,
- __be32 laddr, __be32 faddr,
- struct rds_transport *trans, gfp_t gfp);
+ const struct in6_addr *laddr,
+ const struct in6_addr *faddr,
+ struct rds_transport *trans, gfp_t gfp,
+ int dev_if);
struct rds_connection *rds_conn_create_outgoing(struct net *net,
- __be32 laddr, __be32 faddr,
- struct rds_transport *trans, gfp_t gfp);
+ const struct in6_addr *laddr,
+ const struct in6_addr *faddr,
+ struct rds_transport *trans,
+ gfp_t gfp, int dev_if);
void rds_conn_shutdown(struct rds_conn_path *cpath);
void rds_conn_destroy(struct rds_connection *conn);
void rds_conn_drop(struct rds_connection *conn);
@@ -843,11 +865,12 @@ void rds_page_exit(void);
/* recv.c */
void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
- __be32 saddr);
+ struct in6_addr *saddr);
void rds_inc_path_init(struct rds_incoming *inc, struct rds_conn_path *conn,
- __be32 saddr);
+ struct in6_addr *saddr);
void rds_inc_put(struct rds_incoming *inc);
-void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
+void rds_recv_incoming(struct rds_connection *conn, struct in6_addr *saddr,
+ struct in6_addr *daddr,
struct rds_incoming *inc, gfp_t gfp);
int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
int msg_flags);
@@ -856,13 +879,17 @@ int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msg);
void rds_inc_info_copy(struct rds_incoming *inc,
struct rds_info_iterator *iter,
__be32 saddr, __be32 daddr, int flip);
+void rds6_inc_info_copy(struct rds_incoming *inc,
+ struct rds_info_iterator *iter,
+ struct in6_addr *saddr, struct in6_addr *daddr,
+ int flip);
/* send.c */
int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len);
void rds_send_path_reset(struct rds_conn_path *conn);
int rds_send_xmit(struct rds_conn_path *cp);
struct sockaddr_in;
-void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest);
+void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in6 *dest);
typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack);
void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
is_acked_func is_acked);
@@ -949,11 +976,14 @@ void rds_send_worker(struct work_struct *);
void rds_recv_worker(struct work_struct *);
void rds_connect_path_complete(struct rds_conn_path *conn, int curr);
void rds_connect_complete(struct rds_connection *conn);
+int rds_addr_cmp(const struct in6_addr *a1, const struct in6_addr *a2);
/* transport.c */
void rds_trans_register(struct rds_transport *trans);
void rds_trans_unregister(struct rds_transport *trans);
-struct rds_transport *rds_trans_get_preferred(struct net *net, __be32 addr);
+struct rds_transport *rds_trans_get_preferred(struct net *net,
+ const struct in6_addr *addr,
+ __u32 scope_id);
void rds_trans_put(struct rds_transport *trans);
unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
unsigned int avail);
diff --git a/net/rds/recv.c b/net/rds/recv.c
index 192ac6f78ded..727639dac8a7 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006 Oracle. All rights reserved.
+ * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -41,34 +41,29 @@
#include "rds.h"
void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
- __be32 saddr)
+ struct in6_addr *saddr)
{
- int i;
-
refcount_set(&inc->i_refcount, 1);
INIT_LIST_HEAD(&inc->i_item);
inc->i_conn = conn;
- inc->i_saddr = saddr;
+ inc->i_saddr = *saddr;
inc->i_rdma_cookie = 0;
- inc->i_rx_tstamp.tv_sec = 0;
- inc->i_rx_tstamp.tv_usec = 0;
+ inc->i_rx_tstamp = ktime_set(0, 0);
- for (i = 0; i < RDS_RX_MAX_TRACES; i++)
- inc->i_rx_lat_trace[i] = 0;
+ memset(inc->i_rx_lat_trace, 0, sizeof(inc->i_rx_lat_trace));
}
EXPORT_SYMBOL_GPL(rds_inc_init);
void rds_inc_path_init(struct rds_incoming *inc, struct rds_conn_path *cp,
- __be32 saddr)
+ struct in6_addr *saddr)
{
refcount_set(&inc->i_refcount, 1);
INIT_LIST_HEAD(&inc->i_item);
inc->i_conn = cp->cp_conn;
inc->i_conn_path = cp;
- inc->i_saddr = saddr;
+ inc->i_saddr = *saddr;
inc->i_rdma_cookie = 0;
- inc->i_rx_tstamp.tv_sec = 0;
- inc->i_rx_tstamp.tv_usec = 0;
+ inc->i_rx_tstamp = ktime_set(0, 0);
}
EXPORT_SYMBOL_GPL(rds_inc_path_init);
@@ -110,7 +105,7 @@ static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk,
now_congested = rs->rs_rcv_bytes > rds_sk_rcvbuf(rs);
- rdsdebug("rs %p (%pI4:%u) recv bytes %d buf %d "
+ rdsdebug("rs %p (%pI6c:%u) recv bytes %d buf %d "
"now_cong %d delta %d\n",
rs, &rs->rs_bound_addr,
ntohs(rs->rs_bound_port), rs->rs_rcv_bytes,
@@ -260,7 +255,7 @@ static void rds_start_mprds(struct rds_connection *conn)
struct rds_conn_path *cp;
if (conn->c_npaths > 1 &&
- IS_CANONICAL(conn->c_laddr, conn->c_faddr)) {
+ rds_addr_cmp(&conn->c_laddr, &conn->c_faddr) < 0) {
for (i = 0; i < conn->c_npaths; i++) {
cp = &conn->c_path[i];
rds_conn_path_connect_if_down(cp);
@@ -284,7 +279,8 @@ static void rds_start_mprds(struct rds_connection *conn)
* conn. This lets loopback, who only has one conn for both directions,
* tell us which roles the addrs in the conn are playing for this message.
*/
-void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
+void rds_recv_incoming(struct rds_connection *conn, struct in6_addr *saddr,
+ struct in6_addr *daddr,
struct rds_incoming *inc, gfp_t gfp)
{
struct rds_sock *rs = NULL;
@@ -339,7 +335,8 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
if (rds_sysctl_ping_enable && inc->i_hdr.h_dport == 0) {
if (inc->i_hdr.h_sport == 0) {
- rdsdebug("ignore ping with 0 sport from 0x%x\n", saddr);
+ rdsdebug("ignore ping with 0 sport from %pI6c\n",
+ saddr);
goto out;
}
rds_stats_inc(s_recv_ping);
@@ -362,7 +359,7 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
goto out;
}
- rs = rds_find_bound(daddr, inc->i_hdr.h_dport);
+ rs = rds_find_bound(daddr, inc->i_hdr.h_dport, conn->c_bound_if);
if (!rs) {
rds_stats_inc(s_recv_drop_no_sock);
goto out;
@@ -383,7 +380,7 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
be32_to_cpu(inc->i_hdr.h_len),
inc->i_hdr.h_dport);
if (sock_flag(sk, SOCK_RCVTSTAMP))
- do_gettimeofday(&inc->i_rx_tstamp);
+ inc->i_rx_tstamp = ktime_get_real();
rds_inc_addref(inc);
inc->i_rx_lat_trace[RDS_MSG_RX_END] = local_clock();
list_add_tail(&inc->i_item, &rs->rs_recv_queue);
@@ -550,11 +547,11 @@ static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg,
goto out;
}
- if ((inc->i_rx_tstamp.tv_sec != 0) &&
+ if ((inc->i_rx_tstamp != 0) &&
sock_flag(rds_rs_to_sk(rs), SOCK_RCVTSTAMP)) {
+ struct timeval tv = ktime_to_timeval(inc->i_rx_tstamp);
ret = put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMP,
- sizeof(struct timeval),
- &inc->i_rx_tstamp);
+ sizeof(tv), &tv);
if (ret)
goto out;
}
@@ -625,6 +622,7 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
struct rds_sock *rs = rds_sk_to_rs(sk);
long timeo;
int ret = 0, nonblock = msg_flags & MSG_DONTWAIT;
+ DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
struct rds_incoming *inc = NULL;
@@ -673,7 +671,7 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
break;
}
- rdsdebug("copying inc %p from %pI4:%u to user\n", inc,
+ rdsdebug("copying inc %p from %pI6c:%u to user\n", inc,
&inc->i_conn->c_faddr,
ntohs(inc->i_hdr.h_sport));
ret = inc->i_conn->c_trans->inc_copy_to_user(inc, &msg->msg_iter);
@@ -707,12 +705,26 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
rds_stats_inc(s_recv_delivered);
- if (sin) {
- sin->sin_family = AF_INET;
- sin->sin_port = inc->i_hdr.h_sport;
- sin->sin_addr.s_addr = inc->i_saddr;
- memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
- msg->msg_namelen = sizeof(*sin);
+ if (msg->msg_name) {
+ if (ipv6_addr_v4mapped(&inc->i_saddr)) {
+ sin = (struct sockaddr_in *)msg->msg_name;
+
+ sin->sin_family = AF_INET;
+ sin->sin_port = inc->i_hdr.h_sport;
+ sin->sin_addr.s_addr =
+ inc->i_saddr.s6_addr32[3];
+ memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+ msg->msg_namelen = sizeof(*sin);
+ } else {
+ sin6 = (struct sockaddr_in6 *)msg->msg_name;
+
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_port = inc->i_hdr.h_sport;
+ sin6->sin6_addr = inc->i_saddr;
+ sin6->sin6_flowinfo = 0;
+ sin6->sin6_scope_id = rs->rs_bound_scope_id;
+ msg->msg_namelen = sizeof(*sin6);
+ }
}
break;
}
@@ -775,3 +787,30 @@ void rds_inc_info_copy(struct rds_incoming *inc,
rds_info_copy(iter, &minfo, sizeof(minfo));
}
+
+#if IS_ENABLED(CONFIG_IPV6)
+void rds6_inc_info_copy(struct rds_incoming *inc,
+ struct rds_info_iterator *iter,
+ struct in6_addr *saddr, struct in6_addr *daddr,
+ int flip)
+{
+ struct rds6_info_message minfo6;
+
+ minfo6.seq = be64_to_cpu(inc->i_hdr.h_sequence);
+ minfo6.len = be32_to_cpu(inc->i_hdr.h_len);
+
+ if (flip) {
+ minfo6.laddr = *daddr;
+ minfo6.faddr = *saddr;
+ minfo6.lport = inc->i_hdr.h_dport;
+ minfo6.fport = inc->i_hdr.h_sport;
+ } else {
+ minfo6.laddr = *saddr;
+ minfo6.faddr = *daddr;
+ minfo6.lport = inc->i_hdr.h_sport;
+ minfo6.fport = inc->i_hdr.h_dport;
+ }
+
+ rds_info_copy(iter, &minfo6, sizeof(minfo6));
+}
+#endif
diff --git a/net/rds/send.c b/net/rds/send.c
index 59f17a2335f4..fe785ee819dd 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -709,7 +709,7 @@ void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
}
EXPORT_SYMBOL_GPL(rds_send_drop_acked);
-void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
+void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in6 *dest)
{
struct rds_message *rm, *tmp;
struct rds_connection *conn;
@@ -721,8 +721,9 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
spin_lock_irqsave(&rs->rs_lock, flags);
list_for_each_entry_safe(rm, tmp, &rs->rs_send_queue, m_sock_item) {
- if (dest && (dest->sin_addr.s_addr != rm->m_daddr ||
- dest->sin_port != rm->m_inc.i_hdr.h_dport))
+ if (dest &&
+ (!ipv6_addr_equal(&dest->sin6_addr, &rm->m_daddr) ||
+ dest->sin6_port != rm->m_inc.i_hdr.h_dport))
continue;
list_move(&rm->m_sock_item, &list);
@@ -1006,7 +1007,8 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
return ret;
}
-static int rds_send_mprds_hash(struct rds_sock *rs, struct rds_connection *conn)
+static int rds_send_mprds_hash(struct rds_sock *rs,
+ struct rds_connection *conn, int nonblock)
{
int hash;
@@ -1022,10 +1024,16 @@ static int rds_send_mprds_hash(struct rds_sock *rs, struct rds_connection *conn)
* used. But if we are interrupted, we have to use the zero
* c_path in case the connection ends up being non-MP capable.
*/
- if (conn->c_npaths == 0)
+ if (conn->c_npaths == 0) {
+ /* Cannot wait for the connection be made, so just use
+ * the base c_path.
+ */
+ if (nonblock)
+ return 0;
if (wait_event_interruptible(conn->c_hs_waitq,
conn->c_npaths != 0))
hash = 0;
+ }
if (conn->c_npaths == 1)
hash = 0;
}
@@ -1059,8 +1067,8 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
{
struct sock *sk = sock->sk;
struct rds_sock *rs = rds_sk_to_rs(sk);
+ DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
- __be32 daddr;
__be16 dport;
struct rds_message *rm = NULL;
struct rds_connection *conn;
@@ -1069,10 +1077,13 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
int nonblock = msg->msg_flags & MSG_DONTWAIT;
long timeo = sock_sndtimeo(sk, nonblock);
struct rds_conn_path *cpath;
+ struct in6_addr daddr;
+ __u32 scope_id = 0;
size_t total_payload_len = payload_len, rdma_payload_len = 0;
bool zcopy = ((msg->msg_flags & MSG_ZEROCOPY) &&
sock_flag(rds_rs_to_sk(rs), SOCK_ZEROCOPY));
int num_sgs = ceil(payload_len, PAGE_SIZE);
+ int namelen;
/* Mirror Linux UDP mirror of BSD error message compatibility */
/* XXX: Perhaps MSG_MORE someday */
@@ -1081,27 +1092,108 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
goto out;
}
- if (msg->msg_namelen) {
- /* XXX fail non-unicast destination IPs? */
- if (msg->msg_namelen < sizeof(*usin) || usin->sin_family != AF_INET) {
+ namelen = msg->msg_namelen;
+ if (namelen != 0) {
+ if (namelen < sizeof(*usin)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ switch (usin->sin_family) {
+ case AF_INET:
+ if (usin->sin_addr.s_addr == htonl(INADDR_ANY) ||
+ usin->sin_addr.s_addr == htonl(INADDR_BROADCAST) ||
+ IN_MULTICAST(ntohl(usin->sin_addr.s_addr))) {
+ ret = -EINVAL;
+ goto out;
+ }
+ ipv6_addr_set_v4mapped(usin->sin_addr.s_addr, &daddr);
+ dport = usin->sin_port;
+ break;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6: {
+ int addr_type;
+
+ if (namelen < sizeof(*sin6)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ addr_type = ipv6_addr_type(&sin6->sin6_addr);
+ if (!(addr_type & IPV6_ADDR_UNICAST)) {
+ __be32 addr4;
+
+ if (!(addr_type & IPV6_ADDR_MAPPED)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* It is a mapped address. Need to do some
+ * sanity checks.
+ */
+ addr4 = sin6->sin6_addr.s6_addr32[3];
+ if (addr4 == htonl(INADDR_ANY) ||
+ addr4 == htonl(INADDR_BROADCAST) ||
+ IN_MULTICAST(ntohl(addr4))) {
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+ if (addr_type & IPV6_ADDR_LINKLOCAL) {
+ if (sin6->sin6_scope_id == 0) {
+ ret = -EINVAL;
+ goto out;
+ }
+ scope_id = sin6->sin6_scope_id;
+ }
+
+ daddr = sin6->sin6_addr;
+ dport = sin6->sin6_port;
+ break;
+ }
+#endif
+
+ default:
ret = -EINVAL;
goto out;
}
- daddr = usin->sin_addr.s_addr;
- dport = usin->sin_port;
} else {
/* We only care about consistency with ->connect() */
lock_sock(sk);
daddr = rs->rs_conn_addr;
dport = rs->rs_conn_port;
+ scope_id = rs->rs_bound_scope_id;
release_sock(sk);
}
lock_sock(sk);
- if (daddr == 0 || rs->rs_bound_addr == 0) {
+ if (ipv6_addr_any(&rs->rs_bound_addr) || ipv6_addr_any(&daddr)) {
release_sock(sk);
- ret = -ENOTCONN; /* XXX not a great errno */
+ ret = -ENOTCONN;
goto out;
+ } else if (namelen != 0) {
+ /* Cannot send to an IPv4 address using an IPv6 source
+ * address and cannot send to an IPv6 address using an
+ * IPv4 source address.
+ */
+ if (ipv6_addr_v4mapped(&daddr) ^
+ ipv6_addr_v4mapped(&rs->rs_bound_addr)) {
+ release_sock(sk);
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+ /* If the socket is already bound to a link local address,
+ * it can only send to peers on the same link. But allow
+ * communicating beween link local and non-link local address.
+ */
+ if (scope_id != rs->rs_bound_scope_id) {
+ if (!scope_id) {
+ scope_id = rs->rs_bound_scope_id;
+ } else if (rs->rs_bound_scope_id) {
+ release_sock(sk);
+ ret = -EINVAL;
+ goto out;
+ }
+ }
}
release_sock(sk);
@@ -1155,13 +1247,14 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
/* rds_conn_create has a spinlock that runs with IRQ off.
* Caching the conn in the socket helps a lot. */
- if (rs->rs_conn && rs->rs_conn->c_faddr == daddr)
+ if (rs->rs_conn && ipv6_addr_equal(&rs->rs_conn->c_faddr, &daddr))
conn = rs->rs_conn;
else {
conn = rds_conn_create_outgoing(sock_net(sock->sk),
- rs->rs_bound_addr, daddr,
- rs->rs_transport,
- sock->sk->sk_allocation);
+ &rs->rs_bound_addr, &daddr,
+ rs->rs_transport,
+ sock->sk->sk_allocation,
+ scope_id);
if (IS_ERR(conn)) {
ret = PTR_ERR(conn);
goto out;
@@ -1170,7 +1263,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
}
if (conn->c_trans->t_mp_capable)
- cpath = &conn->c_path[rds_send_mprds_hash(rs, conn)];
+ cpath = &conn->c_path[rds_send_mprds_hash(rs, conn, nonblock)];
else
cpath = &conn->c_path[0];
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 351a28474667..b9bbcf3d6c63 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006 Oracle. All rights reserved.
+ * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -37,6 +37,7 @@
#include <net/tcp.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
+#include <net/addrconf.h>
#include "rds.h"
#include "tcp.h"
@@ -44,7 +45,14 @@
/* only for info exporting */
static DEFINE_SPINLOCK(rds_tcp_tc_list_lock);
static LIST_HEAD(rds_tcp_tc_list);
+
+/* rds_tcp_tc_count counts only IPv4 connections.
+ * rds6_tcp_tc_count counts both IPv4 and IPv6 connections.
+ */
static unsigned int rds_tcp_tc_count;
+#if IS_ENABLED(CONFIG_IPV6)
+static unsigned int rds6_tcp_tc_count;
+#endif
/* Track rds_tcp_connection structs so they can be cleaned up */
static DEFINE_SPINLOCK(rds_tcp_conn_lock);
@@ -111,7 +119,11 @@ void rds_tcp_restore_callbacks(struct socket *sock,
/* done under the callback_lock to serialize with write_space */
spin_lock(&rds_tcp_tc_list_lock);
list_del_init(&tc->t_list_item);
- rds_tcp_tc_count--;
+#if IS_ENABLED(CONFIG_IPV6)
+ rds6_tcp_tc_count--;
+#endif
+ if (!tc->t_cpath->cp_conn->c_isv6)
+ rds_tcp_tc_count--;
spin_unlock(&rds_tcp_tc_list_lock);
tc->t_sock = NULL;
@@ -198,7 +210,11 @@ void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp)
/* done under the callback_lock to serialize with write_space */
spin_lock(&rds_tcp_tc_list_lock);
list_add_tail(&tc->t_list_item, &rds_tcp_tc_list);
- rds_tcp_tc_count++;
+#if IS_ENABLED(CONFIG_IPV6)
+ rds6_tcp_tc_count++;
+#endif
+ if (!tc->t_cpath->cp_conn->c_isv6)
+ rds_tcp_tc_count++;
spin_unlock(&rds_tcp_tc_list_lock);
/* accepted sockets need our listen data ready undone */
@@ -219,6 +235,9 @@ void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp)
write_unlock_bh(&sock->sk->sk_callback_lock);
}
+/* Handle RDS_INFO_TCP_SOCKETS socket option. It only returns IPv4
+ * connections for backward compatibility.
+ */
static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len,
struct rds_info_iterator *iter,
struct rds_info_lengths *lens)
@@ -226,8 +245,6 @@ static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len,
struct rds_info_tcp_socket tsinfo;
struct rds_tcp_connection *tc;
unsigned long flags;
- struct sockaddr_in sin;
- struct socket *sock;
spin_lock_irqsave(&rds_tcp_tc_list_lock, flags);
@@ -235,16 +252,15 @@ static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len,
goto out;
list_for_each_entry(tc, &rds_tcp_tc_list, t_list_item) {
+ struct inet_sock *inet = inet_sk(tc->t_sock->sk);
- sock = tc->t_sock;
- if (sock) {
- sock->ops->getname(sock, (struct sockaddr *)&sin, 0);
- tsinfo.local_addr = sin.sin_addr.s_addr;
- tsinfo.local_port = sin.sin_port;
- sock->ops->getname(sock, (struct sockaddr *)&sin, 1);
- tsinfo.peer_addr = sin.sin_addr.s_addr;
- tsinfo.peer_port = sin.sin_port;
- }
+ if (tc->t_cpath->cp_conn->c_isv6)
+ continue;
+
+ tsinfo.local_addr = inet->inet_saddr;
+ tsinfo.local_port = inet->inet_sport;
+ tsinfo.peer_addr = inet->inet_daddr;
+ tsinfo.peer_port = inet->inet_dport;
tsinfo.hdr_rem = tc->t_tinc_hdr_rem;
tsinfo.data_rem = tc->t_tinc_data_rem;
@@ -262,10 +278,82 @@ out:
spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags);
}
-static int rds_tcp_laddr_check(struct net *net, __be32 addr)
+#if IS_ENABLED(CONFIG_IPV6)
+/* Handle RDS6_INFO_TCP_SOCKETS socket option. It returns both IPv4 and
+ * IPv6 connections. IPv4 connection address is returned in an IPv4 mapped
+ * address.
+ */
+static void rds6_tcp_tc_info(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens)
+{
+ struct rds6_info_tcp_socket tsinfo6;
+ struct rds_tcp_connection *tc;
+ unsigned long flags;
+
+ spin_lock_irqsave(&rds_tcp_tc_list_lock, flags);
+
+ if (len / sizeof(tsinfo6) < rds6_tcp_tc_count)
+ goto out;
+
+ list_for_each_entry(tc, &rds_tcp_tc_list, t_list_item) {
+ struct sock *sk = tc->t_sock->sk;
+ struct inet_sock *inet = inet_sk(sk);
+
+ tsinfo6.local_addr = sk->sk_v6_rcv_saddr;
+ tsinfo6.local_port = inet->inet_sport;
+ tsinfo6.peer_addr = sk->sk_v6_daddr;
+ tsinfo6.peer_port = inet->inet_dport;
+
+ tsinfo6.hdr_rem = tc->t_tinc_hdr_rem;
+ tsinfo6.data_rem = tc->t_tinc_data_rem;
+ tsinfo6.last_sent_nxt = tc->t_last_sent_nxt;
+ tsinfo6.last_expected_una = tc->t_last_expected_una;
+ tsinfo6.last_seen_una = tc->t_last_seen_una;
+
+ rds_info_copy(iter, &tsinfo6, sizeof(tsinfo6));
+ }
+
+out:
+ lens->nr = rds6_tcp_tc_count;
+ lens->each = sizeof(tsinfo6);
+
+ spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags);
+}
+#endif
+
+static int rds_tcp_laddr_check(struct net *net, const struct in6_addr *addr,
+ __u32 scope_id)
{
- if (inet_addr_type(net, addr) == RTN_LOCAL)
+ struct net_device *dev = NULL;
+#if IS_ENABLED(CONFIG_IPV6)
+ int ret;
+#endif
+
+ if (ipv6_addr_v4mapped(addr)) {
+ if (inet_addr_type(net, addr->s6_addr32[3]) == RTN_LOCAL)
+ return 0;
+ return -EADDRNOTAVAIL;
+ }
+
+ /* If the scope_id is specified, check only those addresses
+ * hosted on the specified interface.
+ */
+ if (scope_id != 0) {
+ rcu_read_lock();
+ dev = dev_get_by_index_rcu(net, scope_id);
+ /* scope_id is not valid... */
+ if (!dev) {
+ rcu_read_unlock();
+ return -EADDRNOTAVAIL;
+ }
+ rcu_read_unlock();
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ ret = ipv6_chk_addr(net, addr, dev, 0);
+ if (ret)
return 0;
+#endif
return -EADDRNOTAVAIL;
}
@@ -468,13 +556,27 @@ static __net_init int rds_tcp_init_net(struct net *net)
err = -ENOMEM;
goto fail;
}
- rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net);
+
+#if IS_ENABLED(CONFIG_IPV6)
+ rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net, true);
+#else
+ rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net, false);
+#endif
if (!rtn->rds_tcp_listen_sock) {
- pr_warn("could not set up listen sock\n");
- unregister_net_sysctl_table(rtn->rds_tcp_sysctl);
- rtn->rds_tcp_sysctl = NULL;
- err = -EAFNOSUPPORT;
- goto fail;
+ pr_warn("could not set up IPv6 listen sock\n");
+
+#if IS_ENABLED(CONFIG_IPV6)
+ /* Try IPv4 as some systems disable IPv6 */
+ rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net, false);
+ if (!rtn->rds_tcp_listen_sock) {
+#endif
+ unregister_net_sysctl_table(rtn->rds_tcp_sysctl);
+ rtn->rds_tcp_sysctl = NULL;
+ err = -EAFNOSUPPORT;
+ goto fail;
+#if IS_ENABLED(CONFIG_IPV6)
+ }
+#endif
}
INIT_WORK(&rtn->rds_tcp_accept_w, rds_tcp_accept_worker);
return 0;
@@ -588,6 +690,9 @@ static void rds_tcp_exit(void)
rds_tcp_set_unloading();
synchronize_rcu();
rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
+#if IS_ENABLED(CONFIG_IPV6)
+ rds_info_deregister_func(RDS6_INFO_TCP_SOCKETS, rds6_tcp_tc_info);
+#endif
unregister_pernet_device(&rds_tcp_net_ops);
rds_tcp_destroy_conns();
rds_trans_unregister(&rds_tcp_transport);
@@ -619,6 +724,9 @@ static int rds_tcp_init(void)
rds_trans_register(&rds_tcp_transport);
rds_info_register_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
+#if IS_ENABLED(CONFIG_IPV6)
+ rds_info_register_func(RDS6_INFO_TCP_SOCKETS, rds6_tcp_tc_info);
+#endif
goto out;
out_recv:
@@ -633,4 +741,3 @@ module_init(rds_tcp_init);
MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>");
MODULE_DESCRIPTION("RDS: TCP transport");
MODULE_LICENSE("Dual BSD/GPL");
-
diff --git a/net/rds/tcp.h b/net/rds/tcp.h
index c6fa080e9b6d..3c69361d21c7 100644
--- a/net/rds/tcp.h
+++ b/net/rds/tcp.h
@@ -67,7 +67,7 @@ void rds_tcp_conn_path_shutdown(struct rds_conn_path *conn);
void rds_tcp_state_change(struct sock *sk);
/* tcp_listen.c */
-struct socket *rds_tcp_listen_init(struct net *);
+struct socket *rds_tcp_listen_init(struct net *net, bool isv6);
void rds_tcp_listen_stop(struct socket *sock, struct work_struct *acceptor);
void rds_tcp_listen_data_ready(struct sock *sk);
int rds_tcp_accept_one(struct socket *sock);
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
index d999e7075645..008f50fb25dd 100644
--- a/net/rds/tcp_connect.c
+++ b/net/rds/tcp_connect.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006 Oracle. All rights reserved.
+ * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -66,7 +66,8 @@ void rds_tcp_state_change(struct sock *sk)
* RDS connection as RDS_CONN_UP until the reconnect,
* to avoid RDS datagram loss.
*/
- if (!IS_CANONICAL(cp->cp_conn->c_laddr, cp->cp_conn->c_faddr) &&
+ if (rds_addr_cmp(&cp->cp_conn->c_laddr,
+ &cp->cp_conn->c_faddr) >= 0 &&
rds_conn_path_transition(cp, RDS_CONN_CONNECTING,
RDS_CONN_ERROR)) {
rds_conn_path_drop(cp, false);
@@ -88,7 +89,11 @@ out:
int rds_tcp_conn_path_connect(struct rds_conn_path *cp)
{
struct socket *sock = NULL;
- struct sockaddr_in src, dest;
+ struct sockaddr_in6 sin6;
+ struct sockaddr_in sin;
+ struct sockaddr *addr;
+ int addrlen;
+ bool isv6;
int ret;
struct rds_connection *conn = cp->cp_conn;
struct rds_tcp_connection *tc = cp->cp_transport_data;
@@ -105,37 +110,68 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp)
mutex_unlock(&tc->t_conn_path_lock);
return 0;
}
- ret = sock_create_kern(rds_conn_net(conn), PF_INET,
- SOCK_STREAM, IPPROTO_TCP, &sock);
+ if (ipv6_addr_v4mapped(&conn->c_laddr)) {
+ ret = sock_create_kern(rds_conn_net(conn), PF_INET,
+ SOCK_STREAM, IPPROTO_TCP, &sock);
+ isv6 = false;
+ } else {
+ ret = sock_create_kern(rds_conn_net(conn), PF_INET6,
+ SOCK_STREAM, IPPROTO_TCP, &sock);
+ isv6 = true;
+ }
+
if (ret < 0)
goto out;
rds_tcp_tune(sock);
- src.sin_family = AF_INET;
- src.sin_addr.s_addr = (__force u32)conn->c_laddr;
- src.sin_port = (__force u16)htons(0);
+ if (isv6) {
+ sin6.sin6_family = AF_INET6;
+ sin6.sin6_addr = conn->c_laddr;
+ sin6.sin6_port = 0;
+ sin6.sin6_flowinfo = 0;
+ sin6.sin6_scope_id = conn->c_dev_if;
+ addr = (struct sockaddr *)&sin6;
+ addrlen = sizeof(sin6);
+ } else {
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = conn->c_laddr.s6_addr32[3];
+ sin.sin_port = 0;
+ addr = (struct sockaddr *)&sin;
+ addrlen = sizeof(sin);
+ }
- ret = sock->ops->bind(sock, (struct sockaddr *)&src, sizeof(src));
+ ret = sock->ops->bind(sock, addr, addrlen);
if (ret) {
- rdsdebug("bind failed with %d at address %pI4\n",
+ rdsdebug("bind failed with %d at address %pI6c\n",
ret, &conn->c_laddr);
goto out;
}
- dest.sin_family = AF_INET;
- dest.sin_addr.s_addr = (__force u32)conn->c_faddr;
- dest.sin_port = (__force u16)htons(RDS_TCP_PORT);
+ if (isv6) {
+ sin6.sin6_family = AF_INET6;
+ sin6.sin6_addr = conn->c_faddr;
+ sin6.sin6_port = htons(RDS_TCP_PORT);
+ sin6.sin6_flowinfo = 0;
+ sin6.sin6_scope_id = conn->c_dev_if;
+ addr = (struct sockaddr *)&sin6;
+ addrlen = sizeof(sin6);
+ } else {
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = conn->c_faddr.s6_addr32[3];
+ sin.sin_port = htons(RDS_TCP_PORT);
+ addr = (struct sockaddr *)&sin;
+ addrlen = sizeof(sin);
+ }
/*
* once we call connect() we can start getting callbacks and they
* own the socket
*/
rds_tcp_set_callbacks(sock, cp);
- ret = sock->ops->connect(sock, (struct sockaddr *)&dest, sizeof(dest),
- O_NONBLOCK);
+ ret = sock->ops->connect(sock, addr, addrlen, O_NONBLOCK);
- rdsdebug("connect to address %pI4 returned %d\n", &conn->c_faddr, ret);
+ rdsdebug("connect to address %pI6c returned %d\n", &conn->c_faddr, ret);
if (ret == -EINPROGRESS)
ret = 0;
if (ret == 0) {
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index 22571189f21e..c12203f646da 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006, 2018 Oracle. All rights reserved.
+ * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -83,13 +83,12 @@ static
struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn)
{
int i;
- bool peer_is_smaller = IS_CANONICAL(conn->c_faddr, conn->c_laddr);
int npaths = max_t(int, 1, conn->c_npaths);
/* for mprds, all paths MUST be initiated by the peer
* with the smaller address.
*/
- if (!peer_is_smaller) {
+ if (rds_addr_cmp(&conn->c_faddr, &conn->c_laddr) >= 0) {
/* Make sure we initiate at least one path if this
* has not already been done; rds_start_mprds() will
* take care of additional paths, if necessary.
@@ -132,6 +131,11 @@ int rds_tcp_accept_one(struct socket *sock)
struct rds_tcp_connection *rs_tcp = NULL;
int conn_state;
struct rds_conn_path *cp;
+ struct in6_addr *my_addr, *peer_addr;
+#if !IS_ENABLED(CONFIG_IPV6)
+ struct in6_addr saddr, daddr;
+#endif
+ int dev_if = 0;
if (!sock) /* module unload or netns delete in progress */
return -ENETUNREACH;
@@ -164,13 +168,40 @@ int rds_tcp_accept_one(struct socket *sock)
inet = inet_sk(new_sock->sk);
- rdsdebug("accepted tcp %pI4:%u -> %pI4:%u\n",
- &inet->inet_saddr, ntohs(inet->inet_sport),
- &inet->inet_daddr, ntohs(inet->inet_dport));
+#if IS_ENABLED(CONFIG_IPV6)
+ my_addr = &new_sock->sk->sk_v6_rcv_saddr;
+ peer_addr = &new_sock->sk->sk_v6_daddr;
+#else
+ ipv6_addr_set_v4mapped(inet->inet_saddr, &saddr);
+ ipv6_addr_set_v4mapped(inet->inet_daddr, &daddr);
+ my_addr = &saddr;
+ peer_addr = &daddr;
+#endif
+ rdsdebug("accepted family %d tcp %pI6c:%u -> %pI6c:%u\n",
+ sock->sk->sk_family,
+ my_addr, ntohs(inet->inet_sport),
+ peer_addr, ntohs(inet->inet_dport));
+
+#if IS_ENABLED(CONFIG_IPV6)
+ /* sk_bound_dev_if is not set if the peer address is not link local
+ * address. In this case, it happens that mcast_oif is set. So
+ * just use it.
+ */
+ if ((ipv6_addr_type(my_addr) & IPV6_ADDR_LINKLOCAL) &&
+ !(ipv6_addr_type(peer_addr) & IPV6_ADDR_LINKLOCAL)) {
+ struct ipv6_pinfo *inet6;
+
+ inet6 = inet6_sk(new_sock->sk);
+ dev_if = inet6->mcast_oif;
+ } else {
+ dev_if = new_sock->sk->sk_bound_dev_if;
+ }
+#endif
conn = rds_conn_create(sock_net(sock->sk),
- inet->inet_saddr, inet->inet_daddr,
- &rds_tcp_transport, GFP_KERNEL);
+ my_addr, peer_addr,
+ &rds_tcp_transport, GFP_KERNEL, dev_if);
+
if (IS_ERR(conn)) {
ret = PTR_ERR(conn);
goto out;
@@ -254,15 +285,22 @@ out:
ready(sk);
}
-struct socket *rds_tcp_listen_init(struct net *net)
+struct socket *rds_tcp_listen_init(struct net *net, bool isv6)
{
- struct sockaddr_in sin;
struct socket *sock = NULL;
+ struct sockaddr_storage ss;
+ struct sockaddr_in6 *sin6;
+ struct sockaddr_in *sin;
+ int addr_len;
int ret;
- ret = sock_create_kern(net, PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
- if (ret < 0)
+ ret = sock_create_kern(net, isv6 ? PF_INET6 : PF_INET, SOCK_STREAM,
+ IPPROTO_TCP, &sock);
+ if (ret < 0) {
+ rdsdebug("could not create %s listener socket: %d\n",
+ isv6 ? "IPv6" : "IPv4", ret);
goto out;
+ }
sock->sk->sk_reuse = SK_CAN_REUSE;
rds_tcp_nonagle(sock);
@@ -272,13 +310,28 @@ struct socket *rds_tcp_listen_init(struct net *net)
sock->sk->sk_data_ready = rds_tcp_listen_data_ready;
write_unlock_bh(&sock->sk->sk_callback_lock);
- sin.sin_family = PF_INET;
- sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY);
- sin.sin_port = (__force u16)htons(RDS_TCP_PORT);
+ if (isv6) {
+ sin6 = (struct sockaddr_in6 *)&ss;
+ sin6->sin6_family = PF_INET6;
+ sin6->sin6_addr = in6addr_any;
+ sin6->sin6_port = (__force u16)htons(RDS_TCP_PORT);
+ sin6->sin6_scope_id = 0;
+ sin6->sin6_flowinfo = 0;
+ addr_len = sizeof(*sin6);
+ } else {
+ sin = (struct sockaddr_in *)&ss;
+ sin->sin_family = PF_INET;
+ sin->sin_addr.s_addr = INADDR_ANY;
+ sin->sin_port = (__force u16)htons(RDS_TCP_PORT);
+ addr_len = sizeof(*sin);
+ }
- ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin));
- if (ret < 0)
+ ret = sock->ops->bind(sock, (struct sockaddr *)&ss, addr_len);
+ if (ret < 0) {
+ rdsdebug("could not bind %s listener socket: %d\n",
+ isv6 ? "IPv6" : "IPv4", ret);
goto out;
+ }
ret = sock->ops->listen(sock, 64);
if (ret < 0)
diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c
index b9fbd2ee74ef..42c5ff1eda95 100644
--- a/net/rds/tcp_recv.c
+++ b/net/rds/tcp_recv.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006 Oracle. All rights reserved.
+ * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -179,7 +179,7 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
tc->t_tinc = tinc;
rdsdebug("alloced tinc %p\n", tinc);
rds_inc_path_init(&tinc->ti_inc, cp,
- cp->cp_conn->c_faddr);
+ &cp->cp_conn->c_faddr);
tinc->ti_inc.i_rx_lat_trace[RDS_MSG_RX_HDR] =
local_clock();
@@ -239,8 +239,9 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
if (tinc->ti_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP)
rds_tcp_cong_recv(conn, tinc);
else
- rds_recv_incoming(conn, conn->c_faddr,
- conn->c_laddr, &tinc->ti_inc,
+ rds_recv_incoming(conn, &conn->c_faddr,
+ &conn->c_laddr,
+ &tinc->ti_inc,
arg->gfp);
tc->t_tinc_hdr_rem = sizeof(struct rds_header);
diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c
index 7df869d37afd..78a2554a4497 100644
--- a/net/rds/tcp_send.c
+++ b/net/rds/tcp_send.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006 Oracle. All rights reserved.
+ * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -153,7 +153,7 @@ out:
* an incoming RST.
*/
if (rds_conn_path_up(cp)) {
- pr_warn("RDS/tcp: send to %pI4 on cp [%d]"
+ pr_warn("RDS/tcp: send to %pI6c on cp [%d]"
"returned %d, "
"disconnecting and reconnecting\n",
&conn->c_faddr, cp->cp_index, ret);
diff --git a/net/rds/threads.c b/net/rds/threads.c
index c52861d77a59..e64f9e4c3cda 100644
--- a/net/rds/threads.c
+++ b/net/rds/threads.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006 Oracle. All rights reserved.
+ * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -82,8 +82,8 @@ void rds_connect_path_complete(struct rds_conn_path *cp, int curr)
return;
}
- rdsdebug("conn %p for %pI4 to %pI4 complete\n",
- cp->cp_conn, &cp->cp_conn->c_laddr, &cp->cp_conn->c_faddr);
+ rdsdebug("conn %p for %pI6c to %pI6c complete\n",
+ cp->cp_conn, &cp->cp_conn->c_laddr, &cp->cp_conn->c_faddr);
cp->cp_reconnect_jiffies = 0;
set_bit(0, &cp->cp_conn->c_map_queued);
@@ -125,13 +125,13 @@ void rds_queue_reconnect(struct rds_conn_path *cp)
unsigned long rand;
struct rds_connection *conn = cp->cp_conn;
- rdsdebug("conn %p for %pI4 to %pI4 reconnect jiffies %lu\n",
- conn, &conn->c_laddr, &conn->c_faddr,
- cp->cp_reconnect_jiffies);
+ rdsdebug("conn %p for %pI6c to %pI6c reconnect jiffies %lu\n",
+ conn, &conn->c_laddr, &conn->c_faddr,
+ cp->cp_reconnect_jiffies);
/* let peer with smaller addr initiate reconnect, to avoid duels */
if (conn->c_trans->t_type == RDS_TRANS_TCP &&
- !IS_CANONICAL(conn->c_laddr, conn->c_faddr))
+ rds_addr_cmp(&conn->c_laddr, &conn->c_faddr) >= 0)
return;
set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags);
@@ -145,7 +145,7 @@ void rds_queue_reconnect(struct rds_conn_path *cp)
}
get_random_bytes(&rand, sizeof(rand));
- rdsdebug("%lu delay %lu ceil conn %p for %pI4 -> %pI4\n",
+ rdsdebug("%lu delay %lu ceil conn %p for %pI6c -> %pI6c\n",
rand % cp->cp_reconnect_jiffies, cp->cp_reconnect_jiffies,
conn, &conn->c_laddr, &conn->c_faddr);
rcu_read_lock();
@@ -167,14 +167,14 @@ void rds_connect_worker(struct work_struct *work)
int ret;
if (cp->cp_index > 0 &&
- !IS_CANONICAL(cp->cp_conn->c_laddr, cp->cp_conn->c_faddr))
+ rds_addr_cmp(&cp->cp_conn->c_laddr, &cp->cp_conn->c_faddr) >= 0)
return;
clear_bit(RDS_RECONNECT_PENDING, &cp->cp_flags);
ret = rds_conn_path_transition(cp, RDS_CONN_DOWN, RDS_CONN_CONNECTING);
if (ret) {
ret = conn->c_trans->conn_path_connect(cp);
- rdsdebug("conn %p for %pI4 to %pI4 dispatched, ret %d\n",
- conn, &conn->c_laddr, &conn->c_faddr, ret);
+ rdsdebug("conn %p for %pI6c to %pI6c dispatched, ret %d\n",
+ conn, &conn->c_laddr, &conn->c_faddr, ret);
if (ret) {
if (rds_conn_path_transition(cp,
@@ -259,3 +259,50 @@ int rds_threads_init(void)
return 0;
}
+
+/* Compare two IPv6 addresses. Return 0 if the two addresses are equal.
+ * Return 1 if the first is greater. Return -1 if the second is greater.
+ */
+int rds_addr_cmp(const struct in6_addr *addr1,
+ const struct in6_addr *addr2)
+{
+#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
+ const __be64 *a1, *a2;
+ u64 x, y;
+
+ a1 = (__be64 *)addr1;
+ a2 = (__be64 *)addr2;
+
+ if (*a1 != *a2) {
+ if (be64_to_cpu(*a1) < be64_to_cpu(*a2))
+ return -1;
+ else
+ return 1;
+ } else {
+ x = be64_to_cpu(*++a1);
+ y = be64_to_cpu(*++a2);
+ if (x < y)
+ return -1;
+ else if (x > y)
+ return 1;
+ else
+ return 0;
+ }
+#else
+ u32 a, b;
+ int i;
+
+ for (i = 0; i < 4; i++) {
+ if (addr1->s6_addr32[i] != addr2->s6_addr32[i]) {
+ a = ntohl(addr1->s6_addr32[i]);
+ b = ntohl(addr2->s6_addr32[i]);
+ if (a < b)
+ return -1;
+ else if (a > b)
+ return 1;
+ }
+ }
+ return 0;
+#endif
+}
+EXPORT_SYMBOL_GPL(rds_addr_cmp);
diff --git a/net/rds/transport.c b/net/rds/transport.c
index 0b188dd0a344..46f709a4b577 100644
--- a/net/rds/transport.c
+++ b/net/rds/transport.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006 Oracle. All rights reserved.
+ * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -33,6 +33,7 @@
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/in.h>
+#include <linux/ipv6.h>
#include "rds.h"
#include "loop.h"
@@ -75,20 +76,26 @@ void rds_trans_put(struct rds_transport *trans)
module_put(trans->t_owner);
}
-struct rds_transport *rds_trans_get_preferred(struct net *net, __be32 addr)
+struct rds_transport *rds_trans_get_preferred(struct net *net,
+ const struct in6_addr *addr,
+ __u32 scope_id)
{
struct rds_transport *ret = NULL;
struct rds_transport *trans;
unsigned int i;
- if (IN_LOOPBACK(ntohl(addr)))
+ if (ipv6_addr_v4mapped(addr)) {
+ if (*(u_int8_t *)&addr->s6_addr32[3] == IN_LOOPBACKNET)
+ return &rds_loop_transport;
+ } else if (ipv6_addr_loopback(addr)) {
return &rds_loop_transport;
+ }
down_read(&rds_trans_sem);
for (i = 0; i < RDS_TRANS_COUNT; i++) {
trans = transports[i];
- if (trans && (trans->laddr_check(net, addr) == 0) &&
+ if (trans && (trans->laddr_check(net, addr, scope_id) == 0) &&
(!trans->t_owner || try_module_get(trans->t_owner))) {
ret = trans;
break;
@@ -152,4 +159,3 @@ unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
return total;
}
-
diff --git a/net/rfkill/core.c b/net/rfkill/core.c
index a7a4e6ff9be2..abca57040f37 100644
--- a/net/rfkill/core.c
+++ b/net/rfkill/core.c
@@ -141,13 +141,15 @@ static void rfkill_led_trigger_event(struct rfkill *rfkill)
led_trigger_event(trigger, LED_FULL);
}
-static void rfkill_led_trigger_activate(struct led_classdev *led)
+static int rfkill_led_trigger_activate(struct led_classdev *led)
{
struct rfkill *rfkill;
rfkill = container_of(led->trigger, struct rfkill, led_trigger);
rfkill_led_trigger_event(rfkill);
+
+ return 0;
}
const char *rfkill_get_led_trigger_name(struct rfkill *rfkill)
@@ -508,8 +510,8 @@ void rfkill_remove_epo_lock(void)
/**
* rfkill_is_epo_lock_active - returns true EPO is active
*
- * Returns 0 (false) if there is NOT an active EPO contidion,
- * and 1 (true) if there is an active EPO contition, which
+ * Returns 0 (false) if there is NOT an active EPO condition,
+ * and 1 (true) if there is an active EPO condition, which
* locks all radios in one of the BLOCKED states.
*
* Can be called in atomic context.
diff --git a/net/rfkill/rfkill-gpio.c b/net/rfkill/rfkill-gpio.c
index 00192a996be0..0f8465852254 100644
--- a/net/rfkill/rfkill-gpio.c
+++ b/net/rfkill/rfkill-gpio.c
@@ -20,6 +20,7 @@
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
+#include <linux/mod_devicetable.h>
#include <linux/rfkill.h>
#include <linux/platform_device.h>
#include <linux/clk.h>
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index 2b463047dd7b..64362d078da8 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -97,7 +97,8 @@ static int rxrpc_validate_address(struct rxrpc_sock *rx,
srx->transport_len > len)
return -EINVAL;
- if (srx->transport.family != rx->family)
+ if (srx->transport.family != rx->family &&
+ srx->transport.family == AF_INET && rx->family != AF_INET6)
return -EAFNOSUPPORT;
switch (srx->transport.family) {
@@ -385,6 +386,20 @@ u32 rxrpc_kernel_check_life(struct socket *sock, struct rxrpc_call *call)
EXPORT_SYMBOL(rxrpc_kernel_check_life);
/**
+ * rxrpc_kernel_get_epoch - Retrieve the epoch value from a call.
+ * @sock: The socket the call is on
+ * @call: The call to query
+ *
+ * Allow a kernel service to retrieve the epoch value from a service call to
+ * see if the client at the other end rebooted.
+ */
+u32 rxrpc_kernel_get_epoch(struct socket *sock, struct rxrpc_call *call)
+{
+ return call->conn->proto.epoch;
+}
+EXPORT_SYMBOL(rxrpc_kernel_get_epoch);
+
+/**
* rxrpc_kernel_check_call - Check a call's state
* @sock: The socket the call is on
* @call: The call to check
@@ -741,7 +756,7 @@ static __poll_t rxrpc_poll(struct file *file, struct socket *sock,
struct rxrpc_sock *rx = rxrpc_sk(sk);
__poll_t mask;
- sock_poll_wait(file, sk_sleep(sk), wait);
+ sock_poll_wait(file, sock, wait);
mask = 0;
/* the socket is readable if there are any messages waiting on the Rx
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 707630ab4713..0a7c49e8e053 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -40,17 +40,12 @@ struct rxrpc_crypt {
struct rxrpc_connection;
/*
- * Mark applied to socket buffers.
+ * Mark applied to socket buffers in skb->mark. skb->priority is used
+ * to pass supplementary information.
*/
enum rxrpc_skb_mark {
- RXRPC_SKB_MARK_DATA, /* data message */
- RXRPC_SKB_MARK_FINAL_ACK, /* final ACK received message */
- RXRPC_SKB_MARK_BUSY, /* server busy message */
- RXRPC_SKB_MARK_REMOTE_ABORT, /* remote abort message */
- RXRPC_SKB_MARK_LOCAL_ABORT, /* local abort message */
- RXRPC_SKB_MARK_NET_ERROR, /* network error message */
- RXRPC_SKB_MARK_LOCAL_ERROR, /* local error message */
- RXRPC_SKB_MARK_NEW_CALL, /* local error message */
+ RXRPC_SKB_MARK_REJECT_BUSY, /* Reject with BUSY */
+ RXRPC_SKB_MARK_REJECT_ABORT, /* Reject with ABORT (code in skb->priority) */
};
/*
@@ -293,7 +288,6 @@ struct rxrpc_peer {
struct hlist_node hash_link;
struct rxrpc_local *local;
struct hlist_head error_targets; /* targets for net error distribution */
- struct work_struct error_distributor;
struct rb_root service_conns; /* Service connections */
struct list_head keepalive_link; /* Link in net->peer_keepalive[] */
time64_t last_tx_at; /* Last time packet sent here */
@@ -304,12 +298,11 @@ struct rxrpc_peer {
unsigned int maxdata; /* data size (MTU - hdrsize) */
unsigned short hdrsize; /* header size (IP + UDP + RxRPC) */
int debug_id; /* debug ID for printks */
- int error_report; /* Net (+0) or local (+1000000) to distribute */
-#define RXRPC_LOCAL_ERROR_OFFSET 1000000
struct sockaddr_rxrpc srx; /* remote address */
/* calculated RTT cache */
#define RXRPC_RTT_CACHE_SIZE 32
+ spinlock_t rtt_input_lock; /* RTT lock for input routine */
ktime_t rtt_last_req; /* Time of last RTT request */
u64 rtt; /* Current RTT estimate (in nS) */
u64 rtt_sum; /* Sum of cache contents */
@@ -420,6 +413,7 @@ struct rxrpc_connection {
struct rxrpc_channel {
unsigned long final_ack_at; /* Time at which to issue final ACK */
struct rxrpc_call __rcu *call; /* Active call */
+ unsigned int call_debug_id; /* call->debug_id */
u32 call_id; /* ID of current call */
u32 call_counter; /* Call ID counter */
u32 last_call; /* ID of last call */
@@ -449,19 +443,29 @@ struct rxrpc_connection {
spinlock_t state_lock; /* state-change lock */
enum rxrpc_conn_cache_state cache_state;
enum rxrpc_conn_proto_state state; /* current state of connection */
- u32 local_abort; /* local abort code */
- u32 remote_abort; /* remote abort code */
+ u32 abort_code; /* Abort code of connection abort */
int debug_id; /* debug ID for printks */
atomic_t serial; /* packet serial number counter */
unsigned int hi_serial; /* highest serial number received */
u32 security_nonce; /* response re-use preventer */
- u16 service_id; /* Service ID, possibly upgraded */
+ u32 service_id; /* Service ID, possibly upgraded */
u8 size_align; /* data size alignment (for security) */
u8 security_size; /* security header size */
u8 security_ix; /* security type */
u8 out_clientflag; /* RXRPC_CLIENT_INITIATED if we are client */
+ short error; /* Local error code */
};
+static inline bool rxrpc_to_server(const struct rxrpc_skb_priv *sp)
+{
+ return sp->hdr.flags & RXRPC_CLIENT_INITIATED;
+}
+
+static inline bool rxrpc_to_client(const struct rxrpc_skb_priv *sp)
+{
+ return !rxrpc_to_server(sp);
+}
+
/*
* Flags in call->flags.
*/
@@ -478,6 +482,7 @@ enum rxrpc_call_flag {
RXRPC_CALL_RETRANS_TIMEOUT, /* Retransmission due to timeout occurred */
RXRPC_CALL_BEGAN_RX_TIMER, /* We began the expect_rx_by timer */
RXRPC_CALL_RX_HEARD, /* The peer responded at least once to this call */
+ RXRPC_CALL_RX_UNDERRUN, /* Got data underrun */
};
/*
@@ -588,7 +593,7 @@ struct rxrpc_call {
*/
#define RXRPC_RXTX_BUFF_SIZE 64
#define RXRPC_RXTX_BUFF_MASK (RXRPC_RXTX_BUFF_SIZE - 1)
-#define RXRPC_INIT_RX_WINDOW_SIZE 32
+#define RXRPC_INIT_RX_WINDOW_SIZE 63
struct sk_buff **rxtx_buffer;
u8 *rxtx_annotations;
#define RXRPC_TX_ANNO_ACK 0
@@ -631,6 +636,8 @@ struct rxrpc_call {
bool tx_phase; /* T if transmission phase, F if receive phase */
u8 nr_jumbo_bad; /* Number of jumbo dups/exceeds-windows */
+ spinlock_t input_lock; /* Lock for packet input to this call */
+
/* receive-phase ACK management */
u8 ackr_reason; /* reason to ACK */
u16 ackr_skew; /* skew on packet being ACK'd */
@@ -715,7 +722,7 @@ extern struct workqueue_struct *rxrpc_workqueue;
int rxrpc_service_prealloc(struct rxrpc_sock *, gfp_t);
void rxrpc_discard_prealloc(struct rxrpc_sock *);
struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *,
- struct rxrpc_connection *,
+ struct rxrpc_sock *,
struct sk_buff *);
void rxrpc_accept_incoming_calls(struct rxrpc_local *);
struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *, unsigned long,
@@ -885,8 +892,9 @@ extern unsigned long rxrpc_conn_idle_client_fast_expiry;
extern struct idr rxrpc_client_conn_ids;
void rxrpc_destroy_client_conn_ids(void);
-int rxrpc_connect_call(struct rxrpc_call *, struct rxrpc_conn_parameters *,
- struct sockaddr_rxrpc *, gfp_t);
+int rxrpc_connect_call(struct rxrpc_sock *, struct rxrpc_call *,
+ struct rxrpc_conn_parameters *, struct sockaddr_rxrpc *,
+ gfp_t);
void rxrpc_expose_client_call(struct rxrpc_call *);
void rxrpc_disconnect_client_call(struct rxrpc_call *);
void rxrpc_put_client_conn(struct rxrpc_connection *);
@@ -906,7 +914,8 @@ extern unsigned int rxrpc_closed_conn_expiry;
struct rxrpc_connection *rxrpc_alloc_connection(gfp_t);
struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *,
- struct sk_buff *);
+ struct sk_buff *,
+ struct rxrpc_peer **);
void __rxrpc_disconnect_call(struct rxrpc_connection *, struct rxrpc_call *);
void rxrpc_disconnect_call(struct rxrpc_call *);
void rxrpc_kill_connection(struct rxrpc_connection *);
@@ -958,7 +967,7 @@ void rxrpc_unpublish_service_conn(struct rxrpc_connection *);
/*
* input.c
*/
-void rxrpc_data_ready(struct sock *);
+int rxrpc_input_packet(struct sock *, struct sk_buff *);
/*
* insecure.c
@@ -1029,7 +1038,6 @@ void rxrpc_send_keepalive(struct rxrpc_peer *);
* peer_event.c
*/
void rxrpc_error_report(struct sock *);
-void rxrpc_peer_error_distributor(struct work_struct *);
void rxrpc_peer_add_rtt(struct rxrpc_call *, enum rxrpc_rtt_rx_trace,
rxrpc_serial_t, rxrpc_serial_t, ktime_t, ktime_t);
void rxrpc_peer_keepalive_worker(struct work_struct *);
@@ -1039,22 +1047,22 @@ void rxrpc_peer_keepalive_worker(struct work_struct *);
*/
struct rxrpc_peer *rxrpc_lookup_peer_rcu(struct rxrpc_local *,
const struct sockaddr_rxrpc *);
-struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *,
+struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_sock *, struct rxrpc_local *,
struct sockaddr_rxrpc *, gfp_t);
struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *, gfp_t);
-struct rxrpc_peer *rxrpc_lookup_incoming_peer(struct rxrpc_local *,
- struct rxrpc_peer *);
+void rxrpc_new_incoming_peer(struct rxrpc_sock *, struct rxrpc_local *,
+ struct rxrpc_peer *);
void rxrpc_destroy_all_peers(struct rxrpc_net *);
struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *);
struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *);
void rxrpc_put_peer(struct rxrpc_peer *);
-void __rxrpc_queue_peer_error(struct rxrpc_peer *);
/*
* proc.c
*/
extern const struct seq_operations rxrpc_call_seq_ops;
extern const struct seq_operations rxrpc_connection_seq_ops;
+extern const struct seq_operations rxrpc_peer_seq_ops;
/*
* recvmsg.c
@@ -1091,7 +1099,6 @@ void rxrpc_new_skb(struct sk_buff *, enum rxrpc_skb_trace);
void rxrpc_see_skb(struct sk_buff *, enum rxrpc_skb_trace);
void rxrpc_get_skb(struct sk_buff *, enum rxrpc_skb_trace);
void rxrpc_free_skb(struct sk_buff *, enum rxrpc_skb_trace);
-void rxrpc_lose_skb(struct sk_buff *, enum rxrpc_skb_trace);
void rxrpc_purge_queue(struct sk_buff_head *);
/*
@@ -1108,8 +1115,7 @@ static inline void rxrpc_sysctl_exit(void) {}
/*
* utils.c
*/
-int rxrpc_extract_addr_from_skb(struct rxrpc_local *, struct sockaddr_rxrpc *,
- struct sk_buff *);
+int rxrpc_extract_addr_from_skb(struct sockaddr_rxrpc *, struct sk_buff *);
static inline bool before(u32 seq1, u32 seq2)
{
diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c
index 9d1e298b784c..44860505246d 100644
--- a/net/rxrpc/call_accept.c
+++ b/net/rxrpc/call_accept.c
@@ -249,11 +249,11 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx)
*/
static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx,
struct rxrpc_local *local,
+ struct rxrpc_peer *peer,
struct rxrpc_connection *conn,
struct sk_buff *skb)
{
struct rxrpc_backlog *b = rx->backlog;
- struct rxrpc_peer *peer, *xpeer;
struct rxrpc_call *call;
unsigned short call_head, conn_head, peer_head;
unsigned short call_tail, conn_tail, peer_tail;
@@ -276,21 +276,18 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx,
return NULL;
if (!conn) {
- /* No connection. We're going to need a peer to start off
- * with. If one doesn't yet exist, use a spare from the
- * preallocation set. We dump the address into the spare in
- * anticipation - and to save on stack space.
- */
- xpeer = b->peer_backlog[peer_tail];
- if (rxrpc_extract_addr_from_skb(local, &xpeer->srx, skb) < 0)
- return NULL;
-
- peer = rxrpc_lookup_incoming_peer(local, xpeer);
- if (peer == xpeer) {
+ if (peer && !rxrpc_get_peer_maybe(peer))
+ peer = NULL;
+ if (!peer) {
+ peer = b->peer_backlog[peer_tail];
+ if (rxrpc_extract_addr_from_skb(&peer->srx, skb) < 0)
+ return NULL;
b->peer_backlog[peer_tail] = NULL;
smp_store_release(&b->peer_backlog_tail,
(peer_tail + 1) &
(RXRPC_BACKLOG_MAX - 1));
+
+ rxrpc_new_incoming_peer(rx, local, peer);
}
/* Now allocate and set up the connection */
@@ -335,45 +332,38 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx,
* The call is returned with the user access mutex held.
*/
struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local,
- struct rxrpc_connection *conn,
+ struct rxrpc_sock *rx,
struct sk_buff *skb)
{
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
- struct rxrpc_sock *rx;
+ struct rxrpc_connection *conn;
+ struct rxrpc_peer *peer = NULL;
struct rxrpc_call *call;
- u16 service_id = sp->hdr.serviceId;
_enter("");
- /* Get the socket providing the service */
- rx = rcu_dereference(local->service);
- if (rx && (service_id == rx->srx.srx_service ||
- service_id == rx->second_service))
- goto found_service;
-
- trace_rxrpc_abort(0, "INV", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq,
- RX_INVALID_OPERATION, EOPNOTSUPP);
- skb->mark = RXRPC_SKB_MARK_LOCAL_ABORT;
- skb->priority = RX_INVALID_OPERATION;
- _leave(" = NULL [service]");
- return NULL;
-
-found_service:
spin_lock(&rx->incoming_lock);
if (rx->sk.sk_state == RXRPC_SERVER_LISTEN_DISABLED ||
rx->sk.sk_state == RXRPC_CLOSE) {
trace_rxrpc_abort(0, "CLS", sp->hdr.cid, sp->hdr.callNumber,
sp->hdr.seq, RX_INVALID_OPERATION, ESHUTDOWN);
- skb->mark = RXRPC_SKB_MARK_LOCAL_ABORT;
+ skb->mark = RXRPC_SKB_MARK_REJECT_ABORT;
skb->priority = RX_INVALID_OPERATION;
_leave(" = NULL [close]");
call = NULL;
goto out;
}
- call = rxrpc_alloc_incoming_call(rx, local, conn, skb);
+ /* The peer, connection and call may all have sprung into existence due
+ * to a duplicate packet being handled on another CPU in parallel, so
+ * we have to recheck the routing. However, we're now holding
+ * rx->incoming_lock, so the values should remain stable.
+ */
+ conn = rxrpc_find_connection_rcu(local, skb, &peer);
+
+ call = rxrpc_alloc_incoming_call(rx, local, peer, conn, skb);
if (!call) {
- skb->mark = RXRPC_SKB_MARK_BUSY;
+ skb->mark = RXRPC_SKB_MARK_REJECT_BUSY;
_leave(" = NULL [busy]");
call = NULL;
goto out;
@@ -413,20 +403,22 @@ found_service:
case RXRPC_CONN_SERVICE:
write_lock(&call->state_lock);
- if (rx->discard_new_call)
- call->state = RXRPC_CALL_SERVER_RECV_REQUEST;
- else
- call->state = RXRPC_CALL_SERVER_ACCEPTING;
+ if (call->state < RXRPC_CALL_COMPLETE) {
+ if (rx->discard_new_call)
+ call->state = RXRPC_CALL_SERVER_RECV_REQUEST;
+ else
+ call->state = RXRPC_CALL_SERVER_ACCEPTING;
+ }
write_unlock(&call->state_lock);
break;
case RXRPC_CONN_REMOTELY_ABORTED:
rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED,
- conn->remote_abort, -ECONNABORTED);
+ conn->abort_code, conn->error);
break;
case RXRPC_CONN_LOCALLY_ABORTED:
rxrpc_abort_call("CON", call, sp->hdr.seq,
- conn->local_abort, -ECONNABORTED);
+ conn->abort_code, conn->error);
break;
default:
BUG();
diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c
index 20210418904b..8e7434e92097 100644
--- a/net/rxrpc/call_event.c
+++ b/net/rxrpc/call_event.c
@@ -162,7 +162,6 @@ static void rxrpc_congestion_timeout(struct rxrpc_call *call)
*/
static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j)
{
- struct rxrpc_skb_priv *sp;
struct sk_buff *skb;
unsigned long resend_at;
rxrpc_seq_t cursor, seq, top;
@@ -207,7 +206,6 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j)
skb = call->rxtx_buffer[ix];
rxrpc_see_skb(skb, rxrpc_skb_tx_seen);
- sp = rxrpc_skb(skb);
if (anno_type == RXRPC_TX_ANNO_UNACK) {
if (ktime_after(skb->tstamp, max_age)) {
diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
index 9486293fef5c..8f1a8f85b1f9 100644
--- a/net/rxrpc/call_object.c
+++ b/net/rxrpc/call_object.c
@@ -138,6 +138,7 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp,
init_waitqueue_head(&call->waitq);
spin_lock_init(&call->lock);
spin_lock_init(&call->notify_lock);
+ spin_lock_init(&call->input_lock);
rwlock_init(&call->state_lock);
atomic_set(&call->usage, 1);
call->debug_id = debug_id;
@@ -287,7 +288,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
/* Set up or get a connection record and set the protocol parameters,
* including channel number and call ID.
*/
- ret = rxrpc_connect_call(call, cp, srx, gfp);
+ ret = rxrpc_connect_call(rx, call, cp, srx, gfp);
if (ret < 0)
goto error;
@@ -339,7 +340,7 @@ int rxrpc_retry_client_call(struct rxrpc_sock *rx,
/* Set up or get a connection record and set the protocol parameters,
* including channel number and call ID.
*/
- ret = rxrpc_connect_call(call, cp, srx, gfp);
+ ret = rxrpc_connect_call(rx, call, cp, srx, gfp);
if (ret < 0)
goto error;
@@ -400,7 +401,7 @@ void rxrpc_incoming_call(struct rxrpc_sock *rx,
rcu_assign_pointer(conn->channels[chan].call, call);
spin_lock(&conn->params.peer->lock);
- hlist_add_head(&call->error_link, &conn->params.peer->error_targets);
+ hlist_add_head_rcu(&call->error_link, &conn->params.peer->error_targets);
spin_unlock(&conn->params.peer->lock);
_net("CALL incoming %d on CONN %d", call->debug_id, call->conn->debug_id);
diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c
index 5736f643c516..521189f4b666 100644
--- a/net/rxrpc/conn_client.c
+++ b/net/rxrpc/conn_client.c
@@ -276,7 +276,8 @@ dont_reuse:
* If we return with a connection, the call will be on its waiting list. It's
* left to the caller to assign a channel and wake up the call.
*/
-static int rxrpc_get_client_conn(struct rxrpc_call *call,
+static int rxrpc_get_client_conn(struct rxrpc_sock *rx,
+ struct rxrpc_call *call,
struct rxrpc_conn_parameters *cp,
struct sockaddr_rxrpc *srx,
gfp_t gfp)
@@ -289,7 +290,7 @@ static int rxrpc_get_client_conn(struct rxrpc_call *call,
_enter("{%d,%lx},", call->debug_id, call->user_call_ID);
- cp->peer = rxrpc_lookup_peer(cp->local, srx, gfp);
+ cp->peer = rxrpc_lookup_peer(rx, cp->local, srx, gfp);
if (!cp->peer)
goto error;
@@ -590,6 +591,7 @@ static void rxrpc_activate_one_channel(struct rxrpc_connection *conn,
*/
smp_wmb();
chan->call_id = call_id;
+ chan->call_debug_id = call->debug_id;
rcu_assign_pointer(chan->call, call);
wake_up(&call->waitq);
}
@@ -682,7 +684,8 @@ out:
* find a connection for a call
* - called in process context with IRQs enabled
*/
-int rxrpc_connect_call(struct rxrpc_call *call,
+int rxrpc_connect_call(struct rxrpc_sock *rx,
+ struct rxrpc_call *call,
struct rxrpc_conn_parameters *cp,
struct sockaddr_rxrpc *srx,
gfp_t gfp)
@@ -695,7 +698,7 @@ int rxrpc_connect_call(struct rxrpc_call *call,
rxrpc_discard_expired_client_conns(&rxnet->client_conn_reaper);
rxrpc_cull_active_client_conns(rxnet);
- ret = rxrpc_get_client_conn(call, cp, srx, gfp);
+ ret = rxrpc_get_client_conn(rx, call, cp, srx, gfp);
if (ret < 0)
goto out;
@@ -709,8 +712,8 @@ int rxrpc_connect_call(struct rxrpc_call *call,
}
spin_lock_bh(&call->conn->params.peer->lock);
- hlist_add_head(&call->error_link,
- &call->conn->params.peer->error_targets);
+ hlist_add_head_rcu(&call->error_link,
+ &call->conn->params.peer->error_targets);
spin_unlock_bh(&call->conn->params.peer->lock);
out:
@@ -1051,7 +1054,6 @@ void rxrpc_discard_expired_client_conns(struct work_struct *work)
container_of(work, struct rxrpc_net, client_conn_reaper);
unsigned long expiry, conn_expires_at, now;
unsigned int nr_conns;
- bool did_discard = false;
_enter("");
@@ -1113,7 +1115,6 @@ next:
* If someone re-sets the flag and re-gets the ref, that's fine.
*/
rxrpc_put_connection(conn);
- did_discard = true;
nr_conns--;
goto next;
diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c
index 3fde001fcc39..b6fca8ebb117 100644
--- a/net/rxrpc/conn_event.c
+++ b/net/rxrpc/conn_event.c
@@ -126,11 +126,13 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
switch (chan->last_type) {
case RXRPC_PACKET_TYPE_ABORT:
- _proto("Tx ABORT %%%u { %d } [re]", serial, conn->local_abort);
+ _proto("Tx ABORT %%%u { %d } [re]", serial, conn->abort_code);
break;
case RXRPC_PACKET_TYPE_ACK:
- trace_rxrpc_tx_ack(NULL, serial, chan->last_seq, 0,
- RXRPC_ACK_DUPLICATE, 0);
+ trace_rxrpc_tx_ack(chan->call_debug_id, serial,
+ ntohl(pkt.ack.firstPacket),
+ ntohl(pkt.ack.serial),
+ pkt.ack.reason, 0);
_proto("Tx ACK %%%u [re]", serial);
break;
}
@@ -138,8 +140,11 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, ioc, len);
conn->params.peer->last_tx_at = ktime_get_seconds();
if (ret < 0)
- trace_rxrpc_tx_fail(conn->debug_id, serial, ret,
- rxrpc_tx_fail_call_final_resend);
+ trace_rxrpc_tx_fail(chan->call_debug_id, serial, ret,
+ rxrpc_tx_point_call_final_resend);
+ else
+ trace_rxrpc_tx_packet(chan->call_debug_id, &pkt.whdr,
+ rxrpc_tx_point_call_final_resend);
_leave("");
}
@@ -148,13 +153,12 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
* pass a connection-level abort onto all calls on that connection
*/
static void rxrpc_abort_calls(struct rxrpc_connection *conn,
- enum rxrpc_call_completion compl,
- u32 abort_code, int error)
+ enum rxrpc_call_completion compl)
{
struct rxrpc_call *call;
int i;
- _enter("{%d},%x", conn->debug_id, abort_code);
+ _enter("{%d},%x", conn->debug_id, conn->abort_code);
spin_lock(&conn->channel_lock);
@@ -167,9 +171,11 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn,
trace_rxrpc_abort(call->debug_id,
"CON", call->cid,
call->call_id, 0,
- abort_code, error);
+ conn->abort_code,
+ conn->error);
if (rxrpc_set_call_completion(call, compl,
- abort_code, error))
+ conn->abort_code,
+ conn->error))
rxrpc_notify_socket(call);
}
}
@@ -202,10 +208,12 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn,
return 0;
}
+ conn->error = error;
+ conn->abort_code = abort_code;
conn->state = RXRPC_CONN_LOCALLY_ABORTED;
spin_unlock_bh(&conn->state_lock);
- rxrpc_abort_calls(conn, RXRPC_CALL_LOCALLY_ABORTED, abort_code, error);
+ rxrpc_abort_calls(conn, RXRPC_CALL_LOCALLY_ABORTED);
msg.msg_name = &conn->params.peer->srx.transport;
msg.msg_namelen = conn->params.peer->srx.transport_len;
@@ -224,7 +232,7 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn,
whdr._rsvd = 0;
whdr.serviceId = htons(conn->service_id);
- word = htonl(conn->local_abort);
+ word = htonl(conn->abort_code);
iov[0].iov_base = &whdr;
iov[0].iov_len = sizeof(whdr);
@@ -235,16 +243,18 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn,
serial = atomic_inc_return(&conn->serial);
whdr.serial = htonl(serial);
- _proto("Tx CONN ABORT %%%u { %d }", serial, conn->local_abort);
+ _proto("Tx CONN ABORT %%%u { %d }", serial, conn->abort_code);
ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len);
if (ret < 0) {
trace_rxrpc_tx_fail(conn->debug_id, serial, ret,
- rxrpc_tx_fail_conn_abort);
+ rxrpc_tx_point_conn_abort);
_debug("sendmsg failed: %d", ret);
return -EAGAIN;
}
+ trace_rxrpc_tx_packet(conn->debug_id, &whdr, rxrpc_tx_point_conn_abort);
+
conn->params.peer->last_tx_at = ktime_get_seconds();
_leave(" = 0");
@@ -308,9 +318,10 @@ static int rxrpc_process_event(struct rxrpc_connection *conn,
abort_code = ntohl(wtmp);
_proto("Rx ABORT %%%u { ac=%d }", sp->hdr.serial, abort_code);
+ conn->error = -ECONNABORTED;
+ conn->abort_code = abort_code;
conn->state = RXRPC_CONN_REMOTELY_ABORTED;
- rxrpc_abort_calls(conn, RXRPC_CALL_REMOTELY_ABORTED,
- abort_code, -ECONNABORTED);
+ rxrpc_abort_calls(conn, RXRPC_CALL_REMOTELY_ABORTED);
return -ECONNABORTED;
case RXRPC_PACKET_TYPE_CHALLENGE:
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
index 77440a356b14..c332722820c2 100644
--- a/net/rxrpc/conn_object.c
+++ b/net/rxrpc/conn_object.c
@@ -69,10 +69,14 @@ struct rxrpc_connection *rxrpc_alloc_connection(gfp_t gfp)
* If successful, a pointer to the connection is returned, but no ref is taken.
* NULL is returned if there is no match.
*
+ * When searching for a service call, if we find a peer but no connection, we
+ * return that through *_peer in case we need to create a new service call.
+ *
* The caller must be holding the RCU read lock.
*/
struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *local,
- struct sk_buff *skb)
+ struct sk_buff *skb,
+ struct rxrpc_peer **_peer)
{
struct rxrpc_connection *conn;
struct rxrpc_conn_proto k;
@@ -82,14 +86,12 @@ struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *local,
_enter(",%x", sp->hdr.cid & RXRPC_CIDMASK);
- if (rxrpc_extract_addr_from_skb(local, &srx, skb) < 0)
+ if (rxrpc_extract_addr_from_skb(&srx, skb) < 0)
goto not_found;
- k.epoch = sp->hdr.epoch;
- k.cid = sp->hdr.cid & RXRPC_CIDMASK;
-
- /* We may have to handle mixing IPv4 and IPv6 */
- if (srx.transport.family != local->srx.transport.family) {
+ if (srx.transport.family != local->srx.transport.family &&
+ (srx.transport.family == AF_INET &&
+ local->srx.transport.family != AF_INET6)) {
pr_warn_ratelimited("AF_RXRPC: Protocol mismatch %u not %u\n",
srx.transport.family,
local->srx.transport.family);
@@ -99,7 +101,7 @@ struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *local,
k.epoch = sp->hdr.epoch;
k.cid = sp->hdr.cid & RXRPC_CIDMASK;
- if (sp->hdr.flags & RXRPC_CLIENT_INITIATED) {
+ if (rxrpc_to_server(sp)) {
/* We need to look up service connections by the full protocol
* parameter set. We look up the peer first as an intermediate
* step and then the connection from the peer's tree.
@@ -107,6 +109,7 @@ struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *local,
peer = rxrpc_lookup_peer_rcu(local, &srx);
if (!peer)
goto not_found;
+ *_peer = peer;
conn = rxrpc_find_service_conn_rcu(peer, skb);
if (!conn || atomic_read(&conn->usage) == 0)
goto not_found;
@@ -214,7 +217,7 @@ void rxrpc_disconnect_call(struct rxrpc_call *call)
call->peer->cong_cwnd = call->cong_cwnd;
spin_lock_bh(&conn->params.peer->lock);
- hlist_del_init(&call->error_link);
+ hlist_del_rcu(&call->error_link);
spin_unlock_bh(&conn->params.peer->lock);
if (rxrpc_is_client_call(call))
diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index 608d078a4981..9128aa0e40aa 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -216,10 +216,11 @@ static void rxrpc_send_ping(struct rxrpc_call *call, struct sk_buff *skb,
/*
* Apply a hard ACK by advancing the Tx window.
*/
-static void rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to,
+static bool rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to,
struct rxrpc_ack_summary *summary)
{
struct sk_buff *skb, *list = NULL;
+ bool rot_last = false;
int ix;
u8 annotation;
@@ -243,15 +244,17 @@ static void rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to,
skb->next = list;
list = skb;
- if (annotation & RXRPC_TX_ANNO_LAST)
+ if (annotation & RXRPC_TX_ANNO_LAST) {
set_bit(RXRPC_CALL_TX_LAST, &call->flags);
+ rot_last = true;
+ }
if ((annotation & RXRPC_TX_ANNO_MASK) != RXRPC_TX_ANNO_ACK)
summary->nr_rot_new_acks++;
}
spin_unlock(&call->lock);
- trace_rxrpc_transmit(call, (test_bit(RXRPC_CALL_TX_LAST, &call->flags) ?
+ trace_rxrpc_transmit(call, (rot_last ?
rxrpc_transmit_rotate_last :
rxrpc_transmit_rotate));
wake_up(&call->waitq);
@@ -259,9 +262,11 @@ static void rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to,
while (list) {
skb = list;
list = skb->next;
- skb->next = NULL;
+ skb_mark_not_on_list(skb);
rxrpc_free_skb(skb, rxrpc_skb_tx_freed);
}
+
+ return rot_last;
}
/*
@@ -273,23 +278,26 @@ static void rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to,
static bool rxrpc_end_tx_phase(struct rxrpc_call *call, bool reply_begun,
const char *abort_why)
{
+ unsigned int state;
ASSERT(test_bit(RXRPC_CALL_TX_LAST, &call->flags));
write_lock(&call->state_lock);
- switch (call->state) {
+ state = call->state;
+ switch (state) {
case RXRPC_CALL_CLIENT_SEND_REQUEST:
case RXRPC_CALL_CLIENT_AWAIT_REPLY:
if (reply_begun)
- call->state = RXRPC_CALL_CLIENT_RECV_REPLY;
+ call->state = state = RXRPC_CALL_CLIENT_RECV_REPLY;
else
- call->state = RXRPC_CALL_CLIENT_AWAIT_REPLY;
+ call->state = state = RXRPC_CALL_CLIENT_AWAIT_REPLY;
break;
case RXRPC_CALL_SERVER_AWAIT_ACK:
__rxrpc_call_completed(call);
rxrpc_notify_socket(call);
+ state = call->state;
break;
default:
@@ -297,11 +305,10 @@ static bool rxrpc_end_tx_phase(struct rxrpc_call *call, bool reply_begun,
}
write_unlock(&call->state_lock);
- if (call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY) {
+ if (state == RXRPC_CALL_CLIENT_AWAIT_REPLY)
trace_rxrpc_transmit(call, rxrpc_transmit_await_reply);
- } else {
+ else
trace_rxrpc_transmit(call, rxrpc_transmit_end);
- }
_leave(" = ok");
return true;
@@ -332,11 +339,11 @@ static bool rxrpc_receiving_reply(struct rxrpc_call *call)
trace_rxrpc_timer(call, rxrpc_timer_init_for_reply, now);
}
- if (!test_bit(RXRPC_CALL_TX_LAST, &call->flags))
- rxrpc_rotate_tx_window(call, top, &summary);
if (!test_bit(RXRPC_CALL_TX_LAST, &call->flags)) {
- rxrpc_proto_abort("TXL", call, top);
- return false;
+ if (!rxrpc_rotate_tx_window(call, top, &summary)) {
+ rxrpc_proto_abort("TXL", call, top);
+ return false;
+ }
}
if (!rxrpc_end_tx_phase(call, true, "ETD"))
return false;
@@ -452,13 +459,15 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb,
}
}
+ spin_lock(&call->input_lock);
+
/* Received data implicitly ACKs all of the request packets we sent
* when we're acting as a client.
*/
if ((state == RXRPC_CALL_CLIENT_SEND_REQUEST ||
state == RXRPC_CALL_CLIENT_AWAIT_REPLY) &&
!rxrpc_receiving_reply(call))
- return;
+ goto unlock;
call->ackr_prev_seq = seq;
@@ -488,15 +497,19 @@ next_subpacket:
if (flags & RXRPC_LAST_PACKET) {
if (test_bit(RXRPC_CALL_RX_LAST, &call->flags) &&
- seq != call->rx_top)
- return rxrpc_proto_abort("LSN", call, seq);
+ seq != call->rx_top) {
+ rxrpc_proto_abort("LSN", call, seq);
+ goto unlock;
+ }
} else {
if (test_bit(RXRPC_CALL_RX_LAST, &call->flags) &&
- after_eq(seq, call->rx_top))
- return rxrpc_proto_abort("LSA", call, seq);
+ after_eq(seq, call->rx_top)) {
+ rxrpc_proto_abort("LSA", call, seq);
+ goto unlock;
+ }
}
- trace_rxrpc_rx_data(call, seq, serial, flags, annotation);
+ trace_rxrpc_rx_data(call->debug_id, seq, serial, flags, annotation);
if (before_eq(seq, hard_ack)) {
ack = RXRPC_ACK_DUPLICATE;
ack_serial = serial;
@@ -560,8 +573,10 @@ next_subpacket:
skip:
offset += len;
if (flags & RXRPC_JUMBO_PACKET) {
- if (skb_copy_bits(skb, offset, &flags, 1) < 0)
- return rxrpc_proto_abort("XJF", call, seq);
+ if (skb_copy_bits(skb, offset, &flags, 1) < 0) {
+ rxrpc_proto_abort("XJF", call, seq);
+ goto unlock;
+ }
offset += sizeof(struct rxrpc_jumbo_header);
seq++;
serial++;
@@ -592,9 +607,18 @@ ack:
rxrpc_propose_ACK(call, ack, skew, ack_serial,
immediate_ack, true,
rxrpc_propose_ack_input_data);
+ else
+ rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, skew, serial,
+ false, true,
+ rxrpc_propose_ack_input_data);
- if (sp->hdr.seq == READ_ONCE(call->rx_hard_ack) + 1)
+ if (sp->hdr.seq == READ_ONCE(call->rx_hard_ack) + 1) {
+ trace_rxrpc_notify_socket(call->debug_id, serial);
rxrpc_notify_socket(call);
+ }
+
+unlock:
+ spin_unlock(&call->input_lock);
_leave(" [queued]");
}
@@ -616,13 +640,14 @@ static void rxrpc_input_requested_ack(struct rxrpc_call *call,
if (!skb)
continue;
+ sent_at = skb->tstamp;
+ smp_rmb(); /* Read timestamp before serial. */
sp = rxrpc_skb(skb);
if (sp->hdr.serial != orig_serial)
continue;
- smp_rmb();
- sent_at = skb->tstamp;
goto found;
}
+
return;
found:
@@ -680,15 +705,14 @@ static void rxrpc_input_ping_response(struct rxrpc_call *call,
ping_time = call->ping_time;
smp_rmb();
- ping_serial = call->ping_serial;
+ ping_serial = READ_ONCE(call->ping_serial);
if (orig_serial == call->acks_lost_ping)
rxrpc_input_check_for_lost_ack(call);
- if (!test_bit(RXRPC_CALL_PINGING, &call->flags) ||
- before(orig_serial, ping_serial))
+ if (before(orig_serial, ping_serial) ||
+ !test_and_clear_bit(RXRPC_CALL_PINGING, &call->flags))
return;
- clear_bit(RXRPC_CALL_PINGING, &call->flags);
if (after(orig_serial, ping_serial))
return;
@@ -854,15 +878,32 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb,
rxrpc_propose_ack_respond_to_ack);
}
+ /* Discard any out-of-order or duplicate ACKs. */
+ if (before_eq(sp->hdr.serial, call->acks_latest))
+ return;
+
+ buf.info.rxMTU = 0;
ioffset = offset + nr_acks + 3;
- if (skb->len >= ioffset + sizeof(buf.info)) {
- if (skb_copy_bits(skb, ioffset, &buf.info, sizeof(buf.info)) < 0)
- return rxrpc_proto_abort("XAI", call, 0);
+ if (skb->len >= ioffset + sizeof(buf.info) &&
+ skb_copy_bits(skb, ioffset, &buf.info, sizeof(buf.info)) < 0)
+ return rxrpc_proto_abort("XAI", call, 0);
+
+ spin_lock(&call->input_lock);
+
+ /* Discard any out-of-order or duplicate ACKs. */
+ if (before_eq(sp->hdr.serial, call->acks_latest))
+ goto out;
+ call->acks_latest_ts = skb->tstamp;
+ call->acks_latest = sp->hdr.serial;
+
+ /* Parse rwind and mtu sizes if provided. */
+ if (buf.info.rxMTU)
rxrpc_input_ackinfo(call, skb, &buf.info);
- }
- if (first_soft_ack == 0)
- return rxrpc_proto_abort("AK0", call, 0);
+ if (first_soft_ack == 0) {
+ rxrpc_proto_abort("AK0", call, 0);
+ goto out;
+ }
/* Ignore ACKs unless we are or have just been transmitting. */
switch (READ_ONCE(call->state)) {
@@ -872,39 +913,35 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb,
case RXRPC_CALL_SERVER_AWAIT_ACK:
break;
default:
- return;
- }
-
- /* Discard any out-of-order or duplicate ACKs. */
- if (before_eq(sp->hdr.serial, call->acks_latest)) {
- _debug("discard ACK %d <= %d",
- sp->hdr.serial, call->acks_latest);
- return;
+ goto out;
}
- call->acks_latest_ts = skb->tstamp;
- call->acks_latest = sp->hdr.serial;
if (before(hard_ack, call->tx_hard_ack) ||
- after(hard_ack, call->tx_top))
- return rxrpc_proto_abort("AKW", call, 0);
- if (nr_acks > call->tx_top - hard_ack)
- return rxrpc_proto_abort("AKN", call, 0);
+ after(hard_ack, call->tx_top)) {
+ rxrpc_proto_abort("AKW", call, 0);
+ goto out;
+ }
+ if (nr_acks > call->tx_top - hard_ack) {
+ rxrpc_proto_abort("AKN", call, 0);
+ goto out;
+ }
- if (after(hard_ack, call->tx_hard_ack))
- rxrpc_rotate_tx_window(call, hard_ack, &summary);
+ if (after(hard_ack, call->tx_hard_ack)) {
+ if (rxrpc_rotate_tx_window(call, hard_ack, &summary)) {
+ rxrpc_end_tx_phase(call, false, "ETA");
+ goto out;
+ }
+ }
if (nr_acks > 0) {
- if (skb_copy_bits(skb, offset, buf.acks, nr_acks) < 0)
- return rxrpc_proto_abort("XSA", call, 0);
+ if (skb_copy_bits(skb, offset, buf.acks, nr_acks) < 0) {
+ rxrpc_proto_abort("XSA", call, 0);
+ goto out;
+ }
rxrpc_input_soft_acks(call, buf.acks, first_soft_ack, nr_acks,
&summary);
}
- if (test_bit(RXRPC_CALL_TX_LAST, &call->flags)) {
- rxrpc_end_tx_phase(call, false, "ETA");
- return;
- }
-
if (call->rxtx_annotations[call->tx_top & RXRPC_RXTX_BUFF_MASK] &
RXRPC_TX_ANNO_LAST &&
summary.nr_acks == call->tx_top - hard_ack &&
@@ -913,7 +950,9 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb,
false, true,
rxrpc_propose_ack_ping_for_lost_reply);
- return rxrpc_congestion_management(call, skb, &summary, acked_serial);
+ rxrpc_congestion_management(call, skb, &summary, acked_serial);
+out:
+ spin_unlock(&call->input_lock);
}
/*
@@ -926,9 +965,12 @@ static void rxrpc_input_ackall(struct rxrpc_call *call, struct sk_buff *skb)
_proto("Rx ACKALL %%%u", sp->hdr.serial);
- rxrpc_rotate_tx_window(call, call->tx_top, &summary);
- if (test_bit(RXRPC_CALL_TX_LAST, &call->flags))
+ spin_lock(&call->input_lock);
+
+ if (rxrpc_rotate_tx_window(call, call->tx_top, &summary))
rxrpc_end_tx_phase(call, false, "ETL");
+
+ spin_unlock(&call->input_lock);
}
/*
@@ -1011,18 +1053,19 @@ static void rxrpc_input_call_packet(struct rxrpc_call *call,
}
/*
- * Handle a new call on a channel implicitly completing the preceding call on
- * that channel.
+ * Handle a new service call on a channel implicitly completing the preceding
+ * call on that channel. This does not apply to client conns.
*
* TODO: If callNumber > call_id + 1, renegotiate security.
*/
-static void rxrpc_input_implicit_end_call(struct rxrpc_connection *conn,
+static void rxrpc_input_implicit_end_call(struct rxrpc_sock *rx,
+ struct rxrpc_connection *conn,
struct rxrpc_call *call)
{
switch (READ_ONCE(call->state)) {
case RXRPC_CALL_SERVER_AWAIT_ACK:
rxrpc_call_completed(call);
- break;
+ /* Fall through */
case RXRPC_CALL_COMPLETE:
break;
default:
@@ -1030,11 +1073,13 @@ static void rxrpc_input_implicit_end_call(struct rxrpc_connection *conn,
set_bit(RXRPC_CALL_EV_ABORT, &call->events);
rxrpc_queue_call(call);
}
+ trace_rxrpc_improper_term(call);
break;
}
- trace_rxrpc_improper_term(call);
+ spin_lock(&rx->incoming_lock);
__rxrpc_disconnect_call(conn, call);
+ spin_unlock(&rx->incoming_lock);
rxrpc_notify_socket(call);
}
@@ -1113,43 +1158,29 @@ int rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb)
* The socket is locked by the caller and this prevents the socket from being
* shut down and the local endpoint from going away, thus sk_user_data will not
* be cleared until this function returns.
+ *
+ * Called with the RCU read lock held from the IP layer via UDP.
*/
-void rxrpc_data_ready(struct sock *udp_sk)
+int rxrpc_input_packet(struct sock *udp_sk, struct sk_buff *skb)
{
struct rxrpc_connection *conn;
struct rxrpc_channel *chan;
- struct rxrpc_call *call;
+ struct rxrpc_call *call = NULL;
struct rxrpc_skb_priv *sp;
struct rxrpc_local *local = udp_sk->sk_user_data;
- struct sk_buff *skb;
+ struct rxrpc_peer *peer = NULL;
+ struct rxrpc_sock *rx = NULL;
unsigned int channel;
- int ret, skew;
+ int skew = 0;
_enter("%p", udp_sk);
- ASSERT(!irqs_disabled());
-
- skb = skb_recv_udp(udp_sk, 0, 1, &ret);
- if (!skb) {
- if (ret == -EAGAIN)
- return;
- _debug("UDP socket error %d", ret);
- return;
- }
+ if (skb->tstamp == 0)
+ skb->tstamp = ktime_get_real();
rxrpc_new_skb(skb, rxrpc_skb_rx_received);
- _net("recv skb %p", skb);
-
- /* we'll probably need to checksum it (didn't call sock_recvmsg) */
- if (skb_checksum_complete(skb)) {
- rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
- __UDP_INC_STATS(&init_net, UDP_MIB_INERRORS, 0);
- _leave(" [CSUM failed]");
- return;
- }
-
- __UDP_INC_STATS(&init_net, UDP_MIB_INDATAGRAMS, 0);
+ skb_pull(skb, sizeof(struct udphdr));
/* The UDP protocol already released all skb resources;
* we are free to add our own data there.
@@ -1164,69 +1195,104 @@ void rxrpc_data_ready(struct sock *udp_sk)
static int lose;
if ((lose++ & 7) == 7) {
trace_rxrpc_rx_lose(sp);
- rxrpc_lose_skb(skb, rxrpc_skb_rx_lost);
- return;
+ rxrpc_free_skb(skb, rxrpc_skb_rx_lost);
+ return 0;
}
}
+ if (skb->tstamp == 0)
+ skb->tstamp = ktime_get_real();
trace_rxrpc_rx_packet(sp);
- _net("Rx RxRPC %s ep=%x call=%x:%x",
- sp->hdr.flags & RXRPC_CLIENT_INITIATED ? "ToServer" : "ToClient",
- sp->hdr.epoch, sp->hdr.cid, sp->hdr.callNumber);
-
- if (sp->hdr.type >= RXRPC_N_PACKET_TYPES ||
- !((RXRPC_SUPPORTED_PACKET_TYPES >> sp->hdr.type) & 1)) {
- _proto("Rx Bad Packet Type %u", sp->hdr.type);
- goto bad_message;
- }
-
switch (sp->hdr.type) {
case RXRPC_PACKET_TYPE_VERSION:
- if (!(sp->hdr.flags & RXRPC_CLIENT_INITIATED))
+ if (rxrpc_to_client(sp))
goto discard;
rxrpc_post_packet_to_local(local, skb);
goto out;
case RXRPC_PACKET_TYPE_BUSY:
- if (sp->hdr.flags & RXRPC_CLIENT_INITIATED)
+ if (rxrpc_to_server(sp))
goto discard;
/* Fall through */
+ case RXRPC_PACKET_TYPE_ACK:
+ case RXRPC_PACKET_TYPE_ACKALL:
+ if (sp->hdr.callNumber == 0)
+ goto bad_message;
+ /* Fall through */
+ case RXRPC_PACKET_TYPE_ABORT:
+ break;
case RXRPC_PACKET_TYPE_DATA:
- if (sp->hdr.callNumber == 0)
+ if (sp->hdr.callNumber == 0 ||
+ sp->hdr.seq == 0)
goto bad_message;
if (sp->hdr.flags & RXRPC_JUMBO_PACKET &&
!rxrpc_validate_jumbo(skb))
goto bad_message;
break;
+ case RXRPC_PACKET_TYPE_CHALLENGE:
+ if (rxrpc_to_server(sp))
+ goto discard;
+ break;
+ case RXRPC_PACKET_TYPE_RESPONSE:
+ if (rxrpc_to_client(sp))
+ goto discard;
+ break;
+
/* Packet types 9-11 should just be ignored. */
case RXRPC_PACKET_TYPE_PARAMS:
case RXRPC_PACKET_TYPE_10:
case RXRPC_PACKET_TYPE_11:
goto discard;
+
+ default:
+ _proto("Rx Bad Packet Type %u", sp->hdr.type);
+ goto bad_message;
}
- rcu_read_lock();
+ if (sp->hdr.serviceId == 0)
+ goto bad_message;
+
+ if (rxrpc_to_server(sp)) {
+ /* Weed out packets to services we're not offering. Packets
+ * that would begin a call are explicitly rejected and the rest
+ * are just discarded.
+ */
+ rx = rcu_dereference(local->service);
+ if (!rx || (sp->hdr.serviceId != rx->srx.srx_service &&
+ sp->hdr.serviceId != rx->second_service)) {
+ if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA &&
+ sp->hdr.seq == 1)
+ goto unsupported_service;
+ goto discard;
+ }
+ }
- conn = rxrpc_find_connection_rcu(local, skb);
+ conn = rxrpc_find_connection_rcu(local, skb, &peer);
if (conn) {
if (sp->hdr.securityIndex != conn->security_ix)
goto wrong_security;
if (sp->hdr.serviceId != conn->service_id) {
- if (!test_bit(RXRPC_CONN_PROBING_FOR_UPGRADE, &conn->flags) ||
- conn->service_id != conn->params.service_id)
+ int old_id;
+
+ if (!test_bit(RXRPC_CONN_PROBING_FOR_UPGRADE, &conn->flags))
+ goto reupgrade;
+ old_id = cmpxchg(&conn->service_id, conn->params.service_id,
+ sp->hdr.serviceId);
+
+ if (old_id != conn->params.service_id &&
+ old_id != sp->hdr.serviceId)
goto reupgrade;
- conn->service_id = sp->hdr.serviceId;
}
if (sp->hdr.callNumber == 0) {
/* Connection-level packet */
_debug("CONN %p {%d}", conn, conn->debug_id);
rxrpc_post_packet_to_conn(conn, skb);
- goto out_unlock;
+ goto out;
}
/* Note the serial number skew here */
@@ -1245,36 +1311,39 @@ void rxrpc_data_ready(struct sock *udp_sk)
/* Ignore really old calls */
if (sp->hdr.callNumber < chan->last_call)
- goto discard_unlock;
+ goto discard;
if (sp->hdr.callNumber == chan->last_call) {
if (chan->call ||
sp->hdr.type == RXRPC_PACKET_TYPE_ABORT)
- goto discard_unlock;
+ goto discard;
/* For the previous service call, if completed
* successfully, we discard all further packets.
*/
if (rxrpc_conn_is_service(conn) &&
chan->last_type == RXRPC_PACKET_TYPE_ACK)
- goto discard_unlock;
+ goto discard;
/* But otherwise we need to retransmit the final packet
* from data cached in the connection record.
*/
+ if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA)
+ trace_rxrpc_rx_data(chan->call_debug_id,
+ sp->hdr.seq,
+ sp->hdr.serial,
+ sp->hdr.flags, 0);
rxrpc_post_packet_to_conn(conn, skb);
- goto out_unlock;
+ goto out;
}
call = rcu_dereference(chan->call);
if (sp->hdr.callNumber > chan->call_id) {
- if (!(sp->hdr.flags & RXRPC_CLIENT_INITIATED)) {
- rcu_read_unlock();
+ if (rxrpc_to_client(sp))
goto reject_packet;
- }
if (call)
- rxrpc_input_implicit_end_call(conn, call);
+ rxrpc_input_implicit_end_call(rx, conn, call);
call = NULL;
}
@@ -1286,66 +1355,57 @@ void rxrpc_data_ready(struct sock *udp_sk)
if (!test_bit(RXRPC_CALL_RX_HEARD, &call->flags))
set_bit(RXRPC_CALL_RX_HEARD, &call->flags);
}
- } else {
- skew = 0;
- call = NULL;
}
if (!call || atomic_read(&call->usage) == 0) {
- if (!(sp->hdr.type & RXRPC_CLIENT_INITIATED) ||
- sp->hdr.callNumber == 0 ||
+ if (rxrpc_to_client(sp) ||
sp->hdr.type != RXRPC_PACKET_TYPE_DATA)
- goto bad_message_unlock;
+ goto bad_message;
if (sp->hdr.seq != 1)
- goto discard_unlock;
- call = rxrpc_new_incoming_call(local, conn, skb);
- if (!call) {
- rcu_read_unlock();
+ goto discard;
+ call = rxrpc_new_incoming_call(local, rx, skb);
+ if (!call)
goto reject_packet;
- }
rxrpc_send_ping(call, skb, skew);
mutex_unlock(&call->user_mutex);
}
rxrpc_input_call_packet(call, skb, skew);
- goto discard_unlock;
+ goto discard;
-discard_unlock:
- rcu_read_unlock();
discard:
rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
out:
trace_rxrpc_rx_done(0, 0);
- return;
-
-out_unlock:
- rcu_read_unlock();
- goto out;
+ return 0;
wrong_security:
- rcu_read_unlock();
trace_rxrpc_abort(0, "SEC", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq,
RXKADINCONSISTENCY, EBADMSG);
skb->priority = RXKADINCONSISTENCY;
goto post_abort;
+unsupported_service:
+ trace_rxrpc_abort(0, "INV", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq,
+ RX_INVALID_OPERATION, EOPNOTSUPP);
+ skb->priority = RX_INVALID_OPERATION;
+ goto post_abort;
+
reupgrade:
- rcu_read_unlock();
trace_rxrpc_abort(0, "UPG", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq,
RX_PROTOCOL_ERROR, EBADMSG);
goto protocol_error;
-bad_message_unlock:
- rcu_read_unlock();
bad_message:
trace_rxrpc_abort(0, "BAD", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq,
RX_PROTOCOL_ERROR, EBADMSG);
protocol_error:
skb->priority = RX_PROTOCOL_ERROR;
post_abort:
- skb->mark = RXRPC_SKB_MARK_LOCAL_ABORT;
+ skb->mark = RXRPC_SKB_MARK_REJECT_ABORT;
reject_packet:
trace_rxrpc_rx_done(skb->mark, skb->priority);
rxrpc_reject_packet(local, skb);
_leave(" [badmsg]");
+ return 0;
}
diff --git a/net/rxrpc/local_event.c b/net/rxrpc/local_event.c
index 8325f1b86840..927ead43df42 100644
--- a/net/rxrpc/local_event.c
+++ b/net/rxrpc/local_event.c
@@ -39,7 +39,7 @@ static void rxrpc_send_version_request(struct rxrpc_local *local,
_enter("");
- if (rxrpc_extract_addr_from_skb(local, &srx, skb) < 0)
+ if (rxrpc_extract_addr_from_skb(&srx, skb) < 0)
return;
msg.msg_name = &srx.transport;
@@ -72,7 +72,10 @@ static void rxrpc_send_version_request(struct rxrpc_local *local,
ret = kernel_sendmsg(local->socket, &msg, iov, 2, len);
if (ret < 0)
trace_rxrpc_tx_fail(local->debug_id, 0, ret,
- rxrpc_tx_fail_version_reply);
+ rxrpc_tx_point_version_reply);
+ else
+ trace_rxrpc_tx_packet(local->debug_id, &whdr,
+ rxrpc_tx_point_version_reply);
_leave("");
}
diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c
index 777c3ed4cfc0..0906e51d3cfb 100644
--- a/net/rxrpc/local_object.c
+++ b/net/rxrpc/local_object.c
@@ -19,6 +19,7 @@
#include <linux/ip.h>
#include <linux/hashtable.h>
#include <net/sock.h>
+#include <net/udp.h>
#include <net/af_rxrpc.h>
#include "ar-internal.h"
@@ -108,7 +109,7 @@ static struct rxrpc_local *rxrpc_alloc_local(struct rxrpc_net *rxnet,
*/
static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net)
{
- struct sock *sock;
+ struct sock *usk;
int ret, opt;
_enter("%p{%d,%d}",
@@ -122,6 +123,28 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net)
return ret;
}
+ /* set the socket up */
+ usk = local->socket->sk;
+ inet_sk(usk)->mc_loop = 0;
+
+ /* Enable CHECKSUM_UNNECESSARY to CHECKSUM_COMPLETE conversion */
+ inet_inc_convert_csum(usk);
+
+ rcu_assign_sk_user_data(usk, local);
+
+ udp_sk(usk)->encap_type = UDP_ENCAP_RXRPC;
+ udp_sk(usk)->encap_rcv = rxrpc_input_packet;
+ udp_sk(usk)->encap_destroy = NULL;
+ udp_sk(usk)->gro_receive = NULL;
+ udp_sk(usk)->gro_complete = NULL;
+
+ udp_encap_enable();
+#if IS_ENABLED(CONFIG_AF_RXRPC_IPV6)
+ if (local->srx.transport.family == AF_INET6)
+ udpv6_encap_enable();
+#endif
+ usk->sk_error_report = rxrpc_error_report;
+
/* if a local address was supplied then bind it */
if (local->srx.transport_len > sizeof(sa_family_t)) {
_debug("bind");
@@ -135,10 +158,10 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net)
}
switch (local->srx.transport.family) {
- case AF_INET:
- /* we want to receive ICMP errors */
+ case AF_INET6:
+ /* we want to receive ICMPv6 errors */
opt = 1;
- ret = kernel_setsockopt(local->socket, SOL_IP, IP_RECVERR,
+ ret = kernel_setsockopt(local->socket, SOL_IPV6, IPV6_RECVERR,
(char *) &opt, sizeof(opt));
if (ret < 0) {
_debug("setsockopt failed");
@@ -146,19 +169,22 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net)
}
/* we want to set the don't fragment bit */
- opt = IP_PMTUDISC_DO;
- ret = kernel_setsockopt(local->socket, SOL_IP, IP_MTU_DISCOVER,
+ opt = IPV6_PMTUDISC_DO;
+ ret = kernel_setsockopt(local->socket, SOL_IPV6, IPV6_MTU_DISCOVER,
(char *) &opt, sizeof(opt));
if (ret < 0) {
_debug("setsockopt failed");
goto error;
}
- break;
- case AF_INET6:
+ /* Fall through and set IPv4 options too otherwise we don't get
+ * errors from IPv4 packets sent through the IPv6 socket.
+ */
+
+ case AF_INET:
/* we want to receive ICMP errors */
opt = 1;
- ret = kernel_setsockopt(local->socket, SOL_IPV6, IPV6_RECVERR,
+ ret = kernel_setsockopt(local->socket, SOL_IP, IP_RECVERR,
(char *) &opt, sizeof(opt));
if (ret < 0) {
_debug("setsockopt failed");
@@ -166,24 +192,28 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net)
}
/* we want to set the don't fragment bit */
- opt = IPV6_PMTUDISC_DO;
- ret = kernel_setsockopt(local->socket, SOL_IPV6, IPV6_MTU_DISCOVER,
+ opt = IP_PMTUDISC_DO;
+ ret = kernel_setsockopt(local->socket, SOL_IP, IP_MTU_DISCOVER,
(char *) &opt, sizeof(opt));
if (ret < 0) {
_debug("setsockopt failed");
goto error;
}
+
+ /* We want receive timestamps. */
+ opt = 1;
+ ret = kernel_setsockopt(local->socket, SOL_SOCKET, SO_TIMESTAMPNS,
+ (char *)&opt, sizeof(opt));
+ if (ret < 0) {
+ _debug("setsockopt failed");
+ goto error;
+ }
break;
default:
BUG();
}
- /* set the socket up */
- sock = local->socket->sk;
- sock->sk_user_data = local;
- sock->sk_data_ready = rxrpc_data_ready;
- sock->sk_error_report = rxrpc_error_report;
_leave(" = 0");
return 0;
diff --git a/net/rxrpc/net_ns.c b/net/rxrpc/net_ns.c
index 417d80867c4f..fd7eba8467fa 100644
--- a/net/rxrpc/net_ns.c
+++ b/net/rxrpc/net_ns.c
@@ -102,6 +102,9 @@ static __net_init int rxrpc_init_net(struct net *net)
proc_create_net("conns", 0444, rxnet->proc_net,
&rxrpc_connection_seq_ops,
sizeof(struct seq_net_private));
+ proc_create_net("peers", 0444, rxnet->proc_net,
+ &rxrpc_peer_seq_ops,
+ sizeof(struct seq_net_private));
return 0;
err_proc:
diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c
index 4774c8f5634d..189418888839 100644
--- a/net/rxrpc/output.c
+++ b/net/rxrpc/output.c
@@ -124,7 +124,6 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping,
struct kvec iov[2];
rxrpc_serial_t serial;
rxrpc_seq_t hard_ack, top;
- ktime_t now;
size_t len, n;
int ret;
u8 reason;
@@ -183,7 +182,7 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping,
serial = atomic_inc_return(&conn->serial);
pkt->whdr.serial = htonl(serial);
- trace_rxrpc_tx_ack(call, serial,
+ trace_rxrpc_tx_ack(call->debug_id, serial,
ntohl(pkt->ack.firstPacket),
ntohl(pkt->ack.serial),
pkt->ack.reason, pkt->ack.nAcks);
@@ -196,9 +195,7 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping,
/* We need to stick a time in before we send the packet in case
* the reply gets back before kernel_sendmsg() completes - but
* asking UDP to send the packet can take a relatively long
- * time, so we update the time after, on the assumption that
- * the packet transmission is more likely to happen towards the
- * end of the kernel_sendmsg() call.
+ * time.
*/
call->ping_time = ktime_get_real();
set_bit(RXRPC_CALL_PINGING, &call->flags);
@@ -206,13 +203,13 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping,
}
ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len);
- now = ktime_get_real();
- if (ping)
- call->ping_time = now;
conn->params.peer->last_tx_at = ktime_get_seconds();
if (ret < 0)
trace_rxrpc_tx_fail(call->debug_id, serial, ret,
- rxrpc_tx_fail_call_ack);
+ rxrpc_tx_point_call_ack);
+ else
+ trace_rxrpc_tx_packet(call->debug_id, &pkt->whdr,
+ rxrpc_tx_point_call_ack);
if (call->state < RXRPC_CALL_COMPLETE) {
if (ret < 0) {
@@ -299,7 +296,10 @@ int rxrpc_send_abort_packet(struct rxrpc_call *call)
conn->params.peer->last_tx_at = ktime_get_seconds();
if (ret < 0)
trace_rxrpc_tx_fail(call->debug_id, serial, ret,
- rxrpc_tx_fail_call_abort);
+ rxrpc_tx_point_call_abort);
+ else
+ trace_rxrpc_tx_packet(call->debug_id, &pkt.whdr,
+ rxrpc_tx_point_call_abort);
rxrpc_put_connection(conn);
@@ -357,8 +357,14 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb,
/* If our RTT cache needs working on, request an ACK. Also request
* ACKs if a DATA packet appears to have been lost.
+ *
+ * However, we mustn't request an ACK on the last reply packet of a
+ * service call, lest OpenAFS incorrectly send us an ACK with some
+ * soft-ACKs in it and then never follow up with a proper hard ACK.
*/
- if (!(sp->hdr.flags & RXRPC_LAST_PACKET) &&
+ if ((!(sp->hdr.flags & RXRPC_LAST_PACKET) ||
+ rxrpc_to_server(sp)
+ ) &&
(test_and_clear_bit(RXRPC_CALL_EV_ACK_LOST, &call->events) ||
retrans ||
call->cong_mode == RXRPC_CALL_SLOW_START ||
@@ -372,11 +378,13 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb,
if ((lose++ & 7) == 7) {
ret = 0;
lost = true;
- goto done;
}
}
- _proto("Tx DATA %%%u { #%u }", serial, sp->hdr.seq);
+ trace_rxrpc_tx_data(call, sp->hdr.seq, serial, whdr.flags,
+ retrans, lost);
+ if (lost)
+ goto done;
/* send the packet with the don't fragment bit set if we currently
* think it's small enough */
@@ -384,6 +392,11 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb,
goto send_fragmentable;
down_read(&conn->params.local->defrag_sem);
+
+ sp->hdr.serial = serial;
+ smp_wmb(); /* Set serial before timestamp */
+ skb->tstamp = ktime_get_real();
+
/* send the packet by UDP
* - returns -EMSGSIZE if UDP would have to fragment the packet
* to go out of the interface
@@ -396,20 +409,17 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb,
up_read(&conn->params.local->defrag_sem);
if (ret < 0)
trace_rxrpc_tx_fail(call->debug_id, serial, ret,
- rxrpc_tx_fail_call_data_nofrag);
+ rxrpc_tx_point_call_data_nofrag);
+ else
+ trace_rxrpc_tx_packet(call->debug_id, &whdr,
+ rxrpc_tx_point_call_data_nofrag);
if (ret == -EMSGSIZE)
goto send_fragmentable;
done:
- trace_rxrpc_tx_data(call, sp->hdr.seq, serial, whdr.flags,
- retrans, lost);
if (ret >= 0) {
- ktime_t now = ktime_get_real();
- skb->tstamp = now;
- smp_wmb();
- sp->hdr.serial = serial;
if (whdr.flags & RXRPC_REQUEST_ACK) {
- call->peer->rtt_last_req = now;
+ call->peer->rtt_last_req = skb->tstamp;
trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_data, serial);
if (call->peer->rtt_usage > 1) {
unsigned long nowj = jiffies, ack_lost_at;
@@ -448,6 +458,10 @@ send_fragmentable:
down_write(&conn->params.local->defrag_sem);
+ sp->hdr.serial = serial;
+ smp_wmb(); /* Set serial before timestamp */
+ skb->tstamp = ktime_get_real();
+
switch (conn->params.local->srx.transport.family) {
case AF_INET:
opt = IP_PMTUDISC_DONT;
@@ -488,7 +502,10 @@ send_fragmentable:
if (ret < 0)
trace_rxrpc_tx_fail(call->debug_id, serial, ret,
- rxrpc_tx_fail_call_data_frag);
+ rxrpc_tx_point_call_data_frag);
+ else
+ trace_rxrpc_tx_packet(call->debug_id, &whdr,
+ rxrpc_tx_point_call_data_frag);
up_write(&conn->params.local->defrag_sem);
goto done;
@@ -507,7 +524,7 @@ void rxrpc_reject_packets(struct rxrpc_local *local)
struct kvec iov[2];
size_t size;
__be32 code;
- int ret;
+ int ret, ioc;
_enter("%d", local->debug_id);
@@ -515,7 +532,6 @@ void rxrpc_reject_packets(struct rxrpc_local *local)
iov[0].iov_len = sizeof(whdr);
iov[1].iov_base = &code;
iov[1].iov_len = sizeof(code);
- size = sizeof(whdr) + sizeof(code);
msg.msg_name = &srx.transport;
msg.msg_control = NULL;
@@ -523,16 +539,30 @@ void rxrpc_reject_packets(struct rxrpc_local *local)
msg.msg_flags = 0;
memset(&whdr, 0, sizeof(whdr));
- whdr.type = RXRPC_PACKET_TYPE_ABORT;
while ((skb = skb_dequeue(&local->reject_queue))) {
rxrpc_see_skb(skb, rxrpc_skb_rx_seen);
sp = rxrpc_skb(skb);
- if (rxrpc_extract_addr_from_skb(local, &srx, skb) == 0) {
- msg.msg_namelen = srx.transport_len;
-
+ switch (skb->mark) {
+ case RXRPC_SKB_MARK_REJECT_BUSY:
+ whdr.type = RXRPC_PACKET_TYPE_BUSY;
+ size = sizeof(whdr);
+ ioc = 1;
+ break;
+ case RXRPC_SKB_MARK_REJECT_ABORT:
+ whdr.type = RXRPC_PACKET_TYPE_ABORT;
code = htonl(skb->priority);
+ size = sizeof(whdr) + sizeof(code);
+ ioc = 2;
+ break;
+ default:
+ rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
+ continue;
+ }
+
+ if (rxrpc_extract_addr_from_skb(&srx, skb) == 0) {
+ msg.msg_namelen = srx.transport_len;
whdr.epoch = htonl(sp->hdr.epoch);
whdr.cid = htonl(sp->hdr.cid);
@@ -542,10 +572,14 @@ void rxrpc_reject_packets(struct rxrpc_local *local)
whdr.flags ^= RXRPC_CLIENT_INITIATED;
whdr.flags &= RXRPC_CLIENT_INITIATED;
- ret = kernel_sendmsg(local->socket, &msg, iov, 2, size);
+ ret = kernel_sendmsg(local->socket, &msg,
+ iov, ioc, size);
if (ret < 0)
trace_rxrpc_tx_fail(local->debug_id, 0, ret,
- rxrpc_tx_fail_reject);
+ rxrpc_tx_point_reject);
+ else
+ trace_rxrpc_tx_packet(local->debug_id, &whdr,
+ rxrpc_tx_point_reject);
}
rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
@@ -597,7 +631,10 @@ void rxrpc_send_keepalive(struct rxrpc_peer *peer)
ret = kernel_sendmsg(peer->local->socket, &msg, iov, 2, len);
if (ret < 0)
trace_rxrpc_tx_fail(peer->debug_id, 0, ret,
- rxrpc_tx_fail_version_keepalive);
+ rxrpc_tx_point_version_keepalive);
+ else
+ trace_rxrpc_tx_packet(peer->debug_id, &whdr,
+ rxrpc_tx_point_version_keepalive);
peer->last_tx_at = ktime_get_seconds();
_leave("");
diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c
index 4f9da2f51c69..bc05af89fc38 100644
--- a/net/rxrpc/peer_event.c
+++ b/net/rxrpc/peer_event.c
@@ -23,6 +23,8 @@
#include "ar-internal.h"
static void rxrpc_store_error(struct rxrpc_peer *, struct sock_exterr_skb *);
+static void rxrpc_distribute_error(struct rxrpc_peer *, int,
+ enum rxrpc_call_completion);
/*
* Find the peer associated with an ICMP packet.
@@ -45,6 +47,8 @@ static struct rxrpc_peer *rxrpc_lookup_peer_icmp_rcu(struct rxrpc_local *local,
*/
switch (srx->transport.family) {
case AF_INET:
+ srx->transport_len = sizeof(srx->transport.sin);
+ srx->transport.family = AF_INET;
srx->transport.sin.sin_port = serr->port;
switch (serr->ee.ee_origin) {
case SO_EE_ORIGIN_ICMP:
@@ -68,20 +72,20 @@ static struct rxrpc_peer *rxrpc_lookup_peer_icmp_rcu(struct rxrpc_local *local,
#ifdef CONFIG_AF_RXRPC_IPV6
case AF_INET6:
- srx->transport.sin6.sin6_port = serr->port;
switch (serr->ee.ee_origin) {
case SO_EE_ORIGIN_ICMP6:
_net("Rx ICMP6");
+ srx->transport.sin6.sin6_port = serr->port;
memcpy(&srx->transport.sin6.sin6_addr,
skb_network_header(skb) + serr->addr_offset,
sizeof(struct in6_addr));
break;
case SO_EE_ORIGIN_ICMP:
_net("Rx ICMP on v6 sock");
- srx->transport.sin6.sin6_addr.s6_addr32[0] = 0;
- srx->transport.sin6.sin6_addr.s6_addr32[1] = 0;
- srx->transport.sin6.sin6_addr.s6_addr32[2] = htonl(0xffff);
- memcpy(srx->transport.sin6.sin6_addr.s6_addr + 12,
+ srx->transport_len = sizeof(srx->transport.sin);
+ srx->transport.family = AF_INET;
+ srx->transport.sin.sin_port = serr->port;
+ memcpy(&srx->transport.sin.sin_addr,
skb_network_header(skb) + serr->addr_offset,
sizeof(struct in_addr));
break;
@@ -193,9 +197,8 @@ void rxrpc_error_report(struct sock *sk)
rxrpc_store_error(peer, serr);
rcu_read_unlock();
rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
+ rxrpc_put_peer(peer);
- /* The ref we obtained is passed off to the work item */
- __rxrpc_queue_peer_error(peer);
_leave("");
}
@@ -205,6 +208,7 @@ void rxrpc_error_report(struct sock *sk)
static void rxrpc_store_error(struct rxrpc_peer *peer,
struct sock_exterr_skb *serr)
{
+ enum rxrpc_call_completion compl = RXRPC_CALL_NETWORK_ERROR;
struct sock_extended_err *ee;
int err;
@@ -255,7 +259,7 @@ static void rxrpc_store_error(struct rxrpc_peer *peer,
case SO_EE_ORIGIN_NONE:
case SO_EE_ORIGIN_LOCAL:
_proto("Rx Received local error { error=%d }", err);
- err += RXRPC_LOCAL_ERROR_OFFSET;
+ compl = RXRPC_CALL_LOCAL_ERROR;
break;
case SO_EE_ORIGIN_ICMP6:
@@ -264,48 +268,23 @@ static void rxrpc_store_error(struct rxrpc_peer *peer,
break;
}
- peer->error_report = err;
+ rxrpc_distribute_error(peer, err, compl);
}
/*
- * Distribute an error that occurred on a peer
+ * Distribute an error that occurred on a peer.
*/
-void rxrpc_peer_error_distributor(struct work_struct *work)
+static void rxrpc_distribute_error(struct rxrpc_peer *peer, int error,
+ enum rxrpc_call_completion compl)
{
- struct rxrpc_peer *peer =
- container_of(work, struct rxrpc_peer, error_distributor);
struct rxrpc_call *call;
- enum rxrpc_call_completion compl;
- int error;
-
- _enter("");
-
- error = READ_ONCE(peer->error_report);
- if (error < RXRPC_LOCAL_ERROR_OFFSET) {
- compl = RXRPC_CALL_NETWORK_ERROR;
- } else {
- compl = RXRPC_CALL_LOCAL_ERROR;
- error -= RXRPC_LOCAL_ERROR_OFFSET;
- }
- _debug("ISSUE ERROR %s %d", rxrpc_call_completions[compl], error);
-
- spin_lock_bh(&peer->lock);
-
- while (!hlist_empty(&peer->error_targets)) {
- call = hlist_entry(peer->error_targets.first,
- struct rxrpc_call, error_link);
- hlist_del_init(&call->error_link);
+ hlist_for_each_entry_rcu(call, &peer->error_targets, error_link) {
rxrpc_see_call(call);
-
- if (rxrpc_set_call_completion(call, compl, 0, -error))
+ if (call->state < RXRPC_CALL_COMPLETE &&
+ rxrpc_set_call_completion(call, compl, 0, -error))
rxrpc_notify_socket(call);
}
-
- spin_unlock_bh(&peer->lock);
-
- rxrpc_put_peer(peer);
- _leave("");
}
/*
@@ -325,6 +304,8 @@ void rxrpc_peer_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why,
if (rtt < 0)
return;
+ spin_lock(&peer->rtt_input_lock);
+
/* Replace the oldest datum in the RTT buffer */
sum -= peer->rtt_cache[cursor];
sum += rtt;
@@ -336,6 +317,8 @@ void rxrpc_peer_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why,
peer->rtt_usage = usage;
}
+ spin_unlock(&peer->rtt_input_lock);
+
/* Now recalculate the average */
if (usage == RXRPC_RTT_CACHE_SIZE) {
avg = sum / RXRPC_RTT_CACHE_SIZE;
@@ -344,6 +327,7 @@ void rxrpc_peer_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why,
do_div(avg, usage);
}
+ /* Don't need to update this under lock */
peer->rtt = avg;
trace_rxrpc_rtt_rx(call, why, send_serial, resp_serial, rtt,
usage, avg);
diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c
index 1dc7648e3eff..5691b7d266ca 100644
--- a/net/rxrpc/peer_object.c
+++ b/net/rxrpc/peer_object.c
@@ -124,11 +124,9 @@ static struct rxrpc_peer *__rxrpc_lookup_peer_rcu(
struct rxrpc_net *rxnet = local->rxnet;
hash_for_each_possible_rcu(rxnet->peer_hash, peer, hash_link, hash_key) {
- if (rxrpc_peer_cmp_key(peer, local, srx, hash_key) == 0) {
- if (atomic_read(&peer->usage) == 0)
- return NULL;
+ if (rxrpc_peer_cmp_key(peer, local, srx, hash_key) == 0 &&
+ atomic_read(&peer->usage) > 0)
return peer;
- }
}
return NULL;
@@ -155,8 +153,10 @@ struct rxrpc_peer *rxrpc_lookup_peer_rcu(struct rxrpc_local *local,
* assess the MTU size for the network interface through which this peer is
* reached
*/
-static void rxrpc_assess_MTU_size(struct rxrpc_peer *peer)
+static void rxrpc_assess_MTU_size(struct rxrpc_sock *rx,
+ struct rxrpc_peer *peer)
{
+ struct net *net = sock_net(&rx->sk);
struct dst_entry *dst;
struct rtable *rt;
struct flowi fl;
@@ -171,7 +171,7 @@ static void rxrpc_assess_MTU_size(struct rxrpc_peer *peer)
switch (peer->srx.transport.family) {
case AF_INET:
rt = ip_route_output_ports(
- &init_net, fl4, NULL,
+ net, fl4, NULL,
peer->srx.transport.sin.sin_addr.s_addr, 0,
htons(7000), htons(7001), IPPROTO_UDP, 0, 0);
if (IS_ERR(rt)) {
@@ -190,7 +190,7 @@ static void rxrpc_assess_MTU_size(struct rxrpc_peer *peer)
sizeof(struct in6_addr));
fl6->fl6_dport = htons(7001);
fl6->fl6_sport = htons(7000);
- dst = ip6_route_output(&init_net, NULL, fl6);
+ dst = ip6_route_output(net, NULL, fl6);
if (dst->error) {
_leave(" [route err %d]", dst->error);
return;
@@ -222,11 +222,10 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp)
atomic_set(&peer->usage, 1);
peer->local = local;
INIT_HLIST_HEAD(&peer->error_targets);
- INIT_WORK(&peer->error_distributor,
- &rxrpc_peer_error_distributor);
peer->service_conns = RB_ROOT;
seqlock_init(&peer->service_conn_lock);
spin_lock_init(&peer->lock);
+ spin_lock_init(&peer->rtt_input_lock);
peer->debug_id = atomic_inc_return(&rxrpc_debug_id);
if (RXRPC_TX_SMSS > 2190)
@@ -244,10 +243,11 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp)
/*
* Initialise peer record.
*/
-static void rxrpc_init_peer(struct rxrpc_peer *peer, unsigned long hash_key)
+static void rxrpc_init_peer(struct rxrpc_sock *rx, struct rxrpc_peer *peer,
+ unsigned long hash_key)
{
peer->hash_key = hash_key;
- rxrpc_assess_MTU_size(peer);
+ rxrpc_assess_MTU_size(rx, peer);
peer->mtu = peer->if_mtu;
peer->rtt_last_req = ktime_get_real();
@@ -279,7 +279,8 @@ static void rxrpc_init_peer(struct rxrpc_peer *peer, unsigned long hash_key)
/*
* Set up a new peer.
*/
-static struct rxrpc_peer *rxrpc_create_peer(struct rxrpc_local *local,
+static struct rxrpc_peer *rxrpc_create_peer(struct rxrpc_sock *rx,
+ struct rxrpc_local *local,
struct sockaddr_rxrpc *srx,
unsigned long hash_key,
gfp_t gfp)
@@ -291,7 +292,7 @@ static struct rxrpc_peer *rxrpc_create_peer(struct rxrpc_local *local,
peer = rxrpc_alloc_peer(local, gfp);
if (peer) {
memcpy(&peer->srx, srx, sizeof(*srx));
- rxrpc_init_peer(peer, hash_key);
+ rxrpc_init_peer(rx, peer, hash_key);
}
_leave(" = %p", peer);
@@ -299,40 +300,31 @@ static struct rxrpc_peer *rxrpc_create_peer(struct rxrpc_local *local,
}
/*
- * Set up a new incoming peer. The address is prestored in the preallocated
- * peer.
+ * Set up a new incoming peer. There shouldn't be any other matching peers
+ * since we've already done a search in the list from the non-reentrant context
+ * (the data_ready handler) that is the only place we can add new peers.
*/
-struct rxrpc_peer *rxrpc_lookup_incoming_peer(struct rxrpc_local *local,
- struct rxrpc_peer *prealloc)
+void rxrpc_new_incoming_peer(struct rxrpc_sock *rx, struct rxrpc_local *local,
+ struct rxrpc_peer *peer)
{
- struct rxrpc_peer *peer;
struct rxrpc_net *rxnet = local->rxnet;
unsigned long hash_key;
- hash_key = rxrpc_peer_hash_key(local, &prealloc->srx);
- prealloc->local = local;
- rxrpc_init_peer(prealloc, hash_key);
+ hash_key = rxrpc_peer_hash_key(local, &peer->srx);
+ peer->local = local;
+ rxrpc_init_peer(rx, peer, hash_key);
spin_lock(&rxnet->peer_hash_lock);
-
- /* Need to check that we aren't racing with someone else */
- peer = __rxrpc_lookup_peer_rcu(local, &prealloc->srx, hash_key);
- if (peer && !rxrpc_get_peer_maybe(peer))
- peer = NULL;
- if (!peer) {
- peer = prealloc;
- hash_add_rcu(rxnet->peer_hash, &peer->hash_link, hash_key);
- list_add_tail(&peer->keepalive_link, &rxnet->peer_keepalive_new);
- }
-
+ hash_add_rcu(rxnet->peer_hash, &peer->hash_link, hash_key);
+ list_add_tail(&peer->keepalive_link, &rxnet->peer_keepalive_new);
spin_unlock(&rxnet->peer_hash_lock);
- return peer;
}
/*
* obtain a remote transport endpoint for the specified address
*/
-struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local,
+struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_sock *rx,
+ struct rxrpc_local *local,
struct sockaddr_rxrpc *srx, gfp_t gfp)
{
struct rxrpc_peer *peer, *candidate;
@@ -352,7 +344,7 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local,
/* The peer is not yet present in hash - create a candidate
* for a new record and then redo the search.
*/
- candidate = rxrpc_create_peer(local, srx, hash_key, gfp);
+ candidate = rxrpc_create_peer(rx, local, srx, hash_key, gfp);
if (!candidate) {
_leave(" = NULL [nomem]");
return NULL;
@@ -416,21 +408,6 @@ struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *peer)
}
/*
- * Queue a peer record. This passes the caller's ref to the workqueue.
- */
-void __rxrpc_queue_peer_error(struct rxrpc_peer *peer)
-{
- const void *here = __builtin_return_address(0);
- int n;
-
- n = atomic_read(&peer->usage);
- if (rxrpc_queue_work(&peer->error_distributor))
- trace_rxrpc_peer(peer, rxrpc_peer_queued_error, n, here);
- else
- rxrpc_put_peer(peer);
-}
-
-/*
* Discard a peer record.
*/
static void __rxrpc_put_peer(struct rxrpc_peer *peer)
diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c
index d9fca8c4bcdc..c7d976859d40 100644
--- a/net/rxrpc/proc.c
+++ b/net/rxrpc/proc.c
@@ -63,6 +63,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
struct rxrpc_peer *peer;
struct rxrpc_call *call;
struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq));
+ unsigned long timeout = 0;
rxrpc_seq_t tx_hard_ack, rx_hard_ack;
char lbuff[50], rbuff[50];
@@ -71,7 +72,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
"Proto Local "
" Remote "
" SvID ConnID CallID End Use State Abort "
- " UserID\n");
+ " UserID TxSeq TW RxSeq RW RxSerial RxTimo\n");
return 0;
}
@@ -94,11 +95,16 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
else
strcpy(rbuff, "no_connection");
+ if (call->state != RXRPC_CALL_SERVER_PREALLOC) {
+ timeout = READ_ONCE(call->expect_rx_by);
+ timeout -= jiffies;
+ }
+
tx_hard_ack = READ_ONCE(call->tx_hard_ack);
rx_hard_ack = READ_ONCE(call->rx_hard_ack);
seq_printf(seq,
"UDP %-47.47s %-47.47s %4x %08x %08x %s %3u"
- " %-8.8s %08x %lx %08x %02x %08x %02x\n",
+ " %-8.8s %08x %lx %08x %02x %08x %02x %08x %06lx\n",
lbuff,
rbuff,
call->service_id,
@@ -110,7 +116,9 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
call->abort_code,
call->user_call_ID,
tx_hard_ack, READ_ONCE(call->tx_top) - tx_hard_ack,
- rx_hard_ack, READ_ONCE(call->rx_top) - rx_hard_ack);
+ rx_hard_ack, READ_ONCE(call->rx_top) - rx_hard_ack,
+ call->rx_serial,
+ timeout);
return 0;
}
@@ -179,7 +187,7 @@ static int rxrpc_connection_seq_show(struct seq_file *seq, void *v)
print:
seq_printf(seq,
"UDP %-47.47s %-47.47s %4x %08x %s %3u"
- " %s %08x %08x %08x\n",
+ " %s %08x %08x %08x %08x %08x %08x %08x\n",
lbuff,
rbuff,
conn->service_id,
@@ -189,7 +197,11 @@ print:
rxrpc_conn_states[conn->state],
key_serial(conn->params.key),
atomic_read(&conn->serial),
- conn->hi_serial);
+ conn->hi_serial,
+ conn->channels[0].call_id,
+ conn->channels[1].call_id,
+ conn->channels[2].call_id,
+ conn->channels[3].call_id);
return 0;
}
@@ -200,3 +212,129 @@ const struct seq_operations rxrpc_connection_seq_ops = {
.stop = rxrpc_connection_seq_stop,
.show = rxrpc_connection_seq_show,
};
+
+/*
+ * generate a list of extant virtual peers in /proc/net/rxrpc/peers
+ */
+static int rxrpc_peer_seq_show(struct seq_file *seq, void *v)
+{
+ struct rxrpc_peer *peer;
+ time64_t now;
+ char lbuff[50], rbuff[50];
+
+ if (v == SEQ_START_TOKEN) {
+ seq_puts(seq,
+ "Proto Local "
+ " Remote "
+ " Use CW MTU LastUse RTT Rc\n"
+ );
+ return 0;
+ }
+
+ peer = list_entry(v, struct rxrpc_peer, hash_link);
+
+ sprintf(lbuff, "%pISpc", &peer->local->srx.transport);
+
+ sprintf(rbuff, "%pISpc", &peer->srx.transport);
+
+ now = ktime_get_seconds();
+ seq_printf(seq,
+ "UDP %-47.47s %-47.47s %3u"
+ " %3u %5u %6llus %12llu %2u\n",
+ lbuff,
+ rbuff,
+ atomic_read(&peer->usage),
+ peer->cong_cwnd,
+ peer->mtu,
+ now - peer->last_tx_at,
+ peer->rtt,
+ peer->rtt_cursor);
+
+ return 0;
+}
+
+static void *rxrpc_peer_seq_start(struct seq_file *seq, loff_t *_pos)
+ __acquires(rcu)
+{
+ struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq));
+ unsigned int bucket, n;
+ unsigned int shift = 32 - HASH_BITS(rxnet->peer_hash);
+ void *p;
+
+ rcu_read_lock();
+
+ if (*_pos >= UINT_MAX)
+ return NULL;
+
+ n = *_pos & ((1U << shift) - 1);
+ bucket = *_pos >> shift;
+ for (;;) {
+ if (bucket >= HASH_SIZE(rxnet->peer_hash)) {
+ *_pos = UINT_MAX;
+ return NULL;
+ }
+ if (n == 0) {
+ if (bucket == 0)
+ return SEQ_START_TOKEN;
+ *_pos += 1;
+ n++;
+ }
+
+ p = seq_hlist_start_rcu(&rxnet->peer_hash[bucket], n - 1);
+ if (p)
+ return p;
+ bucket++;
+ n = 1;
+ *_pos = (bucket << shift) | n;
+ }
+}
+
+static void *rxrpc_peer_seq_next(struct seq_file *seq, void *v, loff_t *_pos)
+{
+ struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq));
+ unsigned int bucket, n;
+ unsigned int shift = 32 - HASH_BITS(rxnet->peer_hash);
+ void *p;
+
+ if (*_pos >= UINT_MAX)
+ return NULL;
+
+ bucket = *_pos >> shift;
+
+ p = seq_hlist_next_rcu(v, &rxnet->peer_hash[bucket], _pos);
+ if (p)
+ return p;
+
+ for (;;) {
+ bucket++;
+ n = 1;
+ *_pos = (bucket << shift) | n;
+
+ if (bucket >= HASH_SIZE(rxnet->peer_hash)) {
+ *_pos = UINT_MAX;
+ return NULL;
+ }
+ if (n == 0) {
+ *_pos += 1;
+ n++;
+ }
+
+ p = seq_hlist_start_rcu(&rxnet->peer_hash[bucket], n - 1);
+ if (p)
+ return p;
+ }
+}
+
+static void rxrpc_peer_seq_stop(struct seq_file *seq, void *v)
+ __releases(rcu)
+{
+ rcu_read_unlock();
+}
+
+
+const struct seq_operations rxrpc_peer_seq_ops = {
+ .start = rxrpc_peer_seq_start,
+ .next = rxrpc_peer_seq_next,
+ .stop = rxrpc_peer_seq_stop,
+ .show = rxrpc_peer_seq_show,
+};
diff --git a/net/rxrpc/protocol.h b/net/rxrpc/protocol.h
index 93da73bf7098..f9cb83c938f3 100644
--- a/net/rxrpc/protocol.h
+++ b/net/rxrpc/protocol.h
@@ -50,7 +50,6 @@ struct rxrpc_wire_header {
#define RXRPC_PACKET_TYPE_10 10 /* Ignored */
#define RXRPC_PACKET_TYPE_11 11 /* Ignored */
#define RXRPC_PACKET_TYPE_VERSION 13 /* version string request */
-#define RXRPC_N_PACKET_TYPES 14 /* number of packet types (incl type 0) */
uint8_t flags; /* packet flags */
#define RXRPC_CLIENT_INITIATED 0x01 /* signifies a packet generated by a client */
@@ -72,20 +71,6 @@ struct rxrpc_wire_header {
} __packed;
-#define RXRPC_SUPPORTED_PACKET_TYPES ( \
- (1 << RXRPC_PACKET_TYPE_DATA) | \
- (1 << RXRPC_PACKET_TYPE_ACK) | \
- (1 << RXRPC_PACKET_TYPE_BUSY) | \
- (1 << RXRPC_PACKET_TYPE_ABORT) | \
- (1 << RXRPC_PACKET_TYPE_ACKALL) | \
- (1 << RXRPC_PACKET_TYPE_CHALLENGE) | \
- (1 << RXRPC_PACKET_TYPE_RESPONSE) | \
- /*(1 << RXRPC_PACKET_TYPE_DEBUG) | */ \
- (1 << RXRPC_PACKET_TYPE_PARAMS) | \
- (1 << RXRPC_PACKET_TYPE_10) | \
- (1 << RXRPC_PACKET_TYPE_11) | \
- (1 << RXRPC_PACKET_TYPE_VERSION))
-
/*****************************************************************************/
/*
* jumbo packet secondary header
diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c
index 7bff716e911e..eaf19ebaa964 100644
--- a/net/rxrpc/recvmsg.c
+++ b/net/rxrpc/recvmsg.c
@@ -144,13 +144,11 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call, rxrpc_serial_t serial)
trace_rxrpc_receive(call, rxrpc_receive_end, 0, call->rx_top);
ASSERTCMP(call->rx_hard_ack, ==, call->rx_top);
-#if 0 // TODO: May want to transmit final ACK under some circumstances anyway
if (call->state == RXRPC_CALL_CLIENT_RECV_REPLY) {
- rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, serial, true, false,
+ rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, serial, false, true,
rxrpc_propose_ack_terminal_ack);
- rxrpc_send_ack_packet(call, false, NULL);
+ //rxrpc_send_ack_packet(call, false, NULL);
}
-#endif
write_lock_bh(&call->state_lock);
@@ -315,6 +313,10 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call,
unsigned int rx_pkt_offset, rx_pkt_len;
int ix, copy, ret = -EAGAIN, ret2;
+ if (test_and_clear_bit(RXRPC_CALL_RX_UNDERRUN, &call->flags) &&
+ call->ackr_reason)
+ rxrpc_send_ack_packet(call, false, NULL);
+
rx_pkt_offset = call->rx_pkt_offset;
rx_pkt_len = call->rx_pkt_len;
@@ -414,6 +416,8 @@ out:
done:
trace_rxrpc_recvmsg(call, rxrpc_recvmsg_data_return, seq,
rx_pkt_offset, rx_pkt_len, ret);
+ if (ret == -EAGAIN)
+ set_bit(RXRPC_CALL_RX_UNDERRUN, &call->flags);
return ret;
}
@@ -607,9 +611,7 @@ wait_error:
* rxrpc_kernel_recv_data - Allow a kernel service to receive data/info
* @sock: The socket that the call exists on
* @call: The call to send data through
- * @buf: The buffer to receive into
- * @size: The size of the buffer, including data already read
- * @_offset: The running offset into the buffer.
+ * @iter: The buffer to receive into
* @want_more: True if more data is expected to be read
* @_abort: Where the abort code is stored if -ECONNABORTED is returned
* @_service: Where to store the actual service ID (may be upgraded)
@@ -622,39 +624,30 @@ wait_error:
* Note that we may return -EAGAIN to drain empty packets at the end of the
* data, even if we've already copied over the requested data.
*
- * This function adds the amount it transfers to *_offset, so this should be
- * precleared as appropriate. Note that the amount remaining in the buffer is
- * taken to be size - *_offset.
- *
* *_abort should also be initialised to 0.
*/
int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call,
- void *buf, size_t size, size_t *_offset,
+ struct iov_iter *iter,
bool want_more, u32 *_abort, u16 *_service)
{
- struct iov_iter iter;
- struct kvec iov;
+ size_t offset = 0;
int ret;
- _enter("{%d,%s},%zu/%zu,%d",
+ _enter("{%d,%s},%zu,%d",
call->debug_id, rxrpc_call_states[call->state],
- *_offset, size, want_more);
+ iov_iter_count(iter), want_more);
- ASSERTCMP(*_offset, <=, size);
ASSERTCMP(call->state, !=, RXRPC_CALL_SERVER_ACCEPTING);
- iov.iov_base = buf + *_offset;
- iov.iov_len = size - *_offset;
- iov_iter_kvec(&iter, ITER_KVEC | READ, &iov, 1, size - *_offset);
-
mutex_lock(&call->user_mutex);
switch (READ_ONCE(call->state)) {
case RXRPC_CALL_CLIENT_RECV_REPLY:
case RXRPC_CALL_SERVER_RECV_REQUEST:
case RXRPC_CALL_SERVER_ACK_REQUEST:
- ret = rxrpc_recvmsg_data(sock, call, NULL, &iter, size, 0,
- _offset);
+ ret = rxrpc_recvmsg_data(sock, call, NULL, iter,
+ iov_iter_count(iter), 0,
+ &offset);
if (ret < 0)
goto out;
@@ -663,7 +656,7 @@ int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call,
* full buffer or have been given -EAGAIN.
*/
if (ret == 1) {
- if (*_offset < size)
+ if (iov_iter_count(iter) > 0)
goto short_data;
if (!want_more)
goto read_phase_complete;
@@ -686,10 +679,21 @@ int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call,
read_phase_complete:
ret = 1;
out:
+ switch (call->ackr_reason) {
+ case RXRPC_ACK_IDLE:
+ break;
+ case RXRPC_ACK_DELAY:
+ if (ret != -EAGAIN)
+ break;
+ /* Fall through */
+ default:
+ rxrpc_send_ack_packet(call, false, NULL);
+ }
+
if (_service)
*_service = call->service_id;
mutex_unlock(&call->user_mutex);
- _leave(" = %d [%zu,%d]", ret, *_offset, *_abort);
+ _leave(" = %d [%zu,%d]", ret, iov_iter_count(iter), *_abort);
return ret;
short_data:
@@ -705,9 +709,52 @@ call_complete:
ret = call->error;
if (call->completion == RXRPC_CALL_SUCCEEDED) {
ret = 1;
- if (size > 0)
+ if (iov_iter_count(iter) > 0)
ret = -ECONNRESET;
}
goto out;
}
EXPORT_SYMBOL(rxrpc_kernel_recv_data);
+
+/**
+ * rxrpc_kernel_get_reply_time - Get timestamp on first reply packet
+ * @sock: The socket that the call exists on
+ * @call: The call to query
+ * @_ts: Where to put the timestamp
+ *
+ * Retrieve the timestamp from the first DATA packet of the reply if it is
+ * in the ring. Returns true if successful, false if not.
+ */
+bool rxrpc_kernel_get_reply_time(struct socket *sock, struct rxrpc_call *call,
+ ktime_t *_ts)
+{
+ struct sk_buff *skb;
+ rxrpc_seq_t hard_ack, top, seq;
+ bool success = false;
+
+ mutex_lock(&call->user_mutex);
+
+ if (READ_ONCE(call->state) != RXRPC_CALL_CLIENT_RECV_REPLY)
+ goto out;
+
+ hard_ack = call->rx_hard_ack;
+ if (hard_ack != 0)
+ goto out;
+
+ seq = hard_ack + 1;
+ top = smp_load_acquire(&call->rx_top);
+ if (after(seq, top))
+ goto out;
+
+ skb = call->rxtx_buffer[seq & RXRPC_RXTX_BUFF_MASK];
+ if (!skb)
+ goto out;
+
+ *_ts = skb_get_ktime(skb);
+ success = true;
+
+out:
+ mutex_unlock(&call->user_mutex);
+ return success;
+}
+EXPORT_SYMBOL(rxrpc_kernel_get_reply_time);
diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c
index 47cb019c521a..cea16838d588 100644
--- a/net/rxrpc/rxkad.c
+++ b/net/rxrpc/rxkad.c
@@ -146,10 +146,10 @@ static int rxkad_prime_packet_security(struct rxrpc_connection *conn)
static int rxkad_secure_packet_auth(const struct rxrpc_call *call,
struct sk_buff *skb,
u32 data_size,
- void *sechdr)
+ void *sechdr,
+ struct skcipher_request *req)
{
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
- SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher);
struct rxkad_level1_hdr hdr;
struct rxrpc_crypt iv;
struct scatterlist sg;
@@ -183,12 +183,12 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call,
static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call,
struct sk_buff *skb,
u32 data_size,
- void *sechdr)
+ void *sechdr,
+ struct skcipher_request *req)
{
const struct rxrpc_key_token *token;
struct rxkad_level2_hdr rxkhdr;
struct rxrpc_skb_priv *sp;
- SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher);
struct rxrpc_crypt iv;
struct scatterlist sg[16];
struct sk_buff *trailer;
@@ -296,11 +296,12 @@ static int rxkad_secure_packet(struct rxrpc_call *call,
ret = 0;
break;
case RXRPC_SECURITY_AUTH:
- ret = rxkad_secure_packet_auth(call, skb, data_size, sechdr);
+ ret = rxkad_secure_packet_auth(call, skb, data_size, sechdr,
+ req);
break;
case RXRPC_SECURITY_ENCRYPT:
ret = rxkad_secure_packet_encrypt(call, skb, data_size,
- sechdr);
+ sechdr, req);
break;
default:
ret = -EPERM;
@@ -316,10 +317,10 @@ static int rxkad_secure_packet(struct rxrpc_call *call,
*/
static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb,
unsigned int offset, unsigned int len,
- rxrpc_seq_t seq)
+ rxrpc_seq_t seq,
+ struct skcipher_request *req)
{
struct rxkad_level1_hdr sechdr;
- SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher);
struct rxrpc_crypt iv;
struct scatterlist sg[16];
struct sk_buff *trailer;
@@ -402,11 +403,11 @@ nomem:
*/
static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb,
unsigned int offset, unsigned int len,
- rxrpc_seq_t seq)
+ rxrpc_seq_t seq,
+ struct skcipher_request *req)
{
const struct rxrpc_key_token *token;
struct rxkad_level2_hdr sechdr;
- SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher);
struct rxrpc_crypt iv;
struct scatterlist _sg[4], *sg;
struct sk_buff *trailer;
@@ -549,9 +550,9 @@ static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb,
case RXRPC_SECURITY_PLAIN:
return 0;
case RXRPC_SECURITY_AUTH:
- return rxkad_verify_packet_1(call, skb, offset, len, seq);
+ return rxkad_verify_packet_1(call, skb, offset, len, seq, req);
case RXRPC_SECURITY_ENCRYPT:
- return rxkad_verify_packet_2(call, skb, offset, len, seq);
+ return rxkad_verify_packet_2(call, skb, offset, len, seq, req);
default:
return -ENOANO;
}
@@ -665,11 +666,13 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn)
ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len);
if (ret < 0) {
trace_rxrpc_tx_fail(conn->debug_id, serial, ret,
- rxrpc_tx_fail_conn_challenge);
+ rxrpc_tx_point_rxkad_challenge);
return -EAGAIN;
}
conn->params.peer->last_tx_at = ktime_get_seconds();
+ trace_rxrpc_tx_packet(conn->debug_id, &whdr,
+ rxrpc_tx_point_rxkad_challenge);
_leave(" = 0");
return 0;
}
@@ -721,7 +724,7 @@ static int rxkad_send_response(struct rxrpc_connection *conn,
ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 3, len);
if (ret < 0) {
trace_rxrpc_tx_fail(conn->debug_id, serial, ret,
- rxrpc_tx_fail_conn_response);
+ rxrpc_tx_point_rxkad_response);
return -EAGAIN;
}
diff --git a/net/rxrpc/skbuff.c b/net/rxrpc/skbuff.c
index b8985d01876a..913dca65cc65 100644
--- a/net/rxrpc/skbuff.c
+++ b/net/rxrpc/skbuff.c
@@ -69,21 +69,6 @@ void rxrpc_free_skb(struct sk_buff *skb, enum rxrpc_skb_trace op)
}
/*
- * Note the injected loss of a socket buffer.
- */
-void rxrpc_lose_skb(struct sk_buff *skb, enum rxrpc_skb_trace op)
-{
- const void *here = __builtin_return_address(0);
- if (skb) {
- int n;
- CHECK_SLAB_OKAY(&skb->users);
- n = atomic_dec_return(select_skb_count(op));
- trace_rxrpc_skb(skb, op, refcount_read(&skb->users), n, here);
- kfree_skb(skb);
- }
-}
-
-/*
* Clear a queue of socket buffers.
*/
void rxrpc_purge_queue(struct sk_buff_head *list)
diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c
index 4a7af7aff37d..d75bd15151e6 100644
--- a/net/rxrpc/sysctl.c
+++ b/net/rxrpc/sysctl.c
@@ -15,7 +15,6 @@
#include "ar-internal.h"
static struct ctl_table_header *rxrpc_sysctl_reg_table;
-static const unsigned int zero = 0;
static const unsigned int one = 1;
static const unsigned int four = 4;
static const unsigned int thirtytwo = 32;
diff --git a/net/rxrpc/utils.c b/net/rxrpc/utils.c
index e801171fa351..ff7af71c4b49 100644
--- a/net/rxrpc/utils.c
+++ b/net/rxrpc/utils.c
@@ -17,28 +17,17 @@
/*
* Fill out a peer address from a socket buffer containing a packet.
*/
-int rxrpc_extract_addr_from_skb(struct rxrpc_local *local,
- struct sockaddr_rxrpc *srx,
- struct sk_buff *skb)
+int rxrpc_extract_addr_from_skb(struct sockaddr_rxrpc *srx, struct sk_buff *skb)
{
memset(srx, 0, sizeof(*srx));
switch (ntohs(skb->protocol)) {
case ETH_P_IP:
- if (local->srx.transport.family == AF_INET6) {
- srx->transport_type = SOCK_DGRAM;
- srx->transport_len = sizeof(srx->transport.sin6);
- srx->transport.sin6.sin6_family = AF_INET6;
- srx->transport.sin6.sin6_port = udp_hdr(skb)->source;
- srx->transport.sin6.sin6_addr.s6_addr32[2] = htonl(0xffff);
- srx->transport.sin6.sin6_addr.s6_addr32[3] = ip_hdr(skb)->saddr;
- } else {
- srx->transport_type = SOCK_DGRAM;
- srx->transport_len = sizeof(srx->transport.sin);
- srx->transport.sin.sin_family = AF_INET;
- srx->transport.sin.sin_port = udp_hdr(skb)->source;
- srx->transport.sin.sin_addr.s_addr = ip_hdr(skb)->saddr;
- }
+ srx->transport_type = SOCK_DGRAM;
+ srx->transport_len = sizeof(srx->transport.sin);
+ srx->transport.sin.sin_family = AF_INET;
+ srx->transport.sin.sin_port = udp_hdr(skb)->source;
+ srx->transport.sin.sin_addr.s_addr = ip_hdr(skb)->saddr;
return 0;
#ifdef CONFIG_AF_RXRPC_IPV6
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index a01169fb5325..1b9afdee5ba9 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -1,6 +1,6 @@
#
# Traffic control configuration.
-#
+#
menuconfig NET_SCHED
bool "QoS and/or fair queueing"
@@ -183,6 +183,28 @@ config NET_SCH_CBS
To compile this code as a module, choose M here: the
module will be called sch_cbs.
+config NET_SCH_ETF
+ tristate "Earliest TxTime First (ETF)"
+ help
+ Say Y here if you want to use the Earliest TxTime First (ETF) packet
+ scheduling algorithm.
+
+ See the top of <file:net/sched/sch_etf.c> for more details.
+
+ To compile this code as a module, choose M here: the
+ module will be called sch_etf.
+
+config NET_SCH_TAPRIO
+ tristate "Time Aware Priority (taprio) Scheduler"
+ help
+ Say Y here if you want to use the Time Aware Priority (taprio) packet
+ scheduling algorithm.
+
+ See the top of <file:net/sched/sch_taprio.c> for more details.
+
+ To compile this code as a module, choose M here: the
+ module will be called sch_taprio.
+
config NET_SCH_GRED
tristate "Generic Random Early Detection (GRED)"
---help---
@@ -240,6 +262,19 @@ config NET_SCH_MQPRIO
If unsure, say N.
+config NET_SCH_SKBPRIO
+ tristate "SKB priority queue scheduler (SKBPRIO)"
+ help
+ Say Y here if you want to use the SKB priority queue
+ scheduler. This schedules packets according to skb->priority,
+ which is useful for request packets in DoS mitigation systems such
+ as Gatekeeper.
+
+ To compile this driver as a module, choose M here: the module will
+ be called sch_skbprio.
+
+ If unsure, say N.
+
config NET_SCH_CHOKE
tristate "CHOose and Keep responsive flow scheduler (CHOKE)"
help
@@ -284,6 +319,17 @@ config NET_SCH_FQ_CODEL
If unsure, say N.
+config NET_SCH_CAKE
+ tristate "Common Applications Kept Enhanced (CAKE)"
+ help
+ Say Y here if you want to use the Common Applications Kept Enhanced
+ (CAKE) queue management algorithm.
+
+ To compile this driver as a module, choose M here: the module
+ will be called sch_cake.
+
+ If unsure, say N.
+
config NET_SCH_FQ
tristate "Fair Queue"
help
@@ -684,7 +730,7 @@ config NET_CLS_ACT
config NET_ACT_POLICE
tristate "Traffic Policing"
- depends on NET_CLS_ACT
+ depends on NET_CLS_ACT
---help---
Say Y here if you want to do traffic policing, i.e. strict
bandwidth limiting. This action replaces the existing policing
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 8811d3804878..8a40431d7b5c 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -33,7 +33,7 @@ obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o
obj-$(CONFIG_NET_SCH_HFSC) += sch_hfsc.o
obj-$(CONFIG_NET_SCH_RED) += sch_red.o
obj-$(CONFIG_NET_SCH_GRED) += sch_gred.o
-obj-$(CONFIG_NET_SCH_INGRESS) += sch_ingress.o
+obj-$(CONFIG_NET_SCH_INGRESS) += sch_ingress.o
obj-$(CONFIG_NET_SCH_DSMARK) += sch_dsmark.o
obj-$(CONFIG_NET_SCH_SFB) += sch_sfb.o
obj-$(CONFIG_NET_SCH_SFQ) += sch_sfq.o
@@ -46,14 +46,18 @@ obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o
obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o
obj-$(CONFIG_NET_SCH_PLUG) += sch_plug.o
obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o
+obj-$(CONFIG_NET_SCH_SKBPRIO) += sch_skbprio.o
obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o
obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o
obj-$(CONFIG_NET_SCH_CODEL) += sch_codel.o
obj-$(CONFIG_NET_SCH_FQ_CODEL) += sch_fq_codel.o
+obj-$(CONFIG_NET_SCH_CAKE) += sch_cake.o
obj-$(CONFIG_NET_SCH_FQ) += sch_fq.o
obj-$(CONFIG_NET_SCH_HHF) += sch_hhf.o
obj-$(CONFIG_NET_SCH_PIE) += sch_pie.o
obj-$(CONFIG_NET_SCH_CBS) += sch_cbs.o
+obj-$(CONFIG_NET_SCH_ETF) += sch_etf.o
+obj-$(CONFIG_NET_SCH_TAPRIO) += sch_taprio.o
obj-$(CONFIG_NET_CLS_U32) += cls_u32.o
obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 3f4cf930f809..9c1b0729aebf 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -36,7 +36,7 @@ static int tcf_action_goto_chain_init(struct tc_action *a, struct tcf_proto *tp)
if (!tp)
return -EINVAL;
- a->goto_chain = tcf_chain_get(tp->chain->block, chain_index, true);
+ a->goto_chain = tcf_chain_get_by_act(tp->chain->block, chain_index);
if (!a->goto_chain)
return -ENOMEM;
return 0;
@@ -44,7 +44,7 @@ static int tcf_action_goto_chain_init(struct tc_action *a, struct tcf_proto *tp)
static void tcf_action_goto_chain_fini(struct tc_action *a)
{
- tcf_chain_put(a->goto_chain);
+ tcf_chain_put_by_act(a->goto_chain);
}
static void tcf_action_goto_chain_exec(const struct tc_action *a,
@@ -55,6 +55,24 @@ static void tcf_action_goto_chain_exec(const struct tc_action *a,
res->goto_tp = rcu_dereference_bh(chain->filter_chain);
}
+static void tcf_free_cookie_rcu(struct rcu_head *p)
+{
+ struct tc_cookie *cookie = container_of(p, struct tc_cookie, rcu);
+
+ kfree(cookie->data);
+ kfree(cookie);
+}
+
+static void tcf_set_action_cookie(struct tc_cookie __rcu **old_cookie,
+ struct tc_cookie *new_cookie)
+{
+ struct tc_cookie *old;
+
+ old = xchg((__force struct tc_cookie **)old_cookie, new_cookie);
+ if (old)
+ call_rcu(&old->rcu, tcf_free_cookie_rcu);
+}
+
/* XXX: For standalone actions, we don't need a RCU grace period either, because
* actions are always connected to filters and filters are already destroyed in
* RCU callbacks, so after a RCU grace period actions are already disconnected
@@ -63,46 +81,67 @@ static void tcf_action_goto_chain_exec(const struct tc_action *a,
static void free_tcf(struct tc_action *p)
{
free_percpu(p->cpu_bstats);
+ free_percpu(p->cpu_bstats_hw);
free_percpu(p->cpu_qstats);
- if (p->act_cookie) {
- kfree(p->act_cookie->data);
- kfree(p->act_cookie);
- }
+ tcf_set_action_cookie(&p->act_cookie, NULL);
if (p->goto_chain)
tcf_action_goto_chain_fini(p);
kfree(p);
}
-static void tcf_idr_remove(struct tcf_idrinfo *idrinfo, struct tc_action *p)
+static void tcf_action_cleanup(struct tc_action *p)
{
- spin_lock(&idrinfo->lock);
- idr_remove(&idrinfo->action_idr, p->tcfa_index);
- spin_unlock(&idrinfo->lock);
+ if (p->ops->cleanup)
+ p->ops->cleanup(p);
+
gen_kill_estimator(&p->tcfa_rate_est);
free_tcf(p);
}
+static int __tcf_action_put(struct tc_action *p, bool bind)
+{
+ struct tcf_idrinfo *idrinfo = p->idrinfo;
+
+ if (refcount_dec_and_mutex_lock(&p->tcfa_refcnt, &idrinfo->lock)) {
+ if (bind)
+ atomic_dec(&p->tcfa_bindcnt);
+ idr_remove(&idrinfo->action_idr, p->tcfa_index);
+ mutex_unlock(&idrinfo->lock);
+
+ tcf_action_cleanup(p);
+ return 1;
+ }
+
+ if (bind)
+ atomic_dec(&p->tcfa_bindcnt);
+
+ return 0;
+}
+
int __tcf_idr_release(struct tc_action *p, bool bind, bool strict)
{
int ret = 0;
- ASSERT_RTNL();
-
+ /* Release with strict==1 and bind==0 is only called through act API
+ * interface (classifiers always bind). Only case when action with
+ * positive reference count and zero bind count can exist is when it was
+ * also created with act API (unbinding last classifier will destroy the
+ * action if it was created by classifier). So only case when bind count
+ * can be changed after initial check is when unbound action is
+ * destroyed by act API while classifier binds to action with same id
+ * concurrently. This result either creation of new action(same behavior
+ * as before), or reusing existing action if concurrent process
+ * increments reference count before action is deleted. Both scenarios
+ * are acceptable.
+ */
if (p) {
- if (bind)
- p->tcfa_bindcnt--;
- else if (strict && p->tcfa_bindcnt > 0)
+ if (!bind && strict && atomic_read(&p->tcfa_bindcnt) > 0)
return -EPERM;
- p->tcfa_refcnt--;
- if (p->tcfa_bindcnt <= 0 && p->tcfa_refcnt <= 0) {
- if (p->ops->cleanup)
- p->ops->cleanup(p);
- tcf_idr_remove(p->idrinfo, p);
+ if (__tcf_action_put(p, bind))
ret = ACT_P_DELETED;
- }
}
return ret;
@@ -111,10 +150,15 @@ EXPORT_SYMBOL(__tcf_idr_release);
static size_t tcf_action_shared_attrs_size(const struct tc_action *act)
{
+ struct tc_cookie *act_cookie;
u32 cookie_len = 0;
- if (act->act_cookie)
- cookie_len = nla_total_size(act->act_cookie->len);
+ rcu_read_lock();
+ act_cookie = rcu_dereference(act->act_cookie);
+
+ if (act_cookie)
+ cookie_len = nla_total_size(act_cookie->len);
+ rcu_read_unlock();
return nla_total_size(0) /* action number nested */
+ nla_total_size(IFNAMSIZ) /* TCA_ACT_KIND */
@@ -156,7 +200,7 @@ static int tcf_dump_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb,
struct tc_action *p;
unsigned long id = 1;
- spin_lock(&idrinfo->lock);
+ mutex_lock(&idrinfo->lock);
s_i = cb->args[0];
@@ -191,7 +235,7 @@ done:
if (index >= 0)
cb->args[0] = index + 1;
- spin_unlock(&idrinfo->lock);
+ mutex_unlock(&idrinfo->lock);
if (n_i) {
if (act_flags & TCA_FLAG_LARGE_DUMP_ON)
cb->args[1] = n_i;
@@ -203,6 +247,20 @@ nla_put_failure:
goto done;
}
+static int tcf_idr_release_unsafe(struct tc_action *p)
+{
+ if (atomic_read(&p->tcfa_bindcnt) > 0)
+ return -EPERM;
+
+ if (refcount_dec_and_test(&p->tcfa_refcnt)) {
+ idr_remove(&p->idrinfo->action_idr, p->tcfa_index);
+ tcf_action_cleanup(p);
+ return ACT_P_DELETED;
+ }
+
+ return 0;
+}
+
static int tcf_del_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb,
const struct tc_action_ops *ops)
{
@@ -219,15 +277,19 @@ static int tcf_del_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb,
if (nla_put_string(skb, TCA_KIND, ops->kind))
goto nla_put_failure;
+ mutex_lock(&idrinfo->lock);
idr_for_each_entry_ul(idr, p, id) {
- ret = __tcf_idr_release(p, false, true);
+ ret = tcf_idr_release_unsafe(p);
if (ret == ACT_P_DELETED) {
module_put(ops->owner);
n_i++;
} else if (ret < 0) {
+ mutex_unlock(&idrinfo->lock);
goto nla_put_failure;
}
}
+ mutex_unlock(&idrinfo->lock);
+
if (nla_put_u32(skb, TCA_FCNT, n_i))
goto nla_put_failure;
nla_nest_end(skb, nest);
@@ -257,46 +319,59 @@ int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb,
}
EXPORT_SYMBOL(tcf_generic_walker);
-static struct tc_action *tcf_idr_lookup(u32 index, struct tcf_idrinfo *idrinfo)
-{
- struct tc_action *p = NULL;
-
- spin_lock(&idrinfo->lock);
- p = idr_find(&idrinfo->action_idr, index);
- spin_unlock(&idrinfo->lock);
-
- return p;
-}
-
int tcf_idr_search(struct tc_action_net *tn, struct tc_action **a, u32 index)
{
struct tcf_idrinfo *idrinfo = tn->idrinfo;
- struct tc_action *p = tcf_idr_lookup(index, idrinfo);
+ struct tc_action *p;
+
+ mutex_lock(&idrinfo->lock);
+ p = idr_find(&idrinfo->action_idr, index);
+ if (IS_ERR(p))
+ p = NULL;
+ else if (p)
+ refcount_inc(&p->tcfa_refcnt);
+ mutex_unlock(&idrinfo->lock);
if (p) {
*a = p;
- return 1;
+ return true;
}
- return 0;
+ return false;
}
EXPORT_SYMBOL(tcf_idr_search);
-bool tcf_idr_check(struct tc_action_net *tn, u32 index, struct tc_action **a,
- int bind)
+static int tcf_idr_delete_index(struct tcf_idrinfo *idrinfo, u32 index)
{
- struct tcf_idrinfo *idrinfo = tn->idrinfo;
- struct tc_action *p = tcf_idr_lookup(index, idrinfo);
+ struct tc_action *p;
+ int ret = 0;
- if (index && p) {
- if (bind)
- p->tcfa_bindcnt++;
- p->tcfa_refcnt++;
- *a = p;
- return true;
+ mutex_lock(&idrinfo->lock);
+ p = idr_find(&idrinfo->action_idr, index);
+ if (!p) {
+ mutex_unlock(&idrinfo->lock);
+ return -ENOENT;
}
- return false;
+
+ if (!atomic_read(&p->tcfa_bindcnt)) {
+ if (refcount_dec_and_test(&p->tcfa_refcnt)) {
+ struct module *owner = p->ops->owner;
+
+ WARN_ON(p != idr_remove(&idrinfo->action_idr,
+ p->tcfa_index));
+ mutex_unlock(&idrinfo->lock);
+
+ tcf_action_cleanup(p);
+ module_put(owner);
+ return 0;
+ }
+ ret = 0;
+ } else {
+ ret = -EPERM;
+ }
+
+ mutex_unlock(&idrinfo->lock);
+ return ret;
}
-EXPORT_SYMBOL(tcf_idr_check);
int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
struct tc_action **a, const struct tc_action_ops *ops,
@@ -304,38 +379,26 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
{
struct tc_action *p = kzalloc(ops->size, GFP_KERNEL);
struct tcf_idrinfo *idrinfo = tn->idrinfo;
- struct idr *idr = &idrinfo->action_idr;
int err = -ENOMEM;
if (unlikely(!p))
return -ENOMEM;
- p->tcfa_refcnt = 1;
+ refcount_set(&p->tcfa_refcnt, 1);
if (bind)
- p->tcfa_bindcnt = 1;
+ atomic_set(&p->tcfa_bindcnt, 1);
if (cpustats) {
p->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
if (!p->cpu_bstats)
goto err1;
+ p->cpu_bstats_hw = netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
+ if (!p->cpu_bstats_hw)
+ goto err2;
p->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
if (!p->cpu_qstats)
- goto err2;
+ goto err3;
}
spin_lock_init(&p->tcfa_lock);
- idr_preload(GFP_KERNEL);
- spin_lock(&idrinfo->lock);
- /* user doesn't specify an index */
- if (!index) {
- index = 1;
- err = idr_alloc_u32(idr, NULL, &index, UINT_MAX, GFP_ATOMIC);
- } else {
- err = idr_alloc_u32(idr, NULL, &index, index, GFP_ATOMIC);
- }
- spin_unlock(&idrinfo->lock);
- idr_preload_end();
- if (err)
- goto err3;
-
p->tcfa_index = index;
p->tcfa_tm.install = jiffies;
p->tcfa_tm.lastuse = jiffies;
@@ -350,13 +413,12 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
p->idrinfo = idrinfo;
p->ops = ops;
- INIT_LIST_HEAD(&p->list);
*a = p;
return 0;
err4:
- idr_remove(idr, index);
-err3:
free_percpu(p->cpu_qstats);
+err3:
+ free_percpu(p->cpu_bstats_hw);
err2:
free_percpu(p->cpu_bstats);
err1:
@@ -369,12 +431,79 @@ void tcf_idr_insert(struct tc_action_net *tn, struct tc_action *a)
{
struct tcf_idrinfo *idrinfo = tn->idrinfo;
- spin_lock(&idrinfo->lock);
- idr_replace(&idrinfo->action_idr, a, a->tcfa_index);
- spin_unlock(&idrinfo->lock);
+ mutex_lock(&idrinfo->lock);
+ /* Replace ERR_PTR(-EBUSY) allocated by tcf_idr_check_alloc */
+ WARN_ON(!IS_ERR(idr_replace(&idrinfo->action_idr, a, a->tcfa_index)));
+ mutex_unlock(&idrinfo->lock);
}
EXPORT_SYMBOL(tcf_idr_insert);
+/* Cleanup idr index that was allocated but not initialized. */
+
+void tcf_idr_cleanup(struct tc_action_net *tn, u32 index)
+{
+ struct tcf_idrinfo *idrinfo = tn->idrinfo;
+
+ mutex_lock(&idrinfo->lock);
+ /* Remove ERR_PTR(-EBUSY) allocated by tcf_idr_check_alloc */
+ WARN_ON(!IS_ERR(idr_remove(&idrinfo->action_idr, index)));
+ mutex_unlock(&idrinfo->lock);
+}
+EXPORT_SYMBOL(tcf_idr_cleanup);
+
+/* Check if action with specified index exists. If actions is found, increments
+ * its reference and bind counters, and return 1. Otherwise insert temporary
+ * error pointer (to prevent concurrent users from inserting actions with same
+ * index) and return 0.
+ */
+
+int tcf_idr_check_alloc(struct tc_action_net *tn, u32 *index,
+ struct tc_action **a, int bind)
+{
+ struct tcf_idrinfo *idrinfo = tn->idrinfo;
+ struct tc_action *p;
+ int ret;
+
+again:
+ mutex_lock(&idrinfo->lock);
+ if (*index) {
+ p = idr_find(&idrinfo->action_idr, *index);
+ if (IS_ERR(p)) {
+ /* This means that another process allocated
+ * index but did not assign the pointer yet.
+ */
+ mutex_unlock(&idrinfo->lock);
+ goto again;
+ }
+
+ if (p) {
+ refcount_inc(&p->tcfa_refcnt);
+ if (bind)
+ atomic_inc(&p->tcfa_bindcnt);
+ *a = p;
+ ret = 1;
+ } else {
+ *a = NULL;
+ ret = idr_alloc_u32(&idrinfo->action_idr, NULL, index,
+ *index, GFP_KERNEL);
+ if (!ret)
+ idr_replace(&idrinfo->action_idr,
+ ERR_PTR(-EBUSY), *index);
+ }
+ } else {
+ *index = 1;
+ *a = NULL;
+ ret = idr_alloc_u32(&idrinfo->action_idr, NULL, index,
+ UINT_MAX, GFP_KERNEL);
+ if (!ret)
+ idr_replace(&idrinfo->action_idr, ERR_PTR(-EBUSY),
+ *index);
+ }
+ mutex_unlock(&idrinfo->lock);
+ return ret;
+}
+EXPORT_SYMBOL(tcf_idr_check_alloc);
+
void tcf_idrinfo_destroy(const struct tc_action_ops *ops,
struct tcf_idrinfo *idrinfo)
{
@@ -538,13 +667,15 @@ repeat:
}
EXPORT_SYMBOL(tcf_action_exec);
-int tcf_action_destroy(struct list_head *actions, int bind)
+int tcf_action_destroy(struct tc_action *actions[], int bind)
{
const struct tc_action_ops *ops;
- struct tc_action *a, *tmp;
- int ret = 0;
+ struct tc_action *a;
+ int ret = 0, i;
- list_for_each_entry_safe(a, tmp, actions, list) {
+ for (i = 0; i < TCA_ACT_MAX_PRIO && actions[i]; i++) {
+ a = actions[i];
+ actions[i] = NULL;
ops = a->ops;
ret = __tcf_idr_release(a, bind, true);
if (ret == ACT_P_DELETED)
@@ -555,6 +686,35 @@ int tcf_action_destroy(struct list_head *actions, int bind)
return ret;
}
+static int tcf_action_destroy_1(struct tc_action *a, int bind)
+{
+ struct tc_action *actions[] = { a, NULL };
+
+ return tcf_action_destroy(actions, bind);
+}
+
+static int tcf_action_put(struct tc_action *p)
+{
+ return __tcf_action_put(p, false);
+}
+
+/* Put all actions in this array, skip those NULL's. */
+static void tcf_action_put_many(struct tc_action *actions[])
+{
+ int i;
+
+ for (i = 0; i < TCA_ACT_MAX_PRIO; i++) {
+ struct tc_action *a = actions[i];
+ const struct tc_action_ops *ops;
+
+ if (!a)
+ continue;
+ ops = a->ops;
+ if (tcf_action_put(a))
+ module_put(ops->owner);
+ }
+}
+
int
tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
{
@@ -567,16 +727,22 @@ tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
int err = -EINVAL;
unsigned char *b = skb_tail_pointer(skb);
struct nlattr *nest;
+ struct tc_cookie *cookie;
if (nla_put_string(skb, TCA_KIND, a->ops->kind))
goto nla_put_failure;
if (tcf_action_copy_stats(skb, a, 0))
goto nla_put_failure;
- if (a->act_cookie) {
- if (nla_put(skb, TCA_ACT_COOKIE, a->act_cookie->len,
- a->act_cookie->data))
+
+ rcu_read_lock();
+ cookie = rcu_dereference(a->act_cookie);
+ if (cookie) {
+ if (nla_put(skb, TCA_ACT_COOKIE, cookie->len, cookie->data)) {
+ rcu_read_unlock();
goto nla_put_failure;
+ }
}
+ rcu_read_unlock();
nest = nla_nest_start(skb, TCA_OPTIONS);
if (nest == NULL)
@@ -593,14 +759,15 @@ nla_put_failure:
}
EXPORT_SYMBOL(tcf_action_dump_1);
-int tcf_action_dump(struct sk_buff *skb, struct list_head *actions,
+int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[],
int bind, int ref)
{
struct tc_action *a;
- int err = -EINVAL;
+ int err = -EINVAL, i;
struct nlattr *nest;
- list_for_each_entry(a, actions, list) {
+ for (i = 0; i < TCA_ACT_MAX_PRIO && actions[i]; i++) {
+ a = actions[i];
nest = nla_nest_start(skb, a->order);
if (nest == NULL)
goto nla_put_failure;
@@ -635,9 +802,19 @@ static struct tc_cookie *nla_memdup_cookie(struct nlattr **tb)
return c;
}
+static bool tcf_action_valid(int action)
+{
+ int opcode = TC_ACT_EXT_OPCODE(action);
+
+ if (!opcode)
+ return action <= TC_ACT_VALUE_MAX;
+ return opcode <= TC_ACT_EXT_OPCODE_MAX || action == TC_ACT_UNSPEC;
+}
+
struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
struct nlattr *nla, struct nlattr *est,
char *name, int ovr, int bind,
+ bool rtnl_held,
struct netlink_ext_ack *extack)
{
struct tc_action *a;
@@ -688,9 +865,11 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
a_o = tc_lookup_action_n(act_name);
if (a_o == NULL) {
#ifdef CONFIG_MODULES
- rtnl_unlock();
+ if (rtnl_held)
+ rtnl_unlock();
request_module("act_%s", act_name);
- rtnl_lock();
+ if (rtnl_held)
+ rtnl_lock();
a_o = tc_lookup_action_n(act_name);
@@ -713,19 +892,15 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
/* backward compatibility for policer */
if (name == NULL)
err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, &a, ovr, bind,
- extack);
+ rtnl_held, extack);
else
- err = a_o->init(net, nla, est, &a, ovr, bind, extack);
+ err = a_o->init(net, nla, est, &a, ovr, bind, rtnl_held,
+ extack);
if (err < 0)
goto err_mod;
- if (name == NULL && tb[TCA_ACT_COOKIE]) {
- if (a->act_cookie) {
- kfree(a->act_cookie->data);
- kfree(a->act_cookie);
- }
- a->act_cookie = cookie;
- }
+ if (!name && tb[TCA_ACT_COOKIE])
+ tcf_set_action_cookie(&a->act_cookie, cookie);
/* module count goes up only when brand new policy is created
* if it exists and is only bound to in a_o->init() then
@@ -737,15 +912,18 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
if (TC_ACT_EXT_CMP(a->tcfa_action, TC_ACT_GOTO_CHAIN)) {
err = tcf_action_goto_chain_init(a, tp);
if (err) {
- LIST_HEAD(actions);
-
- list_add_tail(&a->list, &actions);
- tcf_action_destroy(&actions, bind);
+ tcf_action_destroy_1(a, bind);
NL_SET_ERR_MSG(extack, "Failed to init TC action chain");
return ERR_PTR(err);
}
}
+ if (!tcf_action_valid(a->tcfa_action)) {
+ tcf_action_destroy_1(a, bind);
+ NL_SET_ERR_MSG(extack, "Invalid control action value");
+ return ERR_PTR(-EINVAL);
+ }
+
return a;
err_mod:
@@ -758,21 +936,12 @@ err_out:
return ERR_PTR(err);
}
-static void cleanup_a(struct list_head *actions, int ovr)
-{
- struct tc_action *a;
-
- if (!ovr)
- return;
-
- list_for_each_entry(a, actions, list)
- a->tcfa_refcnt--;
-}
+/* Returns numbers of initialized actions or negative error. */
int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla,
struct nlattr *est, char *name, int ovr, int bind,
- struct list_head *actions, size_t *attr_size,
- struct netlink_ext_ack *extack)
+ struct tc_action *actions[], size_t *attr_size,
+ bool rtnl_held, struct netlink_ext_ack *extack)
{
struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
struct tc_action *act;
@@ -786,25 +955,19 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla,
for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) {
act = tcf_action_init_1(net, tp, tb[i], est, name, ovr, bind,
- extack);
+ rtnl_held, extack);
if (IS_ERR(act)) {
err = PTR_ERR(act);
goto err;
}
act->order = i;
sz += tcf_action_fill_size(act);
- if (ovr)
- act->tcfa_refcnt++;
- list_add_tail(&act->list, actions);
+ /* Start from index 0 */
+ actions[i - 1] = act;
}
*attr_size = tcf_action_full_attrs_size(sz);
-
- /* Remove the temp refcnt which was necessary to protect against
- * destroying an existing action which was being replaced
- */
- cleanup_a(actions, ovr);
- return 0;
+ return i - 1;
err:
tcf_action_destroy(actions, bind);
@@ -840,6 +1003,8 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p,
goto errout;
if (gnet_stats_copy_basic(NULL, &d, p->cpu_bstats, &p->tcfa_bstats) < 0 ||
+ gnet_stats_copy_basic_hw(NULL, &d, p->cpu_bstats_hw,
+ &p->tcfa_bstats_hw) < 0 ||
gnet_stats_copy_rate_est(&d, &p->tcfa_rate_est) < 0 ||
gnet_stats_copy_queue(&d, p->cpu_qstats,
&p->tcfa_qstats,
@@ -855,7 +1020,7 @@ errout:
return -1;
}
-static int tca_get_fill(struct sk_buff *skb, struct list_head *actions,
+static int tca_get_fill(struct sk_buff *skb, struct tc_action *actions[],
u32 portid, u32 seq, u16 flags, int event, int bind,
int ref)
{
@@ -891,7 +1056,7 @@ out_nlmsg_trim:
static int
tcf_get_notify(struct net *net, u32 portid, struct nlmsghdr *n,
- struct list_head *actions, int event,
+ struct tc_action *actions[], int event,
struct netlink_ext_ack *extack)
{
struct sk_buff *skb;
@@ -900,7 +1065,7 @@ tcf_get_notify(struct net *net, u32 portid, struct nlmsghdr *n,
if (!skb)
return -ENOBUFS;
if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, event,
- 0, 0) <= 0) {
+ 0, 1) <= 0) {
NL_SET_ERR_MSG(extack, "Failed to fill netlink attributes while adding TC action");
kfree_skb(skb);
return -EINVAL;
@@ -934,12 +1099,14 @@ static struct tc_action *tcf_action_get_1(struct net *net, struct nlattr *nla,
err = -EINVAL;
ops = tc_lookup_action(tb[TCA_ACT_KIND]);
if (!ops) { /* could happen in batch of actions */
- NL_SET_ERR_MSG(extack, "Specified TC action not found");
+ NL_SET_ERR_MSG(extack, "Specified TC action kind not found");
goto err_out;
}
err = -ENOENT;
- if (ops->lookup(net, &a, index, extack) == 0)
+ if (ops->lookup(net, &a, index) == 0) {
+ NL_SET_ERR_MSG(extack, "TC action with specified index not found");
goto err_mod;
+ }
module_put(ops->owner);
return a;
@@ -1027,8 +1194,37 @@ err_out:
return err;
}
+static int tcf_action_delete(struct net *net, struct tc_action *actions[])
+{
+ int i;
+
+ for (i = 0; i < TCA_ACT_MAX_PRIO && actions[i]; i++) {
+ struct tc_action *a = actions[i];
+ const struct tc_action_ops *ops = a->ops;
+ /* Actions can be deleted concurrently so we must save their
+ * type and id to search again after reference is released.
+ */
+ struct tcf_idrinfo *idrinfo = a->idrinfo;
+ u32 act_index = a->tcfa_index;
+
+ actions[i] = NULL;
+ if (tcf_action_put(a)) {
+ /* last reference, action was deleted concurrently */
+ module_put(ops->owner);
+ } else {
+ int ret;
+
+ /* now do the delete */
+ ret = tcf_idr_delete_index(idrinfo, act_index);
+ if (ret < 0)
+ return ret;
+ }
+ }
+ return 0;
+}
+
static int
-tcf_del_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions,
+tcf_del_notify(struct net *net, struct nlmsghdr *n, struct tc_action *actions[],
u32 portid, size_t attr_size, struct netlink_ext_ack *extack)
{
int ret;
@@ -1040,14 +1236,14 @@ tcf_del_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions,
return -ENOBUFS;
if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, RTM_DELACTION,
- 0, 1) <= 0) {
+ 0, 2) <= 0) {
NL_SET_ERR_MSG(extack, "Failed to fill netlink TC action attributes");
kfree_skb(skb);
return -EINVAL;
}
/* now do the delete */
- ret = tcf_action_destroy(actions, 0);
+ ret = tcf_action_delete(net, actions);
if (ret < 0) {
NL_SET_ERR_MSG(extack, "Failed to delete TC action");
kfree_skb(skb);
@@ -1069,7 +1265,7 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
struct tc_action *act;
size_t attr_size = 0;
- LIST_HEAD(actions);
+ struct tc_action *actions[TCA_ACT_MAX_PRIO] = {};
ret = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL, extack);
if (ret < 0)
@@ -1091,27 +1287,26 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
}
act->order = i;
attr_size += tcf_action_fill_size(act);
- list_add_tail(&act->list, &actions);
+ actions[i - 1] = act;
}
attr_size = tcf_action_full_attrs_size(attr_size);
if (event == RTM_GETACTION)
- ret = tcf_get_notify(net, portid, n, &actions, event, extack);
+ ret = tcf_get_notify(net, portid, n, actions, event, extack);
else { /* delete */
- ret = tcf_del_notify(net, n, &actions, portid, attr_size, extack);
+ ret = tcf_del_notify(net, n, actions, portid, attr_size, extack);
if (ret)
goto err;
- return ret;
+ return 0;
}
err:
- if (event != RTM_GETACTION)
- tcf_action_destroy(&actions, 0);
+ tcf_action_put_many(actions);
return ret;
}
static int
-tcf_add_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions,
+tcf_add_notify(struct net *net, struct nlmsghdr *n, struct tc_action *actions[],
u32 portid, size_t attr_size, struct netlink_ext_ack *extack)
{
struct sk_buff *skb;
@@ -1142,14 +1337,17 @@ static int tcf_action_add(struct net *net, struct nlattr *nla,
{
size_t attr_size = 0;
int ret = 0;
- LIST_HEAD(actions);
+ struct tc_action *actions[TCA_ACT_MAX_PRIO] = {};
- ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0, &actions,
- &attr_size, extack);
- if (ret)
+ ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0, actions,
+ &attr_size, true, extack);
+ if (ret < 0)
return ret;
+ ret = tcf_add_notify(net, n, actions, portid, attr_size, extack);
+ if (ovr)
+ tcf_action_put_many(actions);
- return tcf_add_notify(net, n, &actions, portid, attr_size, extack);
+ return ret;
}
static u32 tcaa_root_flags_allowed = TCA_FLAG_LARGE_DUMP_ON;
@@ -1254,7 +1452,7 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
u32 act_count = 0;
ret = nlmsg_parse(cb->nlh, sizeof(struct tcamsg), tb, TCA_ROOT_MAX,
- tcaa_policy, NULL);
+ tcaa_policy, cb->extack);
if (ret < 0)
return ret;
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index 18089c02e557..c7633843e223 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -34,8 +34,8 @@ struct tcf_bpf_cfg {
static unsigned int bpf_net_id;
static struct tc_action_ops act_bpf_ops;
-static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act,
- struct tcf_result *res)
+static int tcf_bpf_act(struct sk_buff *skb, const struct tc_action *act,
+ struct tcf_result *res)
{
bool at_ingress = skb_at_tc_ingress(skb);
struct tcf_bpf *prog = to_bpf(act);
@@ -141,13 +141,14 @@ static int tcf_bpf_dump(struct sk_buff *skb, struct tc_action *act,
struct tcf_bpf *prog = to_bpf(act);
struct tc_act_bpf opt = {
.index = prog->tcf_index,
- .refcnt = prog->tcf_refcnt - ref,
- .bindcnt = prog->tcf_bindcnt - bind,
- .action = prog->tcf_action,
+ .refcnt = refcount_read(&prog->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&prog->tcf_bindcnt) - bind,
};
struct tcf_t tm;
int ret;
+ spin_lock_bh(&prog->tcf_lock);
+ opt.action = prog->tcf_action;
if (nla_put(skb, TCA_ACT_BPF_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
@@ -163,9 +164,11 @@ static int tcf_bpf_dump(struct sk_buff *skb, struct tc_action *act,
TCA_ACT_BPF_PAD))
goto nla_put_failure;
+ spin_unlock_bh(&prog->tcf_lock);
return skb->len;
nla_put_failure:
+ spin_unlock_bh(&prog->tcf_lock);
nlmsg_trim(skb, tp);
return -1;
}
@@ -196,12 +199,10 @@ static int tcf_bpf_init_from_ops(struct nlattr **tb, struct tcf_bpf_cfg *cfg)
if (bpf_size != nla_len(tb[TCA_ACT_BPF_OPS]))
return -EINVAL;
- bpf_ops = kzalloc(bpf_size, GFP_KERNEL);
+ bpf_ops = kmemdup(nla_data(tb[TCA_ACT_BPF_OPS]), bpf_size, GFP_KERNEL);
if (bpf_ops == NULL)
return -ENOMEM;
- memcpy(bpf_ops, nla_data(tb[TCA_ACT_BPF_OPS]), bpf_size);
-
fprog_tmp.len = bpf_num_ops;
fprog_tmp.filter = bpf_ops;
@@ -266,7 +267,7 @@ static void tcf_bpf_prog_fill_cfg(const struct tcf_bpf *prog,
{
cfg->is_ebpf = tcf_bpf_is_ebpf(prog);
/* updates to prog->filter are prevented, since it's called either
- * with rtnl lock or during final cleanup in rcu callback
+ * with tcf lock or during final cleanup in rcu callback
*/
cfg->filter = rcu_dereference_protected(prog->filter, 1);
@@ -276,7 +277,8 @@ static void tcf_bpf_prog_fill_cfg(const struct tcf_bpf *prog,
static int tcf_bpf_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **act,
- int replace, int bind, struct netlink_ext_ack *extack)
+ int replace, int bind, bool rtnl_held,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, bpf_net_id);
struct nlattr *tb[TCA_ACT_BPF_MAX + 1];
@@ -298,21 +300,27 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
parm = nla_data(tb[TCA_ACT_BPF_PARMS]);
- if (!tcf_idr_check(tn, parm->index, act, bind)) {
+ ret = tcf_idr_check_alloc(tn, &parm->index, act, bind);
+ if (!ret) {
ret = tcf_idr_create(tn, parm->index, est, act,
&act_bpf_ops, bind, true);
- if (ret < 0)
+ if (ret < 0) {
+ tcf_idr_cleanup(tn, parm->index);
return ret;
+ }
res = ACT_P_CREATED;
- } else {
+ } else if (ret > 0) {
/* Don't override defaults. */
if (bind)
return 0;
- tcf_idr_release(*act, bind);
- if (!replace)
+ if (!replace) {
+ tcf_idr_release(*act, bind);
return -EEXIST;
+ }
+ } else {
+ return ret;
}
is_bpf = tb[TCA_ACT_BPF_OPS_LEN] && tb[TCA_ACT_BPF_OPS];
@@ -331,8 +339,8 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
goto out;
prog = to_bpf(*act);
- ASSERT_RTNL();
+ spin_lock_bh(&prog->tcf_lock);
if (res != ACT_P_CREATED)
tcf_bpf_prog_fill_cfg(prog, &old);
@@ -344,6 +352,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
prog->tcf_action = parm->action;
rcu_assign_pointer(prog->filter, cfg.filter);
+ spin_unlock_bh(&prog->tcf_lock);
if (res == ACT_P_CREATED) {
tcf_idr_insert(tn, *act);
@@ -355,8 +364,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
return res;
out:
- if (res == ACT_P_CREATED)
- tcf_idr_release(*act, bind);
+ tcf_idr_release(*act, bind);
return ret;
}
@@ -379,8 +387,7 @@ static int tcf_bpf_walker(struct net *net, struct sk_buff *skb,
return tcf_generic_walker(tn, skb, cb, type, ops, extack);
}
-static int tcf_bpf_search(struct net *net, struct tc_action **a, u32 index,
- struct netlink_ext_ack *extack)
+static int tcf_bpf_search(struct net *net, struct tc_action **a, u32 index)
{
struct tc_action_net *tn = net_generic(net, bpf_net_id);
@@ -391,7 +398,7 @@ static struct tc_action_ops act_bpf_ops __read_mostly = {
.kind = "bpf",
.type = TCA_ACT_BPF,
.owner = THIS_MODULE,
- .act = tcf_bpf,
+ .act = tcf_bpf_act,
.dump = tcf_bpf_dump,
.cleanup = tcf_bpf_cleanup,
.init = tcf_bpf_init,
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index e4b880fa51fe..8475913f2070 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -31,8 +31,8 @@
static unsigned int connmark_net_id;
static struct tc_action_ops act_connmark_ops;
-static int tcf_connmark(struct sk_buff *skb, const struct tc_action *a,
- struct tcf_result *res)
+static int tcf_connmark_act(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
{
const struct nf_conntrack_tuple_hash *thash;
struct nf_conntrack_tuple tuple;
@@ -96,7 +96,7 @@ static const struct nla_policy connmark_policy[TCA_CONNMARK_MAX + 1] = {
static int tcf_connmark_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
- int ovr, int bind,
+ int ovr, int bind, bool rtnl_held,
struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, connmark_net_id);
@@ -118,11 +118,14 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla,
parm = nla_data(tb[TCA_CONNMARK_PARMS]);
- if (!tcf_idr_check(tn, parm->index, a, bind)) {
+ ret = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ if (!ret) {
ret = tcf_idr_create(tn, parm->index, est, a,
&act_connmark_ops, bind, false);
- if (ret)
+ if (ret) {
+ tcf_idr_cleanup(tn, parm->index);
return ret;
+ }
ci = to_connmark(*a);
ci->tcf_action = parm->action;
@@ -131,16 +134,20 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla,
tcf_idr_insert(tn, *a);
ret = ACT_P_CREATED;
- } else {
+ } else if (ret > 0) {
ci = to_connmark(*a);
if (bind)
return 0;
- tcf_idr_release(*a, bind);
- if (!ovr)
+ if (!ovr) {
+ tcf_idr_release(*a, bind);
return -EEXIST;
+ }
/* replacing action and zone */
+ spin_lock_bh(&ci->tcf_lock);
ci->tcf_action = parm->action;
ci->zone = parm->zone;
+ spin_unlock_bh(&ci->tcf_lock);
+ ret = 0;
}
return ret;
@@ -151,16 +158,16 @@ static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a,
{
unsigned char *b = skb_tail_pointer(skb);
struct tcf_connmark_info *ci = to_connmark(a);
-
struct tc_connmark opt = {
.index = ci->tcf_index,
- .refcnt = ci->tcf_refcnt - ref,
- .bindcnt = ci->tcf_bindcnt - bind,
- .action = ci->tcf_action,
- .zone = ci->zone,
+ .refcnt = refcount_read(&ci->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&ci->tcf_bindcnt) - bind,
};
struct tcf_t t;
+ spin_lock_bh(&ci->tcf_lock);
+ opt.action = ci->tcf_action;
+ opt.zone = ci->zone;
if (nla_put(skb, TCA_CONNMARK_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
@@ -168,9 +175,12 @@ static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a,
if (nla_put_64bit(skb, TCA_CONNMARK_TM, sizeof(t), &t,
TCA_CONNMARK_PAD))
goto nla_put_failure;
+ spin_unlock_bh(&ci->tcf_lock);
return skb->len;
+
nla_put_failure:
+ spin_unlock_bh(&ci->tcf_lock);
nlmsg_trim(skb, b);
return -1;
}
@@ -185,8 +195,7 @@ static int tcf_connmark_walker(struct net *net, struct sk_buff *skb,
return tcf_generic_walker(tn, skb, cb, type, ops, extack);
}
-static int tcf_connmark_search(struct net *net, struct tc_action **a, u32 index,
- struct netlink_ext_ack *extack)
+static int tcf_connmark_search(struct net *net, struct tc_action **a, u32 index)
{
struct tc_action_net *tn = net_generic(net, connmark_net_id);
@@ -197,7 +206,7 @@ static struct tc_action_ops act_connmark_ops = {
.kind = "connmark",
.type = TCA_ACT_CONNMARK,
.owner = THIS_MODULE,
- .act = tcf_connmark,
+ .act = tcf_connmark_act,
.dump = tcf_connmark_dump,
.init = tcf_connmark_init,
.walk = tcf_connmark_walker,
@@ -239,4 +248,3 @@ module_exit(connmark_cleanup_module);
MODULE_AUTHOR("Felix Fietkau <nbd@openwrt.org>");
MODULE_DESCRIPTION("Connection tracking mark restoring");
MODULE_LICENSE("GPL");
-
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index 6e7124e57918..3dc25b7806d7 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -46,10 +46,11 @@ static struct tc_action_ops act_csum_ops;
static int tcf_csum_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a, int ovr,
- int bind, struct netlink_ext_ack *extack)
+ int bind, bool rtnl_held,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, csum_net_id);
- struct tcf_csum_params *params_old, *params_new;
+ struct tcf_csum_params *params_new;
struct nlattr *tb[TCA_CSUM_MAX + 1];
struct tc_csum *parm;
struct tcf_csum *p;
@@ -66,36 +67,43 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla,
return -EINVAL;
parm = nla_data(tb[TCA_CSUM_PARMS]);
- if (!tcf_idr_check(tn, parm->index, a, bind)) {
+ err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ if (!err) {
ret = tcf_idr_create(tn, parm->index, est, a,
&act_csum_ops, bind, true);
- if (ret)
+ if (ret) {
+ tcf_idr_cleanup(tn, parm->index);
return ret;
+ }
ret = ACT_P_CREATED;
- } else {
+ } else if (err > 0) {
if (bind)/* dont override defaults */
return 0;
- tcf_idr_release(*a, bind);
- if (!ovr)
+ if (!ovr) {
+ tcf_idr_release(*a, bind);
return -EEXIST;
+ }
+ } else {
+ return err;
}
p = to_tcf_csum(*a);
- ASSERT_RTNL();
params_new = kzalloc(sizeof(*params_new), GFP_KERNEL);
if (unlikely(!params_new)) {
- if (ret == ACT_P_CREATED)
- tcf_idr_release(*a, bind);
+ tcf_idr_release(*a, bind);
return -ENOMEM;
}
- params_old = rtnl_dereference(p->params);
+ params_new->update_flags = parm->update_flags;
+ spin_lock_bh(&p->tcf_lock);
p->tcf_action = parm->action;
- params_new->update_flags = parm->update_flags;
- rcu_assign_pointer(p->params, params_new);
- if (params_old)
- kfree_rcu(params_old, rcu);
+ rcu_swap_protected(p->params, params_new,
+ lockdep_is_held(&p->tcf_lock));
+ spin_unlock_bh(&p->tcf_lock);
+
+ if (params_new)
+ kfree_rcu(params_new, rcu);
if (ret == ACT_P_CREATED)
tcf_idr_insert(tn, *a);
@@ -547,23 +555,22 @@ fail:
return 0;
}
-static int tcf_csum(struct sk_buff *skb, const struct tc_action *a,
- struct tcf_result *res)
+static int tcf_csum_act(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
{
struct tcf_csum *p = to_tcf_csum(a);
struct tcf_csum_params *params;
u32 update_flags;
int action;
- rcu_read_lock();
- params = rcu_dereference(p->params);
+ params = rcu_dereference_bh(p->params);
tcf_lastuse_update(&p->tcf_tm);
bstats_cpu_update(this_cpu_ptr(p->common.cpu_bstats), skb);
action = READ_ONCE(p->tcf_action);
if (unlikely(action == TC_ACT_SHOT))
- goto drop_stats;
+ goto drop;
update_flags = params->update_flags;
switch (tc_skb_protocol(skb)) {
@@ -577,16 +584,11 @@ static int tcf_csum(struct sk_buff *skb, const struct tc_action *a,
break;
}
-unlock:
- rcu_read_unlock();
return action;
drop:
- action = TC_ACT_SHOT;
-
-drop_stats:
qstats_drop_inc(this_cpu_ptr(p->common.cpu_qstats));
- goto unlock;
+ return TC_ACT_SHOT;
}
static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind,
@@ -597,13 +599,15 @@ static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind,
struct tcf_csum_params *params;
struct tc_csum opt = {
.index = p->tcf_index,
- .refcnt = p->tcf_refcnt - ref,
- .bindcnt = p->tcf_bindcnt - bind,
- .action = p->tcf_action,
+ .refcnt = refcount_read(&p->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&p->tcf_bindcnt) - bind,
};
struct tcf_t t;
- params = rtnl_dereference(p->params);
+ spin_lock_bh(&p->tcf_lock);
+ params = rcu_dereference_protected(p->params,
+ lockdep_is_held(&p->tcf_lock));
+ opt.action = p->tcf_action;
opt.update_flags = params->update_flags;
if (nla_put(skb, TCA_CSUM_PARMS, sizeof(opt), &opt))
@@ -612,10 +616,12 @@ static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind,
tcf_tm_dump(&t, &p->tcf_tm);
if (nla_put_64bit(skb, TCA_CSUM_TM, sizeof(t), &t, TCA_CSUM_PAD))
goto nla_put_failure;
+ spin_unlock_bh(&p->tcf_lock);
return skb->len;
nla_put_failure:
+ spin_unlock_bh(&p->tcf_lock);
nlmsg_trim(skb, b);
return -1;
}
@@ -640,8 +646,7 @@ static int tcf_csum_walker(struct net *net, struct sk_buff *skb,
return tcf_generic_walker(tn, skb, cb, type, ops, extack);
}
-static int tcf_csum_search(struct net *net, struct tc_action **a, u32 index,
- struct netlink_ext_ack *extack)
+static int tcf_csum_search(struct net *net, struct tc_action **a, u32 index)
{
struct tc_action_net *tn = net_generic(net, csum_net_id);
@@ -657,7 +662,7 @@ static struct tc_action_ops act_csum_ops = {
.kind = "csum",
.type = TCA_ACT_CSUM,
.owner = THIS_MODULE,
- .act = tcf_csum,
+ .act = tcf_csum_act,
.dump = tcf_csum_dump,
.init = tcf_csum_init,
.cleanup = tcf_csum_cleanup,
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index 4dc4f153cad8..b61c20ebb314 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -56,7 +56,8 @@ static const struct nla_policy gact_policy[TCA_GACT_MAX + 1] = {
static int tcf_gact_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
- int ovr, int bind, struct netlink_ext_ack *extack)
+ int ovr, int bind, bool rtnl_held,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, gact_net_id);
struct nlattr *tb[TCA_GACT_MAX + 1];
@@ -87,26 +88,37 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla,
p_parm = nla_data(tb[TCA_GACT_PROB]);
if (p_parm->ptype >= MAX_RAND)
return -EINVAL;
+ if (TC_ACT_EXT_CMP(p_parm->paction, TC_ACT_GOTO_CHAIN)) {
+ NL_SET_ERR_MSG(extack,
+ "goto chain not allowed on fallback");
+ return -EINVAL;
+ }
}
#endif
- if (!tcf_idr_check(tn, parm->index, a, bind)) {
+ err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ if (!err) {
ret = tcf_idr_create(tn, parm->index, est, a,
&act_gact_ops, bind, true);
- if (ret)
+ if (ret) {
+ tcf_idr_cleanup(tn, parm->index);
return ret;
+ }
ret = ACT_P_CREATED;
- } else {
+ } else if (err > 0) {
if (bind)/* dont override defaults */
return 0;
- tcf_idr_release(*a, bind);
- if (!ovr)
+ if (!ovr) {
+ tcf_idr_release(*a, bind);
return -EEXIST;
+ }
+ } else {
+ return err;
}
gact = to_gact(*a);
- ASSERT_RTNL();
+ spin_lock_bh(&gact->tcf_lock);
gact->tcf_action = parm->action;
#ifdef CONFIG_GACT_PROB
if (p_parm) {
@@ -119,13 +131,15 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla,
gact->tcfg_ptype = p_parm->ptype;
}
#endif
+ spin_unlock_bh(&gact->tcf_lock);
+
if (ret == ACT_P_CREATED)
tcf_idr_insert(tn, *a);
return ret;
}
-static int tcf_gact(struct sk_buff *skb, const struct tc_action *a,
- struct tcf_result *res)
+static int tcf_gact_act(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
{
struct tcf_gact *gact = to_gact(a);
int action = READ_ONCE(gact->tcf_action);
@@ -148,7 +162,7 @@ static int tcf_gact(struct sk_buff *skb, const struct tc_action *a,
}
static void tcf_gact_stats_update(struct tc_action *a, u64 bytes, u32 packets,
- u64 lastuse)
+ u64 lastuse, bool hw)
{
struct tcf_gact *gact = to_gact(a);
int action = READ_ONCE(gact->tcf_action);
@@ -159,6 +173,10 @@ static void tcf_gact_stats_update(struct tc_action *a, u64 bytes, u32 packets,
if (action == TC_ACT_SHOT)
this_cpu_ptr(gact->common.cpu_qstats)->drops += packets;
+ if (hw)
+ _bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats_hw),
+ bytes, packets);
+
tm->lastuse = max_t(u64, tm->lastuse, lastuse);
}
@@ -169,12 +187,13 @@ static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a,
struct tcf_gact *gact = to_gact(a);
struct tc_gact opt = {
.index = gact->tcf_index,
- .refcnt = gact->tcf_refcnt - ref,
- .bindcnt = gact->tcf_bindcnt - bind,
- .action = gact->tcf_action,
+ .refcnt = refcount_read(&gact->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&gact->tcf_bindcnt) - bind,
};
struct tcf_t t;
+ spin_lock_bh(&gact->tcf_lock);
+ opt.action = gact->tcf_action;
if (nla_put(skb, TCA_GACT_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
#ifdef CONFIG_GACT_PROB
@@ -192,9 +211,12 @@ static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a,
tcf_tm_dump(&t, &gact->tcf_tm);
if (nla_put_64bit(skb, TCA_GACT_TM, sizeof(t), &t, TCA_GACT_PAD))
goto nla_put_failure;
+ spin_unlock_bh(&gact->tcf_lock);
+
return skb->len;
nla_put_failure:
+ spin_unlock_bh(&gact->tcf_lock);
nlmsg_trim(skb, b);
return -1;
}
@@ -209,8 +231,7 @@ static int tcf_gact_walker(struct net *net, struct sk_buff *skb,
return tcf_generic_walker(tn, skb, cb, type, ops, extack);
}
-static int tcf_gact_search(struct net *net, struct tc_action **a, u32 index,
- struct netlink_ext_ack *extack)
+static int tcf_gact_search(struct net *net, struct tc_action **a, u32 index)
{
struct tc_action_net *tn = net_generic(net, gact_net_id);
@@ -234,7 +255,7 @@ static struct tc_action_ops act_gact_ops = {
.kind = "gact",
.type = TCA_ACT_GACT,
.owner = THIS_MODULE,
- .act = tcf_gact,
+ .act = tcf_gact_act,
.stats_update = tcf_gact_stats_update,
.dump = tcf_gact_dump,
.init = tcf_gact_init,
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index 20d7d36b2fc9..30b63fa23ee2 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -265,10 +265,8 @@ static const char *ife_meta_id2name(u32 metaid)
#endif
/* called when adding new meta information
- * under ife->tcf_lock for existing action
*/
-static int load_metaops_and_vet(struct tcf_ife_info *ife, u32 metaid,
- void *val, int len, bool exists)
+static int load_metaops_and_vet(u32 metaid, void *val, int len, bool rtnl_held)
{
struct tcf_meta_ops *ops = find_ife_oplist(metaid);
int ret = 0;
@@ -276,13 +274,11 @@ static int load_metaops_and_vet(struct tcf_ife_info *ife, u32 metaid,
if (!ops) {
ret = -ENOENT;
#ifdef CONFIG_MODULES
- if (exists)
- spin_unlock_bh(&ife->tcf_lock);
- rtnl_unlock();
+ if (rtnl_held)
+ rtnl_unlock();
request_module("ife-meta-%s", ife_meta_id2name(metaid));
- rtnl_lock();
- if (exists)
- spin_lock_bh(&ife->tcf_lock);
+ if (rtnl_held)
+ rtnl_lock();
ops = find_ife_oplist(metaid);
#endif
}
@@ -299,24 +295,17 @@ static int load_metaops_and_vet(struct tcf_ife_info *ife, u32 metaid,
}
/* called when adding new meta information
- * under ife->tcf_lock for existing action
*/
-static int add_metainfo(struct tcf_ife_info *ife, u32 metaid, void *metaval,
- int len, bool atomic)
+static int __add_metainfo(const struct tcf_meta_ops *ops,
+ struct tcf_ife_info *ife, u32 metaid, void *metaval,
+ int len, bool atomic, bool exists)
{
struct tcf_meta_info *mi = NULL;
- struct tcf_meta_ops *ops = find_ife_oplist(metaid);
int ret = 0;
- if (!ops)
- return -ENOENT;
-
mi = kzalloc(sizeof(*mi), atomic ? GFP_ATOMIC : GFP_KERNEL);
- if (!mi) {
- /*put back what find_ife_oplist took */
- module_put(ops->owner);
+ if (!mi)
return -ENOMEM;
- }
mi->metaid = metaid;
mi->ops = ops;
@@ -324,17 +313,49 @@ static int add_metainfo(struct tcf_ife_info *ife, u32 metaid, void *metaval,
ret = ops->alloc(mi, metaval, atomic ? GFP_ATOMIC : GFP_KERNEL);
if (ret != 0) {
kfree(mi);
- module_put(ops->owner);
return ret;
}
}
+ if (exists)
+ spin_lock_bh(&ife->tcf_lock);
list_add_tail(&mi->metalist, &ife->metalist);
+ if (exists)
+ spin_unlock_bh(&ife->tcf_lock);
+
+ return ret;
+}
+static int add_metainfo_and_get_ops(const struct tcf_meta_ops *ops,
+ struct tcf_ife_info *ife, u32 metaid,
+ bool exists)
+{
+ int ret;
+
+ if (!try_module_get(ops->owner))
+ return -ENOENT;
+ ret = __add_metainfo(ops, ife, metaid, NULL, 0, true, exists);
+ if (ret)
+ module_put(ops->owner);
+ return ret;
+}
+
+static int add_metainfo(struct tcf_ife_info *ife, u32 metaid, void *metaval,
+ int len, bool exists)
+{
+ const struct tcf_meta_ops *ops = find_ife_oplist(metaid);
+ int ret;
+
+ if (!ops)
+ return -ENOENT;
+ ret = __add_metainfo(ops, ife, metaid, metaval, len, false, exists);
+ if (ret)
+ /*put back what find_ife_oplist took */
+ module_put(ops->owner);
return ret;
}
-static int use_all_metadata(struct tcf_ife_info *ife)
+static int use_all_metadata(struct tcf_ife_info *ife, bool exists)
{
struct tcf_meta_ops *o;
int rc = 0;
@@ -342,7 +363,7 @@ static int use_all_metadata(struct tcf_ife_info *ife)
read_lock(&ife_mod_lock);
list_for_each_entry(o, &ifeoplist, list) {
- rc = add_metainfo(ife, o->metaid, NULL, 0, true);
+ rc = add_metainfo_and_get_ops(o, ife, o->metaid, exists);
if (rc == 0)
installed += 1;
}
@@ -393,7 +414,6 @@ static void _tcf_ife_cleanup(struct tc_action *a)
struct tcf_meta_info *e, *n;
list_for_each_entry_safe(e, n, &ife->metalist, metalist) {
- module_put(e->ops->owner);
list_del(&e->metalist);
if (e->metaval) {
if (e->ops->release)
@@ -401,6 +421,7 @@ static void _tcf_ife_cleanup(struct tc_action *a)
else
kfree(e->metaval);
}
+ module_put(e->ops->owner);
kfree(e);
}
}
@@ -419,9 +440,8 @@ static void tcf_ife_cleanup(struct tc_action *a)
kfree_rcu(p, rcu);
}
-/* under ife->tcf_lock for existing action */
static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb,
- bool exists)
+ bool exists, bool rtnl_held)
{
int len = 0;
int rc = 0;
@@ -433,7 +453,7 @@ static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb,
val = nla_data(tb[i]);
len = nla_len(tb[i]);
- rc = load_metaops_and_vet(ife, i, val, len, exists);
+ rc = load_metaops_and_vet(i, val, len, rtnl_held);
if (rc != 0)
return rc;
@@ -448,12 +468,13 @@ static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb,
static int tcf_ife_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
- int ovr, int bind, struct netlink_ext_ack *extack)
+ int ovr, int bind, bool rtnl_held,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, ife_net_id);
struct nlattr *tb[TCA_IFE_MAX + 1];
struct nlattr *tb2[IFE_META_MAX + 1];
- struct tcf_ife_params *p, *p_old;
+ struct tcf_ife_params *p;
struct tcf_ife_info *ife;
u16 ife_type = ETH_P_IFE;
struct tc_ife *parm;
@@ -483,7 +504,12 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
if (!p)
return -ENOMEM;
- exists = tcf_idr_check(tn, parm->index, a, bind);
+ err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ if (err < 0) {
+ kfree(p);
+ return err;
+ }
+ exists = err;
if (exists && bind) {
kfree(p);
return 0;
@@ -493,16 +519,15 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
ret = tcf_idr_create(tn, parm->index, est, a, &act_ife_ops,
bind, true);
if (ret) {
+ tcf_idr_cleanup(tn, parm->index);
kfree(p);
return ret;
}
ret = ACT_P_CREATED;
- } else {
+ } else if (!ovr) {
tcf_idr_release(*a, bind);
- if (!ovr) {
- kfree(p);
- return -EEXIST;
- }
+ kfree(p);
+ return -EEXIST;
}
ife = to_ife(*a);
@@ -531,8 +556,6 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
p->eth_type = ife_type;
}
- if (exists)
- spin_lock_bh(&ife->tcf_lock);
if (ret == ACT_P_CREATED)
INIT_LIST_HEAD(&ife->metalist);
@@ -542,16 +565,12 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
NULL, NULL);
if (err) {
metadata_parse_err:
- if (ret == ACT_P_CREATED)
- tcf_idr_release(*a, bind);
-
- if (exists)
- spin_unlock_bh(&ife->tcf_lock);
+ tcf_idr_release(*a, bind);
kfree(p);
return err;
}
- err = populate_metalist(ife, tb2, exists);
+ err = populate_metalist(ife, tb2, exists, rtnl_held);
if (err)
goto metadata_parse_err;
@@ -561,26 +580,24 @@ metadata_parse_err:
* as we can. You better have at least one else we are
* going to bail out
*/
- err = use_all_metadata(ife);
+ err = use_all_metadata(ife, exists);
if (err) {
- if (ret == ACT_P_CREATED)
- tcf_idr_release(*a, bind);
-
- if (exists)
- spin_unlock_bh(&ife->tcf_lock);
+ tcf_idr_release(*a, bind);
kfree(p);
return err;
}
}
+ if (exists)
+ spin_lock_bh(&ife->tcf_lock);
ife->tcf_action = parm->action;
+ /* protected by tcf_lock when modifying existing action */
+ rcu_swap_protected(ife->params, p, 1);
+
if (exists)
spin_unlock_bh(&ife->tcf_lock);
-
- p_old = rtnl_dereference(ife->params);
- rcu_assign_pointer(ife->params, p);
- if (p_old)
- kfree_rcu(p_old, rcu);
+ if (p)
+ kfree_rcu(p, rcu);
if (ret == ACT_P_CREATED)
tcf_idr_insert(tn, *a);
@@ -593,16 +610,20 @@ static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind,
{
unsigned char *b = skb_tail_pointer(skb);
struct tcf_ife_info *ife = to_ife(a);
- struct tcf_ife_params *p = rtnl_dereference(ife->params);
+ struct tcf_ife_params *p;
struct tc_ife opt = {
.index = ife->tcf_index,
- .refcnt = ife->tcf_refcnt - ref,
- .bindcnt = ife->tcf_bindcnt - bind,
- .action = ife->tcf_action,
- .flags = p->flags,
+ .refcnt = refcount_read(&ife->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&ife->tcf_bindcnt) - bind,
};
struct tcf_t t;
+ spin_lock_bh(&ife->tcf_lock);
+ opt.action = ife->tcf_action;
+ p = rcu_dereference_protected(ife->params,
+ lockdep_is_held(&ife->tcf_lock));
+ opt.flags = p->flags;
+
if (nla_put(skb, TCA_IFE_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
@@ -628,9 +649,11 @@ static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind,
pr_info("Failed to dump metalist\n");
}
+ spin_unlock_bh(&ife->tcf_lock);
return skb->len;
nla_put_failure:
+ spin_unlock_bh(&ife->tcf_lock);
nlmsg_trim(skb, b);
return -1;
}
@@ -813,14 +836,11 @@ static int tcf_ife_act(struct sk_buff *skb, const struct tc_action *a,
struct tcf_ife_params *p;
int ret;
- rcu_read_lock();
- p = rcu_dereference(ife->params);
+ p = rcu_dereference_bh(ife->params);
if (p->flags & IFE_ENCODE) {
ret = tcf_ife_encode(skb, a, res, p);
- rcu_read_unlock();
return ret;
}
- rcu_read_unlock();
return tcf_ife_decode(skb, a, res);
}
@@ -835,8 +855,7 @@ static int tcf_ife_walker(struct net *net, struct sk_buff *skb,
return tcf_generic_walker(tn, skb, cb, type, ops, extack);
}
-static int tcf_ife_search(struct net *net, struct tc_action **a, u32 index,
- struct netlink_ext_ack *extack)
+static int tcf_ife_search(struct net *net, struct tc_action **a, u32 index)
{
struct tc_action_net *tn = net_generic(net, ife_net_id);
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index 14c312d7908f..8af6c11d2482 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -119,36 +119,46 @@ static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla,
if (tb[TCA_IPT_INDEX] != NULL)
index = nla_get_u32(tb[TCA_IPT_INDEX]);
- exists = tcf_idr_check(tn, index, a, bind);
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
+ if (err < 0)
+ return err;
+ exists = err;
if (exists && bind)
return 0;
if (tb[TCA_IPT_HOOK] == NULL || tb[TCA_IPT_TARG] == NULL) {
if (exists)
tcf_idr_release(*a, bind);
+ else
+ tcf_idr_cleanup(tn, index);
return -EINVAL;
}
td = (struct xt_entry_target *)nla_data(tb[TCA_IPT_TARG]);
- if (nla_len(tb[TCA_IPT_TARG]) < td->u.target_size) {
+ if (nla_len(tb[TCA_IPT_TARG]) != td->u.target_size) {
if (exists)
tcf_idr_release(*a, bind);
+ else
+ tcf_idr_cleanup(tn, index);
return -EINVAL;
}
if (!exists) {
ret = tcf_idr_create(tn, index, est, a, ops, bind,
false);
- if (ret)
+ if (ret) {
+ tcf_idr_cleanup(tn, index);
return ret;
+ }
ret = ACT_P_CREATED;
} else {
if (bind)/* dont override defaults */
return 0;
- tcf_idr_release(*a, bind);
- if (!ovr)
+ if (!ovr) {
+ tcf_idr_release(*a, bind);
return -EEXIST;
+ }
}
hook = nla_get_u32(tb[TCA_IPT_HOOK]);
@@ -196,7 +206,8 @@ err1:
static int tcf_ipt_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a, int ovr,
- int bind, struct netlink_ext_ack *extack)
+ int bind, bool rtnl_held,
+ struct netlink_ext_ack *extack)
{
return __tcf_ipt_init(net, ipt_net_id, nla, est, a, &act_ipt_ops, ovr,
bind);
@@ -204,14 +215,15 @@ static int tcf_ipt_init(struct net *net, struct nlattr *nla,
static int tcf_xt_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a, int ovr,
- int bind, struct netlink_ext_ack *extack)
+ int bind, bool unlocked,
+ struct netlink_ext_ack *extack)
{
return __tcf_ipt_init(net, xt_net_id, nla, est, a, &act_xt_ops, ovr,
bind);
}
-static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a,
- struct tcf_result *res)
+static int tcf_ipt_act(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
{
int ret = 0, result = 0;
struct tcf_ipt *ipt = to_ipt(a);
@@ -276,12 +288,13 @@ static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind,
* for foolproof you need to not assume this
*/
+ spin_lock_bh(&ipt->tcf_lock);
t = kmemdup(ipt->tcfi_t, ipt->tcfi_t->u.user.target_size, GFP_ATOMIC);
if (unlikely(!t))
goto nla_put_failure;
- c.bindcnt = ipt->tcf_bindcnt - bind;
- c.refcnt = ipt->tcf_refcnt - ref;
+ c.bindcnt = atomic_read(&ipt->tcf_bindcnt) - bind;
+ c.refcnt = refcount_read(&ipt->tcf_refcnt) - ref;
strcpy(t->u.user.name, ipt->tcfi_t->u.kernel.target->name);
if (nla_put(skb, TCA_IPT_TARG, ipt->tcfi_t->u.user.target_size, t) ||
@@ -295,10 +308,12 @@ static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind,
if (nla_put_64bit(skb, TCA_IPT_TM, sizeof(tm), &tm, TCA_IPT_PAD))
goto nla_put_failure;
+ spin_unlock_bh(&ipt->tcf_lock);
kfree(t);
return skb->len;
nla_put_failure:
+ spin_unlock_bh(&ipt->tcf_lock);
nlmsg_trim(skb, b);
kfree(t);
return -1;
@@ -314,8 +329,7 @@ static int tcf_ipt_walker(struct net *net, struct sk_buff *skb,
return tcf_generic_walker(tn, skb, cb, type, ops, extack);
}
-static int tcf_ipt_search(struct net *net, struct tc_action **a, u32 index,
- struct netlink_ext_ack *extack)
+static int tcf_ipt_search(struct net *net, struct tc_action **a, u32 index)
{
struct tc_action_net *tn = net_generic(net, ipt_net_id);
@@ -326,7 +340,7 @@ static struct tc_action_ops act_ipt_ops = {
.kind = "ipt",
.type = TCA_ACT_IPT,
.owner = THIS_MODULE,
- .act = tcf_ipt,
+ .act = tcf_ipt_act,
.dump = tcf_ipt_dump,
.cleanup = tcf_ipt_release,
.init = tcf_ipt_init,
@@ -364,8 +378,7 @@ static int tcf_xt_walker(struct net *net, struct sk_buff *skb,
return tcf_generic_walker(tn, skb, cb, type, ops, extack);
}
-static int tcf_xt_search(struct net *net, struct tc_action **a, u32 index,
- struct netlink_ext_ack *extack)
+static int tcf_xt_search(struct net *net, struct tc_action **a, u32 index)
{
struct tc_action_net *tn = net_generic(net, xt_net_id);
@@ -376,7 +389,7 @@ static struct tc_action_ops act_xt_ops = {
.kind = "xt",
.type = TCA_ACT_XT,
.owner = THIS_MODULE,
- .act = tcf_ipt,
+ .act = tcf_ipt_act,
.dump = tcf_ipt_dump,
.cleanup = tcf_ipt_release,
.init = tcf_xt_init,
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index fd34015331ab..1dae5f2b358f 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -25,10 +25,12 @@
#include <net/net_namespace.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
#include <linux/tc_act/tc_mirred.h>
#include <net/tc_act/tc_mirred.h>
static LIST_HEAD(mirred_list);
+static DEFINE_SPINLOCK(mirred_list_lock);
static bool tcf_mirred_is_act_redirect(int action)
{
@@ -49,13 +51,35 @@ static bool tcf_mirred_act_wants_ingress(int action)
}
}
+static bool tcf_mirred_can_reinsert(int action)
+{
+ switch (action) {
+ case TC_ACT_SHOT:
+ case TC_ACT_STOLEN:
+ case TC_ACT_QUEUED:
+ case TC_ACT_TRAP:
+ return true;
+ }
+ return false;
+}
+
+static struct net_device *tcf_mirred_dev_dereference(struct tcf_mirred *m)
+{
+ return rcu_dereference_protected(m->tcfm_dev,
+ lockdep_is_held(&m->tcf_lock));
+}
+
static void tcf_mirred_release(struct tc_action *a)
{
struct tcf_mirred *m = to_mirred(a);
struct net_device *dev;
+ spin_lock(&mirred_list_lock);
list_del(&m->tcfm_list);
- dev = rtnl_dereference(m->tcfm_dev);
+ spin_unlock(&mirred_list_lock);
+
+ /* last reference to action, no need to lock */
+ dev = rcu_dereference_protected(m->tcfm_dev, 1);
if (dev)
dev_put(dev);
}
@@ -68,8 +92,9 @@ static unsigned int mirred_net_id;
static struct tc_action_ops act_mirred_ops;
static int tcf_mirred_init(struct net *net, struct nlattr *nla,
- struct nlattr *est, struct tc_action **a, int ovr,
- int bind, struct netlink_ext_ack *extack)
+ struct nlattr *est, struct tc_action **a,
+ int ovr, int bind, bool rtnl_held,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, mirred_net_id);
struct nlattr *tb[TCA_MIRRED_MAX + 1];
@@ -78,7 +103,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
struct tcf_mirred *m;
struct net_device *dev;
bool exists = false;
- int ret;
+ int ret, err;
if (!nla) {
NL_SET_ERR_MSG_MOD(extack, "Mirred requires attributes to be passed");
@@ -93,7 +118,10 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
}
parm = nla_data(tb[TCA_MIRRED_PARMS]);
- exists = tcf_idr_check(tn, parm->index, a, bind);
+ err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ if (err < 0)
+ return err;
+ exists = err;
if (exists && bind)
return 0;
@@ -106,76 +134,83 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
default:
if (exists)
tcf_idr_release(*a, bind);
+ else
+ tcf_idr_cleanup(tn, parm->index);
NL_SET_ERR_MSG_MOD(extack, "Unknown mirred option");
return -EINVAL;
}
- if (parm->ifindex) {
- dev = __dev_get_by_index(net, parm->ifindex);
- if (dev == NULL) {
- if (exists)
- tcf_idr_release(*a, bind);
- return -ENODEV;
- }
- mac_header_xmit = dev_is_mac_header_xmit(dev);
- } else {
- dev = NULL;
- }
if (!exists) {
- if (!dev) {
+ if (!parm->ifindex) {
+ tcf_idr_cleanup(tn, parm->index);
NL_SET_ERR_MSG_MOD(extack, "Specified device does not exist");
return -EINVAL;
}
ret = tcf_idr_create(tn, parm->index, est, a,
&act_mirred_ops, bind, true);
- if (ret)
+ if (ret) {
+ tcf_idr_cleanup(tn, parm->index);
return ret;
+ }
ret = ACT_P_CREATED;
- } else {
+ } else if (!ovr) {
tcf_idr_release(*a, bind);
- if (!ovr)
- return -EEXIST;
+ return -EEXIST;
}
m = to_mirred(*a);
- ASSERT_RTNL();
+ spin_lock_bh(&m->tcf_lock);
m->tcf_action = parm->action;
m->tcfm_eaction = parm->eaction;
- if (dev != NULL) {
- if (ret != ACT_P_CREATED)
- dev_put(rcu_dereference_protected(m->tcfm_dev, 1));
- dev_hold(dev);
- rcu_assign_pointer(m->tcfm_dev, dev);
+
+ if (parm->ifindex) {
+ dev = dev_get_by_index(net, parm->ifindex);
+ if (!dev) {
+ spin_unlock_bh(&m->tcf_lock);
+ tcf_idr_release(*a, bind);
+ return -ENODEV;
+ }
+ mac_header_xmit = dev_is_mac_header_xmit(dev);
+ rcu_swap_protected(m->tcfm_dev, dev,
+ lockdep_is_held(&m->tcf_lock));
+ if (dev)
+ dev_put(dev);
m->tcfm_mac_header_xmit = mac_header_xmit;
}
+ spin_unlock_bh(&m->tcf_lock);
if (ret == ACT_P_CREATED) {
+ spin_lock(&mirred_list_lock);
list_add(&m->tcfm_list, &mirred_list);
+ spin_unlock(&mirred_list_lock);
+
tcf_idr_insert(tn, *a);
}
return ret;
}
-static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
- struct tcf_result *res)
+static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
{
struct tcf_mirred *m = to_mirred(a);
+ struct sk_buff *skb2 = skb;
bool m_mac_header_xmit;
struct net_device *dev;
- struct sk_buff *skb2;
int retval, err = 0;
+ bool use_reinsert;
+ bool want_ingress;
+ bool is_redirect;
int m_eaction;
int mac_len;
tcf_lastuse_update(&m->tcf_tm);
bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb);
- rcu_read_lock();
m_mac_header_xmit = READ_ONCE(m->tcfm_mac_header_xmit);
m_eaction = READ_ONCE(m->tcfm_eaction);
retval = READ_ONCE(m->tcf_action);
- dev = rcu_dereference(m->tcfm_dev);
+ dev = rcu_dereference_bh(m->tcfm_dev);
if (unlikely(!dev)) {
pr_notice_once("tc mirred: target device is gone\n");
goto out;
@@ -187,16 +222,25 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
goto out;
}
- skb2 = skb_clone(skb, GFP_ATOMIC);
- if (!skb2)
- goto out;
+ /* we could easily avoid the clone only if called by ingress and clsact;
+ * since we can't easily detect the clsact caller, skip clone only for
+ * ingress - that covers the TC S/W datapath.
+ */
+ is_redirect = tcf_mirred_is_act_redirect(m_eaction);
+ use_reinsert = skb_at_tc_ingress(skb) && is_redirect &&
+ tcf_mirred_can_reinsert(retval);
+ if (!use_reinsert) {
+ skb2 = skb_clone(skb, GFP_ATOMIC);
+ if (!skb2)
+ goto out;
+ }
/* If action's target direction differs than filter's direction,
* and devices expect a mac header on xmit, then mac push/pull is
* needed.
*/
- if (skb_at_tc_ingress(skb) != tcf_mirred_act_wants_ingress(m_eaction) &&
- m_mac_header_xmit) {
+ want_ingress = tcf_mirred_act_wants_ingress(m_eaction);
+ if (skb_at_tc_ingress(skb) != want_ingress && m_mac_header_xmit) {
if (!skb_at_tc_ingress(skb)) {
/* caught at egress, act ingress: pull mac */
mac_len = skb_network_header(skb) - skb_mac_header(skb);
@@ -207,15 +251,23 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
}
}
+ skb2->skb_iif = skb->dev->ifindex;
+ skb2->dev = dev;
+
/* mirror is always swallowed */
- if (tcf_mirred_is_act_redirect(m_eaction)) {
+ if (is_redirect) {
skb2->tc_redirected = 1;
skb2->tc_from_ingress = skb2->tc_at_ingress;
+
+ /* let's the caller reinsert the packet, if possible */
+ if (use_reinsert) {
+ res->ingress = want_ingress;
+ res->qstats = this_cpu_ptr(m->common.cpu_qstats);
+ return TC_ACT_REINSERT;
+ }
}
- skb2->skb_iif = skb->dev->ifindex;
- skb2->dev = dev;
- if (!tcf_mirred_act_wants_ingress(m_eaction))
+ if (!want_ingress)
err = dev_queue_xmit(skb2);
else
err = netif_receive_skb(skb2);
@@ -226,18 +278,20 @@ out:
if (tcf_mirred_is_act_redirect(m_eaction))
retval = TC_ACT_SHOT;
}
- rcu_read_unlock();
return retval;
}
static void tcf_stats_update(struct tc_action *a, u64 bytes, u32 packets,
- u64 lastuse)
+ u64 lastuse, bool hw)
{
struct tcf_mirred *m = to_mirred(a);
struct tcf_t *tm = &m->tcf_tm;
_bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets);
+ if (hw)
+ _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw),
+ bytes, packets);
tm->lastuse = max_t(u64, tm->lastuse, lastuse);
}
@@ -246,26 +300,33 @@ static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind,
{
unsigned char *b = skb_tail_pointer(skb);
struct tcf_mirred *m = to_mirred(a);
- struct net_device *dev = rtnl_dereference(m->tcfm_dev);
struct tc_mirred opt = {
.index = m->tcf_index,
- .action = m->tcf_action,
- .refcnt = m->tcf_refcnt - ref,
- .bindcnt = m->tcf_bindcnt - bind,
- .eaction = m->tcfm_eaction,
- .ifindex = dev ? dev->ifindex : 0,
+ .refcnt = refcount_read(&m->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&m->tcf_bindcnt) - bind,
};
+ struct net_device *dev;
struct tcf_t t;
+ spin_lock_bh(&m->tcf_lock);
+ opt.action = m->tcf_action;
+ opt.eaction = m->tcfm_eaction;
+ dev = tcf_mirred_dev_dereference(m);
+ if (dev)
+ opt.ifindex = dev->ifindex;
+
if (nla_put(skb, TCA_MIRRED_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
tcf_tm_dump(&t, &m->tcf_tm);
if (nla_put_64bit(skb, TCA_MIRRED_TM, sizeof(t), &t, TCA_MIRRED_PAD))
goto nla_put_failure;
+ spin_unlock_bh(&m->tcf_lock);
+
return skb->len;
nla_put_failure:
+ spin_unlock_bh(&m->tcf_lock);
nlmsg_trim(skb, b);
return -1;
}
@@ -280,8 +341,7 @@ static int tcf_mirred_walker(struct net *net, struct sk_buff *skb,
return tcf_generic_walker(tn, skb, cb, type, ops, extack);
}
-static int tcf_mirred_search(struct net *net, struct tc_action **a, u32 index,
- struct netlink_ext_ack *extack)
+static int tcf_mirred_search(struct net *net, struct tc_action **a, u32 index)
{
struct tc_action_net *tn = net_generic(net, mirred_net_id);
@@ -296,15 +356,19 @@ static int mirred_device_event(struct notifier_block *unused,
ASSERT_RTNL();
if (event == NETDEV_UNREGISTER) {
+ spin_lock(&mirred_list_lock);
list_for_each_entry(m, &mirred_list, tcfm_list) {
- if (rcu_access_pointer(m->tcfm_dev) == dev) {
+ spin_lock_bh(&m->tcf_lock);
+ if (tcf_mirred_dev_dereference(m) == dev) {
dev_put(dev);
/* Note : no rcu grace period necessary, as
* net_device are already rcu protected.
*/
RCU_INIT_POINTER(m->tcfm_dev, NULL);
}
+ spin_unlock_bh(&m->tcf_lock);
}
+ spin_unlock(&mirred_list_lock);
}
return NOTIFY_DONE;
@@ -317,15 +381,27 @@ static struct notifier_block mirred_device_notifier = {
static struct net_device *tcf_mirred_get_dev(const struct tc_action *a)
{
struct tcf_mirred *m = to_mirred(a);
+ struct net_device *dev;
+
+ rcu_read_lock();
+ dev = rcu_dereference(m->tcfm_dev);
+ if (dev)
+ dev_hold(dev);
+ rcu_read_unlock();
- return rtnl_dereference(m->tcfm_dev);
+ return dev;
+}
+
+static void tcf_mirred_put_dev(struct net_device *dev)
+{
+ dev_put(dev);
}
static struct tc_action_ops act_mirred_ops = {
.kind = "mirred",
.type = TCA_ACT_MIRRED,
.owner = THIS_MODULE,
- .act = tcf_mirred,
+ .act = tcf_mirred_act,
.stats_update = tcf_stats_update,
.dump = tcf_mirred_dump,
.cleanup = tcf_mirred_release,
@@ -334,6 +410,7 @@ static struct tc_action_ops act_mirred_ops = {
.lookup = tcf_mirred_search,
.size = sizeof(struct tcf_mirred),
.get_dev = tcf_mirred_get_dev,
+ .put_dev = tcf_mirred_put_dev,
};
static __net_init int mirred_init_net(struct net *net)
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index 4b5848b6c252..c5c1e23add77 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -38,7 +38,7 @@ static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = {
static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
struct tc_action **a, int ovr, int bind,
- struct netlink_ext_ack *extack)
+ bool rtnl_held, struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, nat_net_id);
struct nlattr *tb[TCA_NAT_MAX + 1];
@@ -57,18 +57,24 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
return -EINVAL;
parm = nla_data(tb[TCA_NAT_PARMS]);
- if (!tcf_idr_check(tn, parm->index, a, bind)) {
+ err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ if (!err) {
ret = tcf_idr_create(tn, parm->index, est, a,
&act_nat_ops, bind, false);
- if (ret)
+ if (ret) {
+ tcf_idr_cleanup(tn, parm->index);
return ret;
+ }
ret = ACT_P_CREATED;
- } else {
+ } else if (err > 0) {
if (bind)
return 0;
- tcf_idr_release(*a, bind);
- if (!ovr)
+ if (!ovr) {
+ tcf_idr_release(*a, bind);
return -EEXIST;
+ }
+ } else {
+ return err;
}
p = to_tcf_nat(*a);
@@ -87,8 +93,8 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
return ret;
}
-static int tcf_nat(struct sk_buff *skb, const struct tc_action *a,
- struct tcf_result *res)
+static int tcf_nat_act(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
{
struct tcf_nat *p = to_tcf_nat(a);
struct iphdr *iph;
@@ -250,28 +256,31 @@ static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a,
unsigned char *b = skb_tail_pointer(skb);
struct tcf_nat *p = to_tcf_nat(a);
struct tc_nat opt = {
- .old_addr = p->old_addr,
- .new_addr = p->new_addr,
- .mask = p->mask,
- .flags = p->flags,
-
.index = p->tcf_index,
- .action = p->tcf_action,
- .refcnt = p->tcf_refcnt - ref,
- .bindcnt = p->tcf_bindcnt - bind,
+ .refcnt = refcount_read(&p->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&p->tcf_bindcnt) - bind,
};
struct tcf_t t;
+ spin_lock_bh(&p->tcf_lock);
+ opt.old_addr = p->old_addr;
+ opt.new_addr = p->new_addr;
+ opt.mask = p->mask;
+ opt.flags = p->flags;
+ opt.action = p->tcf_action;
+
if (nla_put(skb, TCA_NAT_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
tcf_tm_dump(&t, &p->tcf_tm);
if (nla_put_64bit(skb, TCA_NAT_TM, sizeof(t), &t, TCA_NAT_PAD))
goto nla_put_failure;
+ spin_unlock_bh(&p->tcf_lock);
return skb->len;
nla_put_failure:
+ spin_unlock_bh(&p->tcf_lock);
nlmsg_trim(skb, b);
return -1;
}
@@ -286,8 +295,7 @@ static int tcf_nat_walker(struct net *net, struct sk_buff *skb,
return tcf_generic_walker(tn, skb, cb, type, ops, extack);
}
-static int tcf_nat_search(struct net *net, struct tc_action **a, u32 index,
- struct netlink_ext_ack *extack)
+static int tcf_nat_search(struct net *net, struct tc_action **a, u32 index)
{
struct tc_action_net *tn = net_generic(net, nat_net_id);
@@ -298,7 +306,7 @@ static struct tc_action_ops act_nat_ops = {
.kind = "nat",
.type = TCA_ACT_NAT,
.owner = THIS_MODULE,
- .act = tcf_nat,
+ .act = tcf_nat_act,
.dump = tcf_nat_dump,
.init = tcf_nat_init,
.walk = tcf_nat_walker,
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index 8a925c72db5f..da3dd0f68cc2 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -109,16 +109,18 @@ static int tcf_pedit_key_ex_dump(struct sk_buff *skb,
{
struct nlattr *keys_start = nla_nest_start(skb, TCA_PEDIT_KEYS_EX);
+ if (!keys_start)
+ goto nla_failure;
for (; n > 0; n--) {
struct nlattr *key_start;
key_start = nla_nest_start(skb, TCA_PEDIT_KEY_EX);
+ if (!key_start)
+ goto nla_failure;
if (nla_put_u16(skb, TCA_PEDIT_KEY_EX_HTYPE, keys_ex->htype) ||
- nla_put_u16(skb, TCA_PEDIT_KEY_EX_CMD, keys_ex->cmd)) {
- nlmsg_trim(skb, keys_start);
- return -EINVAL;
- }
+ nla_put_u16(skb, TCA_PEDIT_KEY_EX_CMD, keys_ex->cmd))
+ goto nla_failure;
nla_nest_end(skb, key_start);
@@ -128,24 +130,30 @@ static int tcf_pedit_key_ex_dump(struct sk_buff *skb,
nla_nest_end(skb, keys_start);
return 0;
+nla_failure:
+ nla_nest_cancel(skb, keys_start);
+ return -EINVAL;
}
static int tcf_pedit_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
- int ovr, int bind, struct netlink_ext_ack *extack)
+ int ovr, int bind, bool rtnl_held,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, pedit_net_id);
struct nlattr *tb[TCA_PEDIT_MAX + 1];
- struct nlattr *pattr;
- struct tc_pedit *parm;
- int ret = 0, err;
- struct tcf_pedit *p;
struct tc_pedit_key *keys = NULL;
struct tcf_pedit_key_ex *keys_ex;
+ struct tc_pedit *parm;
+ struct nlattr *pattr;
+ struct tcf_pedit *p;
+ int ret = 0, err;
int ksize;
- if (nla == NULL)
+ if (!nla) {
+ NL_SET_ERR_MSG_MOD(extack, "Pedit requires attributes to be passed");
return -EINVAL;
+ }
err = nla_parse_nested(tb, TCA_PEDIT_MAX, nla, pedit_policy, NULL);
if (err < 0)
@@ -154,59 +162,68 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
pattr = tb[TCA_PEDIT_PARMS];
if (!pattr)
pattr = tb[TCA_PEDIT_PARMS_EX];
- if (!pattr)
+ if (!pattr) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing required TCA_PEDIT_PARMS or TCA_PEDIT_PARMS_EX pedit attribute");
return -EINVAL;
+ }
parm = nla_data(pattr);
ksize = parm->nkeys * sizeof(struct tc_pedit_key);
- if (nla_len(pattr) < sizeof(*parm) + ksize)
+ if (nla_len(pattr) < sizeof(*parm) + ksize) {
+ NL_SET_ERR_MSG_ATTR(extack, pattr, "Length of TCA_PEDIT_PARMS or TCA_PEDIT_PARMS_EX pedit attribute is invalid");
return -EINVAL;
+ }
keys_ex = tcf_pedit_keys_ex_parse(tb[TCA_PEDIT_KEYS_EX], parm->nkeys);
if (IS_ERR(keys_ex))
return PTR_ERR(keys_ex);
- if (!tcf_idr_check(tn, parm->index, a, bind)) {
- if (!parm->nkeys)
- return -EINVAL;
+ err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ if (!err) {
+ if (!parm->nkeys) {
+ tcf_idr_cleanup(tn, parm->index);
+ NL_SET_ERR_MSG_MOD(extack, "Pedit requires keys to be passed");
+ ret = -EINVAL;
+ goto out_free;
+ }
ret = tcf_idr_create(tn, parm->index, est, a,
&act_pedit_ops, bind, false);
- if (ret)
- return ret;
- p = to_pedit(*a);
- keys = kmalloc(ksize, GFP_KERNEL);
- if (keys == NULL) {
- tcf_idr_release(*a, bind);
- kfree(keys_ex);
- return -ENOMEM;
+ if (ret) {
+ tcf_idr_cleanup(tn, parm->index);
+ goto out_free;
}
ret = ACT_P_CREATED;
- } else {
+ } else if (err > 0) {
if (bind)
- return 0;
- tcf_idr_release(*a, bind);
- if (!ovr)
- return -EEXIST;
- p = to_pedit(*a);
- if (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys) {
- keys = kmalloc(ksize, GFP_KERNEL);
- if (!keys) {
- kfree(keys_ex);
- return -ENOMEM;
- }
+ goto out_free;
+ if (!ovr) {
+ ret = -EEXIST;
+ goto out_release;
}
+ } else {
+ return err;
}
+ p = to_pedit(*a);
spin_lock_bh(&p->tcf_lock);
- p->tcfp_flags = parm->flags;
- p->tcf_action = parm->action;
- if (keys) {
+
+ if (ret == ACT_P_CREATED ||
+ (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys)) {
+ keys = kmalloc(ksize, GFP_ATOMIC);
+ if (!keys) {
+ spin_unlock_bh(&p->tcf_lock);
+ ret = -ENOMEM;
+ goto out_release;
+ }
kfree(p->tcfp_keys);
p->tcfp_keys = keys;
p->tcfp_nkeys = parm->nkeys;
}
memcpy(p->tcfp_keys, parm->keys, ksize);
+ p->tcfp_flags = parm->flags;
+ p->tcf_action = parm->action;
+
kfree(p->tcfp_keys_ex);
p->tcfp_keys_ex = keys_ex;
@@ -214,12 +231,20 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
if (ret == ACT_P_CREATED)
tcf_idr_insert(tn, *a);
return ret;
+
+out_release:
+ tcf_idr_release(*a, bind);
+out_free:
+ kfree(keys_ex);
+ return ret;
+
}
static void tcf_pedit_cleanup(struct tc_action *a)
{
struct tcf_pedit *p = to_pedit(a);
struct tc_pedit_key *keys = p->tcfp_keys;
+
kfree(keys);
kfree(p->tcfp_keys_ex);
}
@@ -263,13 +288,13 @@ static int pedit_skb_hdr_offset(struct sk_buff *skb,
default:
ret = -EINVAL;
break;
- };
+ }
return ret;
}
-static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
- struct tcf_result *res)
+static int tcf_pedit_act(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
{
struct tcf_pedit *p = to_pedit(a);
int i;
@@ -284,11 +309,12 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
if (p->tcfp_nkeys > 0) {
struct tc_pedit_key *tkey = p->tcfp_keys;
struct tcf_pedit_key_ex *tkey_ex = p->tcfp_keys_ex;
- enum pedit_header_type htype = TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK;
+ enum pedit_header_type htype =
+ TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK;
enum pedit_cmd cmd = TCA_PEDIT_KEY_EX_CMD_SET;
for (i = p->tcfp_nkeys; i > 0; i--, tkey++) {
- u32 *ptr, _data;
+ u32 *ptr, hdata;
int offset = tkey->off;
int hoffset;
u32 val;
@@ -303,39 +329,39 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
rc = pedit_skb_hdr_offset(skb, htype, &hoffset);
if (rc) {
- pr_info("tc filter pedit bad header type specified (0x%x)\n",
+ pr_info("tc action pedit bad header type specified (0x%x)\n",
htype);
goto bad;
}
if (tkey->offmask) {
- char *d, _d;
+ u8 *d, _d;
if (!offset_valid(skb, hoffset + tkey->at)) {
- pr_info("tc filter pedit 'at' offset %d out of bounds\n",
+ pr_info("tc action pedit 'at' offset %d out of bounds\n",
hoffset + tkey->at);
goto bad;
}
- d = skb_header_pointer(skb, hoffset + tkey->at, 1,
- &_d);
+ d = skb_header_pointer(skb, hoffset + tkey->at,
+ sizeof(_d), &_d);
if (!d)
goto bad;
offset += (*d & tkey->offmask) >> tkey->shift;
}
if (offset % 4) {
- pr_info("tc filter pedit"
- " offset must be on 32 bit boundaries\n");
+ pr_info("tc action pedit offset must be on 32 bit boundaries\n");
goto bad;
}
if (!offset_valid(skb, hoffset + offset)) {
- pr_info("tc filter pedit offset %d out of bounds\n",
+ pr_info("tc action pedit offset %d out of bounds\n",
hoffset + offset);
goto bad;
}
- ptr = skb_header_pointer(skb, hoffset + offset, 4, &_data);
+ ptr = skb_header_pointer(skb, hoffset + offset,
+ sizeof(hdata), &hdata);
if (!ptr)
goto bad;
/* just do it, baby */
@@ -347,19 +373,20 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
val = (*ptr + tkey->val) & ~tkey->mask;
break;
default:
- pr_info("tc filter pedit bad command (%d)\n",
+ pr_info("tc action pedit bad command (%d)\n",
cmd);
goto bad;
}
*ptr = ((*ptr & tkey->mask) ^ val);
- if (ptr == &_data)
+ if (ptr == &hdata)
skb_store_bits(skb, hoffset + offset, ptr, 4);
}
goto done;
- } else
+ } else {
WARN(1, "pedit BUG: index %d\n", p->tcf_index);
+ }
bad:
p->tcf_qstats.overlimits++;
@@ -385,17 +412,21 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a,
if (unlikely(!opt))
return -ENOBUFS;
+ spin_lock_bh(&p->tcf_lock);
memcpy(opt->keys, p->tcfp_keys,
p->tcfp_nkeys * sizeof(struct tc_pedit_key));
opt->index = p->tcf_index;
opt->nkeys = p->tcfp_nkeys;
opt->flags = p->tcfp_flags;
opt->action = p->tcf_action;
- opt->refcnt = p->tcf_refcnt - ref;
- opt->bindcnt = p->tcf_bindcnt - bind;
+ opt->refcnt = refcount_read(&p->tcf_refcnt) - ref;
+ opt->bindcnt = atomic_read(&p->tcf_bindcnt) - bind;
if (p->tcfp_keys_ex) {
- tcf_pedit_key_ex_dump(skb, p->tcfp_keys_ex, p->tcfp_nkeys);
+ if (tcf_pedit_key_ex_dump(skb,
+ p->tcfp_keys_ex,
+ p->tcfp_nkeys))
+ goto nla_put_failure;
if (nla_put(skb, TCA_PEDIT_PARMS_EX, s, opt))
goto nla_put_failure;
@@ -407,11 +438,13 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a,
tcf_tm_dump(&t, &p->tcf_tm);
if (nla_put_64bit(skb, TCA_PEDIT_TM, sizeof(t), &t, TCA_PEDIT_PAD))
goto nla_put_failure;
+ spin_unlock_bh(&p->tcf_lock);
kfree(opt);
return skb->len;
nla_put_failure:
+ spin_unlock_bh(&p->tcf_lock);
nlmsg_trim(skb, b);
kfree(opt);
return -1;
@@ -427,8 +460,7 @@ static int tcf_pedit_walker(struct net *net, struct sk_buff *skb,
return tcf_generic_walker(tn, skb, cb, type, ops, extack);
}
-static int tcf_pedit_search(struct net *net, struct tc_action **a, u32 index,
- struct netlink_ext_ack *extack)
+static int tcf_pedit_search(struct net *net, struct tc_action **a, u32 index)
{
struct tc_action_net *tn = net_generic(net, pedit_net_id);
@@ -439,7 +471,7 @@ static struct tc_action_ops act_pedit_ops = {
.kind = "pedit",
.type = TCA_ACT_PEDIT,
.owner = THIS_MODULE,
- .act = tcf_pedit,
+ .act = tcf_pedit_act,
.dump = tcf_pedit_dump,
.cleanup = tcf_pedit_cleanup,
.init = tcf_pedit_init,
@@ -483,4 +515,3 @@ static void __exit pedit_cleanup_module(void)
module_init(pedit_init_module);
module_exit(pedit_cleanup_module);
-
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 4e72bc2a0dfb..052855d47354 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -22,8 +22,7 @@
#include <net/act_api.h>
#include <net/netlink.h>
-struct tcf_police {
- struct tc_action common;
+struct tcf_police_params {
int tcfp_result;
u32 tcfp_ewma_rate;
s64 tcfp_burst;
@@ -36,6 +35,12 @@ struct tcf_police {
bool rate_present;
struct psched_ratecfg peak;
bool peak_present;
+ struct rcu_head rcu;
+};
+
+struct tcf_police {
+ struct tc_action common;
+ struct tcf_police_params __rcu *params;
};
#define to_police(pc) ((struct tcf_police *)pc)
@@ -56,7 +61,7 @@ struct tc_police_compat {
static unsigned int police_net_id;
static struct tc_action_ops act_police_ops;
-static int tcf_act_police_walker(struct net *net, struct sk_buff *skb,
+static int tcf_police_walker(struct net *net, struct sk_buff *skb,
struct netlink_callback *cb, int type,
const struct tc_action_ops *ops,
struct netlink_ext_ack *extack)
@@ -73,9 +78,9 @@ static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = {
[TCA_POLICE_RESULT] = { .type = NLA_U32 },
};
-static int tcf_act_police_init(struct net *net, struct nlattr *nla,
+static int tcf_police_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
- int ovr, int bind,
+ int ovr, int bind, bool rtnl_held,
struct netlink_ext_ack *extack)
{
int ret = 0, err;
@@ -84,6 +89,7 @@ static int tcf_act_police_init(struct net *net, struct nlattr *nla,
struct tcf_police *police;
struct qdisc_rate_table *R_tab = NULL, *P_tab = NULL;
struct tc_action_net *tn = net_generic(net, police_net_id);
+ struct tcf_police_params *new;
bool exists = false;
int size;
@@ -101,20 +107,24 @@ static int tcf_act_police_init(struct net *net, struct nlattr *nla,
return -EINVAL;
parm = nla_data(tb[TCA_POLICE_TBF]);
- exists = tcf_idr_check(tn, parm->index, a, bind);
+ err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ if (err < 0)
+ return err;
+ exists = err;
if (exists && bind)
return 0;
if (!exists) {
ret = tcf_idr_create(tn, parm->index, NULL, a,
- &act_police_ops, bind, false);
- if (ret)
+ &act_police_ops, bind, true);
+ if (ret) {
+ tcf_idr_cleanup(tn, parm->index);
return ret;
+ }
ret = ACT_P_CREATED;
- } else {
+ } else if (!ovr) {
tcf_idr_release(*a, bind);
- if (!ovr)
- return -EEXIST;
+ return -EEXIST;
}
police = to_police(*a);
@@ -133,7 +143,8 @@ static int tcf_act_police_init(struct net *net, struct nlattr *nla,
}
if (est) {
- err = gen_replace_estimator(&police->tcf_bstats, NULL,
+ err = gen_replace_estimator(&police->tcf_bstats,
+ police->common.cpu_bstats,
&police->tcf_rate_est,
&police->tcf_lock,
NULL, est);
@@ -146,150 +157,176 @@ static int tcf_act_police_init(struct net *net, struct nlattr *nla,
goto failure;
}
- spin_lock_bh(&police->tcf_lock);
+ new = kzalloc(sizeof(*new), GFP_KERNEL);
+ if (unlikely(!new)) {
+ err = -ENOMEM;
+ goto failure;
+ }
+
/* No failure allowed after this point */
- police->tcfp_mtu = parm->mtu;
- if (police->tcfp_mtu == 0) {
- police->tcfp_mtu = ~0;
+ new->tcfp_mtu = parm->mtu;
+ if (!new->tcfp_mtu) {
+ new->tcfp_mtu = ~0;
if (R_tab)
- police->tcfp_mtu = 255 << R_tab->rate.cell_log;
+ new->tcfp_mtu = 255 << R_tab->rate.cell_log;
}
if (R_tab) {
- police->rate_present = true;
- psched_ratecfg_precompute(&police->rate, &R_tab->rate, 0);
+ new->rate_present = true;
+ psched_ratecfg_precompute(&new->rate, &R_tab->rate, 0);
qdisc_put_rtab(R_tab);
} else {
- police->rate_present = false;
+ new->rate_present = false;
}
if (P_tab) {
- police->peak_present = true;
- psched_ratecfg_precompute(&police->peak, &P_tab->rate, 0);
+ new->peak_present = true;
+ psched_ratecfg_precompute(&new->peak, &P_tab->rate, 0);
qdisc_put_rtab(P_tab);
} else {
- police->peak_present = false;
+ new->peak_present = false;
}
- if (tb[TCA_POLICE_RESULT])
- police->tcfp_result = nla_get_u32(tb[TCA_POLICE_RESULT]);
- police->tcfp_burst = PSCHED_TICKS2NS(parm->burst);
- police->tcfp_toks = police->tcfp_burst;
- if (police->peak_present) {
- police->tcfp_mtu_ptoks = (s64) psched_l2t_ns(&police->peak,
- police->tcfp_mtu);
- police->tcfp_ptoks = police->tcfp_mtu_ptoks;
+ new->tcfp_burst = PSCHED_TICKS2NS(parm->burst);
+ new->tcfp_toks = new->tcfp_burst;
+ if (new->peak_present) {
+ new->tcfp_mtu_ptoks = (s64)psched_l2t_ns(&new->peak,
+ new->tcfp_mtu);
+ new->tcfp_ptoks = new->tcfp_mtu_ptoks;
}
- police->tcf_action = parm->action;
if (tb[TCA_POLICE_AVRATE])
- police->tcfp_ewma_rate = nla_get_u32(tb[TCA_POLICE_AVRATE]);
+ new->tcfp_ewma_rate = nla_get_u32(tb[TCA_POLICE_AVRATE]);
+
+ if (tb[TCA_POLICE_RESULT]) {
+ new->tcfp_result = nla_get_u32(tb[TCA_POLICE_RESULT]);
+ if (TC_ACT_EXT_CMP(new->tcfp_result, TC_ACT_GOTO_CHAIN)) {
+ NL_SET_ERR_MSG(extack,
+ "goto chain not allowed on fallback");
+ err = -EINVAL;
+ goto failure;
+ }
+ }
+ spin_lock_bh(&police->tcf_lock);
+ new->tcfp_t_c = ktime_get_ns();
+ police->tcf_action = parm->action;
+ rcu_swap_protected(police->params,
+ new,
+ lockdep_is_held(&police->tcf_lock));
spin_unlock_bh(&police->tcf_lock);
- if (ret != ACT_P_CREATED)
- return ret;
- police->tcfp_t_c = ktime_get_ns();
- tcf_idr_insert(tn, *a);
+ if (new)
+ kfree_rcu(new, rcu);
+ if (ret == ACT_P_CREATED)
+ tcf_idr_insert(tn, *a);
return ret;
failure:
qdisc_put_rtab(P_tab);
qdisc_put_rtab(R_tab);
- if (ret == ACT_P_CREATED)
- tcf_idr_release(*a, bind);
+ tcf_idr_release(*a, bind);
return err;
}
-static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a,
+static int tcf_police_act(struct sk_buff *skb, const struct tc_action *a,
struct tcf_result *res)
{
struct tcf_police *police = to_police(a);
- s64 now;
- s64 toks;
- s64 ptoks = 0;
-
- spin_lock(&police->tcf_lock);
+ struct tcf_police_params *p;
+ s64 now, toks, ptoks = 0;
+ int ret;
- bstats_update(&police->tcf_bstats, skb);
tcf_lastuse_update(&police->tcf_tm);
+ bstats_cpu_update(this_cpu_ptr(police->common.cpu_bstats), skb);
- if (police->tcfp_ewma_rate) {
+ ret = READ_ONCE(police->tcf_action);
+ p = rcu_dereference_bh(police->params);
+
+ if (p->tcfp_ewma_rate) {
struct gnet_stats_rate_est64 sample;
if (!gen_estimator_read(&police->tcf_rate_est, &sample) ||
- sample.bps >= police->tcfp_ewma_rate) {
- police->tcf_qstats.overlimits++;
- if (police->tcf_action == TC_ACT_SHOT)
- police->tcf_qstats.drops++;
- spin_unlock(&police->tcf_lock);
- return police->tcf_action;
- }
+ sample.bps >= p->tcfp_ewma_rate)
+ goto inc_overlimits;
}
- if (qdisc_pkt_len(skb) <= police->tcfp_mtu) {
- if (!police->rate_present) {
- spin_unlock(&police->tcf_lock);
- return police->tcfp_result;
+ if (qdisc_pkt_len(skb) <= p->tcfp_mtu) {
+ if (!p->rate_present) {
+ ret = p->tcfp_result;
+ goto end;
}
now = ktime_get_ns();
- toks = min_t(s64, now - police->tcfp_t_c,
- police->tcfp_burst);
- if (police->peak_present) {
- ptoks = toks + police->tcfp_ptoks;
- if (ptoks > police->tcfp_mtu_ptoks)
- ptoks = police->tcfp_mtu_ptoks;
- ptoks -= (s64) psched_l2t_ns(&police->peak,
- qdisc_pkt_len(skb));
+ toks = min_t(s64, now - p->tcfp_t_c, p->tcfp_burst);
+ if (p->peak_present) {
+ ptoks = toks + p->tcfp_ptoks;
+ if (ptoks > p->tcfp_mtu_ptoks)
+ ptoks = p->tcfp_mtu_ptoks;
+ ptoks -= (s64)psched_l2t_ns(&p->peak,
+ qdisc_pkt_len(skb));
}
- toks += police->tcfp_toks;
- if (toks > police->tcfp_burst)
- toks = police->tcfp_burst;
- toks -= (s64) psched_l2t_ns(&police->rate, qdisc_pkt_len(skb));
+ toks += p->tcfp_toks;
+ if (toks > p->tcfp_burst)
+ toks = p->tcfp_burst;
+ toks -= (s64)psched_l2t_ns(&p->rate, qdisc_pkt_len(skb));
if ((toks|ptoks) >= 0) {
- police->tcfp_t_c = now;
- police->tcfp_toks = toks;
- police->tcfp_ptoks = ptoks;
- if (police->tcfp_result == TC_ACT_SHOT)
- police->tcf_qstats.drops++;
- spin_unlock(&police->tcf_lock);
- return police->tcfp_result;
+ p->tcfp_t_c = now;
+ p->tcfp_toks = toks;
+ p->tcfp_ptoks = ptoks;
+ ret = p->tcfp_result;
+ goto inc_drops;
}
}
- police->tcf_qstats.overlimits++;
- if (police->tcf_action == TC_ACT_SHOT)
- police->tcf_qstats.drops++;
- spin_unlock(&police->tcf_lock);
- return police->tcf_action;
+inc_overlimits:
+ qstats_overlimit_inc(this_cpu_ptr(police->common.cpu_qstats));
+inc_drops:
+ if (ret == TC_ACT_SHOT)
+ qstats_drop_inc(this_cpu_ptr(police->common.cpu_qstats));
+end:
+ return ret;
}
-static int tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a,
+static void tcf_police_cleanup(struct tc_action *a)
+{
+ struct tcf_police *police = to_police(a);
+ struct tcf_police_params *p;
+
+ p = rcu_dereference_protected(police->params, 1);
+ if (p)
+ kfree_rcu(p, rcu);
+}
+
+static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a,
int bind, int ref)
{
unsigned char *b = skb_tail_pointer(skb);
struct tcf_police *police = to_police(a);
+ struct tcf_police_params *p;
struct tc_police opt = {
.index = police->tcf_index,
- .action = police->tcf_action,
- .mtu = police->tcfp_mtu,
- .burst = PSCHED_NS2TICKS(police->tcfp_burst),
- .refcnt = police->tcf_refcnt - ref,
- .bindcnt = police->tcf_bindcnt - bind,
+ .refcnt = refcount_read(&police->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&police->tcf_bindcnt) - bind,
};
struct tcf_t t;
- if (police->rate_present)
- psched_ratecfg_getrate(&opt.rate, &police->rate);
- if (police->peak_present)
- psched_ratecfg_getrate(&opt.peakrate, &police->peak);
+ spin_lock_bh(&police->tcf_lock);
+ opt.action = police->tcf_action;
+ p = rcu_dereference_protected(police->params,
+ lockdep_is_held(&police->tcf_lock));
+ opt.mtu = p->tcfp_mtu;
+ opt.burst = PSCHED_NS2TICKS(p->tcfp_burst);
+ if (p->rate_present)
+ psched_ratecfg_getrate(&opt.rate, &p->rate);
+ if (p->peak_present)
+ psched_ratecfg_getrate(&opt.peakrate, &p->peak);
if (nla_put(skb, TCA_POLICE_TBF, sizeof(opt), &opt))
goto nla_put_failure;
- if (police->tcfp_result &&
- nla_put_u32(skb, TCA_POLICE_RESULT, police->tcfp_result))
+ if (p->tcfp_result &&
+ nla_put_u32(skb, TCA_POLICE_RESULT, p->tcfp_result))
goto nla_put_failure;
- if (police->tcfp_ewma_rate &&
- nla_put_u32(skb, TCA_POLICE_AVRATE, police->tcfp_ewma_rate))
+ if (p->tcfp_ewma_rate &&
+ nla_put_u32(skb, TCA_POLICE_AVRATE, p->tcfp_ewma_rate))
goto nla_put_failure;
t.install = jiffies_to_clock_t(jiffies - police->tcf_tm.install);
@@ -298,16 +335,17 @@ static int tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a,
t.expires = jiffies_to_clock_t(police->tcf_tm.expires);
if (nla_put_64bit(skb, TCA_POLICE_TM, sizeof(t), &t, TCA_POLICE_PAD))
goto nla_put_failure;
+ spin_unlock_bh(&police->tcf_lock);
return skb->len;
nla_put_failure:
+ spin_unlock_bh(&police->tcf_lock);
nlmsg_trim(skb, b);
return -1;
}
-static int tcf_police_search(struct net *net, struct tc_action **a, u32 index,
- struct netlink_ext_ack *extack)
+static int tcf_police_search(struct net *net, struct tc_action **a, u32 index)
{
struct tc_action_net *tn = net_generic(net, police_net_id);
@@ -322,11 +360,12 @@ static struct tc_action_ops act_police_ops = {
.kind = "police",
.type = TCA_ID_POLICE,
.owner = THIS_MODULE,
- .act = tcf_act_police,
- .dump = tcf_act_police_dump,
- .init = tcf_act_police_init,
- .walk = tcf_act_police_walker,
+ .act = tcf_police_act,
+ .dump = tcf_police_dump,
+ .init = tcf_police_init,
+ .walk = tcf_police_walker,
.lookup = tcf_police_search,
+ .cleanup = tcf_police_cleanup,
.size = sizeof(struct tcf_police),
};
diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c
index 5db358497c9e..1a0c682fd734 100644
--- a/net/sched/act_sample.c
+++ b/net/sched/act_sample.c
@@ -37,15 +37,17 @@ static const struct nla_policy sample_policy[TCA_SAMPLE_MAX + 1] = {
static int tcf_sample_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a, int ovr,
- int bind, struct netlink_ext_ack *extack)
+ int bind, bool rtnl_held,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, sample_net_id);
struct nlattr *tb[TCA_SAMPLE_MAX + 1];
struct psample_group *psample_group;
struct tc_sample *parm;
+ u32 psample_group_num;
struct tcf_sample *s;
bool exists = false;
- int ret;
+ int ret, err;
if (!nla)
return -EINVAL;
@@ -58,38 +60,46 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla,
parm = nla_data(tb[TCA_SAMPLE_PARMS]);
- exists = tcf_idr_check(tn, parm->index, a, bind);
+ err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ if (err < 0)
+ return err;
+ exists = err;
if (exists && bind)
return 0;
if (!exists) {
ret = tcf_idr_create(tn, parm->index, est, a,
- &act_sample_ops, bind, false);
- if (ret)
+ &act_sample_ops, bind, true);
+ if (ret) {
+ tcf_idr_cleanup(tn, parm->index);
return ret;
+ }
ret = ACT_P_CREATED;
- } else {
+ } else if (!ovr) {
tcf_idr_release(*a, bind);
- if (!ovr)
- return -EEXIST;
+ return -EEXIST;
}
- s = to_sample(*a);
- s->tcf_action = parm->action;
- s->rate = nla_get_u32(tb[TCA_SAMPLE_RATE]);
- s->psample_group_num = nla_get_u32(tb[TCA_SAMPLE_PSAMPLE_GROUP]);
- psample_group = psample_group_get(net, s->psample_group_num);
+ psample_group_num = nla_get_u32(tb[TCA_SAMPLE_PSAMPLE_GROUP]);
+ psample_group = psample_group_get(net, psample_group_num);
if (!psample_group) {
- if (ret == ACT_P_CREATED)
- tcf_idr_release(*a, bind);
+ tcf_idr_release(*a, bind);
return -ENOMEM;
}
+
+ s = to_sample(*a);
+
+ spin_lock_bh(&s->tcf_lock);
+ s->tcf_action = parm->action;
+ s->rate = nla_get_u32(tb[TCA_SAMPLE_RATE]);
+ s->psample_group_num = psample_group_num;
RCU_INIT_POINTER(s->psample_group, psample_group);
if (tb[TCA_SAMPLE_TRUNC_SIZE]) {
s->truncate = true;
s->trunc_size = nla_get_u32(tb[TCA_SAMPLE_TRUNC_SIZE]);
}
+ spin_unlock_bh(&s->tcf_lock);
if (ret == ACT_P_CREATED)
tcf_idr_insert(tn, *a);
@@ -101,7 +111,8 @@ static void tcf_sample_cleanup(struct tc_action *a)
struct tcf_sample *s = to_sample(a);
struct psample_group *psample_group;
- psample_group = rtnl_dereference(s->psample_group);
+ /* last reference to action, no need to lock */
+ psample_group = rcu_dereference_protected(s->psample_group, 1);
RCU_INIT_POINTER(s->psample_group, NULL);
if (psample_group)
psample_group_put(psample_group);
@@ -136,8 +147,7 @@ static int tcf_sample_act(struct sk_buff *skb, const struct tc_action *a,
bstats_cpu_update(this_cpu_ptr(s->common.cpu_bstats), skb);
retval = READ_ONCE(s->tcf_action);
- rcu_read_lock();
- psample_group = rcu_dereference(s->psample_group);
+ psample_group = rcu_dereference_bh(s->psample_group);
/* randomly sample packets according to rate */
if (psample_group && (prandom_u32() % s->rate == 0)) {
@@ -161,7 +171,6 @@ static int tcf_sample_act(struct sk_buff *skb, const struct tc_action *a,
skb_pull(skb, skb->mac_len);
}
- rcu_read_unlock();
return retval;
}
@@ -172,12 +181,13 @@ static int tcf_sample_dump(struct sk_buff *skb, struct tc_action *a,
struct tcf_sample *s = to_sample(a);
struct tc_sample opt = {
.index = s->tcf_index,
- .action = s->tcf_action,
- .refcnt = s->tcf_refcnt - ref,
- .bindcnt = s->tcf_bindcnt - bind,
+ .refcnt = refcount_read(&s->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&s->tcf_bindcnt) - bind,
};
struct tcf_t t;
+ spin_lock_bh(&s->tcf_lock);
+ opt.action = s->tcf_action;
if (nla_put(skb, TCA_SAMPLE_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
@@ -194,9 +204,12 @@ static int tcf_sample_dump(struct sk_buff *skb, struct tc_action *a,
if (nla_put_u32(skb, TCA_SAMPLE_PSAMPLE_GROUP, s->psample_group_num))
goto nla_put_failure;
+ spin_unlock_bh(&s->tcf_lock);
+
return skb->len;
nla_put_failure:
+ spin_unlock_bh(&s->tcf_lock);
nlmsg_trim(skb, b);
return -1;
}
@@ -211,8 +224,7 @@ static int tcf_sample_walker(struct net *net, struct sk_buff *skb,
return tcf_generic_walker(tn, skb, cb, type, ops, extack);
}
-static int tcf_sample_search(struct net *net, struct tc_action **a, u32 index,
- struct netlink_ext_ack *extack)
+static int tcf_sample_search(struct net *net, struct tc_action **a, u32 index)
{
struct tc_action_net *tn = net_generic(net, sample_net_id);
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index 98c4afe7c15b..902957beceb3 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -28,8 +28,8 @@ static unsigned int simp_net_id;
static struct tc_action_ops act_simp_ops;
#define SIMP_MAX_DATA 32
-static int tcf_simp(struct sk_buff *skb, const struct tc_action *a,
- struct tcf_result *res)
+static int tcf_simp_act(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
{
struct tcf_defact *d = to_defact(a);
@@ -79,7 +79,8 @@ static const struct nla_policy simple_policy[TCA_DEF_MAX + 1] = {
static int tcf_simp_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
- int ovr, int bind, struct netlink_ext_ack *extack)
+ int ovr, int bind, bool rtnl_held,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, simp_net_id);
struct nlattr *tb[TCA_DEF_MAX + 1];
@@ -99,21 +100,28 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla,
return -EINVAL;
parm = nla_data(tb[TCA_DEF_PARMS]);
- exists = tcf_idr_check(tn, parm->index, a, bind);
+ err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ if (err < 0)
+ return err;
+ exists = err;
if (exists && bind)
return 0;
if (tb[TCA_DEF_DATA] == NULL) {
if (exists)
tcf_idr_release(*a, bind);
+ else
+ tcf_idr_cleanup(tn, parm->index);
return -EINVAL;
}
if (!exists) {
ret = tcf_idr_create(tn, parm->index, est, a,
&act_simp_ops, bind, false);
- if (ret)
+ if (ret) {
+ tcf_idr_cleanup(tn, parm->index);
return ret;
+ }
d = to_defact(*a);
ret = alloc_defdata(d, tb[TCA_DEF_DATA]);
@@ -126,9 +134,10 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla,
} else {
d = to_defact(*a);
- tcf_idr_release(*a, bind);
- if (!ovr)
+ if (!ovr) {
+ tcf_idr_release(*a, bind);
return -EEXIST;
+ }
reset_policy(d, tb[TCA_DEF_DATA], parm);
}
@@ -145,12 +154,13 @@ static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a,
struct tcf_defact *d = to_defact(a);
struct tc_defact opt = {
.index = d->tcf_index,
- .refcnt = d->tcf_refcnt - ref,
- .bindcnt = d->tcf_bindcnt - bind,
- .action = d->tcf_action,
+ .refcnt = refcount_read(&d->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&d->tcf_bindcnt) - bind,
};
struct tcf_t t;
+ spin_lock_bh(&d->tcf_lock);
+ opt.action = d->tcf_action;
if (nla_put(skb, TCA_DEF_PARMS, sizeof(opt), &opt) ||
nla_put_string(skb, TCA_DEF_DATA, d->tcfd_defdata))
goto nla_put_failure;
@@ -158,9 +168,12 @@ static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a,
tcf_tm_dump(&t, &d->tcf_tm);
if (nla_put_64bit(skb, TCA_DEF_TM, sizeof(t), &t, TCA_DEF_PAD))
goto nla_put_failure;
+ spin_unlock_bh(&d->tcf_lock);
+
return skb->len;
nla_put_failure:
+ spin_unlock_bh(&d->tcf_lock);
nlmsg_trim(skb, b);
return -1;
}
@@ -175,8 +188,7 @@ static int tcf_simp_walker(struct net *net, struct sk_buff *skb,
return tcf_generic_walker(tn, skb, cb, type, ops, extack);
}
-static int tcf_simp_search(struct net *net, struct tc_action **a, u32 index,
- struct netlink_ext_ack *extack)
+static int tcf_simp_search(struct net *net, struct tc_action **a, u32 index)
{
struct tc_action_net *tn = net_generic(net, simp_net_id);
@@ -187,7 +199,7 @@ static struct tc_action_ops act_simp_ops = {
.kind = "simple",
.type = TCA_ACT_SIMP,
.owner = THIS_MODULE,
- .act = tcf_simp,
+ .act = tcf_simp_act,
.dump = tcf_simp_dump,
.cleanup = tcf_simp_release,
.init = tcf_simp_init,
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index 6138d1d71900..64dba3708fce 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -23,6 +23,9 @@
#include <linux/rtnetlink.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/dsfield.h>
#include <linux/tc_act/tc_skbedit.h>
#include <net/tc_act/tc_skbedit.h>
@@ -30,29 +33,54 @@
static unsigned int skbedit_net_id;
static struct tc_action_ops act_skbedit_ops;
-static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a,
- struct tcf_result *res)
+static int tcf_skbedit_act(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
{
struct tcf_skbedit *d = to_skbedit(a);
+ struct tcf_skbedit_params *params;
+ int action;
- spin_lock(&d->tcf_lock);
tcf_lastuse_update(&d->tcf_tm);
- bstats_update(&d->tcf_bstats, skb);
-
- if (d->flags & SKBEDIT_F_PRIORITY)
- skb->priority = d->priority;
- if (d->flags & SKBEDIT_F_QUEUE_MAPPING &&
- skb->dev->real_num_tx_queues > d->queue_mapping)
- skb_set_queue_mapping(skb, d->queue_mapping);
- if (d->flags & SKBEDIT_F_MARK) {
- skb->mark &= ~d->mask;
- skb->mark |= d->mark & d->mask;
+ bstats_cpu_update(this_cpu_ptr(d->common.cpu_bstats), skb);
+
+ params = rcu_dereference_bh(d->params);
+ action = READ_ONCE(d->tcf_action);
+
+ if (params->flags & SKBEDIT_F_PRIORITY)
+ skb->priority = params->priority;
+ if (params->flags & SKBEDIT_F_INHERITDSFIELD) {
+ int wlen = skb_network_offset(skb);
+
+ switch (tc_skb_protocol(skb)) {
+ case htons(ETH_P_IP):
+ wlen += sizeof(struct iphdr);
+ if (!pskb_may_pull(skb, wlen))
+ goto err;
+ skb->priority = ipv4_get_dsfield(ip_hdr(skb)) >> 2;
+ break;
+
+ case htons(ETH_P_IPV6):
+ wlen += sizeof(struct ipv6hdr);
+ if (!pskb_may_pull(skb, wlen))
+ goto err;
+ skb->priority = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2;
+ break;
+ }
}
- if (d->flags & SKBEDIT_F_PTYPE)
- skb->pkt_type = d->ptype;
+ if (params->flags & SKBEDIT_F_QUEUE_MAPPING &&
+ skb->dev->real_num_tx_queues > params->queue_mapping)
+ skb_set_queue_mapping(skb, params->queue_mapping);
+ if (params->flags & SKBEDIT_F_MARK) {
+ skb->mark &= ~params->mask;
+ skb->mark |= params->mark & params->mask;
+ }
+ if (params->flags & SKBEDIT_F_PTYPE)
+ skb->pkt_type = params->ptype;
+ return action;
- spin_unlock(&d->tcf_lock);
- return d->tcf_action;
+err:
+ qstats_drop_inc(this_cpu_ptr(d->common.cpu_qstats));
+ return TC_ACT_SHOT;
}
static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = {
@@ -62,13 +90,16 @@ static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = {
[TCA_SKBEDIT_MARK] = { .len = sizeof(u32) },
[TCA_SKBEDIT_PTYPE] = { .len = sizeof(u16) },
[TCA_SKBEDIT_MASK] = { .len = sizeof(u32) },
+ [TCA_SKBEDIT_FLAGS] = { .len = sizeof(u64) },
};
static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
- int ovr, int bind, struct netlink_ext_ack *extack)
+ int ovr, int bind, bool rtnl_held,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, skbedit_net_id);
+ struct tcf_skbedit_params *params_new;
struct nlattr *tb[TCA_SKBEDIT_MAX + 1];
struct tc_skbedit *parm;
struct tcf_skbedit *d;
@@ -114,52 +145,76 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
mask = nla_data(tb[TCA_SKBEDIT_MASK]);
}
+ if (tb[TCA_SKBEDIT_FLAGS] != NULL) {
+ u64 *pure_flags = nla_data(tb[TCA_SKBEDIT_FLAGS]);
+
+ if (*pure_flags & SKBEDIT_F_INHERITDSFIELD)
+ flags |= SKBEDIT_F_INHERITDSFIELD;
+ }
+
parm = nla_data(tb[TCA_SKBEDIT_PARMS]);
- exists = tcf_idr_check(tn, parm->index, a, bind);
+ err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ if (err < 0)
+ return err;
+ exists = err;
if (exists && bind)
return 0;
if (!flags) {
if (exists)
tcf_idr_release(*a, bind);
+ else
+ tcf_idr_cleanup(tn, parm->index);
return -EINVAL;
}
if (!exists) {
ret = tcf_idr_create(tn, parm->index, est, a,
- &act_skbedit_ops, bind, false);
- if (ret)
+ &act_skbedit_ops, bind, true);
+ if (ret) {
+ tcf_idr_cleanup(tn, parm->index);
return ret;
+ }
d = to_skbedit(*a);
ret = ACT_P_CREATED;
} else {
d = to_skbedit(*a);
- tcf_idr_release(*a, bind);
- if (!ovr)
+ if (!ovr) {
+ tcf_idr_release(*a, bind);
return -EEXIST;
+ }
}
- spin_lock_bh(&d->tcf_lock);
+ params_new = kzalloc(sizeof(*params_new), GFP_KERNEL);
+ if (unlikely(!params_new)) {
+ if (ret == ACT_P_CREATED)
+ tcf_idr_release(*a, bind);
+ return -ENOMEM;
+ }
- d->flags = flags;
+ params_new->flags = flags;
if (flags & SKBEDIT_F_PRIORITY)
- d->priority = *priority;
+ params_new->priority = *priority;
if (flags & SKBEDIT_F_QUEUE_MAPPING)
- d->queue_mapping = *queue_mapping;
+ params_new->queue_mapping = *queue_mapping;
if (flags & SKBEDIT_F_MARK)
- d->mark = *mark;
+ params_new->mark = *mark;
if (flags & SKBEDIT_F_PTYPE)
- d->ptype = *ptype;
+ params_new->ptype = *ptype;
/* default behaviour is to use all the bits */
- d->mask = 0xffffffff;
+ params_new->mask = 0xffffffff;
if (flags & SKBEDIT_F_MASK)
- d->mask = *mask;
+ params_new->mask = *mask;
+ spin_lock_bh(&d->tcf_lock);
d->tcf_action = parm->action;
-
+ rcu_swap_protected(d->params, params_new,
+ lockdep_is_held(&d->tcf_lock));
spin_unlock_bh(&d->tcf_lock);
+ if (params_new)
+ kfree_rcu(params_new, rcu);
if (ret == ACT_P_CREATED)
tcf_idr_insert(tn, *a);
@@ -171,42 +226,66 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
{
unsigned char *b = skb_tail_pointer(skb);
struct tcf_skbedit *d = to_skbedit(a);
+ struct tcf_skbedit_params *params;
struct tc_skbedit opt = {
.index = d->tcf_index,
- .refcnt = d->tcf_refcnt - ref,
- .bindcnt = d->tcf_bindcnt - bind,
- .action = d->tcf_action,
+ .refcnt = refcount_read(&d->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&d->tcf_bindcnt) - bind,
};
+ u64 pure_flags = 0;
struct tcf_t t;
+ spin_lock_bh(&d->tcf_lock);
+ params = rcu_dereference_protected(d->params,
+ lockdep_is_held(&d->tcf_lock));
+ opt.action = d->tcf_action;
+
if (nla_put(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
- if ((d->flags & SKBEDIT_F_PRIORITY) &&
- nla_put_u32(skb, TCA_SKBEDIT_PRIORITY, d->priority))
+ if ((params->flags & SKBEDIT_F_PRIORITY) &&
+ nla_put_u32(skb, TCA_SKBEDIT_PRIORITY, params->priority))
+ goto nla_put_failure;
+ if ((params->flags & SKBEDIT_F_QUEUE_MAPPING) &&
+ nla_put_u16(skb, TCA_SKBEDIT_QUEUE_MAPPING, params->queue_mapping))
goto nla_put_failure;
- if ((d->flags & SKBEDIT_F_QUEUE_MAPPING) &&
- nla_put_u16(skb, TCA_SKBEDIT_QUEUE_MAPPING, d->queue_mapping))
+ if ((params->flags & SKBEDIT_F_MARK) &&
+ nla_put_u32(skb, TCA_SKBEDIT_MARK, params->mark))
goto nla_put_failure;
- if ((d->flags & SKBEDIT_F_MARK) &&
- nla_put_u32(skb, TCA_SKBEDIT_MARK, d->mark))
+ if ((params->flags & SKBEDIT_F_PTYPE) &&
+ nla_put_u16(skb, TCA_SKBEDIT_PTYPE, params->ptype))
goto nla_put_failure;
- if ((d->flags & SKBEDIT_F_PTYPE) &&
- nla_put_u16(skb, TCA_SKBEDIT_PTYPE, d->ptype))
+ if ((params->flags & SKBEDIT_F_MASK) &&
+ nla_put_u32(skb, TCA_SKBEDIT_MASK, params->mask))
goto nla_put_failure;
- if ((d->flags & SKBEDIT_F_MASK) &&
- nla_put_u32(skb, TCA_SKBEDIT_MASK, d->mask))
+ if (params->flags & SKBEDIT_F_INHERITDSFIELD)
+ pure_flags |= SKBEDIT_F_INHERITDSFIELD;
+ if (pure_flags != 0 &&
+ nla_put(skb, TCA_SKBEDIT_FLAGS, sizeof(pure_flags), &pure_flags))
goto nla_put_failure;
tcf_tm_dump(&t, &d->tcf_tm);
if (nla_put_64bit(skb, TCA_SKBEDIT_TM, sizeof(t), &t, TCA_SKBEDIT_PAD))
goto nla_put_failure;
+ spin_unlock_bh(&d->tcf_lock);
+
return skb->len;
nla_put_failure:
+ spin_unlock_bh(&d->tcf_lock);
nlmsg_trim(skb, b);
return -1;
}
+static void tcf_skbedit_cleanup(struct tc_action *a)
+{
+ struct tcf_skbedit *d = to_skbedit(a);
+ struct tcf_skbedit_params *params;
+
+ params = rcu_dereference_protected(d->params, 1);
+ if (params)
+ kfree_rcu(params, rcu);
+}
+
static int tcf_skbedit_walker(struct net *net, struct sk_buff *skb,
struct netlink_callback *cb, int type,
const struct tc_action_ops *ops,
@@ -217,8 +296,7 @@ static int tcf_skbedit_walker(struct net *net, struct sk_buff *skb,
return tcf_generic_walker(tn, skb, cb, type, ops, extack);
}
-static int tcf_skbedit_search(struct net *net, struct tc_action **a, u32 index,
- struct netlink_ext_ack *extack)
+static int tcf_skbedit_search(struct net *net, struct tc_action **a, u32 index)
{
struct tc_action_net *tn = net_generic(net, skbedit_net_id);
@@ -229,9 +307,10 @@ static struct tc_action_ops act_skbedit_ops = {
.kind = "skbedit",
.type = TCA_ACT_SKBEDIT,
.owner = THIS_MODULE,
- .act = tcf_skbedit,
+ .act = tcf_skbedit_act,
.dump = tcf_skbedit_dump,
.init = tcf_skbedit_init,
+ .cleanup = tcf_skbedit_cleanup,
.walk = tcf_skbedit_walker,
.lookup = tcf_skbedit_search,
.size = sizeof(struct tcf_skbedit),
diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c
index ad050d7d4b46..59710a183bd3 100644
--- a/net/sched/act_skbmod.c
+++ b/net/sched/act_skbmod.c
@@ -24,7 +24,7 @@ static unsigned int skbmod_net_id;
static struct tc_action_ops act_skbmod_ops;
#define MAX_EDIT_LEN ETH_HLEN
-static int tcf_skbmod_run(struct sk_buff *skb, const struct tc_action *a,
+static int tcf_skbmod_act(struct sk_buff *skb, const struct tc_action *a,
struct tcf_result *res)
{
struct tcf_skbmod *d = to_skbmod(a);
@@ -41,20 +41,14 @@ static int tcf_skbmod_run(struct sk_buff *skb, const struct tc_action *a,
* then MAX_EDIT_LEN needs to change appropriately
*/
err = skb_ensure_writable(skb, MAX_EDIT_LEN);
- if (unlikely(err)) { /* best policy is to drop on the floor */
- qstats_overlimit_inc(this_cpu_ptr(d->common.cpu_qstats));
- return TC_ACT_SHOT;
- }
+ if (unlikely(err)) /* best policy is to drop on the floor */
+ goto drop;
- rcu_read_lock();
action = READ_ONCE(d->tcf_action);
- if (unlikely(action == TC_ACT_SHOT)) {
- qstats_overlimit_inc(this_cpu_ptr(d->common.cpu_qstats));
- rcu_read_unlock();
- return action;
- }
+ if (unlikely(action == TC_ACT_SHOT))
+ goto drop;
- p = rcu_dereference(d->skbmod_p);
+ p = rcu_dereference_bh(d->skbmod_p);
flags = p->flags;
if (flags & SKBMOD_F_DMAC)
ether_addr_copy(eth_hdr(skb)->h_dest, p->eth_dst);
@@ -62,7 +56,6 @@ static int tcf_skbmod_run(struct sk_buff *skb, const struct tc_action *a,
ether_addr_copy(eth_hdr(skb)->h_source, p->eth_src);
if (flags & SKBMOD_F_ETYPE)
eth_hdr(skb)->h_proto = p->eth_type;
- rcu_read_unlock();
if (flags & SKBMOD_F_SWAPMAC) {
u16 tmpaddr[ETH_ALEN / 2]; /* ether_addr_copy() requirement */
@@ -73,6 +66,10 @@ static int tcf_skbmod_run(struct sk_buff *skb, const struct tc_action *a,
}
return action;
+
+drop:
+ qstats_overlimit_inc(this_cpu_ptr(d->common.cpu_qstats));
+ return TC_ACT_SHOT;
}
static const struct nla_policy skbmod_policy[TCA_SKBMOD_MAX + 1] = {
@@ -84,7 +81,8 @@ static const struct nla_policy skbmod_policy[TCA_SKBMOD_MAX + 1] = {
static int tcf_skbmod_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
- int ovr, int bind, struct netlink_ext_ack *extack)
+ int ovr, int bind, bool rtnl_held,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, skbmod_net_id);
struct nlattr *tb[TCA_SKBMOD_MAX + 1];
@@ -127,46 +125,50 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla,
if (parm->flags & SKBMOD_F_SWAPMAC)
lflags = SKBMOD_F_SWAPMAC;
- exists = tcf_idr_check(tn, parm->index, a, bind);
+ err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ if (err < 0)
+ return err;
+ exists = err;
if (exists && bind)
return 0;
if (!lflags) {
if (exists)
tcf_idr_release(*a, bind);
+ else
+ tcf_idr_cleanup(tn, parm->index);
return -EINVAL;
}
if (!exists) {
ret = tcf_idr_create(tn, parm->index, est, a,
&act_skbmod_ops, bind, true);
- if (ret)
+ if (ret) {
+ tcf_idr_cleanup(tn, parm->index);
return ret;
+ }
ret = ACT_P_CREATED;
- } else {
+ } else if (!ovr) {
tcf_idr_release(*a, bind);
- if (!ovr)
- return -EEXIST;
+ return -EEXIST;
}
d = to_skbmod(*a);
- ASSERT_RTNL();
p = kzalloc(sizeof(struct tcf_skbmod_params), GFP_KERNEL);
if (unlikely(!p)) {
- if (ret == ACT_P_CREATED)
- tcf_idr_release(*a, bind);
+ tcf_idr_release(*a, bind);
return -ENOMEM;
}
p->flags = lflags;
d->tcf_action = parm->action;
- p_old = rtnl_dereference(d->skbmod_p);
-
if (ovr)
spin_lock_bh(&d->tcf_lock);
+ /* Protected by tcf_lock if overwriting existing action. */
+ p_old = rcu_dereference_protected(d->skbmod_p, 1);
if (lflags & SKBMOD_F_DMAC)
ether_addr_copy(p->eth_dst, daddr);
@@ -202,15 +204,18 @@ static int tcf_skbmod_dump(struct sk_buff *skb, struct tc_action *a,
{
struct tcf_skbmod *d = to_skbmod(a);
unsigned char *b = skb_tail_pointer(skb);
- struct tcf_skbmod_params *p = rtnl_dereference(d->skbmod_p);
+ struct tcf_skbmod_params *p;
struct tc_skbmod opt = {
.index = d->tcf_index,
- .refcnt = d->tcf_refcnt - ref,
- .bindcnt = d->tcf_bindcnt - bind,
- .action = d->tcf_action,
+ .refcnt = refcount_read(&d->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&d->tcf_bindcnt) - bind,
};
struct tcf_t t;
+ spin_lock_bh(&d->tcf_lock);
+ opt.action = d->tcf_action;
+ p = rcu_dereference_protected(d->skbmod_p,
+ lockdep_is_held(&d->tcf_lock));
opt.flags = p->flags;
if (nla_put(skb, TCA_SKBMOD_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
@@ -228,8 +233,10 @@ static int tcf_skbmod_dump(struct sk_buff *skb, struct tc_action *a,
if (nla_put_64bit(skb, TCA_SKBMOD_TM, sizeof(t), &t, TCA_SKBMOD_PAD))
goto nla_put_failure;
+ spin_unlock_bh(&d->tcf_lock);
return skb->len;
nla_put_failure:
+ spin_unlock_bh(&d->tcf_lock);
nlmsg_trim(skb, b);
return -1;
}
@@ -244,8 +251,7 @@ static int tcf_skbmod_walker(struct net *net, struct sk_buff *skb,
return tcf_generic_walker(tn, skb, cb, type, ops, extack);
}
-static int tcf_skbmod_search(struct net *net, struct tc_action **a, u32 index,
- struct netlink_ext_ack *extack)
+static int tcf_skbmod_search(struct net *net, struct tc_action **a, u32 index)
{
struct tc_action_net *tn = net_generic(net, skbmod_net_id);
@@ -256,7 +262,7 @@ static struct tc_action_ops act_skbmod_ops = {
.kind = "skbmod",
.type = TCA_ACT_SKBMOD,
.owner = THIS_MODULE,
- .act = tcf_skbmod_run,
+ .act = tcf_skbmod_act,
.dump = tcf_skbmod_dump,
.init = tcf_skbmod_init,
.cleanup = tcf_skbmod_cleanup,
diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
index 9bc6c2ae98a5..4cca8f274662 100644
--- a/net/sched/act_tunnel_key.c
+++ b/net/sched/act_tunnel_key.c
@@ -13,6 +13,7 @@
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
+#include <net/geneve.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/dst.h>
@@ -30,9 +31,7 @@ static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a,
struct tcf_tunnel_key_params *params;
int action;
- rcu_read_lock();
-
- params = rcu_dereference(t->params);
+ params = rcu_dereference_bh(t->params);
tcf_lastuse_update(&t->tcf_tm);
bstats_cpu_update(this_cpu_ptr(t->common.cpu_bstats), skb);
@@ -52,11 +51,138 @@ static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a,
break;
}
- rcu_read_unlock();
-
return action;
}
+static const struct nla_policy
+enc_opts_policy[TCA_TUNNEL_KEY_ENC_OPTS_MAX + 1] = {
+ [TCA_TUNNEL_KEY_ENC_OPTS_GENEVE] = { .type = NLA_NESTED },
+};
+
+static const struct nla_policy
+geneve_opt_policy[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX + 1] = {
+ [TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS] = { .type = NLA_U16 },
+ [TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE] = { .type = NLA_U8 },
+ [TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA] = { .type = NLA_BINARY,
+ .len = 128 },
+};
+
+static int
+tunnel_key_copy_geneve_opt(const struct nlattr *nla, void *dst, int dst_len,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX + 1];
+ int err, data_len, opt_len;
+ u8 *data;
+
+ err = nla_parse_nested(tb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX,
+ nla, geneve_opt_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (!tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS] ||
+ !tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE] ||
+ !tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA]) {
+ NL_SET_ERR_MSG(extack, "Missing tunnel key geneve option class, type or data");
+ return -EINVAL;
+ }
+
+ data = nla_data(tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA]);
+ data_len = nla_len(tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA]);
+ if (data_len < 4) {
+ NL_SET_ERR_MSG(extack, "Tunnel key geneve option data is less than 4 bytes long");
+ return -ERANGE;
+ }
+ if (data_len % 4) {
+ NL_SET_ERR_MSG(extack, "Tunnel key geneve option data is not a multiple of 4 bytes long");
+ return -ERANGE;
+ }
+
+ opt_len = sizeof(struct geneve_opt) + data_len;
+ if (dst) {
+ struct geneve_opt *opt = dst;
+
+ WARN_ON(dst_len < opt_len);
+
+ opt->opt_class =
+ nla_get_be16(tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS]);
+ opt->type = nla_get_u8(tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE]);
+ opt->length = data_len / 4; /* length is in units of 4 bytes */
+ opt->r1 = 0;
+ opt->r2 = 0;
+ opt->r3 = 0;
+
+ memcpy(opt + 1, data, data_len);
+ }
+
+ return opt_len;
+}
+
+static int tunnel_key_copy_opts(const struct nlattr *nla, u8 *dst,
+ int dst_len, struct netlink_ext_ack *extack)
+{
+ int err, rem, opt_len, len = nla_len(nla), opts_len = 0;
+ const struct nlattr *attr, *head = nla_data(nla);
+
+ err = nla_validate(head, len, TCA_TUNNEL_KEY_ENC_OPTS_MAX,
+ enc_opts_policy, extack);
+ if (err)
+ return err;
+
+ nla_for_each_attr(attr, head, len, rem) {
+ switch (nla_type(attr)) {
+ case TCA_TUNNEL_KEY_ENC_OPTS_GENEVE:
+ opt_len = tunnel_key_copy_geneve_opt(attr, dst,
+ dst_len, extack);
+ if (opt_len < 0)
+ return opt_len;
+ opts_len += opt_len;
+ if (dst) {
+ dst_len -= opt_len;
+ dst += opt_len;
+ }
+ break;
+ }
+ }
+
+ if (!opts_len) {
+ NL_SET_ERR_MSG(extack, "Empty list of tunnel options");
+ return -EINVAL;
+ }
+
+ if (rem > 0) {
+ NL_SET_ERR_MSG(extack, "Trailing data after parsing tunnel key options attributes");
+ return -EINVAL;
+ }
+
+ return opts_len;
+}
+
+static int tunnel_key_get_opts_len(struct nlattr *nla,
+ struct netlink_ext_ack *extack)
+{
+ return tunnel_key_copy_opts(nla, NULL, 0, extack);
+}
+
+static int tunnel_key_opts_set(struct nlattr *nla, struct ip_tunnel_info *info,
+ int opts_len, struct netlink_ext_ack *extack)
+{
+ info->options_len = opts_len;
+ switch (nla_type(nla_data(nla))) {
+ case TCA_TUNNEL_KEY_ENC_OPTS_GENEVE:
+#if IS_ENABLED(CONFIG_INET)
+ info->key.tun_flags |= TUNNEL_GENEVE_OPT;
+ return tunnel_key_copy_opts(nla, ip_tunnel_info_opts(info),
+ opts_len, extack);
+#else
+ return -EAFNOSUPPORT;
+#endif
+ default:
+ NL_SET_ERR_MSG(extack, "Cannot set tunnel options for unknown tunnel type");
+ return -EINVAL;
+ }
+}
+
static const struct nla_policy tunnel_key_policy[TCA_TUNNEL_KEY_MAX + 1] = {
[TCA_TUNNEL_KEY_PARMS] = { .len = sizeof(struct tc_tunnel_key) },
[TCA_TUNNEL_KEY_ENC_IPV4_SRC] = { .type = NLA_U32 },
@@ -66,39 +192,53 @@ static const struct nla_policy tunnel_key_policy[TCA_TUNNEL_KEY_MAX + 1] = {
[TCA_TUNNEL_KEY_ENC_KEY_ID] = { .type = NLA_U32 },
[TCA_TUNNEL_KEY_ENC_DST_PORT] = {.type = NLA_U16},
[TCA_TUNNEL_KEY_NO_CSUM] = { .type = NLA_U8 },
+ [TCA_TUNNEL_KEY_ENC_OPTS] = { .type = NLA_NESTED },
+ [TCA_TUNNEL_KEY_ENC_TOS] = { .type = NLA_U8 },
+ [TCA_TUNNEL_KEY_ENC_TTL] = { .type = NLA_U8 },
};
static int tunnel_key_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
- int ovr, int bind, struct netlink_ext_ack *extack)
+ int ovr, int bind, bool rtnl_held,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
struct nlattr *tb[TCA_TUNNEL_KEY_MAX + 1];
- struct tcf_tunnel_key_params *params_old;
struct tcf_tunnel_key_params *params_new;
struct metadata_dst *metadata = NULL;
struct tc_tunnel_key *parm;
struct tcf_tunnel_key *t;
bool exists = false;
__be16 dst_port = 0;
+ int opts_len = 0;
__be64 key_id;
__be16 flags;
+ u8 tos, ttl;
int ret = 0;
int err;
- if (!nla)
+ if (!nla) {
+ NL_SET_ERR_MSG(extack, "Tunnel requires attributes to be passed");
return -EINVAL;
+ }
err = nla_parse_nested(tb, TCA_TUNNEL_KEY_MAX, nla, tunnel_key_policy,
- NULL);
- if (err < 0)
+ extack);
+ if (err < 0) {
+ NL_SET_ERR_MSG(extack, "Failed to parse nested tunnel key attributes");
return err;
+ }
- if (!tb[TCA_TUNNEL_KEY_PARMS])
+ if (!tb[TCA_TUNNEL_KEY_PARMS]) {
+ NL_SET_ERR_MSG(extack, "Missing tunnel key parameters");
return -EINVAL;
+ }
parm = nla_data(tb[TCA_TUNNEL_KEY_PARMS]);
- exists = tcf_idr_check(tn, parm->index, a, bind);
+ err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ if (err < 0)
+ return err;
+ exists = err;
if (exists && bind)
return 0;
@@ -107,6 +247,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
break;
case TCA_TUNNEL_KEY_ACT_SET:
if (!tb[TCA_TUNNEL_KEY_ENC_KEY_ID]) {
+ NL_SET_ERR_MSG(extack, "Missing tunnel key id");
ret = -EINVAL;
goto err_out;
}
@@ -121,6 +262,22 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
if (tb[TCA_TUNNEL_KEY_ENC_DST_PORT])
dst_port = nla_get_be16(tb[TCA_TUNNEL_KEY_ENC_DST_PORT]);
+ if (tb[TCA_TUNNEL_KEY_ENC_OPTS]) {
+ opts_len = tunnel_key_get_opts_len(tb[TCA_TUNNEL_KEY_ENC_OPTS],
+ extack);
+ if (opts_len < 0) {
+ ret = opts_len;
+ goto err_out;
+ }
+ }
+
+ tos = 0;
+ if (tb[TCA_TUNNEL_KEY_ENC_TOS])
+ tos = nla_get_u8(tb[TCA_TUNNEL_KEY_ENC_TOS]);
+ ttl = 0;
+ if (tb[TCA_TUNNEL_KEY_ENC_TTL])
+ ttl = nla_get_u8(tb[TCA_TUNNEL_KEY_ENC_TTL]);
+
if (tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC] &&
tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]) {
__be32 saddr;
@@ -129,9 +286,9 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
saddr = nla_get_in_addr(tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC]);
daddr = nla_get_in_addr(tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]);
- metadata = __ip_tun_set_dst(saddr, daddr, 0, 0,
+ metadata = __ip_tun_set_dst(saddr, daddr, tos, ttl,
dst_port, flags,
- key_id, 0);
+ key_id, opts_len);
} else if (tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC] &&
tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]) {
struct in6_addr saddr;
@@ -140,19 +297,33 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
saddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC]);
daddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]);
- metadata = __ipv6_tun_set_dst(&saddr, &daddr, 0, 0, dst_port,
+ metadata = __ipv6_tun_set_dst(&saddr, &daddr, tos, ttl, dst_port,
0, flags,
key_id, 0);
+ } else {
+ NL_SET_ERR_MSG(extack, "Missing either ipv4 or ipv6 src and dst");
+ ret = -EINVAL;
+ goto err_out;
}
if (!metadata) {
- ret = -EINVAL;
+ NL_SET_ERR_MSG(extack, "Cannot allocate tunnel metadata dst");
+ ret = -ENOMEM;
goto err_out;
}
+ if (opts_len) {
+ ret = tunnel_key_opts_set(tb[TCA_TUNNEL_KEY_ENC_OPTS],
+ &metadata->u.tun_info,
+ opts_len, extack);
+ if (ret < 0)
+ goto release_tun_meta;
+ }
+
metadata->u.tun_info.mode |= IP_TUNNEL_INFO_TX;
break;
default:
+ NL_SET_ERR_MSG(extack, "Unknown tunnel key action");
ret = -EINVAL;
goto err_out;
}
@@ -160,45 +331,51 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
if (!exists) {
ret = tcf_idr_create(tn, parm->index, est, a,
&act_tunnel_key_ops, bind, true);
- if (ret)
- return ret;
+ if (ret) {
+ NL_SET_ERR_MSG(extack, "Cannot create TC IDR");
+ goto release_tun_meta;
+ }
ret = ACT_P_CREATED;
- } else {
- tcf_idr_release(*a, bind);
- if (!ovr)
- return -EEXIST;
+ } else if (!ovr) {
+ NL_SET_ERR_MSG(extack, "TC IDR already exists");
+ ret = -EEXIST;
+ goto release_tun_meta;
}
t = to_tunnel_key(*a);
- ASSERT_RTNL();
params_new = kzalloc(sizeof(*params_new), GFP_KERNEL);
if (unlikely(!params_new)) {
- if (ret == ACT_P_CREATED)
- tcf_idr_release(*a, bind);
- return -ENOMEM;
+ NL_SET_ERR_MSG(extack, "Cannot allocate tunnel key parameters");
+ ret = -ENOMEM;
+ exists = true;
+ goto release_tun_meta;
}
-
- params_old = rtnl_dereference(t->params);
-
- t->tcf_action = parm->action;
params_new->tcft_action = parm->t_action;
params_new->tcft_enc_metadata = metadata;
- rcu_assign_pointer(t->params, params_new);
-
- if (params_old)
- kfree_rcu(params_old, rcu);
+ spin_lock_bh(&t->tcf_lock);
+ t->tcf_action = parm->action;
+ rcu_swap_protected(t->params, params_new,
+ lockdep_is_held(&t->tcf_lock));
+ spin_unlock_bh(&t->tcf_lock);
+ if (params_new)
+ kfree_rcu(params_new, rcu);
if (ret == ACT_P_CREATED)
tcf_idr_insert(tn, *a);
return ret;
+release_tun_meta:
+ dst_release(&metadata->dst);
+
err_out:
if (exists)
tcf_idr_release(*a, bind);
+ else
+ tcf_idr_cleanup(tn, parm->index);
return ret;
}
@@ -216,6 +393,65 @@ static void tunnel_key_release(struct tc_action *a)
}
}
+static int tunnel_key_geneve_opts_dump(struct sk_buff *skb,
+ const struct ip_tunnel_info *info)
+{
+ int len = info->options_len;
+ u8 *src = (u8 *)(info + 1);
+ struct nlattr *start;
+
+ start = nla_nest_start(skb, TCA_TUNNEL_KEY_ENC_OPTS_GENEVE);
+ if (!start)
+ return -EMSGSIZE;
+
+ while (len > 0) {
+ struct geneve_opt *opt = (struct geneve_opt *)src;
+
+ if (nla_put_be16(skb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS,
+ opt->opt_class) ||
+ nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE,
+ opt->type) ||
+ nla_put(skb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA,
+ opt->length * 4, opt + 1)) {
+ nla_nest_cancel(skb, start);
+ return -EMSGSIZE;
+ }
+
+ len -= sizeof(struct geneve_opt) + opt->length * 4;
+ src += sizeof(struct geneve_opt) + opt->length * 4;
+ }
+
+ nla_nest_end(skb, start);
+ return 0;
+}
+
+static int tunnel_key_opts_dump(struct sk_buff *skb,
+ const struct ip_tunnel_info *info)
+{
+ struct nlattr *start;
+ int err = -EINVAL;
+
+ if (!info->options_len)
+ return 0;
+
+ start = nla_nest_start(skb, TCA_TUNNEL_KEY_ENC_OPTS);
+ if (!start)
+ return -EMSGSIZE;
+
+ if (info->key.tun_flags & TUNNEL_GENEVE_OPT) {
+ err = tunnel_key_geneve_opts_dump(skb, info);
+ if (err)
+ goto err_out;
+ } else {
+err_out:
+ nla_nest_cancel(skb, start);
+ return err;
+ }
+
+ nla_nest_end(skb, start);
+ return 0;
+}
+
static int tunnel_key_dump_addresses(struct sk_buff *skb,
const struct ip_tunnel_info *info)
{
@@ -252,22 +488,24 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a,
struct tcf_tunnel_key_params *params;
struct tc_tunnel_key opt = {
.index = t->tcf_index,
- .refcnt = t->tcf_refcnt - ref,
- .bindcnt = t->tcf_bindcnt - bind,
- .action = t->tcf_action,
+ .refcnt = refcount_read(&t->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&t->tcf_bindcnt) - bind,
};
struct tcf_t tm;
- params = rtnl_dereference(t->params);
-
+ spin_lock_bh(&t->tcf_lock);
+ params = rcu_dereference_protected(t->params,
+ lockdep_is_held(&t->tcf_lock));
+ opt.action = t->tcf_action;
opt.t_action = params->tcft_action;
if (nla_put(skb, TCA_TUNNEL_KEY_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
if (params->tcft_action == TCA_TUNNEL_KEY_ACT_SET) {
- struct ip_tunnel_key *key =
- &params->tcft_enc_metadata->u.tun_info.key;
+ struct ip_tunnel_info *info =
+ &params->tcft_enc_metadata->u.tun_info;
+ struct ip_tunnel_key *key = &info->key;
__be32 key_id = tunnel_id_to_key32(key->tun_id);
if (nla_put_be32(skb, TCA_TUNNEL_KEY_ENC_KEY_ID, key_id) ||
@@ -275,7 +513,14 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a,
&params->tcft_enc_metadata->u.tun_info) ||
nla_put_be16(skb, TCA_TUNNEL_KEY_ENC_DST_PORT, key->tp_dst) ||
nla_put_u8(skb, TCA_TUNNEL_KEY_NO_CSUM,
- !(key->tun_flags & TUNNEL_CSUM)))
+ !(key->tun_flags & TUNNEL_CSUM)) ||
+ tunnel_key_opts_dump(skb, info))
+ goto nla_put_failure;
+
+ if (key->tos && nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_TOS, key->tos))
+ goto nla_put_failure;
+
+ if (key->ttl && nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_TTL, key->ttl))
goto nla_put_failure;
}
@@ -283,10 +528,12 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a,
if (nla_put_64bit(skb, TCA_TUNNEL_KEY_TM, sizeof(tm),
&tm, TCA_TUNNEL_KEY_PAD))
goto nla_put_failure;
+ spin_unlock_bh(&t->tcf_lock);
return skb->len;
nla_put_failure:
+ spin_unlock_bh(&t->tcf_lock);
nlmsg_trim(skb, b);
return -1;
}
@@ -301,8 +548,7 @@ static int tunnel_key_walker(struct net *net, struct sk_buff *skb,
return tcf_generic_walker(tn, skb, cb, type, ops, extack);
}
-static int tunnel_key_search(struct net *net, struct tc_action **a, u32 index,
- struct netlink_ext_ack *extack)
+static int tunnel_key_search(struct net *net, struct tc_action **a, u32 index)
{
struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index 1fb39e1f9d07..ba677d54a7af 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -22,8 +22,8 @@
static unsigned int vlan_net_id;
static struct tc_action_ops act_vlan_ops;
-static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a,
- struct tcf_result *res)
+static int tcf_vlan_act(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
{
struct tcf_vlan *v = to_vlan(a);
struct tcf_vlan_params *p;
@@ -40,11 +40,9 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a,
if (skb_at_tc_ingress(skb))
skb_push_rcsum(skb, skb->mac_len);
- rcu_read_lock();
-
action = READ_ONCE(v->tcf_action);
- p = rcu_dereference(v->vlan_p);
+ p = rcu_dereference_bh(v->vlan_p);
switch (p->tcfv_action) {
case TCA_VLAN_ACT_POP:
@@ -61,7 +59,7 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a,
case TCA_VLAN_ACT_MODIFY:
/* No-op if no vlan tag (either hw-accel or in-payload) */
if (!skb_vlan_tagged(skb))
- goto unlock;
+ goto out;
/* extract existing tag (and guarantee no hw-accel tag) */
if (skb_vlan_tag_present(skb)) {
tci = skb_vlan_tag_get(skb);
@@ -86,18 +84,15 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a,
BUG();
}
- goto unlock;
-
-drop:
- action = TC_ACT_SHOT;
- qstats_drop_inc(this_cpu_ptr(v->common.cpu_qstats));
-
-unlock:
- rcu_read_unlock();
+out:
if (skb_at_tc_ingress(skb))
skb_pull_rcsum(skb, skb->mac_len);
return action;
+
+drop:
+ qstats_drop_inc(this_cpu_ptr(v->common.cpu_qstats));
+ return TC_ACT_SHOT;
}
static const struct nla_policy vlan_policy[TCA_VLAN_MAX + 1] = {
@@ -109,11 +104,12 @@ static const struct nla_policy vlan_policy[TCA_VLAN_MAX + 1] = {
static int tcf_vlan_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
- int ovr, int bind, struct netlink_ext_ack *extack)
+ int ovr, int bind, bool rtnl_held,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, vlan_net_id);
struct nlattr *tb[TCA_VLAN_MAX + 1];
- struct tcf_vlan_params *p, *p_old;
+ struct tcf_vlan_params *p;
struct tc_vlan *parm;
struct tcf_vlan *v;
int action;
@@ -133,7 +129,10 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
if (!tb[TCA_VLAN_PARMS])
return -EINVAL;
parm = nla_data(tb[TCA_VLAN_PARMS]);
- exists = tcf_idr_check(tn, parm->index, a, bind);
+ err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ if (err < 0)
+ return err;
+ exists = err;
if (exists && bind)
return 0;
@@ -145,12 +144,16 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
if (!tb[TCA_VLAN_PUSH_VLAN_ID]) {
if (exists)
tcf_idr_release(*a, bind);
+ else
+ tcf_idr_cleanup(tn, parm->index);
return -EINVAL;
}
push_vid = nla_get_u16(tb[TCA_VLAN_PUSH_VLAN_ID]);
if (push_vid >= VLAN_VID_MASK) {
if (exists)
tcf_idr_release(*a, bind);
+ else
+ tcf_idr_cleanup(tn, parm->index);
return -ERANGE;
}
@@ -163,6 +166,8 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
default:
if (exists)
tcf_idr_release(*a, bind);
+ else
+ tcf_idr_cleanup(tn, parm->index);
return -EPROTONOSUPPORT;
}
} else {
@@ -175,6 +180,8 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
default:
if (exists)
tcf_idr_release(*a, bind);
+ else
+ tcf_idr_cleanup(tn, parm->index);
return -EINVAL;
}
action = parm->v_action;
@@ -182,39 +189,37 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
if (!exists) {
ret = tcf_idr_create(tn, parm->index, est, a,
&act_vlan_ops, bind, true);
- if (ret)
+ if (ret) {
+ tcf_idr_cleanup(tn, parm->index);
return ret;
+ }
ret = ACT_P_CREATED;
- } else {
+ } else if (!ovr) {
tcf_idr_release(*a, bind);
- if (!ovr)
- return -EEXIST;
+ return -EEXIST;
}
v = to_vlan(*a);
- ASSERT_RTNL();
p = kzalloc(sizeof(*p), GFP_KERNEL);
if (!p) {
- if (ret == ACT_P_CREATED)
- tcf_idr_release(*a, bind);
+ tcf_idr_release(*a, bind);
return -ENOMEM;
}
- v->tcf_action = parm->action;
-
- p_old = rtnl_dereference(v->vlan_p);
-
p->tcfv_action = action;
p->tcfv_push_vid = push_vid;
p->tcfv_push_prio = push_prio;
p->tcfv_push_proto = push_proto;
- rcu_assign_pointer(v->vlan_p, p);
+ spin_lock_bh(&v->tcf_lock);
+ v->tcf_action = parm->action;
+ rcu_swap_protected(v->vlan_p, p, lockdep_is_held(&v->tcf_lock));
+ spin_unlock_bh(&v->tcf_lock);
- if (p_old)
- kfree_rcu(p_old, rcu);
+ if (p)
+ kfree_rcu(p, rcu);
if (ret == ACT_P_CREATED)
tcf_idr_insert(tn, *a);
@@ -236,16 +241,18 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a,
{
unsigned char *b = skb_tail_pointer(skb);
struct tcf_vlan *v = to_vlan(a);
- struct tcf_vlan_params *p = rtnl_dereference(v->vlan_p);
+ struct tcf_vlan_params *p;
struct tc_vlan opt = {
.index = v->tcf_index,
- .refcnt = v->tcf_refcnt - ref,
- .bindcnt = v->tcf_bindcnt - bind,
- .action = v->tcf_action,
- .v_action = p->tcfv_action,
+ .refcnt = refcount_read(&v->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&v->tcf_bindcnt) - bind,
};
struct tcf_t t;
+ spin_lock_bh(&v->tcf_lock);
+ opt.action = v->tcf_action;
+ p = rcu_dereference_protected(v->vlan_p, lockdep_is_held(&v->tcf_lock));
+ opt.v_action = p->tcfv_action;
if (nla_put(skb, TCA_VLAN_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
@@ -261,9 +268,12 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a,
tcf_tm_dump(&t, &v->tcf_tm);
if (nla_put_64bit(skb, TCA_VLAN_TM, sizeof(t), &t, TCA_VLAN_PAD))
goto nla_put_failure;
+ spin_unlock_bh(&v->tcf_lock);
+
return skb->len;
nla_put_failure:
+ spin_unlock_bh(&v->tcf_lock);
nlmsg_trim(skb, b);
return -1;
}
@@ -278,8 +288,7 @@ static int tcf_vlan_walker(struct net *net, struct sk_buff *skb,
return tcf_generic_walker(tn, skb, cb, type, ops, extack);
}
-static int tcf_vlan_search(struct net *net, struct tc_action **a, u32 index,
- struct netlink_ext_ack *extack)
+static int tcf_vlan_search(struct net *net, struct tc_action **a, u32 index)
{
struct tc_action_net *tn = net_generic(net, vlan_net_id);
@@ -290,7 +299,7 @@ static struct tc_action_ops act_vlan_ops = {
.kind = "vlan",
.type = TCA_ACT_VLAN,
.owner = THIS_MODULE,
- .act = tcf_vlan,
+ .act = tcf_vlan_act,
.dump = tcf_vlan_dump,
.init = tcf_vlan_init,
.cleanup = tcf_vlan_cleanup,
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index f74513a7c7a8..f427a1e00e7e 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -31,6 +31,8 @@
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
+extern const struct nla_policy rtm_tca_policy[TCA_MAX + 1];
+
/* The list of all installed classifier types */
static LIST_HEAD(tcf_proto_base);
@@ -39,7 +41,7 @@ static DEFINE_RWLOCK(cls_mod_lock);
/* Find classifier type by string name */
-static const struct tcf_proto_ops *tcf_proto_lookup_ops(const char *kind)
+static const struct tcf_proto_ops *__tcf_proto_lookup_ops(const char *kind)
{
const struct tcf_proto_ops *t, *res = NULL;
@@ -57,6 +59,33 @@ static const struct tcf_proto_ops *tcf_proto_lookup_ops(const char *kind)
return res;
}
+static const struct tcf_proto_ops *
+tcf_proto_lookup_ops(const char *kind, struct netlink_ext_ack *extack)
+{
+ const struct tcf_proto_ops *ops;
+
+ ops = __tcf_proto_lookup_ops(kind);
+ if (ops)
+ return ops;
+#ifdef CONFIG_MODULES
+ rtnl_unlock();
+ request_module("cls_%s", kind);
+ rtnl_lock();
+ ops = __tcf_proto_lookup_ops(kind);
+ /* We dropped the RTNL semaphore in order to perform
+ * the module load. So, even if we succeeded in loading
+ * the module we have to replay the request. We indicate
+ * this using -EAGAIN.
+ */
+ if (ops) {
+ module_put(ops->owner);
+ return ERR_PTR(-EAGAIN);
+ }
+#endif
+ NL_SET_ERR_MSG(extack, "TC classifier not found");
+ return ERR_PTR(-ENOENT);
+}
+
/* Register(unregister) new classifier type */
int register_tcf_proto_ops(struct tcf_proto_ops *ops)
@@ -133,27 +162,9 @@ static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol,
if (!tp)
return ERR_PTR(-ENOBUFS);
- err = -ENOENT;
- tp->ops = tcf_proto_lookup_ops(kind);
- if (!tp->ops) {
-#ifdef CONFIG_MODULES
- rtnl_unlock();
- request_module("cls_%s", kind);
- rtnl_lock();
- tp->ops = tcf_proto_lookup_ops(kind);
- /* We dropped the RTNL semaphore in order to perform
- * the module load. So, even if we succeeded in loading
- * the module we have to replay the request. We indicate
- * this using -EAGAIN.
- */
- if (tp->ops) {
- module_put(tp->ops->owner);
- err = -EAGAIN;
- } else {
- NL_SET_ERR_MSG(extack, "TC classifier not found");
- err = -ENOENT;
- }
-#endif
+ tp->ops = tcf_proto_lookup_ops(kind, extack);
+ if (IS_ERR(tp->ops)) {
+ err = PTR_ERR(tp->ops);
goto errout;
}
tp->classify = tp->ops->classify;
@@ -195,11 +206,12 @@ static struct tcf_chain *tcf_chain_create(struct tcf_block *block,
chain = kzalloc(sizeof(*chain), GFP_KERNEL);
if (!chain)
return NULL;
- INIT_LIST_HEAD(&chain->filter_chain_list);
list_add_tail(&chain->list, &block->chain_list);
chain->block = block;
chain->index = chain_index;
chain->refcnt = 1;
+ if (!chain->index)
+ block->chain0.chain = chain;
return chain;
}
@@ -209,36 +221,29 @@ static void tcf_chain_head_change_item(struct tcf_filter_chain_list_item *item,
if (item->chain_head_change)
item->chain_head_change(tp_head, item->chain_head_change_priv);
}
-static void tcf_chain_head_change(struct tcf_chain *chain,
- struct tcf_proto *tp_head)
+
+static void tcf_chain0_head_change(struct tcf_chain *chain,
+ struct tcf_proto *tp_head)
{
struct tcf_filter_chain_list_item *item;
+ struct tcf_block *block = chain->block;
- list_for_each_entry(item, &chain->filter_chain_list, list)
+ if (chain->index)
+ return;
+ list_for_each_entry(item, &block->chain0.filter_chain_list, list)
tcf_chain_head_change_item(item, tp_head);
}
-static void tcf_chain_flush(struct tcf_chain *chain)
-{
- struct tcf_proto *tp = rtnl_dereference(chain->filter_chain);
-
- tcf_chain_head_change(chain, NULL);
- while (tp) {
- RCU_INIT_POINTER(chain->filter_chain, tp->next);
- tcf_proto_destroy(tp, NULL);
- tp = rtnl_dereference(chain->filter_chain);
- tcf_chain_put(chain);
- }
-}
-
static void tcf_chain_destroy(struct tcf_chain *chain)
{
struct tcf_block *block = chain->block;
list_del(&chain->list);
+ if (!chain->index)
+ block->chain0.chain = NULL;
kfree(chain);
- if (list_empty(&block->chain_list))
- kfree(block);
+ if (list_empty(&block->chain_list) && !refcount_read(&block->refcnt))
+ kfree_rcu(block, rcu);
}
static void tcf_chain_hold(struct tcf_chain *chain)
@@ -246,28 +251,119 @@ static void tcf_chain_hold(struct tcf_chain *chain)
++chain->refcnt;
}
-struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index,
- bool create)
+static bool tcf_chain_held_by_acts_only(struct tcf_chain *chain)
+{
+ /* In case all the references are action references, this
+ * chain should not be shown to the user.
+ */
+ return chain->refcnt == chain->action_refcnt;
+}
+
+static struct tcf_chain *tcf_chain_lookup(struct tcf_block *block,
+ u32 chain_index)
{
struct tcf_chain *chain;
list_for_each_entry(chain, &block->chain_list, list) {
- if (chain->index == chain_index) {
- tcf_chain_hold(chain);
+ if (chain->index == chain_index)
return chain;
- }
}
+ return NULL;
+}
+
+static int tc_chain_notify(struct tcf_chain *chain, struct sk_buff *oskb,
+ u32 seq, u16 flags, int event, bool unicast);
+
+static struct tcf_chain *__tcf_chain_get(struct tcf_block *block,
+ u32 chain_index, bool create,
+ bool by_act)
+{
+ struct tcf_chain *chain = tcf_chain_lookup(block, chain_index);
+
+ if (chain) {
+ tcf_chain_hold(chain);
+ } else {
+ if (!create)
+ return NULL;
+ chain = tcf_chain_create(block, chain_index);
+ if (!chain)
+ return NULL;
+ }
+
+ if (by_act)
+ ++chain->action_refcnt;
- return create ? tcf_chain_create(block, chain_index) : NULL;
+ /* Send notification only in case we got the first
+ * non-action reference. Until then, the chain acts only as
+ * a placeholder for actions pointing to it and user ought
+ * not know about them.
+ */
+ if (chain->refcnt - chain->action_refcnt == 1 && !by_act)
+ tc_chain_notify(chain, NULL, 0, NLM_F_CREATE | NLM_F_EXCL,
+ RTM_NEWCHAIN, false);
+
+ return chain;
+}
+
+static struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index,
+ bool create)
+{
+ return __tcf_chain_get(block, chain_index, create, false);
}
-EXPORT_SYMBOL(tcf_chain_get);
-void tcf_chain_put(struct tcf_chain *chain)
+struct tcf_chain *tcf_chain_get_by_act(struct tcf_block *block, u32 chain_index)
{
- if (--chain->refcnt == 0)
+ return __tcf_chain_get(block, chain_index, true, true);
+}
+EXPORT_SYMBOL(tcf_chain_get_by_act);
+
+static void tc_chain_tmplt_del(struct tcf_chain *chain);
+
+static void __tcf_chain_put(struct tcf_chain *chain, bool by_act)
+{
+ if (by_act)
+ chain->action_refcnt--;
+ chain->refcnt--;
+
+ /* The last dropped non-action reference will trigger notification. */
+ if (chain->refcnt - chain->action_refcnt == 0 && !by_act)
+ tc_chain_notify(chain, NULL, 0, 0, RTM_DELCHAIN, false);
+
+ if (chain->refcnt == 0) {
+ tc_chain_tmplt_del(chain);
tcf_chain_destroy(chain);
+ }
+}
+
+static void tcf_chain_put(struct tcf_chain *chain)
+{
+ __tcf_chain_put(chain, false);
+}
+
+void tcf_chain_put_by_act(struct tcf_chain *chain)
+{
+ __tcf_chain_put(chain, true);
+}
+EXPORT_SYMBOL(tcf_chain_put_by_act);
+
+static void tcf_chain_put_explicitly_created(struct tcf_chain *chain)
+{
+ if (chain->explicitly_created)
+ tcf_chain_put(chain);
+}
+
+static void tcf_chain_flush(struct tcf_chain *chain)
+{
+ struct tcf_proto *tp = rtnl_dereference(chain->filter_chain);
+
+ tcf_chain0_head_change(chain, NULL);
+ while (tp) {
+ RCU_INIT_POINTER(chain->filter_chain, tp->next);
+ tcf_proto_destroy(tp, NULL);
+ tp = rtnl_dereference(chain->filter_chain);
+ tcf_chain_put(chain);
+ }
}
-EXPORT_SYMBOL(tcf_chain_put);
static bool tcf_block_offload_in_use(struct tcf_block *block)
{
@@ -277,18 +373,21 @@ static bool tcf_block_offload_in_use(struct tcf_block *block)
static int tcf_block_offload_cmd(struct tcf_block *block,
struct net_device *dev,
struct tcf_block_ext_info *ei,
- enum tc_block_command command)
+ enum tc_block_command command,
+ struct netlink_ext_ack *extack)
{
struct tc_block_offload bo = {};
bo.command = command;
bo.binder_type = ei->binder_type;
bo.block = block;
+ bo.extack = extack;
return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo);
}
static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q,
- struct tcf_block_ext_info *ei)
+ struct tcf_block_ext_info *ei,
+ struct netlink_ext_ack *extack)
{
struct net_device *dev = q->dev_queue->dev;
int err;
@@ -299,10 +398,12 @@ static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q,
/* If tc offload feature is disabled and the block we try to bind
* to already has some offloaded filters, forbid to bind.
*/
- if (!tc_can_offload(dev) && tcf_block_offload_in_use(block))
+ if (!tc_can_offload(dev) && tcf_block_offload_in_use(block)) {
+ NL_SET_ERR_MSG(extack, "Bind to offloaded block failed as dev has offload disabled");
return -EOPNOTSUPP;
+ }
- err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_BIND);
+ err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_BIND, extack);
if (err == -EOPNOTSUPP)
goto no_offload_dev_inc;
return err;
@@ -322,7 +423,7 @@ static void tcf_block_offload_unbind(struct tcf_block *block, struct Qdisc *q,
if (!dev->netdev_ops->ndo_setup_tc)
goto no_offload_dev_dec;
- err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_UNBIND);
+ err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_UNBIND, NULL);
if (err == -EOPNOTSUPP)
goto no_offload_dev_dec;
return;
@@ -332,10 +433,11 @@ no_offload_dev_dec:
}
static int
-tcf_chain_head_change_cb_add(struct tcf_chain *chain,
- struct tcf_block_ext_info *ei,
- struct netlink_ext_ack *extack)
+tcf_chain0_head_change_cb_add(struct tcf_block *block,
+ struct tcf_block_ext_info *ei,
+ struct netlink_ext_ack *extack)
{
+ struct tcf_chain *chain0 = block->chain0.chain;
struct tcf_filter_chain_list_item *item;
item = kmalloc(sizeof(*item), GFP_KERNEL);
@@ -345,23 +447,25 @@ tcf_chain_head_change_cb_add(struct tcf_chain *chain,
}
item->chain_head_change = ei->chain_head_change;
item->chain_head_change_priv = ei->chain_head_change_priv;
- if (chain->filter_chain)
- tcf_chain_head_change_item(item, chain->filter_chain);
- list_add(&item->list, &chain->filter_chain_list);
+ if (chain0 && chain0->filter_chain)
+ tcf_chain_head_change_item(item, chain0->filter_chain);
+ list_add(&item->list, &block->chain0.filter_chain_list);
return 0;
}
static void
-tcf_chain_head_change_cb_del(struct tcf_chain *chain,
- struct tcf_block_ext_info *ei)
+tcf_chain0_head_change_cb_del(struct tcf_block *block,
+ struct tcf_block_ext_info *ei)
{
+ struct tcf_chain *chain0 = block->chain0.chain;
struct tcf_filter_chain_list_item *item;
- list_for_each_entry(item, &chain->filter_chain_list, list) {
+ list_for_each_entry(item, &block->chain0.filter_chain_list, list) {
if ((!ei->chain_head_change && !ei->chain_head_change_priv) ||
(item->chain_head_change == ei->chain_head_change &&
item->chain_head_change_priv == ei->chain_head_change_priv)) {
- tcf_chain_head_change_item(item, NULL);
+ if (chain0)
+ tcf_chain_head_change_item(item, NULL);
list_del(&item->list);
kfree(item);
return;
@@ -371,6 +475,7 @@ tcf_chain_head_change_cb_del(struct tcf_chain *chain,
}
struct tcf_net {
+ spinlock_t idr_lock; /* Protects idr */
struct idr idr;
};
@@ -380,16 +485,25 @@ static int tcf_block_insert(struct tcf_block *block, struct net *net,
struct netlink_ext_ack *extack)
{
struct tcf_net *tn = net_generic(net, tcf_net_id);
+ int err;
+
+ idr_preload(GFP_KERNEL);
+ spin_lock(&tn->idr_lock);
+ err = idr_alloc_u32(&tn->idr, block, &block->index, block->index,
+ GFP_NOWAIT);
+ spin_unlock(&tn->idr_lock);
+ idr_preload_end();
- return idr_alloc_u32(&tn->idr, block, &block->index, block->index,
- GFP_KERNEL);
+ return err;
}
static void tcf_block_remove(struct tcf_block *block, struct net *net)
{
struct tcf_net *tn = net_generic(net, tcf_net_id);
+ spin_lock(&tn->idr_lock);
idr_remove(&tn->idr, block->index);
+ spin_unlock(&tn->idr_lock);
}
static struct tcf_block *tcf_block_create(struct net *net, struct Qdisc *q,
@@ -397,8 +511,6 @@ static struct tcf_block *tcf_block_create(struct net *net, struct Qdisc *q,
struct netlink_ext_ack *extack)
{
struct tcf_block *block;
- struct tcf_chain *chain;
- int err;
block = kzalloc(sizeof(*block), GFP_KERNEL);
if (!block) {
@@ -408,15 +520,9 @@ static struct tcf_block *tcf_block_create(struct net *net, struct Qdisc *q,
INIT_LIST_HEAD(&block->chain_list);
INIT_LIST_HEAD(&block->cb_list);
INIT_LIST_HEAD(&block->owner_list);
+ INIT_LIST_HEAD(&block->chain0.filter_chain_list);
- /* Create chain 0 by default, it has to be always present. */
- chain = tcf_chain_create(block, 0);
- if (!chain) {
- NL_SET_ERR_MSG(extack, "Failed to create new tcf chain");
- err = -ENOMEM;
- goto err_chain_create;
- }
- block->refcnt = 1;
+ refcount_set(&block->refcnt, 1);
block->net = net;
block->index = block_index;
@@ -424,10 +530,6 @@ static struct tcf_block *tcf_block_create(struct net *net, struct Qdisc *q,
if (!tcf_block_shared(block))
block->q = q;
return block;
-
-err_chain_create:
- kfree(block);
- return ERR_PTR(err);
}
static struct tcf_block *tcf_block_lookup(struct net *net, u32 block_index)
@@ -437,6 +539,78 @@ static struct tcf_block *tcf_block_lookup(struct net *net, u32 block_index)
return idr_find(&tn->idr, block_index);
}
+static struct tcf_block *tcf_block_refcnt_get(struct net *net, u32 block_index)
+{
+ struct tcf_block *block;
+
+ rcu_read_lock();
+ block = tcf_block_lookup(net, block_index);
+ if (block && !refcount_inc_not_zero(&block->refcnt))
+ block = NULL;
+ rcu_read_unlock();
+
+ return block;
+}
+
+static void tcf_block_flush_all_chains(struct tcf_block *block)
+{
+ struct tcf_chain *chain;
+
+ /* Hold a refcnt for all chains, so that they don't disappear
+ * while we are iterating.
+ */
+ list_for_each_entry(chain, &block->chain_list, list)
+ tcf_chain_hold(chain);
+
+ list_for_each_entry(chain, &block->chain_list, list)
+ tcf_chain_flush(chain);
+}
+
+static void tcf_block_put_all_chains(struct tcf_block *block)
+{
+ struct tcf_chain *chain, *tmp;
+
+ /* At this point, all the chains should have refcnt >= 1. */
+ list_for_each_entry_safe(chain, tmp, &block->chain_list, list) {
+ tcf_chain_put_explicitly_created(chain);
+ tcf_chain_put(chain);
+ }
+}
+
+static void __tcf_block_put(struct tcf_block *block, struct Qdisc *q,
+ struct tcf_block_ext_info *ei)
+{
+ if (refcount_dec_and_test(&block->refcnt)) {
+ /* Flushing/putting all chains will cause the block to be
+ * deallocated when last chain is freed. However, if chain_list
+ * is empty, block has to be manually deallocated. After block
+ * reference counter reached 0, it is no longer possible to
+ * increment it or add new chains to block.
+ */
+ bool free_block = list_empty(&block->chain_list);
+
+ if (tcf_block_shared(block))
+ tcf_block_remove(block, block->net);
+ if (!free_block)
+ tcf_block_flush_all_chains(block);
+
+ if (q)
+ tcf_block_offload_unbind(block, q, ei);
+
+ if (free_block)
+ kfree_rcu(block, rcu);
+ else
+ tcf_block_put_all_chains(block);
+ } else if (q) {
+ tcf_block_offload_unbind(block, q, ei);
+ }
+}
+
+static void tcf_block_refcnt_put(struct tcf_block *block)
+{
+ __tcf_block_put(block, NULL, NULL);
+}
+
/* Find tcf block.
* Set q, parent, cl when appropriate.
*/
@@ -447,9 +621,10 @@ static struct tcf_block *tcf_block_find(struct net *net, struct Qdisc **q,
struct netlink_ext_ack *extack)
{
struct tcf_block *block;
+ int err = 0;
if (ifindex == TCM_IFINDEX_MAGIC_BLOCK) {
- block = tcf_block_lookup(net, block_index);
+ block = tcf_block_refcnt_get(net, block_index);
if (!block) {
NL_SET_ERR_MSG(extack, "Block of given index was not found");
return ERR_PTR(-EINVAL);
@@ -458,60 +633,106 @@ static struct tcf_block *tcf_block_find(struct net *net, struct Qdisc **q,
const struct Qdisc_class_ops *cops;
struct net_device *dev;
+ rcu_read_lock();
+
/* Find link */
- dev = __dev_get_by_index(net, ifindex);
- if (!dev)
+ dev = dev_get_by_index_rcu(net, ifindex);
+ if (!dev) {
+ rcu_read_unlock();
return ERR_PTR(-ENODEV);
+ }
/* Find qdisc */
if (!*parent) {
*q = dev->qdisc;
*parent = (*q)->handle;
} else {
- *q = qdisc_lookup(dev, TC_H_MAJ(*parent));
+ *q = qdisc_lookup_rcu(dev, TC_H_MAJ(*parent));
if (!*q) {
NL_SET_ERR_MSG(extack, "Parent Qdisc doesn't exists");
- return ERR_PTR(-EINVAL);
+ err = -EINVAL;
+ goto errout_rcu;
}
}
+ *q = qdisc_refcount_inc_nz(*q);
+ if (!*q) {
+ NL_SET_ERR_MSG(extack, "Parent Qdisc doesn't exists");
+ err = -EINVAL;
+ goto errout_rcu;
+ }
+
/* Is it classful? */
cops = (*q)->ops->cl_ops;
if (!cops) {
NL_SET_ERR_MSG(extack, "Qdisc not classful");
- return ERR_PTR(-EINVAL);
+ err = -EINVAL;
+ goto errout_rcu;
}
if (!cops->tcf_block) {
NL_SET_ERR_MSG(extack, "Class doesn't support blocks");
- return ERR_PTR(-EOPNOTSUPP);
+ err = -EOPNOTSUPP;
+ goto errout_rcu;
}
+ /* At this point we know that qdisc is not noop_qdisc,
+ * which means that qdisc holds a reference to net_device
+ * and we hold a reference to qdisc, so it is safe to release
+ * rcu read lock.
+ */
+ rcu_read_unlock();
+
/* Do we search for filter, attached to class? */
if (TC_H_MIN(*parent)) {
*cl = cops->find(*q, *parent);
if (*cl == 0) {
NL_SET_ERR_MSG(extack, "Specified class doesn't exist");
- return ERR_PTR(-ENOENT);
+ err = -ENOENT;
+ goto errout_qdisc;
}
}
/* And the last stroke */
block = cops->tcf_block(*q, *cl, extack);
- if (!block)
- return ERR_PTR(-EINVAL);
+ if (!block) {
+ err = -EINVAL;
+ goto errout_qdisc;
+ }
if (tcf_block_shared(block)) {
NL_SET_ERR_MSG(extack, "This filter block is shared. Please use the block index to manipulate the filters");
- return ERR_PTR(-EOPNOTSUPP);
+ err = -EOPNOTSUPP;
+ goto errout_qdisc;
}
+
+ /* Always take reference to block in order to support execution
+ * of rules update path of cls API without rtnl lock. Caller
+ * must release block when it is finished using it. 'if' block
+ * of this conditional obtain reference to block by calling
+ * tcf_block_refcnt_get().
+ */
+ refcount_inc(&block->refcnt);
}
return block;
+
+errout_rcu:
+ rcu_read_unlock();
+errout_qdisc:
+ if (*q) {
+ qdisc_put(*q);
+ *q = NULL;
+ }
+ return ERR_PTR(err);
}
-static struct tcf_chain *tcf_block_chain_zero(struct tcf_block *block)
+static void tcf_block_release(struct Qdisc *q, struct tcf_block *block)
{
- return list_first_entry(&block->chain_list, struct tcf_chain, list);
+ if (!IS_ERR_OR_NULL(block))
+ tcf_block_refcnt_put(block);
+
+ if (q)
+ qdisc_put(q);
}
struct tcf_block_owner_item {
@@ -579,21 +800,16 @@ int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q,
{
struct net *net = qdisc_net(q);
struct tcf_block *block = NULL;
- bool created = false;
int err;
- if (ei->block_index) {
+ if (ei->block_index)
/* block_index not 0 means the shared block is requested */
- block = tcf_block_lookup(net, ei->block_index);
- if (block)
- block->refcnt++;
- }
+ block = tcf_block_refcnt_get(net, ei->block_index);
if (!block) {
block = tcf_block_create(net, q, ei->block_index, extack);
if (IS_ERR(block))
return PTR_ERR(block);
- created = true;
if (tcf_block_shared(block)) {
err = tcf_block_insert(block, net, extack);
if (err)
@@ -607,12 +823,11 @@ int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q,
tcf_block_owner_netif_keep_dst(block, q, ei->binder_type);
- err = tcf_chain_head_change_cb_add(tcf_block_chain_zero(block),
- ei, extack);
+ err = tcf_chain0_head_change_cb_add(block, ei, extack);
if (err)
- goto err_chain_head_change_cb_add;
+ goto err_chain0_head_change_cb_add;
- err = tcf_block_offload_bind(block, q, ei);
+ err = tcf_block_offload_bind(block, q, ei, extack);
if (err)
goto err_block_offload_bind;
@@ -620,19 +835,12 @@ int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q,
return 0;
err_block_offload_bind:
- tcf_chain_head_change_cb_del(tcf_block_chain_zero(block), ei);
-err_chain_head_change_cb_add:
+ tcf_chain0_head_change_cb_del(block, ei);
+err_chain0_head_change_cb_add:
tcf_block_owner_del(block, q, ei->binder_type);
err_block_owner_add:
- if (created) {
- if (tcf_block_shared(block))
- tcf_block_remove(block, net);
err_block_insert:
- kfree(tcf_block_chain_zero(block));
- kfree(block);
- } else {
- block->refcnt--;
- }
+ tcf_block_refcnt_put(block);
return err;
}
EXPORT_SYMBOL(tcf_block_get_ext);
@@ -664,37 +872,12 @@ EXPORT_SYMBOL(tcf_block_get);
void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q,
struct tcf_block_ext_info *ei)
{
- struct tcf_chain *chain, *tmp;
-
if (!block)
return;
- tcf_chain_head_change_cb_del(tcf_block_chain_zero(block), ei);
+ tcf_chain0_head_change_cb_del(block, ei);
tcf_block_owner_del(block, q, ei->binder_type);
- if (--block->refcnt == 0) {
- if (tcf_block_shared(block))
- tcf_block_remove(block, block->net);
-
- /* Hold a refcnt for all chains, so that they don't disappear
- * while we are iterating.
- */
- list_for_each_entry(chain, &block->chain_list, list)
- tcf_chain_hold(chain);
-
- list_for_each_entry(chain, &block->chain_list, list)
- tcf_chain_flush(chain);
- }
-
- tcf_block_offload_unbind(block, q, ei);
-
- if (block->refcnt == 0) {
- /* At this point, all the chains should have refcnt >= 1. */
- list_for_each_entry_safe(chain, tmp, &block->chain_list, list)
- tcf_chain_put(chain);
-
- /* Finally, put chain 0 and allow block to be freed. */
- tcf_chain_put(tcf_block_chain_zero(block));
- }
+ __tcf_block_put(block, q, ei);
}
EXPORT_SYMBOL(tcf_block_put_ext);
@@ -746,18 +929,53 @@ unsigned int tcf_block_cb_decref(struct tcf_block_cb *block_cb)
}
EXPORT_SYMBOL(tcf_block_cb_decref);
+static int
+tcf_block_playback_offloads(struct tcf_block *block, tc_setup_cb_t *cb,
+ void *cb_priv, bool add, bool offload_in_use,
+ struct netlink_ext_ack *extack)
+{
+ struct tcf_chain *chain;
+ struct tcf_proto *tp;
+ int err;
+
+ list_for_each_entry(chain, &block->chain_list, list) {
+ for (tp = rtnl_dereference(chain->filter_chain); tp;
+ tp = rtnl_dereference(tp->next)) {
+ if (tp->ops->reoffload) {
+ err = tp->ops->reoffload(tp, add, cb, cb_priv,
+ extack);
+ if (err && add)
+ goto err_playback_remove;
+ } else if (add && offload_in_use) {
+ err = -EOPNOTSUPP;
+ NL_SET_ERR_MSG(extack, "Filter HW offload failed - classifier without re-offloading support");
+ goto err_playback_remove;
+ }
+ }
+ }
+
+ return 0;
+
+err_playback_remove:
+ tcf_block_playback_offloads(block, cb, cb_priv, false, offload_in_use,
+ extack);
+ return err;
+}
+
struct tcf_block_cb *__tcf_block_cb_register(struct tcf_block *block,
tc_setup_cb_t *cb, void *cb_ident,
- void *cb_priv)
+ void *cb_priv,
+ struct netlink_ext_ack *extack)
{
struct tcf_block_cb *block_cb;
+ int err;
- /* At this point, playback of previous block cb calls is not supported,
- * so forbid to register to block which already has some offloaded
- * filters present.
- */
- if (tcf_block_offload_in_use(block))
- return ERR_PTR(-EOPNOTSUPP);
+ /* Replay any already present rules */
+ err = tcf_block_playback_offloads(block, cb, cb_priv, true,
+ tcf_block_offload_in_use(block),
+ extack);
+ if (err)
+ return ERR_PTR(err);
block_cb = kzalloc(sizeof(*block_cb), GFP_KERNEL);
if (!block_cb)
@@ -772,17 +990,22 @@ EXPORT_SYMBOL(__tcf_block_cb_register);
int tcf_block_cb_register(struct tcf_block *block,
tc_setup_cb_t *cb, void *cb_ident,
- void *cb_priv)
+ void *cb_priv, struct netlink_ext_ack *extack)
{
struct tcf_block_cb *block_cb;
- block_cb = __tcf_block_cb_register(block, cb, cb_ident, cb_priv);
- return IS_ERR(block_cb) ? PTR_ERR(block_cb) : 0;
+ block_cb = __tcf_block_cb_register(block, cb, cb_ident, cb_priv,
+ extack);
+ return PTR_ERR_OR_ZERO(block_cb);
}
EXPORT_SYMBOL(tcf_block_cb_register);
-void __tcf_block_cb_unregister(struct tcf_block_cb *block_cb)
+void __tcf_block_cb_unregister(struct tcf_block *block,
+ struct tcf_block_cb *block_cb)
{
+ tcf_block_playback_offloads(block, block_cb->cb, block_cb->cb_priv,
+ false, tcf_block_offload_in_use(block),
+ NULL);
list_del(&block_cb->list);
kfree(block_cb);
}
@@ -796,7 +1019,7 @@ void tcf_block_cb_unregister(struct tcf_block *block,
block_cb = tcf_block_cb_lookup(block, cb, cb_ident);
if (!block_cb)
return;
- __tcf_block_cb_unregister(block_cb);
+ __tcf_block_cb_unregister(block, block_cb);
}
EXPORT_SYMBOL(tcf_block_cb_unregister);
@@ -893,7 +1116,7 @@ static void tcf_chain_tp_insert(struct tcf_chain *chain,
struct tcf_proto *tp)
{
if (*chain_info->pprev == chain->filter_chain)
- tcf_chain_head_change(chain, tp);
+ tcf_chain0_head_change(chain, tp);
RCU_INIT_POINTER(tp->next, tcf_chain_tp_prev(chain_info));
rcu_assign_pointer(*chain_info->pprev, tp);
tcf_chain_hold(chain);
@@ -906,7 +1129,7 @@ static void tcf_chain_tp_remove(struct tcf_chain *chain,
struct tcf_proto *next = rtnl_dereference(chain_info->next);
if (tp == chain->filter_chain)
- tcf_chain_head_change(chain, next);
+ tcf_chain0_head_change(chain, next);
RCU_INIT_POINTER(*chain_info->pprev, next);
tcf_chain_put(chain);
}
@@ -1083,7 +1306,7 @@ static int tc_new_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
replay:
tp_created = 0;
- err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL, extack);
+ err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, rtm_tca_policy, extack);
if (err < 0)
return err;
@@ -1124,7 +1347,7 @@ replay:
}
chain = tcf_chain_get(block, chain_index, true);
if (!chain) {
- NL_SET_ERR_MSG(extack, "Cannot find specified filter chain");
+ NL_SET_ERR_MSG(extack, "Cannot create specified filter chain");
err = -ENOMEM;
goto errout;
}
@@ -1182,6 +1405,12 @@ replay:
goto errout;
}
+ if (chain->tmplt_ops && chain->tmplt_ops != tp->ops) {
+ NL_SET_ERR_MSG(extack, "Chain template is set to a different filter kind");
+ err = -EINVAL;
+ goto errout;
+ }
+
err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh,
n->nlmsg_flags & NLM_F_CREATE ? TCA_ACT_NOREPLACE : TCA_ACT_REPLACE,
extack);
@@ -1198,6 +1427,7 @@ replay:
errout:
if (chain)
tcf_chain_put(chain);
+ tcf_block_release(q, block);
if (err == -EAGAIN)
/* Replay the request. */
goto replay;
@@ -1226,7 +1456,7 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
return -EPERM;
- err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL, extack);
+ err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, rtm_tca_policy, extack);
if (err < 0)
return err;
@@ -1257,8 +1487,15 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
}
chain = tcf_chain_get(block, chain_index, false);
if (!chain) {
+ /* User requested flush on non-existent chain. Nothing to do,
+ * so just return success.
+ */
+ if (prio == 0) {
+ err = 0;
+ goto errout;
+ }
NL_SET_ERR_MSG(extack, "Cannot find specified filter chain");
- err = -EINVAL;
+ err = -ENOENT;
goto errout;
}
@@ -1312,6 +1549,7 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
errout:
if (chain)
tcf_chain_put(chain);
+ tcf_block_release(q, block);
return err;
}
@@ -1334,7 +1572,7 @@ static int tc_get_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
void *fh = NULL;
int err;
- err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL, extack);
+ err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, rtm_tca_policy, extack);
if (err < 0)
return err;
@@ -1397,6 +1635,7 @@ static int tc_get_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
errout:
if (chain)
tcf_chain_put(chain);
+ tcf_block_release(q, block);
return err;
}
@@ -1463,7 +1702,9 @@ static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent,
arg.w.stop = 0;
arg.w.skip = cb->args[1] - 1;
arg.w.count = 0;
+ arg.w.cookie = cb->args[2];
tp->ops->walk(tp, &arg.w);
+ cb->args[2] = arg.w.cookie;
cb->args[1] = arg.w.count + 1;
if (arg.w.stop)
return false;
@@ -1488,12 +1729,13 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
if (nlmsg_len(cb->nlh) < sizeof(*tcm))
return skb->len;
- err = nlmsg_parse(cb->nlh, sizeof(*tcm), tca, TCA_MAX, NULL, NULL);
+ err = nlmsg_parse(cb->nlh, sizeof(*tcm), tca, TCA_MAX, NULL,
+ cb->extack);
if (err)
return err;
if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) {
- block = tcf_block_lookup(net, tcm->tcm_block_index);
+ block = tcf_block_refcnt_get(net, tcm->tcm_block_index);
if (!block)
goto out;
/* If we work with block index, q is NULL and parent value
@@ -1552,6 +1794,344 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
}
}
+ if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK)
+ tcf_block_refcnt_put(block);
+ cb->args[0] = index;
+
+out:
+ /* If we did no progress, the error (EMSGSIZE) is real */
+ if (skb->len == 0 && err)
+ return err;
+ return skb->len;
+}
+
+static int tc_chain_fill_node(struct tcf_chain *chain, struct net *net,
+ struct sk_buff *skb, struct tcf_block *block,
+ u32 portid, u32 seq, u16 flags, int event)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+ const struct tcf_proto_ops *ops;
+ struct nlmsghdr *nlh;
+ struct tcmsg *tcm;
+ void *priv;
+
+ ops = chain->tmplt_ops;
+ priv = chain->tmplt_priv;
+
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
+ if (!nlh)
+ goto out_nlmsg_trim;
+ tcm = nlmsg_data(nlh);
+ tcm->tcm_family = AF_UNSPEC;
+ tcm->tcm__pad1 = 0;
+ tcm->tcm__pad2 = 0;
+ tcm->tcm_handle = 0;
+ if (block->q) {
+ tcm->tcm_ifindex = qdisc_dev(block->q)->ifindex;
+ tcm->tcm_parent = block->q->handle;
+ } else {
+ tcm->tcm_ifindex = TCM_IFINDEX_MAGIC_BLOCK;
+ tcm->tcm_block_index = block->index;
+ }
+
+ if (nla_put_u32(skb, TCA_CHAIN, chain->index))
+ goto nla_put_failure;
+
+ if (ops) {
+ if (nla_put_string(skb, TCA_KIND, ops->kind))
+ goto nla_put_failure;
+ if (ops->tmplt_dump(skb, net, priv) < 0)
+ goto nla_put_failure;
+ }
+
+ nlh->nlmsg_len = skb_tail_pointer(skb) - b;
+ return skb->len;
+
+out_nlmsg_trim:
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -EMSGSIZE;
+}
+
+static int tc_chain_notify(struct tcf_chain *chain, struct sk_buff *oskb,
+ u32 seq, u16 flags, int event, bool unicast)
+{
+ u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
+ struct tcf_block *block = chain->block;
+ struct net *net = block->net;
+ struct sk_buff *skb;
+
+ skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb)
+ return -ENOBUFS;
+
+ if (tc_chain_fill_node(chain, net, skb, block, portid,
+ seq, flags, event) <= 0) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ if (unicast)
+ return netlink_unicast(net->rtnl, skb, portid, MSG_DONTWAIT);
+
+ return rtnetlink_send(skb, net, portid, RTNLGRP_TC, flags & NLM_F_ECHO);
+}
+
+static int tc_chain_tmplt_add(struct tcf_chain *chain, struct net *net,
+ struct nlattr **tca,
+ struct netlink_ext_ack *extack)
+{
+ const struct tcf_proto_ops *ops;
+ void *tmplt_priv;
+
+ /* If kind is not set, user did not specify template. */
+ if (!tca[TCA_KIND])
+ return 0;
+
+ ops = tcf_proto_lookup_ops(nla_data(tca[TCA_KIND]), extack);
+ if (IS_ERR(ops))
+ return PTR_ERR(ops);
+ if (!ops->tmplt_create || !ops->tmplt_destroy || !ops->tmplt_dump) {
+ NL_SET_ERR_MSG(extack, "Chain templates are not supported with specified classifier");
+ return -EOPNOTSUPP;
+ }
+
+ tmplt_priv = ops->tmplt_create(net, chain, tca, extack);
+ if (IS_ERR(tmplt_priv)) {
+ module_put(ops->owner);
+ return PTR_ERR(tmplt_priv);
+ }
+ chain->tmplt_ops = ops;
+ chain->tmplt_priv = tmplt_priv;
+ return 0;
+}
+
+static void tc_chain_tmplt_del(struct tcf_chain *chain)
+{
+ const struct tcf_proto_ops *ops = chain->tmplt_ops;
+
+ /* If template ops are set, no work to do for us. */
+ if (!ops)
+ return;
+
+ ops->tmplt_destroy(chain->tmplt_priv);
+ module_put(ops->owner);
+}
+
+/* Add/delete/get a chain */
+
+static int tc_ctl_chain(struct sk_buff *skb, struct nlmsghdr *n,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ struct nlattr *tca[TCA_MAX + 1];
+ struct tcmsg *t;
+ u32 parent;
+ u32 chain_index;
+ struct Qdisc *q = NULL;
+ struct tcf_chain *chain = NULL;
+ struct tcf_block *block;
+ unsigned long cl;
+ int err;
+
+ if (n->nlmsg_type != RTM_GETCHAIN &&
+ !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+replay:
+ err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, rtm_tca_policy, extack);
+ if (err < 0)
+ return err;
+
+ t = nlmsg_data(n);
+ parent = t->tcm_parent;
+ cl = 0;
+
+ block = tcf_block_find(net, &q, &parent, &cl,
+ t->tcm_ifindex, t->tcm_block_index, extack);
+ if (IS_ERR(block))
+ return PTR_ERR(block);
+
+ chain_index = tca[TCA_CHAIN] ? nla_get_u32(tca[TCA_CHAIN]) : 0;
+ if (chain_index > TC_ACT_EXT_VAL_MASK) {
+ NL_SET_ERR_MSG(extack, "Specified chain index exceeds upper limit");
+ err = -EINVAL;
+ goto errout_block;
+ }
+ chain = tcf_chain_lookup(block, chain_index);
+ if (n->nlmsg_type == RTM_NEWCHAIN) {
+ if (chain) {
+ if (tcf_chain_held_by_acts_only(chain)) {
+ /* The chain exists only because there is
+ * some action referencing it.
+ */
+ tcf_chain_hold(chain);
+ } else {
+ NL_SET_ERR_MSG(extack, "Filter chain already exists");
+ err = -EEXIST;
+ goto errout_block;
+ }
+ } else {
+ if (!(n->nlmsg_flags & NLM_F_CREATE)) {
+ NL_SET_ERR_MSG(extack, "Need both RTM_NEWCHAIN and NLM_F_CREATE to create a new chain");
+ err = -ENOENT;
+ goto errout_block;
+ }
+ chain = tcf_chain_create(block, chain_index);
+ if (!chain) {
+ NL_SET_ERR_MSG(extack, "Failed to create filter chain");
+ err = -ENOMEM;
+ goto errout_block;
+ }
+ }
+ } else {
+ if (!chain || tcf_chain_held_by_acts_only(chain)) {
+ NL_SET_ERR_MSG(extack, "Cannot find specified filter chain");
+ err = -EINVAL;
+ goto errout_block;
+ }
+ tcf_chain_hold(chain);
+ }
+
+ switch (n->nlmsg_type) {
+ case RTM_NEWCHAIN:
+ err = tc_chain_tmplt_add(chain, net, tca, extack);
+ if (err)
+ goto errout;
+ /* In case the chain was successfully added, take a reference
+ * to the chain. This ensures that an empty chain
+ * does not disappear at the end of this function.
+ */
+ tcf_chain_hold(chain);
+ chain->explicitly_created = true;
+ tc_chain_notify(chain, NULL, 0, NLM_F_CREATE | NLM_F_EXCL,
+ RTM_NEWCHAIN, false);
+ break;
+ case RTM_DELCHAIN:
+ tfilter_notify_chain(net, skb, block, q, parent, n,
+ chain, RTM_DELTFILTER);
+ /* Flush the chain first as the user requested chain removal. */
+ tcf_chain_flush(chain);
+ /* In case the chain was successfully deleted, put a reference
+ * to the chain previously taken during addition.
+ */
+ tcf_chain_put_explicitly_created(chain);
+ chain->explicitly_created = false;
+ break;
+ case RTM_GETCHAIN:
+ err = tc_chain_notify(chain, skb, n->nlmsg_seq,
+ n->nlmsg_seq, n->nlmsg_type, true);
+ if (err < 0)
+ NL_SET_ERR_MSG(extack, "Failed to send chain notify message");
+ break;
+ default:
+ err = -EOPNOTSUPP;
+ NL_SET_ERR_MSG(extack, "Unsupported message type");
+ goto errout;
+ }
+
+errout:
+ tcf_chain_put(chain);
+errout_block:
+ tcf_block_release(q, block);
+ if (err == -EAGAIN)
+ /* Replay the request. */
+ goto replay;
+ return err;
+}
+
+/* called with RTNL */
+static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ struct nlattr *tca[TCA_MAX + 1];
+ struct Qdisc *q = NULL;
+ struct tcf_block *block;
+ struct tcf_chain *chain;
+ struct tcmsg *tcm = nlmsg_data(cb->nlh);
+ long index_start;
+ long index;
+ u32 parent;
+ int err;
+
+ if (nlmsg_len(cb->nlh) < sizeof(*tcm))
+ return skb->len;
+
+ err = nlmsg_parse(cb->nlh, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy,
+ cb->extack);
+ if (err)
+ return err;
+
+ if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) {
+ block = tcf_block_refcnt_get(net, tcm->tcm_block_index);
+ if (!block)
+ goto out;
+ /* If we work with block index, q is NULL and parent value
+ * will never be used in the following code. The check
+ * in tcf_fill_node prevents it. However, compiler does not
+ * see that far, so set parent to zero to silence the warning
+ * about parent being uninitialized.
+ */
+ parent = 0;
+ } else {
+ const struct Qdisc_class_ops *cops;
+ struct net_device *dev;
+ unsigned long cl = 0;
+
+ dev = __dev_get_by_index(net, tcm->tcm_ifindex);
+ if (!dev)
+ return skb->len;
+
+ parent = tcm->tcm_parent;
+ if (!parent) {
+ q = dev->qdisc;
+ parent = q->handle;
+ } else {
+ q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent));
+ }
+ if (!q)
+ goto out;
+ cops = q->ops->cl_ops;
+ if (!cops)
+ goto out;
+ if (!cops->tcf_block)
+ goto out;
+ if (TC_H_MIN(tcm->tcm_parent)) {
+ cl = cops->find(q, tcm->tcm_parent);
+ if (cl == 0)
+ goto out;
+ }
+ block = cops->tcf_block(q, cl, NULL);
+ if (!block)
+ goto out;
+ if (tcf_block_shared(block))
+ q = NULL;
+ }
+
+ index_start = cb->args[0];
+ index = 0;
+
+ list_for_each_entry(chain, &block->chain_list, list) {
+ if ((tca[TCA_CHAIN] &&
+ nla_get_u32(tca[TCA_CHAIN]) != chain->index))
+ continue;
+ if (index < index_start) {
+ index++;
+ continue;
+ }
+ if (tcf_chain_held_by_acts_only(chain))
+ continue;
+ err = tc_chain_fill_node(chain, net, skb, block,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ RTM_NEWCHAIN);
+ if (err <= 0)
+ break;
+ index++;
+ }
+
+ if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK)
+ tcf_block_refcnt_put(block);
cb->args[0] = index;
out:
@@ -1564,11 +2144,7 @@ out:
void tcf_exts_destroy(struct tcf_exts *exts)
{
#ifdef CONFIG_NET_CLS_ACT
- LIST_HEAD(actions);
-
- ASSERT_RTNL();
- tcf_exts_to_list(exts, &actions);
- tcf_action_destroy(&actions, TCA_ACT_UNBIND);
+ tcf_action_destroy(exts->actions, TCA_ACT_UNBIND);
kfree(exts->actions);
exts->nr_actions = 0;
#endif
@@ -1587,7 +2163,7 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
if (exts->police && tb[exts->police]) {
act = tcf_action_init_1(net, tp, tb[exts->police],
rate_tlv, "police", ovr,
- TCA_ACT_BIND, extack);
+ TCA_ACT_BIND, true, extack);
if (IS_ERR(act))
return PTR_ERR(act);
@@ -1595,17 +2171,15 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
exts->actions[0] = act;
exts->nr_actions = 1;
} else if (exts->action && tb[exts->action]) {
- LIST_HEAD(actions);
- int err, i = 0;
+ int err;
err = tcf_action_init(net, tp, tb[exts->action],
rate_tlv, NULL, ovr, TCA_ACT_BIND,
- &actions, &attr_size, extack);
- if (err)
+ exts->actions, &attr_size, true,
+ extack);
+ if (err < 0)
return err;
- list_for_each_entry(act, &actions, list)
- exts->actions[i++] = act;
- exts->nr_actions = i;
+ exts->nr_actions = err;
}
exts->net = net;
}
@@ -1654,14 +2228,11 @@ int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts)
* tc data even if iproute2 was newer - jhs
*/
if (exts->type != TCA_OLD_COMPAT) {
- LIST_HEAD(actions);
-
nest = nla_nest_start(skb, exts->action);
if (nest == NULL)
goto nla_put_failure;
- tcf_exts_to_list(exts, &actions);
- if (tcf_action_dump(skb, &actions, 0, 0) < 0)
+ if (tcf_action_dump(skb, exts->actions, 0, 0) < 0)
goto nla_put_failure;
nla_nest_end(skb, nest);
} else if (exts->police) {
@@ -1718,6 +2289,7 @@ static int tc_exts_setup_cb_egdev_call(struct tcf_exts *exts,
if (!dev)
continue;
ret = tc_setup_cb_egdev_call(dev, type, type_data, err_stop);
+ a->ops->put_dev(dev);
if (ret < 0)
return ret;
ok_count += ret;
@@ -1752,6 +2324,7 @@ static __net_init int tcf_net_init(struct net *net)
{
struct tcf_net *tn = net_generic(net, tcf_net_id);
+ spin_lock_init(&tn->idr_lock);
idr_init(&tn->idr);
return 0;
}
@@ -1786,6 +2359,10 @@ static int __init tc_filter_init(void)
rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_del_tfilter, NULL, 0);
rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_get_tfilter,
tc_dump_tfilter, 0);
+ rtnl_register(PF_UNSPEC, RTM_NEWCHAIN, tc_ctl_chain, NULL, 0);
+ rtnl_register(PF_UNSPEC, RTM_DELCHAIN, tc_ctl_chain, NULL, 0);
+ rtnl_register(PF_UNSPEC, RTM_GETCHAIN, tc_ctl_chain,
+ tc_dump_chain, 0);
return 0;
diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c
index 95367f37098d..6a5dce8baf19 100644
--- a/net/sched/cls_basic.c
+++ b/net/sched/cls_basic.c
@@ -324,4 +324,3 @@ static void __exit exit_basic(void)
module_init(init_basic)
module_exit(exit_basic)
MODULE_LICENSE("GPL");
-
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index 1aa7f6511065..fa6fe2fe0f32 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -43,6 +43,7 @@ struct cls_bpf_prog {
struct tcf_result res;
bool exts_integrated;
u32 gen_flags;
+ unsigned int in_hw_count;
struct tcf_exts exts;
u32 handle;
u16 bpf_num_ops;
@@ -174,6 +175,7 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog,
cls_bpf_offload_cmd(tp, oldprog, prog, extack);
return err;
} else if (err > 0) {
+ prog->in_hw_count = err;
tcf_block_offload_inc(block, &prog->gen_flags);
}
}
@@ -347,12 +349,10 @@ static int cls_bpf_prog_from_ops(struct nlattr **tb, struct cls_bpf_prog *prog)
if (bpf_size != nla_len(tb[TCA_BPF_OPS]))
return -EINVAL;
- bpf_ops = kzalloc(bpf_size, GFP_KERNEL);
+ bpf_ops = kmemdup(nla_data(tb[TCA_BPF_OPS]), bpf_size, GFP_KERNEL);
if (bpf_ops == NULL)
return -ENOMEM;
- memcpy(bpf_ops, nla_data(tb[TCA_BPF_OPS]), bpf_size);
-
fprog_tmp.len = bpf_num_ops;
fprog_tmp.filter = bpf_ops;
@@ -652,6 +652,42 @@ skip:
}
}
+static int cls_bpf_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
+ void *cb_priv, struct netlink_ext_ack *extack)
+{
+ struct cls_bpf_head *head = rtnl_dereference(tp->root);
+ struct tcf_block *block = tp->chain->block;
+ struct tc_cls_bpf_offload cls_bpf = {};
+ struct cls_bpf_prog *prog;
+ int err;
+
+ list_for_each_entry(prog, &head->plist, link) {
+ if (tc_skip_hw(prog->gen_flags))
+ continue;
+
+ tc_cls_common_offload_init(&cls_bpf.common, tp, prog->gen_flags,
+ extack);
+ cls_bpf.command = TC_CLSBPF_OFFLOAD;
+ cls_bpf.exts = &prog->exts;
+ cls_bpf.prog = add ? prog->filter : NULL;
+ cls_bpf.oldprog = add ? NULL : prog->filter;
+ cls_bpf.name = prog->bpf_name;
+ cls_bpf.exts_integrated = prog->exts_integrated;
+
+ err = cb(TC_SETUP_CLSBPF, &cls_bpf, cb_priv);
+ if (err) {
+ if (add && tc_skip_sw(prog->gen_flags))
+ return err;
+ continue;
+ }
+
+ tc_cls_offload_cnt_update(block, &prog->in_hw_count,
+ &prog->gen_flags, add);
+ }
+
+ return 0;
+}
+
static struct tcf_proto_ops cls_bpf_ops __read_mostly = {
.kind = "bpf",
.owner = THIS_MODULE,
@@ -662,6 +698,7 @@ static struct tcf_proto_ops cls_bpf_ops __read_mostly = {
.change = cls_bpf_change,
.delete = cls_bpf_delete,
.walk = cls_bpf_walk,
+ .reoffload = cls_bpf_reoffload,
.dump = cls_bpf_dump,
.bind_class = cls_bpf_bind_class,
};
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 9e8b26a80fb3..9aada2d0ef06 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -24,6 +24,7 @@
#include <net/pkt_cls.h>
#include <net/ip.h>
#include <net/flow_dissector.h>
+#include <net/geneve.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
@@ -35,6 +36,7 @@ struct fl_flow_key {
struct flow_dissector_key_basic basic;
struct flow_dissector_key_eth_addrs eth;
struct flow_dissector_key_vlan vlan;
+ struct flow_dissector_key_vlan cvlan;
union {
struct flow_dissector_key_ipv4_addrs ipv4;
struct flow_dissector_key_ipv6_addrs ipv6;
@@ -51,6 +53,8 @@ struct fl_flow_key {
struct flow_dissector_key_mpls mpls;
struct flow_dissector_key_tcp tcp;
struct flow_dissector_key_ip ip;
+ struct flow_dissector_key_ip enc_ip;
+ struct flow_dissector_key_enc_opts enc_opts;
} __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */
struct fl_flow_mask_range {
@@ -70,6 +74,13 @@ struct fl_flow_mask {
struct list_head list;
};
+struct fl_flow_tmplt {
+ struct fl_flow_key dummy_key;
+ struct fl_flow_key mask;
+ struct flow_dissector dissector;
+ struct tcf_chain *chain;
+};
+
struct cls_fl_head {
struct rhashtable ht;
struct list_head masks;
@@ -87,6 +98,7 @@ struct cls_fl_filter {
struct list_head list;
u32 handle;
u32 flags;
+ u32 in_hw_count;
struct rcu_work rwork;
struct net_device *hw_dev;
};
@@ -144,6 +156,23 @@ static void fl_set_masked_key(struct fl_flow_key *mkey, struct fl_flow_key *key,
*lmkey++ = *lkey++ & *lmask++;
}
+static bool fl_mask_fits_tmplt(struct fl_flow_tmplt *tmplt,
+ struct fl_flow_mask *mask)
+{
+ const long *lmask = fl_key_get_start(&mask->key, mask);
+ const long *ltmplt;
+ int i;
+
+ if (!tmplt)
+ return true;
+ ltmplt = fl_key_get_start(&tmplt->mask, mask);
+ for (i = 0; i < fl_mask_range(mask); i += sizeof(long)) {
+ if (~*ltmplt++ & *lmask++)
+ return false;
+ }
+ return true;
+}
+
static void fl_clear_masked_range(struct fl_flow_key *key,
struct fl_flow_mask *mask)
{
@@ -289,6 +318,7 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
fl_hw_destroy_filter(tp, f, NULL);
return err;
} else if (err > 0) {
+ f->in_hw_count = err;
tcf_block_offload_inc(block, &f->flags);
}
@@ -447,6 +477,28 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = {
[TCA_FLOWER_KEY_IP_TOS_MASK] = { .type = NLA_U8 },
[TCA_FLOWER_KEY_IP_TTL] = { .type = NLA_U8 },
[TCA_FLOWER_KEY_IP_TTL_MASK] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_CVLAN_ID] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_CVLAN_PRIO] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_CVLAN_ETH_TYPE] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_ENC_IP_TOS] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ENC_IP_TOS_MASK] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ENC_IP_TTL] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ENC_IP_TTL_MASK] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ENC_OPTS] = { .type = NLA_NESTED },
+ [TCA_FLOWER_KEY_ENC_OPTS_MASK] = { .type = NLA_NESTED },
+};
+
+static const struct nla_policy
+enc_opts_policy[TCA_FLOWER_KEY_ENC_OPTS_MAX + 1] = {
+ [TCA_FLOWER_KEY_ENC_OPTS_GENEVE] = { .type = NLA_NESTED },
+};
+
+static const struct nla_policy
+geneve_opt_policy[TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX + 1] = {
+ [TCA_FLOWER_KEY_ENC_OPT_GENEVE_CLASS] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_ENC_OPT_GENEVE_TYPE] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ENC_OPT_GENEVE_DATA] = { .type = NLA_BINARY,
+ .len = 128 },
};
static void fl_set_key_val(struct nlattr **tb,
@@ -498,22 +550,26 @@ static int fl_set_key_mpls(struct nlattr **tb,
}
static void fl_set_key_vlan(struct nlattr **tb,
+ __be16 ethertype,
+ int vlan_id_key, int vlan_prio_key,
struct flow_dissector_key_vlan *key_val,
struct flow_dissector_key_vlan *key_mask)
{
#define VLAN_PRIORITY_MASK 0x7
- if (tb[TCA_FLOWER_KEY_VLAN_ID]) {
+ if (tb[vlan_id_key]) {
key_val->vlan_id =
- nla_get_u16(tb[TCA_FLOWER_KEY_VLAN_ID]) & VLAN_VID_MASK;
+ nla_get_u16(tb[vlan_id_key]) & VLAN_VID_MASK;
key_mask->vlan_id = VLAN_VID_MASK;
}
- if (tb[TCA_FLOWER_KEY_VLAN_PRIO]) {
+ if (tb[vlan_prio_key]) {
key_val->vlan_priority =
- nla_get_u8(tb[TCA_FLOWER_KEY_VLAN_PRIO]) &
+ nla_get_u8(tb[vlan_prio_key]) &
VLAN_PRIORITY_MASK;
key_mask->vlan_priority = VLAN_PRIORITY_MASK;
}
+ key_val->vlan_tpid = ethertype;
+ key_mask->vlan_tpid = cpu_to_be16(~0);
}
static void fl_set_key_flag(u32 flower_key, u32 flower_mask,
@@ -551,17 +607,156 @@ static int fl_set_key_flags(struct nlattr **tb,
return 0;
}
-static void fl_set_key_ip(struct nlattr **tb,
+static void fl_set_key_ip(struct nlattr **tb, bool encap,
struct flow_dissector_key_ip *key,
struct flow_dissector_key_ip *mask)
{
- fl_set_key_val(tb, &key->tos, TCA_FLOWER_KEY_IP_TOS,
- &mask->tos, TCA_FLOWER_KEY_IP_TOS_MASK,
- sizeof(key->tos));
+ int tos_key = encap ? TCA_FLOWER_KEY_ENC_IP_TOS : TCA_FLOWER_KEY_IP_TOS;
+ int ttl_key = encap ? TCA_FLOWER_KEY_ENC_IP_TTL : TCA_FLOWER_KEY_IP_TTL;
+ int tos_mask = encap ? TCA_FLOWER_KEY_ENC_IP_TOS_MASK : TCA_FLOWER_KEY_IP_TOS_MASK;
+ int ttl_mask = encap ? TCA_FLOWER_KEY_ENC_IP_TTL_MASK : TCA_FLOWER_KEY_IP_TTL_MASK;
- fl_set_key_val(tb, &key->ttl, TCA_FLOWER_KEY_IP_TTL,
- &mask->ttl, TCA_FLOWER_KEY_IP_TTL_MASK,
- sizeof(key->ttl));
+ fl_set_key_val(tb, &key->tos, tos_key, &mask->tos, tos_mask, sizeof(key->tos));
+ fl_set_key_val(tb, &key->ttl, ttl_key, &mask->ttl, ttl_mask, sizeof(key->ttl));
+}
+
+static int fl_set_geneve_opt(const struct nlattr *nla, struct fl_flow_key *key,
+ int depth, int option_len,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX + 1];
+ struct nlattr *class = NULL, *type = NULL, *data = NULL;
+ struct geneve_opt *opt;
+ int err, data_len = 0;
+
+ if (option_len > sizeof(struct geneve_opt))
+ data_len = option_len - sizeof(struct geneve_opt);
+
+ opt = (struct geneve_opt *)&key->enc_opts.data[key->enc_opts.len];
+ memset(opt, 0xff, option_len);
+ opt->length = data_len / 4;
+ opt->r1 = 0;
+ opt->r2 = 0;
+ opt->r3 = 0;
+
+ /* If no mask has been prodived we assume an exact match. */
+ if (!depth)
+ return sizeof(struct geneve_opt) + data_len;
+
+ if (nla_type(nla) != TCA_FLOWER_KEY_ENC_OPTS_GENEVE) {
+ NL_SET_ERR_MSG(extack, "Non-geneve option type for mask");
+ return -EINVAL;
+ }
+
+ err = nla_parse_nested(tb, TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX,
+ nla, geneve_opt_policy, extack);
+ if (err < 0)
+ return err;
+
+ /* We are not allowed to omit any of CLASS, TYPE or DATA
+ * fields from the key.
+ */
+ if (!option_len &&
+ (!tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_CLASS] ||
+ !tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_TYPE] ||
+ !tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_DATA])) {
+ NL_SET_ERR_MSG(extack, "Missing tunnel key geneve option class, type or data");
+ return -EINVAL;
+ }
+
+ /* Omitting any of CLASS, TYPE or DATA fields is allowed
+ * for the mask.
+ */
+ if (tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_DATA]) {
+ int new_len = key->enc_opts.len;
+
+ data = tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_DATA];
+ data_len = nla_len(data);
+ if (data_len < 4) {
+ NL_SET_ERR_MSG(extack, "Tunnel key geneve option data is less than 4 bytes long");
+ return -ERANGE;
+ }
+ if (data_len % 4) {
+ NL_SET_ERR_MSG(extack, "Tunnel key geneve option data is not a multiple of 4 bytes long");
+ return -ERANGE;
+ }
+
+ new_len += sizeof(struct geneve_opt) + data_len;
+ BUILD_BUG_ON(FLOW_DIS_TUN_OPTS_MAX != IP_TUNNEL_OPTS_MAX);
+ if (new_len > FLOW_DIS_TUN_OPTS_MAX) {
+ NL_SET_ERR_MSG(extack, "Tunnel options exceeds max size");
+ return -ERANGE;
+ }
+ opt->length = data_len / 4;
+ memcpy(opt->opt_data, nla_data(data), data_len);
+ }
+
+ if (tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_CLASS]) {
+ class = tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_CLASS];
+ opt->opt_class = nla_get_be16(class);
+ }
+
+ if (tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_TYPE]) {
+ type = tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_TYPE];
+ opt->type = nla_get_u8(type);
+ }
+
+ return sizeof(struct geneve_opt) + data_len;
+}
+
+static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key,
+ struct fl_flow_key *mask,
+ struct netlink_ext_ack *extack)
+{
+ const struct nlattr *nla_enc_key, *nla_opt_key, *nla_opt_msk = NULL;
+ int option_len, key_depth, msk_depth = 0;
+
+ nla_enc_key = nla_data(tb[TCA_FLOWER_KEY_ENC_OPTS]);
+
+ if (tb[TCA_FLOWER_KEY_ENC_OPTS_MASK]) {
+ nla_opt_msk = nla_data(tb[TCA_FLOWER_KEY_ENC_OPTS_MASK]);
+ msk_depth = nla_len(tb[TCA_FLOWER_KEY_ENC_OPTS_MASK]);
+ }
+
+ nla_for_each_attr(nla_opt_key, nla_enc_key,
+ nla_len(tb[TCA_FLOWER_KEY_ENC_OPTS]), key_depth) {
+ switch (nla_type(nla_opt_key)) {
+ case TCA_FLOWER_KEY_ENC_OPTS_GENEVE:
+ option_len = 0;
+ key->enc_opts.dst_opt_type = TUNNEL_GENEVE_OPT;
+ option_len = fl_set_geneve_opt(nla_opt_key, key,
+ key_depth, option_len,
+ extack);
+ if (option_len < 0)
+ return option_len;
+
+ key->enc_opts.len += option_len;
+ /* At the same time we need to parse through the mask
+ * in order to verify exact and mask attribute lengths.
+ */
+ mask->enc_opts.dst_opt_type = TUNNEL_GENEVE_OPT;
+ option_len = fl_set_geneve_opt(nla_opt_msk, mask,
+ msk_depth, option_len,
+ extack);
+ if (option_len < 0)
+ return option_len;
+
+ mask->enc_opts.len += option_len;
+ if (key->enc_opts.len != mask->enc_opts.len) {
+ NL_SET_ERR_MSG(extack, "Key and mask miss aligned");
+ return -EINVAL;
+ }
+
+ if (msk_depth)
+ nla_opt_msk = nla_next(nla_opt_msk, &msk_depth);
+ break;
+ default:
+ NL_SET_ERR_MSG(extack, "Unknown tunnel option type");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
}
static int fl_set_key(struct net *net, struct nlattr **tb,
@@ -590,12 +785,28 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
if (tb[TCA_FLOWER_KEY_ETH_TYPE]) {
ethertype = nla_get_be16(tb[TCA_FLOWER_KEY_ETH_TYPE]);
- if (ethertype == htons(ETH_P_8021Q)) {
- fl_set_key_vlan(tb, &key->vlan, &mask->vlan);
- fl_set_key_val(tb, &key->basic.n_proto,
- TCA_FLOWER_KEY_VLAN_ETH_TYPE,
- &mask->basic.n_proto, TCA_FLOWER_UNSPEC,
- sizeof(key->basic.n_proto));
+ if (eth_type_vlan(ethertype)) {
+ fl_set_key_vlan(tb, ethertype, TCA_FLOWER_KEY_VLAN_ID,
+ TCA_FLOWER_KEY_VLAN_PRIO, &key->vlan,
+ &mask->vlan);
+
+ if (tb[TCA_FLOWER_KEY_VLAN_ETH_TYPE]) {
+ ethertype = nla_get_be16(tb[TCA_FLOWER_KEY_VLAN_ETH_TYPE]);
+ if (eth_type_vlan(ethertype)) {
+ fl_set_key_vlan(tb, ethertype,
+ TCA_FLOWER_KEY_CVLAN_ID,
+ TCA_FLOWER_KEY_CVLAN_PRIO,
+ &key->cvlan, &mask->cvlan);
+ fl_set_key_val(tb, &key->basic.n_proto,
+ TCA_FLOWER_KEY_CVLAN_ETH_TYPE,
+ &mask->basic.n_proto,
+ TCA_FLOWER_UNSPEC,
+ sizeof(key->basic.n_proto));
+ } else {
+ key->basic.n_proto = ethertype;
+ mask->basic.n_proto = cpu_to_be16(~0);
+ }
+ }
} else {
key->basic.n_proto = ethertype;
mask->basic.n_proto = cpu_to_be16(~0);
@@ -607,7 +818,7 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
fl_set_key_val(tb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO,
&mask->basic.ip_proto, TCA_FLOWER_UNSPEC,
sizeof(key->basic.ip_proto));
- fl_set_key_ip(tb, &key->ip, &mask->ip);
+ fl_set_key_ip(tb, false, &key->ip, &mask->ip);
}
if (tb[TCA_FLOWER_KEY_IPV4_SRC] || tb[TCA_FLOWER_KEY_IPV4_DST]) {
@@ -742,6 +953,14 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
&mask->enc_tp.dst, TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK,
sizeof(key->enc_tp.dst));
+ fl_set_key_ip(tb, true, &key->enc_ip, &mask->enc_ip);
+
+ if (tb[TCA_FLOWER_KEY_ENC_OPTS]) {
+ ret = fl_set_enc_opt(tb, key, mask, extack);
+ if (ret)
+ return ret;
+ }
+
if (tb[TCA_FLOWER_KEY_FLAGS])
ret = fl_set_key_flags(tb, &key->control.flags, &mask->control.flags);
@@ -774,7 +993,7 @@ static int fl_init_mask_hashtable(struct fl_flow_mask *mask)
}
#define FL_KEY_MEMBER_OFFSET(member) offsetof(struct fl_flow_key, member)
-#define FL_KEY_MEMBER_SIZE(member) (sizeof(((struct fl_flow_key *) 0)->member))
+#define FL_KEY_MEMBER_SIZE(member) FIELD_SIZEOF(struct fl_flow_key, member)
#define FL_KEY_IS_MASKED(mask, member) \
memchr_inv(((char *)mask) + FL_KEY_MEMBER_OFFSET(member), \
@@ -793,47 +1012,54 @@ static int fl_init_mask_hashtable(struct fl_flow_mask *mask)
FL_KEY_SET(keys, cnt, id, member); \
} while(0);
-static void fl_init_dissector(struct fl_flow_mask *mask)
+static void fl_init_dissector(struct flow_dissector *dissector,
+ struct fl_flow_key *mask)
{
struct flow_dissector_key keys[FLOW_DISSECTOR_KEY_MAX];
size_t cnt = 0;
FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_CONTROL, control);
FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_BASIC, basic);
- FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
FLOW_DISSECTOR_KEY_ETH_ADDRS, eth);
- FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4);
- FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6);
- FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
FLOW_DISSECTOR_KEY_PORTS, tp);
- FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
FLOW_DISSECTOR_KEY_IP, ip);
- FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
FLOW_DISSECTOR_KEY_TCP, tcp);
- FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
FLOW_DISSECTOR_KEY_ICMP, icmp);
- FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
FLOW_DISSECTOR_KEY_ARP, arp);
- FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
FLOW_DISSECTOR_KEY_MPLS, mpls);
- FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
FLOW_DISSECTOR_KEY_VLAN, vlan);
- FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
+ FLOW_DISSECTOR_KEY_CVLAN, cvlan);
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
FLOW_DISSECTOR_KEY_ENC_KEYID, enc_key_id);
- FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, enc_ipv4);
- FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, enc_ipv6);
- if (FL_KEY_IS_MASKED(&mask->key, enc_ipv4) ||
- FL_KEY_IS_MASKED(&mask->key, enc_ipv6))
+ if (FL_KEY_IS_MASKED(mask, enc_ipv4) ||
+ FL_KEY_IS_MASKED(mask, enc_ipv6))
FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_ENC_CONTROL,
enc_control);
- FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
FLOW_DISSECTOR_KEY_ENC_PORTS, enc_tp);
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
+ FLOW_DISSECTOR_KEY_ENC_IP, enc_ip);
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
+ FLOW_DISSECTOR_KEY_ENC_OPTS, enc_opts);
- skb_flow_dissector_init(&mask->dissector, keys, cnt);
+ skb_flow_dissector_init(dissector, keys, cnt);
}
static struct fl_flow_mask *fl_create_new_mask(struct cls_fl_head *head,
@@ -852,7 +1078,7 @@ static struct fl_flow_mask *fl_create_new_mask(struct cls_fl_head *head,
if (err)
goto errout_free;
- fl_init_dissector(newmask);
+ fl_init_dissector(&newmask->dissector, &newmask->key);
INIT_LIST_HEAD_RCU(&newmask->filters);
@@ -901,6 +1127,7 @@ static int fl_set_parms(struct net *net, struct tcf_proto *tp,
struct cls_fl_filter *f, struct fl_flow_mask *mask,
unsigned long base, struct nlattr **tb,
struct nlattr *est, bool ovr,
+ struct fl_flow_tmplt *tmplt,
struct netlink_ext_ack *extack)
{
int err;
@@ -921,6 +1148,11 @@ static int fl_set_parms(struct net *net, struct tcf_proto *tp,
fl_mask_update_range(mask);
fl_set_masked_key(&f->mkey, &f->key, mask);
+ if (!fl_mask_fits_tmplt(tmplt, mask)) {
+ NL_SET_ERR_MSG_MOD(extack, "Mask does not fit the template");
+ return -EINVAL;
+ }
+
return 0;
}
@@ -986,7 +1218,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
}
err = fl_set_parms(net, tp, fnew, &mask, base, tb, tca[TCA_RATE], ovr,
- extack);
+ tp->chain->tmplt_priv, extack);
if (err)
goto errout_idr;
@@ -1071,20 +1303,146 @@ static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg)
{
struct cls_fl_head *head = rtnl_dereference(tp->root);
struct cls_fl_filter *f;
+
+ arg->count = arg->skip;
+
+ while ((f = idr_get_next_ul(&head->handle_idr,
+ &arg->cookie)) != NULL) {
+ if (arg->fn(tp, f, arg) < 0) {
+ arg->stop = 1;
+ break;
+ }
+ arg->cookie = f->handle + 1;
+ arg->count++;
+ }
+}
+
+static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
+ void *cb_priv, struct netlink_ext_ack *extack)
+{
+ struct cls_fl_head *head = rtnl_dereference(tp->root);
+ struct tc_cls_flower_offload cls_flower = {};
+ struct tcf_block *block = tp->chain->block;
struct fl_flow_mask *mask;
+ struct cls_fl_filter *f;
+ int err;
- list_for_each_entry_rcu(mask, &head->masks, list) {
- list_for_each_entry_rcu(f, &mask->filters, list) {
- if (arg->count < arg->skip)
- goto skip;
- if (arg->fn(tp, f, arg) < 0) {
- arg->stop = 1;
- break;
+ list_for_each_entry(mask, &head->masks, list) {
+ list_for_each_entry(f, &mask->filters, list) {
+ if (tc_skip_hw(f->flags))
+ continue;
+
+ tc_cls_common_offload_init(&cls_flower.common, tp,
+ f->flags, extack);
+ cls_flower.command = add ?
+ TC_CLSFLOWER_REPLACE : TC_CLSFLOWER_DESTROY;
+ cls_flower.cookie = (unsigned long)f;
+ cls_flower.dissector = &mask->dissector;
+ cls_flower.mask = &mask->key;
+ cls_flower.key = &f->mkey;
+ cls_flower.exts = &f->exts;
+ cls_flower.classid = f->res.classid;
+
+ err = cb(TC_SETUP_CLSFLOWER, &cls_flower, cb_priv);
+ if (err) {
+ if (add && tc_skip_sw(f->flags))
+ return err;
+ continue;
}
-skip:
- arg->count++;
+
+ tc_cls_offload_cnt_update(block, &f->in_hw_count,
+ &f->flags, add);
}
}
+
+ return 0;
+}
+
+static void fl_hw_create_tmplt(struct tcf_chain *chain,
+ struct fl_flow_tmplt *tmplt)
+{
+ struct tc_cls_flower_offload cls_flower = {};
+ struct tcf_block *block = chain->block;
+ struct tcf_exts dummy_exts = { 0, };
+
+ cls_flower.common.chain_index = chain->index;
+ cls_flower.command = TC_CLSFLOWER_TMPLT_CREATE;
+ cls_flower.cookie = (unsigned long) tmplt;
+ cls_flower.dissector = &tmplt->dissector;
+ cls_flower.mask = &tmplt->mask;
+ cls_flower.key = &tmplt->dummy_key;
+ cls_flower.exts = &dummy_exts;
+
+ /* We don't care if driver (any of them) fails to handle this
+ * call. It serves just as a hint for it.
+ */
+ tc_setup_cb_call(block, NULL, TC_SETUP_CLSFLOWER,
+ &cls_flower, false);
+}
+
+static void fl_hw_destroy_tmplt(struct tcf_chain *chain,
+ struct fl_flow_tmplt *tmplt)
+{
+ struct tc_cls_flower_offload cls_flower = {};
+ struct tcf_block *block = chain->block;
+
+ cls_flower.common.chain_index = chain->index;
+ cls_flower.command = TC_CLSFLOWER_TMPLT_DESTROY;
+ cls_flower.cookie = (unsigned long) tmplt;
+
+ tc_setup_cb_call(block, NULL, TC_SETUP_CLSFLOWER,
+ &cls_flower, false);
+}
+
+static void *fl_tmplt_create(struct net *net, struct tcf_chain *chain,
+ struct nlattr **tca,
+ struct netlink_ext_ack *extack)
+{
+ struct fl_flow_tmplt *tmplt;
+ struct nlattr **tb;
+ int err;
+
+ if (!tca[TCA_OPTIONS])
+ return ERR_PTR(-EINVAL);
+
+ tb = kcalloc(TCA_FLOWER_MAX + 1, sizeof(struct nlattr *), GFP_KERNEL);
+ if (!tb)
+ return ERR_PTR(-ENOBUFS);
+ err = nla_parse_nested(tb, TCA_FLOWER_MAX, tca[TCA_OPTIONS],
+ fl_policy, NULL);
+ if (err)
+ goto errout_tb;
+
+ tmplt = kzalloc(sizeof(*tmplt), GFP_KERNEL);
+ if (!tmplt) {
+ err = -ENOMEM;
+ goto errout_tb;
+ }
+ tmplt->chain = chain;
+ err = fl_set_key(net, tb, &tmplt->dummy_key, &tmplt->mask, extack);
+ if (err)
+ goto errout_tmplt;
+ kfree(tb);
+
+ fl_init_dissector(&tmplt->dissector, &tmplt->mask);
+
+ fl_hw_create_tmplt(chain, tmplt);
+
+ return tmplt;
+
+errout_tmplt:
+ kfree(tmplt);
+errout_tb:
+ kfree(tb);
+ return ERR_PTR(err);
+}
+
+static void fl_tmplt_destroy(void *tmplt_priv)
+{
+ struct fl_flow_tmplt *tmplt = tmplt_priv;
+
+ fl_hw_destroy_tmplt(tmplt->chain, tmplt);
+ kfree(tmplt);
}
static int fl_dump_key_val(struct sk_buff *skb,
@@ -1141,20 +1499,24 @@ static int fl_dump_key_mpls(struct sk_buff *skb,
return 0;
}
-static int fl_dump_key_ip(struct sk_buff *skb,
+static int fl_dump_key_ip(struct sk_buff *skb, bool encap,
struct flow_dissector_key_ip *key,
struct flow_dissector_key_ip *mask)
{
- if (fl_dump_key_val(skb, &key->tos, TCA_FLOWER_KEY_IP_TOS, &mask->tos,
- TCA_FLOWER_KEY_IP_TOS_MASK, sizeof(key->tos)) ||
- fl_dump_key_val(skb, &key->ttl, TCA_FLOWER_KEY_IP_TTL, &mask->ttl,
- TCA_FLOWER_KEY_IP_TTL_MASK, sizeof(key->ttl)))
+ int tos_key = encap ? TCA_FLOWER_KEY_ENC_IP_TOS : TCA_FLOWER_KEY_IP_TOS;
+ int ttl_key = encap ? TCA_FLOWER_KEY_ENC_IP_TTL : TCA_FLOWER_KEY_IP_TTL;
+ int tos_mask = encap ? TCA_FLOWER_KEY_ENC_IP_TOS_MASK : TCA_FLOWER_KEY_IP_TOS_MASK;
+ int ttl_mask = encap ? TCA_FLOWER_KEY_ENC_IP_TTL_MASK : TCA_FLOWER_KEY_IP_TTL_MASK;
+
+ if (fl_dump_key_val(skb, &key->tos, tos_key, &mask->tos, tos_mask, sizeof(key->tos)) ||
+ fl_dump_key_val(skb, &key->ttl, ttl_key, &mask->ttl, ttl_mask, sizeof(key->ttl)))
return -1;
return 0;
}
static int fl_dump_key_vlan(struct sk_buff *skb,
+ int vlan_id_key, int vlan_prio_key,
struct flow_dissector_key_vlan *vlan_key,
struct flow_dissector_key_vlan *vlan_mask)
{
@@ -1163,13 +1525,13 @@ static int fl_dump_key_vlan(struct sk_buff *skb,
if (!memchr_inv(vlan_mask, 0, sizeof(*vlan_mask)))
return 0;
if (vlan_mask->vlan_id) {
- err = nla_put_u16(skb, TCA_FLOWER_KEY_VLAN_ID,
+ err = nla_put_u16(skb, vlan_id_key,
vlan_key->vlan_id);
if (err)
return err;
}
if (vlan_mask->vlan_priority) {
- err = nla_put_u8(skb, TCA_FLOWER_KEY_VLAN_PRIO,
+ err = nla_put_u8(skb, vlan_prio_key,
vlan_key->vlan_priority);
if (err)
return err;
@@ -1216,29 +1578,86 @@ static int fl_dump_key_flags(struct sk_buff *skb, u32 flags_key, u32 flags_mask)
return nla_put(skb, TCA_FLOWER_KEY_FLAGS_MASK, 4, &_mask);
}
-static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh,
- struct sk_buff *skb, struct tcmsg *t)
+static int fl_dump_key_geneve_opt(struct sk_buff *skb,
+ struct flow_dissector_key_enc_opts *enc_opts)
{
- struct cls_fl_filter *f = fh;
+ struct geneve_opt *opt;
struct nlattr *nest;
- struct fl_flow_key *key, *mask;
+ int opt_off = 0;
- if (!f)
- return skb->len;
+ nest = nla_nest_start(skb, TCA_FLOWER_KEY_ENC_OPTS_GENEVE);
+ if (!nest)
+ goto nla_put_failure;
- t->tcm_handle = f->handle;
+ while (enc_opts->len > opt_off) {
+ opt = (struct geneve_opt *)&enc_opts->data[opt_off];
- nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (nla_put_be16(skb, TCA_FLOWER_KEY_ENC_OPT_GENEVE_CLASS,
+ opt->opt_class))
+ goto nla_put_failure;
+ if (nla_put_u8(skb, TCA_FLOWER_KEY_ENC_OPT_GENEVE_TYPE,
+ opt->type))
+ goto nla_put_failure;
+ if (nla_put(skb, TCA_FLOWER_KEY_ENC_OPT_GENEVE_DATA,
+ opt->length * 4, opt->opt_data))
+ goto nla_put_failure;
+
+ opt_off += sizeof(struct geneve_opt) + opt->length * 4;
+ }
+ nla_nest_end(skb, nest);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
+static int fl_dump_key_options(struct sk_buff *skb, int enc_opt_type,
+ struct flow_dissector_key_enc_opts *enc_opts)
+{
+ struct nlattr *nest;
+ int err;
+
+ if (!enc_opts->len)
+ return 0;
+
+ nest = nla_nest_start(skb, enc_opt_type);
if (!nest)
goto nla_put_failure;
- if (f->res.classid &&
- nla_put_u32(skb, TCA_FLOWER_CLASSID, f->res.classid))
+ switch (enc_opts->dst_opt_type) {
+ case TUNNEL_GENEVE_OPT:
+ err = fl_dump_key_geneve_opt(skb, enc_opts);
+ if (err)
+ goto nla_put_failure;
+ break;
+ default:
goto nla_put_failure;
+ }
+ nla_nest_end(skb, nest);
+ return 0;
- key = &f->key;
- mask = &f->mask->key;
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
+static int fl_dump_key_enc_opt(struct sk_buff *skb,
+ struct flow_dissector_key_enc_opts *key_opts,
+ struct flow_dissector_key_enc_opts *msk_opts)
+{
+ int err;
+
+ err = fl_dump_key_options(skb, TCA_FLOWER_KEY_ENC_OPTS, key_opts);
+ if (err)
+ return err;
+ return fl_dump_key_options(skb, TCA_FLOWER_KEY_ENC_OPTS_MASK, msk_opts);
+}
+
+static int fl_dump_key(struct sk_buff *skb, struct net *net,
+ struct fl_flow_key *key, struct fl_flow_key *mask)
+{
if (mask->indev_ifindex) {
struct net_device *dev;
@@ -1247,9 +1666,6 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh,
goto nla_put_failure;
}
- if (!tc_skip_hw(f->flags))
- fl_hw_update_stats(tp, f);
-
if (fl_dump_key_val(skb, key->eth.dst, TCA_FLOWER_KEY_ETH_DST,
mask->eth.dst, TCA_FLOWER_KEY_ETH_DST_MASK,
sizeof(key->eth.dst)) ||
@@ -1264,15 +1680,36 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh,
if (fl_dump_key_mpls(skb, &key->mpls, &mask->mpls))
goto nla_put_failure;
- if (fl_dump_key_vlan(skb, &key->vlan, &mask->vlan))
+ if (fl_dump_key_vlan(skb, TCA_FLOWER_KEY_VLAN_ID,
+ TCA_FLOWER_KEY_VLAN_PRIO, &key->vlan, &mask->vlan))
+ goto nla_put_failure;
+
+ if (fl_dump_key_vlan(skb, TCA_FLOWER_KEY_CVLAN_ID,
+ TCA_FLOWER_KEY_CVLAN_PRIO,
+ &key->cvlan, &mask->cvlan) ||
+ (mask->cvlan.vlan_tpid &&
+ nla_put_be16(skb, TCA_FLOWER_KEY_VLAN_ETH_TYPE,
+ key->cvlan.vlan_tpid)))
goto nla_put_failure;
+ if (mask->basic.n_proto) {
+ if (mask->cvlan.vlan_tpid) {
+ if (nla_put_be16(skb, TCA_FLOWER_KEY_CVLAN_ETH_TYPE,
+ key->basic.n_proto))
+ goto nla_put_failure;
+ } else if (mask->vlan.vlan_tpid) {
+ if (nla_put_be16(skb, TCA_FLOWER_KEY_VLAN_ETH_TYPE,
+ key->basic.n_proto))
+ goto nla_put_failure;
+ }
+ }
+
if ((key->basic.n_proto == htons(ETH_P_IP) ||
key->basic.n_proto == htons(ETH_P_IPV6)) &&
(fl_dump_key_val(skb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO,
&mask->basic.ip_proto, TCA_FLOWER_UNSPEC,
sizeof(key->basic.ip_proto)) ||
- fl_dump_key_ip(skb, &key->ip, &mask->ip)))
+ fl_dump_key_ip(skb, false, &key->ip, &mask->ip)))
goto nla_put_failure;
if (key->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS &&
@@ -1397,15 +1834,55 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh,
TCA_FLOWER_KEY_ENC_UDP_DST_PORT,
&mask->enc_tp.dst,
TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK,
- sizeof(key->enc_tp.dst)))
+ sizeof(key->enc_tp.dst)) ||
+ fl_dump_key_ip(skb, true, &key->enc_ip, &mask->enc_ip) ||
+ fl_dump_key_enc_opt(skb, &key->enc_opts, &mask->enc_opts))
goto nla_put_failure;
if (fl_dump_key_flags(skb, key->control.flags, mask->control.flags))
goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh,
+ struct sk_buff *skb, struct tcmsg *t)
+{
+ struct cls_fl_filter *f = fh;
+ struct nlattr *nest;
+ struct fl_flow_key *key, *mask;
+
+ if (!f)
+ return skb->len;
+
+ t->tcm_handle = f->handle;
+
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (!nest)
+ goto nla_put_failure;
+
+ if (f->res.classid &&
+ nla_put_u32(skb, TCA_FLOWER_CLASSID, f->res.classid))
+ goto nla_put_failure;
+
+ key = &f->key;
+ mask = &f->mask->key;
+
+ if (fl_dump_key(skb, net, key, mask))
+ goto nla_put_failure;
+
+ if (!tc_skip_hw(f->flags))
+ fl_hw_update_stats(tp, f);
+
if (f->flags && nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags))
goto nla_put_failure;
+ if (nla_put_u32(skb, TCA_FLOWER_IN_HW_COUNT, f->in_hw_count))
+ goto nla_put_failure;
+
if (tcf_exts_dump(skb, &f->exts))
goto nla_put_failure;
@@ -1421,6 +1898,31 @@ nla_put_failure:
return -1;
}
+static int fl_tmplt_dump(struct sk_buff *skb, struct net *net, void *tmplt_priv)
+{
+ struct fl_flow_tmplt *tmplt = tmplt_priv;
+ struct fl_flow_key *key, *mask;
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (!nest)
+ goto nla_put_failure;
+
+ key = &tmplt->dummy_key;
+ mask = &tmplt->mask;
+
+ if (fl_dump_key(skb, net, key, mask))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+
+ return skb->len;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
static void fl_bind_class(void *fh, u32 classid, unsigned long cl)
{
struct cls_fl_filter *f = fh;
@@ -1438,8 +1940,12 @@ static struct tcf_proto_ops cls_fl_ops __read_mostly = {
.change = fl_change,
.delete = fl_delete,
.walk = fl_walk,
+ .reoffload = fl_reoffload,
.dump = fl_dump,
.bind_class = fl_bind_class,
+ .tmplt_create = fl_tmplt_create,
+ .tmplt_destroy = fl_tmplt_destroy,
+ .tmplt_dump = fl_tmplt_dump,
.owner = THIS_MODULE,
};
diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index 47b207ef7762..856fa79d4ffd 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c
@@ -21,6 +21,7 @@ struct cls_mall_head {
struct tcf_result res;
u32 handle;
u32 flags;
+ unsigned int in_hw_count;
struct rcu_work rwork;
};
@@ -95,6 +96,7 @@ static int mall_replace_hw_filter(struct tcf_proto *tp,
mall_destroy_hw_filter(tp, head, cookie, NULL);
return err;
} else if (err > 0) {
+ head->in_hw_count = err;
tcf_block_offload_inc(block, &head->flags);
}
@@ -111,6 +113,8 @@ static void mall_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
if (!head)
return;
+ tcf_unbind_filter(tp, &head->res);
+
if (!tc_skip_hw(head->flags))
mall_destroy_hw_filter(tp, head, (unsigned long) head, extack);
@@ -235,6 +239,35 @@ skip:
arg->count++;
}
+static int mall_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
+ void *cb_priv, struct netlink_ext_ack *extack)
+{
+ struct cls_mall_head *head = rtnl_dereference(tp->root);
+ struct tc_cls_matchall_offload cls_mall = {};
+ struct tcf_block *block = tp->chain->block;
+ int err;
+
+ if (tc_skip_hw(head->flags))
+ return 0;
+
+ tc_cls_common_offload_init(&cls_mall.common, tp, head->flags, extack);
+ cls_mall.command = add ?
+ TC_CLSMATCHALL_REPLACE : TC_CLSMATCHALL_DESTROY;
+ cls_mall.exts = &head->exts;
+ cls_mall.cookie = (unsigned long)head;
+
+ err = cb(TC_SETUP_CLSMATCHALL, &cls_mall, cb_priv);
+ if (err) {
+ if (add && tc_skip_sw(head->flags))
+ return err;
+ return 0;
+ }
+
+ tc_cls_offload_cnt_update(block, &head->in_hw_count, &head->flags, add);
+
+ return 0;
+}
+
static int mall_dump(struct net *net, struct tcf_proto *tp, void *fh,
struct sk_buff *skb, struct tcmsg *t)
{
@@ -289,6 +322,7 @@ static struct tcf_proto_ops cls_mall_ops __read_mostly = {
.change = mall_change,
.delete = mall_delete,
.walk = mall_walk,
+ .reoffload = mall_reoffload,
.dump = mall_dump,
.bind_class = mall_bind_class,
.owner = THIS_MODULE,
diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c
index 32f4bbd82f35..9ccc93f257db 100644
--- a/net/sched/cls_tcindex.c
+++ b/net/sched/cls_tcindex.c
@@ -447,11 +447,6 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
tcf_bind_filter(tp, &cr.res, base);
}
- if (old_r)
- tcf_exts_change(&r->exts, &e);
- else
- tcf_exts_change(&cr.exts, &e);
-
if (old_r && old_r != r) {
err = tcindex_filter_result_init(old_r);
if (err < 0) {
@@ -462,12 +457,15 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
oldp = p;
r->res = cr.res;
+ tcf_exts_change(&r->exts, &e);
+
rcu_assign_pointer(tp->root, cp);
if (r == &new_filter_result) {
struct tcindex_filter *nfp;
struct tcindex_filter __rcu **fp;
+ f->result.res = r->res;
tcf_exts_change(&f->result.exts, &r->exts);
fp = cp->h + (handle % cp->hash);
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index fb861f90fde6..4b28fd44576d 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -62,12 +62,12 @@ struct tc_u_knode {
struct tc_u32_pcnt __percpu *pf;
#endif
u32 flags;
+ unsigned int in_hw_count;
#ifdef CONFIG_CLS_U32_MARK
u32 val;
u32 mask;
u32 __percpu *pcpu_success;
#endif
- struct tcf_proto *tp;
struct rcu_work rwork;
/* The 'sel' field MUST be the last field in structure to allow for
* tc_u32_keys allocated at end of structure.
@@ -79,10 +79,10 @@ struct tc_u_hnode {
struct tc_u_hnode __rcu *next;
u32 handle;
u32 prio;
- struct tc_u_common *tp_c;
int refcnt;
unsigned int divisor;
struct idr handle_idr;
+ bool is_root;
struct rcu_head rcu;
u32 flags;
/* The 'ht' field MUST be the last field in structure to allow for
@@ -97,7 +97,7 @@ struct tc_u_common {
int refcnt;
struct idr handle_idr;
struct hlist_node hnode;
- struct rcu_head rcu;
+ long knodes;
};
static inline unsigned int u32_hash_fold(__be32 key,
@@ -343,19 +343,16 @@ static void *tc_u_common_ptr(const struct tcf_proto *tp)
return block->q;
}
-static unsigned int tc_u_hash(const struct tcf_proto *tp)
+static struct hlist_head *tc_u_hash(void *key)
{
- return hash_ptr(tc_u_common_ptr(tp), U32_HASH_SHIFT);
+ return tc_u_common_hash + hash_ptr(key, U32_HASH_SHIFT);
}
-static struct tc_u_common *tc_u_common_find(const struct tcf_proto *tp)
+static struct tc_u_common *tc_u_common_find(void *key)
{
struct tc_u_common *tc;
- unsigned int h;
-
- h = tc_u_hash(tp);
- hlist_for_each_entry(tc, &tc_u_common_hash[h], hnode) {
- if (tc->ptr == tc_u_common_ptr(tp))
+ hlist_for_each_entry(tc, tc_u_hash(key), hnode) {
+ if (tc->ptr == key)
return tc;
}
return NULL;
@@ -364,10 +361,8 @@ static struct tc_u_common *tc_u_common_find(const struct tcf_proto *tp)
static int u32_init(struct tcf_proto *tp)
{
struct tc_u_hnode *root_ht;
- struct tc_u_common *tp_c;
- unsigned int h;
-
- tp_c = tc_u_common_find(tp);
+ void *key = tc_u_common_ptr(tp);
+ struct tc_u_common *tp_c = tc_u_common_find(key);
root_ht = kzalloc(sizeof(*root_ht), GFP_KERNEL);
if (root_ht == NULL)
@@ -376,6 +371,7 @@ static int u32_init(struct tcf_proto *tp)
root_ht->refcnt++;
root_ht->handle = tp_c ? gen_new_htid(tp_c, root_ht) : 0x80000000;
root_ht->prio = tp->prio;
+ root_ht->is_root = true;
idr_init(&root_ht->handle_idr);
if (tp_c == NULL) {
@@ -384,26 +380,24 @@ static int u32_init(struct tcf_proto *tp)
kfree(root_ht);
return -ENOBUFS;
}
- tp_c->ptr = tc_u_common_ptr(tp);
+ tp_c->ptr = key;
INIT_HLIST_NODE(&tp_c->hnode);
idr_init(&tp_c->handle_idr);
- h = tc_u_hash(tp);
- hlist_add_head(&tp_c->hnode, &tc_u_common_hash[h]);
+ hlist_add_head(&tp_c->hnode, tc_u_hash(key));
}
tp_c->refcnt++;
RCU_INIT_POINTER(root_ht->next, tp_c->hlist);
rcu_assign_pointer(tp_c->hlist, root_ht);
- root_ht->tp_c = tp_c;
+ root_ht->refcnt++;
rcu_assign_pointer(tp->root, root_ht);
tp->data = tp_c;
return 0;
}
-static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n,
- bool free_pf)
+static int u32_destroy_key(struct tc_u_knode *n, bool free_pf)
{
struct tc_u_hnode *ht = rtnl_dereference(n->ht_down);
@@ -437,7 +431,7 @@ static void u32_delete_key_work(struct work_struct *work)
struct tc_u_knode,
rwork);
rtnl_lock();
- u32_destroy_key(key->tp, key, false);
+ u32_destroy_key(key, false);
rtnl_unlock();
}
@@ -454,12 +448,13 @@ static void u32_delete_key_freepf_work(struct work_struct *work)
struct tc_u_knode,
rwork);
rtnl_lock();
- u32_destroy_key(key->tp, key, true);
+ u32_destroy_key(key, true);
rtnl_unlock();
}
static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key)
{
+ struct tc_u_common *tp_c = tp->data;
struct tc_u_knode __rcu **kp;
struct tc_u_knode *pkp;
struct tc_u_hnode *ht = rtnl_dereference(key->ht_up);
@@ -470,6 +465,7 @@ static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key)
kp = &pkp->next, pkp = rtnl_dereference(*kp)) {
if (pkp == key) {
RCU_INIT_POINTER(*kp, key->next);
+ tp_c->knodes--;
tcf_unbind_filter(tp, &key->res);
idr_remove(&ht->handle_idr, key->handle);
@@ -571,6 +567,7 @@ static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n,
u32_remove_hw_knode(tp, n, NULL);
return err;
} else if (err > 0) {
+ n->in_hw_count = err;
tcf_block_offload_inc(block, &n->flags);
}
@@ -583,6 +580,7 @@ static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n,
static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht,
struct netlink_ext_ack *extack)
{
+ struct tc_u_common *tp_c = tp->data;
struct tc_u_knode *n;
unsigned int h;
@@ -590,13 +588,14 @@ static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht,
while ((n = rtnl_dereference(ht->ht[h])) != NULL) {
RCU_INIT_POINTER(ht->ht[h],
rtnl_dereference(n->next));
+ tp_c->knodes--;
tcf_unbind_filter(tp, &n->res);
u32_remove_hw_knode(tp, n, extack);
idr_remove(&ht->handle_idr, n->handle);
if (tcf_exts_get_net(&n->exts))
tcf_queue_work(&n->rwork, u32_delete_key_freepf_work);
else
- u32_destroy_key(n->tp, n, true);
+ u32_destroy_key(n, true);
}
}
}
@@ -608,7 +607,7 @@ static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht,
struct tc_u_hnode __rcu **hn;
struct tc_u_hnode *phn;
- WARN_ON(ht->refcnt);
+ WARN_ON(--ht->refcnt);
u32_clear_hnode(tp, ht, extack);
@@ -629,17 +628,6 @@ static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht,
return -ENOENT;
}
-static bool ht_empty(struct tc_u_hnode *ht)
-{
- unsigned int h;
-
- for (h = 0; h <= ht->divisor; h++)
- if (rcu_access_pointer(ht->ht[h]))
- return false;
-
- return true;
-}
-
static void u32_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
{
struct tc_u_common *tp_c = tp->data;
@@ -647,7 +635,7 @@ static void u32_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
WARN_ON(root_ht == NULL);
- if (root_ht && --root_ht->refcnt == 0)
+ if (root_ht && --root_ht->refcnt == 1)
u32_destroy_hnode(tp, root_ht, extack);
if (--tp_c->refcnt == 0) {
@@ -677,26 +665,21 @@ static int u32_delete(struct tcf_proto *tp, void *arg, bool *last,
struct netlink_ext_ack *extack)
{
struct tc_u_hnode *ht = arg;
- struct tc_u_hnode *root_ht = rtnl_dereference(tp->root);
struct tc_u_common *tp_c = tp->data;
int ret = 0;
- if (ht == NULL)
- goto out;
-
if (TC_U32_KEY(ht->handle)) {
u32_remove_hw_knode(tp, (struct tc_u_knode *)ht, extack);
ret = u32_delete_key(tp, (struct tc_u_knode *)ht);
goto out;
}
- if (root_ht == ht) {
+ if (ht->is_root) {
NL_SET_ERR_MSG_MOD(extack, "Not allowed to delete root node");
return -EINVAL;
}
if (ht->refcnt == 1) {
- ht->refcnt--;
u32_destroy_hnode(tp, ht, extack);
} else {
NL_SET_ERR_MSG_MOD(extack, "Can not delete in-use filter");
@@ -704,38 +687,7 @@ static int u32_delete(struct tcf_proto *tp, void *arg, bool *last,
}
out:
- *last = true;
- if (root_ht) {
- if (root_ht->refcnt > 1) {
- *last = false;
- goto ret;
- }
- if (root_ht->refcnt == 1) {
- if (!ht_empty(root_ht)) {
- *last = false;
- goto ret;
- }
- }
- }
-
- if (tp_c->refcnt > 1) {
- *last = false;
- goto ret;
- }
-
- if (tp_c->refcnt == 1) {
- struct tc_u_hnode *ht;
-
- for (ht = rtnl_dereference(tp_c->hlist);
- ht;
- ht = rtnl_dereference(ht->next))
- if (!ht_empty(ht)) {
- *last = false;
- break;
- }
- }
-
-ret:
+ *last = tp_c->refcnt == 1 && tp_c->knodes == 0;
return ret;
}
@@ -766,7 +718,7 @@ static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = {
};
static int u32_set_parms(struct net *net, struct tcf_proto *tp,
- unsigned long base, struct tc_u_hnode *ht,
+ unsigned long base,
struct tc_u_knode *n, struct nlattr **tb,
struct nlattr *est, bool ovr,
struct netlink_ext_ack *extack)
@@ -787,12 +739,16 @@ static int u32_set_parms(struct net *net, struct tcf_proto *tp,
}
if (handle) {
- ht_down = u32_lookup_ht(ht->tp_c, handle);
+ ht_down = u32_lookup_ht(tp->data, handle);
if (!ht_down) {
NL_SET_ERR_MSG_MOD(extack, "Link hash table not found");
return -EINVAL;
}
+ if (ht_down->is_root) {
+ NL_SET_ERR_MSG_MOD(extack, "Not linking to root node");
+ return -EINVAL;
+ }
ht_down->refcnt++;
}
@@ -889,7 +845,6 @@ static struct tc_u_knode *u32_init_knode(struct tcf_proto *tp,
/* Similarly success statistics must be moved as pointers */
new->pcpu_success = n->pcpu_success;
#endif
- new->tp = tp;
memcpy(&new->sel, s, sizeof(*s) + s->nkeys*sizeof(struct tc_u32_key));
if (tcf_exts_init(&new->exts, TCA_U32_ACT, TCA_U32_POLICE)) {
@@ -912,6 +867,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
struct nlattr *opt = tca[TCA_OPTIONS];
struct nlattr *tb[TCA_U32_MAX + 1];
u32 htid, flags = 0;
+ size_t sel_size;
int err;
#ifdef CONFIG_CLS_U32_PERF
size_t size;
@@ -957,18 +913,17 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
if (!new)
return -ENOMEM;
- err = u32_set_parms(net, tp, base,
- rtnl_dereference(n->ht_up), new, tb,
+ err = u32_set_parms(net, tp, base, new, tb,
tca[TCA_RATE], ovr, extack);
if (err) {
- u32_destroy_key(tp, new, false);
+ u32_destroy_key(new, false);
return err;
}
err = u32_replace_hw_knode(tp, new, flags, extack);
if (err) {
- u32_destroy_key(tp, new, false);
+ u32_destroy_key(new, false);
return err;
}
@@ -985,7 +940,11 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
if (tb[TCA_U32_DIVISOR]) {
unsigned int divisor = nla_get_u32(tb[TCA_U32_DIVISOR]);
- if (--divisor > 0x100) {
+ if (!is_power_of_2(divisor)) {
+ NL_SET_ERR_MSG_MOD(extack, "Divisor is not a power of 2");
+ return -EINVAL;
+ }
+ if (divisor-- > 0x100) {
NL_SET_ERR_MSG_MOD(extack, "Exceeded maximum 256 hash buckets");
return -EINVAL;
}
@@ -1010,7 +969,6 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
return err;
}
}
- ht->tp_c = tp_c;
ht->refcnt = 1;
ht->divisor = divisor;
ht->handle = handle;
@@ -1074,8 +1032,13 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
}
s = nla_data(tb[TCA_U32_SEL]);
+ sel_size = struct_size(s, keys, s->nkeys);
+ if (nla_len(tb[TCA_U32_SEL]) < sel_size) {
+ err = -EINVAL;
+ goto erridr;
+ }
- n = kzalloc(sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key), GFP_KERNEL);
+ n = kzalloc(offsetof(typeof(*n), sel) + sel_size, GFP_KERNEL);
if (n == NULL) {
err = -ENOBUFS;
goto erridr;
@@ -1090,12 +1053,11 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
}
#endif
- memcpy(&n->sel, s, sizeof(*s) + s->nkeys*sizeof(struct tc_u32_key));
+ memcpy(&n->sel, s, sel_size);
RCU_INIT_POINTER(n->ht_up, ht);
n->handle = handle;
n->fshift = s->hmask ? ffs(ntohl(s->hmask)) - 1 : 0;
n->flags = flags;
- n->tp = tp;
err = tcf_exts_init(&n->exts, TCA_U32_ACT, TCA_U32_POLICE);
if (err < 0)
@@ -1117,7 +1079,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
}
#endif
- err = u32_set_parms(net, tp, base, ht, n, tb, tca[TCA_RATE], ovr,
+ err = u32_set_parms(net, tp, base, n, tb, tca[TCA_RATE], ovr,
extack);
if (err == 0) {
struct tc_u_knode __rcu **ins;
@@ -1138,6 +1100,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
RCU_INIT_POINTER(n->next, pins);
rcu_assign_pointer(*ins, n);
+ tp_c->knodes++;
*arg = n;
return 0;
}
@@ -1199,6 +1162,114 @@ static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg)
}
}
+static int u32_reoffload_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht,
+ bool add, tc_setup_cb_t *cb, void *cb_priv,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_cls_u32_offload cls_u32 = {};
+ int err;
+
+ tc_cls_common_offload_init(&cls_u32.common, tp, ht->flags, extack);
+ cls_u32.command = add ? TC_CLSU32_NEW_HNODE : TC_CLSU32_DELETE_HNODE;
+ cls_u32.hnode.divisor = ht->divisor;
+ cls_u32.hnode.handle = ht->handle;
+ cls_u32.hnode.prio = ht->prio;
+
+ err = cb(TC_SETUP_CLSU32, &cls_u32, cb_priv);
+ if (err && add && tc_skip_sw(ht->flags))
+ return err;
+
+ return 0;
+}
+
+static int u32_reoffload_knode(struct tcf_proto *tp, struct tc_u_knode *n,
+ bool add, tc_setup_cb_t *cb, void *cb_priv,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_u_hnode *ht = rtnl_dereference(n->ht_down);
+ struct tcf_block *block = tp->chain->block;
+ struct tc_cls_u32_offload cls_u32 = {};
+ int err;
+
+ tc_cls_common_offload_init(&cls_u32.common, tp, n->flags, extack);
+ cls_u32.command = add ?
+ TC_CLSU32_REPLACE_KNODE : TC_CLSU32_DELETE_KNODE;
+ cls_u32.knode.handle = n->handle;
+
+ if (add) {
+ cls_u32.knode.fshift = n->fshift;
+#ifdef CONFIG_CLS_U32_MARK
+ cls_u32.knode.val = n->val;
+ cls_u32.knode.mask = n->mask;
+#else
+ cls_u32.knode.val = 0;
+ cls_u32.knode.mask = 0;
+#endif
+ cls_u32.knode.sel = &n->sel;
+ cls_u32.knode.exts = &n->exts;
+ if (n->ht_down)
+ cls_u32.knode.link_handle = ht->handle;
+ }
+
+ err = cb(TC_SETUP_CLSU32, &cls_u32, cb_priv);
+ if (err) {
+ if (add && tc_skip_sw(n->flags))
+ return err;
+ return 0;
+ }
+
+ tc_cls_offload_cnt_update(block, &n->in_hw_count, &n->flags, add);
+
+ return 0;
+}
+
+static int u32_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
+ void *cb_priv, struct netlink_ext_ack *extack)
+{
+ struct tc_u_common *tp_c = tp->data;
+ struct tc_u_hnode *ht;
+ struct tc_u_knode *n;
+ unsigned int h;
+ int err;
+
+ for (ht = rtnl_dereference(tp_c->hlist);
+ ht;
+ ht = rtnl_dereference(ht->next)) {
+ if (ht->prio != tp->prio)
+ continue;
+
+ /* When adding filters to a new dev, try to offload the
+ * hashtable first. When removing, do the filters before the
+ * hashtable.
+ */
+ if (add && !tc_skip_hw(ht->flags)) {
+ err = u32_reoffload_hnode(tp, ht, add, cb, cb_priv,
+ extack);
+ if (err)
+ return err;
+ }
+
+ for (h = 0; h <= ht->divisor; h++) {
+ for (n = rtnl_dereference(ht->ht[h]);
+ n;
+ n = rtnl_dereference(n->next)) {
+ if (tc_skip_hw(n->flags))
+ continue;
+
+ err = u32_reoffload_knode(tp, n, add, cb,
+ cb_priv, extack);
+ if (err)
+ return err;
+ }
+ }
+
+ if (!add && !tc_skip_hw(ht->flags))
+ u32_reoffload_hnode(tp, ht, add, cb, cb_priv, extack);
+ }
+
+ return 0;
+}
+
static void u32_bind_class(void *fh, u32 classid, unsigned long cl)
{
struct tc_u_knode *n = fh;
@@ -1336,6 +1407,7 @@ static struct tcf_proto_ops cls_u32_ops __read_mostly = {
.change = u32_change,
.delete = u32_delete,
.walk = u32_walk,
+ .reoffload = u32_reoffload,
.dump = u32_dump,
.bind_class = u32_bind_class,
.owner = THIS_MODULE,
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 54eca685420f..022bca98bde6 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -27,7 +27,6 @@
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/hrtimer.h>
-#include <linux/lockdep.h>
#include <linux/slab.h>
#include <linux/hashtable.h>
@@ -315,6 +314,24 @@ out:
return q;
}
+struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
+{
+ struct netdev_queue *nq;
+ struct Qdisc *q;
+
+ if (!handle)
+ return NULL;
+ q = qdisc_match_from_root(dev->qdisc, handle);
+ if (q)
+ goto out;
+
+ nq = dev_ingress_queue_rcu(dev);
+ if (nq)
+ q = qdisc_match_from_root(nq->qdisc_sleeping, handle);
+out:
+ return q;
+}
+
static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
{
unsigned long cl;
@@ -596,12 +613,19 @@ static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
-void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
+void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
+ clockid_t clockid)
{
- hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
+ hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
wd->timer.function = qdisc_watchdog;
wd->qdisc = qdisc;
}
+EXPORT_SYMBOL(qdisc_watchdog_init_clockid);
+
+void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
+{
+ qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
+}
EXPORT_SYMBOL(qdisc_watchdog_init);
void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
@@ -914,7 +938,7 @@ static void notify_and_destroy(struct net *net, struct sk_buff *skb,
qdisc_notify(net, skb, n, clid, old, new);
if (old)
- qdisc_destroy(old);
+ qdisc_put(old);
}
/* Graft qdisc "new" to class "classid" of qdisc "parent" or
@@ -967,7 +991,7 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
qdisc_refcount_inc(new);
if (!ingress)
- qdisc_destroy(old);
+ qdisc_put(old);
}
skip:
@@ -1046,10 +1070,6 @@ static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
return 0;
}
-/* lockdep annotation is needed for ingress; egress gets it only for name */
-static struct lock_class_key qdisc_tx_lock;
-static struct lock_class_key qdisc_rx_lock;
-
/*
Allocate and initialize new qdisc.
@@ -1114,7 +1134,6 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
if (handle == TC_H_INGRESS) {
sch->flags |= TCQ_F_INGRESS;
handle = TC_H_MAKE(TC_H_INGRESS, 0);
- lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
} else {
if (handle == 0) {
handle = qdisc_alloc_handle(dev);
@@ -1122,7 +1141,6 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
if (handle == 0)
goto err_out3;
}
- lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
if (!netif_is_multiqueue(dev))
sch->flags |= TCQ_F_ONETXQUEUE;
}
@@ -1300,6 +1318,18 @@ check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
return 0;
}
+const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
+ [TCA_KIND] = { .type = NLA_STRING },
+ [TCA_OPTIONS] = { .type = NLA_NESTED },
+ [TCA_RATE] = { .type = NLA_BINARY,
+ .len = sizeof(struct tc_estimator) },
+ [TCA_STAB] = { .type = NLA_NESTED },
+ [TCA_DUMP_INVISIBLE] = { .type = NLA_FLAG },
+ [TCA_CHAIN] = { .type = NLA_U32 },
+ [TCA_INGRESS_BLOCK] = { .type = NLA_U32 },
+ [TCA_EGRESS_BLOCK] = { .type = NLA_U32 },
+};
+
/*
* Delete/get qdisc.
*/
@@ -1320,7 +1350,8 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
return -EPERM;
- err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL, extack);
+ err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy,
+ extack);
if (err < 0)
return err;
@@ -1404,7 +1435,8 @@ static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
replay:
/* Reinit, just in case something touches this. */
- err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL, extack);
+ err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy,
+ extack);
if (err < 0)
return err;
@@ -1561,7 +1593,7 @@ graft:
err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
if (err) {
if (q)
- qdisc_destroy(q);
+ qdisc_put(q);
return err;
}
@@ -1638,7 +1670,8 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
idx = 0;
ASSERT_RTNL();
- err = nlmsg_parse(nlh, sizeof(struct tcmsg), tca, TCA_MAX, NULL, NULL);
+ err = nlmsg_parse(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
+ rtm_tca_policy, cb->extack);
if (err < 0)
return err;
@@ -1857,7 +1890,8 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
return -EPERM;
- err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL, extack);
+ err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy,
+ extack);
if (err < 0)
return err;
@@ -2036,7 +2070,8 @@ static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
if (tcm->tcm_parent) {
q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
- if (q && tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
+ if (q && q != root &&
+ tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
return -1;
return 0;
}
diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
index cd49afca9617..d714d3747bcb 100644
--- a/net/sched/sch_atm.c
+++ b/net/sched/sch_atm.c
@@ -150,7 +150,7 @@ static void atm_tc_put(struct Qdisc *sch, unsigned long cl)
pr_debug("atm_tc_put: destroying\n");
list_del_init(&flow->list);
pr_debug("atm_tc_put: qdisc %p\n", flow->q);
- qdisc_destroy(flow->q);
+ qdisc_put(flow->q);
tcf_block_put(flow->block);
if (flow->sock) {
pr_debug("atm_tc_put: f_count %ld\n",
diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c
new file mode 100644
index 000000000000..b910cd5c56f7
--- /dev/null
+++ b/net/sched/sch_cake.c
@@ -0,0 +1,3034 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+
+/* COMMON Applications Kept Enhanced (CAKE) discipline
+ *
+ * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com>
+ * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk>
+ * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com>
+ * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de>
+ * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk>
+ * Copyright (C) 2017-2018 Ryan Mounce <ryan@mounce.com.au>
+ *
+ * The CAKE Principles:
+ * (or, how to have your cake and eat it too)
+ *
+ * This is a combination of several shaping, AQM and FQ techniques into one
+ * easy-to-use package:
+ *
+ * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE
+ * equipment and bloated MACs. This operates in deficit mode (as in sch_fq),
+ * eliminating the need for any sort of burst parameter (eg. token bucket
+ * depth). Burst support is limited to that necessary to overcome scheduling
+ * latency.
+ *
+ * - A Diffserv-aware priority queue, giving more priority to certain classes,
+ * up to a specified fraction of bandwidth. Above that bandwidth threshold,
+ * the priority is reduced to avoid starving other tins.
+ *
+ * - Each priority tin has a separate Flow Queue system, to isolate traffic
+ * flows from each other. This prevents a burst on one flow from increasing
+ * the delay to another. Flows are distributed to queues using a
+ * set-associative hash function.
+ *
+ * - Each queue is actively managed by Cobalt, which is a combination of the
+ * Codel and Blue AQM algorithms. This serves flows fairly, and signals
+ * congestion early via ECN (if available) and/or packet drops, to keep
+ * latency low. The codel parameters are auto-tuned based on the bandwidth
+ * setting, as is necessary at low bandwidths.
+ *
+ * The configuration parameters are kept deliberately simple for ease of use.
+ * Everything has sane defaults. Complete generality of configuration is *not*
+ * a goal.
+ *
+ * The priority queue operates according to a weighted DRR scheme, combined with
+ * a bandwidth tracker which reuses the shaper logic to detect which side of the
+ * bandwidth sharing threshold the tin is operating. This determines whether a
+ * priority-based weight (high) or a bandwidth-based weight (low) is used for
+ * that tin in the current pass.
+ *
+ * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly
+ * granted us permission to leverage.
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/string.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/jhash.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/reciprocal_div.h>
+#include <net/netlink.h>
+#include <linux/if_vlan.h>
+#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
+#include <net/tcp.h>
+#include <net/flow_dissector.h>
+
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+#include <net/netfilter/nf_conntrack_core.h>
+#endif
+
+#define CAKE_SET_WAYS (8)
+#define CAKE_MAX_TINS (8)
+#define CAKE_QUEUES (1024)
+#define CAKE_FLOW_MASK 63
+#define CAKE_FLOW_NAT_FLAG 64
+
+/* struct cobalt_params - contains codel and blue parameters
+ * @interval: codel initial drop rate
+ * @target: maximum persistent sojourn time & blue update rate
+ * @mtu_time: serialisation delay of maximum-size packet
+ * @p_inc: increment of blue drop probability (0.32 fxp)
+ * @p_dec: decrement of blue drop probability (0.32 fxp)
+ */
+struct cobalt_params {
+ u64 interval;
+ u64 target;
+ u64 mtu_time;
+ u32 p_inc;
+ u32 p_dec;
+};
+
+/* struct cobalt_vars - contains codel and blue variables
+ * @count: codel dropping frequency
+ * @rec_inv_sqrt: reciprocal value of sqrt(count) >> 1
+ * @drop_next: time to drop next packet, or when we dropped last
+ * @blue_timer: Blue time to next drop
+ * @p_drop: BLUE drop probability (0.32 fxp)
+ * @dropping: set if in dropping state
+ * @ecn_marked: set if marked
+ */
+struct cobalt_vars {
+ u32 count;
+ u32 rec_inv_sqrt;
+ ktime_t drop_next;
+ ktime_t blue_timer;
+ u32 p_drop;
+ bool dropping;
+ bool ecn_marked;
+};
+
+enum {
+ CAKE_SET_NONE = 0,
+ CAKE_SET_SPARSE,
+ CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */
+ CAKE_SET_BULK,
+ CAKE_SET_DECAYING
+};
+
+struct cake_flow {
+ /* this stuff is all needed per-flow at dequeue time */
+ struct sk_buff *head;
+ struct sk_buff *tail;
+ struct list_head flowchain;
+ s32 deficit;
+ u32 dropped;
+ struct cobalt_vars cvars;
+ u16 srchost; /* index into cake_host table */
+ u16 dsthost;
+ u8 set;
+}; /* please try to keep this structure <= 64 bytes */
+
+struct cake_host {
+ u32 srchost_tag;
+ u32 dsthost_tag;
+ u16 srchost_refcnt;
+ u16 dsthost_refcnt;
+};
+
+struct cake_heap_entry {
+ u16 t:3, b:10;
+};
+
+struct cake_tin_data {
+ struct cake_flow flows[CAKE_QUEUES];
+ u32 backlogs[CAKE_QUEUES];
+ u32 tags[CAKE_QUEUES]; /* for set association */
+ u16 overflow_idx[CAKE_QUEUES];
+ struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */
+ u16 flow_quantum;
+
+ struct cobalt_params cparams;
+ u32 drop_overlimit;
+ u16 bulk_flow_count;
+ u16 sparse_flow_count;
+ u16 decaying_flow_count;
+ u16 unresponsive_flow_count;
+
+ u32 max_skblen;
+
+ struct list_head new_flows;
+ struct list_head old_flows;
+ struct list_head decaying_flows;
+
+ /* time_next = time_this + ((len * rate_ns) >> rate_shft) */
+ ktime_t time_next_packet;
+ u64 tin_rate_ns;
+ u64 tin_rate_bps;
+ u16 tin_rate_shft;
+
+ u16 tin_quantum_prio;
+ u16 tin_quantum_band;
+ s32 tin_deficit;
+ u32 tin_backlog;
+ u32 tin_dropped;
+ u32 tin_ecn_mark;
+
+ u32 packets;
+ u64 bytes;
+
+ u32 ack_drops;
+
+ /* moving averages */
+ u64 avge_delay;
+ u64 peak_delay;
+ u64 base_delay;
+
+ /* hash function stats */
+ u32 way_directs;
+ u32 way_hits;
+ u32 way_misses;
+ u32 way_collisions;
+}; /* number of tins is small, so size of this struct doesn't matter much */
+
+struct cake_sched_data {
+ struct tcf_proto __rcu *filter_list; /* optional external classifier */
+ struct tcf_block *block;
+ struct cake_tin_data *tins;
+
+ struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS];
+ u16 overflow_timeout;
+
+ u16 tin_cnt;
+ u8 tin_mode;
+ u8 flow_mode;
+ u8 ack_filter;
+ u8 atm_mode;
+
+ /* time_next = time_this + ((len * rate_ns) >> rate_shft) */
+ u16 rate_shft;
+ ktime_t time_next_packet;
+ ktime_t failsafe_next_packet;
+ u64 rate_ns;
+ u64 rate_bps;
+ u16 rate_flags;
+ s16 rate_overhead;
+ u16 rate_mpu;
+ u64 interval;
+ u64 target;
+
+ /* resource tracking */
+ u32 buffer_used;
+ u32 buffer_max_used;
+ u32 buffer_limit;
+ u32 buffer_config_limit;
+
+ /* indices for dequeue */
+ u16 cur_tin;
+ u16 cur_flow;
+
+ struct qdisc_watchdog watchdog;
+ const u8 *tin_index;
+ const u8 *tin_order;
+
+ /* bandwidth capacity estimate */
+ ktime_t last_packet_time;
+ ktime_t avg_window_begin;
+ u64 avg_packet_interval;
+ u64 avg_window_bytes;
+ u64 avg_peak_bandwidth;
+ ktime_t last_reconfig_time;
+
+ /* packet length stats */
+ u32 avg_netoff;
+ u16 max_netlen;
+ u16 max_adjlen;
+ u16 min_netlen;
+ u16 min_adjlen;
+};
+
+enum {
+ CAKE_FLAG_OVERHEAD = BIT(0),
+ CAKE_FLAG_AUTORATE_INGRESS = BIT(1),
+ CAKE_FLAG_INGRESS = BIT(2),
+ CAKE_FLAG_WASH = BIT(3),
+ CAKE_FLAG_SPLIT_GSO = BIT(4)
+};
+
+/* COBALT operates the Codel and BLUE algorithms in parallel, in order to
+ * obtain the best features of each. Codel is excellent on flows which
+ * respond to congestion signals in a TCP-like way. BLUE is more effective on
+ * unresponsive flows.
+ */
+
+struct cobalt_skb_cb {
+ ktime_t enqueue_time;
+ u32 adjusted_len;
+};
+
+static u64 us_to_ns(u64 us)
+{
+ return us * NSEC_PER_USEC;
+}
+
+static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb)
+{
+ qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb));
+ return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data;
+}
+
+static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb)
+{
+ return get_cobalt_cb(skb)->enqueue_time;
+}
+
+static void cobalt_set_enqueue_time(struct sk_buff *skb,
+ ktime_t now)
+{
+ get_cobalt_cb(skb)->enqueue_time = now;
+}
+
+static u16 quantum_div[CAKE_QUEUES + 1] = {0};
+
+/* Diffserv lookup tables */
+
+static const u8 precedence[] = {
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ 5, 5, 5, 5, 5, 5, 5, 5,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+};
+
+static const u8 diffserv8[] = {
+ 2, 5, 1, 2, 4, 2, 2, 2,
+ 0, 2, 1, 2, 1, 2, 1, 2,
+ 5, 2, 4, 2, 4, 2, 4, 2,
+ 3, 2, 3, 2, 3, 2, 3, 2,
+ 6, 2, 3, 2, 3, 2, 3, 2,
+ 6, 2, 2, 2, 6, 2, 6, 2,
+ 7, 2, 2, 2, 2, 2, 2, 2,
+ 7, 2, 2, 2, 2, 2, 2, 2,
+};
+
+static const u8 diffserv4[] = {
+ 0, 2, 0, 0, 2, 0, 0, 0,
+ 1, 0, 0, 0, 0, 0, 0, 0,
+ 2, 0, 2, 0, 2, 0, 2, 0,
+ 2, 0, 2, 0, 2, 0, 2, 0,
+ 3, 0, 2, 0, 2, 0, 2, 0,
+ 3, 0, 0, 0, 3, 0, 3, 0,
+ 3, 0, 0, 0, 0, 0, 0, 0,
+ 3, 0, 0, 0, 0, 0, 0, 0,
+};
+
+static const u8 diffserv3[] = {
+ 0, 0, 0, 0, 2, 0, 0, 0,
+ 1, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 2, 0, 2, 0,
+ 2, 0, 0, 0, 0, 0, 0, 0,
+ 2, 0, 0, 0, 0, 0, 0, 0,
+};
+
+static const u8 besteffort[] = {
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+};
+
+/* tin priority order for stats dumping */
+
+static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7};
+static const u8 bulk_order[] = {1, 0, 2, 3};
+
+#define REC_INV_SQRT_CACHE (16)
+static u32 cobalt_rec_inv_sqrt_cache[REC_INV_SQRT_CACHE] = {0};
+
+/* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots
+ * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
+ *
+ * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
+ */
+
+static void cobalt_newton_step(struct cobalt_vars *vars)
+{
+ u32 invsqrt, invsqrt2;
+ u64 val;
+
+ invsqrt = vars->rec_inv_sqrt;
+ invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;
+ val = (3LL << 32) - ((u64)vars->count * invsqrt2);
+
+ val >>= 2; /* avoid overflow in following multiply */
+ val = (val * invsqrt) >> (32 - 2 + 1);
+
+ vars->rec_inv_sqrt = val;
+}
+
+static void cobalt_invsqrt(struct cobalt_vars *vars)
+{
+ if (vars->count < REC_INV_SQRT_CACHE)
+ vars->rec_inv_sqrt = cobalt_rec_inv_sqrt_cache[vars->count];
+ else
+ cobalt_newton_step(vars);
+}
+
+/* There is a big difference in timing between the accurate values placed in
+ * the cache and the approximations given by a single Newton step for small
+ * count values, particularly when stepping from count 1 to 2 or vice versa.
+ * Above 16, a single Newton step gives sufficient accuracy in either
+ * direction, given the precision stored.
+ *
+ * The magnitude of the error when stepping up to count 2 is such as to give
+ * the value that *should* have been produced at count 4.
+ */
+
+static void cobalt_cache_init(void)
+{
+ struct cobalt_vars v;
+
+ memset(&v, 0, sizeof(v));
+ v.rec_inv_sqrt = ~0U;
+ cobalt_rec_inv_sqrt_cache[0] = v.rec_inv_sqrt;
+
+ for (v.count = 1; v.count < REC_INV_SQRT_CACHE; v.count++) {
+ cobalt_newton_step(&v);
+ cobalt_newton_step(&v);
+ cobalt_newton_step(&v);
+ cobalt_newton_step(&v);
+
+ cobalt_rec_inv_sqrt_cache[v.count] = v.rec_inv_sqrt;
+ }
+}
+
+static void cobalt_vars_init(struct cobalt_vars *vars)
+{
+ memset(vars, 0, sizeof(*vars));
+
+ if (!cobalt_rec_inv_sqrt_cache[0]) {
+ cobalt_cache_init();
+ cobalt_rec_inv_sqrt_cache[0] = ~0;
+ }
+}
+
+/* CoDel control_law is t + interval/sqrt(count)
+ * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid
+ * both sqrt() and divide operation.
+ */
+static ktime_t cobalt_control(ktime_t t,
+ u64 interval,
+ u32 rec_inv_sqrt)
+{
+ return ktime_add_ns(t, reciprocal_scale(interval,
+ rec_inv_sqrt));
+}
+
+/* Call this when a packet had to be dropped due to queue overflow. Returns
+ * true if the BLUE state was quiescent before but active after this call.
+ */
+static bool cobalt_queue_full(struct cobalt_vars *vars,
+ struct cobalt_params *p,
+ ktime_t now)
+{
+ bool up = false;
+
+ if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
+ up = !vars->p_drop;
+ vars->p_drop += p->p_inc;
+ if (vars->p_drop < p->p_inc)
+ vars->p_drop = ~0;
+ vars->blue_timer = now;
+ }
+ vars->dropping = true;
+ vars->drop_next = now;
+ if (!vars->count)
+ vars->count = 1;
+
+ return up;
+}
+
+/* Call this when the queue was serviced but turned out to be empty. Returns
+ * true if the BLUE state was active before but quiescent after this call.
+ */
+static bool cobalt_queue_empty(struct cobalt_vars *vars,
+ struct cobalt_params *p,
+ ktime_t now)
+{
+ bool down = false;
+
+ if (vars->p_drop &&
+ ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
+ if (vars->p_drop < p->p_dec)
+ vars->p_drop = 0;
+ else
+ vars->p_drop -= p->p_dec;
+ vars->blue_timer = now;
+ down = !vars->p_drop;
+ }
+ vars->dropping = false;
+
+ if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) {
+ vars->count--;
+ cobalt_invsqrt(vars);
+ vars->drop_next = cobalt_control(vars->drop_next,
+ p->interval,
+ vars->rec_inv_sqrt);
+ }
+
+ return down;
+}
+
+/* Call this with a freshly dequeued packet for possible congestion marking.
+ * Returns true as an instruction to drop the packet, false for delivery.
+ */
+static bool cobalt_should_drop(struct cobalt_vars *vars,
+ struct cobalt_params *p,
+ ktime_t now,
+ struct sk_buff *skb,
+ u32 bulk_flows)
+{
+ bool next_due, over_target, drop = false;
+ ktime_t schedule;
+ u64 sojourn;
+
+/* The 'schedule' variable records, in its sign, whether 'now' is before or
+ * after 'drop_next'. This allows 'drop_next' to be updated before the next
+ * scheduling decision is actually branched, without destroying that
+ * information. Similarly, the first 'schedule' value calculated is preserved
+ * in the boolean 'next_due'.
+ *
+ * As for 'drop_next', we take advantage of the fact that 'interval' is both
+ * the delay between first exceeding 'target' and the first signalling event,
+ * *and* the scaling factor for the signalling frequency. It's therefore very
+ * natural to use a single mechanism for both purposes, and eliminates a
+ * significant amount of reference Codel's spaghetti code. To help with this,
+ * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close
+ * as possible to 1.0 in fixed-point.
+ */
+
+ sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
+ schedule = ktime_sub(now, vars->drop_next);
+ over_target = sojourn > p->target &&
+ sojourn > p->mtu_time * bulk_flows * 2 &&
+ sojourn > p->mtu_time * 4;
+ next_due = vars->count && ktime_to_ns(schedule) >= 0;
+
+ vars->ecn_marked = false;
+
+ if (over_target) {
+ if (!vars->dropping) {
+ vars->dropping = true;
+ vars->drop_next = cobalt_control(now,
+ p->interval,
+ vars->rec_inv_sqrt);
+ }
+ if (!vars->count)
+ vars->count = 1;
+ } else if (vars->dropping) {
+ vars->dropping = false;
+ }
+
+ if (next_due && vars->dropping) {
+ /* Use ECN mark if possible, otherwise drop */
+ drop = !(vars->ecn_marked = INET_ECN_set_ce(skb));
+
+ vars->count++;
+ if (!vars->count)
+ vars->count--;
+ cobalt_invsqrt(vars);
+ vars->drop_next = cobalt_control(vars->drop_next,
+ p->interval,
+ vars->rec_inv_sqrt);
+ schedule = ktime_sub(now, vars->drop_next);
+ } else {
+ while (next_due) {
+ vars->count--;
+ cobalt_invsqrt(vars);
+ vars->drop_next = cobalt_control(vars->drop_next,
+ p->interval,
+ vars->rec_inv_sqrt);
+ schedule = ktime_sub(now, vars->drop_next);
+ next_due = vars->count && ktime_to_ns(schedule) >= 0;
+ }
+ }
+
+ /* Simple BLUE implementation. Lack of ECN is deliberate. */
+ if (vars->p_drop)
+ drop |= (prandom_u32() < vars->p_drop);
+
+ /* Overload the drop_next field as an activity timeout */
+ if (!vars->count)
+ vars->drop_next = ktime_add_ns(now, p->interval);
+ else if (ktime_to_ns(schedule) > 0 && !drop)
+ vars->drop_next = now;
+
+ return drop;
+}
+
+static void cake_update_flowkeys(struct flow_keys *keys,
+ const struct sk_buff *skb)
+{
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ struct nf_conntrack_tuple tuple = {};
+ bool rev = !skb->_nfct;
+
+ if (tc_skb_protocol(skb) != htons(ETH_P_IP))
+ return;
+
+ if (!nf_ct_get_tuple_skb(&tuple, skb))
+ return;
+
+ keys->addrs.v4addrs.src = rev ? tuple.dst.u3.ip : tuple.src.u3.ip;
+ keys->addrs.v4addrs.dst = rev ? tuple.src.u3.ip : tuple.dst.u3.ip;
+
+ if (keys->ports.ports) {
+ keys->ports.src = rev ? tuple.dst.u.all : tuple.src.u.all;
+ keys->ports.dst = rev ? tuple.src.u.all : tuple.dst.u.all;
+ }
+#endif
+}
+
+/* Cake has several subtle multiple bit settings. In these cases you
+ * would be matching triple isolate mode as well.
+ */
+
+static bool cake_dsrc(int flow_mode)
+{
+ return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC;
+}
+
+static bool cake_ddst(int flow_mode)
+{
+ return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST;
+}
+
+static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb,
+ int flow_mode, u16 flow_override, u16 host_override)
+{
+ u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0;
+ u16 reduced_hash, srchost_idx, dsthost_idx;
+ struct flow_keys keys, host_keys;
+
+ if (unlikely(flow_mode == CAKE_FLOW_NONE))
+ return 0;
+
+ /* If both overrides are set we can skip packet dissection entirely */
+ if ((flow_override || !(flow_mode & CAKE_FLOW_FLOWS)) &&
+ (host_override || !(flow_mode & CAKE_FLOW_HOSTS)))
+ goto skip_hash;
+
+ skb_flow_dissect_flow_keys(skb, &keys,
+ FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
+
+ if (flow_mode & CAKE_FLOW_NAT_FLAG)
+ cake_update_flowkeys(&keys, skb);
+
+ /* flow_hash_from_keys() sorts the addresses by value, so we have
+ * to preserve their order in a separate data structure to treat
+ * src and dst host addresses as independently selectable.
+ */
+ host_keys = keys;
+ host_keys.ports.ports = 0;
+ host_keys.basic.ip_proto = 0;
+ host_keys.keyid.keyid = 0;
+ host_keys.tags.flow_label = 0;
+
+ switch (host_keys.control.addr_type) {
+ case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
+ host_keys.addrs.v4addrs.src = 0;
+ dsthost_hash = flow_hash_from_keys(&host_keys);
+ host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
+ host_keys.addrs.v4addrs.dst = 0;
+ srchost_hash = flow_hash_from_keys(&host_keys);
+ break;
+
+ case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
+ memset(&host_keys.addrs.v6addrs.src, 0,
+ sizeof(host_keys.addrs.v6addrs.src));
+ dsthost_hash = flow_hash_from_keys(&host_keys);
+ host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
+ memset(&host_keys.addrs.v6addrs.dst, 0,
+ sizeof(host_keys.addrs.v6addrs.dst));
+ srchost_hash = flow_hash_from_keys(&host_keys);
+ break;
+
+ default:
+ dsthost_hash = 0;
+ srchost_hash = 0;
+ }
+
+ /* This *must* be after the above switch, since as a
+ * side-effect it sorts the src and dst addresses.
+ */
+ if (flow_mode & CAKE_FLOW_FLOWS)
+ flow_hash = flow_hash_from_keys(&keys);
+
+skip_hash:
+ if (flow_override)
+ flow_hash = flow_override - 1;
+ if (host_override) {
+ dsthost_hash = host_override - 1;
+ srchost_hash = host_override - 1;
+ }
+
+ if (!(flow_mode & CAKE_FLOW_FLOWS)) {
+ if (flow_mode & CAKE_FLOW_SRC_IP)
+ flow_hash ^= srchost_hash;
+
+ if (flow_mode & CAKE_FLOW_DST_IP)
+ flow_hash ^= dsthost_hash;
+ }
+
+ reduced_hash = flow_hash % CAKE_QUEUES;
+
+ /* set-associative hashing */
+ /* fast path if no hash collision (direct lookup succeeds) */
+ if (likely(q->tags[reduced_hash] == flow_hash &&
+ q->flows[reduced_hash].set)) {
+ q->way_directs++;
+ } else {
+ u32 inner_hash = reduced_hash % CAKE_SET_WAYS;
+ u32 outer_hash = reduced_hash - inner_hash;
+ bool allocate_src = false;
+ bool allocate_dst = false;
+ u32 i, k;
+
+ /* check if any active queue in the set is reserved for
+ * this flow.
+ */
+ for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
+ i++, k = (k + 1) % CAKE_SET_WAYS) {
+ if (q->tags[outer_hash + k] == flow_hash) {
+ if (i)
+ q->way_hits++;
+
+ if (!q->flows[outer_hash + k].set) {
+ /* need to increment host refcnts */
+ allocate_src = cake_dsrc(flow_mode);
+ allocate_dst = cake_ddst(flow_mode);
+ }
+
+ goto found;
+ }
+ }
+
+ /* no queue is reserved for this flow, look for an
+ * empty one.
+ */
+ for (i = 0; i < CAKE_SET_WAYS;
+ i++, k = (k + 1) % CAKE_SET_WAYS) {
+ if (!q->flows[outer_hash + k].set) {
+ q->way_misses++;
+ allocate_src = cake_dsrc(flow_mode);
+ allocate_dst = cake_ddst(flow_mode);
+ goto found;
+ }
+ }
+
+ /* With no empty queues, default to the original
+ * queue, accept the collision, update the host tags.
+ */
+ q->way_collisions++;
+ q->hosts[q->flows[reduced_hash].srchost].srchost_refcnt--;
+ q->hosts[q->flows[reduced_hash].dsthost].dsthost_refcnt--;
+ allocate_src = cake_dsrc(flow_mode);
+ allocate_dst = cake_ddst(flow_mode);
+found:
+ /* reserve queue for future packets in same flow */
+ reduced_hash = outer_hash + k;
+ q->tags[reduced_hash] = flow_hash;
+
+ if (allocate_src) {
+ srchost_idx = srchost_hash % CAKE_QUEUES;
+ inner_hash = srchost_idx % CAKE_SET_WAYS;
+ outer_hash = srchost_idx - inner_hash;
+ for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
+ i++, k = (k + 1) % CAKE_SET_WAYS) {
+ if (q->hosts[outer_hash + k].srchost_tag ==
+ srchost_hash)
+ goto found_src;
+ }
+ for (i = 0; i < CAKE_SET_WAYS;
+ i++, k = (k + 1) % CAKE_SET_WAYS) {
+ if (!q->hosts[outer_hash + k].srchost_refcnt)
+ break;
+ }
+ q->hosts[outer_hash + k].srchost_tag = srchost_hash;
+found_src:
+ srchost_idx = outer_hash + k;
+ q->hosts[srchost_idx].srchost_refcnt++;
+ q->flows[reduced_hash].srchost = srchost_idx;
+ }
+
+ if (allocate_dst) {
+ dsthost_idx = dsthost_hash % CAKE_QUEUES;
+ inner_hash = dsthost_idx % CAKE_SET_WAYS;
+ outer_hash = dsthost_idx - inner_hash;
+ for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
+ i++, k = (k + 1) % CAKE_SET_WAYS) {
+ if (q->hosts[outer_hash + k].dsthost_tag ==
+ dsthost_hash)
+ goto found_dst;
+ }
+ for (i = 0; i < CAKE_SET_WAYS;
+ i++, k = (k + 1) % CAKE_SET_WAYS) {
+ if (!q->hosts[outer_hash + k].dsthost_refcnt)
+ break;
+ }
+ q->hosts[outer_hash + k].dsthost_tag = dsthost_hash;
+found_dst:
+ dsthost_idx = outer_hash + k;
+ q->hosts[dsthost_idx].dsthost_refcnt++;
+ q->flows[reduced_hash].dsthost = dsthost_idx;
+ }
+ }
+
+ return reduced_hash;
+}
+
+/* helper functions : might be changed when/if skb use a standard list_head */
+/* remove one skb from head of slot queue */
+
+static struct sk_buff *dequeue_head(struct cake_flow *flow)
+{
+ struct sk_buff *skb = flow->head;
+
+ if (skb) {
+ flow->head = skb->next;
+ skb_mark_not_on_list(skb);
+ }
+
+ return skb;
+}
+
+/* add skb to flow queue (tail add) */
+
+static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb)
+{
+ if (!flow->head)
+ flow->head = skb;
+ else
+ flow->tail->next = skb;
+ flow->tail = skb;
+ skb->next = NULL;
+}
+
+static struct iphdr *cake_get_iphdr(const struct sk_buff *skb,
+ struct ipv6hdr *buf)
+{
+ unsigned int offset = skb_network_offset(skb);
+ struct iphdr *iph;
+
+ iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf);
+
+ if (!iph)
+ return NULL;
+
+ if (iph->version == 4 && iph->protocol == IPPROTO_IPV6)
+ return skb_header_pointer(skb, offset + iph->ihl * 4,
+ sizeof(struct ipv6hdr), buf);
+
+ else if (iph->version == 4)
+ return iph;
+
+ else if (iph->version == 6)
+ return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr),
+ buf);
+
+ return NULL;
+}
+
+static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb,
+ void *buf, unsigned int bufsize)
+{
+ unsigned int offset = skb_network_offset(skb);
+ const struct ipv6hdr *ipv6h;
+ const struct tcphdr *tcph;
+ const struct iphdr *iph;
+ struct ipv6hdr _ipv6h;
+ struct tcphdr _tcph;
+
+ ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);
+
+ if (!ipv6h)
+ return NULL;
+
+ if (ipv6h->version == 4) {
+ iph = (struct iphdr *)ipv6h;
+ offset += iph->ihl * 4;
+
+ /* special-case 6in4 tunnelling, as that is a common way to get
+ * v6 connectivity in the home
+ */
+ if (iph->protocol == IPPROTO_IPV6) {
+ ipv6h = skb_header_pointer(skb, offset,
+ sizeof(_ipv6h), &_ipv6h);
+
+ if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP)
+ return NULL;
+
+ offset += sizeof(struct ipv6hdr);
+
+ } else if (iph->protocol != IPPROTO_TCP) {
+ return NULL;
+ }
+
+ } else if (ipv6h->version == 6) {
+ if (ipv6h->nexthdr != IPPROTO_TCP)
+ return NULL;
+
+ offset += sizeof(struct ipv6hdr);
+ } else {
+ return NULL;
+ }
+
+ tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
+ if (!tcph)
+ return NULL;
+
+ return skb_header_pointer(skb, offset,
+ min(__tcp_hdrlen(tcph), bufsize), buf);
+}
+
+static const void *cake_get_tcpopt(const struct tcphdr *tcph,
+ int code, int *oplen)
+{
+ /* inspired by tcp_parse_options in tcp_input.c */
+ int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
+ const u8 *ptr = (const u8 *)(tcph + 1);
+
+ while (length > 0) {
+ int opcode = *ptr++;
+ int opsize;
+
+ if (opcode == TCPOPT_EOL)
+ break;
+ if (opcode == TCPOPT_NOP) {
+ length--;
+ continue;
+ }
+ opsize = *ptr++;
+ if (opsize < 2 || opsize > length)
+ break;
+
+ if (opcode == code) {
+ *oplen = opsize;
+ return ptr;
+ }
+
+ ptr += opsize - 2;
+ length -= opsize;
+ }
+
+ return NULL;
+}
+
+/* Compare two SACK sequences. A sequence is considered greater if it SACKs more
+ * bytes than the other. In the case where both sequences ACKs bytes that the
+ * other doesn't, A is considered greater. DSACKs in A also makes A be
+ * considered greater.
+ *
+ * @return -1, 0 or 1 as normal compare functions
+ */
+static int cake_tcph_sack_compare(const struct tcphdr *tcph_a,
+ const struct tcphdr *tcph_b)
+{
+ const struct tcp_sack_block_wire *sack_a, *sack_b;
+ u32 ack_seq_a = ntohl(tcph_a->ack_seq);
+ u32 bytes_a = 0, bytes_b = 0;
+ int oplen_a, oplen_b;
+ bool first = true;
+
+ sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a);
+ sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b);
+
+ /* pointers point to option contents */
+ oplen_a -= TCPOLEN_SACK_BASE;
+ oplen_b -= TCPOLEN_SACK_BASE;
+
+ if (sack_a && oplen_a >= sizeof(*sack_a) &&
+ (!sack_b || oplen_b < sizeof(*sack_b)))
+ return -1;
+ else if (sack_b && oplen_b >= sizeof(*sack_b) &&
+ (!sack_a || oplen_a < sizeof(*sack_a)))
+ return 1;
+ else if ((!sack_a || oplen_a < sizeof(*sack_a)) &&
+ (!sack_b || oplen_b < sizeof(*sack_b)))
+ return 0;
+
+ while (oplen_a >= sizeof(*sack_a)) {
+ const struct tcp_sack_block_wire *sack_tmp = sack_b;
+ u32 start_a = get_unaligned_be32(&sack_a->start_seq);
+ u32 end_a = get_unaligned_be32(&sack_a->end_seq);
+ int oplen_tmp = oplen_b;
+ bool found = false;
+
+ /* DSACK; always considered greater to prevent dropping */
+ if (before(start_a, ack_seq_a))
+ return -1;
+
+ bytes_a += end_a - start_a;
+
+ while (oplen_tmp >= sizeof(*sack_tmp)) {
+ u32 start_b = get_unaligned_be32(&sack_tmp->start_seq);
+ u32 end_b = get_unaligned_be32(&sack_tmp->end_seq);
+
+ /* first time through we count the total size */
+ if (first)
+ bytes_b += end_b - start_b;
+
+ if (!after(start_b, start_a) && !before(end_b, end_a)) {
+ found = true;
+ if (!first)
+ break;
+ }
+ oplen_tmp -= sizeof(*sack_tmp);
+ sack_tmp++;
+ }
+
+ if (!found)
+ return -1;
+
+ oplen_a -= sizeof(*sack_a);
+ sack_a++;
+ first = false;
+ }
+
+ /* If we made it this far, all ranges SACKed by A are covered by B, so
+ * either the SACKs are equal, or B SACKs more bytes.
+ */
+ return bytes_b > bytes_a ? 1 : 0;
+}
+
+static void cake_tcph_get_tstamp(const struct tcphdr *tcph,
+ u32 *tsval, u32 *tsecr)
+{
+ const u8 *ptr;
+ int opsize;
+
+ ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize);
+
+ if (ptr && opsize == TCPOLEN_TIMESTAMP) {
+ *tsval = get_unaligned_be32(ptr);
+ *tsecr = get_unaligned_be32(ptr + 4);
+ }
+}
+
+static bool cake_tcph_may_drop(const struct tcphdr *tcph,
+ u32 tstamp_new, u32 tsecr_new)
+{
+ /* inspired by tcp_parse_options in tcp_input.c */
+ int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
+ const u8 *ptr = (const u8 *)(tcph + 1);
+ u32 tstamp, tsecr;
+
+ /* 3 reserved flags must be unset to avoid future breakage
+ * ACK must be set
+ * ECE/CWR are handled separately
+ * All other flags URG/PSH/RST/SYN/FIN must be unset
+ * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero)
+ * 0x00C00000 = CWR/ECE (handled separately)
+ * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000
+ */
+ if (((tcp_flag_word(tcph) &
+ cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK))
+ return false;
+
+ while (length > 0) {
+ int opcode = *ptr++;
+ int opsize;
+
+ if (opcode == TCPOPT_EOL)
+ break;
+ if (opcode == TCPOPT_NOP) {
+ length--;
+ continue;
+ }
+ opsize = *ptr++;
+ if (opsize < 2 || opsize > length)
+ break;
+
+ switch (opcode) {
+ case TCPOPT_MD5SIG: /* doesn't influence state */
+ break;
+
+ case TCPOPT_SACK: /* stricter checking performed later */
+ if (opsize % 8 != 2)
+ return false;
+ break;
+
+ case TCPOPT_TIMESTAMP:
+ /* only drop timestamps lower than new */
+ if (opsize != TCPOLEN_TIMESTAMP)
+ return false;
+ tstamp = get_unaligned_be32(ptr);
+ tsecr = get_unaligned_be32(ptr + 4);
+ if (after(tstamp, tstamp_new) ||
+ after(tsecr, tsecr_new))
+ return false;
+ break;
+
+ case TCPOPT_MSS: /* these should only be set on SYN */
+ case TCPOPT_WINDOW:
+ case TCPOPT_SACK_PERM:
+ case TCPOPT_FASTOPEN:
+ case TCPOPT_EXP:
+ default: /* don't drop if any unknown options are present */
+ return false;
+ }
+
+ ptr += opsize - 2;
+ length -= opsize;
+ }
+
+ return true;
+}
+
+static struct sk_buff *cake_ack_filter(struct cake_sched_data *q,
+ struct cake_flow *flow)
+{
+ bool aggressive = q->ack_filter == CAKE_ACK_AGGRESSIVE;
+ struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL;
+ struct sk_buff *skb_check, *skb_prev = NULL;
+ const struct ipv6hdr *ipv6h, *ipv6h_check;
+ unsigned char _tcph[64], _tcph_check[64];
+ const struct tcphdr *tcph, *tcph_check;
+ const struct iphdr *iph, *iph_check;
+ struct ipv6hdr _iph, _iph_check;
+ const struct sk_buff *skb;
+ int seglen, num_found = 0;
+ u32 tstamp = 0, tsecr = 0;
+ __be32 elig_flags = 0;
+ int sack_comp;
+
+ /* no other possible ACKs to filter */
+ if (flow->head == flow->tail)
+ return NULL;
+
+ skb = flow->tail;
+ tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph));
+ iph = cake_get_iphdr(skb, &_iph);
+ if (!tcph)
+ return NULL;
+
+ cake_tcph_get_tstamp(tcph, &tstamp, &tsecr);
+
+ /* the 'triggering' packet need only have the ACK flag set.
+ * also check that SYN is not set, as there won't be any previous ACKs.
+ */
+ if ((tcp_flag_word(tcph) &
+ (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK)
+ return NULL;
+
+ /* the 'triggering' ACK is at the tail of the queue, we have already
+ * returned if it is the only packet in the flow. loop through the rest
+ * of the queue looking for pure ACKs with the same 5-tuple as the
+ * triggering one.
+ */
+ for (skb_check = flow->head;
+ skb_check && skb_check != skb;
+ skb_prev = skb_check, skb_check = skb_check->next) {
+ iph_check = cake_get_iphdr(skb_check, &_iph_check);
+ tcph_check = cake_get_tcphdr(skb_check, &_tcph_check,
+ sizeof(_tcph_check));
+
+ /* only TCP packets with matching 5-tuple are eligible, and only
+ * drop safe headers
+ */
+ if (!tcph_check || iph->version != iph_check->version ||
+ tcph_check->source != tcph->source ||
+ tcph_check->dest != tcph->dest)
+ continue;
+
+ if (iph_check->version == 4) {
+ if (iph_check->saddr != iph->saddr ||
+ iph_check->daddr != iph->daddr)
+ continue;
+
+ seglen = ntohs(iph_check->tot_len) -
+ (4 * iph_check->ihl);
+ } else if (iph_check->version == 6) {
+ ipv6h = (struct ipv6hdr *)iph;
+ ipv6h_check = (struct ipv6hdr *)iph_check;
+
+ if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) ||
+ ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr))
+ continue;
+
+ seglen = ntohs(ipv6h_check->payload_len);
+ } else {
+ WARN_ON(1); /* shouldn't happen */
+ continue;
+ }
+
+ /* If the ECE/CWR flags changed from the previous eligible
+ * packet in the same flow, we should no longer be dropping that
+ * previous packet as this would lose information.
+ */
+ if (elig_ack && (tcp_flag_word(tcph_check) &
+ (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) {
+ elig_ack = NULL;
+ elig_ack_prev = NULL;
+ num_found--;
+ }
+
+ /* Check TCP options and flags, don't drop ACKs with segment
+ * data, and don't drop ACKs with a higher cumulative ACK
+ * counter than the triggering packet. Check ACK seqno here to
+ * avoid parsing SACK options of packets we are going to exclude
+ * anyway.
+ */
+ if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) ||
+ (seglen - __tcp_hdrlen(tcph_check)) != 0 ||
+ after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq)))
+ continue;
+
+ /* Check SACK options. The triggering packet must SACK more data
+ * than the ACK under consideration, or SACK the same range but
+ * have a larger cumulative ACK counter. The latter is a
+ * pathological case, but is contained in the following check
+ * anyway, just to be safe.
+ */
+ sack_comp = cake_tcph_sack_compare(tcph_check, tcph);
+
+ if (sack_comp < 0 ||
+ (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) &&
+ sack_comp == 0))
+ continue;
+
+ /* At this point we have found an eligible pure ACK to drop; if
+ * we are in aggressive mode, we are done. Otherwise, keep
+ * searching unless this is the second eligible ACK we
+ * found.
+ *
+ * Since we want to drop ACK closest to the head of the queue,
+ * save the first eligible ACK we find, even if we need to loop
+ * again.
+ */
+ if (!elig_ack) {
+ elig_ack = skb_check;
+ elig_ack_prev = skb_prev;
+ elig_flags = (tcp_flag_word(tcph_check)
+ & (TCP_FLAG_ECE | TCP_FLAG_CWR));
+ }
+
+ if (num_found++ > 0)
+ goto found;
+ }
+
+ /* We made it through the queue without finding two eligible ACKs . If
+ * we found a single eligible ACK we can drop it in aggressive mode if
+ * we can guarantee that this does not interfere with ECN flag
+ * information. We ensure this by dropping it only if the enqueued
+ * packet is consecutive with the eligible ACK, and their flags match.
+ */
+ if (elig_ack && aggressive && elig_ack->next == skb &&
+ (elig_flags == (tcp_flag_word(tcph) &
+ (TCP_FLAG_ECE | TCP_FLAG_CWR))))
+ goto found;
+
+ return NULL;
+
+found:
+ if (elig_ack_prev)
+ elig_ack_prev->next = elig_ack->next;
+ else
+ flow->head = elig_ack->next;
+
+ skb_mark_not_on_list(elig_ack);
+
+ return elig_ack;
+}
+
+static u64 cake_ewma(u64 avg, u64 sample, u32 shift)
+{
+ avg -= avg >> shift;
+ avg += sample >> shift;
+ return avg;
+}
+
+static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off)
+{
+ if (q->rate_flags & CAKE_FLAG_OVERHEAD)
+ len -= off;
+
+ if (q->max_netlen < len)
+ q->max_netlen = len;
+ if (q->min_netlen > len)
+ q->min_netlen = len;
+
+ len += q->rate_overhead;
+
+ if (len < q->rate_mpu)
+ len = q->rate_mpu;
+
+ if (q->atm_mode == CAKE_ATM_ATM) {
+ len += 47;
+ len /= 48;
+ len *= 53;
+ } else if (q->atm_mode == CAKE_ATM_PTM) {
+ /* Add one byte per 64 bytes or part thereof.
+ * This is conservative and easier to calculate than the
+ * precise value.
+ */
+ len += (len + 63) / 64;
+ }
+
+ if (q->max_adjlen < len)
+ q->max_adjlen = len;
+ if (q->min_adjlen > len)
+ q->min_adjlen = len;
+
+ return len;
+}
+
+static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb)
+{
+ const struct skb_shared_info *shinfo = skb_shinfo(skb);
+ unsigned int hdr_len, last_len = 0;
+ u32 off = skb_network_offset(skb);
+ u32 len = qdisc_pkt_len(skb);
+ u16 segs = 1;
+
+ q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8);
+
+ if (!shinfo->gso_size)
+ return cake_calc_overhead(q, len, off);
+
+ /* borrowed from qdisc_pkt_len_init() */
+ hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
+
+ /* + transport layer */
+ if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 |
+ SKB_GSO_TCPV6))) {
+ const struct tcphdr *th;
+ struct tcphdr _tcphdr;
+
+ th = skb_header_pointer(skb, skb_transport_offset(skb),
+ sizeof(_tcphdr), &_tcphdr);
+ if (likely(th))
+ hdr_len += __tcp_hdrlen(th);
+ } else {
+ struct udphdr _udphdr;
+
+ if (skb_header_pointer(skb, skb_transport_offset(skb),
+ sizeof(_udphdr), &_udphdr))
+ hdr_len += sizeof(struct udphdr);
+ }
+
+ if (unlikely(shinfo->gso_type & SKB_GSO_DODGY))
+ segs = DIV_ROUND_UP(skb->len - hdr_len,
+ shinfo->gso_size);
+ else
+ segs = shinfo->gso_segs;
+
+ len = shinfo->gso_size + hdr_len;
+ last_len = skb->len - shinfo->gso_size * (segs - 1);
+
+ return (cake_calc_overhead(q, len, off) * (segs - 1) +
+ cake_calc_overhead(q, last_len, off));
+}
+
+static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j)
+{
+ struct cake_heap_entry ii = q->overflow_heap[i];
+ struct cake_heap_entry jj = q->overflow_heap[j];
+
+ q->overflow_heap[i] = jj;
+ q->overflow_heap[j] = ii;
+
+ q->tins[ii.t].overflow_idx[ii.b] = j;
+ q->tins[jj.t].overflow_idx[jj.b] = i;
+}
+
+static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i)
+{
+ struct cake_heap_entry ii = q->overflow_heap[i];
+
+ return q->tins[ii.t].backlogs[ii.b];
+}
+
+static void cake_heapify(struct cake_sched_data *q, u16 i)
+{
+ static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES;
+ u32 mb = cake_heap_get_backlog(q, i);
+ u32 m = i;
+
+ while (m < a) {
+ u32 l = m + m + 1;
+ u32 r = l + 1;
+
+ if (l < a) {
+ u32 lb = cake_heap_get_backlog(q, l);
+
+ if (lb > mb) {
+ m = l;
+ mb = lb;
+ }
+ }
+
+ if (r < a) {
+ u32 rb = cake_heap_get_backlog(q, r);
+
+ if (rb > mb) {
+ m = r;
+ mb = rb;
+ }
+ }
+
+ if (m != i) {
+ cake_heap_swap(q, i, m);
+ i = m;
+ } else {
+ break;
+ }
+ }
+}
+
+static void cake_heapify_up(struct cake_sched_data *q, u16 i)
+{
+ while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) {
+ u16 p = (i - 1) >> 1;
+ u32 ib = cake_heap_get_backlog(q, i);
+ u32 pb = cake_heap_get_backlog(q, p);
+
+ if (ib > pb) {
+ cake_heap_swap(q, i, p);
+ i = p;
+ } else {
+ break;
+ }
+ }
+}
+
+static int cake_advance_shaper(struct cake_sched_data *q,
+ struct cake_tin_data *b,
+ struct sk_buff *skb,
+ ktime_t now, bool drop)
+{
+ u32 len = get_cobalt_cb(skb)->adjusted_len;
+
+ /* charge packet bandwidth to this tin
+ * and to the global shaper.
+ */
+ if (q->rate_ns) {
+ u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft;
+ u64 global_dur = (len * q->rate_ns) >> q->rate_shft;
+ u64 failsafe_dur = global_dur + (global_dur >> 1);
+
+ if (ktime_before(b->time_next_packet, now))
+ b->time_next_packet = ktime_add_ns(b->time_next_packet,
+ tin_dur);
+
+ else if (ktime_before(b->time_next_packet,
+ ktime_add_ns(now, tin_dur)))
+ b->time_next_packet = ktime_add_ns(now, tin_dur);
+
+ q->time_next_packet = ktime_add_ns(q->time_next_packet,
+ global_dur);
+ if (!drop)
+ q->failsafe_next_packet = \
+ ktime_add_ns(q->failsafe_next_packet,
+ failsafe_dur);
+ }
+ return len;
+}
+
+static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free)
+{
+ struct cake_sched_data *q = qdisc_priv(sch);
+ ktime_t now = ktime_get();
+ u32 idx = 0, tin = 0, len;
+ struct cake_heap_entry qq;
+ struct cake_tin_data *b;
+ struct cake_flow *flow;
+ struct sk_buff *skb;
+
+ if (!q->overflow_timeout) {
+ int i;
+ /* Build fresh max-heap */
+ for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2; i >= 0; i--)
+ cake_heapify(q, i);
+ }
+ q->overflow_timeout = 65535;
+
+ /* select longest queue for pruning */
+ qq = q->overflow_heap[0];
+ tin = qq.t;
+ idx = qq.b;
+
+ b = &q->tins[tin];
+ flow = &b->flows[idx];
+ skb = dequeue_head(flow);
+ if (unlikely(!skb)) {
+ /* heap has gone wrong, rebuild it next time */
+ q->overflow_timeout = 0;
+ return idx + (tin << 16);
+ }
+
+ if (cobalt_queue_full(&flow->cvars, &b->cparams, now))
+ b->unresponsive_flow_count++;
+
+ len = qdisc_pkt_len(skb);
+ q->buffer_used -= skb->truesize;
+ b->backlogs[idx] -= len;
+ b->tin_backlog -= len;
+ sch->qstats.backlog -= len;
+ qdisc_tree_reduce_backlog(sch, 1, len);
+
+ flow->dropped++;
+ b->tin_dropped++;
+ sch->qstats.drops++;
+
+ if (q->rate_flags & CAKE_FLAG_INGRESS)
+ cake_advance_shaper(q, b, skb, now, true);
+
+ __qdisc_drop(skb, to_free);
+ sch->q.qlen--;
+
+ cake_heapify(q, 0);
+
+ return idx + (tin << 16);
+}
+
+static void cake_wash_diffserv(struct sk_buff *skb)
+{
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
+ break;
+ case htons(ETH_P_IPV6):
+ ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
+ break;
+ default:
+ break;
+ }
+}
+
+static u8 cake_handle_diffserv(struct sk_buff *skb, u16 wash)
+{
+ u8 dscp;
+
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ dscp = ipv4_get_dsfield(ip_hdr(skb)) >> 2;
+ if (wash && dscp)
+ ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
+ return dscp;
+
+ case htons(ETH_P_IPV6):
+ dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2;
+ if (wash && dscp)
+ ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
+ return dscp;
+
+ case htons(ETH_P_ARP):
+ return 0x38; /* CS7 - Net Control */
+
+ default:
+ /* If there is no Diffserv field, treat as best-effort */
+ return 0;
+ }
+}
+
+static struct cake_tin_data *cake_select_tin(struct Qdisc *sch,
+ struct sk_buff *skb)
+{
+ struct cake_sched_data *q = qdisc_priv(sch);
+ u32 tin;
+
+ if (TC_H_MAJ(skb->priority) == sch->handle &&
+ TC_H_MIN(skb->priority) > 0 &&
+ TC_H_MIN(skb->priority) <= q->tin_cnt) {
+ tin = q->tin_order[TC_H_MIN(skb->priority) - 1];
+
+ if (q->rate_flags & CAKE_FLAG_WASH)
+ cake_wash_diffserv(skb);
+ } else if (q->tin_mode != CAKE_DIFFSERV_BESTEFFORT) {
+ /* extract the Diffserv Precedence field, if it exists */
+ /* and clear DSCP bits if washing */
+ tin = q->tin_index[cake_handle_diffserv(skb,
+ q->rate_flags & CAKE_FLAG_WASH)];
+ if (unlikely(tin >= q->tin_cnt))
+ tin = 0;
+ } else {
+ tin = 0;
+ if (q->rate_flags & CAKE_FLAG_WASH)
+ cake_wash_diffserv(skb);
+ }
+
+ return &q->tins[tin];
+}
+
+static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t,
+ struct sk_buff *skb, int flow_mode, int *qerr)
+{
+ struct cake_sched_data *q = qdisc_priv(sch);
+ struct tcf_proto *filter;
+ struct tcf_result res;
+ u16 flow = 0, host = 0;
+ int result;
+
+ filter = rcu_dereference_bh(q->filter_list);
+ if (!filter)
+ goto hash;
+
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+ result = tcf_classify(skb, filter, &res, false);
+
+ if (result >= 0) {
+#ifdef CONFIG_NET_CLS_ACT
+ switch (result) {
+ case TC_ACT_STOLEN:
+ case TC_ACT_QUEUED:
+ case TC_ACT_TRAP:
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+ /* fall through */
+ case TC_ACT_SHOT:
+ return 0;
+ }
+#endif
+ if (TC_H_MIN(res.classid) <= CAKE_QUEUES)
+ flow = TC_H_MIN(res.classid);
+ if (TC_H_MAJ(res.classid) <= (CAKE_QUEUES << 16))
+ host = TC_H_MAJ(res.classid) >> 16;
+ }
+hash:
+ *t = cake_select_tin(sch, skb);
+ return cake_hash(*t, skb, flow_mode, flow, host) + 1;
+}
+
+static void cake_reconfigure(struct Qdisc *sch);
+
+static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+ struct cake_sched_data *q = qdisc_priv(sch);
+ int len = qdisc_pkt_len(skb);
+ int uninitialized_var(ret);
+ struct sk_buff *ack = NULL;
+ ktime_t now = ktime_get();
+ struct cake_tin_data *b;
+ struct cake_flow *flow;
+ u32 idx;
+
+ /* choose flow to insert into */
+ idx = cake_classify(sch, &b, skb, q->flow_mode, &ret);
+ if (idx == 0) {
+ if (ret & __NET_XMIT_BYPASS)
+ qdisc_qstats_drop(sch);
+ __qdisc_drop(skb, to_free);
+ return ret;
+ }
+ idx--;
+ flow = &b->flows[idx];
+
+ /* ensure shaper state isn't stale */
+ if (!b->tin_backlog) {
+ if (ktime_before(b->time_next_packet, now))
+ b->time_next_packet = now;
+
+ if (!sch->q.qlen) {
+ if (ktime_before(q->time_next_packet, now)) {
+ q->failsafe_next_packet = now;
+ q->time_next_packet = now;
+ } else if (ktime_after(q->time_next_packet, now) &&
+ ktime_after(q->failsafe_next_packet, now)) {
+ u64 next = \
+ min(ktime_to_ns(q->time_next_packet),
+ ktime_to_ns(
+ q->failsafe_next_packet));
+ sch->qstats.overlimits++;
+ qdisc_watchdog_schedule_ns(&q->watchdog, next);
+ }
+ }
+ }
+
+ if (unlikely(len > b->max_skblen))
+ b->max_skblen = len;
+
+ if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) {
+ struct sk_buff *segs, *nskb;
+ netdev_features_t features = netif_skb_features(skb);
+ unsigned int slen = 0;
+
+ segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
+ if (IS_ERR_OR_NULL(segs))
+ return qdisc_drop(skb, sch, to_free);
+
+ while (segs) {
+ nskb = segs->next;
+ skb_mark_not_on_list(segs);
+ qdisc_skb_cb(segs)->pkt_len = segs->len;
+ cobalt_set_enqueue_time(segs, now);
+ get_cobalt_cb(segs)->adjusted_len = cake_overhead(q,
+ segs);
+ flow_queue_add(flow, segs);
+
+ sch->q.qlen++;
+ slen += segs->len;
+ q->buffer_used += segs->truesize;
+ b->packets++;
+ segs = nskb;
+ }
+
+ /* stats */
+ b->bytes += slen;
+ b->backlogs[idx] += slen;
+ b->tin_backlog += slen;
+ sch->qstats.backlog += slen;
+ q->avg_window_bytes += slen;
+
+ qdisc_tree_reduce_backlog(sch, 1, len);
+ consume_skb(skb);
+ } else {
+ /* not splitting */
+ cobalt_set_enqueue_time(skb, now);
+ get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb);
+ flow_queue_add(flow, skb);
+
+ if (q->ack_filter)
+ ack = cake_ack_filter(q, flow);
+
+ if (ack) {
+ b->ack_drops++;
+ sch->qstats.drops++;
+ b->bytes += qdisc_pkt_len(ack);
+ len -= qdisc_pkt_len(ack);
+ q->buffer_used += skb->truesize - ack->truesize;
+ if (q->rate_flags & CAKE_FLAG_INGRESS)
+ cake_advance_shaper(q, b, ack, now, true);
+
+ qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack));
+ consume_skb(ack);
+ } else {
+ sch->q.qlen++;
+ q->buffer_used += skb->truesize;
+ }
+
+ /* stats */
+ b->packets++;
+ b->bytes += len;
+ b->backlogs[idx] += len;
+ b->tin_backlog += len;
+ sch->qstats.backlog += len;
+ q->avg_window_bytes += len;
+ }
+
+ if (q->overflow_timeout)
+ cake_heapify_up(q, b->overflow_idx[idx]);
+
+ /* incoming bandwidth capacity estimate */
+ if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) {
+ u64 packet_interval = \
+ ktime_to_ns(ktime_sub(now, q->last_packet_time));
+
+ if (packet_interval > NSEC_PER_SEC)
+ packet_interval = NSEC_PER_SEC;
+
+ /* filter out short-term bursts, eg. wifi aggregation */
+ q->avg_packet_interval = \
+ cake_ewma(q->avg_packet_interval,
+ packet_interval,
+ (packet_interval > q->avg_packet_interval ?
+ 2 : 8));
+
+ q->last_packet_time = now;
+
+ if (packet_interval > q->avg_packet_interval) {
+ u64 window_interval = \
+ ktime_to_ns(ktime_sub(now,
+ q->avg_window_begin));
+ u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC;
+
+ do_div(b, window_interval);
+ q->avg_peak_bandwidth =
+ cake_ewma(q->avg_peak_bandwidth, b,
+ b > q->avg_peak_bandwidth ? 2 : 8);
+ q->avg_window_bytes = 0;
+ q->avg_window_begin = now;
+
+ if (ktime_after(now,
+ ktime_add_ms(q->last_reconfig_time,
+ 250))) {
+ q->rate_bps = (q->avg_peak_bandwidth * 15) >> 4;
+ cake_reconfigure(sch);
+ }
+ }
+ } else {
+ q->avg_window_bytes = 0;
+ q->last_packet_time = now;
+ }
+
+ /* flowchain */
+ if (!flow->set || flow->set == CAKE_SET_DECAYING) {
+ struct cake_host *srchost = &b->hosts[flow->srchost];
+ struct cake_host *dsthost = &b->hosts[flow->dsthost];
+ u16 host_load = 1;
+
+ if (!flow->set) {
+ list_add_tail(&flow->flowchain, &b->new_flows);
+ } else {
+ b->decaying_flow_count--;
+ list_move_tail(&flow->flowchain, &b->new_flows);
+ }
+ flow->set = CAKE_SET_SPARSE;
+ b->sparse_flow_count++;
+
+ if (cake_dsrc(q->flow_mode))
+ host_load = max(host_load, srchost->srchost_refcnt);
+
+ if (cake_ddst(q->flow_mode))
+ host_load = max(host_load, dsthost->dsthost_refcnt);
+
+ flow->deficit = (b->flow_quantum *
+ quantum_div[host_load]) >> 16;
+ } else if (flow->set == CAKE_SET_SPARSE_WAIT) {
+ /* this flow was empty, accounted as a sparse flow, but actually
+ * in the bulk rotation.
+ */
+ flow->set = CAKE_SET_BULK;
+ b->sparse_flow_count--;
+ b->bulk_flow_count++;
+ }
+
+ if (q->buffer_used > q->buffer_max_used)
+ q->buffer_max_used = q->buffer_used;
+
+ if (q->buffer_used > q->buffer_limit) {
+ u32 dropped = 0;
+
+ while (q->buffer_used > q->buffer_limit) {
+ dropped++;
+ cake_drop(sch, to_free);
+ }
+ b->drop_overlimit += dropped;
+ }
+ return NET_XMIT_SUCCESS;
+}
+
+static struct sk_buff *cake_dequeue_one(struct Qdisc *sch)
+{
+ struct cake_sched_data *q = qdisc_priv(sch);
+ struct cake_tin_data *b = &q->tins[q->cur_tin];
+ struct cake_flow *flow = &b->flows[q->cur_flow];
+ struct sk_buff *skb = NULL;
+ u32 len;
+
+ if (flow->head) {
+ skb = dequeue_head(flow);
+ len = qdisc_pkt_len(skb);
+ b->backlogs[q->cur_flow] -= len;
+ b->tin_backlog -= len;
+ sch->qstats.backlog -= len;
+ q->buffer_used -= skb->truesize;
+ sch->q.qlen--;
+
+ if (q->overflow_timeout)
+ cake_heapify(q, b->overflow_idx[q->cur_flow]);
+ }
+ return skb;
+}
+
+/* Discard leftover packets from a tin no longer in use. */
+static void cake_clear_tin(struct Qdisc *sch, u16 tin)
+{
+ struct cake_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb;
+
+ q->cur_tin = tin;
+ for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++)
+ while (!!(skb = cake_dequeue_one(sch)))
+ kfree_skb(skb);
+}
+
+static struct sk_buff *cake_dequeue(struct Qdisc *sch)
+{
+ struct cake_sched_data *q = qdisc_priv(sch);
+ struct cake_tin_data *b = &q->tins[q->cur_tin];
+ struct cake_host *srchost, *dsthost;
+ ktime_t now = ktime_get();
+ struct cake_flow *flow;
+ struct list_head *head;
+ bool first_flow = true;
+ struct sk_buff *skb;
+ u16 host_load;
+ u64 delay;
+ u32 len;
+
+begin:
+ if (!sch->q.qlen)
+ return NULL;
+
+ /* global hard shaper */
+ if (ktime_after(q->time_next_packet, now) &&
+ ktime_after(q->failsafe_next_packet, now)) {
+ u64 next = min(ktime_to_ns(q->time_next_packet),
+ ktime_to_ns(q->failsafe_next_packet));
+
+ sch->qstats.overlimits++;
+ qdisc_watchdog_schedule_ns(&q->watchdog, next);
+ return NULL;
+ }
+
+ /* Choose a class to work on. */
+ if (!q->rate_ns) {
+ /* In unlimited mode, can't rely on shaper timings, just balance
+ * with DRR
+ */
+ bool wrapped = false, empty = true;
+
+ while (b->tin_deficit < 0 ||
+ !(b->sparse_flow_count + b->bulk_flow_count)) {
+ if (b->tin_deficit <= 0)
+ b->tin_deficit += b->tin_quantum_band;
+ if (b->sparse_flow_count + b->bulk_flow_count)
+ empty = false;
+
+ q->cur_tin++;
+ b++;
+ if (q->cur_tin >= q->tin_cnt) {
+ q->cur_tin = 0;
+ b = q->tins;
+
+ if (wrapped) {
+ /* It's possible for q->qlen to be
+ * nonzero when we actually have no
+ * packets anywhere.
+ */
+ if (empty)
+ return NULL;
+ } else {
+ wrapped = true;
+ }
+ }
+ }
+ } else {
+ /* In shaped mode, choose:
+ * - Highest-priority tin with queue and meeting schedule, or
+ * - The earliest-scheduled tin with queue.
+ */
+ ktime_t best_time = KTIME_MAX;
+ int tin, best_tin = 0;
+
+ for (tin = 0; tin < q->tin_cnt; tin++) {
+ b = q->tins + tin;
+ if ((b->sparse_flow_count + b->bulk_flow_count) > 0) {
+ ktime_t time_to_pkt = \
+ ktime_sub(b->time_next_packet, now);
+
+ if (ktime_to_ns(time_to_pkt) <= 0 ||
+ ktime_compare(time_to_pkt,
+ best_time) <= 0) {
+ best_time = time_to_pkt;
+ best_tin = tin;
+ }
+ }
+ }
+
+ q->cur_tin = best_tin;
+ b = q->tins + best_tin;
+
+ /* No point in going further if no packets to deliver. */
+ if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count)))
+ return NULL;
+ }
+
+retry:
+ /* service this class */
+ head = &b->decaying_flows;
+ if (!first_flow || list_empty(head)) {
+ head = &b->new_flows;
+ if (list_empty(head)) {
+ head = &b->old_flows;
+ if (unlikely(list_empty(head))) {
+ head = &b->decaying_flows;
+ if (unlikely(list_empty(head)))
+ goto begin;
+ }
+ }
+ }
+ flow = list_first_entry(head, struct cake_flow, flowchain);
+ q->cur_flow = flow - b->flows;
+ first_flow = false;
+
+ /* triple isolation (modified DRR++) */
+ srchost = &b->hosts[flow->srchost];
+ dsthost = &b->hosts[flow->dsthost];
+ host_load = 1;
+
+ if (cake_dsrc(q->flow_mode))
+ host_load = max(host_load, srchost->srchost_refcnt);
+
+ if (cake_ddst(q->flow_mode))
+ host_load = max(host_load, dsthost->dsthost_refcnt);
+
+ WARN_ON(host_load > CAKE_QUEUES);
+
+ /* flow isolation (DRR++) */
+ if (flow->deficit <= 0) {
+ /* The shifted prandom_u32() is a way to apply dithering to
+ * avoid accumulating roundoff errors
+ */
+ flow->deficit += (b->flow_quantum * quantum_div[host_load] +
+ (prandom_u32() >> 16)) >> 16;
+ list_move_tail(&flow->flowchain, &b->old_flows);
+
+ /* Keep all flows with deficits out of the sparse and decaying
+ * rotations. No non-empty flow can go into the decaying
+ * rotation, so they can't get deficits
+ */
+ if (flow->set == CAKE_SET_SPARSE) {
+ if (flow->head) {
+ b->sparse_flow_count--;
+ b->bulk_flow_count++;
+ flow->set = CAKE_SET_BULK;
+ } else {
+ /* we've moved it to the bulk rotation for
+ * correct deficit accounting but we still want
+ * to count it as a sparse flow, not a bulk one.
+ */
+ flow->set = CAKE_SET_SPARSE_WAIT;
+ }
+ }
+ goto retry;
+ }
+
+ /* Retrieve a packet via the AQM */
+ while (1) {
+ skb = cake_dequeue_one(sch);
+ if (!skb) {
+ /* this queue was actually empty */
+ if (cobalt_queue_empty(&flow->cvars, &b->cparams, now))
+ b->unresponsive_flow_count--;
+
+ if (flow->cvars.p_drop || flow->cvars.count ||
+ ktime_before(now, flow->cvars.drop_next)) {
+ /* keep in the flowchain until the state has
+ * decayed to rest
+ */
+ list_move_tail(&flow->flowchain,
+ &b->decaying_flows);
+ if (flow->set == CAKE_SET_BULK) {
+ b->bulk_flow_count--;
+ b->decaying_flow_count++;
+ } else if (flow->set == CAKE_SET_SPARSE ||
+ flow->set == CAKE_SET_SPARSE_WAIT) {
+ b->sparse_flow_count--;
+ b->decaying_flow_count++;
+ }
+ flow->set = CAKE_SET_DECAYING;
+ } else {
+ /* remove empty queue from the flowchain */
+ list_del_init(&flow->flowchain);
+ if (flow->set == CAKE_SET_SPARSE ||
+ flow->set == CAKE_SET_SPARSE_WAIT)
+ b->sparse_flow_count--;
+ else if (flow->set == CAKE_SET_BULK)
+ b->bulk_flow_count--;
+ else
+ b->decaying_flow_count--;
+
+ flow->set = CAKE_SET_NONE;
+ srchost->srchost_refcnt--;
+ dsthost->dsthost_refcnt--;
+ }
+ goto begin;
+ }
+
+ /* Last packet in queue may be marked, shouldn't be dropped */
+ if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb,
+ (b->bulk_flow_count *
+ !!(q->rate_flags &
+ CAKE_FLAG_INGRESS))) ||
+ !flow->head)
+ break;
+
+ /* drop this packet, get another one */
+ if (q->rate_flags & CAKE_FLAG_INGRESS) {
+ len = cake_advance_shaper(q, b, skb,
+ now, true);
+ flow->deficit -= len;
+ b->tin_deficit -= len;
+ }
+ flow->dropped++;
+ b->tin_dropped++;
+ qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
+ qdisc_qstats_drop(sch);
+ kfree_skb(skb);
+ if (q->rate_flags & CAKE_FLAG_INGRESS)
+ goto retry;
+ }
+
+ b->tin_ecn_mark += !!flow->cvars.ecn_marked;
+ qdisc_bstats_update(sch, skb);
+
+ /* collect delay stats */
+ delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
+ b->avge_delay = cake_ewma(b->avge_delay, delay, 8);
+ b->peak_delay = cake_ewma(b->peak_delay, delay,
+ delay > b->peak_delay ? 2 : 8);
+ b->base_delay = cake_ewma(b->base_delay, delay,
+ delay < b->base_delay ? 2 : 8);
+
+ len = cake_advance_shaper(q, b, skb, now, false);
+ flow->deficit -= len;
+ b->tin_deficit -= len;
+
+ if (ktime_after(q->time_next_packet, now) && sch->q.qlen) {
+ u64 next = min(ktime_to_ns(q->time_next_packet),
+ ktime_to_ns(q->failsafe_next_packet));
+
+ qdisc_watchdog_schedule_ns(&q->watchdog, next);
+ } else if (!sch->q.qlen) {
+ int i;
+
+ for (i = 0; i < q->tin_cnt; i++) {
+ if (q->tins[i].decaying_flow_count) {
+ ktime_t next = \
+ ktime_add_ns(now,
+ q->tins[i].cparams.target);
+
+ qdisc_watchdog_schedule_ns(&q->watchdog,
+ ktime_to_ns(next));
+ break;
+ }
+ }
+ }
+
+ if (q->overflow_timeout)
+ q->overflow_timeout--;
+
+ return skb;
+}
+
+static void cake_reset(struct Qdisc *sch)
+{
+ u32 c;
+
+ for (c = 0; c < CAKE_MAX_TINS; c++)
+ cake_clear_tin(sch, c);
+}
+
+static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = {
+ [TCA_CAKE_BASE_RATE64] = { .type = NLA_U64 },
+ [TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 },
+ [TCA_CAKE_ATM] = { .type = NLA_U32 },
+ [TCA_CAKE_FLOW_MODE] = { .type = NLA_U32 },
+ [TCA_CAKE_OVERHEAD] = { .type = NLA_S32 },
+ [TCA_CAKE_RTT] = { .type = NLA_U32 },
+ [TCA_CAKE_TARGET] = { .type = NLA_U32 },
+ [TCA_CAKE_AUTORATE] = { .type = NLA_U32 },
+ [TCA_CAKE_MEMORY] = { .type = NLA_U32 },
+ [TCA_CAKE_NAT] = { .type = NLA_U32 },
+ [TCA_CAKE_RAW] = { .type = NLA_U32 },
+ [TCA_CAKE_WASH] = { .type = NLA_U32 },
+ [TCA_CAKE_MPU] = { .type = NLA_U32 },
+ [TCA_CAKE_INGRESS] = { .type = NLA_U32 },
+ [TCA_CAKE_ACK_FILTER] = { .type = NLA_U32 },
+};
+
+static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu,
+ u64 target_ns, u64 rtt_est_ns)
+{
+ /* convert byte-rate into time-per-byte
+ * so it will always unwedge in reasonable time.
+ */
+ static const u64 MIN_RATE = 64;
+ u32 byte_target = mtu;
+ u64 byte_target_ns;
+ u8 rate_shft = 0;
+ u64 rate_ns = 0;
+
+ b->flow_quantum = 1514;
+ if (rate) {
+ b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL);
+ rate_shft = 34;
+ rate_ns = ((u64)NSEC_PER_SEC) << rate_shft;
+ rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate));
+ while (!!(rate_ns >> 34)) {
+ rate_ns >>= 1;
+ rate_shft--;
+ }
+ } /* else unlimited, ie. zero delay */
+
+ b->tin_rate_bps = rate;
+ b->tin_rate_ns = rate_ns;
+ b->tin_rate_shft = rate_shft;
+
+ byte_target_ns = (byte_target * rate_ns) >> rate_shft;
+
+ b->cparams.target = max((byte_target_ns * 3) / 2, target_ns);
+ b->cparams.interval = max(rtt_est_ns +
+ b->cparams.target - target_ns,
+ b->cparams.target * 2);
+ b->cparams.mtu_time = byte_target_ns;
+ b->cparams.p_inc = 1 << 24; /* 1/256 */
+ b->cparams.p_dec = 1 << 20; /* 1/4096 */
+}
+
+static int cake_config_besteffort(struct Qdisc *sch)
+{
+ struct cake_sched_data *q = qdisc_priv(sch);
+ struct cake_tin_data *b = &q->tins[0];
+ u32 mtu = psched_mtu(qdisc_dev(sch));
+ u64 rate = q->rate_bps;
+
+ q->tin_cnt = 1;
+
+ q->tin_index = besteffort;
+ q->tin_order = normal_order;
+
+ cake_set_rate(b, rate, mtu,
+ us_to_ns(q->target), us_to_ns(q->interval));
+ b->tin_quantum_band = 65535;
+ b->tin_quantum_prio = 65535;
+
+ return 0;
+}
+
+static int cake_config_precedence(struct Qdisc *sch)
+{
+ /* convert high-level (user visible) parameters into internal format */
+ struct cake_sched_data *q = qdisc_priv(sch);
+ u32 mtu = psched_mtu(qdisc_dev(sch));
+ u64 rate = q->rate_bps;
+ u32 quantum1 = 256;
+ u32 quantum2 = 256;
+ u32 i;
+
+ q->tin_cnt = 8;
+ q->tin_index = precedence;
+ q->tin_order = normal_order;
+
+ for (i = 0; i < q->tin_cnt; i++) {
+ struct cake_tin_data *b = &q->tins[i];
+
+ cake_set_rate(b, rate, mtu, us_to_ns(q->target),
+ us_to_ns(q->interval));
+
+ b->tin_quantum_prio = max_t(u16, 1U, quantum1);
+ b->tin_quantum_band = max_t(u16, 1U, quantum2);
+
+ /* calculate next class's parameters */
+ rate *= 7;
+ rate >>= 3;
+
+ quantum1 *= 3;
+ quantum1 >>= 1;
+
+ quantum2 *= 7;
+ quantum2 >>= 3;
+ }
+
+ return 0;
+}
+
+/* List of known Diffserv codepoints:
+ *
+ * Least Effort (CS1)
+ * Best Effort (CS0)
+ * Max Reliability & LLT "Lo" (TOS1)
+ * Max Throughput (TOS2)
+ * Min Delay (TOS4)
+ * LLT "La" (TOS5)
+ * Assured Forwarding 1 (AF1x) - x3
+ * Assured Forwarding 2 (AF2x) - x3
+ * Assured Forwarding 3 (AF3x) - x3
+ * Assured Forwarding 4 (AF4x) - x3
+ * Precedence Class 2 (CS2)
+ * Precedence Class 3 (CS3)
+ * Precedence Class 4 (CS4)
+ * Precedence Class 5 (CS5)
+ * Precedence Class 6 (CS6)
+ * Precedence Class 7 (CS7)
+ * Voice Admit (VA)
+ * Expedited Forwarding (EF)
+
+ * Total 25 codepoints.
+ */
+
+/* List of traffic classes in RFC 4594:
+ * (roughly descending order of contended priority)
+ * (roughly ascending order of uncontended throughput)
+ *
+ * Network Control (CS6,CS7) - routing traffic
+ * Telephony (EF,VA) - aka. VoIP streams
+ * Signalling (CS5) - VoIP setup
+ * Multimedia Conferencing (AF4x) - aka. video calls
+ * Realtime Interactive (CS4) - eg. games
+ * Multimedia Streaming (AF3x) - eg. YouTube, NetFlix, Twitch
+ * Broadcast Video (CS3)
+ * Low Latency Data (AF2x,TOS4) - eg. database
+ * Ops, Admin, Management (CS2,TOS1) - eg. ssh
+ * Standard Service (CS0 & unrecognised codepoints)
+ * High Throughput Data (AF1x,TOS2) - eg. web traffic
+ * Low Priority Data (CS1) - eg. BitTorrent
+
+ * Total 12 traffic classes.
+ */
+
+static int cake_config_diffserv8(struct Qdisc *sch)
+{
+/* Pruned list of traffic classes for typical applications:
+ *
+ * Network Control (CS6, CS7)
+ * Minimum Latency (EF, VA, CS5, CS4)
+ * Interactive Shell (CS2, TOS1)
+ * Low Latency Transactions (AF2x, TOS4)
+ * Video Streaming (AF4x, AF3x, CS3)
+ * Bog Standard (CS0 etc.)
+ * High Throughput (AF1x, TOS2)
+ * Background Traffic (CS1)
+ *
+ * Total 8 traffic classes.
+ */
+
+ struct cake_sched_data *q = qdisc_priv(sch);
+ u32 mtu = psched_mtu(qdisc_dev(sch));
+ u64 rate = q->rate_bps;
+ u32 quantum1 = 256;
+ u32 quantum2 = 256;
+ u32 i;
+
+ q->tin_cnt = 8;
+
+ /* codepoint to class mapping */
+ q->tin_index = diffserv8;
+ q->tin_order = normal_order;
+
+ /* class characteristics */
+ for (i = 0; i < q->tin_cnt; i++) {
+ struct cake_tin_data *b = &q->tins[i];
+
+ cake_set_rate(b, rate, mtu, us_to_ns(q->target),
+ us_to_ns(q->interval));
+
+ b->tin_quantum_prio = max_t(u16, 1U, quantum1);
+ b->tin_quantum_band = max_t(u16, 1U, quantum2);
+
+ /* calculate next class's parameters */
+ rate *= 7;
+ rate >>= 3;
+
+ quantum1 *= 3;
+ quantum1 >>= 1;
+
+ quantum2 *= 7;
+ quantum2 >>= 3;
+ }
+
+ return 0;
+}
+
+static int cake_config_diffserv4(struct Qdisc *sch)
+{
+/* Further pruned list of traffic classes for four-class system:
+ *
+ * Latency Sensitive (CS7, CS6, EF, VA, CS5, CS4)
+ * Streaming Media (AF4x, AF3x, CS3, AF2x, TOS4, CS2, TOS1)
+ * Best Effort (CS0, AF1x, TOS2, and those not specified)
+ * Background Traffic (CS1)
+ *
+ * Total 4 traffic classes.
+ */
+
+ struct cake_sched_data *q = qdisc_priv(sch);
+ u32 mtu = psched_mtu(qdisc_dev(sch));
+ u64 rate = q->rate_bps;
+ u32 quantum = 1024;
+
+ q->tin_cnt = 4;
+
+ /* codepoint to class mapping */
+ q->tin_index = diffserv4;
+ q->tin_order = bulk_order;
+
+ /* class characteristics */
+ cake_set_rate(&q->tins[0], rate, mtu,
+ us_to_ns(q->target), us_to_ns(q->interval));
+ cake_set_rate(&q->tins[1], rate >> 4, mtu,
+ us_to_ns(q->target), us_to_ns(q->interval));
+ cake_set_rate(&q->tins[2], rate >> 1, mtu,
+ us_to_ns(q->target), us_to_ns(q->interval));
+ cake_set_rate(&q->tins[3], rate >> 2, mtu,
+ us_to_ns(q->target), us_to_ns(q->interval));
+
+ /* priority weights */
+ q->tins[0].tin_quantum_prio = quantum;
+ q->tins[1].tin_quantum_prio = quantum >> 4;
+ q->tins[2].tin_quantum_prio = quantum << 2;
+ q->tins[3].tin_quantum_prio = quantum << 4;
+
+ /* bandwidth-sharing weights */
+ q->tins[0].tin_quantum_band = quantum;
+ q->tins[1].tin_quantum_band = quantum >> 4;
+ q->tins[2].tin_quantum_band = quantum >> 1;
+ q->tins[3].tin_quantum_band = quantum >> 2;
+
+ return 0;
+}
+
+static int cake_config_diffserv3(struct Qdisc *sch)
+{
+/* Simplified Diffserv structure with 3 tins.
+ * Low Priority (CS1)
+ * Best Effort
+ * Latency Sensitive (TOS4, VA, EF, CS6, CS7)
+ */
+ struct cake_sched_data *q = qdisc_priv(sch);
+ u32 mtu = psched_mtu(qdisc_dev(sch));
+ u64 rate = q->rate_bps;
+ u32 quantum = 1024;
+
+ q->tin_cnt = 3;
+
+ /* codepoint to class mapping */
+ q->tin_index = diffserv3;
+ q->tin_order = bulk_order;
+
+ /* class characteristics */
+ cake_set_rate(&q->tins[0], rate, mtu,
+ us_to_ns(q->target), us_to_ns(q->interval));
+ cake_set_rate(&q->tins[1], rate >> 4, mtu,
+ us_to_ns(q->target), us_to_ns(q->interval));
+ cake_set_rate(&q->tins[2], rate >> 2, mtu,
+ us_to_ns(q->target), us_to_ns(q->interval));
+
+ /* priority weights */
+ q->tins[0].tin_quantum_prio = quantum;
+ q->tins[1].tin_quantum_prio = quantum >> 4;
+ q->tins[2].tin_quantum_prio = quantum << 4;
+
+ /* bandwidth-sharing weights */
+ q->tins[0].tin_quantum_band = quantum;
+ q->tins[1].tin_quantum_band = quantum >> 4;
+ q->tins[2].tin_quantum_band = quantum >> 2;
+
+ return 0;
+}
+
+static void cake_reconfigure(struct Qdisc *sch)
+{
+ struct cake_sched_data *q = qdisc_priv(sch);
+ int c, ft;
+
+ switch (q->tin_mode) {
+ case CAKE_DIFFSERV_BESTEFFORT:
+ ft = cake_config_besteffort(sch);
+ break;
+
+ case CAKE_DIFFSERV_PRECEDENCE:
+ ft = cake_config_precedence(sch);
+ break;
+
+ case CAKE_DIFFSERV_DIFFSERV8:
+ ft = cake_config_diffserv8(sch);
+ break;
+
+ case CAKE_DIFFSERV_DIFFSERV4:
+ ft = cake_config_diffserv4(sch);
+ break;
+
+ case CAKE_DIFFSERV_DIFFSERV3:
+ default:
+ ft = cake_config_diffserv3(sch);
+ break;
+ }
+
+ for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) {
+ cake_clear_tin(sch, c);
+ q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time;
+ }
+
+ q->rate_ns = q->tins[ft].tin_rate_ns;
+ q->rate_shft = q->tins[ft].tin_rate_shft;
+
+ if (q->buffer_config_limit) {
+ q->buffer_limit = q->buffer_config_limit;
+ } else if (q->rate_bps) {
+ u64 t = q->rate_bps * q->interval;
+
+ do_div(t, USEC_PER_SEC / 4);
+ q->buffer_limit = max_t(u32, t, 4U << 20);
+ } else {
+ q->buffer_limit = ~0;
+ }
+
+ sch->flags &= ~TCQ_F_CAN_BYPASS;
+
+ q->buffer_limit = min(q->buffer_limit,
+ max(sch->limit * psched_mtu(qdisc_dev(sch)),
+ q->buffer_config_limit));
+}
+
+static int cake_change(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct cake_sched_data *q = qdisc_priv(sch);
+ struct nlattr *tb[TCA_CAKE_MAX + 1];
+ int err;
+
+ if (!opt)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_CAKE_MAX, opt, cake_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_CAKE_NAT]) {
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ q->flow_mode &= ~CAKE_FLOW_NAT_FLAG;
+ q->flow_mode |= CAKE_FLOW_NAT_FLAG *
+ !!nla_get_u32(tb[TCA_CAKE_NAT]);
+#else
+ NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT],
+ "No conntrack support in kernel");
+ return -EOPNOTSUPP;
+#endif
+ }
+
+ if (tb[TCA_CAKE_BASE_RATE64])
+ q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]);
+
+ if (tb[TCA_CAKE_DIFFSERV_MODE])
+ q->tin_mode = nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]);
+
+ if (tb[TCA_CAKE_WASH]) {
+ if (!!nla_get_u32(tb[TCA_CAKE_WASH]))
+ q->rate_flags |= CAKE_FLAG_WASH;
+ else
+ q->rate_flags &= ~CAKE_FLAG_WASH;
+ }
+
+ if (tb[TCA_CAKE_FLOW_MODE])
+ q->flow_mode = ((q->flow_mode & CAKE_FLOW_NAT_FLAG) |
+ (nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) &
+ CAKE_FLOW_MASK));
+
+ if (tb[TCA_CAKE_ATM])
+ q->atm_mode = nla_get_u32(tb[TCA_CAKE_ATM]);
+
+ if (tb[TCA_CAKE_OVERHEAD]) {
+ q->rate_overhead = nla_get_s32(tb[TCA_CAKE_OVERHEAD]);
+ q->rate_flags |= CAKE_FLAG_OVERHEAD;
+
+ q->max_netlen = 0;
+ q->max_adjlen = 0;
+ q->min_netlen = ~0;
+ q->min_adjlen = ~0;
+ }
+
+ if (tb[TCA_CAKE_RAW]) {
+ q->rate_flags &= ~CAKE_FLAG_OVERHEAD;
+
+ q->max_netlen = 0;
+ q->max_adjlen = 0;
+ q->min_netlen = ~0;
+ q->min_adjlen = ~0;
+ }
+
+ if (tb[TCA_CAKE_MPU])
+ q->rate_mpu = nla_get_u32(tb[TCA_CAKE_MPU]);
+
+ if (tb[TCA_CAKE_RTT]) {
+ q->interval = nla_get_u32(tb[TCA_CAKE_RTT]);
+
+ if (!q->interval)
+ q->interval = 1;
+ }
+
+ if (tb[TCA_CAKE_TARGET]) {
+ q->target = nla_get_u32(tb[TCA_CAKE_TARGET]);
+
+ if (!q->target)
+ q->target = 1;
+ }
+
+ if (tb[TCA_CAKE_AUTORATE]) {
+ if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE]))
+ q->rate_flags |= CAKE_FLAG_AUTORATE_INGRESS;
+ else
+ q->rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS;
+ }
+
+ if (tb[TCA_CAKE_INGRESS]) {
+ if (!!nla_get_u32(tb[TCA_CAKE_INGRESS]))
+ q->rate_flags |= CAKE_FLAG_INGRESS;
+ else
+ q->rate_flags &= ~CAKE_FLAG_INGRESS;
+ }
+
+ if (tb[TCA_CAKE_ACK_FILTER])
+ q->ack_filter = nla_get_u32(tb[TCA_CAKE_ACK_FILTER]);
+
+ if (tb[TCA_CAKE_MEMORY])
+ q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]);
+
+ if (tb[TCA_CAKE_SPLIT_GSO]) {
+ if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO]))
+ q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
+ else
+ q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO;
+ }
+
+ if (q->tins) {
+ sch_tree_lock(sch);
+ cake_reconfigure(sch);
+ sch_tree_unlock(sch);
+ }
+
+ return 0;
+}
+
+static void cake_destroy(struct Qdisc *sch)
+{
+ struct cake_sched_data *q = qdisc_priv(sch);
+
+ qdisc_watchdog_cancel(&q->watchdog);
+ tcf_block_put(q->block);
+ kvfree(q->tins);
+}
+
+static int cake_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct cake_sched_data *q = qdisc_priv(sch);
+ int i, j, err;
+
+ sch->limit = 10240;
+ q->tin_mode = CAKE_DIFFSERV_DIFFSERV3;
+ q->flow_mode = CAKE_FLOW_TRIPLE;
+
+ q->rate_bps = 0; /* unlimited by default */
+
+ q->interval = 100000; /* 100ms default */
+ q->target = 5000; /* 5ms: codel RFC argues
+ * for 5 to 10% of interval
+ */
+ q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
+ q->cur_tin = 0;
+ q->cur_flow = 0;
+
+ qdisc_watchdog_init(&q->watchdog, sch);
+
+ if (opt) {
+ int err = cake_change(sch, opt, extack);
+
+ if (err)
+ return err;
+ }
+
+ err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
+ if (err)
+ return err;
+
+ quantum_div[0] = ~0;
+ for (i = 1; i <= CAKE_QUEUES; i++)
+ quantum_div[i] = 65535 / i;
+
+ q->tins = kvcalloc(CAKE_MAX_TINS, sizeof(struct cake_tin_data),
+ GFP_KERNEL);
+ if (!q->tins)
+ goto nomem;
+
+ for (i = 0; i < CAKE_MAX_TINS; i++) {
+ struct cake_tin_data *b = q->tins + i;
+
+ INIT_LIST_HEAD(&b->new_flows);
+ INIT_LIST_HEAD(&b->old_flows);
+ INIT_LIST_HEAD(&b->decaying_flows);
+ b->sparse_flow_count = 0;
+ b->bulk_flow_count = 0;
+ b->decaying_flow_count = 0;
+
+ for (j = 0; j < CAKE_QUEUES; j++) {
+ struct cake_flow *flow = b->flows + j;
+ u32 k = j * CAKE_MAX_TINS + i;
+
+ INIT_LIST_HEAD(&flow->flowchain);
+ cobalt_vars_init(&flow->cvars);
+
+ q->overflow_heap[k].t = i;
+ q->overflow_heap[k].b = j;
+ b->overflow_idx[j] = k;
+ }
+ }
+
+ cake_reconfigure(sch);
+ q->avg_peak_bandwidth = q->rate_bps;
+ q->min_netlen = ~0;
+ q->min_adjlen = ~0;
+ return 0;
+
+nomem:
+ cake_destroy(sch);
+ return -ENOMEM;
+}
+
+static int cake_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct cake_sched_data *q = qdisc_priv(sch);
+ struct nlattr *opts;
+
+ opts = nla_nest_start(skb, TCA_OPTIONS);
+ if (!opts)
+ goto nla_put_failure;
+
+ if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64, q->rate_bps,
+ TCA_CAKE_PAD))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE,
+ q->flow_mode & CAKE_FLOW_MASK))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_CAKE_RTT, q->interval))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_CAKE_TARGET, q->target))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_CAKE_MEMORY, q->buffer_config_limit))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_CAKE_AUTORATE,
+ !!(q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS)))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_CAKE_INGRESS,
+ !!(q->rate_flags & CAKE_FLAG_INGRESS)))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, q->ack_filter))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_CAKE_NAT,
+ !!(q->flow_mode & CAKE_FLOW_NAT_FLAG)))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, q->tin_mode))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_CAKE_WASH,
+ !!(q->rate_flags & CAKE_FLAG_WASH)))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, q->rate_overhead))
+ goto nla_put_failure;
+
+ if (!(q->rate_flags & CAKE_FLAG_OVERHEAD))
+ if (nla_put_u32(skb, TCA_CAKE_RAW, 0))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_CAKE_ATM, q->atm_mode))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_CAKE_MPU, q->rate_mpu))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO,
+ !!(q->rate_flags & CAKE_FLAG_SPLIT_GSO)))
+ goto nla_put_failure;
+
+ return nla_nest_end(skb, opts);
+
+nla_put_failure:
+ return -1;
+}
+
+static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+ struct nlattr *stats = nla_nest_start(d->skb, TCA_STATS_APP);
+ struct cake_sched_data *q = qdisc_priv(sch);
+ struct nlattr *tstats, *ts;
+ int i;
+
+ if (!stats)
+ return -1;
+
+#define PUT_STAT_U32(attr, data) do { \
+ if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
+ goto nla_put_failure; \
+ } while (0)
+#define PUT_STAT_U64(attr, data) do { \
+ if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \
+ data, TCA_CAKE_STATS_PAD)) \
+ goto nla_put_failure; \
+ } while (0)
+
+ PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth);
+ PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit);
+ PUT_STAT_U32(MEMORY_USED, q->buffer_max_used);
+ PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16));
+ PUT_STAT_U32(MAX_NETLEN, q->max_netlen);
+ PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen);
+ PUT_STAT_U32(MIN_NETLEN, q->min_netlen);
+ PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen);
+
+#undef PUT_STAT_U32
+#undef PUT_STAT_U64
+
+ tstats = nla_nest_start(d->skb, TCA_CAKE_STATS_TIN_STATS);
+ if (!tstats)
+ goto nla_put_failure;
+
+#define PUT_TSTAT_U32(attr, data) do { \
+ if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \
+ goto nla_put_failure; \
+ } while (0)
+#define PUT_TSTAT_U64(attr, data) do { \
+ if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \
+ data, TCA_CAKE_TIN_STATS_PAD)) \
+ goto nla_put_failure; \
+ } while (0)
+
+ for (i = 0; i < q->tin_cnt; i++) {
+ struct cake_tin_data *b = &q->tins[q->tin_order[i]];
+
+ ts = nla_nest_start(d->skb, i + 1);
+ if (!ts)
+ goto nla_put_failure;
+
+ PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps);
+ PUT_TSTAT_U64(SENT_BYTES64, b->bytes);
+ PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog);
+
+ PUT_TSTAT_U32(TARGET_US,
+ ktime_to_us(ns_to_ktime(b->cparams.target)));
+ PUT_TSTAT_U32(INTERVAL_US,
+ ktime_to_us(ns_to_ktime(b->cparams.interval)));
+
+ PUT_TSTAT_U32(SENT_PACKETS, b->packets);
+ PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped);
+ PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark);
+ PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops);
+
+ PUT_TSTAT_U32(PEAK_DELAY_US,
+ ktime_to_us(ns_to_ktime(b->peak_delay)));
+ PUT_TSTAT_U32(AVG_DELAY_US,
+ ktime_to_us(ns_to_ktime(b->avge_delay)));
+ PUT_TSTAT_U32(BASE_DELAY_US,
+ ktime_to_us(ns_to_ktime(b->base_delay)));
+
+ PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits);
+ PUT_TSTAT_U32(WAY_MISSES, b->way_misses);
+ PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions);
+
+ PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count +
+ b->decaying_flow_count);
+ PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count);
+ PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count);
+ PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen);
+
+ PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum);
+ nla_nest_end(d->skb, ts);
+ }
+
+#undef PUT_TSTAT_U32
+#undef PUT_TSTAT_U64
+
+ nla_nest_end(d->skb, tstats);
+ return nla_nest_end(d->skb, stats);
+
+nla_put_failure:
+ nla_nest_cancel(d->skb, stats);
+ return -1;
+}
+
+static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ return NULL;
+}
+
+static unsigned long cake_find(struct Qdisc *sch, u32 classid)
+{
+ return 0;
+}
+
+static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent,
+ u32 classid)
+{
+ return 0;
+}
+
+static void cake_unbind(struct Qdisc *q, unsigned long cl)
+{
+}
+
+static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl,
+ struct netlink_ext_ack *extack)
+{
+ struct cake_sched_data *q = qdisc_priv(sch);
+
+ if (cl)
+ return NULL;
+ return q->block;
+}
+
+static int cake_dump_class(struct Qdisc *sch, unsigned long cl,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ tcm->tcm_handle |= TC_H_MIN(cl);
+ return 0;
+}
+
+static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl,
+ struct gnet_dump *d)
+{
+ struct cake_sched_data *q = qdisc_priv(sch);
+ const struct cake_flow *flow = NULL;
+ struct gnet_stats_queue qs = { 0 };
+ struct nlattr *stats;
+ u32 idx = cl - 1;
+
+ if (idx < CAKE_QUEUES * q->tin_cnt) {
+ const struct cake_tin_data *b = \
+ &q->tins[q->tin_order[idx / CAKE_QUEUES]];
+ const struct sk_buff *skb;
+
+ flow = &b->flows[idx % CAKE_QUEUES];
+
+ if (flow->head) {
+ sch_tree_lock(sch);
+ skb = flow->head;
+ while (skb) {
+ qs.qlen++;
+ skb = skb->next;
+ }
+ sch_tree_unlock(sch);
+ }
+ qs.backlog = b->backlogs[idx % CAKE_QUEUES];
+ qs.drops = flow->dropped;
+ }
+ if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
+ return -1;
+ if (flow) {
+ ktime_t now = ktime_get();
+
+ stats = nla_nest_start(d->skb, TCA_STATS_APP);
+ if (!stats)
+ return -1;
+
+#define PUT_STAT_U32(attr, data) do { \
+ if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
+ goto nla_put_failure; \
+ } while (0)
+#define PUT_STAT_S32(attr, data) do { \
+ if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
+ goto nla_put_failure; \
+ } while (0)
+
+ PUT_STAT_S32(DEFICIT, flow->deficit);
+ PUT_STAT_U32(DROPPING, flow->cvars.dropping);
+ PUT_STAT_U32(COBALT_COUNT, flow->cvars.count);
+ PUT_STAT_U32(P_DROP, flow->cvars.p_drop);
+ if (flow->cvars.p_drop) {
+ PUT_STAT_S32(BLUE_TIMER_US,
+ ktime_to_us(
+ ktime_sub(now,
+ flow->cvars.blue_timer)));
+ }
+ if (flow->cvars.dropping) {
+ PUT_STAT_S32(DROP_NEXT_US,
+ ktime_to_us(
+ ktime_sub(now,
+ flow->cvars.drop_next)));
+ }
+
+ if (nla_nest_end(d->skb, stats) < 0)
+ return -1;
+ }
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(d->skb, stats);
+ return -1;
+}
+
+static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+ struct cake_sched_data *q = qdisc_priv(sch);
+ unsigned int i, j;
+
+ if (arg->stop)
+ return;
+
+ for (i = 0; i < q->tin_cnt; i++) {
+ struct cake_tin_data *b = &q->tins[q->tin_order[i]];
+
+ for (j = 0; j < CAKE_QUEUES; j++) {
+ if (list_empty(&b->flows[j].flowchain) ||
+ arg->count < arg->skip) {
+ arg->count++;
+ continue;
+ }
+ if (arg->fn(sch, i * CAKE_QUEUES + j + 1, arg) < 0) {
+ arg->stop = 1;
+ break;
+ }
+ arg->count++;
+ }
+ }
+}
+
+static const struct Qdisc_class_ops cake_class_ops = {
+ .leaf = cake_leaf,
+ .find = cake_find,
+ .tcf_block = cake_tcf_block,
+ .bind_tcf = cake_bind,
+ .unbind_tcf = cake_unbind,
+ .dump = cake_dump_class,
+ .dump_stats = cake_dump_class_stats,
+ .walk = cake_walk,
+};
+
+static struct Qdisc_ops cake_qdisc_ops __read_mostly = {
+ .cl_ops = &cake_class_ops,
+ .id = "cake",
+ .priv_size = sizeof(struct cake_sched_data),
+ .enqueue = cake_enqueue,
+ .dequeue = cake_dequeue,
+ .peek = qdisc_peek_dequeued,
+ .init = cake_init,
+ .reset = cake_reset,
+ .destroy = cake_destroy,
+ .change = cake_change,
+ .dump = cake_dump,
+ .dump_stats = cake_dump_stats,
+ .owner = THIS_MODULE,
+};
+
+static int __init cake_module_init(void)
+{
+ return register_qdisc(&cake_qdisc_ops);
+}
+
+static void __exit cake_module_exit(void)
+{
+ unregister_qdisc(&cake_qdisc_ops);
+}
+
+module_init(cake_module_init)
+module_exit(cake_module_exit)
+MODULE_AUTHOR("Jonathan Morton");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("The CAKE shaper.");
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index f42025d53cfe..4dc05409e3fb 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -1418,7 +1418,7 @@ static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
WARN_ON(cl->filters);
tcf_block_put(cl->block);
- qdisc_destroy(cl->q);
+ qdisc_put(cl->q);
qdisc_put_rtab(cl->R_tab);
gen_kill_estimator(&cl->rate_est);
if (cl != &q->link)
diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c
index cdd96b9a27bc..e689e11b6d0f 100644
--- a/net/sched/sch_cbs.c
+++ b/net/sched/sch_cbs.c
@@ -78,18 +78,42 @@ struct cbs_sched_data {
s64 sendslope; /* in bytes/s */
s64 idleslope; /* in bytes/s */
struct qdisc_watchdog watchdog;
- int (*enqueue)(struct sk_buff *skb, struct Qdisc *sch);
+ int (*enqueue)(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free);
struct sk_buff *(*dequeue)(struct Qdisc *sch);
+ struct Qdisc *qdisc;
};
-static int cbs_enqueue_offload(struct sk_buff *skb, struct Qdisc *sch)
+static int cbs_child_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct Qdisc *child,
+ struct sk_buff **to_free)
{
- return qdisc_enqueue_tail(skb, sch);
+ int err;
+
+ err = child->ops->enqueue(skb, child, to_free);
+ if (err != NET_XMIT_SUCCESS)
+ return err;
+
+ qdisc_qstats_backlog_inc(sch, skb);
+ sch->q.qlen++;
+
+ return NET_XMIT_SUCCESS;
}
-static int cbs_enqueue_soft(struct sk_buff *skb, struct Qdisc *sch)
+static int cbs_enqueue_offload(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
{
struct cbs_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *qdisc = q->qdisc;
+
+ return cbs_child_enqueue(skb, sch, qdisc, to_free);
+}
+
+static int cbs_enqueue_soft(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+ struct cbs_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *qdisc = q->qdisc;
if (sch->q.qlen == 0 && q->credits > 0) {
/* We need to stop accumulating credits when there's
@@ -99,7 +123,7 @@ static int cbs_enqueue_soft(struct sk_buff *skb, struct Qdisc *sch)
q->last = ktime_get_ns();
}
- return qdisc_enqueue_tail(skb, sch);
+ return cbs_child_enqueue(skb, sch, qdisc, to_free);
}
static int cbs_enqueue(struct sk_buff *skb, struct Qdisc *sch,
@@ -107,7 +131,7 @@ static int cbs_enqueue(struct sk_buff *skb, struct Qdisc *sch,
{
struct cbs_sched_data *q = qdisc_priv(sch);
- return q->enqueue(skb, sch);
+ return q->enqueue(skb, sch, to_free);
}
/* timediff is in ns, slope is in bytes/s */
@@ -132,9 +156,25 @@ static s64 credits_from_len(unsigned int len, s64 slope, s64 port_rate)
return div64_s64(len * slope, port_rate);
}
+static struct sk_buff *cbs_child_dequeue(struct Qdisc *sch, struct Qdisc *child)
+{
+ struct sk_buff *skb;
+
+ skb = child->ops->dequeue(child);
+ if (!skb)
+ return NULL;
+
+ qdisc_qstats_backlog_dec(sch, skb);
+ qdisc_bstats_update(sch, skb);
+ sch->q.qlen--;
+
+ return skb;
+}
+
static struct sk_buff *cbs_dequeue_soft(struct Qdisc *sch)
{
struct cbs_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *qdisc = q->qdisc;
s64 now = ktime_get_ns();
struct sk_buff *skb;
s64 credits;
@@ -157,8 +197,7 @@ static struct sk_buff *cbs_dequeue_soft(struct Qdisc *sch)
return NULL;
}
}
-
- skb = qdisc_dequeue_head(sch);
+ skb = cbs_child_dequeue(sch, qdisc);
if (!skb)
return NULL;
@@ -178,7 +217,10 @@ static struct sk_buff *cbs_dequeue_soft(struct Qdisc *sch)
static struct sk_buff *cbs_dequeue_offload(struct Qdisc *sch)
{
- return qdisc_dequeue_head(sch);
+ struct cbs_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *qdisc = q->qdisc;
+
+ return cbs_child_dequeue(sch, qdisc);
}
static struct sk_buff *cbs_dequeue(struct Qdisc *sch)
@@ -310,6 +352,13 @@ static int cbs_init(struct Qdisc *sch, struct nlattr *opt,
return -EINVAL;
}
+ q->qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+ sch->handle, extack);
+ if (!q->qdisc)
+ return -ENOMEM;
+
+ qdisc_hash_add(q->qdisc, false);
+
q->queue = sch->dev_queue - netdev_get_tx_queue(dev, 0);
q->enqueue = cbs_enqueue_soft;
@@ -328,6 +377,9 @@ static void cbs_destroy(struct Qdisc *sch)
qdisc_watchdog_cancel(&q->watchdog);
cbs_disable_offload(dev, q);
+
+ if (q->qdisc)
+ qdisc_put(q->qdisc);
}
static int cbs_dump(struct Qdisc *sch, struct sk_buff *skb)
@@ -356,8 +408,72 @@ nla_put_failure:
return -1;
}
+static int cbs_dump_class(struct Qdisc *sch, unsigned long cl,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ struct cbs_sched_data *q = qdisc_priv(sch);
+
+ if (cl != 1 || !q->qdisc) /* only one class */
+ return -ENOENT;
+
+ tcm->tcm_handle |= TC_H_MIN(1);
+ tcm->tcm_info = q->qdisc->handle;
+
+ return 0;
+}
+
+static int cbs_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
+ struct Qdisc **old, struct netlink_ext_ack *extack)
+{
+ struct cbs_sched_data *q = qdisc_priv(sch);
+
+ if (!new) {
+ new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+ sch->handle, NULL);
+ if (!new)
+ new = &noop_qdisc;
+ }
+
+ *old = qdisc_replace(sch, new, &q->qdisc);
+ return 0;
+}
+
+static struct Qdisc *cbs_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ struct cbs_sched_data *q = qdisc_priv(sch);
+
+ return q->qdisc;
+}
+
+static unsigned long cbs_find(struct Qdisc *sch, u32 classid)
+{
+ return 1;
+}
+
+static void cbs_walk(struct Qdisc *sch, struct qdisc_walker *walker)
+{
+ if (!walker->stop) {
+ if (walker->count >= walker->skip) {
+ if (walker->fn(sch, 1, walker) < 0) {
+ walker->stop = 1;
+ return;
+ }
+ }
+ walker->count++;
+ }
+}
+
+static const struct Qdisc_class_ops cbs_class_ops = {
+ .graft = cbs_graft,
+ .leaf = cbs_leaf,
+ .find = cbs_find,
+ .walk = cbs_walk,
+ .dump = cbs_dump_class,
+};
+
static struct Qdisc_ops cbs_qdisc_ops __read_mostly = {
.id = "cbs",
+ .cl_ops = &cbs_class_ops,
.priv_size = sizeof(struct cbs_sched_data),
.enqueue = cbs_enqueue,
.dequeue = cbs_dequeue,
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index e0b0cf8a9939..cdebaed0f8cf 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -134,7 +134,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
tca[TCA_RATE]);
if (err) {
NL_SET_ERR_MSG(extack, "Failed to replace estimator");
- qdisc_destroy(cl->qdisc);
+ qdisc_put(cl->qdisc);
kfree(cl);
return err;
}
@@ -153,7 +153,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
static void drr_destroy_class(struct Qdisc *sch, struct drr_class *cl)
{
gen_kill_estimator(&cl->rate_est);
- qdisc_destroy(cl->qdisc);
+ qdisc_put(cl->qdisc);
kfree(cl);
}
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index 049714c57075..f6f480784bc6 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -412,7 +412,7 @@ static void dsmark_destroy(struct Qdisc *sch)
pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);
tcf_block_put(p->block);
- qdisc_destroy(p->q);
+ qdisc_put(p->q);
if (p->mv != p->embedded)
kfree(p->mv);
}
diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c
new file mode 100644
index 000000000000..1538d6fa8165
--- /dev/null
+++ b/net/sched/sch_etf.c
@@ -0,0 +1,484 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/* net/sched/sch_etf.c Earliest TxTime First queueing discipline.
+ *
+ * Authors: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>
+ * Vinicius Costa Gomes <vinicius.gomes@intel.com>
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/errqueue.h>
+#include <linux/rbtree.h>
+#include <linux/skbuff.h>
+#include <linux/posix-timers.h>
+#include <net/netlink.h>
+#include <net/sch_generic.h>
+#include <net/pkt_sched.h>
+#include <net/sock.h>
+
+#define DEADLINE_MODE_IS_ON(x) ((x)->flags & TC_ETF_DEADLINE_MODE_ON)
+#define OFFLOAD_IS_ON(x) ((x)->flags & TC_ETF_OFFLOAD_ON)
+
+struct etf_sched_data {
+ bool offload;
+ bool deadline_mode;
+ int clockid;
+ int queue;
+ s32 delta; /* in ns */
+ ktime_t last; /* The txtime of the last skb sent to the netdevice. */
+ struct rb_root head;
+ struct qdisc_watchdog watchdog;
+ ktime_t (*get_time)(void);
+};
+
+static const struct nla_policy etf_policy[TCA_ETF_MAX + 1] = {
+ [TCA_ETF_PARMS] = { .len = sizeof(struct tc_etf_qopt) },
+};
+
+static inline int validate_input_params(struct tc_etf_qopt *qopt,
+ struct netlink_ext_ack *extack)
+{
+ /* Check if params comply to the following rules:
+ * * Clockid and delta must be valid.
+ *
+ * * Dynamic clockids are not supported.
+ *
+ * * Delta must be a positive integer.
+ *
+ * Also note that for the HW offload case, we must
+ * expect that system clocks have been synchronized to PHC.
+ */
+ if (qopt->clockid < 0) {
+ NL_SET_ERR_MSG(extack, "Dynamic clockids are not supported");
+ return -ENOTSUPP;
+ }
+
+ if (qopt->clockid != CLOCK_TAI) {
+ NL_SET_ERR_MSG(extack, "Invalid clockid. CLOCK_TAI must be used");
+ return -EINVAL;
+ }
+
+ if (qopt->delta < 0) {
+ NL_SET_ERR_MSG(extack, "Delta must be positive");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static bool is_packet_valid(struct Qdisc *sch, struct sk_buff *nskb)
+{
+ struct etf_sched_data *q = qdisc_priv(sch);
+ ktime_t txtime = nskb->tstamp;
+ struct sock *sk = nskb->sk;
+ ktime_t now;
+
+ if (!sk)
+ return false;
+
+ if (!sock_flag(sk, SOCK_TXTIME))
+ return false;
+
+ /* We don't perform crosstimestamping.
+ * Drop if packet's clockid differs from qdisc's.
+ */
+ if (sk->sk_clockid != q->clockid)
+ return false;
+
+ if (sk->sk_txtime_deadline_mode != q->deadline_mode)
+ return false;
+
+ now = q->get_time();
+ if (ktime_before(txtime, now) || ktime_before(txtime, q->last))
+ return false;
+
+ return true;
+}
+
+static struct sk_buff *etf_peek_timesortedlist(struct Qdisc *sch)
+{
+ struct etf_sched_data *q = qdisc_priv(sch);
+ struct rb_node *p;
+
+ p = rb_first(&q->head);
+ if (!p)
+ return NULL;
+
+ return rb_to_skb(p);
+}
+
+static void reset_watchdog(struct Qdisc *sch)
+{
+ struct etf_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb = etf_peek_timesortedlist(sch);
+ ktime_t next;
+
+ if (!skb)
+ return;
+
+ next = ktime_sub_ns(skb->tstamp, q->delta);
+ qdisc_watchdog_schedule_ns(&q->watchdog, ktime_to_ns(next));
+}
+
+static void report_sock_error(struct sk_buff *skb, u32 err, u8 code)
+{
+ struct sock_exterr_skb *serr;
+ struct sk_buff *clone;
+ ktime_t txtime = skb->tstamp;
+
+ if (!skb->sk || !(skb->sk->sk_txtime_report_errors))
+ return;
+
+ clone = skb_clone(skb, GFP_ATOMIC);
+ if (!clone)
+ return;
+
+ serr = SKB_EXT_ERR(clone);
+ serr->ee.ee_errno = err;
+ serr->ee.ee_origin = SO_EE_ORIGIN_TXTIME;
+ serr->ee.ee_type = 0;
+ serr->ee.ee_code = code;
+ serr->ee.ee_pad = 0;
+ serr->ee.ee_data = (txtime >> 32); /* high part of tstamp */
+ serr->ee.ee_info = txtime; /* low part of tstamp */
+
+ if (sock_queue_err_skb(skb->sk, clone))
+ kfree_skb(clone);
+}
+
+static int etf_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+ struct etf_sched_data *q = qdisc_priv(sch);
+ struct rb_node **p = &q->head.rb_node, *parent = NULL;
+ ktime_t txtime = nskb->tstamp;
+
+ if (!is_packet_valid(sch, nskb)) {
+ report_sock_error(nskb, EINVAL,
+ SO_EE_CODE_TXTIME_INVALID_PARAM);
+ return qdisc_drop(nskb, sch, to_free);
+ }
+
+ while (*p) {
+ struct sk_buff *skb;
+
+ parent = *p;
+ skb = rb_to_skb(parent);
+ if (ktime_after(txtime, skb->tstamp))
+ p = &parent->rb_right;
+ else
+ p = &parent->rb_left;
+ }
+ rb_link_node(&nskb->rbnode, parent, p);
+ rb_insert_color(&nskb->rbnode, &q->head);
+
+ qdisc_qstats_backlog_inc(sch, nskb);
+ sch->q.qlen++;
+
+ /* Now we may need to re-arm the qdisc watchdog for the next packet. */
+ reset_watchdog(sch);
+
+ return NET_XMIT_SUCCESS;
+}
+
+static void timesortedlist_erase(struct Qdisc *sch, struct sk_buff *skb,
+ bool drop)
+{
+ struct etf_sched_data *q = qdisc_priv(sch);
+
+ rb_erase(&skb->rbnode, &q->head);
+
+ /* The rbnode field in the skb re-uses these fields, now that
+ * we are done with the rbnode, reset them.
+ */
+ skb->next = NULL;
+ skb->prev = NULL;
+ skb->dev = qdisc_dev(sch);
+
+ qdisc_qstats_backlog_dec(sch, skb);
+
+ if (drop) {
+ struct sk_buff *to_free = NULL;
+
+ report_sock_error(skb, ECANCELED, SO_EE_CODE_TXTIME_MISSED);
+
+ qdisc_drop(skb, sch, &to_free);
+ kfree_skb_list(to_free);
+ qdisc_qstats_overlimit(sch);
+ } else {
+ qdisc_bstats_update(sch, skb);
+
+ q->last = skb->tstamp;
+ }
+
+ sch->q.qlen--;
+}
+
+static struct sk_buff *etf_dequeue_timesortedlist(struct Qdisc *sch)
+{
+ struct etf_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb;
+ ktime_t now, next;
+
+ skb = etf_peek_timesortedlist(sch);
+ if (!skb)
+ return NULL;
+
+ now = q->get_time();
+
+ /* Drop if packet has expired while in queue. */
+ if (ktime_before(skb->tstamp, now)) {
+ timesortedlist_erase(sch, skb, true);
+ skb = NULL;
+ goto out;
+ }
+
+ /* When in deadline mode, dequeue as soon as possible and change the
+ * txtime from deadline to (now + delta).
+ */
+ if (q->deadline_mode) {
+ timesortedlist_erase(sch, skb, false);
+ skb->tstamp = now;
+ goto out;
+ }
+
+ next = ktime_sub_ns(skb->tstamp, q->delta);
+
+ /* Dequeue only if now is within the [txtime - delta, txtime] range. */
+ if (ktime_after(now, next))
+ timesortedlist_erase(sch, skb, false);
+ else
+ skb = NULL;
+
+out:
+ /* Now we may need to re-arm the qdisc watchdog for the next packet. */
+ reset_watchdog(sch);
+
+ return skb;
+}
+
+static void etf_disable_offload(struct net_device *dev,
+ struct etf_sched_data *q)
+{
+ struct tc_etf_qopt_offload etf = { };
+ const struct net_device_ops *ops;
+ int err;
+
+ if (!q->offload)
+ return;
+
+ ops = dev->netdev_ops;
+ if (!ops->ndo_setup_tc)
+ return;
+
+ etf.queue = q->queue;
+ etf.enable = 0;
+
+ err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_ETF, &etf);
+ if (err < 0)
+ pr_warn("Couldn't disable ETF offload for queue %d\n",
+ etf.queue);
+}
+
+static int etf_enable_offload(struct net_device *dev, struct etf_sched_data *q,
+ struct netlink_ext_ack *extack)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+ struct tc_etf_qopt_offload etf = { };
+ int err;
+
+ if (q->offload)
+ return 0;
+
+ if (!ops->ndo_setup_tc) {
+ NL_SET_ERR_MSG(extack, "Specified device does not support ETF offload");
+ return -EOPNOTSUPP;
+ }
+
+ etf.queue = q->queue;
+ etf.enable = 1;
+
+ err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_ETF, &etf);
+ if (err < 0) {
+ NL_SET_ERR_MSG(extack, "Specified device failed to setup ETF hardware offload");
+ return err;
+ }
+
+ return 0;
+}
+
+static int etf_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct etf_sched_data *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ struct nlattr *tb[TCA_ETF_MAX + 1];
+ struct tc_etf_qopt *qopt;
+ int err;
+
+ if (!opt) {
+ NL_SET_ERR_MSG(extack,
+ "Missing ETF qdisc options which are mandatory");
+ return -EINVAL;
+ }
+
+ err = nla_parse_nested(tb, TCA_ETF_MAX, opt, etf_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (!tb[TCA_ETF_PARMS]) {
+ NL_SET_ERR_MSG(extack, "Missing mandatory ETF parameters");
+ return -EINVAL;
+ }
+
+ qopt = nla_data(tb[TCA_ETF_PARMS]);
+
+ pr_debug("delta %d clockid %d offload %s deadline %s\n",
+ qopt->delta, qopt->clockid,
+ OFFLOAD_IS_ON(qopt) ? "on" : "off",
+ DEADLINE_MODE_IS_ON(qopt) ? "on" : "off");
+
+ err = validate_input_params(qopt, extack);
+ if (err < 0)
+ return err;
+
+ q->queue = sch->dev_queue - netdev_get_tx_queue(dev, 0);
+
+ if (OFFLOAD_IS_ON(qopt)) {
+ err = etf_enable_offload(dev, q, extack);
+ if (err < 0)
+ return err;
+ }
+
+ /* Everything went OK, save the parameters used. */
+ q->delta = qopt->delta;
+ q->clockid = qopt->clockid;
+ q->offload = OFFLOAD_IS_ON(qopt);
+ q->deadline_mode = DEADLINE_MODE_IS_ON(qopt);
+
+ switch (q->clockid) {
+ case CLOCK_REALTIME:
+ q->get_time = ktime_get_real;
+ break;
+ case CLOCK_MONOTONIC:
+ q->get_time = ktime_get;
+ break;
+ case CLOCK_BOOTTIME:
+ q->get_time = ktime_get_boottime;
+ break;
+ case CLOCK_TAI:
+ q->get_time = ktime_get_clocktai;
+ break;
+ default:
+ NL_SET_ERR_MSG(extack, "Clockid is not supported");
+ return -ENOTSUPP;
+ }
+
+ qdisc_watchdog_init_clockid(&q->watchdog, sch, q->clockid);
+
+ return 0;
+}
+
+static void timesortedlist_clear(struct Qdisc *sch)
+{
+ struct etf_sched_data *q = qdisc_priv(sch);
+ struct rb_node *p = rb_first(&q->head);
+
+ while (p) {
+ struct sk_buff *skb = rb_to_skb(p);
+
+ p = rb_next(p);
+
+ rb_erase(&skb->rbnode, &q->head);
+ rtnl_kfree_skbs(skb, skb);
+ sch->q.qlen--;
+ }
+}
+
+static void etf_reset(struct Qdisc *sch)
+{
+ struct etf_sched_data *q = qdisc_priv(sch);
+
+ /* Only cancel watchdog if it's been initialized. */
+ if (q->watchdog.qdisc == sch)
+ qdisc_watchdog_cancel(&q->watchdog);
+
+ /* No matter which mode we are on, it's safe to clear both lists. */
+ timesortedlist_clear(sch);
+ __qdisc_reset_queue(&sch->q);
+
+ sch->qstats.backlog = 0;
+ sch->q.qlen = 0;
+
+ q->last = 0;
+}
+
+static void etf_destroy(struct Qdisc *sch)
+{
+ struct etf_sched_data *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+
+ /* Only cancel watchdog if it's been initialized. */
+ if (q->watchdog.qdisc == sch)
+ qdisc_watchdog_cancel(&q->watchdog);
+
+ etf_disable_offload(dev, q);
+}
+
+static int etf_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct etf_sched_data *q = qdisc_priv(sch);
+ struct tc_etf_qopt opt = { };
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (!nest)
+ goto nla_put_failure;
+
+ opt.delta = q->delta;
+ opt.clockid = q->clockid;
+ if (q->offload)
+ opt.flags |= TC_ETF_OFFLOAD_ON;
+
+ if (q->deadline_mode)
+ opt.flags |= TC_ETF_DEADLINE_MODE_ON;
+
+ if (nla_put(skb, TCA_ETF_PARMS, sizeof(opt), &opt))
+ goto nla_put_failure;
+
+ return nla_nest_end(skb, nest);
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
+}
+
+static struct Qdisc_ops etf_qdisc_ops __read_mostly = {
+ .id = "etf",
+ .priv_size = sizeof(struct etf_sched_data),
+ .enqueue = etf_enqueue_timesortedlist,
+ .dequeue = etf_dequeue_timesortedlist,
+ .peek = etf_peek_timesortedlist,
+ .init = etf_init,
+ .reset = etf_reset,
+ .destroy = etf_destroy,
+ .dump = etf_dump,
+ .owner = THIS_MODULE,
+};
+
+static int __init etf_module_init(void)
+{
+ return register_qdisc(&etf_qdisc_ops);
+}
+
+static void __exit etf_module_exit(void)
+{
+ unregister_qdisc(&etf_qdisc_ops);
+}
+module_init(etf_module_init)
+module_exit(etf_module_exit)
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c
index 24893d3b5d22..3809c9bf8896 100644
--- a/net/sched/sch_fifo.c
+++ b/net/sched/sch_fifo.c
@@ -177,7 +177,7 @@ struct Qdisc *fifo_create_dflt(struct Qdisc *sch, struct Qdisc_ops *ops,
if (q) {
err = fifo_set_limit(q, limit);
if (err < 0) {
- qdisc_destroy(q);
+ qdisc_put(q);
q = NULL;
}
}
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 4808713c73b9..4b1af706896c 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -92,8 +92,8 @@ struct fq_sched_data {
u32 quantum;
u32 initial_quantum;
u32 flow_refill_delay;
- u32 flow_max_rate; /* optional max rate per flow */
u32 flow_plimit; /* max packets per flow */
+ unsigned long flow_max_rate; /* optional max rate per flow */
u32 orphan_mask; /* mask for orphaned skb */
u32 low_rate_threshold;
struct rb_root *fq_root;
@@ -106,7 +106,6 @@ struct fq_sched_data {
u64 stat_gc_flows;
u64 stat_internal_packets;
- u64 stat_tcp_retrans;
u64 stat_throttled;
u64 stat_flows_plimit;
u64 stat_pkts_too_long;
@@ -319,7 +318,7 @@ static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow)
if (skb) {
flow->head = skb->next;
- skb->next = NULL;
+ skb_mark_not_on_list(skb);
flow->qlen--;
qdisc_qstats_backlog_dec(sch, skb);
sch->q.qlen--;
@@ -327,62 +326,17 @@ static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow)
return skb;
}
-/* We might add in the future detection of retransmits
- * For the time being, just return false
- */
-static bool skb_is_retransmit(struct sk_buff *skb)
-{
- return false;
-}
-
-/* add skb to flow queue
- * flow queue is a linked list, kind of FIFO, except for TCP retransmits
- * We special case tcp retransmits to be transmitted before other packets.
- * We rely on fact that TCP retransmits are unlikely, so we do not waste
- * a separate queue or a pointer.
- * head-> [retrans pkt 1]
- * [retrans pkt 2]
- * [ normal pkt 1]
- * [ normal pkt 2]
- * [ normal pkt 3]
- * tail-> [ normal pkt 4]
- */
static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb)
{
- struct sk_buff *prev, *head = flow->head;
+ struct sk_buff *head = flow->head;
skb->next = NULL;
- if (!head) {
+ if (!head)
flow->head = skb;
- flow->tail = skb;
- return;
- }
- if (likely(!skb_is_retransmit(skb))) {
+ else
flow->tail->next = skb;
- flow->tail = skb;
- return;
- }
- /* This skb is a tcp retransmit,
- * find the last retrans packet in the queue
- */
- prev = NULL;
- while (skb_is_retransmit(head)) {
- prev = head;
- head = head->next;
- if (!head)
- break;
- }
- if (!prev) { /* no rtx packet in queue, become the new head */
- skb->next = flow->head;
- flow->head = skb;
- } else {
- if (prev == flow->tail)
- flow->tail = skb;
- else
- skb->next = prev->next;
- prev->next = skb;
- }
+ flow->tail = skb;
}
static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
@@ -401,8 +355,6 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
}
f->qlen++;
- if (skb_is_retransmit(skb))
- q->stat_tcp_retrans++;
qdisc_qstats_backlog_inc(sch, skb);
if (fq_flow_is_detached(f)) {
struct sock *sk = skb->sk;
@@ -464,7 +416,8 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
struct fq_flow_head *head;
struct sk_buff *skb;
struct fq_flow *f;
- u32 rate, plen;
+ unsigned long rate;
+ u32 plen;
skb = fq_dequeue_head(sch, &q->internal);
if (skb)
@@ -491,11 +444,16 @@ begin:
}
skb = f->head;
- if (unlikely(skb && now < f->time_next_packet &&
- !skb_is_tcp_pure_ack(skb))) {
- head->first = f->next;
- fq_flow_set_throttled(q, f);
- goto begin;
+ if (skb) {
+ u64 time_next_packet = max_t(u64, ktime_to_ns(skb->tstamp),
+ f->time_next_packet);
+
+ if (now < time_next_packet) {
+ head->first = f->next;
+ f->time_next_packet = time_next_packet;
+ fq_flow_set_throttled(q, f);
+ goto begin;
+ }
}
skb = fq_dequeue_head(sch, f);
@@ -513,11 +471,7 @@ begin:
prefetch(&skb->end);
f->credit -= qdisc_pkt_len(skb);
- if (!q->rate_enable)
- goto out;
-
- /* Do not pace locally generated ack packets */
- if (skb_is_tcp_pure_ack(skb))
+ if (ktime_to_ns(skb->tstamp) || !q->rate_enable)
goto out;
rate = q->flow_max_rate;
@@ -532,11 +486,11 @@ begin:
if (f->credit > 0)
goto out;
}
- if (rate != ~0U) {
+ if (rate != ~0UL) {
u64 len = (u64)plen * NSEC_PER_SEC;
if (likely(rate))
- do_div(len, rate);
+ len = div64_ul(len, rate);
/* Since socket rate can change later,
* clamp the delay to 1 second.
* Really, providers of too big packets should be fixed !
@@ -748,9 +702,11 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt,
pr_warn_ratelimited("sch_fq: defrate %u ignored.\n",
nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE]));
- if (tb[TCA_FQ_FLOW_MAX_RATE])
- q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);
+ if (tb[TCA_FQ_FLOW_MAX_RATE]) {
+ u32 rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);
+ q->flow_max_rate = (rate == ~0U) ? ~0UL : rate;
+ }
if (tb[TCA_FQ_LOW_RATE_THRESHOLD])
q->low_rate_threshold =
nla_get_u32(tb[TCA_FQ_LOW_RATE_THRESHOLD]);
@@ -813,7 +769,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt,
q->quantum = 2 * psched_mtu(qdisc_dev(sch));
q->initial_quantum = 10 * psched_mtu(qdisc_dev(sch));
q->flow_refill_delay = msecs_to_jiffies(40);
- q->flow_max_rate = ~0U;
+ q->flow_max_rate = ~0UL;
q->time_next_delayed_flow = ~0ULL;
q->rate_enable = 1;
q->new_flows.first = NULL;
@@ -823,7 +779,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt,
q->fq_trees_log = ilog2(1024);
q->orphan_mask = 1024 - 1;
q->low_rate_threshold = 550000 / 8;
- qdisc_watchdog_init(&q->watchdog, sch);
+ qdisc_watchdog_init_clockid(&q->watchdog, sch, CLOCK_MONOTONIC);
if (opt)
err = fq_change(sch, opt, extack);
@@ -849,7 +805,8 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) ||
nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) ||
nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) ||
- nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE, q->flow_max_rate) ||
+ nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE,
+ min_t(unsigned long, q->flow_max_rate, ~0U)) ||
nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY,
jiffies_to_usecs(q->flow_refill_delay)) ||
nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) ||
@@ -873,7 +830,7 @@ static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
st.gc_flows = q->stat_gc_flows;
st.highprio_packets = q->stat_internal_packets;
- st.tcp_retrans = q->stat_tcp_retrans;
+ st.tcp_retrans = 0;
st.throttled = q->stat_throttled;
st.flows_plimit = q->stat_flows_plimit;
st.pkts_too_long = q->stat_pkts_too_long;
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index 6c0a9d5dbf94..cd04d40c30b6 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -124,7 +124,7 @@ static inline struct sk_buff *dequeue_head(struct fq_codel_flow *flow)
struct sk_buff *skb = flow->head;
flow->head = skb->next;
- skb->next = NULL;
+ skb_mark_not_on_list(skb);
return skb;
}
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 69078c82963e..de1663f7d3ad 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -184,7 +184,7 @@ static void try_bulk_dequeue_skb(struct Qdisc *q,
skb = nskb;
(*packets)++; /* GSO counts as one pkt */
}
- skb->next = NULL;
+ skb_mark_not_on_list(skb);
}
/* This variant of try_bulk_dequeue_skb() makes sure
@@ -210,7 +210,7 @@ static void try_bulk_dequeue_skb_slow(struct Qdisc *q,
skb = nskb;
} while (++cnt < 8);
(*packets) += cnt;
- skb->next = NULL;
+ skb_mark_not_on_list(skb);
}
/* Note that dequeue_skb can possibly return a SKB list (via skb->next).
@@ -572,6 +572,18 @@ struct Qdisc noop_qdisc = {
.dev_queue = &noop_netdev_queue,
.running = SEQCNT_ZERO(noop_qdisc.running),
.busylock = __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
+ .gso_skb = {
+ .next = (struct sk_buff *)&noop_qdisc.gso_skb,
+ .prev = (struct sk_buff *)&noop_qdisc.gso_skb,
+ .qlen = 0,
+ .lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.gso_skb.lock),
+ },
+ .skb_bad_txq = {
+ .next = (struct sk_buff *)&noop_qdisc.skb_bad_txq,
+ .prev = (struct sk_buff *)&noop_qdisc.skb_bad_txq,
+ .qlen = 0,
+ .lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.skb_bad_txq.lock),
+ },
};
EXPORT_SYMBOL(noop_qdisc);
@@ -901,7 +913,7 @@ struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
if (!ops->init || ops->init(sch, NULL, extack) == 0)
return sch;
- qdisc_destroy(sch);
+ qdisc_put(sch);
return NULL;
}
EXPORT_SYMBOL(qdisc_create_dflt);
@@ -941,15 +953,18 @@ void qdisc_free(struct Qdisc *qdisc)
kfree((char *) qdisc - qdisc->padded);
}
-void qdisc_destroy(struct Qdisc *qdisc)
+static void qdisc_free_cb(struct rcu_head *head)
+{
+ struct Qdisc *q = container_of(head, struct Qdisc, rcu);
+
+ qdisc_free(q);
+}
+
+static void qdisc_destroy(struct Qdisc *qdisc)
{
const struct Qdisc_ops *ops = qdisc->ops;
struct sk_buff *skb, *tmp;
- if (qdisc->flags & TCQ_F_BUILTIN ||
- !refcount_dec_and_test(&qdisc->refcnt))
- return;
-
#ifdef CONFIG_NET_SCHED
qdisc_hash_del(qdisc);
@@ -974,9 +989,34 @@ void qdisc_destroy(struct Qdisc *qdisc)
kfree_skb_list(skb);
}
- qdisc_free(qdisc);
+ call_rcu(&qdisc->rcu, qdisc_free_cb);
+}
+
+void qdisc_put(struct Qdisc *qdisc)
+{
+ if (qdisc->flags & TCQ_F_BUILTIN ||
+ !refcount_dec_and_test(&qdisc->refcnt))
+ return;
+
+ qdisc_destroy(qdisc);
+}
+EXPORT_SYMBOL(qdisc_put);
+
+/* Version of qdisc_put() that is called with rtnl mutex unlocked.
+ * Intended to be used as optimization, this function only takes rtnl lock if
+ * qdisc reference counter reached zero.
+ */
+
+void qdisc_put_unlocked(struct Qdisc *qdisc)
+{
+ if (qdisc->flags & TCQ_F_BUILTIN ||
+ !refcount_dec_and_rtnl_lock(&qdisc->refcnt))
+ return;
+
+ qdisc_destroy(qdisc);
+ rtnl_unlock();
}
-EXPORT_SYMBOL(qdisc_destroy);
+EXPORT_SYMBOL(qdisc_put_unlocked);
/* Attach toplevel qdisc to device queue. */
struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
@@ -1245,8 +1285,6 @@ static void dev_init_scheduler_queue(struct net_device *dev,
rcu_assign_pointer(dev_queue->qdisc, qdisc);
dev_queue->qdisc_sleeping = qdisc;
- __skb_queue_head_init(&qdisc->gso_skb);
- __skb_queue_head_init(&qdisc->skb_bad_txq);
}
void dev_init_scheduler(struct net_device *dev)
@@ -1270,7 +1308,7 @@ static void shutdown_scheduler_queue(struct net_device *dev,
rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
dev_queue->qdisc_sleeping = qdisc_default;
- qdisc_destroy(qdisc);
+ qdisc_put(qdisc);
}
}
@@ -1279,7 +1317,7 @@ void dev_shutdown(struct net_device *dev)
netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc);
if (dev_ingress_queue(dev))
shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
- qdisc_destroy(dev->qdisc);
+ qdisc_put(dev->qdisc);
dev->qdisc = &noop_qdisc;
WARN_ON(timer_pending(&dev->watchdog_timer));
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 3278a76f6861..b18ec1f6de60 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -1092,7 +1092,7 @@ hfsc_destroy_class(struct Qdisc *sch, struct hfsc_class *cl)
struct hfsc_sched *q = qdisc_priv(sch);
tcf_block_put(cl->block);
- qdisc_destroy(cl->qdisc);
+ qdisc_put(cl->qdisc);
gen_kill_estimator(&cl->rate_est);
if (cl != &q->root)
kfree(cl);
diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c
index c3a8388dcdf6..9d6a47697406 100644
--- a/net/sched/sch_hhf.c
+++ b/net/sched/sch_hhf.c
@@ -330,7 +330,7 @@ static struct sk_buff *dequeue_head(struct wdrr_bucket *bucket)
struct sk_buff *skb = bucket->head;
bucket->head = skb->next;
- skb->next = NULL;
+ skb_mark_not_on_list(skb);
return skb;
}
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 2a4ab7caf553..58b449490757 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -126,14 +126,13 @@ struct htb_class {
union {
struct htb_class_leaf {
- struct list_head drop_list;
int deficit[TC_HTB_MAXDEPTH];
struct Qdisc *q;
} leaf;
struct htb_class_inner {
struct htb_prio clprio[TC_HTB_NUMPRIO];
} inner;
- } un;
+ };
s64 pq_key;
int prio_activity; /* for which prios are we active */
@@ -171,7 +170,6 @@ struct htb_sched {
struct qdisc_watchdog watchdog;
s64 now; /* cached dequeue time */
- struct list_head drops[TC_HTB_NUMPRIO];/* active leaves (for drops) */
/* time of nearest event per level (row) */
s64 near_ev_cache[TC_HTB_MAXDEPTH];
@@ -413,13 +411,13 @@ static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl)
int prio = ffz(~m);
m &= ~(1 << prio);
- if (p->un.inner.clprio[prio].feed.rb_node)
+ if (p->inner.clprio[prio].feed.rb_node)
/* parent already has its feed in use so that
* reset bit in mask as parent is already ok
*/
mask &= ~(1 << prio);
- htb_add_to_id_tree(&p->un.inner.clprio[prio].feed, cl, prio);
+ htb_add_to_id_tree(&p->inner.clprio[prio].feed, cl, prio);
}
p->prio_activity |= mask;
cl = p;
@@ -449,19 +447,19 @@ static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
int prio = ffz(~m);
m &= ~(1 << prio);
- if (p->un.inner.clprio[prio].ptr == cl->node + prio) {
+ if (p->inner.clprio[prio].ptr == cl->node + prio) {
/* we are removing child which is pointed to from
* parent feed - forget the pointer but remember
* classid
*/
- p->un.inner.clprio[prio].last_ptr_id = cl->common.classid;
- p->un.inner.clprio[prio].ptr = NULL;
+ p->inner.clprio[prio].last_ptr_id = cl->common.classid;
+ p->inner.clprio[prio].ptr = NULL;
}
htb_safe_rb_erase(cl->node + prio,
- &p->un.inner.clprio[prio].feed);
+ &p->inner.clprio[prio].feed);
- if (!p->un.inner.clprio[prio].feed.rb_node)
+ if (!p->inner.clprio[prio].feed.rb_node)
mask |= 1 << prio;
}
@@ -557,13 +555,11 @@ htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff)
*/
static inline void htb_activate(struct htb_sched *q, struct htb_class *cl)
{
- WARN_ON(cl->level || !cl->un.leaf.q || !cl->un.leaf.q->q.qlen);
+ WARN_ON(cl->level || !cl->leaf.q || !cl->leaf.q->q.qlen);
if (!cl->prio_activity) {
cl->prio_activity = 1 << cl->prio;
htb_activate_prios(q, cl);
- list_add_tail(&cl->un.leaf.drop_list,
- q->drops + cl->prio);
}
}
@@ -579,23 +575,6 @@ static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
htb_deactivate_prios(q, cl);
cl->prio_activity = 0;
- list_del_init(&cl->un.leaf.drop_list);
-}
-
-static void htb_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch,
- struct qdisc_skb_head *qh)
-{
- struct sk_buff *last = qh->tail;
-
- if (last) {
- skb->next = NULL;
- last->next = skb;
- qh->tail = skb;
- } else {
- qh->tail = skb;
- qh->head = skb;
- }
- qh->qlen++;
}
static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
@@ -608,7 +587,7 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
if (cl == HTB_DIRECT) {
/* enqueue to helper queue */
if (q->direct_queue.qlen < q->direct_qlen) {
- htb_enqueue_tail(skb, sch, &q->direct_queue);
+ __qdisc_enqueue_tail(skb, &q->direct_queue);
q->direct_pkts++;
} else {
return qdisc_drop(skb, sch, to_free);
@@ -620,7 +599,7 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
__qdisc_drop(skb, to_free);
return ret;
#endif
- } else if ((ret = qdisc_enqueue(skb, cl->un.leaf.q,
+ } else if ((ret = qdisc_enqueue(skb, cl->leaf.q,
to_free)) != NET_XMIT_SUCCESS) {
if (net_xmit_drop_count(ret)) {
qdisc_qstats_drop(sch);
@@ -828,7 +807,7 @@ static struct htb_class *htb_lookup_leaf(struct htb_prio *hprio, const int prio)
cl = rb_entry(*sp->pptr, struct htb_class, node[prio]);
if (!cl->level)
return cl;
- clp = &cl->un.inner.clprio[prio];
+ clp = &cl->inner.clprio[prio];
(++sp)->root = clp->feed.rb_node;
sp->pptr = &clp->ptr;
sp->pid = &clp->last_ptr_id;
@@ -862,7 +841,7 @@ next:
* graft operation on the leaf since last dequeue;
* simply deactivate and skip such class
*/
- if (unlikely(cl->un.leaf.q->q.qlen == 0)) {
+ if (unlikely(cl->leaf.q->q.qlen == 0)) {
struct htb_class *next;
htb_deactivate(q, cl);
@@ -878,12 +857,12 @@ next:
goto next;
}
- skb = cl->un.leaf.q->dequeue(cl->un.leaf.q);
+ skb = cl->leaf.q->dequeue(cl->leaf.q);
if (likely(skb != NULL))
break;
- qdisc_warn_nonwc("htb", cl->un.leaf.q);
- htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr:
+ qdisc_warn_nonwc("htb", cl->leaf.q);
+ htb_next_rb_node(level ? &cl->parent->inner.clprio[prio].ptr:
&q->hlevel[0].hprio[prio].ptr);
cl = htb_lookup_leaf(hprio, prio);
@@ -891,16 +870,16 @@ next:
if (likely(skb != NULL)) {
bstats_update(&cl->bstats, skb);
- cl->un.leaf.deficit[level] -= qdisc_pkt_len(skb);
- if (cl->un.leaf.deficit[level] < 0) {
- cl->un.leaf.deficit[level] += cl->quantum;
- htb_next_rb_node(level ? &cl->parent->un.inner.clprio[prio].ptr :
+ cl->leaf.deficit[level] -= qdisc_pkt_len(skb);
+ if (cl->leaf.deficit[level] < 0) {
+ cl->leaf.deficit[level] += cl->quantum;
+ htb_next_rb_node(level ? &cl->parent->inner.clprio[prio].ptr :
&q->hlevel[0].hprio[prio].ptr);
}
/* this used to be after charge_class but this constelation
* gives us slightly better performance
*/
- if (!cl->un.leaf.q->q.qlen)
+ if (!cl->leaf.q->q.qlen)
htb_deactivate(q, cl);
htb_charge_class(q, cl, level, skb);
}
@@ -977,11 +956,10 @@ static void htb_reset(struct Qdisc *sch)
for (i = 0; i < q->clhash.hashsize; i++) {
hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
if (cl->level)
- memset(&cl->un.inner, 0, sizeof(cl->un.inner));
+ memset(&cl->inner, 0, sizeof(cl->inner));
else {
- if (cl->un.leaf.q)
- qdisc_reset(cl->un.leaf.q);
- INIT_LIST_HEAD(&cl->un.leaf.drop_list);
+ if (cl->leaf.q)
+ qdisc_reset(cl->leaf.q);
}
cl->prio_activity = 0;
cl->cmode = HTB_CAN_SEND;
@@ -993,8 +971,6 @@ static void htb_reset(struct Qdisc *sch)
sch->qstats.backlog = 0;
memset(q->hlevel, 0, sizeof(q->hlevel));
memset(q->row_mask, 0, sizeof(q->row_mask));
- for (i = 0; i < TC_HTB_NUMPRIO; i++)
- INIT_LIST_HEAD(q->drops + i);
}
static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = {
@@ -1024,7 +1000,6 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt,
struct nlattr *tb[TCA_HTB_MAX + 1];
struct tc_htb_glob *gopt;
int err;
- int i;
qdisc_watchdog_init(&q->watchdog, sch);
INIT_WORK(&q->work, htb_work_func);
@@ -1050,8 +1025,6 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt,
err = qdisc_class_hash_init(&q->clhash);
if (err < 0)
return err;
- for (i = 0; i < TC_HTB_NUMPRIO; i++)
- INIT_LIST_HEAD(q->drops + i);
qdisc_skb_head_init(&q->direct_queue);
@@ -1109,8 +1082,8 @@ static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
*/
tcm->tcm_parent = cl->parent ? cl->parent->common.classid : TC_H_ROOT;
tcm->tcm_handle = cl->common.classid;
- if (!cl->level && cl->un.leaf.q)
- tcm->tcm_info = cl->un.leaf.q->handle;
+ if (!cl->level && cl->leaf.q)
+ tcm->tcm_info = cl->leaf.q->handle;
nest = nla_nest_start(skb, TCA_OPTIONS);
if (nest == NULL)
@@ -1153,9 +1126,9 @@ htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
};
__u32 qlen = 0;
- if (!cl->level && cl->un.leaf.q) {
- qlen = cl->un.leaf.q->q.qlen;
- qs.backlog = cl->un.leaf.q->qstats.backlog;
+ if (!cl->level && cl->leaf.q) {
+ qlen = cl->leaf.q->q.qlen;
+ qs.backlog = cl->leaf.q->qstats.backlog;
}
cl->xstats.tokens = clamp_t(s64, PSCHED_NS2TICKS(cl->tokens),
INT_MIN, INT_MAX);
@@ -1183,14 +1156,14 @@ static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
cl->common.classid, extack)) == NULL)
return -ENOBUFS;
- *old = qdisc_replace(sch, new, &cl->un.leaf.q);
+ *old = qdisc_replace(sch, new, &cl->leaf.q);
return 0;
}
static struct Qdisc *htb_leaf(struct Qdisc *sch, unsigned long arg)
{
struct htb_class *cl = (struct htb_class *)arg;
- return !cl->level ? cl->un.leaf.q : NULL;
+ return !cl->level ? cl->leaf.q : NULL;
}
static void htb_qlen_notify(struct Qdisc *sch, unsigned long arg)
@@ -1216,16 +1189,15 @@ static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl,
{
struct htb_class *parent = cl->parent;
- WARN_ON(cl->level || !cl->un.leaf.q || cl->prio_activity);
+ WARN_ON(cl->level || !cl->leaf.q || cl->prio_activity);
if (parent->cmode != HTB_CAN_SEND)
htb_safe_rb_erase(&parent->pq_node,
&q->hlevel[parent->level].wait_pq);
parent->level = 0;
- memset(&parent->un.inner, 0, sizeof(parent->un.inner));
- INIT_LIST_HEAD(&parent->un.leaf.drop_list);
- parent->un.leaf.q = new_q ? new_q : &noop_qdisc;
+ memset(&parent->inner, 0, sizeof(parent->inner));
+ parent->leaf.q = new_q ? new_q : &noop_qdisc;
parent->tokens = parent->buffer;
parent->ctokens = parent->cbuffer;
parent->t_c = ktime_get_ns();
@@ -1235,8 +1207,8 @@ static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl,
static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
{
if (!cl->level) {
- WARN_ON(!cl->un.leaf.q);
- qdisc_destroy(cl->un.leaf.q);
+ WARN_ON(!cl->leaf.q);
+ qdisc_put(cl->leaf.q);
}
gen_kill_estimator(&cl->rate_est);
tcf_block_put(cl->block);
@@ -1298,11 +1270,11 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg)
sch_tree_lock(sch);
if (!cl->level) {
- unsigned int qlen = cl->un.leaf.q->q.qlen;
- unsigned int backlog = cl->un.leaf.q->qstats.backlog;
+ unsigned int qlen = cl->leaf.q->q.qlen;
+ unsigned int backlog = cl->leaf.q->qstats.backlog;
- qdisc_reset(cl->un.leaf.q);
- qdisc_tree_reduce_backlog(cl->un.leaf.q, qlen, backlog);
+ qdisc_reset(cl->leaf.q);
+ qdisc_tree_reduce_backlog(cl->leaf.q, qlen, backlog);
}
/* delete from hash and active; remainder in destroy_class */
@@ -1418,7 +1390,6 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
}
cl->children = 0;
- INIT_LIST_HEAD(&cl->un.leaf.drop_list);
RB_CLEAR_NODE(&cl->pq_node);
for (prio = 0; prio < TC_HTB_NUMPRIO; prio++)
@@ -1432,13 +1403,13 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
classid, NULL);
sch_tree_lock(sch);
if (parent && !parent->level) {
- unsigned int qlen = parent->un.leaf.q->q.qlen;
- unsigned int backlog = parent->un.leaf.q->qstats.backlog;
+ unsigned int qlen = parent->leaf.q->q.qlen;
+ unsigned int backlog = parent->leaf.q->qstats.backlog;
/* turn parent into inner node */
- qdisc_reset(parent->un.leaf.q);
- qdisc_tree_reduce_backlog(parent->un.leaf.q, qlen, backlog);
- qdisc_destroy(parent->un.leaf.q);
+ qdisc_reset(parent->leaf.q);
+ qdisc_tree_reduce_backlog(parent->leaf.q, qlen, backlog);
+ qdisc_put(parent->leaf.q);
if (parent->prio_activity)
htb_deactivate(q, parent);
@@ -1449,10 +1420,10 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
}
parent->level = (parent->parent ? parent->parent->level
: TC_HTB_MAXDEPTH) - 1;
- memset(&parent->un.inner, 0, sizeof(parent->un.inner));
+ memset(&parent->inner, 0, sizeof(parent->inner));
}
/* leaf (we) needs elementary qdisc */
- cl->un.leaf.q = new_q ? new_q : &noop_qdisc;
+ cl->leaf.q = new_q ? new_q : &noop_qdisc;
cl->common.classid = classid;
cl->parent = parent;
@@ -1468,8 +1439,8 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
qdisc_class_hash_insert(&q->clhash, &cl->common);
if (parent)
parent->children++;
- if (cl->un.leaf.q != &noop_qdisc)
- qdisc_hash_add(cl->un.leaf.q, true);
+ if (cl->leaf.q != &noop_qdisc)
+ qdisc_hash_add(cl->leaf.q, true);
} else {
if (tca[TCA_RATE]) {
err = gen_replace_estimator(&cl->bstats, NULL,
@@ -1491,7 +1462,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
psched_ratecfg_precompute(&cl->ceil, &hopt->ceil, ceil64);
/* it used to be a nasty bug here, we have to check that node
- * is really leaf before changing cl->un.leaf !
+ * is really leaf before changing cl->leaf !
*/
if (!cl->level) {
u64 quantum = cl->rate.rate_bytes_ps;
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index d6b8ae4ed7a3..f20f3a0f8424 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -65,7 +65,7 @@ static void mq_destroy(struct Qdisc *sch)
if (!priv->qdiscs)
return;
for (ntx = 0; ntx < dev->num_tx_queues && priv->qdiscs[ntx]; ntx++)
- qdisc_destroy(priv->qdiscs[ntx]);
+ qdisc_put(priv->qdiscs[ntx]);
kfree(priv->qdiscs);
}
@@ -119,7 +119,7 @@ static void mq_attach(struct Qdisc *sch)
qdisc = priv->qdiscs[ntx];
old = dev_graft_qdisc(qdisc->dev_queue, qdisc);
if (old)
- qdisc_destroy(old);
+ qdisc_put(old);
#ifdef CONFIG_NET_SCHED
if (ntx < dev->real_num_tx_queues)
qdisc_hash_add(qdisc, false);
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index 0e9d761cdd80..d364e63c396d 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -40,7 +40,7 @@ static void mqprio_destroy(struct Qdisc *sch)
for (ntx = 0;
ntx < dev->num_tx_queues && priv->qdiscs[ntx];
ntx++)
- qdisc_destroy(priv->qdiscs[ntx]);
+ qdisc_put(priv->qdiscs[ntx]);
kfree(priv->qdiscs);
}
@@ -300,7 +300,7 @@ static void mqprio_attach(struct Qdisc *sch)
qdisc = priv->qdiscs[ntx];
old = dev_graft_qdisc(qdisc->dev_queue, qdisc);
if (old)
- qdisc_destroy(old);
+ qdisc_put(old);
if (ntx < dev->real_num_tx_queues)
qdisc_hash_add(qdisc, false);
}
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index 1da7ea8de0ad..7410ce4d0321 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -175,7 +175,7 @@ multiq_destroy(struct Qdisc *sch)
tcf_block_put(q->block);
for (band = 0; band < q->bands; band++)
- qdisc_destroy(q->queues[band]);
+ qdisc_put(q->queues[band]);
kfree(q->queues);
}
@@ -204,7 +204,7 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt,
q->queues[i] = &noop_qdisc;
qdisc_tree_reduce_backlog(child, child->q.qlen,
child->qstats.backlog);
- qdisc_destroy(child);
+ qdisc_put(child);
}
}
@@ -228,7 +228,7 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt,
qdisc_tree_reduce_backlog(old,
old->q.qlen,
old->qstats.backlog);
- qdisc_destroy(old);
+ qdisc_put(old);
}
sch_tree_unlock(sch);
}
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 7d6801fc5340..57b3ad9394ad 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -68,6 +68,11 @@
Fabio Ludovici <fabio.ludovici at yahoo.it>
*/
+struct disttable {
+ u32 size;
+ s16 table[0];
+};
+
struct netem_sched_data {
/* internal t(ime)fifo qdisc uses t_root and sch->limit */
struct rb_root t_root;
@@ -99,10 +104,7 @@ struct netem_sched_data {
u32 rho;
} delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;
- struct disttable {
- u32 size;
- s16 table[0];
- } *delay_dist;
+ struct disttable *delay_dist;
enum {
CLG_RANDOM,
@@ -142,6 +144,7 @@ struct netem_sched_data {
s32 bytes_left;
} slot;
+ struct disttable *slot_dist;
};
/* Time stamp put into socket buffer control block
@@ -180,7 +183,7 @@ static u32 get_crandom(struct crndstate *state)
u64 value, rho;
unsigned long answer;
- if (state->rho == 0) /* no correlation */
+ if (!state || state->rho == 0) /* no correlation */
return prandom_u32();
value = prandom_u32();
@@ -409,16 +412,6 @@ static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch,
return segs;
}
-static void netem_enqueue_skb_head(struct qdisc_skb_head *qh, struct sk_buff *skb)
-{
- skb->next = qh->head;
-
- if (!qh->head)
- qh->tail = skb;
- qh->head = skb;
- qh->qlen++;
-}
-
/*
* Insert one skb into qdisc.
* Note: parent depends on return value to account for queue length.
@@ -567,7 +560,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
cb->time_to_send = ktime_get_ns();
q->counter = 0;
- netem_enqueue_skb_head(&sch->q, skb);
+ __qdisc_enqueue_head(skb, &sch->q);
sch->qstats.requeues++;
}
@@ -575,7 +568,7 @@ finish_segs:
if (segs) {
while (segs) {
skb2 = segs->next;
- segs->next = NULL;
+ skb_mark_not_on_list(segs);
qdisc_skb_cb(segs)->pkt_len = segs->len;
last_len = segs->len;
rc = qdisc_enqueue(segs, sch, to_free);
@@ -601,10 +594,19 @@ finish_segs:
static void get_slot_next(struct netem_sched_data *q, u64 now)
{
- q->slot.slot_next = now + q->slot_config.min_delay +
- (prandom_u32() *
- (q->slot_config.max_delay -
- q->slot_config.min_delay) >> 32);
+ s64 next_delay;
+
+ if (!q->slot_dist)
+ next_delay = q->slot_config.min_delay +
+ (prandom_u32() *
+ (q->slot_config.max_delay -
+ q->slot_config.min_delay) >> 32);
+ else
+ next_delay = tabledist(q->slot_config.dist_delay,
+ (s32)(q->slot_config.dist_jitter),
+ NULL, q->slot_dist);
+
+ q->slot.slot_next = now + next_delay;
q->slot.packets_left = q->slot_config.max_packets;
q->slot.bytes_left = q->slot_config.max_bytes;
}
@@ -721,9 +723,9 @@ static void dist_free(struct disttable *d)
* signed 16 bit values.
*/
-static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)
+static int get_dist_table(struct Qdisc *sch, struct disttable **tbl,
+ const struct nlattr *attr)
{
- struct netem_sched_data *q = qdisc_priv(sch);
size_t n = nla_len(attr)/sizeof(__s16);
const __s16 *data = nla_data(attr);
spinlock_t *root_lock;
@@ -744,7 +746,7 @@ static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)
root_lock = qdisc_root_sleeping_lock(sch);
spin_lock_bh(root_lock);
- swap(q->delay_dist, d);
+ swap(*tbl, d);
spin_unlock_bh(root_lock);
dist_free(d);
@@ -762,7 +764,8 @@ static void get_slot(struct netem_sched_data *q, const struct nlattr *attr)
q->slot_config.max_bytes = INT_MAX;
q->slot.packets_left = q->slot_config.max_packets;
q->slot.bytes_left = q->slot_config.max_bytes;
- if (q->slot_config.min_delay | q->slot_config.max_delay)
+ if (q->slot_config.min_delay | q->slot_config.max_delay |
+ q->slot_config.dist_jitter)
q->slot.slot_next = ktime_get_ns();
else
q->slot.slot_next = 0;
@@ -926,16 +929,17 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt,
}
if (tb[TCA_NETEM_DELAY_DIST]) {
- ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]);
- if (ret) {
- /* recover clg and loss_model, in case of
- * q->clg and q->loss_model were modified
- * in get_loss_clg()
- */
- q->clg = old_clg;
- q->loss_model = old_loss_model;
- return ret;
- }
+ ret = get_dist_table(sch, &q->delay_dist,
+ tb[TCA_NETEM_DELAY_DIST]);
+ if (ret)
+ goto get_table_failure;
+ }
+
+ if (tb[TCA_NETEM_SLOT_DIST]) {
+ ret = get_dist_table(sch, &q->slot_dist,
+ tb[TCA_NETEM_SLOT_DIST]);
+ if (ret)
+ goto get_table_failure;
}
sch->limit = qopt->limit;
@@ -983,6 +987,15 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt,
get_slot(q, tb[TCA_NETEM_SLOT]);
return ret;
+
+get_table_failure:
+ /* recover clg and loss_model, in case of
+ * q->clg and q->loss_model were modified
+ * in get_loss_clg()
+ */
+ q->clg = old_clg;
+ q->loss_model = old_loss_model;
+ return ret;
}
static int netem_init(struct Qdisc *sch, struct nlattr *opt,
@@ -1009,8 +1022,9 @@ static void netem_destroy(struct Qdisc *sch)
qdisc_watchdog_cancel(&q->watchdog);
if (q->qdisc)
- qdisc_destroy(q->qdisc);
+ qdisc_put(q->qdisc);
dist_free(q->delay_dist);
+ dist_free(q->slot_dist);
}
static int dump_loss_model(const struct netem_sched_data *q,
@@ -1127,7 +1141,8 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
if (dump_loss_model(q, skb) != 0)
goto nla_put_failure;
- if (q->slot_config.min_delay | q->slot_config.max_delay) {
+ if (q->slot_config.min_delay | q->slot_config.max_delay |
+ q->slot_config.dist_jitter) {
slot = q->slot_config;
if (slot.max_packets == INT_MAX)
slot.max_packets = 0;
diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c
index 18d30bb86881..d1429371592f 100644
--- a/net/sched/sch_pie.c
+++ b/net/sched/sch_pie.c
@@ -110,8 +110,8 @@ static bool drop_early(struct Qdisc *sch, u32 packet_size)
/* If current delay is less than half of target, and
* if drop prob is low already, disable early_drop
*/
- if ((q->vars.qdelay < q->params.target / 2)
- && (q->vars.prob < MAX_PROB / 5))
+ if ((q->vars.qdelay < q->params.target / 2) &&
+ (q->vars.prob < MAX_PROB / 5))
return false;
/* If we have fewer than 2 mtu-sized packets, disable drop_early,
@@ -209,7 +209,8 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt,
/* tupdate is in jiffies */
if (tb[TCA_PIE_TUPDATE])
- q->params.tupdate = usecs_to_jiffies(nla_get_u32(tb[TCA_PIE_TUPDATE]));
+ q->params.tupdate =
+ usecs_to_jiffies(nla_get_u32(tb[TCA_PIE_TUPDATE]));
if (tb[TCA_PIE_LIMIT]) {
u32 limit = nla_get_u32(tb[TCA_PIE_LIMIT]);
@@ -247,7 +248,6 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt,
static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb)
{
-
struct pie_sched_data *q = qdisc_priv(sch);
int qlen = sch->qstats.backlog; /* current queue size in bytes */
@@ -294,9 +294,9 @@ static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb)
* dq_count to 0 to re-enter the if block when the next
* packet is dequeued
*/
- if (qlen < QUEUE_THRESHOLD)
+ if (qlen < QUEUE_THRESHOLD) {
q->vars.dq_count = DQCOUNT_INVALID;
- else {
+ } else {
q->vars.dq_count = 0;
q->vars.dq_tstamp = psched_get_time();
}
@@ -370,7 +370,7 @@ static void calculate_probability(struct Qdisc *sch)
oldprob = q->vars.prob;
/* to ensure we increase probability in steps of no more than 2% */
- if (delta > (s32) (MAX_PROB / (100 / 2)) &&
+ if (delta > (s32)(MAX_PROB / (100 / 2)) &&
q->vars.prob >= MAX_PROB / 10)
delta = (MAX_PROB / 100) * 2;
@@ -405,7 +405,7 @@ static void calculate_probability(struct Qdisc *sch)
* delay is 0 for 2 consecutive Tupdate periods.
*/
- if ((qdelay == 0) && (qdelay_old == 0) && update_prob)
+ if (qdelay == 0 && qdelay_old == 0 && update_prob)
q->vars.prob = (q->vars.prob * 98) / 100;
q->vars.qdelay = qdelay;
@@ -419,8 +419,8 @@ static void calculate_probability(struct Qdisc *sch)
*/
if ((q->vars.qdelay < q->params.target / 2) &&
(q->vars.qdelay_old < q->params.target / 2) &&
- (q->vars.prob == 0) &&
- (q->vars.avg_dq_rate > 0))
+ q->vars.prob == 0 &&
+ q->vars.avg_dq_rate > 0)
pie_vars_init(&q->vars);
}
@@ -437,7 +437,6 @@ static void pie_timer(struct timer_list *t)
if (q->params.tupdate)
mod_timer(&q->adapt_timer, jiffies + q->params.tupdate);
spin_unlock(root_lock);
-
}
static int pie_init(struct Qdisc *sch, struct nlattr *opt,
@@ -469,15 +468,16 @@ static int pie_dump(struct Qdisc *sch, struct sk_buff *skb)
struct nlattr *opts;
opts = nla_nest_start(skb, TCA_OPTIONS);
- if (opts == NULL)
+ if (!opts)
goto nla_put_failure;
/* convert target from pschedtime to us */
if (nla_put_u32(skb, TCA_PIE_TARGET,
- ((u32) PSCHED_TICKS2NS(q->params.target)) /
+ ((u32)PSCHED_TICKS2NS(q->params.target)) /
NSEC_PER_USEC) ||
nla_put_u32(skb, TCA_PIE_LIMIT, sch->limit) ||
- nla_put_u32(skb, TCA_PIE_TUPDATE, jiffies_to_usecs(q->params.tupdate)) ||
+ nla_put_u32(skb, TCA_PIE_TUPDATE,
+ jiffies_to_usecs(q->params.tupdate)) ||
nla_put_u32(skb, TCA_PIE_ALPHA, q->params.alpha) ||
nla_put_u32(skb, TCA_PIE_BETA, q->params.beta) ||
nla_put_u32(skb, TCA_PIE_ECN, q->params.ecn) ||
@@ -489,7 +489,6 @@ static int pie_dump(struct Qdisc *sch, struct sk_buff *skb)
nla_put_failure:
nla_nest_cancel(skb, opts);
return -1;
-
}
static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
@@ -497,7 +496,7 @@ static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
struct pie_sched_data *q = qdisc_priv(sch);
struct tc_pie_xstats st = {
.prob = q->vars.prob,
- .delay = ((u32) PSCHED_TICKS2NS(q->vars.qdelay)) /
+ .delay = ((u32)PSCHED_TICKS2NS(q->vars.qdelay)) /
NSEC_PER_USEC,
/* unscale and return dq_rate in bytes per sec */
.avg_dq_rate = q->vars.avg_dq_rate *
@@ -514,8 +513,7 @@ static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
static struct sk_buff *pie_qdisc_dequeue(struct Qdisc *sch)
{
- struct sk_buff *skb;
- skb = qdisc_dequeue_head(sch);
+ struct sk_buff *skb = qdisc_dequeue_head(sch);
if (!skb)
return NULL;
@@ -527,6 +525,7 @@ static struct sk_buff *pie_qdisc_dequeue(struct Qdisc *sch)
static void pie_reset(struct Qdisc *sch)
{
struct pie_sched_data *q = qdisc_priv(sch);
+
qdisc_reset_queue(sch);
pie_vars_init(&q->vars);
}
@@ -534,6 +533,7 @@ static void pie_reset(struct Qdisc *sch)
static void pie_destroy(struct Qdisc *sch)
{
struct pie_sched_data *q = qdisc_priv(sch);
+
q->params.tupdate = 0;
del_timer_sync(&q->adapt_timer);
}
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index 222e53d3d27a..f8af98621179 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -175,7 +175,7 @@ prio_destroy(struct Qdisc *sch)
tcf_block_put(q->block);
prio_offload(sch, NULL);
for (prio = 0; prio < q->bands; prio++)
- qdisc_destroy(q->queues[prio]);
+ qdisc_put(q->queues[prio]);
}
static int prio_tune(struct Qdisc *sch, struct nlattr *opt,
@@ -205,7 +205,7 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt,
extack);
if (!queues[i]) {
while (i > oldbands)
- qdisc_destroy(queues[--i]);
+ qdisc_put(queues[--i]);
return -ENOMEM;
}
}
@@ -220,7 +220,7 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt,
qdisc_tree_reduce_backlog(child, child->q.qlen,
child->qstats.backlog);
- qdisc_destroy(child);
+ qdisc_put(child);
}
for (i = oldbands; i < q->bands; i++) {
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index bb1a9c11fc54..dc37c4ead439 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -526,7 +526,7 @@ set_change_agg:
return 0;
destroy_class:
- qdisc_destroy(cl->qdisc);
+ qdisc_put(cl->qdisc);
kfree(cl);
return err;
}
@@ -537,7 +537,7 @@ static void qfq_destroy_class(struct Qdisc *sch, struct qfq_class *cl)
qfq_rm_from_agg(q, cl);
gen_kill_estimator(&cl->rate_est);
- qdisc_destroy(cl->qdisc);
+ qdisc_put(cl->qdisc);
kfree(cl);
}
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 56c181c3feeb..3ce6c0a2c493 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -181,7 +181,7 @@ static void red_destroy(struct Qdisc *sch)
del_timer_sync(&q->adapt_timer);
red_offload(sch, false);
- qdisc_destroy(q->qdisc);
+ qdisc_put(q->qdisc);
}
static const struct nla_policy red_policy[TCA_RED_MAX + 1] = {
@@ -233,7 +233,7 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt,
if (child) {
qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen,
q->qdisc->qstats.backlog);
- qdisc_destroy(q->qdisc);
+ qdisc_put(q->qdisc);
q->qdisc = child;
}
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c
index 7cbdad8419b7..bab506b01a32 100644
--- a/net/sched/sch_sfb.c
+++ b/net/sched/sch_sfb.c
@@ -469,7 +469,7 @@ static void sfb_destroy(struct Qdisc *sch)
struct sfb_sched_data *q = qdisc_priv(sch);
tcf_block_put(q->block);
- qdisc_destroy(q->qdisc);
+ qdisc_put(q->qdisc);
}
static const struct nla_policy sfb_policy[TCA_SFB_MAX + 1] = {
@@ -523,7 +523,7 @@ static int sfb_change(struct Qdisc *sch, struct nlattr *opt,
qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen,
q->qdisc->qstats.backlog);
- qdisc_destroy(q->qdisc);
+ qdisc_put(q->qdisc);
q->qdisc = child;
q->rehash_interval = msecs_to_jiffies(ctl->rehash_interval);
diff --git a/net/sched/sch_skbprio.c b/net/sched/sch_skbprio.c
new file mode 100644
index 000000000000..52c0b6d8f1d7
--- /dev/null
+++ b/net/sched/sch_skbprio.c
@@ -0,0 +1,320 @@
+/*
+ * net/sched/sch_skbprio.c SKB Priority Queue.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Nishanth Devarajan, <ndev2021@gmail.com>
+ * Cody Doucette, <doucette@bu.edu>
+ * original idea by Michel Machado, Cody Doucette, and Qiaobin Fu
+ */
+
+#include <linux/string.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <net/pkt_sched.h>
+#include <net/sch_generic.h>
+#include <net/inet_ecn.h>
+
+/* SKB Priority Queue
+ * =================================
+ *
+ * Skbprio (SKB Priority Queue) is a queueing discipline that prioritizes
+ * packets according to their skb->priority field. Under congestion,
+ * Skbprio drops already-enqueued lower priority packets to make space
+ * available for higher priority packets; it was conceived as a solution
+ * for denial-of-service defenses that need to route packets with different
+ * priorities as a mean to overcome DoS attacks.
+ */
+
+struct skbprio_sched_data {
+ /* Queue state. */
+ struct sk_buff_head qdiscs[SKBPRIO_MAX_PRIORITY];
+ struct gnet_stats_queue qstats[SKBPRIO_MAX_PRIORITY];
+ u16 highest_prio;
+ u16 lowest_prio;
+};
+
+static u16 calc_new_high_prio(const struct skbprio_sched_data *q)
+{
+ int prio;
+
+ for (prio = q->highest_prio - 1; prio >= q->lowest_prio; prio--) {
+ if (!skb_queue_empty(&q->qdiscs[prio]))
+ return prio;
+ }
+
+ /* SKB queue is empty, return 0 (default highest priority setting). */
+ return 0;
+}
+
+static u16 calc_new_low_prio(const struct skbprio_sched_data *q)
+{
+ int prio;
+
+ for (prio = q->lowest_prio + 1; prio <= q->highest_prio; prio++) {
+ if (!skb_queue_empty(&q->qdiscs[prio]))
+ return prio;
+ }
+
+ /* SKB queue is empty, return SKBPRIO_MAX_PRIORITY - 1
+ * (default lowest priority setting).
+ */
+ return SKBPRIO_MAX_PRIORITY - 1;
+}
+
+static int skbprio_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+ const unsigned int max_priority = SKBPRIO_MAX_PRIORITY - 1;
+ struct skbprio_sched_data *q = qdisc_priv(sch);
+ struct sk_buff_head *qdisc;
+ struct sk_buff_head *lp_qdisc;
+ struct sk_buff *to_drop;
+ u16 prio, lp;
+
+ /* Obtain the priority of @skb. */
+ prio = min(skb->priority, max_priority);
+
+ qdisc = &q->qdiscs[prio];
+ if (sch->q.qlen < sch->limit) {
+ __skb_queue_tail(qdisc, skb);
+ qdisc_qstats_backlog_inc(sch, skb);
+ q->qstats[prio].backlog += qdisc_pkt_len(skb);
+
+ /* Check to update highest and lowest priorities. */
+ if (prio > q->highest_prio)
+ q->highest_prio = prio;
+
+ if (prio < q->lowest_prio)
+ q->lowest_prio = prio;
+
+ sch->q.qlen++;
+ return NET_XMIT_SUCCESS;
+ }
+
+ /* If this packet has the lowest priority, drop it. */
+ lp = q->lowest_prio;
+ if (prio <= lp) {
+ q->qstats[prio].drops++;
+ q->qstats[prio].overlimits++;
+ return qdisc_drop(skb, sch, to_free);
+ }
+
+ __skb_queue_tail(qdisc, skb);
+ qdisc_qstats_backlog_inc(sch, skb);
+ q->qstats[prio].backlog += qdisc_pkt_len(skb);
+
+ /* Drop the packet at the tail of the lowest priority qdisc. */
+ lp_qdisc = &q->qdiscs[lp];
+ to_drop = __skb_dequeue_tail(lp_qdisc);
+ BUG_ON(!to_drop);
+ qdisc_qstats_backlog_dec(sch, to_drop);
+ qdisc_drop(to_drop, sch, to_free);
+
+ q->qstats[lp].backlog -= qdisc_pkt_len(to_drop);
+ q->qstats[lp].drops++;
+ q->qstats[lp].overlimits++;
+
+ /* Check to update highest and lowest priorities. */
+ if (skb_queue_empty(lp_qdisc)) {
+ if (q->lowest_prio == q->highest_prio) {
+ /* The incoming packet is the only packet in queue. */
+ BUG_ON(sch->q.qlen != 1);
+ q->lowest_prio = prio;
+ q->highest_prio = prio;
+ } else {
+ q->lowest_prio = calc_new_low_prio(q);
+ }
+ }
+
+ if (prio > q->highest_prio)
+ q->highest_prio = prio;
+
+ return NET_XMIT_CN;
+}
+
+static struct sk_buff *skbprio_dequeue(struct Qdisc *sch)
+{
+ struct skbprio_sched_data *q = qdisc_priv(sch);
+ struct sk_buff_head *hpq = &q->qdiscs[q->highest_prio];
+ struct sk_buff *skb = __skb_dequeue(hpq);
+
+ if (unlikely(!skb))
+ return NULL;
+
+ sch->q.qlen--;
+ qdisc_qstats_backlog_dec(sch, skb);
+ qdisc_bstats_update(sch, skb);
+
+ q->qstats[q->highest_prio].backlog -= qdisc_pkt_len(skb);
+
+ /* Update highest priority field. */
+ if (skb_queue_empty(hpq)) {
+ if (q->lowest_prio == q->highest_prio) {
+ BUG_ON(sch->q.qlen);
+ q->highest_prio = 0;
+ q->lowest_prio = SKBPRIO_MAX_PRIORITY - 1;
+ } else {
+ q->highest_prio = calc_new_high_prio(q);
+ }
+ }
+ return skb;
+}
+
+static int skbprio_change(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_skbprio_qopt *ctl = nla_data(opt);
+
+ sch->limit = ctl->limit;
+ return 0;
+}
+
+static int skbprio_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct skbprio_sched_data *q = qdisc_priv(sch);
+ int prio;
+
+ /* Initialise all queues, one for each possible priority. */
+ for (prio = 0; prio < SKBPRIO_MAX_PRIORITY; prio++)
+ __skb_queue_head_init(&q->qdiscs[prio]);
+
+ memset(&q->qstats, 0, sizeof(q->qstats));
+ q->highest_prio = 0;
+ q->lowest_prio = SKBPRIO_MAX_PRIORITY - 1;
+ sch->limit = 64;
+ if (!opt)
+ return 0;
+
+ return skbprio_change(sch, opt, extack);
+}
+
+static int skbprio_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct tc_skbprio_qopt opt;
+
+ opt.limit = sch->limit;
+
+ if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
+ return -1;
+
+ return skb->len;
+}
+
+static void skbprio_reset(struct Qdisc *sch)
+{
+ struct skbprio_sched_data *q = qdisc_priv(sch);
+ int prio;
+
+ sch->qstats.backlog = 0;
+ sch->q.qlen = 0;
+
+ for (prio = 0; prio < SKBPRIO_MAX_PRIORITY; prio++)
+ __skb_queue_purge(&q->qdiscs[prio]);
+
+ memset(&q->qstats, 0, sizeof(q->qstats));
+ q->highest_prio = 0;
+ q->lowest_prio = SKBPRIO_MAX_PRIORITY - 1;
+}
+
+static void skbprio_destroy(struct Qdisc *sch)
+{
+ struct skbprio_sched_data *q = qdisc_priv(sch);
+ int prio;
+
+ for (prio = 0; prio < SKBPRIO_MAX_PRIORITY; prio++)
+ __skb_queue_purge(&q->qdiscs[prio]);
+}
+
+static struct Qdisc *skbprio_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ return NULL;
+}
+
+static unsigned long skbprio_find(struct Qdisc *sch, u32 classid)
+{
+ return 0;
+}
+
+static int skbprio_dump_class(struct Qdisc *sch, unsigned long cl,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ tcm->tcm_handle |= TC_H_MIN(cl);
+ return 0;
+}
+
+static int skbprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
+ struct gnet_dump *d)
+{
+ struct skbprio_sched_data *q = qdisc_priv(sch);
+ if (gnet_stats_copy_queue(d, NULL, &q->qstats[cl - 1],
+ q->qstats[cl - 1].qlen) < 0)
+ return -1;
+ return 0;
+}
+
+static void skbprio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+ unsigned int i;
+
+ if (arg->stop)
+ return;
+
+ for (i = 0; i < SKBPRIO_MAX_PRIORITY; i++) {
+ if (arg->count < arg->skip) {
+ arg->count++;
+ continue;
+ }
+ if (arg->fn(sch, i + 1, arg) < 0) {
+ arg->stop = 1;
+ break;
+ }
+ arg->count++;
+ }
+}
+
+static const struct Qdisc_class_ops skbprio_class_ops = {
+ .leaf = skbprio_leaf,
+ .find = skbprio_find,
+ .dump = skbprio_dump_class,
+ .dump_stats = skbprio_dump_class_stats,
+ .walk = skbprio_walk,
+};
+
+static struct Qdisc_ops skbprio_qdisc_ops __read_mostly = {
+ .cl_ops = &skbprio_class_ops,
+ .id = "skbprio",
+ .priv_size = sizeof(struct skbprio_sched_data),
+ .enqueue = skbprio_enqueue,
+ .dequeue = skbprio_dequeue,
+ .peek = qdisc_peek_dequeued,
+ .init = skbprio_init,
+ .reset = skbprio_reset,
+ .change = skbprio_change,
+ .dump = skbprio_dump,
+ .destroy = skbprio_destroy,
+ .owner = THIS_MODULE,
+};
+
+static int __init skbprio_module_init(void)
+{
+ return register_qdisc(&skbprio_qdisc_ops);
+}
+
+static void __exit skbprio_module_exit(void)
+{
+ unregister_qdisc(&skbprio_qdisc_ops);
+}
+
+module_init(skbprio_module_init)
+module_exit(skbprio_module_exit)
+
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
new file mode 100644
index 000000000000..206e4dbed12f
--- /dev/null
+++ b/net/sched/sch_taprio.c
@@ -0,0 +1,962 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/* net/sched/sch_taprio.c Time Aware Priority Scheduler
+ *
+ * Authors: Vinicius Costa Gomes <vinicius.gomes@intel.com>
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/list.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
+#include <net/sch_generic.h>
+
+#define TAPRIO_ALL_GATES_OPEN -1
+
+struct sched_entry {
+ struct list_head list;
+
+ /* The instant that this entry "closes" and the next one
+ * should open, the qdisc will make some effort so that no
+ * packet leaves after this time.
+ */
+ ktime_t close_time;
+ atomic_t budget;
+ int index;
+ u32 gate_mask;
+ u32 interval;
+ u8 command;
+};
+
+struct taprio_sched {
+ struct Qdisc **qdiscs;
+ struct Qdisc *root;
+ s64 base_time;
+ int clockid;
+ int picos_per_byte; /* Using picoseconds because for 10Gbps+
+ * speeds it's sub-nanoseconds per byte
+ */
+ size_t num_entries;
+
+ /* Protects the update side of the RCU protected current_entry */
+ spinlock_t current_entry_lock;
+ struct sched_entry __rcu *current_entry;
+ struct list_head entries;
+ ktime_t (*get_time)(void);
+ struct hrtimer advance_timer;
+};
+
+static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+ struct taprio_sched *q = qdisc_priv(sch);
+ struct Qdisc *child;
+ int queue;
+
+ queue = skb_get_queue_mapping(skb);
+
+ child = q->qdiscs[queue];
+ if (unlikely(!child))
+ return qdisc_drop(skb, sch, to_free);
+
+ qdisc_qstats_backlog_inc(sch, skb);
+ sch->q.qlen++;
+
+ return qdisc_enqueue(skb, child, to_free);
+}
+
+static struct sk_buff *taprio_peek(struct Qdisc *sch)
+{
+ struct taprio_sched *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ struct sched_entry *entry;
+ struct sk_buff *skb;
+ u32 gate_mask;
+ int i;
+
+ rcu_read_lock();
+ entry = rcu_dereference(q->current_entry);
+ gate_mask = entry ? entry->gate_mask : -1;
+ rcu_read_unlock();
+
+ if (!gate_mask)
+ return NULL;
+
+ for (i = 0; i < dev->num_tx_queues; i++) {
+ struct Qdisc *child = q->qdiscs[i];
+ int prio;
+ u8 tc;
+
+ if (unlikely(!child))
+ continue;
+
+ skb = child->ops->peek(child);
+ if (!skb)
+ continue;
+
+ prio = skb->priority;
+ tc = netdev_get_prio_tc_map(dev, prio);
+
+ if (!(gate_mask & BIT(tc)))
+ return NULL;
+
+ return skb;
+ }
+
+ return NULL;
+}
+
+static inline int length_to_duration(struct taprio_sched *q, int len)
+{
+ return (len * q->picos_per_byte) / 1000;
+}
+
+static struct sk_buff *taprio_dequeue(struct Qdisc *sch)
+{
+ struct taprio_sched *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ struct sched_entry *entry;
+ struct sk_buff *skb;
+ u32 gate_mask;
+ int i;
+
+ rcu_read_lock();
+ entry = rcu_dereference(q->current_entry);
+ /* if there's no entry, it means that the schedule didn't
+ * start yet, so force all gates to be open, this is in
+ * accordance to IEEE 802.1Qbv-2015 Section 8.6.9.4.5
+ * "AdminGateSates"
+ */
+ gate_mask = entry ? entry->gate_mask : TAPRIO_ALL_GATES_OPEN;
+ rcu_read_unlock();
+
+ if (!gate_mask)
+ return NULL;
+
+ for (i = 0; i < dev->num_tx_queues; i++) {
+ struct Qdisc *child = q->qdiscs[i];
+ ktime_t guard;
+ int prio;
+ int len;
+ u8 tc;
+
+ if (unlikely(!child))
+ continue;
+
+ skb = child->ops->peek(child);
+ if (!skb)
+ continue;
+
+ prio = skb->priority;
+ tc = netdev_get_prio_tc_map(dev, prio);
+
+ if (!(gate_mask & BIT(tc)))
+ continue;
+
+ len = qdisc_pkt_len(skb);
+ guard = ktime_add_ns(q->get_time(),
+ length_to_duration(q, len));
+
+ /* In the case that there's no gate entry, there's no
+ * guard band ...
+ */
+ if (gate_mask != TAPRIO_ALL_GATES_OPEN &&
+ ktime_after(guard, entry->close_time))
+ return NULL;
+
+ /* ... and no budget. */
+ if (gate_mask != TAPRIO_ALL_GATES_OPEN &&
+ atomic_sub_return(len, &entry->budget) < 0)
+ return NULL;
+
+ skb = child->ops->dequeue(child);
+ if (unlikely(!skb))
+ return NULL;
+
+ qdisc_bstats_update(sch, skb);
+ qdisc_qstats_backlog_dec(sch, skb);
+ sch->q.qlen--;
+
+ return skb;
+ }
+
+ return NULL;
+}
+
+static bool should_restart_cycle(const struct taprio_sched *q,
+ const struct sched_entry *entry)
+{
+ WARN_ON(!entry);
+
+ return list_is_last(&entry->list, &q->entries);
+}
+
+static enum hrtimer_restart advance_sched(struct hrtimer *timer)
+{
+ struct taprio_sched *q = container_of(timer, struct taprio_sched,
+ advance_timer);
+ struct sched_entry *entry, *next;
+ struct Qdisc *sch = q->root;
+ ktime_t close_time;
+
+ spin_lock(&q->current_entry_lock);
+ entry = rcu_dereference_protected(q->current_entry,
+ lockdep_is_held(&q->current_entry_lock));
+
+ /* This is the case that it's the first time that the schedule
+ * runs, so it only happens once per schedule. The first entry
+ * is pre-calculated during the schedule initialization.
+ */
+ if (unlikely(!entry)) {
+ next = list_first_entry(&q->entries, struct sched_entry,
+ list);
+ close_time = next->close_time;
+ goto first_run;
+ }
+
+ if (should_restart_cycle(q, entry))
+ next = list_first_entry(&q->entries, struct sched_entry,
+ list);
+ else
+ next = list_next_entry(entry, list);
+
+ close_time = ktime_add_ns(entry->close_time, next->interval);
+
+ next->close_time = close_time;
+ atomic_set(&next->budget,
+ (next->interval * 1000) / q->picos_per_byte);
+
+first_run:
+ rcu_assign_pointer(q->current_entry, next);
+ spin_unlock(&q->current_entry_lock);
+
+ hrtimer_set_expires(&q->advance_timer, close_time);
+
+ rcu_read_lock();
+ __netif_schedule(sch);
+ rcu_read_unlock();
+
+ return HRTIMER_RESTART;
+}
+
+static const struct nla_policy entry_policy[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = {
+ [TCA_TAPRIO_SCHED_ENTRY_INDEX] = { .type = NLA_U32 },
+ [TCA_TAPRIO_SCHED_ENTRY_CMD] = { .type = NLA_U8 },
+ [TCA_TAPRIO_SCHED_ENTRY_GATE_MASK] = { .type = NLA_U32 },
+ [TCA_TAPRIO_SCHED_ENTRY_INTERVAL] = { .type = NLA_U32 },
+};
+
+static const struct nla_policy entry_list_policy[TCA_TAPRIO_SCHED_MAX + 1] = {
+ [TCA_TAPRIO_SCHED_ENTRY] = { .type = NLA_NESTED },
+};
+
+static const struct nla_policy taprio_policy[TCA_TAPRIO_ATTR_MAX + 1] = {
+ [TCA_TAPRIO_ATTR_PRIOMAP] = {
+ .len = sizeof(struct tc_mqprio_qopt)
+ },
+ [TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST] = { .type = NLA_NESTED },
+ [TCA_TAPRIO_ATTR_SCHED_BASE_TIME] = { .type = NLA_S64 },
+ [TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY] = { .type = NLA_NESTED },
+ [TCA_TAPRIO_ATTR_SCHED_CLOCKID] = { .type = NLA_S32 },
+};
+
+static int fill_sched_entry(struct nlattr **tb, struct sched_entry *entry,
+ struct netlink_ext_ack *extack)
+{
+ u32 interval = 0;
+
+ if (tb[TCA_TAPRIO_SCHED_ENTRY_CMD])
+ entry->command = nla_get_u8(
+ tb[TCA_TAPRIO_SCHED_ENTRY_CMD]);
+
+ if (tb[TCA_TAPRIO_SCHED_ENTRY_GATE_MASK])
+ entry->gate_mask = nla_get_u32(
+ tb[TCA_TAPRIO_SCHED_ENTRY_GATE_MASK]);
+
+ if (tb[TCA_TAPRIO_SCHED_ENTRY_INTERVAL])
+ interval = nla_get_u32(
+ tb[TCA_TAPRIO_SCHED_ENTRY_INTERVAL]);
+
+ if (interval == 0) {
+ NL_SET_ERR_MSG(extack, "Invalid interval for schedule entry");
+ return -EINVAL;
+ }
+
+ entry->interval = interval;
+
+ return 0;
+}
+
+static int parse_sched_entry(struct nlattr *n, struct sched_entry *entry,
+ int index, struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = { };
+ int err;
+
+ err = nla_parse_nested(tb, TCA_TAPRIO_SCHED_ENTRY_MAX, n,
+ entry_policy, NULL);
+ if (err < 0) {
+ NL_SET_ERR_MSG(extack, "Could not parse nested entry");
+ return -EINVAL;
+ }
+
+ entry->index = index;
+
+ return fill_sched_entry(tb, entry, extack);
+}
+
+/* Returns the number of entries in case of success */
+static int parse_sched_single_entry(struct nlattr *n,
+ struct taprio_sched *q,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb_entry[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = { };
+ struct nlattr *tb_list[TCA_TAPRIO_SCHED_MAX + 1] = { };
+ struct sched_entry *entry;
+ bool found = false;
+ u32 index;
+ int err;
+
+ err = nla_parse_nested(tb_list, TCA_TAPRIO_SCHED_MAX,
+ n, entry_list_policy, NULL);
+ if (err < 0) {
+ NL_SET_ERR_MSG(extack, "Could not parse nested entry");
+ return -EINVAL;
+ }
+
+ if (!tb_list[TCA_TAPRIO_SCHED_ENTRY]) {
+ NL_SET_ERR_MSG(extack, "Single-entry must include an entry");
+ return -EINVAL;
+ }
+
+ err = nla_parse_nested(tb_entry, TCA_TAPRIO_SCHED_ENTRY_MAX,
+ tb_list[TCA_TAPRIO_SCHED_ENTRY],
+ entry_policy, NULL);
+ if (err < 0) {
+ NL_SET_ERR_MSG(extack, "Could not parse nested entry");
+ return -EINVAL;
+ }
+
+ if (!tb_entry[TCA_TAPRIO_SCHED_ENTRY_INDEX]) {
+ NL_SET_ERR_MSG(extack, "Entry must specify an index\n");
+ return -EINVAL;
+ }
+
+ index = nla_get_u32(tb_entry[TCA_TAPRIO_SCHED_ENTRY_INDEX]);
+ if (index >= q->num_entries) {
+ NL_SET_ERR_MSG(extack, "Index for single entry exceeds number of entries in schedule");
+ return -EINVAL;
+ }
+
+ list_for_each_entry(entry, &q->entries, list) {
+ if (entry->index == index) {
+ found = true;
+ break;
+ }
+ }
+
+ if (!found) {
+ NL_SET_ERR_MSG(extack, "Could not find entry");
+ return -ENOENT;
+ }
+
+ err = fill_sched_entry(tb_entry, entry, extack);
+ if (err < 0)
+ return err;
+
+ return q->num_entries;
+}
+
+static int parse_sched_list(struct nlattr *list,
+ struct taprio_sched *q,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *n;
+ int err, rem;
+ int i = 0;
+
+ if (!list)
+ return -EINVAL;
+
+ nla_for_each_nested(n, list, rem) {
+ struct sched_entry *entry;
+
+ if (nla_type(n) != TCA_TAPRIO_SCHED_ENTRY) {
+ NL_SET_ERR_MSG(extack, "Attribute is not of type 'entry'");
+ continue;
+ }
+
+ entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry) {
+ NL_SET_ERR_MSG(extack, "Not enough memory for entry");
+ return -ENOMEM;
+ }
+
+ err = parse_sched_entry(n, entry, i, extack);
+ if (err < 0) {
+ kfree(entry);
+ return err;
+ }
+
+ list_add_tail(&entry->list, &q->entries);
+ i++;
+ }
+
+ q->num_entries = i;
+
+ return i;
+}
+
+/* Returns the number of entries in case of success */
+static int parse_taprio_opt(struct nlattr **tb, struct taprio_sched *q,
+ struct netlink_ext_ack *extack)
+{
+ int err = 0;
+ int clockid;
+
+ if (tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST] &&
+ tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY])
+ return -EINVAL;
+
+ if (tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY] && q->num_entries == 0)
+ return -EINVAL;
+
+ if (q->clockid == -1 && !tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID])
+ return -EINVAL;
+
+ if (tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME])
+ q->base_time = nla_get_s64(
+ tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]);
+
+ if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) {
+ clockid = nla_get_s32(tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]);
+
+ /* We only support static clockids and we don't allow
+ * for it to be modified after the first init.
+ */
+ if (clockid < 0 || (q->clockid != -1 && q->clockid != clockid))
+ return -EINVAL;
+
+ q->clockid = clockid;
+ }
+
+ if (tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST])
+ err = parse_sched_list(
+ tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST], q, extack);
+ else if (tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY])
+ err = parse_sched_single_entry(
+ tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY], q, extack);
+
+ /* parse_sched_* return the number of entries in the schedule,
+ * a schedule with zero entries is an error.
+ */
+ if (err == 0) {
+ NL_SET_ERR_MSG(extack, "The schedule should contain at least one entry");
+ return -EINVAL;
+ }
+
+ return err;
+}
+
+static int taprio_parse_mqprio_opt(struct net_device *dev,
+ struct tc_mqprio_qopt *qopt,
+ struct netlink_ext_ack *extack)
+{
+ int i, j;
+
+ if (!qopt) {
+ NL_SET_ERR_MSG(extack, "'mqprio' configuration is necessary");
+ return -EINVAL;
+ }
+
+ /* Verify num_tc is not out of max range */
+ if (qopt->num_tc > TC_MAX_QUEUE) {
+ NL_SET_ERR_MSG(extack, "Number of traffic classes is outside valid range");
+ return -EINVAL;
+ }
+
+ /* taprio imposes that traffic classes map 1:n to tx queues */
+ if (qopt->num_tc > dev->num_tx_queues) {
+ NL_SET_ERR_MSG(extack, "Number of traffic classes is greater than number of HW queues");
+ return -EINVAL;
+ }
+
+ /* Verify priority mapping uses valid tcs */
+ for (i = 0; i < TC_BITMASK + 1; i++) {
+ if (qopt->prio_tc_map[i] >= qopt->num_tc) {
+ NL_SET_ERR_MSG(extack, "Invalid traffic class in priority to traffic class mapping");
+ return -EINVAL;
+ }
+ }
+
+ for (i = 0; i < qopt->num_tc; i++) {
+ unsigned int last = qopt->offset[i] + qopt->count[i];
+
+ /* Verify the queue count is in tx range being equal to the
+ * real_num_tx_queues indicates the last queue is in use.
+ */
+ if (qopt->offset[i] >= dev->num_tx_queues ||
+ !qopt->count[i] ||
+ last > dev->real_num_tx_queues) {
+ NL_SET_ERR_MSG(extack, "Invalid queue in traffic class to queue mapping");
+ return -EINVAL;
+ }
+
+ /* Verify that the offset and counts do not overlap */
+ for (j = i + 1; j < qopt->num_tc; j++) {
+ if (last > qopt->offset[j]) {
+ NL_SET_ERR_MSG(extack, "Detected overlap in the traffic class to queue mapping");
+ return -EINVAL;
+ }
+ }
+ }
+
+ return 0;
+}
+
+static ktime_t taprio_get_start_time(struct Qdisc *sch)
+{
+ struct taprio_sched *q = qdisc_priv(sch);
+ struct sched_entry *entry;
+ ktime_t now, base, cycle;
+ s64 n;
+
+ base = ns_to_ktime(q->base_time);
+ cycle = 0;
+
+ /* Calculate the cycle_time, by summing all the intervals.
+ */
+ list_for_each_entry(entry, &q->entries, list)
+ cycle = ktime_add_ns(cycle, entry->interval);
+
+ if (!cycle)
+ return base;
+
+ now = q->get_time();
+
+ if (ktime_after(base, now))
+ return base;
+
+ /* Schedule the start time for the beginning of the next
+ * cycle.
+ */
+ n = div64_s64(ktime_sub_ns(now, base), cycle);
+
+ return ktime_add_ns(base, (n + 1) * cycle);
+}
+
+static void taprio_start_sched(struct Qdisc *sch, ktime_t start)
+{
+ struct taprio_sched *q = qdisc_priv(sch);
+ struct sched_entry *first;
+ unsigned long flags;
+
+ spin_lock_irqsave(&q->current_entry_lock, flags);
+
+ first = list_first_entry(&q->entries, struct sched_entry,
+ list);
+
+ first->close_time = ktime_add_ns(start, first->interval);
+ atomic_set(&first->budget,
+ (first->interval * 1000) / q->picos_per_byte);
+ rcu_assign_pointer(q->current_entry, NULL);
+
+ spin_unlock_irqrestore(&q->current_entry_lock, flags);
+
+ hrtimer_start(&q->advance_timer, start, HRTIMER_MODE_ABS);
+}
+
+static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[TCA_TAPRIO_ATTR_MAX + 1] = { };
+ struct taprio_sched *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ struct tc_mqprio_qopt *mqprio = NULL;
+ struct ethtool_link_ksettings ecmd;
+ int i, err, size;
+ s64 link_speed;
+ ktime_t start;
+
+ err = nla_parse_nested(tb, TCA_TAPRIO_ATTR_MAX, opt,
+ taprio_policy, extack);
+ if (err < 0)
+ return err;
+
+ err = -EINVAL;
+ if (tb[TCA_TAPRIO_ATTR_PRIOMAP])
+ mqprio = nla_data(tb[TCA_TAPRIO_ATTR_PRIOMAP]);
+
+ err = taprio_parse_mqprio_opt(dev, mqprio, extack);
+ if (err < 0)
+ return err;
+
+ /* A schedule with less than one entry is an error */
+ size = parse_taprio_opt(tb, q, extack);
+ if (size < 0)
+ return size;
+
+ hrtimer_init(&q->advance_timer, q->clockid, HRTIMER_MODE_ABS);
+ q->advance_timer.function = advance_sched;
+
+ switch (q->clockid) {
+ case CLOCK_REALTIME:
+ q->get_time = ktime_get_real;
+ break;
+ case CLOCK_MONOTONIC:
+ q->get_time = ktime_get;
+ break;
+ case CLOCK_BOOTTIME:
+ q->get_time = ktime_get_boottime;
+ break;
+ case CLOCK_TAI:
+ q->get_time = ktime_get_clocktai;
+ break;
+ default:
+ return -ENOTSUPP;
+ }
+
+ for (i = 0; i < dev->num_tx_queues; i++) {
+ struct netdev_queue *dev_queue;
+ struct Qdisc *qdisc;
+
+ dev_queue = netdev_get_tx_queue(dev, i);
+ qdisc = qdisc_create_dflt(dev_queue,
+ &pfifo_qdisc_ops,
+ TC_H_MAKE(TC_H_MAJ(sch->handle),
+ TC_H_MIN(i + 1)),
+ extack);
+ if (!qdisc)
+ return -ENOMEM;
+
+ if (i < dev->real_num_tx_queues)
+ qdisc_hash_add(qdisc, false);
+
+ q->qdiscs[i] = qdisc;
+ }
+
+ if (mqprio) {
+ netdev_set_num_tc(dev, mqprio->num_tc);
+ for (i = 0; i < mqprio->num_tc; i++)
+ netdev_set_tc_queue(dev, i,
+ mqprio->count[i],
+ mqprio->offset[i]);
+
+ /* Always use supplied priority mappings */
+ for (i = 0; i < TC_BITMASK + 1; i++)
+ netdev_set_prio_tc_map(dev, i,
+ mqprio->prio_tc_map[i]);
+ }
+
+ if (!__ethtool_get_link_ksettings(dev, &ecmd))
+ link_speed = ecmd.base.speed;
+ else
+ link_speed = SPEED_1000;
+
+ q->picos_per_byte = div64_s64(NSEC_PER_SEC * 1000LL * 8,
+ link_speed * 1000 * 1000);
+
+ start = taprio_get_start_time(sch);
+ if (!start)
+ return 0;
+
+ taprio_start_sched(sch, start);
+
+ return 0;
+}
+
+static void taprio_destroy(struct Qdisc *sch)
+{
+ struct taprio_sched *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ struct sched_entry *entry, *n;
+ unsigned int i;
+
+ hrtimer_cancel(&q->advance_timer);
+
+ if (q->qdiscs) {
+ for (i = 0; i < dev->num_tx_queues && q->qdiscs[i]; i++)
+ qdisc_put(q->qdiscs[i]);
+
+ kfree(q->qdiscs);
+ }
+ q->qdiscs = NULL;
+
+ netdev_set_num_tc(dev, 0);
+
+ list_for_each_entry_safe(entry, n, &q->entries, list) {
+ list_del(&entry->list);
+ kfree(entry);
+ }
+}
+
+static int taprio_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct taprio_sched *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+
+ INIT_LIST_HEAD(&q->entries);
+ spin_lock_init(&q->current_entry_lock);
+
+ /* We may overwrite the configuration later */
+ hrtimer_init(&q->advance_timer, CLOCK_TAI, HRTIMER_MODE_ABS);
+
+ q->root = sch;
+
+ /* We only support static clockids. Use an invalid value as default
+ * and get the valid one on taprio_change().
+ */
+ q->clockid = -1;
+
+ if (sch->parent != TC_H_ROOT)
+ return -EOPNOTSUPP;
+
+ if (!netif_is_multiqueue(dev))
+ return -EOPNOTSUPP;
+
+ /* pre-allocate qdisc, attachment can't fail */
+ q->qdiscs = kcalloc(dev->num_tx_queues,
+ sizeof(q->qdiscs[0]),
+ GFP_KERNEL);
+
+ if (!q->qdiscs)
+ return -ENOMEM;
+
+ if (!opt)
+ return -EINVAL;
+
+ return taprio_change(sch, opt, extack);
+}
+
+static struct netdev_queue *taprio_queue_get(struct Qdisc *sch,
+ unsigned long cl)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ unsigned long ntx = cl - 1;
+
+ if (ntx >= dev->num_tx_queues)
+ return NULL;
+
+ return netdev_get_tx_queue(dev, ntx);
+}
+
+static int taprio_graft(struct Qdisc *sch, unsigned long cl,
+ struct Qdisc *new, struct Qdisc **old,
+ struct netlink_ext_ack *extack)
+{
+ struct taprio_sched *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ struct netdev_queue *dev_queue = taprio_queue_get(sch, cl);
+
+ if (!dev_queue)
+ return -EINVAL;
+
+ if (dev->flags & IFF_UP)
+ dev_deactivate(dev);
+
+ *old = q->qdiscs[cl - 1];
+ q->qdiscs[cl - 1] = new;
+
+ if (new)
+ new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
+
+ if (dev->flags & IFF_UP)
+ dev_activate(dev);
+
+ return 0;
+}
+
+static int dump_entry(struct sk_buff *msg,
+ const struct sched_entry *entry)
+{
+ struct nlattr *item;
+
+ item = nla_nest_start(msg, TCA_TAPRIO_SCHED_ENTRY);
+ if (!item)
+ return -ENOSPC;
+
+ if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_INDEX, entry->index))
+ goto nla_put_failure;
+
+ if (nla_put_u8(msg, TCA_TAPRIO_SCHED_ENTRY_CMD, entry->command))
+ goto nla_put_failure;
+
+ if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_GATE_MASK,
+ entry->gate_mask))
+ goto nla_put_failure;
+
+ if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_INTERVAL,
+ entry->interval))
+ goto nla_put_failure;
+
+ return nla_nest_end(msg, item);
+
+nla_put_failure:
+ nla_nest_cancel(msg, item);
+ return -1;
+}
+
+static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct taprio_sched *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ struct tc_mqprio_qopt opt = { 0 };
+ struct nlattr *nest, *entry_list;
+ struct sched_entry *entry;
+ unsigned int i;
+
+ opt.num_tc = netdev_get_num_tc(dev);
+ memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map));
+
+ for (i = 0; i < netdev_get_num_tc(dev); i++) {
+ opt.count[i] = dev->tc_to_txq[i].count;
+ opt.offset[i] = dev->tc_to_txq[i].offset;
+ }
+
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (!nest)
+ return -ENOSPC;
+
+ if (nla_put(skb, TCA_TAPRIO_ATTR_PRIOMAP, sizeof(opt), &opt))
+ goto options_error;
+
+ if (nla_put_s64(skb, TCA_TAPRIO_ATTR_SCHED_BASE_TIME,
+ q->base_time, TCA_TAPRIO_PAD))
+ goto options_error;
+
+ if (nla_put_s32(skb, TCA_TAPRIO_ATTR_SCHED_CLOCKID, q->clockid))
+ goto options_error;
+
+ entry_list = nla_nest_start(skb, TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST);
+ if (!entry_list)
+ goto options_error;
+
+ list_for_each_entry(entry, &q->entries, list) {
+ if (dump_entry(skb, entry) < 0)
+ goto options_error;
+ }
+
+ nla_nest_end(skb, entry_list);
+
+ return nla_nest_end(skb, nest);
+
+options_error:
+ nla_nest_cancel(skb, nest);
+ return -1;
+}
+
+static struct Qdisc *taprio_leaf(struct Qdisc *sch, unsigned long cl)
+{
+ struct netdev_queue *dev_queue = taprio_queue_get(sch, cl);
+
+ if (!dev_queue)
+ return NULL;
+
+ return dev_queue->qdisc_sleeping;
+}
+
+static unsigned long taprio_find(struct Qdisc *sch, u32 classid)
+{
+ unsigned int ntx = TC_H_MIN(classid);
+
+ if (!taprio_queue_get(sch, ntx))
+ return 0;
+ return ntx;
+}
+
+static int taprio_dump_class(struct Qdisc *sch, unsigned long cl,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ struct netdev_queue *dev_queue = taprio_queue_get(sch, cl);
+
+ tcm->tcm_parent = TC_H_ROOT;
+ tcm->tcm_handle |= TC_H_MIN(cl);
+ tcm->tcm_info = dev_queue->qdisc_sleeping->handle;
+
+ return 0;
+}
+
+static int taprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
+ struct gnet_dump *d)
+ __releases(d->lock)
+ __acquires(d->lock)
+{
+ struct netdev_queue *dev_queue = taprio_queue_get(sch, cl);
+
+ sch = dev_queue->qdisc_sleeping;
+ if (gnet_stats_copy_basic(&sch->running, d, NULL, &sch->bstats) < 0 ||
+ gnet_stats_copy_queue(d, NULL, &sch->qstats, sch->q.qlen) < 0)
+ return -1;
+ return 0;
+}
+
+static void taprio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ unsigned long ntx;
+
+ if (arg->stop)
+ return;
+
+ arg->count = arg->skip;
+ for (ntx = arg->skip; ntx < dev->num_tx_queues; ntx++) {
+ if (arg->fn(sch, ntx + 1, arg) < 0) {
+ arg->stop = 1;
+ break;
+ }
+ arg->count++;
+ }
+}
+
+static struct netdev_queue *taprio_select_queue(struct Qdisc *sch,
+ struct tcmsg *tcm)
+{
+ return taprio_queue_get(sch, TC_H_MIN(tcm->tcm_parent));
+}
+
+static const struct Qdisc_class_ops taprio_class_ops = {
+ .graft = taprio_graft,
+ .leaf = taprio_leaf,
+ .find = taprio_find,
+ .walk = taprio_walk,
+ .dump = taprio_dump_class,
+ .dump_stats = taprio_dump_class_stats,
+ .select_queue = taprio_select_queue,
+};
+
+static struct Qdisc_ops taprio_qdisc_ops __read_mostly = {
+ .cl_ops = &taprio_class_ops,
+ .id = "taprio",
+ .priv_size = sizeof(struct taprio_sched),
+ .init = taprio_init,
+ .destroy = taprio_destroy,
+ .peek = taprio_peek,
+ .dequeue = taprio_dequeue,
+ .enqueue = taprio_enqueue,
+ .dump = taprio_dump,
+ .owner = THIS_MODULE,
+};
+
+static int __init taprio_module_init(void)
+{
+ return register_qdisc(&taprio_qdisc_ops);
+}
+
+static void __exit taprio_module_exit(void)
+{
+ unregister_qdisc(&taprio_qdisc_ops);
+}
+
+module_init(taprio_module_init);
+module_exit(taprio_module_exit);
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index 6f74a426f159..942dcca09cf2 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -162,7 +162,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch,
nb = 0;
while (segs) {
nskb = segs->next;
- segs->next = NULL;
+ skb_mark_not_on_list(segs);
qdisc_skb_cb(segs)->pkt_len = segs->len;
len += segs->len;
ret = qdisc_enqueue(segs, q->qdisc, to_free);
@@ -392,7 +392,7 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt,
if (child) {
qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen,
q->qdisc->qstats.backlog);
- qdisc_destroy(q->qdisc);
+ qdisc_put(q->qdisc);
q->qdisc = child;
}
q->limit = qopt->limit;
@@ -438,7 +438,7 @@ static void tbf_destroy(struct Qdisc *sch)
struct tbf_sched_data *q = qdisc_priv(sch);
qdisc_watchdog_cancel(&q->watchdog);
- qdisc_destroy(q->qdisc);
+ qdisc_put(q->qdisc);
}
static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
diff --git a/net/sctp/Kconfig b/net/sctp/Kconfig
index c740b189d4ba..950ecf6e7439 100644
--- a/net/sctp/Kconfig
+++ b/net/sctp/Kconfig
@@ -41,8 +41,8 @@ config SCTP_DBG_OBJCNT
bool "SCTP: Debug object counts"
depends on PROC_FS
help
- If you say Y, this will enable debugging support for counting the
- type of objects that are currently allocated. This is useful for
+ If you say Y, this will enable debugging support for counting the
+ type of objects that are currently allocated. This is useful for
identifying memory leaks. This debug information can be viewed by
'cat /proc/net/sctp/sctp_dbg_objcnt'
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 5d5a16204d50..a827a1f562bf 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -115,6 +115,9 @@ static struct sctp_association *sctp_association_init(
/* Initialize path max retrans value. */
asoc->pathmaxrxt = sp->pathmaxrxt;
+ asoc->flowlabel = sp->flowlabel;
+ asoc->dscp = sp->dscp;
+
/* Initialize default path MTU. */
asoc->pathmtu = sp->pathmtu;
@@ -647,6 +650,18 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
peer->sackdelay = asoc->sackdelay;
peer->sackfreq = asoc->sackfreq;
+ if (addr->sa.sa_family == AF_INET6) {
+ __be32 info = addr->v6.sin6_flowinfo;
+
+ if (info) {
+ peer->flowlabel = ntohl(info & IPV6_FLOWLABEL_MASK);
+ peer->flowlabel |= SCTP_FLOWLABEL_SET_MASK;
+ } else {
+ peer->flowlabel = asoc->flowlabel;
+ }
+ }
+ peer->dscp = asoc->dscp;
+
/* Enable/disable heartbeat, SACK delay, and path MTU discovery
* based on association setting.
*/
@@ -1435,7 +1450,8 @@ void sctp_assoc_sync_pmtu(struct sctp_association *asoc)
/* Get the lowest pmtu of all the transports. */
list_for_each_entry(t, &asoc->peer.transport_addr_list, transports) {
if (t->pmtu_pending && t->dst) {
- sctp_transport_update_pmtu(t, sctp_dst_mtu(t->dst));
+ sctp_transport_update_pmtu(t,
+ atomic_read(&t->mtu_info));
t->pmtu_pending = 0;
}
if (!pmtu || (t->pathmtu < pmtu))
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index bfb9f812e2ef..ce8087846f05 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -325,7 +325,8 @@ int sctp_chunk_abandoned(struct sctp_chunk *chunk)
if (SCTP_PR_TTL_ENABLED(chunk->sinfo.sinfo_flags) &&
time_after(jiffies, chunk->msg->expires_at)) {
struct sctp_stream_out *streamout =
- &chunk->asoc->stream.out[chunk->sinfo.sinfo_stream];
+ SCTP_SO(&chunk->asoc->stream,
+ chunk->sinfo.sinfo_stream);
if (chunk->sent_count) {
chunk->asoc->abandoned_sent[SCTP_PR_INDEX(TTL)]++;
@@ -339,7 +340,8 @@ int sctp_chunk_abandoned(struct sctp_chunk *chunk)
} else if (SCTP_PR_RTX_ENABLED(chunk->sinfo.sinfo_flags) &&
chunk->sent_count > chunk->sinfo.sinfo_timetolive) {
struct sctp_stream_out *streamout =
- &chunk->asoc->stream.out[chunk->sinfo.sinfo_stream];
+ SCTP_SO(&chunk->asoc->stream,
+ chunk->sinfo.sinfo_stream);
chunk->asoc->abandoned_sent[SCTP_PR_INDEX(RTX)]++;
streamout->ext->abandoned_sent[SCTP_PR_INDEX(RTX)]++;
diff --git a/net/sctp/input.c b/net/sctp/input.c
index ba8a6e6c36fa..5c36a99882ed 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -56,6 +56,7 @@
#include <net/sctp/sm.h>
#include <net/sctp/checksum.h>
#include <net/net_namespace.h>
+#include <linux/rhashtable.h>
/* Forward declarations for internal helpers. */
static int sctp_rcv_ootb(struct sk_buff *);
@@ -394,6 +395,7 @@ void sctp_icmp_frag_needed(struct sock *sk, struct sctp_association *asoc,
return;
if (sock_owned_by_user(sk)) {
+ atomic_set(&t->mtu_info, pmtu);
asoc->pmtu_pending = 1;
t->pmtu_pending = 1;
return;
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 0cd2e764f47f..fc6c5e4bffa5 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -209,12 +209,17 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport)
struct sock *sk = skb->sk;
struct ipv6_pinfo *np = inet6_sk(sk);
struct flowi6 *fl6 = &transport->fl.u.ip6;
+ __u8 tclass = np->tclass;
int res;
pr_debug("%s: skb:%p, len:%d, src:%pI6 dst:%pI6\n", __func__, skb,
skb->len, &fl6->saddr, &fl6->daddr);
- IP6_ECN_flow_xmit(sk, fl6->flowlabel);
+ if (transport->dscp & SCTP_DSCP_SET_MASK)
+ tclass = transport->dscp & SCTP_DSCP_VAL_MASK;
+
+ if (INET_ECN_is_capable(tclass))
+ IP6_ECN_flow_xmit(sk, fl6->flowlabel);
if (!(transport->param_flags & SPP_PMTUD_ENABLE))
skb->ignore_df = 1;
@@ -223,7 +228,7 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport)
rcu_read_lock();
res = ip6_xmit(sk, skb, fl6, sk->sk_mark, rcu_dereference(np->opt),
- np->tclass);
+ tclass);
rcu_read_unlock();
return res;
}
@@ -254,6 +259,17 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
fl6->flowi6_oif = daddr->v6.sin6_scope_id;
else if (asoc)
fl6->flowi6_oif = asoc->base.sk->sk_bound_dev_if;
+ if (t->flowlabel & SCTP_FLOWLABEL_SET_MASK)
+ fl6->flowlabel = htonl(t->flowlabel & SCTP_FLOWLABEL_VAL_MASK);
+
+ if (np->sndflow && (fl6->flowlabel & IPV6_FLOWLABEL_MASK)) {
+ struct ip6_flowlabel *flowlabel;
+
+ flowlabel = fl6_sock_lookup(sk, fl6->flowlabel);
+ if (!flowlabel)
+ goto out;
+ fl6_sock_release(flowlabel);
+ }
pr_debug("%s: dst=%pI6 ", __func__, &fl6->daddr);
diff --git a/net/sctp/output.c b/net/sctp/output.c
index 7f849b01ec8e..67939ad99c01 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -120,6 +120,12 @@ void sctp_packet_config(struct sctp_packet *packet, __u32 vtag,
sctp_assoc_sync_pmtu(asoc);
}
+ if (asoc->pmtu_pending) {
+ if (asoc->param_flags & SPP_PMTUD_ENABLE)
+ sctp_assoc_sync_pmtu(asoc);
+ asoc->pmtu_pending = 0;
+ }
+
/* If there a is a prepend chunk stick it on the list before
* any other chunks get appended.
*/
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index d68aa33485a9..9cb854b05342 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -80,7 +80,7 @@ static inline void sctp_outq_head_data(struct sctp_outq *q,
q->out_qlen += ch->skb->len;
stream = sctp_chunk_stream_no(ch);
- oute = q->asoc->stream.out[stream].ext;
+ oute = SCTP_SO(&q->asoc->stream, stream)->ext;
list_add(&ch->stream_list, &oute->outq);
}
@@ -101,7 +101,7 @@ static inline void sctp_outq_tail_data(struct sctp_outq *q,
q->out_qlen += ch->skb->len;
stream = sctp_chunk_stream_no(ch);
- oute = q->asoc->stream.out[stream].ext;
+ oute = SCTP_SO(&q->asoc->stream, stream)->ext;
list_add_tail(&ch->stream_list, &oute->outq);
}
@@ -372,7 +372,7 @@ static int sctp_prsctp_prune_sent(struct sctp_association *asoc,
sctp_insert_list(&asoc->outqueue.abandoned,
&chk->transmitted_list);
- streamout = &asoc->stream.out[chk->sinfo.sinfo_stream];
+ streamout = SCTP_SO(&asoc->stream, chk->sinfo.sinfo_stream);
asoc->sent_cnt_removable--;
asoc->abandoned_sent[SCTP_PR_INDEX(PRIO)]++;
streamout->ext->abandoned_sent[SCTP_PR_INDEX(PRIO)]++;
@@ -385,9 +385,7 @@ static int sctp_prsctp_prune_sent(struct sctp_association *asoc,
asoc->outqueue.outstanding_bytes -= sctp_data_size(chk);
}
- msg_len -= SCTP_DATA_SNDSIZE(chk) +
- sizeof(struct sk_buff) +
- sizeof(struct sctp_chunk);
+ msg_len -= chk->skb->truesize + sizeof(struct sctp_chunk);
if (msg_len <= 0)
break;
}
@@ -416,14 +414,12 @@ static int sctp_prsctp_prune_unsent(struct sctp_association *asoc,
asoc->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++;
if (chk->sinfo.sinfo_stream < asoc->stream.outcnt) {
struct sctp_stream_out *streamout =
- &asoc->stream.out[chk->sinfo.sinfo_stream];
+ SCTP_SO(&asoc->stream, chk->sinfo.sinfo_stream);
streamout->ext->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++;
}
- msg_len -= SCTP_DATA_SNDSIZE(chk) +
- sizeof(struct sk_buff) +
- sizeof(struct sctp_chunk);
+ msg_len -= chk->skb->truesize + sizeof(struct sctp_chunk);
sctp_chunk_free(chk);
if (msg_len <= 0)
break;
@@ -1048,7 +1044,7 @@ static void sctp_outq_flush_data(struct sctp_flush_ctx *ctx,
if (!ctx->packet || !ctx->packet->has_cookie_echo)
return;
- /* fallthru */
+ /* fall through */
case SCTP_STATE_ESTABLISHED:
case SCTP_STATE_SHUTDOWN_PENDING:
case SCTP_STATE_SHUTDOWN_RECEIVED:
@@ -1082,6 +1078,7 @@ static void sctp_outq_flush_data(struct sctp_flush_ctx *ctx,
/* Finally, transmit new packets. */
while ((chunk = sctp_outq_dequeue_data(ctx->q)) != NULL) {
__u32 sid = ntohs(chunk->subh.data_hdr->stream);
+ __u8 stream_state = SCTP_SO(&ctx->asoc->stream, sid)->state;
/* Has this chunk expired? */
if (sctp_chunk_abandoned(chunk)) {
@@ -1091,7 +1088,7 @@ static void sctp_outq_flush_data(struct sctp_flush_ctx *ctx,
continue;
}
- if (ctx->asoc->stream.out[sid].state == SCTP_STREAM_CLOSED) {
+ if (stream_state == SCTP_STREAM_CLOSED) {
sctp_outq_head_data(ctx->q, chunk);
break;
}
diff --git a/net/sctp/proc.c b/net/sctp/proc.c
index ef5c9a82d4e8..a644292f9faf 100644
--- a/net/sctp/proc.c
+++ b/net/sctp/proc.c
@@ -215,7 +215,6 @@ static const struct seq_operations sctp_eps_ops = {
struct sctp_ht_iter {
struct seq_net_private p;
struct rhashtable_iter hti;
- int start_fail;
};
static void *sctp_transport_seq_start(struct seq_file *seq, loff_t *pos)
@@ -224,7 +223,6 @@ static void *sctp_transport_seq_start(struct seq_file *seq, loff_t *pos)
sctp_transport_walk_start(&iter->hti);
- iter->start_fail = 0;
return sctp_transport_get_idx(seq_file_net(seq), &iter->hti, *pos);
}
@@ -232,8 +230,6 @@ static void sctp_transport_seq_stop(struct seq_file *seq, void *v)
{
struct sctp_ht_iter *iter = seq->private;
- if (iter->start_fail)
- return;
sctp_transport_walk_stop(&iter->hti);
}
@@ -264,8 +260,6 @@ static int sctp_assocs_seq_show(struct seq_file *seq, void *v)
}
transport = (struct sctp_transport *)v;
- if (!sctp_transport_hold(transport))
- return 0;
assoc = transport->asoc;
epb = &assoc->base;
sk = epb->sk;
@@ -322,8 +316,6 @@ static int sctp_remaddr_seq_show(struct seq_file *seq, void *v)
}
transport = (struct sctp_transport *)v;
- if (!sctp_transport_hold(transport))
- return 0;
assoc = transport->asoc;
list_for_each_entry_rcu(tsp, &assoc->peer.transport_addr_list,
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 67f73d3a1356..e948db29ab53 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -426,13 +426,16 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
struct dst_entry *dst = NULL;
union sctp_addr *daddr = &t->ipaddr;
union sctp_addr dst_saddr;
+ __u8 tos = inet_sk(sk)->tos;
+ if (t->dscp & SCTP_DSCP_SET_MASK)
+ tos = t->dscp & SCTP_DSCP_VAL_MASK;
memset(fl4, 0x0, sizeof(struct flowi4));
fl4->daddr = daddr->v4.sin_addr.s_addr;
fl4->fl4_dport = daddr->v4.sin_port;
fl4->flowi4_proto = IPPROTO_SCTP;
if (asoc) {
- fl4->flowi4_tos = RT_CONN_FLAGS(asoc->base.sk);
+ fl4->flowi4_tos = RT_CONN_FLAGS_TOS(asoc->base.sk, tos);
fl4->flowi4_oif = asoc->base.sk->sk_bound_dev_if;
fl4->fl4_sport = htons(asoc->base.bind_addr.port);
}
@@ -495,7 +498,7 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
fl4->fl4_sport = laddr->a.v4.sin_port;
flowi4_update_output(fl4,
asoc->base.sk->sk_bound_dev_if,
- RT_CONN_FLAGS(asoc->base.sk),
+ RT_CONN_FLAGS_TOS(asoc->base.sk, tos),
daddr->v4.sin_addr.s_addr,
laddr->a.v4.sin_addr.s_addr);
@@ -971,16 +974,21 @@ static inline int sctp_v4_xmit(struct sk_buff *skb,
struct sctp_transport *transport)
{
struct inet_sock *inet = inet_sk(skb->sk);
+ __u8 dscp = inet->tos;
pr_debug("%s: skb:%p, len:%d, src:%pI4, dst:%pI4\n", __func__, skb,
- skb->len, &transport->fl.u.ip4.saddr, &transport->fl.u.ip4.daddr);
+ skb->len, &transport->fl.u.ip4.saddr,
+ &transport->fl.u.ip4.daddr);
+
+ if (transport->dscp & SCTP_DSCP_SET_MASK)
+ dscp = transport->dscp & SCTP_DSCP_VAL_MASK;
inet->pmtudisc = transport->param_flags & SPP_PMTUD_ENABLE ?
IP_PMTUDISC_DO : IP_PMTUDISC_DONT;
SCTP_INC_STATS(sock_net(&inet->sk), SCTP_MIB_OUTSCTPPACKS);
- return ip_queue_xmit(&inet->sk, skb, &transport->fl);
+ return __ip_queue_xmit(&inet->sk, skb, &transport->fl, dscp);
}
static struct sctp_af sctp_af_inet;
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index 298112ca8c06..85d393090238 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -1827,4 +1827,3 @@ nomem:
error = -ENOMEM;
goto out;
}
-
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index ce620e878538..fc0386e8ff23 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -66,6 +66,7 @@
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/compat.h>
+#include <linux/rhashtable.h>
#include <net/ip.h>
#include <net/icmp.h>
@@ -82,7 +83,7 @@
#include <net/sctp/stream_sched.h>
/* Forward declarations for internal helper functions. */
-static int sctp_writeable(struct sock *sk);
+static bool sctp_writeable(struct sock *sk);
static void sctp_wfree(struct sk_buff *skb);
static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p,
size_t msg_len);
@@ -118,25 +119,10 @@ static void sctp_enter_memory_pressure(struct sock *sk)
/* Get the sndbuf space available at the time on the association. */
static inline int sctp_wspace(struct sctp_association *asoc)
{
- int amt;
+ struct sock *sk = asoc->base.sk;
- if (asoc->ep->sndbuf_policy)
- amt = asoc->sndbuf_used;
- else
- amt = sk_wmem_alloc_get(asoc->base.sk);
-
- if (amt >= asoc->base.sk->sk_sndbuf) {
- if (asoc->base.sk->sk_userlocks & SOCK_SNDBUF_LOCK)
- amt = 0;
- else {
- amt = sk_stream_wspace(asoc->base.sk);
- if (amt < 0)
- amt = 0;
- }
- } else {
- amt = asoc->base.sk->sk_sndbuf - amt;
- }
- return amt;
+ return asoc->ep->sndbuf_policy ? sk->sk_sndbuf - asoc->sndbuf_used
+ : sk_stream_wspace(sk);
}
/* Increment the used sndbuf space count of the corresponding association by
@@ -165,12 +151,9 @@ static inline void sctp_set_owner_w(struct sctp_chunk *chunk)
/* Save the chunk pointer in skb for sctp_wfree to use later. */
skb_shinfo(chunk->skb)->destructor_arg = chunk;
- asoc->sndbuf_used += SCTP_DATA_SNDSIZE(chunk) +
- sizeof(struct sk_buff) +
- sizeof(struct sctp_chunk);
-
refcount_add(sizeof(struct sctp_chunk), &sk->sk_wmem_alloc);
- sk->sk_wmem_queued += chunk->skb->truesize;
+ asoc->sndbuf_used += chunk->skb->truesize + sizeof(struct sctp_chunk);
+ sk->sk_wmem_queued += chunk->skb->truesize + sizeof(struct sctp_chunk);
sk_mem_charge(sk, chunk->skb->truesize);
}
@@ -270,11 +253,10 @@ struct sctp_association *sctp_id2assoc(struct sock *sk, sctp_assoc_t id)
spin_lock_bh(&sctp_assocs_id_lock);
asoc = (struct sctp_association *)idr_find(&sctp_assocs_id, (int)id);
+ if (asoc && (asoc->base.sk != sk || asoc->base.dead))
+ asoc = NULL;
spin_unlock_bh(&sctp_assocs_id_lock);
- if (!asoc || (asoc->base.sk != sk) || asoc->base.dead)
- return NULL;
-
return asoc;
}
@@ -1696,6 +1678,7 @@ static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 sflags,
struct sctp_association *asoc;
enum sctp_scope scope;
struct cmsghdr *cmsg;
+ __be32 flowinfo = 0;
struct sctp_af *af;
int err;
@@ -1780,6 +1763,9 @@ static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 sflags,
if (!cmsgs->addrs_msg)
return 0;
+ if (daddr->sa.sa_family == AF_INET6)
+ flowinfo = daddr->v6.sin6_flowinfo;
+
/* sendv addr list parse */
for_each_cmsghdr(cmsg, cmsgs->addrs_msg) {
struct sctp_transport *transport;
@@ -1812,6 +1798,7 @@ static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 sflags,
}
dlen = sizeof(struct in6_addr);
+ daddr->v6.sin6_flowinfo = flowinfo;
daddr->v6.sin6_family = AF_INET6;
daddr->v6.sin6_port = htons(asoc->peer.port);
memcpy(&daddr->v6.sin6_addr, CMSG_DATA(cmsg), dlen);
@@ -1905,7 +1892,7 @@ static int sctp_sendmsg_to_asoc(struct sctp_association *asoc,
goto err;
}
- if (unlikely(!asoc->stream.out[sinfo->sinfo_stream].ext)) {
+ if (unlikely(!SCTP_SO(&asoc->stream, sinfo->sinfo_stream)->ext)) {
err = sctp_stream_init_ext(&asoc->stream, sinfo->sinfo_stream);
if (err)
goto err;
@@ -1922,10 +1909,10 @@ static int sctp_sendmsg_to_asoc(struct sctp_association *asoc,
asoc->pmtu_pending = 0;
}
- if (sctp_wspace(asoc) < msg_len)
+ if (sctp_wspace(asoc) < (int)msg_len)
sctp_prsctp_prune(asoc, sinfo, msg_len - sctp_wspace(asoc));
- if (!sctp_wspace(asoc)) {
+ if (sctp_wspace(asoc) <= 0) {
timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len);
if (err)
@@ -1940,8 +1927,10 @@ static int sctp_sendmsg_to_asoc(struct sctp_association *asoc,
if (sp->strm_interleave) {
timeo = sock_sndtimeo(sk, 0);
err = sctp_wait_for_connect(asoc, &timeo);
- if (err)
+ if (err) {
+ err = -ESRCH;
goto err;
+ }
} else {
wait_connect = true;
}
@@ -2392,6 +2381,8 @@ static int sctp_setsockopt_autoclose(struct sock *sk, char __user *optval,
* uint32_t spp_pathmtu;
* uint32_t spp_sackdelay;
* uint32_t spp_flags;
+ * uint32_t spp_ipv6_flowlabel;
+ * uint8_t spp_dscp;
* };
*
* spp_assoc_id - (one-to-many style socket) This is filled in the
@@ -2471,6 +2462,45 @@ static int sctp_setsockopt_autoclose(struct sock *sk, char __user *optval,
* also that this field is mutually exclusive to
* SPP_SACKDELAY_ENABLE, setting both will have undefined
* results.
+ *
+ * SPP_IPV6_FLOWLABEL: Setting this flag enables the
+ * setting of the IPV6 flow label value. The value is
+ * contained in the spp_ipv6_flowlabel field.
+ * Upon retrieval, this flag will be set to indicate that
+ * the spp_ipv6_flowlabel field has a valid value returned.
+ * If a specific destination address is set (in the
+ * spp_address field), then the value returned is that of
+ * the address. If just an association is specified (and
+ * no address), then the association's default flow label
+ * is returned. If neither an association nor a destination
+ * is specified, then the socket's default flow label is
+ * returned. For non-IPv6 sockets, this flag will be left
+ * cleared.
+ *
+ * SPP_DSCP: Setting this flag enables the setting of the
+ * Differentiated Services Code Point (DSCP) value
+ * associated with either the association or a specific
+ * address. The value is obtained in the spp_dscp field.
+ * Upon retrieval, this flag will be set to indicate that
+ * the spp_dscp field has a valid value returned. If a
+ * specific destination address is set when called (in the
+ * spp_address field), then that specific destination
+ * address's DSCP value is returned. If just an association
+ * is specified, then the association's default DSCP is
+ * returned. If neither an association nor a destination is
+ * specified, then the socket's default DSCP is returned.
+ *
+ * spp_ipv6_flowlabel
+ * - This field is used in conjunction with the
+ * SPP_IPV6_FLOWLABEL flag and contains the IPv6 flow label.
+ * The 20 least significant bits are used for the flow
+ * label. This setting has precedence over any IPv6-layer
+ * setting.
+ *
+ * spp_dscp - This field is used in conjunction with the SPP_DSCP flag
+ * and contains the DSCP. The 6 most significant bits are
+ * used for the DSCP. This setting has precedence over any
+ * IPv4- or IPv6- layer setting.
*/
static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params,
struct sctp_transport *trans,
@@ -2610,6 +2640,55 @@ static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params,
}
}
+ if (params->spp_flags & SPP_IPV6_FLOWLABEL) {
+ if (trans) {
+ if (trans->ipaddr.sa.sa_family == AF_INET6) {
+ trans->flowlabel = params->spp_ipv6_flowlabel &
+ SCTP_FLOWLABEL_VAL_MASK;
+ trans->flowlabel |= SCTP_FLOWLABEL_SET_MASK;
+ }
+ } else if (asoc) {
+ struct sctp_transport *t;
+
+ list_for_each_entry(t, &asoc->peer.transport_addr_list,
+ transports) {
+ if (t->ipaddr.sa.sa_family != AF_INET6)
+ continue;
+ t->flowlabel = params->spp_ipv6_flowlabel &
+ SCTP_FLOWLABEL_VAL_MASK;
+ t->flowlabel |= SCTP_FLOWLABEL_SET_MASK;
+ }
+ asoc->flowlabel = params->spp_ipv6_flowlabel &
+ SCTP_FLOWLABEL_VAL_MASK;
+ asoc->flowlabel |= SCTP_FLOWLABEL_SET_MASK;
+ } else if (sctp_opt2sk(sp)->sk_family == AF_INET6) {
+ sp->flowlabel = params->spp_ipv6_flowlabel &
+ SCTP_FLOWLABEL_VAL_MASK;
+ sp->flowlabel |= SCTP_FLOWLABEL_SET_MASK;
+ }
+ }
+
+ if (params->spp_flags & SPP_DSCP) {
+ if (trans) {
+ trans->dscp = params->spp_dscp & SCTP_DSCP_VAL_MASK;
+ trans->dscp |= SCTP_DSCP_SET_MASK;
+ } else if (asoc) {
+ struct sctp_transport *t;
+
+ list_for_each_entry(t, &asoc->peer.transport_addr_list,
+ transports) {
+ t->dscp = params->spp_dscp &
+ SCTP_DSCP_VAL_MASK;
+ t->dscp |= SCTP_DSCP_SET_MASK;
+ }
+ asoc->dscp = params->spp_dscp & SCTP_DSCP_VAL_MASK;
+ asoc->dscp |= SCTP_DSCP_SET_MASK;
+ } else {
+ sp->dscp = params->spp_dscp & SCTP_DSCP_VAL_MASK;
+ sp->dscp |= SCTP_DSCP_SET_MASK;
+ }
+ }
+
return 0;
}
@@ -2624,11 +2703,18 @@ static int sctp_setsockopt_peer_addr_params(struct sock *sk,
int error;
int hb_change, pmtud_change, sackdelay_change;
- if (optlen != sizeof(struct sctp_paddrparams))
+ if (optlen == sizeof(params)) {
+ if (copy_from_user(&params, optval, optlen))
+ return -EFAULT;
+ } else if (optlen == ALIGN(offsetof(struct sctp_paddrparams,
+ spp_ipv6_flowlabel), 4)) {
+ if (copy_from_user(&params, optval, optlen))
+ return -EFAULT;
+ if (params.spp_flags & (SPP_DSCP | SPP_IPV6_FLOWLABEL))
+ return -EINVAL;
+ } else {
return -EINVAL;
-
- if (copy_from_user(&params, optval, optlen))
- return -EFAULT;
+ }
/* Validate flags and value parameters. */
hb_change = params.spp_flags & SPP_HB;
@@ -4169,6 +4255,28 @@ out:
return retval;
}
+static int sctp_setsockopt_reuse_port(struct sock *sk, char __user *optval,
+ unsigned int optlen)
+{
+ int val;
+
+ if (!sctp_style(sk, TCP))
+ return -EOPNOTSUPP;
+
+ if (sctp_sk(sk)->ep->base.bind_addr.port)
+ return -EFAULT;
+
+ if (optlen < sizeof(int))
+ return -EINVAL;
+
+ if (get_user(val, (int __user *)optval))
+ return -EFAULT;
+
+ sctp_sk(sk)->reuse = !!val;
+
+ return 0;
+}
+
/* API 6.2 setsockopt(), getsockopt()
*
* Applications use setsockopt() and getsockopt() to set or retrieve
@@ -4363,6 +4471,9 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname,
retval = sctp_setsockopt_interleaving_supported(sk, optval,
optlen);
break;
+ case SCTP_REUSE_PORT:
+ retval = sctp_setsockopt_reuse_port(sk, optval, optlen);
+ break;
default:
retval = -ENOPROTOOPT;
break;
@@ -4881,9 +4992,14 @@ struct sctp_transport *sctp_transport_get_next(struct net *net,
break;
}
+ if (!sctp_transport_hold(t))
+ continue;
+
if (net_eq(sock_net(t->asoc->base.sk), net) &&
t->asoc->peer.primary_path == t)
break;
+
+ sctp_transport_put(t);
}
return t;
@@ -4893,13 +5009,18 @@ struct sctp_transport *sctp_transport_get_idx(struct net *net,
struct rhashtable_iter *iter,
int pos)
{
- void *obj = SEQ_START_TOKEN;
+ struct sctp_transport *t;
- while (pos && (obj = sctp_transport_get_next(net, iter)) &&
- !IS_ERR(obj))
- pos--;
+ if (!pos)
+ return SEQ_START_TOKEN;
- return obj;
+ while ((t = sctp_transport_get_next(net, iter)) && !IS_ERR(t)) {
+ if (!--pos)
+ break;
+ sctp_transport_put(t);
+ }
+
+ return t;
}
int sctp_for_each_endpoint(int (*cb)(struct sctp_endpoint *, void *),
@@ -4958,8 +5079,6 @@ again:
tsp = sctp_transport_get_idx(net, &hti, *pos + 1);
for (; !IS_ERR_OR_NULL(tsp); tsp = sctp_transport_get_next(net, &hti)) {
- if (!sctp_transport_hold(tsp))
- continue;
ret = cb(tsp, p);
if (ret)
break;
@@ -5427,6 +5546,45 @@ out:
* also that this field is mutually exclusive to
* SPP_SACKDELAY_ENABLE, setting both will have undefined
* results.
+ *
+ * SPP_IPV6_FLOWLABEL: Setting this flag enables the
+ * setting of the IPV6 flow label value. The value is
+ * contained in the spp_ipv6_flowlabel field.
+ * Upon retrieval, this flag will be set to indicate that
+ * the spp_ipv6_flowlabel field has a valid value returned.
+ * If a specific destination address is set (in the
+ * spp_address field), then the value returned is that of
+ * the address. If just an association is specified (and
+ * no address), then the association's default flow label
+ * is returned. If neither an association nor a destination
+ * is specified, then the socket's default flow label is
+ * returned. For non-IPv6 sockets, this flag will be left
+ * cleared.
+ *
+ * SPP_DSCP: Setting this flag enables the setting of the
+ * Differentiated Services Code Point (DSCP) value
+ * associated with either the association or a specific
+ * address. The value is obtained in the spp_dscp field.
+ * Upon retrieval, this flag will be set to indicate that
+ * the spp_dscp field has a valid value returned. If a
+ * specific destination address is set when called (in the
+ * spp_address field), then that specific destination
+ * address's DSCP value is returned. If just an association
+ * is specified, then the association's default DSCP is
+ * returned. If neither an association nor a destination is
+ * specified, then the socket's default DSCP is returned.
+ *
+ * spp_ipv6_flowlabel
+ * - This field is used in conjunction with the
+ * SPP_IPV6_FLOWLABEL flag and contains the IPv6 flow label.
+ * The 20 least significant bits are used for the flow
+ * label. This setting has precedence over any IPv6-layer
+ * setting.
+ *
+ * spp_dscp - This field is used in conjunction with the SPP_DSCP flag
+ * and contains the DSCP. The 6 most significant bits are
+ * used for the DSCP. This setting has precedence over any
+ * IPv4- or IPv6- layer setting.
*/
static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len,
char __user *optval, int __user *optlen)
@@ -5436,9 +5594,15 @@ static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len,
struct sctp_association *asoc = NULL;
struct sctp_sock *sp = sctp_sk(sk);
- if (len < sizeof(struct sctp_paddrparams))
+ if (len >= sizeof(params))
+ len = sizeof(params);
+ else if (len >= ALIGN(offsetof(struct sctp_paddrparams,
+ spp_ipv6_flowlabel), 4))
+ len = ALIGN(offsetof(struct sctp_paddrparams,
+ spp_ipv6_flowlabel), 4);
+ else
return -EINVAL;
- len = sizeof(struct sctp_paddrparams);
+
if (copy_from_user(&params, optval, len))
return -EFAULT;
@@ -5473,6 +5637,15 @@ static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len,
/*draft-11 doesn't say what to return in spp_flags*/
params.spp_flags = trans->param_flags;
+ if (trans->flowlabel & SCTP_FLOWLABEL_SET_MASK) {
+ params.spp_ipv6_flowlabel = trans->flowlabel &
+ SCTP_FLOWLABEL_VAL_MASK;
+ params.spp_flags |= SPP_IPV6_FLOWLABEL;
+ }
+ if (trans->dscp & SCTP_DSCP_SET_MASK) {
+ params.spp_dscp = trans->dscp & SCTP_DSCP_VAL_MASK;
+ params.spp_flags |= SPP_DSCP;
+ }
} else if (asoc) {
/* Fetch association values. */
params.spp_hbinterval = jiffies_to_msecs(asoc->hbinterval);
@@ -5482,6 +5655,15 @@ static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len,
/*draft-11 doesn't say what to return in spp_flags*/
params.spp_flags = asoc->param_flags;
+ if (asoc->flowlabel & SCTP_FLOWLABEL_SET_MASK) {
+ params.spp_ipv6_flowlabel = asoc->flowlabel &
+ SCTP_FLOWLABEL_VAL_MASK;
+ params.spp_flags |= SPP_IPV6_FLOWLABEL;
+ }
+ if (asoc->dscp & SCTP_DSCP_SET_MASK) {
+ params.spp_dscp = asoc->dscp & SCTP_DSCP_VAL_MASK;
+ params.spp_flags |= SPP_DSCP;
+ }
} else {
/* Fetch socket values. */
params.spp_hbinterval = sp->hbinterval;
@@ -5491,6 +5673,15 @@ static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len,
/*draft-11 doesn't say what to return in spp_flags*/
params.spp_flags = sp->param_flags;
+ if (sp->flowlabel & SCTP_FLOWLABEL_SET_MASK) {
+ params.spp_ipv6_flowlabel = sp->flowlabel &
+ SCTP_FLOWLABEL_VAL_MASK;
+ params.spp_flags |= SPP_IPV6_FLOWLABEL;
+ }
+ if (sp->dscp & SCTP_DSCP_SET_MASK) {
+ params.spp_dscp = sp->dscp & SCTP_DSCP_VAL_MASK;
+ params.spp_flags |= SPP_DSCP;
+ }
}
if (copy_to_user(optval, &params, len))
@@ -6892,14 +7083,14 @@ static int sctp_getsockopt_pr_assocstatus(struct sock *sk, int len,
}
policy = params.sprstat_policy;
- if (policy & ~SCTP_PR_SCTP_MASK)
+ if (!policy || (policy & ~(SCTP_PR_SCTP_MASK | SCTP_PR_SCTP_ALL)))
goto out;
asoc = sctp_id2assoc(sk, params.sprstat_assoc_id);
if (!asoc)
goto out;
- if (policy == SCTP_PR_SCTP_NONE) {
+ if (policy & SCTP_PR_SCTP_ALL) {
params.sprstat_abandoned_unsent = 0;
params.sprstat_abandoned_sent = 0;
for (policy = 0; policy <= SCTP_PR_INDEX(MAX); policy++) {
@@ -6951,14 +7142,14 @@ static int sctp_getsockopt_pr_streamstatus(struct sock *sk, int len,
}
policy = params.sprstat_policy;
- if (policy & ~SCTP_PR_SCTP_MASK)
+ if (!policy || (policy & ~(SCTP_PR_SCTP_MASK | SCTP_PR_SCTP_ALL)))
goto out;
asoc = sctp_id2assoc(sk, params.sprstat_assoc_id);
if (!asoc || params.sprstat_sid >= asoc->stream.outcnt)
goto out;
- streamoute = asoc->stream.out[params.sprstat_sid].ext;
+ streamoute = SCTP_SO(&asoc->stream, params.sprstat_sid)->ext;
if (!streamoute) {
/* Not allocated yet, means all stats are 0 */
params.sprstat_abandoned_unsent = 0;
@@ -6967,7 +7158,7 @@ static int sctp_getsockopt_pr_streamstatus(struct sock *sk, int len,
goto out;
}
- if (policy == SCTP_PR_SCTP_NONE) {
+ if (policy == SCTP_PR_SCTP_ALL) {
params.sprstat_abandoned_unsent = 0;
params.sprstat_abandoned_sent = 0;
for (policy = 0; policy <= SCTP_PR_INDEX(MAX); policy++) {
@@ -7196,6 +7387,26 @@ out:
return retval;
}
+static int sctp_getsockopt_reuse_port(struct sock *sk, int len,
+ char __user *optval,
+ int __user *optlen)
+{
+ int val;
+
+ if (len < sizeof(int))
+ return -EINVAL;
+
+ len = sizeof(int);
+ val = sctp_sk(sk)->reuse;
+ if (put_user(len, optlen))
+ return -EFAULT;
+
+ if (copy_to_user(optval, &val, len))
+ return -EFAULT;
+
+ return 0;
+}
+
static int sctp_getsockopt(struct sock *sk, int level, int optname,
char __user *optval, int __user *optlen)
{
@@ -7391,6 +7602,9 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname,
retval = sctp_getsockopt_interleaving_supported(sk, len, optval,
optlen);
break;
+ case SCTP_REUSE_PORT:
+ retval = sctp_getsockopt_reuse_port(sk, len, optval, optlen);
+ break;
default:
retval = -ENOPROTOOPT;
break;
@@ -7428,6 +7642,7 @@ static struct sctp_bind_bucket *sctp_bucket_create(
static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr)
{
+ bool reuse = (sk->sk_reuse || sctp_sk(sk)->reuse);
struct sctp_bind_hashbucket *head; /* hash list */
struct sctp_bind_bucket *pp;
unsigned short snum;
@@ -7500,13 +7715,11 @@ pp_found:
* used by other socket (pp->owner not empty); that other
* socket is going to be sk2.
*/
- int reuse = sk->sk_reuse;
struct sock *sk2;
pr_debug("%s: found a possible match\n", __func__);
- if (pp->fastreuse && sk->sk_reuse &&
- sk->sk_state != SCTP_SS_LISTENING)
+ if (pp->fastreuse && reuse && sk->sk_state != SCTP_SS_LISTENING)
goto success;
/* Run through the list of sockets bound to the port
@@ -7524,7 +7737,7 @@ pp_found:
ep2 = sctp_sk(sk2)->ep;
if (sk == sk2 ||
- (reuse && sk2->sk_reuse &&
+ (reuse && (sk2->sk_reuse || sctp_sk(sk2)->reuse) &&
sk2->sk_state != SCTP_SS_LISTENING))
continue;
@@ -7548,12 +7761,12 @@ pp_not_found:
* SO_REUSEADDR on this socket -sk-).
*/
if (hlist_empty(&pp->owner)) {
- if (sk->sk_reuse && sk->sk_state != SCTP_SS_LISTENING)
+ if (reuse && sk->sk_state != SCTP_SS_LISTENING)
pp->fastreuse = 1;
else
pp->fastreuse = 0;
} else if (pp->fastreuse &&
- (!sk->sk_reuse || sk->sk_state == SCTP_SS_LISTENING))
+ (!reuse || sk->sk_state == SCTP_SS_LISTENING))
pp->fastreuse = 0;
/* We are set, so fill up all the data in the hash table
@@ -7684,7 +7897,7 @@ int sctp_inet_listen(struct socket *sock, int backlog)
err = 0;
sctp_unhash_endpoint(ep);
sk->sk_state = SCTP_SS_CLOSED;
- if (sk->sk_reuse)
+ if (sk->sk_reuse || sctp_sk(sk)->reuse)
sctp_sk(sk)->bind_hash->fastreuse = 1;
goto out;
}
@@ -8230,17 +8443,11 @@ static void sctp_wfree(struct sk_buff *skb)
struct sctp_association *asoc = chunk->asoc;
struct sock *sk = asoc->base.sk;
- asoc->sndbuf_used -= SCTP_DATA_SNDSIZE(chunk) +
- sizeof(struct sk_buff) +
- sizeof(struct sctp_chunk);
-
- WARN_ON(refcount_sub_and_test(sizeof(struct sctp_chunk), &sk->sk_wmem_alloc));
-
- /*
- * This undoes what is done via sctp_set_owner_w and sk_mem_charge
- */
- sk->sk_wmem_queued -= skb->truesize;
sk_mem_uncharge(sk, skb->truesize);
+ sk->sk_wmem_queued -= skb->truesize + sizeof(struct sctp_chunk);
+ asoc->sndbuf_used -= skb->truesize + sizeof(struct sctp_chunk);
+ WARN_ON(refcount_sub_and_test(sizeof(struct sctp_chunk),
+ &sk->sk_wmem_alloc));
if (chunk->shkey) {
struct sctp_shared_key *shkey = chunk->shkey;
@@ -8314,7 +8521,7 @@ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p,
goto do_error;
if (signal_pending(current))
goto do_interrupted;
- if (msg_len <= sctp_wspace(asoc))
+ if ((int)msg_len <= sctp_wspace(asoc))
break;
/* Let another process have a go. Since we are going
@@ -8389,14 +8596,9 @@ void sctp_write_space(struct sock *sk)
* UDP-style sockets or TCP-style sockets, this code should work.
* - Daisy
*/
-static int sctp_writeable(struct sock *sk)
+static bool sctp_writeable(struct sock *sk)
{
- int amt = 0;
-
- amt = sk->sk_sndbuf - sk_wmem_alloc_get(sk);
- if (amt < 0)
- amt = 0;
- return amt;
+ return sk->sk_sndbuf > sk->sk_wmem_queued;
}
/* Wait for an association to go into ESTABLISHED state. If timeout is 0,
@@ -8551,6 +8753,7 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk,
newsk->sk_no_check_tx = sk->sk_no_check_tx;
newsk->sk_no_check_rx = sk->sk_no_check_rx;
newsk->sk_reuse = sk->sk_reuse;
+ sctp_sk(newsk)->reuse = sp->reuse;
newsk->sk_shutdown = sk->sk_shutdown;
newsk->sk_destruct = sctp_destruct_sock;
diff --git a/net/sctp/stream.c b/net/sctp/stream.c
index f1f1d1b232ba..ffb940d3b57c 100644
--- a/net/sctp/stream.c
+++ b/net/sctp/stream.c
@@ -37,6 +37,53 @@
#include <net/sctp/sm.h>
#include <net/sctp/stream_sched.h>
+static struct flex_array *fa_alloc(size_t elem_size, size_t elem_count,
+ gfp_t gfp)
+{
+ struct flex_array *result;
+ int err;
+
+ result = flex_array_alloc(elem_size, elem_count, gfp);
+ if (result) {
+ err = flex_array_prealloc(result, 0, elem_count, gfp);
+ if (err) {
+ flex_array_free(result);
+ result = NULL;
+ }
+ }
+
+ return result;
+}
+
+static void fa_free(struct flex_array *fa)
+{
+ if (fa)
+ flex_array_free(fa);
+}
+
+static void fa_copy(struct flex_array *fa, struct flex_array *from,
+ size_t index, size_t count)
+{
+ void *elem;
+
+ while (count--) {
+ elem = flex_array_get(from, index);
+ flex_array_put(fa, index, elem, 0);
+ index++;
+ }
+}
+
+static void fa_zero(struct flex_array *fa, size_t index, size_t count)
+{
+ void *elem;
+
+ while (count--) {
+ elem = flex_array_get(fa, index);
+ memset(elem, 0, fa->element_size);
+ index++;
+ }
+}
+
/* Migrates chunks from stream queues to new stream queues if needed,
* but not across associations. Also, removes those chunks to streams
* higher than the new max.
@@ -78,34 +125,33 @@ static void sctp_stream_outq_migrate(struct sctp_stream *stream,
* sctp_stream_update will swap ->out pointers.
*/
for (i = 0; i < outcnt; i++) {
- kfree(new->out[i].ext);
- new->out[i].ext = stream->out[i].ext;
- stream->out[i].ext = NULL;
+ kfree(SCTP_SO(new, i)->ext);
+ SCTP_SO(new, i)->ext = SCTP_SO(stream, i)->ext;
+ SCTP_SO(stream, i)->ext = NULL;
}
}
for (i = outcnt; i < stream->outcnt; i++)
- kfree(stream->out[i].ext);
+ kfree(SCTP_SO(stream, i)->ext);
}
static int sctp_stream_alloc_out(struct sctp_stream *stream, __u16 outcnt,
gfp_t gfp)
{
- struct sctp_stream_out *out;
+ struct flex_array *out;
+ size_t elem_size = sizeof(struct sctp_stream_out);
- out = kmalloc_array(outcnt, sizeof(*out), gfp);
+ out = fa_alloc(elem_size, outcnt, gfp);
if (!out)
return -ENOMEM;
if (stream->out) {
- memcpy(out, stream->out, min(outcnt, stream->outcnt) *
- sizeof(*out));
- kfree(stream->out);
+ fa_copy(out, stream->out, 0, min(outcnt, stream->outcnt));
+ fa_free(stream->out);
}
if (outcnt > stream->outcnt)
- memset(out + stream->outcnt, 0,
- (outcnt - stream->outcnt) * sizeof(*out));
+ fa_zero(out, stream->outcnt, (outcnt - stream->outcnt));
stream->out = out;
@@ -115,22 +161,20 @@ static int sctp_stream_alloc_out(struct sctp_stream *stream, __u16 outcnt,
static int sctp_stream_alloc_in(struct sctp_stream *stream, __u16 incnt,
gfp_t gfp)
{
- struct sctp_stream_in *in;
-
- in = kmalloc_array(incnt, sizeof(*stream->in), gfp);
+ struct flex_array *in;
+ size_t elem_size = sizeof(struct sctp_stream_in);
+ in = fa_alloc(elem_size, incnt, gfp);
if (!in)
return -ENOMEM;
if (stream->in) {
- memcpy(in, stream->in, min(incnt, stream->incnt) *
- sizeof(*in));
- kfree(stream->in);
+ fa_copy(in, stream->in, 0, min(incnt, stream->incnt));
+ fa_free(stream->in);
}
if (incnt > stream->incnt)
- memset(in + stream->incnt, 0,
- (incnt - stream->incnt) * sizeof(*in));
+ fa_zero(in, stream->incnt, (incnt - stream->incnt));
stream->in = in;
@@ -162,7 +206,7 @@ int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt,
stream->outcnt = outcnt;
for (i = 0; i < stream->outcnt; i++)
- stream->out[i].state = SCTP_STREAM_OPEN;
+ SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN;
sched->init(stream);
@@ -174,7 +218,7 @@ in:
ret = sctp_stream_alloc_in(stream, incnt, gfp);
if (ret) {
sched->free(stream);
- kfree(stream->out);
+ fa_free(stream->out);
stream->out = NULL;
stream->outcnt = 0;
goto out;
@@ -193,7 +237,7 @@ int sctp_stream_init_ext(struct sctp_stream *stream, __u16 sid)
soute = kzalloc(sizeof(*soute), GFP_KERNEL);
if (!soute)
return -ENOMEM;
- stream->out[sid].ext = soute;
+ SCTP_SO(stream, sid)->ext = soute;
return sctp_sched_init_sid(stream, sid, GFP_KERNEL);
}
@@ -205,9 +249,9 @@ void sctp_stream_free(struct sctp_stream *stream)
sched->free(stream);
for (i = 0; i < stream->outcnt; i++)
- kfree(stream->out[i].ext);
- kfree(stream->out);
- kfree(stream->in);
+ kfree(SCTP_SO(stream, i)->ext);
+ fa_free(stream->out);
+ fa_free(stream->in);
}
void sctp_stream_clear(struct sctp_stream *stream)
@@ -215,12 +259,12 @@ void sctp_stream_clear(struct sctp_stream *stream)
int i;
for (i = 0; i < stream->outcnt; i++) {
- stream->out[i].mid = 0;
- stream->out[i].mid_uo = 0;
+ SCTP_SO(stream, i)->mid = 0;
+ SCTP_SO(stream, i)->mid_uo = 0;
}
for (i = 0; i < stream->incnt; i++)
- stream->in[i].mid = 0;
+ SCTP_SI(stream, i)->mid = 0;
}
void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new)
@@ -273,8 +317,8 @@ static bool sctp_stream_outq_is_empty(struct sctp_stream *stream,
for (i = 0; i < str_nums; i++) {
__u16 sid = ntohs(str_list[i]);
- if (stream->out[sid].ext &&
- !list_empty(&stream->out[sid].ext->outq))
+ if (SCTP_SO(stream, sid)->ext &&
+ !list_empty(&SCTP_SO(stream, sid)->ext->outq))
return false;
}
@@ -361,11 +405,11 @@ int sctp_send_reset_streams(struct sctp_association *asoc,
if (out) {
if (str_nums)
for (i = 0; i < str_nums; i++)
- stream->out[str_list[i]].state =
+ SCTP_SO(stream, str_list[i])->state =
SCTP_STREAM_CLOSED;
else
for (i = 0; i < stream->outcnt; i++)
- stream->out[i].state = SCTP_STREAM_CLOSED;
+ SCTP_SO(stream, i)->state = SCTP_STREAM_CLOSED;
}
asoc->strreset_chunk = chunk;
@@ -380,11 +424,11 @@ int sctp_send_reset_streams(struct sctp_association *asoc,
if (str_nums)
for (i = 0; i < str_nums; i++)
- stream->out[str_list[i]].state =
+ SCTP_SO(stream, str_list[i])->state =
SCTP_STREAM_OPEN;
else
for (i = 0; i < stream->outcnt; i++)
- stream->out[i].state = SCTP_STREAM_OPEN;
+ SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN;
goto out;
}
@@ -418,7 +462,7 @@ int sctp_send_reset_assoc(struct sctp_association *asoc)
/* Block further xmit of data until this request is completed */
for (i = 0; i < stream->outcnt; i++)
- stream->out[i].state = SCTP_STREAM_CLOSED;
+ SCTP_SO(stream, i)->state = SCTP_STREAM_CLOSED;
asoc->strreset_chunk = chunk;
sctp_chunk_hold(asoc->strreset_chunk);
@@ -429,7 +473,7 @@ int sctp_send_reset_assoc(struct sctp_association *asoc)
asoc->strreset_chunk = NULL;
for (i = 0; i < stream->outcnt; i++)
- stream->out[i].state = SCTP_STREAM_OPEN;
+ SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN;
return retval;
}
@@ -609,10 +653,10 @@ struct sctp_chunk *sctp_process_strreset_outreq(
}
for (i = 0; i < nums; i++)
- stream->in[ntohs(str_p[i])].mid = 0;
+ SCTP_SI(stream, ntohs(str_p[i]))->mid = 0;
} else {
for (i = 0; i < stream->incnt; i++)
- stream->in[i].mid = 0;
+ SCTP_SI(stream, i)->mid = 0;
}
result = SCTP_STRRESET_PERFORMED;
@@ -683,11 +727,11 @@ struct sctp_chunk *sctp_process_strreset_inreq(
if (nums)
for (i = 0; i < nums; i++)
- stream->out[ntohs(str_p[i])].state =
+ SCTP_SO(stream, ntohs(str_p[i]))->state =
SCTP_STREAM_CLOSED;
else
for (i = 0; i < stream->outcnt; i++)
- stream->out[i].state = SCTP_STREAM_CLOSED;
+ SCTP_SO(stream, i)->state = SCTP_STREAM_CLOSED;
asoc->strreset_chunk = chunk;
asoc->strreset_outstanding = 1;
@@ -786,11 +830,11 @@ struct sctp_chunk *sctp_process_strreset_tsnreq(
* incoming and outgoing streams.
*/
for (i = 0; i < stream->outcnt; i++) {
- stream->out[i].mid = 0;
- stream->out[i].mid_uo = 0;
+ SCTP_SO(stream, i)->mid = 0;
+ SCTP_SO(stream, i)->mid_uo = 0;
}
for (i = 0; i < stream->incnt; i++)
- stream->in[i].mid = 0;
+ SCTP_SI(stream, i)->mid = 0;
result = SCTP_STRRESET_PERFORMED;
@@ -979,15 +1023,18 @@ struct sctp_chunk *sctp_process_strreset_resp(
sizeof(__u16);
if (result == SCTP_STRRESET_PERFORMED) {
+ struct sctp_stream_out *sout;
if (nums) {
for (i = 0; i < nums; i++) {
- stream->out[ntohs(str_p[i])].mid = 0;
- stream->out[ntohs(str_p[i])].mid_uo = 0;
+ sout = SCTP_SO(stream, ntohs(str_p[i]));
+ sout->mid = 0;
+ sout->mid_uo = 0;
}
} else {
for (i = 0; i < stream->outcnt; i++) {
- stream->out[i].mid = 0;
- stream->out[i].mid_uo = 0;
+ sout = SCTP_SO(stream, i);
+ sout->mid = 0;
+ sout->mid_uo = 0;
}
}
@@ -995,7 +1042,7 @@ struct sctp_chunk *sctp_process_strreset_resp(
}
for (i = 0; i < stream->outcnt; i++)
- stream->out[i].state = SCTP_STREAM_OPEN;
+ SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN;
*evp = sctp_ulpevent_make_stream_reset_event(asoc, flags,
nums, str_p, GFP_ATOMIC);
@@ -1050,15 +1097,15 @@ struct sctp_chunk *sctp_process_strreset_resp(
asoc->adv_peer_ack_point = asoc->ctsn_ack_point;
for (i = 0; i < stream->outcnt; i++) {
- stream->out[i].mid = 0;
- stream->out[i].mid_uo = 0;
+ SCTP_SO(stream, i)->mid = 0;
+ SCTP_SO(stream, i)->mid_uo = 0;
}
for (i = 0; i < stream->incnt; i++)
- stream->in[i].mid = 0;
+ SCTP_SI(stream, i)->mid = 0;
}
for (i = 0; i < stream->outcnt; i++)
- stream->out[i].state = SCTP_STREAM_OPEN;
+ SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN;
*evp = sctp_ulpevent_make_assoc_reset_event(asoc, flags,
stsn, rtsn, GFP_ATOMIC);
@@ -1072,7 +1119,7 @@ struct sctp_chunk *sctp_process_strreset_resp(
if (result == SCTP_STRRESET_PERFORMED)
for (i = number; i < stream->outcnt; i++)
- stream->out[i].state = SCTP_STREAM_OPEN;
+ SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN;
else
stream->outcnt = number;
diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c
index d3764c181299..0a78cdf86463 100644
--- a/net/sctp/stream_interleave.c
+++ b/net/sctp/stream_interleave.c
@@ -197,7 +197,7 @@ static struct sctp_ulpevent *sctp_intl_retrieve_partial(
__u32 next_fsn = 0;
int is_last = 0;
- sin = sctp_stream_in(ulpq->asoc, event->stream);
+ sin = sctp_stream_in(&ulpq->asoc->stream, event->stream);
skb_queue_walk(&ulpq->reasm, pos) {
struct sctp_ulpevent *cevent = sctp_skb2event(pos);
@@ -278,7 +278,7 @@ static struct sctp_ulpevent *sctp_intl_retrieve_reassembled(
__u32 pd_len = 0;
__u32 mid = 0;
- sin = sctp_stream_in(ulpq->asoc, event->stream);
+ sin = sctp_stream_in(&ulpq->asoc->stream, event->stream);
skb_queue_walk(&ulpq->reasm, pos) {
struct sctp_ulpevent *cevent = sctp_skb2event(pos);
@@ -368,7 +368,7 @@ static struct sctp_ulpevent *sctp_intl_reasm(struct sctp_ulpq *ulpq,
sctp_intl_store_reasm(ulpq, event);
- sin = sctp_stream_in(ulpq->asoc, event->stream);
+ sin = sctp_stream_in(&ulpq->asoc->stream, event->stream);
if (sin->pd_mode && event->mid == sin->mid &&
event->fsn == sin->fsn)
retval = sctp_intl_retrieve_partial(ulpq, event);
@@ -575,7 +575,7 @@ static struct sctp_ulpevent *sctp_intl_retrieve_partial_uo(
__u32 next_fsn = 0;
int is_last = 0;
- sin = sctp_stream_in(ulpq->asoc, event->stream);
+ sin = sctp_stream_in(&ulpq->asoc->stream, event->stream);
skb_queue_walk(&ulpq->reasm_uo, pos) {
struct sctp_ulpevent *cevent = sctp_skb2event(pos);
@@ -659,7 +659,7 @@ static struct sctp_ulpevent *sctp_intl_retrieve_reassembled_uo(
__u32 pd_len = 0;
__u32 mid = 0;
- sin = sctp_stream_in(ulpq->asoc, event->stream);
+ sin = sctp_stream_in(&ulpq->asoc->stream, event->stream);
skb_queue_walk(&ulpq->reasm_uo, pos) {
struct sctp_ulpevent *cevent = sctp_skb2event(pos);
@@ -750,7 +750,7 @@ static struct sctp_ulpevent *sctp_intl_reasm_uo(struct sctp_ulpq *ulpq,
sctp_intl_store_reasm_uo(ulpq, event);
- sin = sctp_stream_in(ulpq->asoc, event->stream);
+ sin = sctp_stream_in(&ulpq->asoc->stream, event->stream);
if (sin->pd_mode_uo && event->mid == sin->mid_uo &&
event->fsn == sin->fsn_uo)
retval = sctp_intl_retrieve_partial_uo(ulpq, event);
@@ -774,7 +774,7 @@ static struct sctp_ulpevent *sctp_intl_retrieve_first_uo(struct sctp_ulpq *ulpq)
skb_queue_walk(&ulpq->reasm_uo, pos) {
struct sctp_ulpevent *cevent = sctp_skb2event(pos);
- csin = sctp_stream_in(ulpq->asoc, cevent->stream);
+ csin = sctp_stream_in(&ulpq->asoc->stream, cevent->stream);
if (csin->pd_mode_uo)
continue;
@@ -875,7 +875,7 @@ static struct sctp_ulpevent *sctp_intl_retrieve_first(struct sctp_ulpq *ulpq)
skb_queue_walk(&ulpq->reasm, pos) {
struct sctp_ulpevent *cevent = sctp_skb2event(pos);
- csin = sctp_stream_in(ulpq->asoc, cevent->stream);
+ csin = sctp_stream_in(&ulpq->asoc->stream, cevent->stream);
if (csin->pd_mode)
continue;
@@ -1053,7 +1053,7 @@ static void sctp_intl_abort_pd(struct sctp_ulpq *ulpq, gfp_t gfp)
__u16 sid;
for (sid = 0; sid < stream->incnt; sid++) {
- struct sctp_stream_in *sin = &stream->in[sid];
+ struct sctp_stream_in *sin = SCTP_SI(stream, sid);
__u32 mid;
if (sin->pd_mode_uo) {
@@ -1247,7 +1247,7 @@ static void sctp_handle_fwdtsn(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk)
static void sctp_intl_skip(struct sctp_ulpq *ulpq, __u16 sid, __u32 mid,
__u8 flags)
{
- struct sctp_stream_in *sin = sctp_stream_in(ulpq->asoc, sid);
+ struct sctp_stream_in *sin = sctp_stream_in(&ulpq->asoc->stream, sid);
struct sctp_stream *stream = &ulpq->asoc->stream;
if (flags & SCTP_FTSN_U_BIT) {
diff --git a/net/sctp/stream_sched.c b/net/sctp/stream_sched.c
index f5fcd425232a..a6c04a94b08f 100644
--- a/net/sctp/stream_sched.c
+++ b/net/sctp/stream_sched.c
@@ -161,7 +161,7 @@ int sctp_sched_set_sched(struct sctp_association *asoc,
/* Give the next scheduler a clean slate. */
for (i = 0; i < asoc->stream.outcnt; i++) {
- void *p = asoc->stream.out[i].ext;
+ void *p = SCTP_SO(&asoc->stream, i)->ext;
if (!p)
continue;
@@ -175,7 +175,7 @@ int sctp_sched_set_sched(struct sctp_association *asoc,
asoc->outqueue.sched = n;
n->init(&asoc->stream);
for (i = 0; i < asoc->stream.outcnt; i++) {
- if (!asoc->stream.out[i].ext)
+ if (!SCTP_SO(&asoc->stream, i)->ext)
continue;
ret = n->init_sid(&asoc->stream, i, GFP_KERNEL);
@@ -217,7 +217,7 @@ int sctp_sched_set_value(struct sctp_association *asoc, __u16 sid,
if (sid >= asoc->stream.outcnt)
return -EINVAL;
- if (!asoc->stream.out[sid].ext) {
+ if (!SCTP_SO(&asoc->stream, sid)->ext) {
int ret;
ret = sctp_stream_init_ext(&asoc->stream, sid);
@@ -234,7 +234,7 @@ int sctp_sched_get_value(struct sctp_association *asoc, __u16 sid,
if (sid >= asoc->stream.outcnt)
return -EINVAL;
- if (!asoc->stream.out[sid].ext)
+ if (!SCTP_SO(&asoc->stream, sid)->ext)
return 0;
return asoc->outqueue.sched->get(&asoc->stream, sid, value);
@@ -252,7 +252,7 @@ void sctp_sched_dequeue_done(struct sctp_outq *q, struct sctp_chunk *ch)
* priority stream comes in.
*/
sid = sctp_chunk_stream_no(ch);
- sout = &q->asoc->stream.out[sid];
+ sout = SCTP_SO(&q->asoc->stream, sid);
q->asoc->stream.out_curr = sout;
return;
}
@@ -272,8 +272,9 @@ void sctp_sched_dequeue_common(struct sctp_outq *q, struct sctp_chunk *ch)
int sctp_sched_init_sid(struct sctp_stream *stream, __u16 sid, gfp_t gfp)
{
struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream);
+ struct sctp_stream_out_ext *ext = SCTP_SO(stream, sid)->ext;
- INIT_LIST_HEAD(&stream->out[sid].ext->outq);
+ INIT_LIST_HEAD(&ext->outq);
return sched->init_sid(stream, sid, gfp);
}
diff --git a/net/sctp/stream_sched_prio.c b/net/sctp/stream_sched_prio.c
index 7997d35dd0fd..2245083a98f2 100644
--- a/net/sctp/stream_sched_prio.c
+++ b/net/sctp/stream_sched_prio.c
@@ -75,10 +75,10 @@ static struct sctp_stream_priorities *sctp_sched_prio_get_head(
/* No luck. So we search on all streams now. */
for (i = 0; i < stream->outcnt; i++) {
- if (!stream->out[i].ext)
+ if (!SCTP_SO(stream, i)->ext)
continue;
- p = stream->out[i].ext->prio_head;
+ p = SCTP_SO(stream, i)->ext->prio_head;
if (!p)
/* Means all other streams won't be initialized
* as well.
@@ -165,7 +165,7 @@ static void sctp_sched_prio_sched(struct sctp_stream *stream,
static int sctp_sched_prio_set(struct sctp_stream *stream, __u16 sid,
__u16 prio, gfp_t gfp)
{
- struct sctp_stream_out *sout = &stream->out[sid];
+ struct sctp_stream_out *sout = SCTP_SO(stream, sid);
struct sctp_stream_out_ext *soute = sout->ext;
struct sctp_stream_priorities *prio_head, *old;
bool reschedule = false;
@@ -186,7 +186,7 @@ static int sctp_sched_prio_set(struct sctp_stream *stream, __u16 sid,
return 0;
for (i = 0; i < stream->outcnt; i++) {
- soute = stream->out[i].ext;
+ soute = SCTP_SO(stream, i)->ext;
if (soute && soute->prio_head == old)
/* It's still in use, nothing else to do here. */
return 0;
@@ -201,7 +201,7 @@ static int sctp_sched_prio_set(struct sctp_stream *stream, __u16 sid,
static int sctp_sched_prio_get(struct sctp_stream *stream, __u16 sid,
__u16 *value)
{
- *value = stream->out[sid].ext->prio_head->prio;
+ *value = SCTP_SO(stream, sid)->ext->prio_head->prio;
return 0;
}
@@ -215,7 +215,7 @@ static int sctp_sched_prio_init(struct sctp_stream *stream)
static int sctp_sched_prio_init_sid(struct sctp_stream *stream, __u16 sid,
gfp_t gfp)
{
- INIT_LIST_HEAD(&stream->out[sid].ext->prio_list);
+ INIT_LIST_HEAD(&SCTP_SO(stream, sid)->ext->prio_list);
return sctp_sched_prio_set(stream, sid, 0, gfp);
}
@@ -233,9 +233,9 @@ static void sctp_sched_prio_free(struct sctp_stream *stream)
*/
sctp_sched_prio_unsched_all(stream);
for (i = 0; i < stream->outcnt; i++) {
- if (!stream->out[i].ext)
+ if (!SCTP_SO(stream, i)->ext)
continue;
- prio = stream->out[i].ext->prio_head;
+ prio = SCTP_SO(stream, i)->ext->prio_head;
if (prio && list_empty(&prio->prio_sched))
list_add(&prio->prio_sched, &list);
}
@@ -255,7 +255,7 @@ static void sctp_sched_prio_enqueue(struct sctp_outq *q,
ch = list_first_entry(&msg->chunks, struct sctp_chunk, frag_list);
sid = sctp_chunk_stream_no(ch);
stream = &q->asoc->stream;
- sctp_sched_prio_sched(stream, stream->out[sid].ext);
+ sctp_sched_prio_sched(stream, SCTP_SO(stream, sid)->ext);
}
static struct sctp_chunk *sctp_sched_prio_dequeue(struct sctp_outq *q)
@@ -297,7 +297,7 @@ static void sctp_sched_prio_dequeue_done(struct sctp_outq *q,
* this priority.
*/
sid = sctp_chunk_stream_no(ch);
- soute = q->asoc->stream.out[sid].ext;
+ soute = SCTP_SO(&q->asoc->stream, sid)->ext;
prio = soute->prio_head;
sctp_sched_prio_next_stream(prio);
@@ -317,7 +317,7 @@ static void sctp_sched_prio_sched_all(struct sctp_stream *stream)
__u16 sid;
sid = sctp_chunk_stream_no(ch);
- sout = &stream->out[sid];
+ sout = SCTP_SO(stream, sid);
if (sout->ext)
sctp_sched_prio_sched(stream, sout->ext);
}
diff --git a/net/sctp/stream_sched_rr.c b/net/sctp/stream_sched_rr.c
index 1155692448f1..52ba743fa7a7 100644
--- a/net/sctp/stream_sched_rr.c
+++ b/net/sctp/stream_sched_rr.c
@@ -100,7 +100,7 @@ static int sctp_sched_rr_init(struct sctp_stream *stream)
static int sctp_sched_rr_init_sid(struct sctp_stream *stream, __u16 sid,
gfp_t gfp)
{
- INIT_LIST_HEAD(&stream->out[sid].ext->rr_list);
+ INIT_LIST_HEAD(&SCTP_SO(stream, sid)->ext->rr_list);
return 0;
}
@@ -120,7 +120,7 @@ static void sctp_sched_rr_enqueue(struct sctp_outq *q,
ch = list_first_entry(&msg->chunks, struct sctp_chunk, frag_list);
sid = sctp_chunk_stream_no(ch);
stream = &q->asoc->stream;
- sctp_sched_rr_sched(stream, stream->out[sid].ext);
+ sctp_sched_rr_sched(stream, SCTP_SO(stream, sid)->ext);
}
static struct sctp_chunk *sctp_sched_rr_dequeue(struct sctp_outq *q)
@@ -154,7 +154,7 @@ static void sctp_sched_rr_dequeue_done(struct sctp_outq *q,
/* Last chunk on that msg, move to the next stream */
sid = sctp_chunk_stream_no(ch);
- soute = q->asoc->stream.out[sid].ext;
+ soute = SCTP_SO(&q->asoc->stream, sid)->ext;
sctp_sched_rr_next_stream(&q->asoc->stream);
@@ -173,7 +173,7 @@ static void sctp_sched_rr_sched_all(struct sctp_stream *stream)
__u16 sid;
sid = sctp_chunk_stream_no(ch);
- soute = stream->out[sid].ext;
+ soute = SCTP_SO(stream, sid)->ext;
if (soute)
sctp_sched_rr_sched(stream, soute);
}
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index 12cac85da994..033696e6f74f 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -260,6 +260,7 @@ void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk)
bool sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu)
{
struct dst_entry *dst = sctp_transport_dst_check(t);
+ struct sock *sk = t->asoc->base.sk;
bool change = true;
if (unlikely(pmtu < SCTP_DEFAULT_MINSEGMENT)) {
@@ -271,12 +272,19 @@ bool sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu)
pmtu = SCTP_TRUNC4(pmtu);
if (dst) {
- dst->ops->update_pmtu(dst, t->asoc->base.sk, NULL, pmtu);
+ struct sctp_pf *pf = sctp_get_pf_specific(dst->ops->family);
+ union sctp_addr addr;
+
+ pf->af->from_sk(&addr, sk);
+ pf->to_sk_daddr(&t->ipaddr, sk);
+ dst->ops->update_pmtu(dst, sk, NULL, pmtu);
+ pf->to_sk_daddr(&addr, sk);
+
dst = sctp_transport_dst_check(t);
}
if (!dst) {
- t->af_specific->get_dst(t, &t->saddr, &t->fl, t->asoc->base.sk);
+ t->af_specific->get_dst(t, &t->saddr, &t->fl, sk);
dst = t->dst;
}
diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
index 0b427100b0d4..331cc734e3db 100644
--- a/net/sctp/ulpqueue.c
+++ b/net/sctp/ulpqueue.c
@@ -459,7 +459,7 @@ static struct sctp_ulpevent *sctp_ulpq_retrieve_reassembled(struct sctp_ulpq *ul
* element in the queue, then count it towards
* possible PD.
*/
- if (pos == ulpq->reasm.next) {
+ if (skb_queue_is_first(&ulpq->reasm, pos)) {
pd_first = pos;
pd_last = pos;
pd_len = pos->len;
diff --git a/net/smc/Makefile b/net/smc/Makefile
index 188104654b54..4df96b4b8130 100644
--- a/net/smc/Makefile
+++ b/net/smc/Makefile
@@ -1,4 +1,4 @@
obj-$(CONFIG_SMC) += smc.o
obj-$(CONFIG_SMC_DIAG) += smc_diag.o
smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o
-smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o
+smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index e7de5f282722..80e2119f1c70 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -23,6 +23,7 @@
#include <linux/workqueue.h>
#include <linux/in.h>
#include <linux/sched/signal.h>
+#include <linux/if_vlan.h>
#include <net/sock.h>
#include <net/tcp.h>
@@ -35,6 +36,7 @@
#include "smc_cdc.h"
#include "smc_core.h"
#include "smc_ib.h"
+#include "smc_ism.h"
#include "smc_pnet.h"
#include "smc_tx.h"
#include "smc_rx.h"
@@ -342,20 +344,17 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc)
rc = smc_ib_modify_qp_rts(link);
if (rc)
- return SMC_CLC_DECL_INTERR;
+ return SMC_CLC_DECL_ERR_RDYLNK;
smc_wr_remember_qp_attr(link);
if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
- return SMC_CLC_DECL_INTERR;
+ return SMC_CLC_DECL_ERR_REGRMB;
/* send CONFIRM LINK response over RoCE fabric */
- rc = smc_llc_send_confirm_link(link,
- link->smcibdev->mac[link->ibport - 1],
- &link->smcibdev->gid[link->ibport - 1],
- SMC_LLC_RESP);
+ rc = smc_llc_send_confirm_link(link, SMC_LLC_RESP);
if (rc < 0)
- return SMC_CLC_DECL_TCL;
+ return SMC_CLC_DECL_TIMEOUT_CL;
/* receive ADD LINK request from server over RoCE fabric */
rest = wait_for_completion_interruptible_timeout(&link->llc_add,
@@ -371,18 +370,17 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc)
/* send add link reject message, only one link supported for now */
rc = smc_llc_send_add_link(link,
link->smcibdev->mac[link->ibport - 1],
- &link->smcibdev->gid[link->ibport - 1],
- SMC_LLC_RESP);
+ link->gid, SMC_LLC_RESP);
if (rc < 0)
- return SMC_CLC_DECL_TCL;
+ return SMC_CLC_DECL_TIMEOUT_AL;
smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);
return 0;
}
-static void smc_conn_save_peer_info(struct smc_sock *smc,
- struct smc_clc_msg_accept_confirm *clc)
+static void smcr_conn_save_peer_info(struct smc_sock *smc,
+ struct smc_clc_msg_accept_confirm *clc)
{
int bufsize = smc_uncompress_bufsize(clc->rmbe_size);
@@ -393,6 +391,28 @@ static void smc_conn_save_peer_info(struct smc_sock *smc,
smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1);
}
+static void smcd_conn_save_peer_info(struct smc_sock *smc,
+ struct smc_clc_msg_accept_confirm *clc)
+{
+ int bufsize = smc_uncompress_bufsize(clc->dmbe_size);
+
+ smc->conn.peer_rmbe_idx = clc->dmbe_idx;
+ smc->conn.peer_token = clc->token;
+ /* msg header takes up space in the buffer */
+ smc->conn.peer_rmbe_size = bufsize - sizeof(struct smcd_cdc_msg);
+ atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
+ smc->conn.tx_off = bufsize * smc->conn.peer_rmbe_idx;
+}
+
+static void smc_conn_save_peer_info(struct smc_sock *smc,
+ struct smc_clc_msg_accept_confirm *clc)
+{
+ if (smc->conn.lgr->is_smcd)
+ smcd_conn_save_peer_info(smc, clc);
+ else
+ smcr_conn_save_peer_info(smc, clc);
+}
+
static void smc_link_save_peer_info(struct smc_link *link,
struct smc_clc_msg_accept_confirm *clc)
{
@@ -404,9 +424,10 @@ static void smc_link_save_peer_info(struct smc_link *link,
}
/* fall back during connect */
-static int smc_connect_fallback(struct smc_sock *smc)
+static int smc_connect_fallback(struct smc_sock *smc, int reason_code)
{
smc->use_fallback = true;
+ smc->fallback_rsn = reason_code;
smc_copy_sock_settings_to_clc(smc);
if (smc->sk.sk_state == SMC_INIT)
smc->sk.sk_state = SMC_ACTIVE;
@@ -423,7 +444,7 @@ static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code)
sock_put(&smc->sk); /* passive closing */
return reason_code;
}
- if (reason_code != SMC_CLC_DECL_REPLY) {
+ if (reason_code != SMC_CLC_DECL_PEERDECL) {
rc = smc_clc_send_decline(smc, reason_code);
if (rc < 0) {
if (smc->sk.sk_state == SMC_INIT)
@@ -431,7 +452,7 @@ static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code)
return rc;
}
}
- return smc_connect_fallback(smc);
+ return smc_connect_fallback(smc, reason_code);
}
/* abort connecting */
@@ -448,7 +469,7 @@ static int smc_connect_abort(struct smc_sock *smc, int reason_code,
/* check if there is a rdma device available for this connection. */
/* called for connect and listen */
static int smc_check_rdma(struct smc_sock *smc, struct smc_ib_device **ibdev,
- u8 *ibport)
+ u8 *ibport, unsigned short vlan_id, u8 gid[])
{
int reason_code = 0;
@@ -456,22 +477,59 @@ static int smc_check_rdma(struct smc_sock *smc, struct smc_ib_device **ibdev,
* within same PNETID that also contains the ethernet device
* used for the internal TCP socket
*/
- smc_pnet_find_roce_resource(smc->clcsock->sk, ibdev, ibport);
+ smc_pnet_find_roce_resource(smc->clcsock->sk, ibdev, ibport, vlan_id,
+ gid);
if (!(*ibdev))
reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
return reason_code;
}
+/* check if there is an ISM device available for this connection. */
+/* called for connect and listen */
+static int smc_check_ism(struct smc_sock *smc, struct smcd_dev **ismdev)
+{
+ /* Find ISM device with same PNETID as connecting interface */
+ smc_pnet_find_ism_resource(smc->clcsock->sk, ismdev);
+ if (!(*ismdev))
+ return SMC_CLC_DECL_CNFERR; /* configuration error */
+ return 0;
+}
+
+/* Check for VLAN ID and register it on ISM device just for CLC handshake */
+static int smc_connect_ism_vlan_setup(struct smc_sock *smc,
+ struct smcd_dev *ismdev,
+ unsigned short vlan_id)
+{
+ if (vlan_id && smc_ism_get_vlan(ismdev, vlan_id))
+ return SMC_CLC_DECL_CNFERR;
+ return 0;
+}
+
+/* cleanup temporary VLAN ID registration used for CLC handshake. If ISM is
+ * used, the VLAN ID will be registered again during the connection setup.
+ */
+static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc, bool is_smcd,
+ struct smcd_dev *ismdev,
+ unsigned short vlan_id)
+{
+ if (!is_smcd)
+ return 0;
+ if (vlan_id && smc_ism_put_vlan(ismdev, vlan_id))
+ return SMC_CLC_DECL_CNFERR;
+ return 0;
+}
+
/* CLC handshake during connect */
-static int smc_connect_clc(struct smc_sock *smc,
+static int smc_connect_clc(struct smc_sock *smc, int smc_type,
struct smc_clc_msg_accept_confirm *aclc,
- struct smc_ib_device *ibdev, u8 ibport)
+ struct smc_ib_device *ibdev, u8 ibport,
+ u8 gid[], struct smcd_dev *ismdev)
{
int rc = 0;
/* do inband token exchange */
- rc = smc_clc_send_proposal(smc, ibdev, ibport);
+ rc = smc_clc_send_proposal(smc, smc_type, ibdev, ibport, gid, ismdev);
if (rc)
return rc;
/* receive SMC Accept CLC message */
@@ -488,8 +546,8 @@ static int smc_connect_rdma(struct smc_sock *smc,
int reason_code = 0;
mutex_lock(&smc_create_lgr_pending);
- local_contact = smc_conn_create(smc, ibdev, ibport, &aclc->lcl,
- aclc->hdr.flag);
+ local_contact = smc_conn_create(smc, false, aclc->hdr.flag, ibdev,
+ ibport, &aclc->lcl, NULL, 0);
if (local_contact < 0) {
if (local_contact == -ENOMEM)
reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
@@ -504,14 +562,14 @@ static int smc_connect_rdma(struct smc_sock *smc,
smc_conn_save_peer_info(smc, aclc);
/* create send buffer and rmb */
- if (smc_buf_create(smc))
+ if (smc_buf_create(smc, false))
return smc_connect_abort(smc, SMC_CLC_DECL_MEM, local_contact);
if (local_contact == SMC_FIRST_CONTACT)
smc_link_save_peer_info(link, aclc);
if (smc_rmb_rtoken_handling(&smc->conn, aclc))
- return smc_connect_abort(smc, SMC_CLC_DECL_INTERR,
+ return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RTOK,
local_contact);
smc_close_init(smc);
@@ -519,12 +577,12 @@ static int smc_connect_rdma(struct smc_sock *smc,
if (local_contact == SMC_FIRST_CONTACT) {
if (smc_ib_ready_link(link))
- return smc_connect_abort(smc, SMC_CLC_DECL_INTERR,
+ return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RDYLNK,
local_contact);
} else {
if (!smc->conn.rmb_desc->reused &&
smc_reg_rmb(link, smc->conn.rmb_desc, true))
- return smc_connect_abort(smc, SMC_CLC_DECL_INTERR,
+ return smc_connect_abort(smc, SMC_CLC_DECL_ERR_REGRMB,
local_contact);
}
smc_rmb_sync_sg_for_device(&smc->conn);
@@ -551,41 +609,113 @@ static int smc_connect_rdma(struct smc_sock *smc,
return 0;
}
+/* setup for ISM connection of client */
+static int smc_connect_ism(struct smc_sock *smc,
+ struct smc_clc_msg_accept_confirm *aclc,
+ struct smcd_dev *ismdev)
+{
+ int local_contact = SMC_FIRST_CONTACT;
+ int rc = 0;
+
+ mutex_lock(&smc_create_lgr_pending);
+ local_contact = smc_conn_create(smc, true, aclc->hdr.flag, NULL, 0,
+ NULL, ismdev, aclc->gid);
+ if (local_contact < 0)
+ return smc_connect_abort(smc, SMC_CLC_DECL_MEM, 0);
+
+ /* Create send and receive buffers */
+ if (smc_buf_create(smc, true))
+ return smc_connect_abort(smc, SMC_CLC_DECL_MEM, local_contact);
+
+ smc_conn_save_peer_info(smc, aclc);
+ smc_close_init(smc);
+ smc_rx_init(smc);
+ smc_tx_init(smc);
+
+ rc = smc_clc_send_confirm(smc);
+ if (rc)
+ return smc_connect_abort(smc, rc, local_contact);
+ mutex_unlock(&smc_create_lgr_pending);
+
+ smc_copy_sock_settings_to_clc(smc);
+ if (smc->sk.sk_state == SMC_INIT)
+ smc->sk.sk_state = SMC_ACTIVE;
+
+ return 0;
+}
+
/* perform steps before actually connecting */
static int __smc_connect(struct smc_sock *smc)
{
+ bool ism_supported = false, rdma_supported = false;
struct smc_clc_msg_accept_confirm aclc;
struct smc_ib_device *ibdev;
+ struct smcd_dev *ismdev;
+ u8 gid[SMC_GID_SIZE];
+ unsigned short vlan;
+ int smc_type;
int rc = 0;
u8 ibport;
sock_hold(&smc->sk); /* sock put in passive closing */
if (smc->use_fallback)
- return smc_connect_fallback(smc);
+ return smc_connect_fallback(smc, smc->fallback_rsn);
/* if peer has not signalled SMC-capability, fall back */
if (!tcp_sk(smc->clcsock->sk)->syn_smc)
- return smc_connect_fallback(smc);
+ return smc_connect_fallback(smc, SMC_CLC_DECL_PEERNOSMC);
/* IPSec connections opt out of SMC-R optimizations */
if (using_ipsec(smc))
return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC);
- /* check if a RDMA device is available; if not, fall back */
- if (smc_check_rdma(smc, &ibdev, &ibport))
+ /* check for VLAN ID */
+ if (smc_vlan_by_tcpsk(smc->clcsock, &vlan))
return smc_connect_decline_fallback(smc, SMC_CLC_DECL_CNFERR);
+ /* check if there is an ism device available */
+ if (!smc_check_ism(smc, &ismdev) &&
+ !smc_connect_ism_vlan_setup(smc, ismdev, vlan)) {
+ /* ISM is supported for this connection */
+ ism_supported = true;
+ smc_type = SMC_TYPE_D;
+ }
+
+ /* check if there is a rdma device available */
+ if (!smc_check_rdma(smc, &ibdev, &ibport, vlan, gid)) {
+ /* RDMA is supported for this connection */
+ rdma_supported = true;
+ if (ism_supported)
+ smc_type = SMC_TYPE_B; /* both */
+ else
+ smc_type = SMC_TYPE_R; /* only RDMA */
+ }
+
+ /* if neither ISM nor RDMA are supported, fallback */
+ if (!rdma_supported && !ism_supported)
+ return smc_connect_decline_fallback(smc, SMC_CLC_DECL_NOSMCDEV);
+
/* perform CLC handshake */
- rc = smc_connect_clc(smc, &aclc, ibdev, ibport);
- if (rc)
+ rc = smc_connect_clc(smc, smc_type, &aclc, ibdev, ibport, gid, ismdev);
+ if (rc) {
+ smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan);
return smc_connect_decline_fallback(smc, rc);
+ }
- /* connect using rdma */
- rc = smc_connect_rdma(smc, &aclc, ibdev, ibport);
- if (rc)
+ /* depending on previous steps, connect using rdma or ism */
+ if (rdma_supported && aclc.hdr.path == SMC_TYPE_R)
+ rc = smc_connect_rdma(smc, &aclc, ibdev, ibport);
+ else if (ism_supported && aclc.hdr.path == SMC_TYPE_D)
+ rc = smc_connect_ism(smc, &aclc, ismdev);
+ else
+ rc = SMC_CLC_DECL_MODEUNSUPP;
+ if (rc) {
+ smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan);
return smc_connect_decline_fallback(smc, rc);
+ }
+ smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan);
return 0;
}
@@ -612,7 +742,10 @@ static void smc_connect_work(struct work_struct *work)
smc->sk.sk_err = -rc;
out:
- smc->sk.sk_state_change(&smc->sk);
+ if (smc->sk.sk_err)
+ smc->sk.sk_state_change(&smc->sk);
+ else
+ smc->sk.sk_write_space(&smc->sk);
kfree(smc->connect_info);
smc->connect_info = NULL;
release_sock(&smc->sk);
@@ -817,15 +950,12 @@ static int smc_serv_conf_first_link(struct smc_sock *smc)
link = &lgr->lnk[SMC_SINGLE_LINK];
if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
- return SMC_CLC_DECL_INTERR;
+ return SMC_CLC_DECL_ERR_REGRMB;
/* send CONFIRM LINK request to client over the RoCE fabric */
- rc = smc_llc_send_confirm_link(link,
- link->smcibdev->mac[link->ibport - 1],
- &link->smcibdev->gid[link->ibport - 1],
- SMC_LLC_REQ);
+ rc = smc_llc_send_confirm_link(link, SMC_LLC_REQ);
if (rc < 0)
- return SMC_CLC_DECL_TCL;
+ return SMC_CLC_DECL_TIMEOUT_CL;
/* receive CONFIRM LINK response from client over the RoCE fabric */
rest = wait_for_completion_interruptible_timeout(
@@ -845,10 +975,9 @@ static int smc_serv_conf_first_link(struct smc_sock *smc)
/* send ADD LINK request to client over the RoCE fabric */
rc = smc_llc_send_add_link(link,
link->smcibdev->mac[link->ibport - 1],
- &link->smcibdev->gid[link->ibport - 1],
- SMC_LLC_REQ);
+ link->gid, SMC_LLC_REQ);
if (rc < 0)
- return SMC_CLC_DECL_TCL;
+ return SMC_CLC_DECL_TIMEOUT_AL;
/* receive ADD LINK response from client over the RoCE fabric */
rest = wait_for_completion_interruptible_timeout(&link->llc_add_resp,
@@ -923,7 +1052,8 @@ static void smc_listen_decline(struct smc_sock *new_smc, int reason_code,
}
smc_conn_free(&new_smc->conn);
new_smc->use_fallback = true;
- if (reason_code && reason_code != SMC_CLC_DECL_REPLY) {
+ new_smc->fallback_rsn = reason_code;
+ if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL) {
if (smc_clc_send_decline(new_smc, reason_code) < 0) {
smc_listen_out_err(new_smc);
return;
@@ -953,7 +1083,8 @@ static int smc_listen_rdma_init(struct smc_sock *new_smc,
int *local_contact)
{
/* allocate connection / link group */
- *local_contact = smc_conn_create(new_smc, ibdev, ibport, &pclc->lcl, 0);
+ *local_contact = smc_conn_create(new_smc, false, 0, ibdev, ibport,
+ &pclc->lcl, NULL, 0);
if (*local_contact < 0) {
if (*local_contact == -ENOMEM)
return SMC_CLC_DECL_MEM;/* insufficient memory*/
@@ -961,8 +1092,46 @@ static int smc_listen_rdma_init(struct smc_sock *new_smc,
}
/* create send buffer and rmb */
- if (smc_buf_create(new_smc))
+ if (smc_buf_create(new_smc, false))
+ return SMC_CLC_DECL_MEM;
+
+ return 0;
+}
+
+/* listen worker: initialize connection and buffers for SMC-D */
+static int smc_listen_ism_init(struct smc_sock *new_smc,
+ struct smc_clc_msg_proposal *pclc,
+ struct smcd_dev *ismdev,
+ int *local_contact)
+{
+ struct smc_clc_msg_smcd *pclc_smcd;
+
+ pclc_smcd = smc_get_clc_msg_smcd(pclc);
+ *local_contact = smc_conn_create(new_smc, true, 0, NULL, 0, NULL,
+ ismdev, pclc_smcd->gid);
+ if (*local_contact < 0) {
+ if (*local_contact == -ENOMEM)
+ return SMC_CLC_DECL_MEM;/* insufficient memory*/
+ return SMC_CLC_DECL_INTERR; /* other error */
+ }
+
+ /* Check if peer can be reached via ISM device */
+ if (smc_ism_cantalk(new_smc->conn.lgr->peer_gid,
+ new_smc->conn.lgr->vlan_id,
+ new_smc->conn.lgr->smcd)) {
+ if (*local_contact == SMC_FIRST_CONTACT)
+ smc_lgr_forget(new_smc->conn.lgr);
+ smc_conn_free(&new_smc->conn);
+ return SMC_CLC_DECL_CNFERR;
+ }
+
+ /* Create send and receive buffers */
+ if (smc_buf_create(new_smc, true)) {
+ if (*local_contact == SMC_FIRST_CONTACT)
+ smc_lgr_forget(new_smc->conn.lgr);
+ smc_conn_free(&new_smc->conn);
return SMC_CLC_DECL_MEM;
+ }
return 0;
}
@@ -975,7 +1144,7 @@ static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact)
if (local_contact != SMC_FIRST_CONTACT) {
if (!new_smc->conn.rmb_desc->reused) {
if (smc_reg_rmb(link, new_smc->conn.rmb_desc, true))
- return SMC_CLC_DECL_INTERR;
+ return SMC_CLC_DECL_ERR_REGRMB;
}
}
smc_rmb_sync_sg_for_device(&new_smc->conn);
@@ -984,9 +1153,9 @@ static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact)
}
/* listen worker: finish RDMA setup */
-static void smc_listen_rdma_finish(struct smc_sock *new_smc,
- struct smc_clc_msg_accept_confirm *cclc,
- int local_contact)
+static int smc_listen_rdma_finish(struct smc_sock *new_smc,
+ struct smc_clc_msg_accept_confirm *cclc,
+ int local_contact)
{
struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
int reason_code = 0;
@@ -995,13 +1164,13 @@ static void smc_listen_rdma_finish(struct smc_sock *new_smc,
smc_link_save_peer_info(link, cclc);
if (smc_rmb_rtoken_handling(&new_smc->conn, cclc)) {
- reason_code = SMC_CLC_DECL_INTERR;
+ reason_code = SMC_CLC_DECL_ERR_RTOK;
goto decline;
}
if (local_contact == SMC_FIRST_CONTACT) {
if (smc_ib_ready_link(link)) {
- reason_code = SMC_CLC_DECL_INTERR;
+ reason_code = SMC_CLC_DECL_ERR_RDYLNK;
goto decline;
}
/* QP confirmation over RoCE fabric */
@@ -1009,11 +1178,12 @@ static void smc_listen_rdma_finish(struct smc_sock *new_smc,
if (reason_code)
goto decline;
}
- return;
+ return 0;
decline:
mutex_unlock(&smc_create_lgr_pending);
smc_listen_decline(new_smc, reason_code, local_contact);
+ return reason_code;
}
/* setup for RDMA connection of server */
@@ -1025,8 +1195,11 @@ static void smc_listen_work(struct work_struct *work)
struct smc_clc_msg_accept_confirm cclc;
struct smc_clc_msg_proposal *pclc;
struct smc_ib_device *ibdev;
+ bool ism_supported = false;
+ struct smcd_dev *ismdev;
u8 buf[SMC_CLC_MAX_LEN];
int local_contact = 0;
+ unsigned short vlan;
int reason_code = 0;
int rc = 0;
u8 ibport;
@@ -1039,6 +1212,7 @@ static void smc_listen_work(struct work_struct *work)
/* check if peer is smc capable */
if (!tcp_sk(newclcsock->sk)->syn_smc) {
new_smc->use_fallback = true;
+ new_smc->fallback_rsn = SMC_CLC_DECL_PEERNOSMC;
smc_listen_out_connected(new_smc);
return;
}
@@ -1065,15 +1239,26 @@ static void smc_listen_work(struct work_struct *work)
smc_rx_init(new_smc);
smc_tx_init(new_smc);
+ /* check if ISM is available */
+ if ((pclc->hdr.path == SMC_TYPE_D || pclc->hdr.path == SMC_TYPE_B) &&
+ !smc_check_ism(new_smc, &ismdev) &&
+ !smc_listen_ism_init(new_smc, pclc, ismdev, &local_contact)) {
+ ism_supported = true;
+ }
+
/* check if RDMA is available */
- if (smc_check_rdma(new_smc, &ibdev, &ibport) ||
- smc_listen_rdma_check(new_smc, pclc) ||
- smc_listen_rdma_init(new_smc, pclc, ibdev, ibport,
- &local_contact) ||
- smc_listen_rdma_reg(new_smc, local_contact)) {
+ if (!ism_supported &&
+ ((pclc->hdr.path != SMC_TYPE_R && pclc->hdr.path != SMC_TYPE_B) ||
+ smc_vlan_by_tcpsk(new_smc->clcsock, &vlan) ||
+ smc_check_rdma(new_smc, &ibdev, &ibport, vlan, NULL) ||
+ smc_listen_rdma_check(new_smc, pclc) ||
+ smc_listen_rdma_init(new_smc, pclc, ibdev, ibport,
+ &local_contact) ||
+ smc_listen_rdma_reg(new_smc, local_contact))) {
/* SMC not supported, decline */
mutex_unlock(&smc_create_lgr_pending);
- smc_listen_decline(new_smc, SMC_CLC_DECL_CNFERR, local_contact);
+ smc_listen_decline(new_smc, SMC_CLC_DECL_MODEUNSUPP,
+ local_contact);
return;
}
@@ -1095,7 +1280,10 @@ static void smc_listen_work(struct work_struct *work)
}
/* finish worker */
- smc_listen_rdma_finish(new_smc, &cclc, local_contact);
+ if (!ism_supported) {
+ if (smc_listen_rdma_finish(new_smc, &cclc, local_contact))
+ return;
+ }
smc_conn_save_peer_info(new_smc, &cclc);
mutex_unlock(&smc_create_lgr_pending);
smc_listen_out_connected(new_smc);
@@ -1119,6 +1307,7 @@ static void smc_tcp_listen_work(struct work_struct *work)
new_smc->listen_smc = lsmc;
new_smc->use_fallback = lsmc->use_fallback;
+ new_smc->fallback_rsn = lsmc->fallback_rsn;
sock_hold(lsk); /* sock_put in smc_listen_work */
INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
smc_copy_sock_settings_to_smc(new_smc);
@@ -1275,6 +1464,7 @@ static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
if (msg->msg_flags & MSG_FASTOPEN) {
if (sk->sk_state == SMC_INIT) {
smc->use_fallback = true;
+ smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP;
} else {
rc = -EINVAL;
goto out;
@@ -1345,7 +1535,7 @@ static __poll_t smc_poll(struct file *file, struct socket *sock,
return EPOLLNVAL;
smc = smc_sk(sock->sk);
- if ((sk->sk_state == SMC_INIT) || smc->use_fallback) {
+ if (smc->use_fallback) {
/* delegate to CLC child sock */
mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
sk->sk_err = smc->clcsock->sk->sk_err;
@@ -1353,7 +1543,7 @@ static __poll_t smc_poll(struct file *file, struct socket *sock,
mask |= EPOLLERR;
} else {
if (sk->sk_state != SMC_CLOSED)
- sock_poll_wait(file, sk_sleep(sk), wait);
+ sock_poll_wait(file, sock, wait);
if (sk->sk_err)
mask |= EPOLLERR;
if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
@@ -1376,9 +1566,9 @@ static __poll_t smc_poll(struct file *file, struct socket *sock,
mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
if (sk->sk_state == SMC_APPCLOSEWAIT1)
mask |= EPOLLIN;
+ if (smc->conn.urg_state == SMC_URG_VALID)
+ mask |= EPOLLPRI;
}
- if (smc->conn.urg_state == SMC_URG_VALID)
- mask |= EPOLLPRI;
}
return mask;
@@ -1471,6 +1661,7 @@ static int smc_setsockopt(struct socket *sock, int level, int optname,
/* option not supported by SMC */
if (sk->sk_state == SMC_INIT) {
smc->use_fallback = true;
+ smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP;
} else {
if (!smc->use_fallback)
rc = -EINVAL;
@@ -1578,12 +1769,8 @@ static int smc_ioctl(struct socket *sock, unsigned int cmd,
smc->sk.sk_state == SMC_CLOSED) {
answ = 0;
} else {
- smc_curs_write(&cons,
- smc_curs_read(&conn->local_tx_ctrl.cons, conn),
- conn);
- smc_curs_write(&urg,
- smc_curs_read(&conn->urg_curs, conn),
- conn);
+ smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn);
+ smc_curs_copy(&urg, &conn->urg_curs, conn);
answ = smc_curs_diff(conn->rmb_desc->len,
&cons, &urg) == 1;
}
@@ -1716,6 +1903,7 @@ static int smc_create(struct net *net, struct socket *sock, int protocol,
/* create internal TCP socket for CLC handshake and fallback */
smc = smc_sk(sk);
smc->use_fallback = false; /* assume rdma capability first */
+ smc->fallback_rsn = 0;
rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP,
&smc->clcsock);
if (rc) {
diff --git a/net/smc/smc.h b/net/smc/smc.h
index d7ca26570482..08786ace6010 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -21,8 +21,6 @@
#define SMCPROTO_SMC 0 /* SMC protocol, IPv4 */
#define SMCPROTO_SMC6 1 /* SMC protocol, IPv6 */
-#define SMC_MAX_PORTS 2 /* Max # of ports */
-
extern struct proto smc_proto;
extern struct proto smc_proto6;
@@ -185,6 +183,11 @@ struct smc_connection {
spinlock_t acurs_lock; /* protect cursors */
#endif
struct work_struct close_work; /* peer sent some closing */
+ struct tasklet_struct rx_tsklet; /* Receiver tasklet for SMC-D */
+ u8 rx_off; /* receive offset:
+ * 0 for SMC-R, 32 for SMC-D
+ */
+ u64 peer_token; /* SMC-D token of peer */
};
struct smc_connect_info {
@@ -205,6 +208,8 @@ struct smc_sock { /* smc sock container */
struct list_head accept_q; /* sockets to be accepted */
spinlock_t accept_q_lock; /* protects accept_q */
bool use_fallback; /* fallback to tcp */
+ int fallback_rsn; /* reason for fallback */
+ u32 peer_diagnosis; /* decline reason from peer */
int sockopt_defer_accept;
/* sockopt TCP_DEFER_ACCEPT
* value
diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c
index 9bde1e4ca288..ed5dcf03fe0b 100644
--- a/net/smc/smc_cdc.c
+++ b/net/smc/smc_cdc.c
@@ -34,14 +34,15 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd,
enum ib_wc_status wc_status)
{
struct smc_cdc_tx_pend *cdcpend = (struct smc_cdc_tx_pend *)pnd_snd;
+ struct smc_connection *conn = cdcpend->conn;
struct smc_sock *smc;
int diff;
- if (!cdcpend->conn)
+ if (!conn)
/* already dismissed */
return;
- smc = container_of(cdcpend->conn, struct smc_sock, conn);
+ smc = container_of(conn, struct smc_sock, conn);
bh_lock_sock(&smc->sk);
if (!wc_status) {
diff = smc_curs_diff(cdcpend->conn->sndbuf_desc->len,
@@ -52,9 +53,7 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd,
atomic_add(diff, &cdcpend->conn->sndbuf_space);
/* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */
smp_mb__after_atomic();
- smc_curs_write(&cdcpend->conn->tx_curs_fin,
- smc_curs_read(&cdcpend->cursor, cdcpend->conn),
- cdcpend->conn);
+ smc_curs_copy(&conn->tx_curs_fin, &cdcpend->cursor, conn);
}
smc_tx_sndbuf_nonfull(smc);
bh_unlock_sock(&smc->sk);
@@ -110,14 +109,13 @@ int smc_cdc_msg_send(struct smc_connection *conn,
&conn->local_tx_ctrl, conn);
rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend);
if (!rc)
- smc_curs_write(&conn->rx_curs_confirmed,
- smc_curs_read(&conn->local_tx_ctrl.cons, conn),
- conn);
+ smc_curs_copy(&conn->rx_curs_confirmed,
+ &conn->local_tx_ctrl.cons, conn);
return rc;
}
-int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn)
+static int smcr_cdc_get_slot_and_msg_send(struct smc_connection *conn)
{
struct smc_cdc_tx_pend *pend;
struct smc_wr_buf *wr_buf;
@@ -130,6 +128,21 @@ int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn)
return smc_cdc_msg_send(conn, wr_buf, pend);
}
+int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn)
+{
+ int rc;
+
+ if (conn->lgr->is_smcd) {
+ spin_lock_bh(&conn->send_lock);
+ rc = smcd_cdc_msg_send(conn);
+ spin_unlock_bh(&conn->send_lock);
+ } else {
+ rc = smcr_cdc_get_slot_and_msg_send(conn);
+ }
+
+ return rc;
+}
+
static bool smc_cdc_tx_filter(struct smc_wr_tx_pend_priv *tx_pend,
unsigned long data)
{
@@ -157,6 +170,44 @@ void smc_cdc_tx_dismiss_slots(struct smc_connection *conn)
(unsigned long)conn);
}
+/* Send a SMC-D CDC header.
+ * This increments the free space available in our send buffer.
+ * Also update the confirmed receive buffer with what was sent to the peer.
+ */
+int smcd_cdc_msg_send(struct smc_connection *conn)
+{
+ struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
+ struct smcd_cdc_msg cdc;
+ int rc, diff;
+
+ memset(&cdc, 0, sizeof(cdc));
+ cdc.common.type = SMC_CDC_MSG_TYPE;
+ cdc.prod_wrap = conn->local_tx_ctrl.prod.wrap;
+ cdc.prod_count = conn->local_tx_ctrl.prod.count;
+
+ cdc.cons_wrap = conn->local_tx_ctrl.cons.wrap;
+ cdc.cons_count = conn->local_tx_ctrl.cons.count;
+ cdc.prod_flags = conn->local_tx_ctrl.prod_flags;
+ cdc.conn_state_flags = conn->local_tx_ctrl.conn_state_flags;
+ rc = smcd_tx_ism_write(conn, &cdc, sizeof(cdc), 0, 1);
+ if (rc)
+ return rc;
+ smc_curs_copy(&conn->rx_curs_confirmed, &conn->local_tx_ctrl.cons,
+ conn);
+ /* Calculate transmitted data and increment free send buffer space */
+ diff = smc_curs_diff(conn->sndbuf_desc->len, &conn->tx_curs_fin,
+ &conn->tx_curs_sent);
+ /* increased by confirmed number of bytes */
+ smp_mb__before_atomic();
+ atomic_add(diff, &conn->sndbuf_space);
+ /* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */
+ smp_mb__after_atomic();
+ smc_curs_copy(&conn->tx_curs_fin, &conn->tx_curs_sent, conn);
+
+ smc_tx_sndbuf_nonfull(smc);
+ return rc;
+}
+
/********************************* receive ***********************************/
static inline bool smc_cdc_before(u16 seq1, u16 seq2)
@@ -171,14 +222,12 @@ static void smc_cdc_handle_urg_data_arrival(struct smc_sock *smc,
char *base;
/* new data included urgent business */
- smc_curs_write(&conn->urg_curs,
- smc_curs_read(&conn->local_rx_ctrl.prod, conn),
- conn);
+ smc_curs_copy(&conn->urg_curs, &conn->local_rx_ctrl.prod, conn);
conn->urg_state = SMC_URG_VALID;
if (!sock_flag(&smc->sk, SOCK_URGINLINE))
/* we'll skip the urgent byte, so don't account for it */
(*diff_prod)--;
- base = (char *)conn->rmb_desc->cpu_addr;
+ base = (char *)conn->rmb_desc->cpu_addr + conn->rx_off;
if (conn->urg_curs.count)
conn->urg_rx_byte = *(base + conn->urg_curs.count - 1);
else
@@ -193,12 +242,8 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc,
struct smc_connection *conn = &smc->conn;
int diff_cons, diff_prod;
- smc_curs_write(&prod_old,
- smc_curs_read(&conn->local_rx_ctrl.prod, conn),
- conn);
- smc_curs_write(&cons_old,
- smc_curs_read(&conn->local_rx_ctrl.cons, conn),
- conn);
+ smc_curs_copy(&prod_old, &conn->local_rx_ctrl.prod, conn);
+ smc_curs_copy(&cons_old, &conn->local_rx_ctrl.cons, conn);
smc_cdc_msg_to_host(&conn->local_rx_ctrl, cdc, conn);
diff_cons = smc_curs_diff(conn->peer_rmbe_size, &cons_old,
@@ -277,6 +322,34 @@ static void smc_cdc_msg_recv(struct smc_sock *smc, struct smc_cdc_msg *cdc)
sock_put(&smc->sk); /* no free sk in softirq-context */
}
+/* Schedule a tasklet for this connection. Triggered from the ISM device IRQ
+ * handler to indicate update in the DMBE.
+ *
+ * Context:
+ * - tasklet context
+ */
+static void smcd_cdc_rx_tsklet(unsigned long data)
+{
+ struct smc_connection *conn = (struct smc_connection *)data;
+ struct smcd_cdc_msg cdc;
+ struct smc_sock *smc;
+
+ if (!conn)
+ return;
+
+ memcpy(&cdc, conn->rmb_desc->cpu_addr, sizeof(cdc));
+ smc = container_of(conn, struct smc_sock, conn);
+ smc_cdc_msg_recv(smc, (struct smc_cdc_msg *)&cdc);
+}
+
+/* Initialize receive tasklet. Called from ISM device IRQ handler to start
+ * receiver side.
+ */
+void smcd_cdc_rx_init(struct smc_connection *conn)
+{
+ tasklet_init(&conn->rx_tsklet, smcd_cdc_rx_tsklet, (unsigned long)conn);
+}
+
/***************************** init, exit, misc ******************************/
static void smc_cdc_rx_handler(struct ib_wc *wc, void *buf)
@@ -293,7 +366,7 @@ static void smc_cdc_rx_handler(struct ib_wc *wc, void *buf)
return; /* invalid message */
/* lookup connection */
- lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]);
+ lgr = smc_get_lgr(link);
read_lock_bh(&lgr->conns_lock);
conn = smc_lgr_find_conn(ntohl(cdc->token), lgr);
read_unlock_bh(&lgr->conns_lock);
diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h
index f60082fee5b8..934df4473a7c 100644
--- a/net/smc/smc_cdc.h
+++ b/net/smc/smc_cdc.h
@@ -50,6 +50,20 @@ struct smc_cdc_msg {
u8 reserved[18];
} __packed; /* format defined in RFC7609 */
+/* CDC message for SMC-D */
+struct smcd_cdc_msg {
+ struct smc_wr_rx_hdr common; /* Type = 0xFE */
+ u8 res1[7];
+ u16 prod_wrap;
+ u32 prod_count;
+ u8 res2[2];
+ u16 cons_wrap;
+ u32 cons_count;
+ struct smc_cdc_producer_flags prod_flags;
+ struct smc_cdc_conn_state_flags conn_state_flags;
+ u8 res3[8];
+} __packed;
+
static inline bool smc_cdc_rxed_any_close(struct smc_connection *conn)
{
return conn->local_rx_ctrl.conn_state_flags.peer_conn_abort ||
@@ -90,47 +104,34 @@ static inline u64 smc_curs_read(union smc_host_cursor *curs,
#endif
}
-static inline u64 smc_curs_read_net(union smc_cdc_cursor *curs,
- struct smc_connection *conn)
-{
-#ifndef KERNEL_HAS_ATOMIC64
- unsigned long flags;
- u64 ret;
-
- spin_lock_irqsave(&conn->acurs_lock, flags);
- ret = curs->acurs;
- spin_unlock_irqrestore(&conn->acurs_lock, flags);
- return ret;
-#else
- return atomic64_read(&curs->acurs);
-#endif
-}
-
-static inline void smc_curs_write(union smc_host_cursor *curs, u64 val,
- struct smc_connection *conn)
+/* Copy cursor src into tgt */
+static inline void smc_curs_copy(union smc_host_cursor *tgt,
+ union smc_host_cursor *src,
+ struct smc_connection *conn)
{
#ifndef KERNEL_HAS_ATOMIC64
unsigned long flags;
spin_lock_irqsave(&conn->acurs_lock, flags);
- curs->acurs = val;
+ tgt->acurs = src->acurs;
spin_unlock_irqrestore(&conn->acurs_lock, flags);
#else
- atomic64_set(&curs->acurs, val);
+ atomic64_set(&tgt->acurs, atomic64_read(&src->acurs));
#endif
}
-static inline void smc_curs_write_net(union smc_cdc_cursor *curs, u64 val,
- struct smc_connection *conn)
+static inline void smc_curs_copy_net(union smc_cdc_cursor *tgt,
+ union smc_cdc_cursor *src,
+ struct smc_connection *conn)
{
#ifndef KERNEL_HAS_ATOMIC64
unsigned long flags;
spin_lock_irqsave(&conn->acurs_lock, flags);
- curs->acurs = val;
+ tgt->acurs = src->acurs;
spin_unlock_irqrestore(&conn->acurs_lock, flags);
#else
- atomic64_set(&curs->acurs, val);
+ atomic64_set(&tgt->acurs, atomic64_read(&src->acurs));
#endif
}
@@ -165,7 +166,7 @@ static inline void smc_host_cursor_to_cdc(union smc_cdc_cursor *peer,
{
union smc_host_cursor temp;
- smc_curs_write(&temp, smc_curs_read(local, conn), conn);
+ smc_curs_copy(&temp, local, conn);
peer->count = htonl(temp.count);
peer->wrap = htons(temp.wrap);
/* peer->reserved = htons(0); must be ensured by caller */
@@ -192,8 +193,8 @@ static inline void smc_cdc_cursor_to_host(union smc_host_cursor *local,
union smc_host_cursor temp, old;
union smc_cdc_cursor net;
- smc_curs_write(&old, smc_curs_read(local, conn), conn);
- smc_curs_write_net(&net, smc_curs_read_net(peer, conn), conn);
+ smc_curs_copy(&old, local, conn);
+ smc_curs_copy_net(&net, peer, conn);
temp.count = ntohl(net.count);
temp.wrap = ntohs(net.wrap);
if ((old.wrap > temp.wrap) && temp.wrap)
@@ -201,12 +202,12 @@ static inline void smc_cdc_cursor_to_host(union smc_host_cursor *local,
if ((old.wrap == temp.wrap) &&
(old.count > temp.count))
return;
- smc_curs_write(local, smc_curs_read(&temp, conn), conn);
+ smc_curs_copy(local, &temp, conn);
}
-static inline void smc_cdc_msg_to_host(struct smc_host_cdc_msg *local,
- struct smc_cdc_msg *peer,
- struct smc_connection *conn)
+static inline void smcr_cdc_msg_to_host(struct smc_host_cdc_msg *local,
+ struct smc_cdc_msg *peer,
+ struct smc_connection *conn)
{
local->common.type = peer->common.type;
local->len = peer->len;
@@ -218,6 +219,27 @@ static inline void smc_cdc_msg_to_host(struct smc_host_cdc_msg *local,
local->conn_state_flags = peer->conn_state_flags;
}
+static inline void smcd_cdc_msg_to_host(struct smc_host_cdc_msg *local,
+ struct smcd_cdc_msg *peer)
+{
+ local->prod.wrap = peer->prod_wrap;
+ local->prod.count = peer->prod_count;
+ local->cons.wrap = peer->cons_wrap;
+ local->cons.count = peer->cons_count;
+ local->prod_flags = peer->prod_flags;
+ local->conn_state_flags = peer->conn_state_flags;
+}
+
+static inline void smc_cdc_msg_to_host(struct smc_host_cdc_msg *local,
+ struct smc_cdc_msg *peer,
+ struct smc_connection *conn)
+{
+ if (conn->lgr->is_smcd)
+ smcd_cdc_msg_to_host(local, (struct smcd_cdc_msg *)peer);
+ else
+ smcr_cdc_msg_to_host(local, peer, conn);
+}
+
struct smc_cdc_tx_pend;
int smc_cdc_get_free_slot(struct smc_connection *conn,
@@ -227,6 +249,8 @@ void smc_cdc_tx_dismiss_slots(struct smc_connection *conn);
int smc_cdc_msg_send(struct smc_connection *conn, struct smc_wr_buf *wr_buf,
struct smc_cdc_tx_pend *pend);
int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn);
+int smcd_cdc_msg_send(struct smc_connection *conn);
int smc_cdc_init(void) __init;
+void smcd_cdc_rx_init(struct smc_connection *conn);
#endif /* SMC_CDC_H */
diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c
index ae5d168653ce..52241d679cc9 100644
--- a/net/smc/smc_clc.c
+++ b/net/smc/smc_clc.c
@@ -23,9 +23,15 @@
#include "smc_core.h"
#include "smc_clc.h"
#include "smc_ib.h"
+#include "smc_ism.h"
+
+#define SMCR_CLC_ACCEPT_CONFIRM_LEN 68
+#define SMCD_CLC_ACCEPT_CONFIRM_LEN 48
/* eye catcher "SMCR" EBCDIC for CLC messages */
static const char SMC_EYECATCHER[4] = {'\xe2', '\xd4', '\xc3', '\xd9'};
+/* eye catcher "SMCD" EBCDIC for CLC messages */
+static const char SMCD_EYECATCHER[4] = {'\xe2', '\xd4', '\xc3', '\xc4'};
/* check if received message has a correct header length and contains valid
* heading and trailing eyecatchers
@@ -38,10 +44,14 @@ static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm)
struct smc_clc_msg_decline *dclc;
struct smc_clc_msg_trail *trl;
- if (memcmp(clcm->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)))
+ if (memcmp(clcm->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)) &&
+ memcmp(clcm->eyecatcher, SMCD_EYECATCHER, sizeof(SMCD_EYECATCHER)))
return false;
switch (clcm->type) {
case SMC_CLC_PROPOSAL:
+ if (clcm->path != SMC_TYPE_R && clcm->path != SMC_TYPE_D &&
+ clcm->path != SMC_TYPE_B)
+ return false;
pclc = (struct smc_clc_msg_proposal *)clcm;
pclc_prfx = smc_clc_proposal_get_prefix(pclc);
if (ntohs(pclc->hdr.length) !=
@@ -56,10 +66,16 @@ static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm)
break;
case SMC_CLC_ACCEPT:
case SMC_CLC_CONFIRM:
+ if (clcm->path != SMC_TYPE_R && clcm->path != SMC_TYPE_D)
+ return false;
clc = (struct smc_clc_msg_accept_confirm *)clcm;
- if (ntohs(clc->hdr.length) != sizeof(*clc))
+ if ((clcm->path == SMC_TYPE_R &&
+ ntohs(clc->hdr.length) != SMCR_CLC_ACCEPT_CONFIRM_LEN) ||
+ (clcm->path == SMC_TYPE_D &&
+ ntohs(clc->hdr.length) != SMCD_CLC_ACCEPT_CONFIRM_LEN))
return false;
- trl = &clc->trl;
+ trl = (struct smc_clc_msg_trail *)
+ ((u8 *)clc + ntohs(clc->hdr.length) - sizeof(*trl));
break;
case SMC_CLC_DECLINE:
dclc = (struct smc_clc_msg_decline *)clcm;
@@ -70,7 +86,8 @@ static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm)
default:
return false;
}
- if (memcmp(trl->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)))
+ if (memcmp(trl->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)) &&
+ memcmp(trl->eyecatcher, SMCD_EYECATCHER, sizeof(SMCD_EYECATCHER)))
return false;
return true;
}
@@ -296,6 +313,9 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
datlen = ntohs(clcm->length);
if ((len < sizeof(struct smc_clc_msg_hdr)) ||
(datlen > buflen) ||
+ (clcm->version != SMC_CLC_V1) ||
+ (clcm->path != SMC_TYPE_R && clcm->path != SMC_TYPE_D &&
+ clcm->path != SMC_TYPE_B) ||
((clcm->type != SMC_CLC_DECLINE) &&
(clcm->type != expected_type))) {
smc->sk.sk_err = EPROTO;
@@ -314,7 +334,11 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
goto out;
}
if (clcm->type == SMC_CLC_DECLINE) {
- reason_code = SMC_CLC_DECL_REPLY;
+ struct smc_clc_msg_decline *dclc;
+
+ dclc = (struct smc_clc_msg_decline *)clcm;
+ reason_code = SMC_CLC_DECL_PEERDECL;
+ smc->peer_diagnosis = ntohl(dclc->peer_diagnosis);
if (((struct smc_clc_msg_decline *)buf)->hdr.flag) {
smc->conn.lgr->sync_err = 1;
smc_lgr_terminate(smc->conn.lgr);
@@ -357,17 +381,18 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info)
}
/* send CLC PROPOSAL message across internal TCP socket */
-int smc_clc_send_proposal(struct smc_sock *smc,
- struct smc_ib_device *smcibdev,
- u8 ibport)
+int smc_clc_send_proposal(struct smc_sock *smc, int smc_type,
+ struct smc_ib_device *ibdev, u8 ibport, u8 gid[],
+ struct smcd_dev *ismdev)
{
struct smc_clc_ipv6_prefix ipv6_prfx[SMC_CLC_MAX_V6_PREFIX];
struct smc_clc_msg_proposal_prefix pclc_prfx;
+ struct smc_clc_msg_smcd pclc_smcd;
struct smc_clc_msg_proposal pclc;
struct smc_clc_msg_trail trl;
int len, i, plen, rc;
int reason_code = 0;
- struct kvec vec[4];
+ struct kvec vec[5];
struct msghdr msg;
/* retrieve ip prefixes for CLC proposal msg */
@@ -382,18 +407,34 @@ int smc_clc_send_proposal(struct smc_sock *smc,
memset(&pclc, 0, sizeof(pclc));
memcpy(pclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
pclc.hdr.type = SMC_CLC_PROPOSAL;
- pclc.hdr.length = htons(plen);
pclc.hdr.version = SMC_CLC_V1; /* SMC version */
- memcpy(pclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid));
- memcpy(&pclc.lcl.gid, &smcibdev->gid[ibport - 1], SMC_GID_SIZE);
- memcpy(&pclc.lcl.mac, &smcibdev->mac[ibport - 1], ETH_ALEN);
- pclc.iparea_offset = htons(0);
+ pclc.hdr.path = smc_type;
+ if (smc_type == SMC_TYPE_R || smc_type == SMC_TYPE_B) {
+ /* add SMC-R specifics */
+ memcpy(pclc.lcl.id_for_peer, local_systemid,
+ sizeof(local_systemid));
+ memcpy(&pclc.lcl.gid, gid, SMC_GID_SIZE);
+ memcpy(&pclc.lcl.mac, &ibdev->mac[ibport - 1], ETH_ALEN);
+ pclc.iparea_offset = htons(0);
+ }
+ if (smc_type == SMC_TYPE_D || smc_type == SMC_TYPE_B) {
+ /* add SMC-D specifics */
+ memset(&pclc_smcd, 0, sizeof(pclc_smcd));
+ plen += sizeof(pclc_smcd);
+ pclc.iparea_offset = htons(SMC_CLC_PROPOSAL_MAX_OFFSET);
+ pclc_smcd.gid = ismdev->local_gid;
+ }
+ pclc.hdr.length = htons(plen);
memcpy(trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
memset(&msg, 0, sizeof(msg));
i = 0;
vec[i].iov_base = &pclc;
vec[i++].iov_len = sizeof(pclc);
+ if (smc_type == SMC_TYPE_D || smc_type == SMC_TYPE_B) {
+ vec[i].iov_base = &pclc_smcd;
+ vec[i++].iov_len = sizeof(pclc_smcd);
+ }
vec[i].iov_base = &pclc_prfx;
vec[i++].iov_len = sizeof(pclc_prfx);
if (pclc_prfx.ipv6_prefixes_cnt > 0) {
@@ -405,14 +446,12 @@ int smc_clc_send_proposal(struct smc_sock *smc,
vec[i++].iov_len = sizeof(trl);
/* due to the few bytes needed for clc-handshake this cannot block */
len = kernel_sendmsg(smc->clcsock, &msg, vec, i, plen);
- if (len < sizeof(pclc)) {
- if (len >= 0) {
- reason_code = -ENETUNREACH;
- smc->sk.sk_err = -reason_code;
- } else {
- smc->sk.sk_err = smc->clcsock->sk->sk_err;
- reason_code = -smc->sk.sk_err;
- }
+ if (len < 0) {
+ smc->sk.sk_err = smc->clcsock->sk->sk_err;
+ reason_code = -smc->sk.sk_err;
+ } else if (len < (int)sizeof(pclc)) {
+ reason_code = -ENETUNREACH;
+ smc->sk.sk_err = -reason_code;
}
return reason_code;
@@ -429,35 +468,55 @@ int smc_clc_send_confirm(struct smc_sock *smc)
struct kvec vec;
int len;
- link = &conn->lgr->lnk[SMC_SINGLE_LINK];
/* send SMC Confirm CLC msg */
memset(&cclc, 0, sizeof(cclc));
- memcpy(cclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
cclc.hdr.type = SMC_CLC_CONFIRM;
- cclc.hdr.length = htons(sizeof(cclc));
cclc.hdr.version = SMC_CLC_V1; /* SMC version */
- memcpy(cclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid));
- memcpy(&cclc.lcl.gid, &link->smcibdev->gid[link->ibport - 1],
- SMC_GID_SIZE);
- memcpy(&cclc.lcl.mac, &link->smcibdev->mac[link->ibport - 1], ETH_ALEN);
- hton24(cclc.qpn, link->roce_qp->qp_num);
- cclc.rmb_rkey =
- htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey);
- cclc.rmbe_idx = 1; /* for now: 1 RMB = 1 RMBE */
- cclc.rmbe_alert_token = htonl(conn->alert_token_local);
- cclc.qp_mtu = min(link->path_mtu, link->peer_mtu);
- cclc.rmbe_size = conn->rmbe_size_short;
- cclc.rmb_dma_addr = cpu_to_be64(
- (u64)sg_dma_address(conn->rmb_desc->sgt[SMC_SINGLE_LINK].sgl));
- hton24(cclc.psn, link->psn_initial);
-
- memcpy(cclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+ if (smc->conn.lgr->is_smcd) {
+ /* SMC-D specific settings */
+ memcpy(cclc.hdr.eyecatcher, SMCD_EYECATCHER,
+ sizeof(SMCD_EYECATCHER));
+ cclc.hdr.path = SMC_TYPE_D;
+ cclc.hdr.length = htons(SMCD_CLC_ACCEPT_CONFIRM_LEN);
+ cclc.gid = conn->lgr->smcd->local_gid;
+ cclc.token = conn->rmb_desc->token;
+ cclc.dmbe_size = conn->rmbe_size_short;
+ cclc.dmbe_idx = 0;
+ memcpy(&cclc.linkid, conn->lgr->id, SMC_LGR_ID_SIZE);
+ memcpy(cclc.smcd_trl.eyecatcher, SMCD_EYECATCHER,
+ sizeof(SMCD_EYECATCHER));
+ } else {
+ /* SMC-R specific settings */
+ link = &conn->lgr->lnk[SMC_SINGLE_LINK];
+ memcpy(cclc.hdr.eyecatcher, SMC_EYECATCHER,
+ sizeof(SMC_EYECATCHER));
+ cclc.hdr.path = SMC_TYPE_R;
+ cclc.hdr.length = htons(SMCR_CLC_ACCEPT_CONFIRM_LEN);
+ memcpy(cclc.lcl.id_for_peer, local_systemid,
+ sizeof(local_systemid));
+ memcpy(&cclc.lcl.gid, link->gid, SMC_GID_SIZE);
+ memcpy(&cclc.lcl.mac, &link->smcibdev->mac[link->ibport - 1],
+ ETH_ALEN);
+ hton24(cclc.qpn, link->roce_qp->qp_num);
+ cclc.rmb_rkey =
+ htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey);
+ cclc.rmbe_idx = 1; /* for now: 1 RMB = 1 RMBE */
+ cclc.rmbe_alert_token = htonl(conn->alert_token_local);
+ cclc.qp_mtu = min(link->path_mtu, link->peer_mtu);
+ cclc.rmbe_size = conn->rmbe_size_short;
+ cclc.rmb_dma_addr = cpu_to_be64((u64)sg_dma_address
+ (conn->rmb_desc->sgt[SMC_SINGLE_LINK].sgl));
+ hton24(cclc.psn, link->psn_initial);
+ memcpy(cclc.smcr_trl.eyecatcher, SMC_EYECATCHER,
+ sizeof(SMC_EYECATCHER));
+ }
memset(&msg, 0, sizeof(msg));
vec.iov_base = &cclc;
- vec.iov_len = sizeof(cclc);
- len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, sizeof(cclc));
- if (len < sizeof(cclc)) {
+ vec.iov_len = ntohs(cclc.hdr.length);
+ len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1,
+ ntohs(cclc.hdr.length));
+ if (len < ntohs(cclc.hdr.length)) {
if (len >= 0) {
reason_code = -ENETUNREACH;
smc->sk.sk_err = -reason_code;
@@ -480,35 +539,57 @@ int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact)
int rc = 0;
int len;
- link = &conn->lgr->lnk[SMC_SINGLE_LINK];
memset(&aclc, 0, sizeof(aclc));
- memcpy(aclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
aclc.hdr.type = SMC_CLC_ACCEPT;
- aclc.hdr.length = htons(sizeof(aclc));
aclc.hdr.version = SMC_CLC_V1; /* SMC version */
if (srv_first_contact)
aclc.hdr.flag = 1;
- memcpy(aclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid));
- memcpy(&aclc.lcl.gid, &link->smcibdev->gid[link->ibport - 1],
- SMC_GID_SIZE);
- memcpy(&aclc.lcl.mac, link->smcibdev->mac[link->ibport - 1], ETH_ALEN);
- hton24(aclc.qpn, link->roce_qp->qp_num);
- aclc.rmb_rkey =
- htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey);
- aclc.rmbe_idx = 1; /* as long as 1 RMB = 1 RMBE */
- aclc.rmbe_alert_token = htonl(conn->alert_token_local);
- aclc.qp_mtu = link->path_mtu;
- aclc.rmbe_size = conn->rmbe_size_short,
- aclc.rmb_dma_addr = cpu_to_be64(
- (u64)sg_dma_address(conn->rmb_desc->sgt[SMC_SINGLE_LINK].sgl));
- hton24(aclc.psn, link->psn_initial);
- memcpy(aclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+
+ if (new_smc->conn.lgr->is_smcd) {
+ /* SMC-D specific settings */
+ aclc.hdr.length = htons(SMCD_CLC_ACCEPT_CONFIRM_LEN);
+ memcpy(aclc.hdr.eyecatcher, SMCD_EYECATCHER,
+ sizeof(SMCD_EYECATCHER));
+ aclc.hdr.path = SMC_TYPE_D;
+ aclc.gid = conn->lgr->smcd->local_gid;
+ aclc.token = conn->rmb_desc->token;
+ aclc.dmbe_size = conn->rmbe_size_short;
+ aclc.dmbe_idx = 0;
+ memcpy(&aclc.linkid, conn->lgr->id, SMC_LGR_ID_SIZE);
+ memcpy(aclc.smcd_trl.eyecatcher, SMCD_EYECATCHER,
+ sizeof(SMCD_EYECATCHER));
+ } else {
+ /* SMC-R specific settings */
+ aclc.hdr.length = htons(SMCR_CLC_ACCEPT_CONFIRM_LEN);
+ memcpy(aclc.hdr.eyecatcher, SMC_EYECATCHER,
+ sizeof(SMC_EYECATCHER));
+ aclc.hdr.path = SMC_TYPE_R;
+ link = &conn->lgr->lnk[SMC_SINGLE_LINK];
+ memcpy(aclc.lcl.id_for_peer, local_systemid,
+ sizeof(local_systemid));
+ memcpy(&aclc.lcl.gid, link->gid, SMC_GID_SIZE);
+ memcpy(&aclc.lcl.mac, link->smcibdev->mac[link->ibport - 1],
+ ETH_ALEN);
+ hton24(aclc.qpn, link->roce_qp->qp_num);
+ aclc.rmb_rkey =
+ htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey);
+ aclc.rmbe_idx = 1; /* as long as 1 RMB = 1 RMBE */
+ aclc.rmbe_alert_token = htonl(conn->alert_token_local);
+ aclc.qp_mtu = link->path_mtu;
+ aclc.rmbe_size = conn->rmbe_size_short,
+ aclc.rmb_dma_addr = cpu_to_be64((u64)sg_dma_address
+ (conn->rmb_desc->sgt[SMC_SINGLE_LINK].sgl));
+ hton24(aclc.psn, link->psn_initial);
+ memcpy(aclc.smcr_trl.eyecatcher, SMC_EYECATCHER,
+ sizeof(SMC_EYECATCHER));
+ }
memset(&msg, 0, sizeof(msg));
vec.iov_base = &aclc;
- vec.iov_len = sizeof(aclc);
- len = kernel_sendmsg(new_smc->clcsock, &msg, &vec, 1, sizeof(aclc));
- if (len < sizeof(aclc)) {
+ vec.iov_len = ntohs(aclc.hdr.length);
+ len = kernel_sendmsg(new_smc->clcsock, &msg, &vec, 1,
+ ntohs(aclc.hdr.length));
+ if (len < ntohs(aclc.hdr.length)) {
if (len >= 0)
new_smc->sk.sk_err = EPROTO;
else
diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h
index 41ff9ea96139..18da89b681c2 100644
--- a/net/smc/smc_clc.h
+++ b/net/smc/smc_clc.h
@@ -23,17 +23,26 @@
#define SMC_CLC_DECLINE 0x04
#define SMC_CLC_V1 0x1 /* SMC version */
+#define SMC_TYPE_R 0 /* SMC-R only */
+#define SMC_TYPE_D 1 /* SMC-D only */
+#define SMC_TYPE_B 3 /* SMC-R and SMC-D */
#define CLC_WAIT_TIME (6 * HZ) /* max. wait time on clcsock */
#define SMC_CLC_DECL_MEM 0x01010000 /* insufficient memory resources */
-#define SMC_CLC_DECL_TIMEOUT 0x02000000 /* timeout */
+#define SMC_CLC_DECL_TIMEOUT_CL 0x02010000 /* timeout w4 QP confirm link */
+#define SMC_CLC_DECL_TIMEOUT_AL 0x02020000 /* timeout w4 QP add link */
#define SMC_CLC_DECL_CNFERR 0x03000000 /* configuration error */
-#define SMC_CLC_DECL_IPSEC 0x03030000 /* IPsec usage */
+#define SMC_CLC_DECL_PEERNOSMC 0x03010000 /* peer did not indicate SMC */
+#define SMC_CLC_DECL_IPSEC 0x03020000 /* IPsec usage */
+#define SMC_CLC_DECL_NOSMCDEV 0x03030000 /* no SMC device found */
+#define SMC_CLC_DECL_MODEUNSUPP 0x03040000 /* smc modes do not match (R or D)*/
+#define SMC_CLC_DECL_RMBE_EC 0x03050000 /* peer has eyecatcher in RMBE */
+#define SMC_CLC_DECL_OPTUNSUPP 0x03060000 /* fastopen sockopt not supported */
#define SMC_CLC_DECL_SYNCERR 0x04000000 /* synchronization error */
-#define SMC_CLC_DECL_REPLY 0x06000000 /* reply to a received decline */
+#define SMC_CLC_DECL_PEERDECL 0x05000000 /* peer declined during handshake */
#define SMC_CLC_DECL_INTERR 0x99990000 /* internal error */
-#define SMC_CLC_DECL_TCL 0x02040000 /* timeout w4 QP confirm */
-#define SMC_CLC_DECL_SEND 0x07000000 /* sending problem */
-#define SMC_CLC_DECL_RMBE_EC 0x08000000 /* peer has eyecatcher in RMBE */
+#define SMC_CLC_DECL_ERR_RTOK 0x99990001 /* rtoken handling failed */
+#define SMC_CLC_DECL_ERR_RDYLNK 0x99990002 /* ib ready link failed */
+#define SMC_CLC_DECL_ERR_REGRMB 0x99990003 /* reg rmb failed */
struct smc_clc_msg_hdr { /* header1 of clc messages */
u8 eyecatcher[4]; /* eye catcher */
@@ -42,9 +51,11 @@ struct smc_clc_msg_hdr { /* header1 of clc messages */
#if defined(__BIG_ENDIAN_BITFIELD)
u8 version : 4,
flag : 1,
- rsvd : 3;
+ rsvd : 1,
+ path : 2;
#elif defined(__LITTLE_ENDIAN_BITFIELD)
- u8 rsvd : 3,
+ u8 path : 2,
+ rsvd : 1,
flag : 1,
version : 4;
#endif
@@ -77,6 +88,11 @@ struct smc_clc_msg_proposal_prefix { /* prefix part of clc proposal message*/
u8 ipv6_prefixes_cnt; /* number of IPv6 prefixes in prefix array */
} __aligned(4);
+struct smc_clc_msg_smcd { /* SMC-D GID information */
+ u64 gid; /* ISM GID of requestor */
+ u8 res[32];
+};
+
struct smc_clc_msg_proposal { /* clc proposal message sent by Linux */
struct smc_clc_msg_hdr hdr;
struct smc_clc_msg_local lcl;
@@ -94,23 +110,45 @@ struct smc_clc_msg_proposal { /* clc proposal message sent by Linux */
struct smc_clc_msg_accept_confirm { /* clc accept / confirm message */
struct smc_clc_msg_hdr hdr;
- struct smc_clc_msg_local lcl;
- u8 qpn[3]; /* QP number */
- __be32 rmb_rkey; /* RMB rkey */
- u8 rmbe_idx; /* Index of RMBE in RMB */
- __be32 rmbe_alert_token;/* unique connection id */
+ union {
+ struct { /* SMC-R */
+ struct smc_clc_msg_local lcl;
+ u8 qpn[3]; /* QP number */
+ __be32 rmb_rkey; /* RMB rkey */
+ u8 rmbe_idx; /* Index of RMBE in RMB */
+ __be32 rmbe_alert_token;/* unique connection id */
#if defined(__BIG_ENDIAN_BITFIELD)
- u8 rmbe_size : 4, /* RMBE buf size (compressed notation) */
- qp_mtu : 4; /* QP mtu */
+ u8 rmbe_size : 4, /* buf size (compressed) */
+ qp_mtu : 4; /* QP mtu */
#elif defined(__LITTLE_ENDIAN_BITFIELD)
- u8 qp_mtu : 4,
- rmbe_size : 4;
+ u8 qp_mtu : 4,
+ rmbe_size : 4;
#endif
- u8 reserved;
- __be64 rmb_dma_addr; /* RMB virtual address */
- u8 reserved2;
- u8 psn[3]; /* initial packet sequence number */
- struct smc_clc_msg_trail trl; /* eye catcher "SMCR" EBCDIC */
+ u8 reserved;
+ __be64 rmb_dma_addr; /* RMB virtual address */
+ u8 reserved2;
+ u8 psn[3]; /* packet sequence number */
+ struct smc_clc_msg_trail smcr_trl;
+ /* eye catcher "SMCR" EBCDIC */
+ } __packed;
+ struct { /* SMC-D */
+ u64 gid; /* Sender GID */
+ u64 token; /* DMB token */
+ u8 dmbe_idx; /* DMBE index */
+#if defined(__BIG_ENDIAN_BITFIELD)
+ u8 dmbe_size : 4, /* buf size (compressed) */
+ reserved3 : 4;
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+ u8 reserved3 : 4,
+ dmbe_size : 4;
+#endif
+ u16 reserved4;
+ u32 linkid; /* Link identifier */
+ u32 reserved5[3];
+ struct smc_clc_msg_trail smcd_trl;
+ /* eye catcher "SMCD" EBCDIC */
+ } __packed;
+ };
} __packed; /* format defined in RFC7609 */
struct smc_clc_msg_decline { /* clc decline message */
@@ -129,13 +167,26 @@ smc_clc_proposal_get_prefix(struct smc_clc_msg_proposal *pclc)
((u8 *)pclc + sizeof(*pclc) + ntohs(pclc->iparea_offset));
}
+/* get SMC-D info from proposal message */
+static inline struct smc_clc_msg_smcd *
+smc_get_clc_msg_smcd(struct smc_clc_msg_proposal *prop)
+{
+ if (ntohs(prop->iparea_offset) != sizeof(struct smc_clc_msg_smcd))
+ return NULL;
+
+ return (struct smc_clc_msg_smcd *)(prop + 1);
+}
+
+struct smcd_dev;
+
int smc_clc_prfx_match(struct socket *clcsock,
struct smc_clc_msg_proposal_prefix *prop);
int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
u8 expected_type);
int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info);
-int smc_clc_send_proposal(struct smc_sock *smc, struct smc_ib_device *smcibdev,
- u8 ibport);
+int smc_clc_send_proposal(struct smc_sock *smc, int smc_type,
+ struct smc_ib_device *smcibdev, u8 ibport, u8 gid[],
+ struct smcd_dev *ismdev);
int smc_clc_send_confirm(struct smc_sock *smc);
int smc_clc_send_accept(struct smc_sock *smc, int srv_first_contact);
diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c
index ac961dfb1ea1..ea2b87f29469 100644
--- a/net/smc/smc_close.c
+++ b/net/smc/smc_close.c
@@ -100,15 +100,14 @@ static void smc_close_active_abort(struct smc_sock *smc)
struct smc_cdc_conn_state_flags *txflags =
&smc->conn.local_tx_ctrl.conn_state_flags;
- sk->sk_err = ECONNABORTED;
- if (smc->clcsock && smc->clcsock->sk) {
- smc->clcsock->sk->sk_err = ECONNABORTED;
- smc->clcsock->sk->sk_state_change(smc->clcsock->sk);
+ if (sk->sk_state != SMC_INIT && smc->clcsock && smc->clcsock->sk) {
+ sk->sk_err = ECONNABORTED;
+ if (smc->clcsock && smc->clcsock->sk) {
+ smc->clcsock->sk->sk_err = ECONNABORTED;
+ smc->clcsock->sk->sk_state_change(smc->clcsock->sk);
+ }
}
switch (sk->sk_state) {
- case SMC_INIT:
- sk->sk_state = SMC_PEERABORTWAIT;
- break;
case SMC_ACTIVE:
sk->sk_state = SMC_PEERABORTWAIT;
release_sock(sk);
@@ -143,6 +142,7 @@ static void smc_close_active_abort(struct smc_sock *smc)
case SMC_PEERFINCLOSEWAIT:
sock_put(sk); /* passive closing */
break;
+ case SMC_INIT:
case SMC_PEERABORTWAIT:
case SMC_CLOSED:
break;
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index add82b0266f3..e871368500e3 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -16,6 +16,7 @@
#include <net/tcp.h>
#include <net/sock.h>
#include <rdma/ib_verbs.h>
+#include <rdma/ib_cache.h>
#include "smc.h"
#include "smc_clc.h"
@@ -25,10 +26,12 @@
#include "smc_llc.h"
#include "smc_cdc.h"
#include "smc_close.h"
+#include "smc_ism.h"
#define SMC_LGR_NUM_INCR 256
#define SMC_LGR_FREE_DELAY_SERV (600 * HZ)
#define SMC_LGR_FREE_DELAY_CLNT (SMC_LGR_FREE_DELAY_SERV + 10 * HZ)
+#define SMC_LGR_FREE_DELAY_FAST (8 * HZ)
static struct smc_lgr_list smc_lgr_list = { /* established link groups */
.lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
@@ -46,8 +49,13 @@ static void smc_lgr_schedule_free_work(struct smc_link_group *lgr)
* otherwise there is a risk of out-of-sync link groups.
*/
mod_delayed_work(system_wq, &lgr->free_work,
- lgr->role == SMC_CLNT ? SMC_LGR_FREE_DELAY_CLNT :
- SMC_LGR_FREE_DELAY_SERV);
+ (!lgr->is_smcd && lgr->role == SMC_CLNT) ?
+ SMC_LGR_FREE_DELAY_CLNT : SMC_LGR_FREE_DELAY_SERV);
+}
+
+void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr)
+{
+ mod_delayed_work(system_wq, &lgr->free_work, SMC_LGR_FREE_DELAY_FAST);
}
/* Register connection's alert token in our lookup structure.
@@ -132,6 +140,20 @@ static void smc_lgr_unregister_conn(struct smc_connection *conn)
smc_lgr_schedule_free_work(lgr);
}
+/* Send delete link, either as client to request the initiation
+ * of the DELETE LINK sequence from server; or as server to
+ * initiate the delete processing. See smc_llc_rx_delete_link().
+ */
+static int smc_link_send_delete(struct smc_link *lnk)
+{
+ if (lnk->state == SMC_LNK_ACTIVE &&
+ !smc_llc_send_delete_link(lnk, SMC_LLC_REQ, true)) {
+ smc_llc_link_deleting(lnk);
+ return 0;
+ }
+ return -ENOTCONN;
+}
+
static void smc_lgr_free_work(struct work_struct *work)
{
struct smc_link_group *lgr = container_of(to_delayed_work(work),
@@ -152,17 +174,30 @@ static void smc_lgr_free_work(struct work_struct *work)
list_del_init(&lgr->list); /* remove from smc_lgr_list */
free:
spin_unlock_bh(&smc_lgr_list.lock);
+
+ if (!lgr->is_smcd && !lgr->terminating) {
+ /* try to send del link msg, on error free lgr immediately */
+ if (!smc_link_send_delete(&lgr->lnk[SMC_SINGLE_LINK])) {
+ /* reschedule in case we never receive a response */
+ smc_lgr_schedule_free_work(lgr);
+ return;
+ }
+ }
+
if (!delayed_work_pending(&lgr->free_work)) {
- if (lgr->lnk[SMC_SINGLE_LINK].state != SMC_LNK_INACTIVE)
- smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);
+ struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
+
+ if (!lgr->is_smcd && lnk->state != SMC_LNK_INACTIVE)
+ smc_llc_link_inactive(lnk);
smc_lgr_free(lgr);
}
}
/* create a new SMC link group */
-static int smc_lgr_create(struct smc_sock *smc,
+static int smc_lgr_create(struct smc_sock *smc, bool is_smcd,
struct smc_ib_device *smcibdev, u8 ibport,
- char *peer_systemid, unsigned short vlan_id)
+ char *peer_systemid, unsigned short vlan_id,
+ struct smcd_dev *smcismdev, u64 peer_gid)
{
struct smc_link_group *lgr;
struct smc_link *lnk;
@@ -170,17 +205,23 @@ static int smc_lgr_create(struct smc_sock *smc,
int rc = 0;
int i;
+ if (is_smcd && vlan_id) {
+ rc = smc_ism_get_vlan(smcismdev, vlan_id);
+ if (rc)
+ goto out;
+ }
+
lgr = kzalloc(sizeof(*lgr), GFP_KERNEL);
if (!lgr) {
rc = -ENOMEM;
goto out;
}
- lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
+ lgr->is_smcd = is_smcd;
lgr->sync_err = 0;
- memcpy(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN);
lgr->vlan_id = vlan_id;
rwlock_init(&lgr->sndbufs_lock);
rwlock_init(&lgr->rmbs_lock);
+ rwlock_init(&lgr->conns_lock);
for (i = 0; i < SMC_RMBE_SIZES; i++) {
INIT_LIST_HEAD(&lgr->sndbufs[i]);
INIT_LIST_HEAD(&lgr->rmbs[i]);
@@ -189,36 +230,48 @@ static int smc_lgr_create(struct smc_sock *smc,
memcpy(&lgr->id, (u8 *)&smc_lgr_list.num, SMC_LGR_ID_SIZE);
INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work);
lgr->conns_all = RB_ROOT;
-
- lnk = &lgr->lnk[SMC_SINGLE_LINK];
- /* initialize link */
- lnk->state = SMC_LNK_ACTIVATING;
- lnk->link_id = SMC_SINGLE_LINK;
- lnk->smcibdev = smcibdev;
- lnk->ibport = ibport;
- lnk->path_mtu = smcibdev->pattr[ibport - 1].active_mtu;
- if (!smcibdev->initialized)
- smc_ib_setup_per_ibdev(smcibdev);
- get_random_bytes(rndvec, sizeof(rndvec));
- lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + (rndvec[2] << 16);
- rc = smc_llc_link_init(lnk);
- if (rc)
- goto free_lgr;
- rc = smc_wr_alloc_link_mem(lnk);
- if (rc)
- goto clear_llc_lnk;
- rc = smc_ib_create_protection_domain(lnk);
- if (rc)
- goto free_link_mem;
- rc = smc_ib_create_queue_pair(lnk);
- if (rc)
- goto dealloc_pd;
- rc = smc_wr_create_link(lnk);
- if (rc)
- goto destroy_qp;
-
+ if (is_smcd) {
+ /* SMC-D specific settings */
+ lgr->peer_gid = peer_gid;
+ lgr->smcd = smcismdev;
+ } else {
+ /* SMC-R specific settings */
+ lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
+ memcpy(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN);
+
+ lnk = &lgr->lnk[SMC_SINGLE_LINK];
+ /* initialize link */
+ lnk->state = SMC_LNK_ACTIVATING;
+ lnk->link_id = SMC_SINGLE_LINK;
+ lnk->smcibdev = smcibdev;
+ lnk->ibport = ibport;
+ lnk->path_mtu = smcibdev->pattr[ibport - 1].active_mtu;
+ if (!smcibdev->initialized)
+ smc_ib_setup_per_ibdev(smcibdev);
+ get_random_bytes(rndvec, sizeof(rndvec));
+ lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) +
+ (rndvec[2] << 16);
+ rc = smc_ib_determine_gid(lnk->smcibdev, lnk->ibport,
+ vlan_id, lnk->gid, &lnk->sgid_index);
+ if (rc)
+ goto free_lgr;
+ rc = smc_llc_link_init(lnk);
+ if (rc)
+ goto free_lgr;
+ rc = smc_wr_alloc_link_mem(lnk);
+ if (rc)
+ goto clear_llc_lnk;
+ rc = smc_ib_create_protection_domain(lnk);
+ if (rc)
+ goto free_link_mem;
+ rc = smc_ib_create_queue_pair(lnk);
+ if (rc)
+ goto dealloc_pd;
+ rc = smc_wr_create_link(lnk);
+ if (rc)
+ goto destroy_qp;
+ }
smc->conn.lgr = lgr;
- rwlock_init(&lgr->conns_lock);
spin_lock_bh(&smc_lgr_list.lock);
list_add(&lgr->list, &smc_lgr_list.list);
spin_unlock_bh(&smc_lgr_list.lock);
@@ -264,7 +317,12 @@ void smc_conn_free(struct smc_connection *conn)
{
if (!conn->lgr)
return;
- smc_cdc_tx_dismiss_slots(conn);
+ if (conn->lgr->is_smcd) {
+ smc_ism_unset_conn(conn);
+ tasklet_kill(&conn->rx_tsklet);
+ } else {
+ smc_cdc_tx_dismiss_slots(conn);
+ }
smc_lgr_unregister_conn(conn);
smc_buf_unuse(conn);
}
@@ -280,8 +338,8 @@ static void smc_link_clear(struct smc_link *lnk)
smc_wr_free_link_mem(lnk);
}
-static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb,
- struct smc_buf_desc *buf_desc)
+static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb,
+ struct smc_buf_desc *buf_desc)
{
struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
@@ -301,6 +359,28 @@ static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb,
kfree(buf_desc);
}
+static void smcd_buf_free(struct smc_link_group *lgr, bool is_dmb,
+ struct smc_buf_desc *buf_desc)
+{
+ if (is_dmb) {
+ /* restore original buf len */
+ buf_desc->len += sizeof(struct smcd_cdc_msg);
+ smc_ism_unregister_dmb(lgr->smcd, buf_desc);
+ } else {
+ kfree(buf_desc->cpu_addr);
+ }
+ kfree(buf_desc);
+}
+
+static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb,
+ struct smc_buf_desc *buf_desc)
+{
+ if (lgr->is_smcd)
+ smcd_buf_free(lgr, is_rmb, buf_desc);
+ else
+ smcr_buf_free(lgr, is_rmb, buf_desc);
+}
+
static void __smc_lgr_free_bufs(struct smc_link_group *lgr, bool is_rmb)
{
struct smc_buf_desc *buf_desc, *bf_desc;
@@ -332,7 +412,10 @@ static void smc_lgr_free_bufs(struct smc_link_group *lgr)
void smc_lgr_free(struct smc_link_group *lgr)
{
smc_lgr_free_bufs(lgr);
- smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]);
+ if (lgr->is_smcd)
+ smc_ism_put_vlan(lgr->smcd, lgr->vlan_id);
+ else
+ smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]);
kfree(lgr);
}
@@ -357,7 +440,8 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr)
lgr->terminating = 1;
if (!list_empty(&lgr->list)) /* forget lgr */
list_del_init(&lgr->list);
- smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);
+ if (!lgr->is_smcd)
+ smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);
write_lock_bh(&lgr->conns_lock);
node = rb_first(&lgr->conns_all);
@@ -374,7 +458,8 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr)
node = rb_first(&lgr->conns_all);
}
write_unlock_bh(&lgr->conns_lock);
- wake_up(&lgr->lnk[SMC_SINGLE_LINK].wr_reg_wait);
+ if (!lgr->is_smcd)
+ wake_up(&lgr->lnk[SMC_SINGLE_LINK].wr_reg_wait);
smc_lgr_schedule_free_work(lgr);
}
@@ -392,17 +477,44 @@ void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport)
spin_lock_bh(&smc_lgr_list.lock);
list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) {
- if (lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev &&
+ if (!lgr->is_smcd &&
+ lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev &&
lgr->lnk[SMC_SINGLE_LINK].ibport == ibport)
__smc_lgr_terminate(lgr);
}
spin_unlock_bh(&smc_lgr_list.lock);
}
+/* Called when SMC-D device is terminated or peer is lost */
+void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid)
+{
+ struct smc_link_group *lgr, *l;
+ LIST_HEAD(lgr_free_list);
+
+ /* run common cleanup function and build free list */
+ spin_lock_bh(&smc_lgr_list.lock);
+ list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) {
+ if (lgr->is_smcd && lgr->smcd == dev &&
+ (!peer_gid || lgr->peer_gid == peer_gid) &&
+ !list_empty(&lgr->list)) {
+ __smc_lgr_terminate(lgr);
+ list_move(&lgr->list, &lgr_free_list);
+ }
+ }
+ spin_unlock_bh(&smc_lgr_list.lock);
+
+ /* cancel the regular free workers and actually free lgrs */
+ list_for_each_entry_safe(lgr, l, &lgr_free_list, list) {
+ list_del_init(&lgr->list);
+ cancel_delayed_work_sync(&lgr->free_work);
+ smc_lgr_free(lgr);
+ }
+}
+
/* Determine vlan of internal TCP socket.
* @vlan_id: address to store the determined vlan id into
*/
-static int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id)
+int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id)
{
struct dst_entry *dst = sk_dst_get(clcsock->sk);
struct net_device *ndev;
@@ -446,41 +558,30 @@ out:
return rc;
}
-/* determine the link gid matching the vlan id of the link group */
-static int smc_link_determine_gid(struct smc_link_group *lgr)
+static bool smcr_lgr_match(struct smc_link_group *lgr,
+ struct smc_clc_msg_local *lcl,
+ enum smc_lgr_role role)
{
- struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
- struct ib_gid_attr gattr;
- union ib_gid gid;
- int i;
-
- if (!lgr->vlan_id) {
- lnk->gid = lnk->smcibdev->gid[lnk->ibport - 1];
- return 0;
- }
+ return !memcmp(lgr->peer_systemid, lcl->id_for_peer,
+ SMC_SYSTEMID_LEN) &&
+ !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_gid, &lcl->gid,
+ SMC_GID_SIZE) &&
+ !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_mac, lcl->mac,
+ sizeof(lcl->mac)) &&
+ lgr->role == role;
+}
- for (i = 0; i < lnk->smcibdev->pattr[lnk->ibport - 1].gid_tbl_len;
- i++) {
- if (ib_query_gid(lnk->smcibdev->ibdev, lnk->ibport, i, &gid,
- &gattr))
- continue;
- if (gattr.ndev) {
- if (is_vlan_dev(gattr.ndev) &&
- vlan_dev_vlan_id(gattr.ndev) == lgr->vlan_id) {
- lnk->gid = gid;
- dev_put(gattr.ndev);
- return 0;
- }
- dev_put(gattr.ndev);
- }
- }
- return -ENODEV;
+static bool smcd_lgr_match(struct smc_link_group *lgr,
+ struct smcd_dev *smcismdev, u64 peer_gid)
+{
+ return lgr->peer_gid == peer_gid && lgr->smcd == smcismdev;
}
/* create a new SMC connection (and a new link group if necessary) */
-int smc_conn_create(struct smc_sock *smc,
+int smc_conn_create(struct smc_sock *smc, bool is_smcd, int srv_first_contact,
struct smc_ib_device *smcibdev, u8 ibport,
- struct smc_clc_msg_local *lcl, int srv_first_contact)
+ struct smc_clc_msg_local *lcl, struct smcd_dev *smcd,
+ u64 peer_gid)
{
struct smc_connection *conn = &smc->conn;
int local_contact = SMC_FIRST_CONTACT;
@@ -502,17 +603,12 @@ int smc_conn_create(struct smc_sock *smc,
spin_lock_bh(&smc_lgr_list.lock);
list_for_each_entry(lgr, &smc_lgr_list.list, list) {
write_lock_bh(&lgr->conns_lock);
- if (!memcmp(lgr->peer_systemid, lcl->id_for_peer,
- SMC_SYSTEMID_LEN) &&
- !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_gid, &lcl->gid,
- SMC_GID_SIZE) &&
- !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_mac, lcl->mac,
- sizeof(lcl->mac)) &&
+ if ((is_smcd ? smcd_lgr_match(lgr, smcd, peer_gid) :
+ smcr_lgr_match(lgr, lcl, role)) &&
!lgr->sync_err &&
- (lgr->role == role) &&
- (lgr->vlan_id == vlan_id) &&
- ((role == SMC_CLNT) ||
- (lgr->conns_num < SMC_RMBS_PER_LGR_MAX))) {
+ lgr->vlan_id == vlan_id &&
+ (role == SMC_CLNT ||
+ lgr->conns_num < SMC_RMBS_PER_LGR_MAX)) {
/* link group found */
local_contact = SMC_REUSE_CONTACT;
conn->lgr = lgr;
@@ -535,16 +631,19 @@ int smc_conn_create(struct smc_sock *smc,
create:
if (local_contact == SMC_FIRST_CONTACT) {
- rc = smc_lgr_create(smc, smcibdev, ibport,
- lcl->id_for_peer, vlan_id);
+ rc = smc_lgr_create(smc, is_smcd, smcibdev, ibport,
+ lcl->id_for_peer, vlan_id, smcd, peer_gid);
if (rc)
goto out;
smc_lgr_register_conn(conn); /* add smc conn to lgr */
- rc = smc_link_determine_gid(conn->lgr);
}
conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE;
conn->local_tx_ctrl.len = SMC_WR_TX_SIZE;
conn->urg_state = SMC_URG_READ;
+ if (is_smcd) {
+ conn->rx_off = sizeof(struct smcd_cdc_msg);
+ smcd_cdc_rx_init(conn); /* init tasklet for this conn */
+ }
#ifndef KERNEL_HAS_ATOMIC64
spin_lock_init(&conn->acurs_lock);
#endif
@@ -609,8 +708,8 @@ static inline int smc_rmb_wnd_update_limit(int rmbe_size)
return min_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2);
}
-static struct smc_buf_desc *smc_new_buf_create(struct smc_link_group *lgr,
- bool is_rmb, int bufsize)
+static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr,
+ bool is_rmb, int bufsize)
{
struct smc_buf_desc *buf_desc;
struct smc_link *lnk;
@@ -668,7 +767,44 @@ static struct smc_buf_desc *smc_new_buf_create(struct smc_link_group *lgr,
return buf_desc;
}
-static int __smc_buf_create(struct smc_sock *smc, bool is_rmb)
+#define SMCD_DMBE_SIZES 7 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */
+
+static struct smc_buf_desc *smcd_new_buf_create(struct smc_link_group *lgr,
+ bool is_dmb, int bufsize)
+{
+ struct smc_buf_desc *buf_desc;
+ int rc;
+
+ if (smc_compress_bufsize(bufsize) > SMCD_DMBE_SIZES)
+ return ERR_PTR(-EAGAIN);
+
+ /* try to alloc a new DMB */
+ buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL);
+ if (!buf_desc)
+ return ERR_PTR(-ENOMEM);
+ if (is_dmb) {
+ rc = smc_ism_register_dmb(lgr, bufsize, buf_desc);
+ if (rc) {
+ kfree(buf_desc);
+ return ERR_PTR(-EAGAIN);
+ }
+ buf_desc->pages = virt_to_page(buf_desc->cpu_addr);
+ /* CDC header stored in buf. So, pretend it was smaller */
+ buf_desc->len = bufsize - sizeof(struct smcd_cdc_msg);
+ } else {
+ buf_desc->cpu_addr = kzalloc(bufsize, GFP_KERNEL |
+ __GFP_NOWARN | __GFP_NORETRY |
+ __GFP_NOMEMALLOC);
+ if (!buf_desc->cpu_addr) {
+ kfree(buf_desc);
+ return ERR_PTR(-EAGAIN);
+ }
+ buf_desc->len = bufsize;
+ }
+ return buf_desc;
+}
+
+static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
{
struct smc_buf_desc *buf_desc = ERR_PTR(-ENOMEM);
struct smc_connection *conn = &smc->conn;
@@ -706,7 +842,11 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_rmb)
break; /* found reusable slot */
}
- buf_desc = smc_new_buf_create(lgr, is_rmb, bufsize);
+ if (is_smcd)
+ buf_desc = smcd_new_buf_create(lgr, is_rmb, bufsize);
+ else
+ buf_desc = smcr_new_buf_create(lgr, is_rmb, bufsize);
+
if (PTR_ERR(buf_desc) == -ENOMEM)
break;
if (IS_ERR(buf_desc))
@@ -727,7 +867,10 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_rmb)
conn->rmbe_size_short = bufsize_short;
smc->sk.sk_rcvbuf = bufsize * 2;
atomic_set(&conn->bytes_to_rcv, 0);
- conn->rmbe_update_limit = smc_rmb_wnd_update_limit(bufsize);
+ conn->rmbe_update_limit =
+ smc_rmb_wnd_update_limit(buf_desc->len);
+ if (is_smcd)
+ smc_ism_set_conn(conn); /* map RMB/smcd_dev to conn */
} else {
conn->sndbuf_desc = buf_desc;
smc->sk.sk_sndbuf = bufsize * 2;
@@ -740,6 +883,8 @@ void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn)
{
struct smc_link_group *lgr = conn->lgr;
+ if (!conn->lgr || conn->lgr->is_smcd)
+ return;
smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
conn->sndbuf_desc, DMA_TO_DEVICE);
}
@@ -748,6 +893,8 @@ void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn)
{
struct smc_link_group *lgr = conn->lgr;
+ if (!conn->lgr || conn->lgr->is_smcd)
+ return;
smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
conn->sndbuf_desc, DMA_TO_DEVICE);
}
@@ -756,6 +903,8 @@ void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn)
{
struct smc_link_group *lgr = conn->lgr;
+ if (!conn->lgr || conn->lgr->is_smcd)
+ return;
smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
conn->rmb_desc, DMA_FROM_DEVICE);
}
@@ -764,6 +913,8 @@ void smc_rmb_sync_sg_for_device(struct smc_connection *conn)
{
struct smc_link_group *lgr = conn->lgr;
+ if (!conn->lgr || conn->lgr->is_smcd)
+ return;
smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
conn->rmb_desc, DMA_FROM_DEVICE);
}
@@ -774,16 +925,16 @@ void smc_rmb_sync_sg_for_device(struct smc_connection *conn)
* the Linux implementation uses just one RMB-element per RMB, i.e. uses an
* extra RMB for every connection in a link group
*/
-int smc_buf_create(struct smc_sock *smc)
+int smc_buf_create(struct smc_sock *smc, bool is_smcd)
{
int rc;
/* create send buffer */
- rc = __smc_buf_create(smc, false);
+ rc = __smc_buf_create(smc, is_smcd, false);
if (rc)
return rc;
/* create rmb */
- rc = __smc_buf_create(smc, true);
+ rc = __smc_buf_create(smc, is_smcd, true);
if (rc)
smc_buf_free(smc->conn.lgr, false, smc->conn.sndbuf_desc);
return rc;
@@ -865,7 +1016,14 @@ void smc_core_exit(void)
spin_unlock_bh(&smc_lgr_list.lock);
list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) {
list_del_init(&lgr->list);
- smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);
+ if (!lgr->is_smcd) {
+ struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
+
+ if (lnk->state == SMC_LNK_ACTIVE)
+ smc_llc_send_delete_link(lnk, SMC_LLC_REQ,
+ false);
+ smc_llc_link_inactive(lnk);
+ }
cancel_delayed_work_sync(&lgr->free_work);
smc_lgr_free(lgr); /* free link group */
}
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index 93cb3523bf50..c156674733c9 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -34,7 +34,8 @@ enum smc_lgr_role { /* possible roles of a link group */
enum smc_link_state { /* possible states of a link */
SMC_LNK_INACTIVE, /* link is inactive */
SMC_LNK_ACTIVATING, /* link is being activated */
- SMC_LNK_ACTIVE /* link is active */
+ SMC_LNK_ACTIVE, /* link is active */
+ SMC_LNK_DELETING, /* link is being deleted */
};
#define SMC_WR_BUF_SIZE 48 /* size of work request buffer */
@@ -84,14 +85,15 @@ struct smc_link {
wait_queue_head_t wr_reg_wait; /* wait for wr_reg result */
enum smc_wr_reg_state wr_reg_state; /* state of wr_reg request */
- union ib_gid gid; /* gid matching used vlan id */
+ u8 gid[SMC_GID_SIZE];/* gid matching used vlan id*/
+ u8 sgid_index; /* gid index for vlan id */
u32 peer_qpn; /* QP number of peer */
enum ib_mtu path_mtu; /* used mtu */
enum ib_mtu peer_mtu; /* mtu size of peer */
u32 psn_initial; /* QP tx initial packet seqno */
u32 peer_psn; /* QP rx initial packet seqno */
u8 peer_mac[ETH_ALEN]; /* = gid[8:10||13:15] */
- u8 peer_gid[sizeof(union ib_gid)]; /* gid of peer*/
+ u8 peer_gid[SMC_GID_SIZE]; /* gid of peer*/
u8 link_id; /* unique # within link group */
enum smc_link_state state; /* state of link */
@@ -124,15 +126,28 @@ struct smc_buf_desc {
void *cpu_addr; /* virtual address of buffer */
struct page *pages;
int len; /* length of buffer */
- struct sg_table sgt[SMC_LINKS_PER_LGR_MAX];/* virtual buffer */
- struct ib_mr *mr_rx[SMC_LINKS_PER_LGR_MAX];
- /* for rmb only: memory region
- * incl. rkey provided to peer
- */
- u32 order; /* allocation order */
u32 used; /* currently used / unused */
u8 reused : 1; /* new created / reused */
u8 regerr : 1; /* err during registration */
+ union {
+ struct { /* SMC-R */
+ struct sg_table sgt[SMC_LINKS_PER_LGR_MAX];
+ /* virtual buffer */
+ struct ib_mr *mr_rx[SMC_LINKS_PER_LGR_MAX];
+ /* for rmb only: memory region
+ * incl. rkey provided to peer
+ */
+ u32 order; /* allocation order */
+ };
+ struct { /* SMC-D */
+ unsigned short sba_idx;
+ /* SBA index number */
+ u64 token;
+ /* DMB token number */
+ dma_addr_t dma_addr;
+ /* DMA address */
+ };
+ };
};
struct smc_rtoken { /* address/key of remote RMB */
@@ -148,12 +163,10 @@ struct smc_rtoken { /* address/key of remote RMB */
* struct smc_clc_msg_accept_confirm.rmbe_size being a 4 bit value (0..15)
*/
+struct smcd_dev;
+
struct smc_link_group {
struct list_head list;
- enum smc_lgr_role role; /* client or server */
- struct smc_link lnk[SMC_LINKS_PER_LGR_MAX]; /* smc link */
- char peer_systemid[SMC_SYSTEMID_LEN];
- /* unique system_id of peer */
struct rb_root conns_all; /* connection tree */
rwlock_t conns_lock; /* protects conns_all */
unsigned int conns_num; /* current # of connections */
@@ -163,17 +176,34 @@ struct smc_link_group {
rwlock_t sndbufs_lock; /* protects tx buffers */
struct list_head rmbs[SMC_RMBE_SIZES]; /* rx buffers */
rwlock_t rmbs_lock; /* protects rx buffers */
- struct smc_rtoken rtokens[SMC_RMBS_PER_LGR_MAX]
- [SMC_LINKS_PER_LGR_MAX];
- /* remote addr/key pairs */
- unsigned long rtokens_used_mask[BITS_TO_LONGS(
- SMC_RMBS_PER_LGR_MAX)];
- /* used rtoken elements */
u8 id[SMC_LGR_ID_SIZE]; /* unique lgr id */
struct delayed_work free_work; /* delayed freeing of an lgr */
u8 sync_err : 1; /* lgr no longer fits to peer */
u8 terminating : 1;/* lgr is terminating */
+
+ bool is_smcd; /* SMC-R or SMC-D */
+ union {
+ struct { /* SMC-R */
+ enum smc_lgr_role role;
+ /* client or server */
+ struct smc_link lnk[SMC_LINKS_PER_LGR_MAX];
+ /* smc link */
+ char peer_systemid[SMC_SYSTEMID_LEN];
+ /* unique system_id of peer */
+ struct smc_rtoken rtokens[SMC_RMBS_PER_LGR_MAX]
+ [SMC_LINKS_PER_LGR_MAX];
+ /* remote addr/key pairs */
+ DECLARE_BITMAP(rtokens_used_mask, SMC_RMBS_PER_LGR_MAX);
+ /* used rtoken elements */
+ };
+ struct { /* SMC-D */
+ u64 peer_gid;
+ /* Peer GID (remote) */
+ struct smcd_dev *smcd;
+ /* ISM device for VLAN reg. */
+ };
+ };
};
/* Find the connection associated with the given alert token in the link group.
@@ -217,7 +247,8 @@ void smc_lgr_free(struct smc_link_group *lgr);
void smc_lgr_forget(struct smc_link_group *lgr);
void smc_lgr_terminate(struct smc_link_group *lgr);
void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport);
-int smc_buf_create(struct smc_sock *smc);
+void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid);
+int smc_buf_create(struct smc_sock *smc, bool is_smcd);
int smc_uncompress_bufsize(u8 compressed);
int smc_rmb_rtoken_handling(struct smc_connection *conn,
struct smc_clc_msg_accept_confirm *clc);
@@ -227,9 +258,19 @@ void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn);
void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn);
void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn);
void smc_rmb_sync_sg_for_device(struct smc_connection *conn);
+int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id);
+
void smc_conn_free(struct smc_connection *conn);
-int smc_conn_create(struct smc_sock *smc,
+int smc_conn_create(struct smc_sock *smc, bool is_smcd, int srv_first_contact,
struct smc_ib_device *smcibdev, u8 ibport,
- struct smc_clc_msg_local *lcl, int srv_first_contact);
+ struct smc_clc_msg_local *lcl, struct smcd_dev *smcd,
+ u64 peer_gid);
+void smcd_conn_free(struct smc_connection *conn);
+void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr);
void smc_core_exit(void);
+
+static inline struct smc_link_group *smc_get_lgr(struct smc_link *link)
+{
+ return container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]);
+}
#endif
diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c
index 839354402215..dbf64a93d68a 100644
--- a/net/smc/smc_diag.c
+++ b/net/smc/smc_diag.c
@@ -79,6 +79,7 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb,
struct nlattr *bc)
{
struct smc_sock *smc = smc_sk(sk);
+ struct smc_diag_fallback fallback;
struct user_namespace *user_ns;
struct smc_diag_msg *r;
struct nlmsghdr *nlh;
@@ -91,11 +92,21 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb,
r = nlmsg_data(nlh);
smc_diag_msg_common_fill(r, sk);
r->diag_state = sk->sk_state;
- r->diag_fallback = smc->use_fallback;
+ if (smc->use_fallback)
+ r->diag_mode = SMC_DIAG_MODE_FALLBACK_TCP;
+ else if (smc->conn.lgr && smc->conn.lgr->is_smcd)
+ r->diag_mode = SMC_DIAG_MODE_SMCD;
+ else
+ r->diag_mode = SMC_DIAG_MODE_SMCR;
user_ns = sk_user_ns(NETLINK_CB(cb->skb).sk);
if (smc_diag_msg_attrs_fill(sk, skb, r, user_ns))
goto errout;
+ fallback.reason = smc->fallback_rsn;
+ fallback.peer_diagnosis = smc->peer_diagnosis;
+ if (nla_put(skb, SMC_DIAG_FALLBACK, sizeof(fallback), &fallback) < 0)
+ goto errout;
+
if ((req->diag_ext & (1 << (SMC_DIAG_CONNINFO - 1))) &&
smc->conn.alert_token_local) {
struct smc_connection *conn = &smc->conn;
@@ -136,7 +147,8 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb,
goto errout;
}
- if ((req->diag_ext & (1 << (SMC_DIAG_LGRINFO - 1))) && smc->conn.lgr &&
+ if (smc->conn.lgr && !smc->conn.lgr->is_smcd &&
+ (req->diag_ext & (1 << (SMC_DIAG_LGRINFO - 1))) &&
!list_empty(&smc->conn.lgr->list)) {
struct smc_diag_lgrinfo linfo = {
.role = smc->conn.lgr->role,
@@ -148,13 +160,28 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb,
smc->conn.lgr->lnk[0].smcibdev->ibdev->name,
sizeof(smc->conn.lgr->lnk[0].smcibdev->ibdev->name));
smc_gid_be16_convert(linfo.lnk[0].gid,
- smc->conn.lgr->lnk[0].gid.raw);
+ smc->conn.lgr->lnk[0].gid);
smc_gid_be16_convert(linfo.lnk[0].peer_gid,
smc->conn.lgr->lnk[0].peer_gid);
if (nla_put(skb, SMC_DIAG_LGRINFO, sizeof(linfo), &linfo) < 0)
goto errout;
}
+ if (smc->conn.lgr && smc->conn.lgr->is_smcd &&
+ (req->diag_ext & (1 << (SMC_DIAG_DMBINFO - 1))) &&
+ !list_empty(&smc->conn.lgr->list)) {
+ struct smc_connection *conn = &smc->conn;
+ struct smcd_diag_dmbinfo dinfo = {
+ .linkid = *((u32 *)conn->lgr->id),
+ .peer_gid = conn->lgr->peer_gid,
+ .my_gid = conn->lgr->smcd->local_gid,
+ .token = conn->rmb_desc->token,
+ .peer_token = conn->peer_token
+ };
+
+ if (nla_put(skb, SMC_DIAG_DMBINFO, sizeof(dinfo), &dinfo) < 0)
+ goto errout;
+ }
nlmsg_end(skb, nlh);
return 0;
diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
index 0eed7ab9f28b..e519ef29c0ff 100644
--- a/net/smc/smc_ib.c
+++ b/net/smc/smc_ib.c
@@ -16,6 +16,7 @@
#include <linux/workqueue.h>
#include <linux/scatterlist.h>
#include <rdma/ib_verbs.h>
+#include <rdma/ib_cache.h>
#include "smc_pnet.h"
#include "smc_ib.h"
@@ -68,7 +69,7 @@ static int smc_ib_modify_qp_rtr(struct smc_link *lnk)
qp_attr.path_mtu = min(lnk->path_mtu, lnk->peer_mtu);
qp_attr.ah_attr.type = RDMA_AH_ATTR_TYPE_ROCE;
rdma_ah_set_port_num(&qp_attr.ah_attr, lnk->ibport);
- rdma_ah_set_grh(&qp_attr.ah_attr, NULL, 0, 0, 1, 0);
+ rdma_ah_set_grh(&qp_attr.ah_attr, NULL, 0, lnk->sgid_index, 1, 0);
rdma_ah_set_dgid_raw(&qp_attr.ah_attr, lnk->peer_gid);
memcpy(&qp_attr.ah_attr.roce.dmac, lnk->peer_mac,
sizeof(lnk->peer_mac));
@@ -112,8 +113,7 @@ int smc_ib_modify_qp_reset(struct smc_link *lnk)
int smc_ib_ready_link(struct smc_link *lnk)
{
- struct smc_link_group *lgr =
- container_of(lnk, struct smc_link_group, lnk[0]);
+ struct smc_link_group *lgr = smc_get_lgr(lnk);
int rc = 0;
rc = smc_ib_modify_qp_init(lnk);
@@ -143,6 +143,95 @@ out:
return rc;
}
+static int smc_ib_fill_mac(struct smc_ib_device *smcibdev, u8 ibport)
+{
+ const struct ib_gid_attr *attr;
+ int rc = 0;
+
+ attr = rdma_get_gid_attr(smcibdev->ibdev, ibport, 0);
+ if (IS_ERR(attr))
+ return -ENODEV;
+
+ if (attr->ndev)
+ memcpy(smcibdev->mac[ibport - 1], attr->ndev->dev_addr,
+ ETH_ALEN);
+ else
+ rc = -ENODEV;
+
+ rdma_put_gid_attr(attr);
+ return rc;
+}
+
+/* Create an identifier unique for this instance of SMC-R.
+ * The MAC-address of the first active registered IB device
+ * plus a random 2-byte number is used to create this identifier.
+ * This name is delivered to the peer during connection initialization.
+ */
+static inline void smc_ib_define_local_systemid(struct smc_ib_device *smcibdev,
+ u8 ibport)
+{
+ memcpy(&local_systemid[2], &smcibdev->mac[ibport - 1],
+ sizeof(smcibdev->mac[ibport - 1]));
+ get_random_bytes(&local_systemid[0], 2);
+}
+
+bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport)
+{
+ return smcibdev->pattr[ibport - 1].state == IB_PORT_ACTIVE;
+}
+
+/* determine the gid for an ib-device port and vlan id */
+int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport,
+ unsigned short vlan_id, u8 gid[], u8 *sgid_index)
+{
+ const struct ib_gid_attr *attr;
+ int i;
+
+ for (i = 0; i < smcibdev->pattr[ibport - 1].gid_tbl_len; i++) {
+ attr = rdma_get_gid_attr(smcibdev->ibdev, ibport, i);
+ if (IS_ERR(attr))
+ continue;
+
+ if (attr->ndev &&
+ ((!vlan_id && !is_vlan_dev(attr->ndev)) ||
+ (vlan_id && is_vlan_dev(attr->ndev) &&
+ vlan_dev_vlan_id(attr->ndev) == vlan_id)) &&
+ attr->gid_type == IB_GID_TYPE_ROCE) {
+ if (gid)
+ memcpy(gid, &attr->gid, SMC_GID_SIZE);
+ if (sgid_index)
+ *sgid_index = attr->index;
+ rdma_put_gid_attr(attr);
+ return 0;
+ }
+ rdma_put_gid_attr(attr);
+ }
+ return -ENODEV;
+}
+
+static int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport)
+{
+ int rc;
+
+ memset(&smcibdev->pattr[ibport - 1], 0,
+ sizeof(smcibdev->pattr[ibport - 1]));
+ rc = ib_query_port(smcibdev->ibdev, ibport,
+ &smcibdev->pattr[ibport - 1]);
+ if (rc)
+ goto out;
+ /* the SMC protocol requires specification of the RoCE MAC address */
+ rc = smc_ib_fill_mac(smcibdev, ibport);
+ if (rc)
+ goto out;
+ if (!strncmp(local_systemid, SMC_LOCAL_SYSTEMID_RESET,
+ sizeof(local_systemid)) &&
+ smc_ib_port_active(smcibdev, ibport))
+ /* create unique system identifier */
+ smc_ib_define_local_systemid(smcibdev, ibport);
+out:
+ return rc;
+}
+
/* process context wrapper for might_sleep smc_ib_remember_port_attr */
static void smc_ib_port_event_work(struct work_struct *work)
{
@@ -370,62 +459,6 @@ void smc_ib_buf_unmap_sg(struct smc_ib_device *smcibdev,
buf_slot->sgt[SMC_SINGLE_LINK].sgl->dma_address = 0;
}
-static int smc_ib_fill_gid_and_mac(struct smc_ib_device *smcibdev, u8 ibport)
-{
- struct ib_gid_attr gattr;
- int rc;
-
- rc = ib_query_gid(smcibdev->ibdev, ibport, 0,
- &smcibdev->gid[ibport - 1], &gattr);
- if (rc || !gattr.ndev)
- return -ENODEV;
-
- memcpy(smcibdev->mac[ibport - 1], gattr.ndev->dev_addr, ETH_ALEN);
- dev_put(gattr.ndev);
- return 0;
-}
-
-/* Create an identifier unique for this instance of SMC-R.
- * The MAC-address of the first active registered IB device
- * plus a random 2-byte number is used to create this identifier.
- * This name is delivered to the peer during connection initialization.
- */
-static inline void smc_ib_define_local_systemid(struct smc_ib_device *smcibdev,
- u8 ibport)
-{
- memcpy(&local_systemid[2], &smcibdev->mac[ibport - 1],
- sizeof(smcibdev->mac[ibport - 1]));
- get_random_bytes(&local_systemid[0], 2);
-}
-
-bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport)
-{
- return smcibdev->pattr[ibport - 1].state == IB_PORT_ACTIVE;
-}
-
-int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport)
-{
- int rc;
-
- memset(&smcibdev->pattr[ibport - 1], 0,
- sizeof(smcibdev->pattr[ibport - 1]));
- rc = ib_query_port(smcibdev->ibdev, ibport,
- &smcibdev->pattr[ibport - 1]);
- if (rc)
- goto out;
- /* the SMC protocol requires specification of the RoCE MAC address */
- rc = smc_ib_fill_gid_and_mac(smcibdev, ibport);
- if (rc)
- goto out;
- if (!strncmp(local_systemid, SMC_LOCAL_SYSTEMID_RESET,
- sizeof(local_systemid)) &&
- smc_ib_port_active(smcibdev, ibport))
- /* create unique system identifier */
- smc_ib_define_local_systemid(smcibdev, ibport);
-out:
- return rc;
-}
-
long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev)
{
struct ib_cq_init_attr cqattr = {
@@ -454,9 +487,6 @@ long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev)
smcibdev->roce_cq_recv = NULL;
goto err;
}
- INIT_IB_EVENT_HANDLER(&smcibdev->event_handler, smcibdev->ibdev,
- smc_ib_global_event_handler);
- ib_register_event_handler(&smcibdev->event_handler);
smc_wr_add_dev(smcibdev);
smcibdev->initialized = 1;
return rc;
@@ -472,7 +502,6 @@ static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev)
return;
smcibdev->initialized = 0;
smc_wr_remove_dev(smcibdev);
- ib_unregister_event_handler(&smcibdev->event_handler);
ib_destroy_cq(smcibdev->roce_cq_recv);
ib_destroy_cq(smcibdev->roce_cq_send);
}
@@ -483,6 +512,8 @@ static struct ib_client smc_ib_client;
static void smc_ib_add_dev(struct ib_device *ibdev)
{
struct smc_ib_device *smcibdev;
+ u8 port_cnt;
+ int i;
if (ibdev->node_type != RDMA_NODE_IB_CA)
return;
@@ -498,6 +529,21 @@ static void smc_ib_add_dev(struct ib_device *ibdev)
list_add_tail(&smcibdev->list, &smc_ib_devices.list);
spin_unlock(&smc_ib_devices.lock);
ib_set_client_data(ibdev, &smc_ib_client, smcibdev);
+ INIT_IB_EVENT_HANDLER(&smcibdev->event_handler, smcibdev->ibdev,
+ smc_ib_global_event_handler);
+ ib_register_event_handler(&smcibdev->event_handler);
+
+ /* trigger reading of the port attributes */
+ port_cnt = smcibdev->ibdev->phys_port_cnt;
+ for (i = 0;
+ i < min_t(size_t, port_cnt, SMC_MAX_PORTS);
+ i++) {
+ set_bit(i, &smcibdev->port_event_mask);
+ /* determine pnetids of the port */
+ smc_pnetid_by_dev_port(ibdev->dev.parent, i,
+ smcibdev->pnetid[i]);
+ }
+ schedule_work(&smcibdev->port_event_work);
}
/* callback function for ib_register_client() */
@@ -512,6 +558,7 @@ static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data)
spin_unlock(&smc_ib_devices.lock);
smc_pnet_remove_by_ibdev(smcibdev);
smc_ib_cleanup_per_ibdev(smcibdev);
+ ib_unregister_event_handler(&smcibdev->event_handler);
kfree(smcibdev);
}
diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h
index e90630dadf8e..bac7fd65a4c0 100644
--- a/net/smc/smc_ib.h
+++ b/net/smc/smc_ib.h
@@ -15,6 +15,7 @@
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <rdma/ib_verbs.h>
+#include <net/smc.h>
#define SMC_MAX_PORTS 2 /* Max # of ports */
#define SMC_GID_SIZE sizeof(union ib_gid)
@@ -39,7 +40,8 @@ struct smc_ib_device { /* ib-device infos for smc */
struct tasklet_struct recv_tasklet; /* called by recv cq handler */
char mac[SMC_MAX_PORTS][ETH_ALEN];
/* mac address per port*/
- union ib_gid gid[SMC_MAX_PORTS]; /* gid per port */
+ u8 pnetid[SMC_MAX_PORTS][SMC_MAX_PNETID_LEN];
+ /* pnetid per port */
u8 initialized : 1; /* ib dev CQ, evthdl done */
struct work_struct port_event_work;
unsigned long port_event_mask;
@@ -51,7 +53,6 @@ struct smc_link;
int smc_ib_register_client(void) __init;
void smc_ib_unregister_client(void);
bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport);
-int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport);
int smc_ib_buf_map_sg(struct smc_ib_device *smcibdev,
struct smc_buf_desc *buf_slot,
enum dma_data_direction data_direction);
@@ -75,4 +76,6 @@ void smc_ib_sync_sg_for_cpu(struct smc_ib_device *smcibdev,
void smc_ib_sync_sg_for_device(struct smc_ib_device *smcibdev,
struct smc_buf_desc *buf_slot,
enum dma_data_direction data_direction);
+int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport,
+ unsigned short vlan_id, u8 gid[], u8 *sgid_index);
#endif
diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c
new file mode 100644
index 000000000000..e36f21ce7252
--- /dev/null
+++ b/net/smc/smc_ism.c
@@ -0,0 +1,348 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Shared Memory Communications Direct over ISM devices (SMC-D)
+ *
+ * Functions for ISM device.
+ *
+ * Copyright IBM Corp. 2018
+ */
+
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <asm/page.h>
+
+#include "smc.h"
+#include "smc_core.h"
+#include "smc_ism.h"
+#include "smc_pnet.h"
+
+struct smcd_dev_list smcd_dev_list = {
+ .list = LIST_HEAD_INIT(smcd_dev_list.list),
+ .lock = __SPIN_LOCK_UNLOCKED(smcd_dev_list.lock)
+};
+
+/* Test if an ISM communication is possible. */
+int smc_ism_cantalk(u64 peer_gid, unsigned short vlan_id, struct smcd_dev *smcd)
+{
+ return smcd->ops->query_remote_gid(smcd, peer_gid, vlan_id ? 1 : 0,
+ vlan_id);
+}
+
+int smc_ism_write(struct smcd_dev *smcd, const struct smc_ism_position *pos,
+ void *data, size_t len)
+{
+ int rc;
+
+ rc = smcd->ops->move_data(smcd, pos->token, pos->index, pos->signal,
+ pos->offset, data, len);
+
+ return rc < 0 ? rc : 0;
+}
+
+/* Set a connection using this DMBE. */
+void smc_ism_set_conn(struct smc_connection *conn)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&conn->lgr->smcd->lock, flags);
+ conn->lgr->smcd->conn[conn->rmb_desc->sba_idx] = conn;
+ spin_unlock_irqrestore(&conn->lgr->smcd->lock, flags);
+}
+
+/* Unset a connection using this DMBE. */
+void smc_ism_unset_conn(struct smc_connection *conn)
+{
+ unsigned long flags;
+
+ if (!conn->rmb_desc)
+ return;
+
+ spin_lock_irqsave(&conn->lgr->smcd->lock, flags);
+ conn->lgr->smcd->conn[conn->rmb_desc->sba_idx] = NULL;
+ spin_unlock_irqrestore(&conn->lgr->smcd->lock, flags);
+}
+
+/* Register a VLAN identifier with the ISM device. Use a reference count
+ * and add a VLAN identifier only when the first DMB using this VLAN is
+ * registered.
+ */
+int smc_ism_get_vlan(struct smcd_dev *smcd, unsigned short vlanid)
+{
+ struct smc_ism_vlanid *new_vlan, *vlan;
+ unsigned long flags;
+ int rc = 0;
+
+ if (!vlanid) /* No valid vlan id */
+ return -EINVAL;
+
+ /* create new vlan entry, in case we need it */
+ new_vlan = kzalloc(sizeof(*new_vlan), GFP_KERNEL);
+ if (!new_vlan)
+ return -ENOMEM;
+ new_vlan->vlanid = vlanid;
+ refcount_set(&new_vlan->refcnt, 1);
+
+ /* if there is an existing entry, increase count and return */
+ spin_lock_irqsave(&smcd->lock, flags);
+ list_for_each_entry(vlan, &smcd->vlan, list) {
+ if (vlan->vlanid == vlanid) {
+ refcount_inc(&vlan->refcnt);
+ kfree(new_vlan);
+ goto out;
+ }
+ }
+
+ /* no existing entry found.
+ * add new entry to device; might fail, e.g., if HW limit reached
+ */
+ if (smcd->ops->add_vlan_id(smcd, vlanid)) {
+ kfree(new_vlan);
+ rc = -EIO;
+ goto out;
+ }
+ list_add_tail(&new_vlan->list, &smcd->vlan);
+out:
+ spin_unlock_irqrestore(&smcd->lock, flags);
+ return rc;
+}
+
+/* Unregister a VLAN identifier with the ISM device. Use a reference count
+ * and remove a VLAN identifier only when the last DMB using this VLAN is
+ * unregistered.
+ */
+int smc_ism_put_vlan(struct smcd_dev *smcd, unsigned short vlanid)
+{
+ struct smc_ism_vlanid *vlan;
+ unsigned long flags;
+ bool found = false;
+ int rc = 0;
+
+ if (!vlanid) /* No valid vlan id */
+ return -EINVAL;
+
+ spin_lock_irqsave(&smcd->lock, flags);
+ list_for_each_entry(vlan, &smcd->vlan, list) {
+ if (vlan->vlanid == vlanid) {
+ if (!refcount_dec_and_test(&vlan->refcnt))
+ goto out;
+ found = true;
+ break;
+ }
+ }
+ if (!found) {
+ rc = -ENOENT;
+ goto out; /* VLAN id not in table */
+ }
+
+ /* Found and the last reference just gone */
+ if (smcd->ops->del_vlan_id(smcd, vlanid))
+ rc = -EIO;
+ list_del(&vlan->list);
+ kfree(vlan);
+out:
+ spin_unlock_irqrestore(&smcd->lock, flags);
+ return rc;
+}
+
+int smc_ism_unregister_dmb(struct smcd_dev *smcd, struct smc_buf_desc *dmb_desc)
+{
+ struct smcd_dmb dmb;
+
+ memset(&dmb, 0, sizeof(dmb));
+ dmb.dmb_tok = dmb_desc->token;
+ dmb.sba_idx = dmb_desc->sba_idx;
+ dmb.cpu_addr = dmb_desc->cpu_addr;
+ dmb.dma_addr = dmb_desc->dma_addr;
+ dmb.dmb_len = dmb_desc->len;
+ return smcd->ops->unregister_dmb(smcd, &dmb);
+}
+
+int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len,
+ struct smc_buf_desc *dmb_desc)
+{
+ struct smcd_dmb dmb;
+ int rc;
+
+ memset(&dmb, 0, sizeof(dmb));
+ dmb.dmb_len = dmb_len;
+ dmb.sba_idx = dmb_desc->sba_idx;
+ dmb.vlan_id = lgr->vlan_id;
+ dmb.rgid = lgr->peer_gid;
+ rc = lgr->smcd->ops->register_dmb(lgr->smcd, &dmb);
+ if (!rc) {
+ dmb_desc->sba_idx = dmb.sba_idx;
+ dmb_desc->token = dmb.dmb_tok;
+ dmb_desc->cpu_addr = dmb.cpu_addr;
+ dmb_desc->dma_addr = dmb.dma_addr;
+ dmb_desc->len = dmb.dmb_len;
+ }
+ return rc;
+}
+
+struct smc_ism_event_work {
+ struct work_struct work;
+ struct smcd_dev *smcd;
+ struct smcd_event event;
+};
+
+#define ISM_EVENT_REQUEST 0x0001
+#define ISM_EVENT_RESPONSE 0x0002
+#define ISM_EVENT_REQUEST_IR 0x00000001
+#define ISM_EVENT_CODE_TESTLINK 0x83
+
+static void smcd_handle_sw_event(struct smc_ism_event_work *wrk)
+{
+ union {
+ u64 info;
+ struct {
+ u32 uid;
+ unsigned short vlanid;
+ u16 code;
+ };
+ } ev_info;
+
+ switch (wrk->event.code) {
+ case ISM_EVENT_CODE_TESTLINK: /* Activity timer */
+ ev_info.info = wrk->event.info;
+ if (ev_info.code == ISM_EVENT_REQUEST) {
+ ev_info.code = ISM_EVENT_RESPONSE;
+ wrk->smcd->ops->signal_event(wrk->smcd,
+ wrk->event.tok,
+ ISM_EVENT_REQUEST_IR,
+ ISM_EVENT_CODE_TESTLINK,
+ ev_info.info);
+ }
+ break;
+ }
+}
+
+/* worker for SMC-D events */
+static void smc_ism_event_work(struct work_struct *work)
+{
+ struct smc_ism_event_work *wrk =
+ container_of(work, struct smc_ism_event_work, work);
+
+ switch (wrk->event.type) {
+ case ISM_EVENT_GID: /* GID event, token is peer GID */
+ smc_smcd_terminate(wrk->smcd, wrk->event.tok);
+ break;
+ case ISM_EVENT_DMB:
+ break;
+ case ISM_EVENT_SWR: /* Software defined event */
+ smcd_handle_sw_event(wrk);
+ break;
+ }
+ kfree(wrk);
+}
+
+static void smcd_release(struct device *dev)
+{
+ struct smcd_dev *smcd = container_of(dev, struct smcd_dev, dev);
+
+ kfree(smcd->conn);
+ kfree(smcd);
+}
+
+struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name,
+ const struct smcd_ops *ops, int max_dmbs)
+{
+ struct smcd_dev *smcd;
+
+ smcd = kzalloc(sizeof(*smcd), GFP_KERNEL);
+ if (!smcd)
+ return NULL;
+ smcd->conn = kcalloc(max_dmbs, sizeof(struct smc_connection *),
+ GFP_KERNEL);
+ if (!smcd->conn) {
+ kfree(smcd);
+ return NULL;
+ }
+
+ smcd->dev.parent = parent;
+ smcd->dev.release = smcd_release;
+ device_initialize(&smcd->dev);
+ dev_set_name(&smcd->dev, name);
+ smcd->ops = ops;
+ smc_pnetid_by_dev_port(parent, 0, smcd->pnetid);
+
+ spin_lock_init(&smcd->lock);
+ INIT_LIST_HEAD(&smcd->vlan);
+ smcd->event_wq = alloc_ordered_workqueue("ism_evt_wq-%s)",
+ WQ_MEM_RECLAIM, name);
+ return smcd;
+}
+EXPORT_SYMBOL_GPL(smcd_alloc_dev);
+
+int smcd_register_dev(struct smcd_dev *smcd)
+{
+ spin_lock(&smcd_dev_list.lock);
+ list_add_tail(&smcd->list, &smcd_dev_list.list);
+ spin_unlock(&smcd_dev_list.lock);
+
+ return device_add(&smcd->dev);
+}
+EXPORT_SYMBOL_GPL(smcd_register_dev);
+
+void smcd_unregister_dev(struct smcd_dev *smcd)
+{
+ spin_lock(&smcd_dev_list.lock);
+ list_del(&smcd->list);
+ spin_unlock(&smcd_dev_list.lock);
+ flush_workqueue(smcd->event_wq);
+ destroy_workqueue(smcd->event_wq);
+ smc_smcd_terminate(smcd, 0);
+
+ device_del(&smcd->dev);
+}
+EXPORT_SYMBOL_GPL(smcd_unregister_dev);
+
+void smcd_free_dev(struct smcd_dev *smcd)
+{
+ put_device(&smcd->dev);
+}
+EXPORT_SYMBOL_GPL(smcd_free_dev);
+
+/* SMCD Device event handler. Called from ISM device interrupt handler.
+ * Parameters are smcd device pointer,
+ * - event->type (0 --> DMB, 1 --> GID),
+ * - event->code (event code),
+ * - event->tok (either DMB token when event type 0, or GID when event type 1)
+ * - event->time (time of day)
+ * - event->info (debug info).
+ *
+ * Context:
+ * - Function called in IRQ context from ISM device driver event handler.
+ */
+void smcd_handle_event(struct smcd_dev *smcd, struct smcd_event *event)
+{
+ struct smc_ism_event_work *wrk;
+
+ /* copy event to event work queue, and let it be handled there */
+ wrk = kmalloc(sizeof(*wrk), GFP_ATOMIC);
+ if (!wrk)
+ return;
+ INIT_WORK(&wrk->work, smc_ism_event_work);
+ wrk->smcd = smcd;
+ wrk->event = *event;
+ queue_work(smcd->event_wq, &wrk->work);
+}
+EXPORT_SYMBOL_GPL(smcd_handle_event);
+
+/* SMCD Device interrupt handler. Called from ISM device interrupt handler.
+ * Parameters are smcd device pointer and DMB number. Find the connection and
+ * schedule the tasklet for this connection.
+ *
+ * Context:
+ * - Function called in IRQ context from ISM device driver IRQ handler.
+ */
+void smcd_handle_irq(struct smcd_dev *smcd, unsigned int dmbno)
+{
+ struct smc_connection *conn = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&smcd->lock, flags);
+ conn = smcd->conn[dmbno];
+ if (conn)
+ tasklet_schedule(&conn->rx_tsklet);
+ spin_unlock_irqrestore(&smcd->lock, flags);
+}
+EXPORT_SYMBOL_GPL(smcd_handle_irq);
diff --git a/net/smc/smc_ism.h b/net/smc/smc_ism.h
new file mode 100644
index 000000000000..aee45b860b79
--- /dev/null
+++ b/net/smc/smc_ism.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Shared Memory Communications Direct over ISM devices (SMC-D)
+ *
+ * SMC-D ISM device structure definitions.
+ *
+ * Copyright IBM Corp. 2018
+ */
+
+#ifndef SMCD_ISM_H
+#define SMCD_ISM_H
+
+#include <linux/uio.h>
+
+#include "smc.h"
+
+struct smcd_dev_list { /* List of SMCD devices */
+ struct list_head list;
+ spinlock_t lock; /* Protects list of devices */
+};
+
+extern struct smcd_dev_list smcd_dev_list; /* list of smcd devices */
+
+struct smc_ism_vlanid { /* VLAN id set on ISM device */
+ struct list_head list;
+ unsigned short vlanid; /* Vlan id */
+ refcount_t refcnt; /* Reference count */
+};
+
+struct smc_ism_position { /* ISM device position to write to */
+ u64 token; /* Token of DMB */
+ u32 offset; /* Offset into DMBE */
+ u8 index; /* Index of DMBE */
+ u8 signal; /* Generate interrupt on owner side */
+};
+
+struct smcd_dev;
+
+int smc_ism_cantalk(u64 peer_gid, unsigned short vlan_id, struct smcd_dev *dev);
+void smc_ism_set_conn(struct smc_connection *conn);
+void smc_ism_unset_conn(struct smc_connection *conn);
+int smc_ism_get_vlan(struct smcd_dev *dev, unsigned short vlan_id);
+int smc_ism_put_vlan(struct smcd_dev *dev, unsigned short vlan_id);
+int smc_ism_register_dmb(struct smc_link_group *lgr, int buf_size,
+ struct smc_buf_desc *dmb_desc);
+int smc_ism_unregister_dmb(struct smcd_dev *dev, struct smc_buf_desc *dmb_desc);
+int smc_ism_write(struct smcd_dev *dev, const struct smc_ism_position *pos,
+ void *data, size_t len);
+#endif
diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c
index 5800a6b43d83..9c916c709ca7 100644
--- a/net/smc/smc_llc.c
+++ b/net/smc/smc_llc.c
@@ -182,12 +182,10 @@ static int smc_llc_add_pending_send(struct smc_link *link,
}
/* high-level API to send LLC confirm link */
-int smc_llc_send_confirm_link(struct smc_link *link, u8 mac[],
- union ib_gid *gid,
+int smc_llc_send_confirm_link(struct smc_link *link,
enum smc_llc_reqresp reqresp)
{
- struct smc_link_group *lgr = container_of(link, struct smc_link_group,
- lnk[SMC_SINGLE_LINK]);
+ struct smc_link_group *lgr = smc_get_lgr(link);
struct smc_llc_msg_confirm_link *confllc;
struct smc_wr_tx_pend_priv *pend;
struct smc_wr_buf *wr_buf;
@@ -203,8 +201,9 @@ int smc_llc_send_confirm_link(struct smc_link *link, u8 mac[],
confllc->hd.flags |= SMC_LLC_FLAG_NO_RMBE_EYEC;
if (reqresp == SMC_LLC_RESP)
confllc->hd.flags |= SMC_LLC_FLAG_RESP;
- memcpy(confllc->sender_mac, mac, ETH_ALEN);
- memcpy(confllc->sender_gid, gid, SMC_GID_SIZE);
+ memcpy(confllc->sender_mac, link->smcibdev->mac[link->ibport - 1],
+ ETH_ALEN);
+ memcpy(confllc->sender_gid, link->gid, SMC_GID_SIZE);
hton24(confllc->sender_qp_num, link->roce_qp->qp_num);
confllc->link_num = link->link_id;
memcpy(confllc->link_uid, lgr->id, SMC_LGR_ID_SIZE);
@@ -241,8 +240,7 @@ static int smc_llc_send_confirm_rkey(struct smc_link *link,
/* prepare an add link message */
static void smc_llc_prep_add_link(struct smc_llc_msg_add_link *addllc,
- struct smc_link *link, u8 mac[],
- union ib_gid *gid,
+ struct smc_link *link, u8 mac[], u8 gid[],
enum smc_llc_reqresp reqresp)
{
memset(addllc, 0, sizeof(*addllc));
@@ -259,8 +257,7 @@ static void smc_llc_prep_add_link(struct smc_llc_msg_add_link *addllc,
}
/* send ADD LINK request or response */
-int smc_llc_send_add_link(struct smc_link *link, u8 mac[],
- union ib_gid *gid,
+int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[],
enum smc_llc_reqresp reqresp)
{
struct smc_llc_msg_add_link *addllc;
@@ -281,7 +278,7 @@ int smc_llc_send_add_link(struct smc_link *link, u8 mac[],
/* prepare a delete link message */
static void smc_llc_prep_delete_link(struct smc_llc_msg_del_link *delllc,
struct smc_link *link,
- enum smc_llc_reqresp reqresp)
+ enum smc_llc_reqresp reqresp, bool orderly)
{
memset(delllc, 0, sizeof(*delllc));
delllc->hd.common.type = SMC_LLC_DELETE_LINK;
@@ -290,13 +287,14 @@ static void smc_llc_prep_delete_link(struct smc_llc_msg_del_link *delllc,
delllc->hd.flags |= SMC_LLC_FLAG_RESP;
/* DEL_LINK_ALL because only 1 link supported */
delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ALL;
- delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY;
+ if (orderly)
+ delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY;
delllc->link_num = link->link_id;
}
/* send DELETE LINK request or response */
int smc_llc_send_delete_link(struct smc_link *link,
- enum smc_llc_reqresp reqresp)
+ enum smc_llc_reqresp reqresp, bool orderly)
{
struct smc_llc_msg_del_link *delllc;
struct smc_wr_tx_pend_priv *pend;
@@ -307,7 +305,7 @@ int smc_llc_send_delete_link(struct smc_link *link,
if (rc)
return rc;
delllc = (struct smc_llc_msg_del_link *)wr_buf;
- smc_llc_prep_delete_link(delllc, link, reqresp);
+ smc_llc_prep_delete_link(delllc, link, reqresp, orderly);
/* send llc message */
rc = smc_wr_tx_send(link, pend);
return rc;
@@ -381,11 +379,9 @@ static int smc_llc_send_message(struct smc_link *link, void *llcbuf, int llclen)
static void smc_llc_rx_confirm_link(struct smc_link *link,
struct smc_llc_msg_confirm_link *llc)
{
- struct smc_link_group *lgr;
+ struct smc_link_group *lgr = smc_get_lgr(link);
int conf_rc;
- lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]);
-
/* RMBE eyecatchers are not supported */
if (llc->hd.flags & SMC_LLC_FLAG_NO_RMBE_EYEC)
conf_rc = 0;
@@ -411,8 +407,7 @@ static void smc_llc_rx_confirm_link(struct smc_link *link,
static void smc_llc_rx_add_link(struct smc_link *link,
struct smc_llc_msg_add_link *llc)
{
- struct smc_link_group *lgr = container_of(link, struct smc_link_group,
- lnk[SMC_SINGLE_LINK]);
+ struct smc_link_group *lgr = smc_get_lgr(link);
if (llc->hd.flags & SMC_LLC_FLAG_RESP) {
if (link->state == SMC_LNK_ACTIVATING)
@@ -426,14 +421,12 @@ static void smc_llc_rx_add_link(struct smc_link *link,
if (lgr->role == SMC_SERV) {
smc_llc_prep_add_link(llc, link,
link->smcibdev->mac[link->ibport - 1],
- &link->smcibdev->gid[link->ibport - 1],
- SMC_LLC_REQ);
+ link->gid, SMC_LLC_REQ);
} else {
smc_llc_prep_add_link(llc, link,
link->smcibdev->mac[link->ibport - 1],
- &link->smcibdev->gid[link->ibport - 1],
- SMC_LLC_RESP);
+ link->gid, SMC_LLC_RESP);
}
smc_llc_send_message(link, llc, sizeof(*llc));
}
@@ -442,22 +435,23 @@ static void smc_llc_rx_add_link(struct smc_link *link,
static void smc_llc_rx_delete_link(struct smc_link *link,
struct smc_llc_msg_del_link *llc)
{
- struct smc_link_group *lgr = container_of(link, struct smc_link_group,
- lnk[SMC_SINGLE_LINK]);
+ struct smc_link_group *lgr = smc_get_lgr(link);
if (llc->hd.flags & SMC_LLC_FLAG_RESP) {
if (lgr->role == SMC_SERV)
- smc_lgr_terminate(lgr);
+ smc_lgr_schedule_free_work_fast(lgr);
} else {
+ smc_lgr_forget(lgr);
+ smc_llc_link_deleting(link);
if (lgr->role == SMC_SERV) {
- smc_lgr_forget(lgr);
- smc_llc_prep_delete_link(llc, link, SMC_LLC_REQ);
- smc_llc_send_message(link, llc, sizeof(*llc));
+ /* client asks to delete this link, send request */
+ smc_llc_prep_delete_link(llc, link, SMC_LLC_REQ, true);
} else {
- smc_llc_prep_delete_link(llc, link, SMC_LLC_RESP);
- smc_llc_send_message(link, llc, sizeof(*llc));
- smc_lgr_terminate(lgr);
+ /* server requests to delete this link, send response */
+ smc_llc_prep_delete_link(llc, link, SMC_LLC_RESP, true);
}
+ smc_llc_send_message(link, llc, sizeof(*llc));
+ smc_lgr_schedule_free_work_fast(lgr);
}
}
@@ -476,17 +470,14 @@ static void smc_llc_rx_test_link(struct smc_link *link,
static void smc_llc_rx_confirm_rkey(struct smc_link *link,
struct smc_llc_msg_confirm_rkey *llc)
{
- struct smc_link_group *lgr;
int rc;
- lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]);
-
if (llc->hd.flags & SMC_LLC_FLAG_RESP) {
link->llc_confirm_rkey_rc = llc->hd.flags &
SMC_LLC_FLAG_RKEY_NEG;
complete(&link->llc_confirm_rkey);
} else {
- rc = smc_rtoken_add(lgr,
+ rc = smc_rtoken_add(smc_get_lgr(link),
llc->rtoken[0].rmb_vaddr,
llc->rtoken[0].rmb_key);
@@ -514,18 +505,15 @@ static void smc_llc_rx_confirm_rkey_cont(struct smc_link *link,
static void smc_llc_rx_delete_rkey(struct smc_link *link,
struct smc_llc_msg_delete_rkey *llc)
{
- struct smc_link_group *lgr;
u8 err_mask = 0;
int i, max;
- lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]);
-
if (llc->hd.flags & SMC_LLC_FLAG_RESP) {
/* unused as long as we don't send this type of msg */
} else {
max = min_t(u8, llc->num_rkeys, SMC_LLC_DEL_RKEY_MAX);
for (i = 0; i < max; i++) {
- if (smc_rtoken_delete(lgr, llc->rkey[i]))
+ if (smc_rtoken_delete(smc_get_lgr(link), llc->rkey[i]))
err_mask |= 1 << (SMC_LLC_DEL_RKEY_MAX - 1 - i);
}
@@ -583,12 +571,10 @@ static void smc_llc_testlink_work(struct work_struct *work)
struct smc_link *link = container_of(to_delayed_work(work),
struct smc_link, llc_testlink_wrk);
unsigned long next_interval;
- struct smc_link_group *lgr;
unsigned long expire_time;
u8 user_data[16] = { 0 };
int rc;
- lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]);
if (link->state != SMC_LNK_ACTIVE)
return; /* don't reschedule worker */
expire_time = link->wr_rx_tstamp + link->llc_testlink_time;
@@ -602,7 +588,7 @@ static void smc_llc_testlink_work(struct work_struct *work)
rc = wait_for_completion_interruptible_timeout(&link->llc_testlink_resp,
SMC_LLC_WAIT_TIME);
if (rc <= 0) {
- smc_lgr_terminate(lgr);
+ smc_lgr_terminate(smc_get_lgr(link));
return;
}
next_interval = link->llc_testlink_time;
@@ -613,8 +599,7 @@ out:
int smc_llc_link_init(struct smc_link *link)
{
- struct smc_link_group *lgr = container_of(link, struct smc_link_group,
- lnk[SMC_SINGLE_LINK]);
+ struct smc_link_group *lgr = smc_get_lgr(link);
link->llc_wq = alloc_ordered_workqueue("llc_wq-%x:%x)", WQ_MEM_RECLAIM,
*((u32 *)lgr->id),
link->link_id);
@@ -640,6 +625,11 @@ void smc_llc_link_active(struct smc_link *link, int testlink_time)
}
}
+void smc_llc_link_deleting(struct smc_link *link)
+{
+ link->state = SMC_LNK_DELETING;
+}
+
/* called in tasklet context */
void smc_llc_link_inactive(struct smc_link *link)
{
diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h
index 65c8645e96a1..9e2ff088e301 100644
--- a/net/smc/smc_llc.h
+++ b/net/smc/smc_llc.h
@@ -36,14 +36,15 @@ enum smc_llc_msg_type {
};
/* transmit */
-int smc_llc_send_confirm_link(struct smc_link *lnk, u8 mac[], union ib_gid *gid,
+int smc_llc_send_confirm_link(struct smc_link *lnk,
enum smc_llc_reqresp reqresp);
-int smc_llc_send_add_link(struct smc_link *link, u8 mac[], union ib_gid *gid,
+int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[],
enum smc_llc_reqresp reqresp);
int smc_llc_send_delete_link(struct smc_link *link,
- enum smc_llc_reqresp reqresp);
+ enum smc_llc_reqresp reqresp, bool orderly);
int smc_llc_link_init(struct smc_link *link);
void smc_llc_link_active(struct smc_link *link, int testlink_time);
+void smc_llc_link_deleting(struct smc_link *link);
void smc_llc_link_inactive(struct smc_link *link);
void smc_llc_link_clear(struct smc_link *link);
int smc_llc_do_confirm_rkey(struct smc_link *link,
diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c
index d7b88b2d1b22..7cb3e4f07c10 100644
--- a/net/smc/smc_pnet.c
+++ b/net/smc/smc_pnet.c
@@ -22,13 +22,12 @@
#include "smc_pnet.h"
#include "smc_ib.h"
-
-#define SMC_MAX_PNET_ID_LEN 16 /* Max. length of PNET id */
+#include "smc_ism.h"
static struct nla_policy smc_pnet_policy[SMC_PNETID_MAX + 1] = {
[SMC_PNETID_NAME] = {
.type = NLA_NUL_STRING,
- .len = SMC_MAX_PNET_ID_LEN - 1
+ .len = SMC_MAX_PNETID_LEN - 1
},
[SMC_PNETID_ETHNAME] = {
.type = NLA_NUL_STRING,
@@ -65,7 +64,7 @@ static struct smc_pnettable {
*/
struct smc_pnetentry {
struct list_head list;
- char pnet_name[SMC_MAX_PNET_ID_LEN + 1];
+ char pnet_name[SMC_MAX_PNETID_LEN + 1];
struct net_device *ndev;
struct smc_ib_device *smcibdev;
u8 ib_port;
@@ -209,7 +208,7 @@ static bool smc_pnetid_valid(const char *pnet_name, char *pnetid)
return false;
while (--end >= bf && isspace(*end))
;
- if (end - bf >= SMC_MAX_PNET_ID_LEN)
+ if (end - bf >= SMC_MAX_PNETID_LEN)
return false;
while (bf <= end) {
if (!isalnum(*bf))
@@ -358,9 +357,6 @@ static int smc_pnet_add(struct sk_buff *skb, struct genl_info *info)
kfree(pnetelem);
return rc;
}
- rc = smc_ib_remember_port_attr(pnetelem->smcibdev, pnetelem->ib_port);
- if (rc)
- smc_pnet_remove_by_pnetid(pnetelem->pnet_name);
return rc;
}
@@ -465,7 +461,7 @@ static const struct genl_ops smc_pnet_ops[] = {
};
/* SMC_PNETID family definition */
-static struct genl_family smc_pnet_nl_family = {
+static struct genl_family smc_pnet_nl_family __ro_after_init = {
.hdrsize = 0,
.name = SMCR_GENL_FAMILY_NAME,
.version = SMCR_GENL_FAMILY_VERSION,
@@ -485,10 +481,10 @@ static int smc_pnet_netdev_event(struct notifier_block *this,
case NETDEV_REBOOT:
case NETDEV_UNREGISTER:
smc_pnet_remove_by_ndev(event_dev);
+ return NOTIFY_OK;
default:
- break;
+ return NOTIFY_DONE;
}
- return NOTIFY_DONE;
}
static struct notifier_block smc_netdev_notifier = {
@@ -515,28 +511,104 @@ void smc_pnet_exit(void)
genl_unregister_family(&smc_pnet_nl_family);
}
-/* PNET table analysis for a given sock:
- * determine ib_device and port belonging to used internal TCP socket
- * ethernet interface.
+/* Determine one base device for stacked net devices.
+ * If the lower device level contains more than one devices
+ * (for instance with bonding slaves), just the first device
+ * is used to reach a base device.
*/
-void smc_pnet_find_roce_resource(struct sock *sk,
- struct smc_ib_device **smcibdev, u8 *ibport)
+static struct net_device *pnet_find_base_ndev(struct net_device *ndev)
{
- struct dst_entry *dst = sk_dst_get(sk);
- struct smc_pnetentry *pnetelem;
+ int i, nest_lvl;
- *smcibdev = NULL;
- *ibport = 0;
+ rtnl_lock();
+ nest_lvl = dev_get_nest_level(ndev);
+ for (i = 0; i < nest_lvl; i++) {
+ struct list_head *lower = &ndev->adj_list.lower;
+
+ if (list_empty(lower))
+ break;
+ lower = lower->next;
+ ndev = netdev_lower_get_next(ndev, &lower);
+ }
+ rtnl_unlock();
+ return ndev;
+}
+
+/* Determine the corresponding IB device port based on the hardware PNETID.
+ * Searching stops at the first matching active IB device port with vlan_id
+ * configured.
+ */
+static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev,
+ struct smc_ib_device **smcibdev,
+ u8 *ibport, unsigned short vlan_id,
+ u8 gid[])
+{
+ u8 ndev_pnetid[SMC_MAX_PNETID_LEN];
+ struct smc_ib_device *ibdev;
+ int i;
+
+ ndev = pnet_find_base_ndev(ndev);
+ if (smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port,
+ ndev_pnetid))
+ return; /* pnetid could not be determined */
+
+ spin_lock(&smc_ib_devices.lock);
+ list_for_each_entry(ibdev, &smc_ib_devices.list, list) {
+ for (i = 1; i <= SMC_MAX_PORTS; i++) {
+ if (!rdma_is_port_valid(ibdev->ibdev, i))
+ continue;
+ if (!memcmp(ibdev->pnetid[i - 1], ndev_pnetid,
+ SMC_MAX_PNETID_LEN) &&
+ smc_ib_port_active(ibdev, i) &&
+ !smc_ib_determine_gid(ibdev, i, vlan_id, gid,
+ NULL)) {
+ *smcibdev = ibdev;
+ *ibport = i;
+ goto out;
+ }
+ }
+ }
+out:
+ spin_unlock(&smc_ib_devices.lock);
+}
+
+static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev,
+ struct smcd_dev **smcismdev)
+{
+ u8 ndev_pnetid[SMC_MAX_PNETID_LEN];
+ struct smcd_dev *ismdev;
+
+ ndev = pnet_find_base_ndev(ndev);
+ if (smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port,
+ ndev_pnetid))
+ return; /* pnetid could not be determined */
+
+ spin_lock(&smcd_dev_list.lock);
+ list_for_each_entry(ismdev, &smcd_dev_list.list, list) {
+ if (!memcmp(ismdev->pnetid, ndev_pnetid, SMC_MAX_PNETID_LEN)) {
+ *smcismdev = ismdev;
+ break;
+ }
+ }
+ spin_unlock(&smcd_dev_list.lock);
+}
+
+/* Lookup of coupled ib_device via SMC pnet table */
+static void smc_pnet_find_roce_by_table(struct net_device *netdev,
+ struct smc_ib_device **smcibdev,
+ u8 *ibport, unsigned short vlan_id,
+ u8 gid[])
+{
+ struct smc_pnetentry *pnetelem;
- if (!dst)
- return;
- if (!dst->dev)
- goto out_rel;
read_lock(&smc_pnettable.lock);
list_for_each_entry(pnetelem, &smc_pnettable.pnetlist, list) {
- if (dst->dev == pnetelem->ndev) {
+ if (netdev == pnetelem->ndev) {
if (smc_ib_port_active(pnetelem->smcibdev,
- pnetelem->ib_port)) {
+ pnetelem->ib_port) &&
+ !smc_ib_determine_gid(pnetelem->smcibdev,
+ pnetelem->ib_port, vlan_id,
+ gid, NULL)) {
*smcibdev = pnetelem->smcibdev;
*ibport = pnetelem->ib_port;
}
@@ -544,6 +616,55 @@ void smc_pnet_find_roce_resource(struct sock *sk,
}
}
read_unlock(&smc_pnettable.lock);
+}
+
+/* PNET table analysis for a given sock:
+ * determine ib_device and port belonging to used internal TCP socket
+ * ethernet interface.
+ */
+void smc_pnet_find_roce_resource(struct sock *sk,
+ struct smc_ib_device **smcibdev, u8 *ibport,
+ unsigned short vlan_id, u8 gid[])
+{
+ struct dst_entry *dst = sk_dst_get(sk);
+
+ *smcibdev = NULL;
+ *ibport = 0;
+
+ if (!dst)
+ goto out;
+ if (!dst->dev)
+ goto out_rel;
+
+ /* if possible, lookup via hardware-defined pnetid */
+ smc_pnet_find_roce_by_pnetid(dst->dev, smcibdev, ibport, vlan_id, gid);
+ if (*smcibdev)
+ goto out_rel;
+
+ /* lookup via SMC PNET table */
+ smc_pnet_find_roce_by_table(dst->dev, smcibdev, ibport, vlan_id, gid);
+
+out_rel:
+ dst_release(dst);
+out:
+ return;
+}
+
+void smc_pnet_find_ism_resource(struct sock *sk, struct smcd_dev **smcismdev)
+{
+ struct dst_entry *dst = sk_dst_get(sk);
+
+ *smcismdev = NULL;
+ if (!dst)
+ goto out;
+ if (!dst->dev)
+ goto out_rel;
+
+ /* if possible, lookup via hardware-defined pnetid */
+ smc_pnet_find_ism_by_pnetid(dst->dev, smcismdev);
+
out_rel:
dst_release(dst);
+out:
+ return;
}
diff --git a/net/smc/smc_pnet.h b/net/smc/smc_pnet.h
index 5a29519db976..8ff777636e32 100644
--- a/net/smc/smc_pnet.h
+++ b/net/smc/smc_pnet.h
@@ -12,12 +12,29 @@
#ifndef _SMC_PNET_H
#define _SMC_PNET_H
+#if IS_ENABLED(CONFIG_HAVE_PNETID)
+#include <asm/pnet.h>
+#endif
+
struct smc_ib_device;
+struct smcd_dev;
+
+static inline int smc_pnetid_by_dev_port(struct device *dev,
+ unsigned short port, u8 *pnetid)
+{
+#if IS_ENABLED(CONFIG_HAVE_PNETID)
+ return pnet_id_by_dev_port(dev, port, pnetid);
+#else
+ return -ENOENT;
+#endif
+}
int smc_pnet_init(void) __init;
void smc_pnet_exit(void);
int smc_pnet_remove_by_ibdev(struct smc_ib_device *ibdev);
void smc_pnet_find_roce_resource(struct sock *sk,
- struct smc_ib_device **smcibdev, u8 *ibport);
+ struct smc_ib_device **smcibdev, u8 *ibport,
+ unsigned short vlan_id, u8 gid[]);
+void smc_pnet_find_ism_resource(struct sock *sk, struct smcd_dev **smcismdev);
#endif
diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c
index 3d77b383cccd..bbcf0fe4ae10 100644
--- a/net/smc/smc_rx.c
+++ b/net/smc/smc_rx.c
@@ -82,8 +82,7 @@ static int smc_rx_update_consumer(struct smc_sock *smc,
}
}
- smc_curs_write(&conn->local_tx_ctrl.cons, smc_curs_read(&cons, conn),
- conn);
+ smc_curs_copy(&conn->local_tx_ctrl.cons, &cons, conn);
/* send consumer cursor update if required */
/* similar to advertising new TCP rcv_wnd if required */
@@ -97,8 +96,7 @@ static void smc_rx_update_cons(struct smc_sock *smc, size_t len)
struct smc_connection *conn = &smc->conn;
union smc_host_cursor cons;
- smc_curs_write(&cons, smc_curs_read(&conn->local_tx_ctrl.cons, conn),
- conn);
+ smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn);
smc_rx_update_consumer(smc, cons, len);
}
@@ -157,10 +155,8 @@ static int smc_rx_splice(struct pipe_inode_info *pipe, char *src, size_t len,
struct splice_pipe_desc spd;
struct partial_page partial;
struct smc_spd_priv *priv;
- struct page *page;
int bytes;
- page = virt_to_page(smc->conn.rmb_desc->cpu_addr);
priv = kzalloc(sizeof(*priv), GFP_KERNEL);
if (!priv)
return -ENOMEM;
@@ -172,7 +168,7 @@ static int smc_rx_splice(struct pipe_inode_info *pipe, char *src, size_t len,
spd.nr_pages_max = 1;
spd.nr_pages = 1;
- spd.pages = &page;
+ spd.pages = &smc->conn.rmb_desc->pages;
spd.partial = &partial;
spd.ops = &smc_pipe_ops;
spd.spd_release = smc_rx_spd_release;
@@ -245,10 +241,7 @@ static int smc_rx_recv_urg(struct smc_sock *smc, struct msghdr *msg, int len,
if (!(flags & MSG_TRUNC))
rc = memcpy_to_msg(msg, &conn->urg_rx_byte, 1);
len = 1;
- smc_curs_write(&cons,
- smc_curs_read(&conn->local_tx_ctrl.cons,
- conn),
- conn);
+ smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn);
if (smc_curs_diff(conn->rmb_desc->len, &cons,
&conn->urg_curs) > 1)
conn->urg_rx_skip_pend = true;
@@ -305,7 +298,7 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg,
target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
/* we currently use 1 RMBE per RMB, so RMBE == RMB base addr */
- rcvbuf_base = conn->rmb_desc->cpu_addr;
+ rcvbuf_base = conn->rx_off + conn->rmb_desc->cpu_addr;
do { /* while (read_remaining) */
if (read_done >= target || (pipe && read_done))
@@ -370,9 +363,7 @@ copy:
continue;
}
- smc_curs_write(&cons,
- smc_curs_read(&conn->local_tx_ctrl.cons, conn),
- conn);
+ smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn);
/* subsequent splice() calls pick up where previous left */
if (splbytes)
smc_curs_add(conn->rmb_desc->len, &cons, splbytes);
diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
index f82886b7d1d8..d8366ed51757 100644
--- a/net/smc/smc_tx.c
+++ b/net/smc/smc_tx.c
@@ -24,6 +24,7 @@
#include "smc.h"
#include "smc_wr.h"
#include "smc_cdc.h"
+#include "smc_ism.h"
#include "smc_tx.h"
#define SMC_TX_WORK_DELAY HZ
@@ -180,9 +181,7 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
copylen = min_t(size_t, send_remaining, writespace);
/* determine start of sndbuf */
sndbuf_base = conn->sndbuf_desc->cpu_addr;
- smc_curs_write(&prep,
- smc_curs_read(&conn->tx_curs_prep, conn),
- conn);
+ smc_curs_copy(&prep, &conn->tx_curs_prep, conn);
tx_cnt_prep = prep.count;
/* determine chunks where to write into sndbuf */
/* either unwrapped case, or 1st chunk of wrapped case */
@@ -213,9 +212,7 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
smc_sndbuf_sync_sg_for_device(conn);
/* update cursors */
smc_curs_add(conn->sndbuf_desc->len, &prep, copylen);
- smc_curs_write(&conn->tx_curs_prep,
- smc_curs_read(&prep, conn),
- conn);
+ smc_curs_copy(&conn->tx_curs_prep, &prep, conn);
/* increased in send tasklet smc_cdc_tx_handler() */
smp_mb__before_atomic();
atomic_sub(copylen, &conn->sndbuf_space);
@@ -250,12 +247,29 @@ out_err:
/***************************** sndbuf consumer *******************************/
+/* sndbuf consumer: actual data transfer of one target chunk with ISM write */
+int smcd_tx_ism_write(struct smc_connection *conn, void *data, size_t len,
+ u32 offset, int signal)
+{
+ struct smc_ism_position pos;
+ int rc;
+
+ memset(&pos, 0, sizeof(pos));
+ pos.token = conn->peer_token;
+ pos.index = conn->peer_rmbe_idx;
+ pos.offset = conn->tx_off + offset;
+ pos.signal = signal;
+ rc = smc_ism_write(conn->lgr->smcd, &pos, data, len);
+ if (rc)
+ conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
+ return rc;
+}
+
/* sndbuf consumer: actual data transfer of one target chunk with RDMA write */
static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset,
int num_sges, struct ib_sge sges[])
{
struct smc_link_group *lgr = conn->lgr;
- struct ib_send_wr *failed_wr = NULL;
struct ib_rdma_wr rdma_wr;
struct smc_link *link;
int rc;
@@ -273,7 +287,7 @@ static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset,
/* offset within RMBE */
peer_rmbe_offset;
rdma_wr.rkey = lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey;
- rc = ib_post_send(link->roce_qp, &rdma_wr.wr, &failed_wr);
+ rc = ib_post_send(link->roce_qp, &rdma_wr.wr, NULL);
if (rc) {
conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
smc_lgr_terminate(lgr);
@@ -297,26 +311,109 @@ static inline void smc_tx_advance_cursors(struct smc_connection *conn,
smc_curs_add(conn->sndbuf_desc->len, sent, len);
}
+/* SMC-R helper for smc_tx_rdma_writes() */
+static int smcr_tx_rdma_writes(struct smc_connection *conn, size_t len,
+ size_t src_off, size_t src_len,
+ size_t dst_off, size_t dst_len)
+{
+ dma_addr_t dma_addr =
+ sg_dma_address(conn->sndbuf_desc->sgt[SMC_SINGLE_LINK].sgl);
+ struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK];
+ int src_len_sum = src_len, dst_len_sum = dst_len;
+ struct ib_sge sges[SMC_IB_MAX_SEND_SGE];
+ int sent_count = src_off;
+ int srcchunk, dstchunk;
+ int num_sges;
+ int rc;
+
+ for (dstchunk = 0; dstchunk < 2; dstchunk++) {
+ num_sges = 0;
+ for (srcchunk = 0; srcchunk < 2; srcchunk++) {
+ sges[srcchunk].addr = dma_addr + src_off;
+ sges[srcchunk].length = src_len;
+ sges[srcchunk].lkey = link->roce_pd->local_dma_lkey;
+ num_sges++;
+
+ src_off += src_len;
+ if (src_off >= conn->sndbuf_desc->len)
+ src_off -= conn->sndbuf_desc->len;
+ /* modulo in send ring */
+ if (src_len_sum == dst_len)
+ break; /* either on 1st or 2nd iteration */
+ /* prepare next (== 2nd) iteration */
+ src_len = dst_len - src_len; /* remainder */
+ src_len_sum += src_len;
+ }
+ rc = smc_tx_rdma_write(conn, dst_off, num_sges, sges);
+ if (rc)
+ return rc;
+ if (dst_len_sum == len)
+ break; /* either on 1st or 2nd iteration */
+ /* prepare next (== 2nd) iteration */
+ dst_off = 0; /* modulo offset in RMBE ring buffer */
+ dst_len = len - dst_len; /* remainder */
+ dst_len_sum += dst_len;
+ src_len = min_t(int, dst_len, conn->sndbuf_desc->len -
+ sent_count);
+ src_len_sum = src_len;
+ }
+ return 0;
+}
+
+/* SMC-D helper for smc_tx_rdma_writes() */
+static int smcd_tx_rdma_writes(struct smc_connection *conn, size_t len,
+ size_t src_off, size_t src_len,
+ size_t dst_off, size_t dst_len)
+{
+ int src_len_sum = src_len, dst_len_sum = dst_len;
+ int srcchunk, dstchunk;
+ int rc;
+
+ for (dstchunk = 0; dstchunk < 2; dstchunk++) {
+ for (srcchunk = 0; srcchunk < 2; srcchunk++) {
+ void *data = conn->sndbuf_desc->cpu_addr + src_off;
+
+ rc = smcd_tx_ism_write(conn, data, src_len, dst_off +
+ sizeof(struct smcd_cdc_msg), 0);
+ if (rc)
+ return rc;
+ dst_off += src_len;
+ src_off += src_len;
+ if (src_off >= conn->sndbuf_desc->len)
+ src_off -= conn->sndbuf_desc->len;
+ /* modulo in send ring */
+ if (src_len_sum == dst_len)
+ break; /* either on 1st or 2nd iteration */
+ /* prepare next (== 2nd) iteration */
+ src_len = dst_len - src_len; /* remainder */
+ src_len_sum += src_len;
+ }
+ if (dst_len_sum == len)
+ break; /* either on 1st or 2nd iteration */
+ /* prepare next (== 2nd) iteration */
+ dst_off = 0; /* modulo offset in RMBE ring buffer */
+ dst_len = len - dst_len; /* remainder */
+ dst_len_sum += dst_len;
+ src_len = min_t(int, dst_len, conn->sndbuf_desc->len - src_off);
+ src_len_sum = src_len;
+ }
+ return 0;
+}
+
/* sndbuf consumer: prepare all necessary (src&dst) chunks of data transmit;
* usable snd_wnd as max transmit
*/
static int smc_tx_rdma_writes(struct smc_connection *conn)
{
- size_t src_off, src_len, dst_off, dst_len; /* current chunk values */
- size_t len, dst_len_sum, src_len_sum, dstchunk, srcchunk;
+ size_t len, src_len, dst_off, dst_len; /* current chunk values */
union smc_host_cursor sent, prep, prod, cons;
- struct ib_sge sges[SMC_IB_MAX_SEND_SGE];
- struct smc_link_group *lgr = conn->lgr;
struct smc_cdc_producer_flags *pflags;
int to_send, rmbespace;
- struct smc_link *link;
- dma_addr_t dma_addr;
- int num_sges;
int rc;
/* source: sndbuf */
- smc_curs_write(&sent, smc_curs_read(&conn->tx_curs_sent, conn), conn);
- smc_curs_write(&prep, smc_curs_read(&conn->tx_curs_prep, conn), conn);
+ smc_curs_copy(&sent, &conn->tx_curs_sent, conn);
+ smc_curs_copy(&prep, &conn->tx_curs_prep, conn);
/* cf. wmem_alloc - (snd_max - snd_una) */
to_send = smc_curs_diff(conn->sndbuf_desc->len, &sent, &prep);
if (to_send <= 0)
@@ -327,12 +424,8 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
rmbespace = atomic_read(&conn->peer_rmbe_space);
if (rmbespace <= 0)
return 0;
- smc_curs_write(&prod,
- smc_curs_read(&conn->local_tx_ctrl.prod, conn),
- conn);
- smc_curs_write(&cons,
- smc_curs_read(&conn->local_rx_ctrl.cons, conn),
- conn);
+ smc_curs_copy(&prod, &conn->local_tx_ctrl.prod, conn);
+ smc_curs_copy(&cons, &conn->local_rx_ctrl.cons, conn);
/* if usable snd_wnd closes ask peer to advertise once it opens again */
pflags = &conn->local_tx_ctrl.prod_flags;
@@ -341,7 +434,6 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
len = min(to_send, rmbespace);
/* initialize variables for first iteration of subsequent nested loop */
- link = &lgr->lnk[SMC_SINGLE_LINK];
dst_off = prod.count;
if (prod.wrap == cons.wrap) {
/* the filled destination area is unwrapped,
@@ -358,8 +450,6 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
*/
dst_len = len;
}
- dst_len_sum = dst_len;
- src_off = sent.count;
/* dst_len determines the maximum src_len */
if (sent.count + dst_len <= conn->sndbuf_desc->len) {
/* unwrapped src case: single chunk of entire dst_len */
@@ -368,51 +458,23 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
/* wrapped src case: 2 chunks of sum dst_len; start with 1st: */
src_len = conn->sndbuf_desc->len - sent.count;
}
- src_len_sum = src_len;
- dma_addr = sg_dma_address(conn->sndbuf_desc->sgt[SMC_SINGLE_LINK].sgl);
- for (dstchunk = 0; dstchunk < 2; dstchunk++) {
- num_sges = 0;
- for (srcchunk = 0; srcchunk < 2; srcchunk++) {
- sges[srcchunk].addr = dma_addr + src_off;
- sges[srcchunk].length = src_len;
- sges[srcchunk].lkey = link->roce_pd->local_dma_lkey;
- num_sges++;
- src_off += src_len;
- if (src_off >= conn->sndbuf_desc->len)
- src_off -= conn->sndbuf_desc->len;
- /* modulo in send ring */
- if (src_len_sum == dst_len)
- break; /* either on 1st or 2nd iteration */
- /* prepare next (== 2nd) iteration */
- src_len = dst_len - src_len; /* remainder */
- src_len_sum += src_len;
- }
- rc = smc_tx_rdma_write(conn, dst_off, num_sges, sges);
- if (rc)
- return rc;
- if (dst_len_sum == len)
- break; /* either on 1st or 2nd iteration */
- /* prepare next (== 2nd) iteration */
- dst_off = 0; /* modulo offset in RMBE ring buffer */
- dst_len = len - dst_len; /* remainder */
- dst_len_sum += dst_len;
- src_len = min_t(int,
- dst_len, conn->sndbuf_desc->len - sent.count);
- src_len_sum = src_len;
- }
+
+ if (conn->lgr->is_smcd)
+ rc = smcd_tx_rdma_writes(conn, len, sent.count, src_len,
+ dst_off, dst_len);
+ else
+ rc = smcr_tx_rdma_writes(conn, len, sent.count, src_len,
+ dst_off, dst_len);
+ if (rc)
+ return rc;
if (conn->urg_tx_pend && len == to_send)
pflags->urg_data_present = 1;
smc_tx_advance_cursors(conn, &prod, &sent, len);
/* update connection's cursors with advanced local cursors */
- smc_curs_write(&conn->local_tx_ctrl.prod,
- smc_curs_read(&prod, conn),
- conn);
+ smc_curs_copy(&conn->local_tx_ctrl.prod, &prod, conn);
/* dst: peer RMBE */
- smc_curs_write(&conn->tx_curs_sent,
- smc_curs_read(&sent, conn),
- conn);
- /* src: local sndbuf */
+ smc_curs_copy(&conn->tx_curs_sent, &sent, conn);/* src: local sndbuf */
return 0;
}
@@ -420,7 +482,7 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
/* Wakeup sndbuf consumers from any context (IRQ or process)
* since there is more data to transmit; usable snd_wnd as max transmit
*/
-int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
+static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn)
{
struct smc_cdc_producer_flags *pflags;
struct smc_cdc_tx_pend *pend;
@@ -467,6 +529,37 @@ out_unlock:
return rc;
}
+static int smcd_tx_sndbuf_nonempty(struct smc_connection *conn)
+{
+ struct smc_cdc_producer_flags *pflags = &conn->local_tx_ctrl.prod_flags;
+ int rc = 0;
+
+ spin_lock_bh(&conn->send_lock);
+ if (!pflags->urg_data_present)
+ rc = smc_tx_rdma_writes(conn);
+ if (!rc)
+ rc = smcd_cdc_msg_send(conn);
+
+ if (!rc && pflags->urg_data_present) {
+ pflags->urg_data_pending = 0;
+ pflags->urg_data_present = 0;
+ }
+ spin_unlock_bh(&conn->send_lock);
+ return rc;
+}
+
+int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
+{
+ int rc;
+
+ if (conn->lgr->is_smcd)
+ rc = smcd_tx_sndbuf_nonempty(conn);
+ else
+ rc = smcr_tx_sndbuf_nonempty(conn);
+
+ return rc;
+}
+
/* Wakeup sndbuf consumers from process context
* since there is more data to transmit
*/
@@ -499,17 +592,11 @@ void smc_tx_consumer_update(struct smc_connection *conn, bool force)
int sender_free = conn->rmb_desc->len;
int to_confirm;
- smc_curs_write(&cons,
- smc_curs_read(&conn->local_tx_ctrl.cons, conn),
- conn);
- smc_curs_write(&cfed,
- smc_curs_read(&conn->rx_curs_confirmed, conn),
- conn);
+ smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn);
+ smc_curs_copy(&cfed, &conn->rx_curs_confirmed, conn);
to_confirm = smc_curs_diff(conn->rmb_desc->len, &cfed, &cons);
if (to_confirm > conn->rmbe_update_limit) {
- smc_curs_write(&prod,
- smc_curs_read(&conn->local_rx_ctrl.prod, conn),
- conn);
+ smc_curs_copy(&prod, &conn->local_rx_ctrl.prod, conn);
sender_free = conn->rmb_desc->len -
smc_curs_diff(conn->rmb_desc->len, &prod, &cfed);
}
@@ -525,9 +612,8 @@ void smc_tx_consumer_update(struct smc_connection *conn, bool force)
SMC_TX_WORK_DELAY);
return;
}
- smc_curs_write(&conn->rx_curs_confirmed,
- smc_curs_read(&conn->local_tx_ctrl.cons, conn),
- conn);
+ smc_curs_copy(&conn->rx_curs_confirmed,
+ &conn->local_tx_ctrl.cons, conn);
conn->local_rx_ctrl.prod_flags.cons_curs_upd_req = 0;
}
if (conn->local_rx_ctrl.prod_flags.write_blocked &&
diff --git a/net/smc/smc_tx.h b/net/smc/smc_tx.h
index 9d2238909fa0..07e6ad76224a 100644
--- a/net/smc/smc_tx.h
+++ b/net/smc/smc_tx.h
@@ -22,8 +22,8 @@ static inline int smc_tx_prepared_sends(struct smc_connection *conn)
{
union smc_host_cursor sent, prep;
- smc_curs_write(&sent, smc_curs_read(&conn->tx_curs_sent, conn), conn);
- smc_curs_write(&prep, smc_curs_read(&conn->tx_curs_prep, conn), conn);
+ smc_curs_copy(&sent, &conn->tx_curs_sent, conn);
+ smc_curs_copy(&prep, &conn->tx_curs_prep, conn);
return smc_curs_diff(conn->sndbuf_desc->len, &sent, &prep);
}
@@ -33,5 +33,7 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len);
int smc_tx_sndbuf_nonempty(struct smc_connection *conn);
void smc_tx_sndbuf_nonfull(struct smc_sock *smc);
void smc_tx_consumer_update(struct smc_connection *conn, bool force);
+int smcd_tx_ism_write(struct smc_connection *conn, void *data, size_t len,
+ u32 offset, int signal);
#endif /* SMC_TX_H */
diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
index dbd2605d1962..3c458d279855 100644
--- a/net/smc/smc_wr.c
+++ b/net/smc/smc_wr.c
@@ -92,8 +92,6 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc)
if (!test_and_clear_bit(pnd_snd_idx, link->wr_tx_mask))
return;
if (wc->status) {
- struct smc_link_group *lgr;
-
for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) {
/* clear full struct smc_wr_tx_pend including .priv */
memset(&link->wr_tx_pends[i], 0,
@@ -103,9 +101,7 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc)
clear_bit(i, link->wr_tx_mask);
}
/* terminate connections of this link group abnormally */
- lgr = container_of(link, struct smc_link_group,
- lnk[SMC_SINGLE_LINK]);
- smc_lgr_terminate(lgr);
+ smc_lgr_terminate(smc_get_lgr(link));
}
if (pnd_snd.handler)
pnd_snd.handler(&pnd_snd.priv, link, wc->status);
@@ -186,18 +182,14 @@ int smc_wr_tx_get_free_slot(struct smc_link *link,
if (rc)
return rc;
} else {
- struct smc_link_group *lgr;
-
- lgr = container_of(link, struct smc_link_group,
- lnk[SMC_SINGLE_LINK]);
rc = wait_event_timeout(
link->wr_tx_wait,
- list_empty(&lgr->list) || /* lgr terminated */
+ link->state == SMC_LNK_INACTIVE ||
(smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY),
SMC_WR_TX_WAIT_FREE_SLOT_TIME);
if (!rc) {
/* timeout - terminate connections */
- smc_lgr_terminate(lgr);
+ smc_lgr_terminate(smc_get_lgr(link));
return -EPIPE;
}
if (idx == link->wr_tx_cnt)
@@ -240,22 +232,16 @@ int smc_wr_tx_put_slot(struct smc_link *link,
*/
int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv)
{
- struct ib_send_wr *failed_wr = NULL;
struct smc_wr_tx_pend *pend;
int rc;
ib_req_notify_cq(link->smcibdev->roce_cq_send,
IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
pend = container_of(priv, struct smc_wr_tx_pend, priv);
- rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx],
- &failed_wr);
+ rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx], NULL);
if (rc) {
- struct smc_link_group *lgr =
- container_of(link, struct smc_link_group,
- lnk[SMC_SINGLE_LINK]);
-
smc_wr_tx_put_slot(link, priv);
- smc_lgr_terminate(lgr);
+ smc_lgr_terminate(smc_get_lgr(link));
}
return rc;
}
@@ -263,7 +249,6 @@ int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv)
/* Register a memory region and wait for result. */
int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr)
{
- struct ib_send_wr *failed_wr = NULL;
int rc;
ib_req_notify_cq(link->smcibdev->roce_cq_send,
@@ -272,9 +257,7 @@ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr)
link->wr_reg.wr.wr_id = (u64)(uintptr_t)mr;
link->wr_reg.mr = mr;
link->wr_reg.key = mr->rkey;
- failed_wr = &link->wr_reg.wr;
- rc = ib_post_send(link->roce_qp, &link->wr_reg.wr, &failed_wr);
- WARN_ON(failed_wr != &link->wr_reg.wr);
+ rc = ib_post_send(link->roce_qp, &link->wr_reg.wr, NULL);
if (rc)
return rc;
@@ -283,11 +266,7 @@ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr)
SMC_WR_REG_MR_WAIT_TIME);
if (!rc) {
/* timeout - terminate connections */
- struct smc_link_group *lgr;
-
- lgr = container_of(link, struct smc_link_group,
- lnk[SMC_SINGLE_LINK]);
- smc_lgr_terminate(lgr);
+ smc_lgr_terminate(smc_get_lgr(link));
return -EPIPE;
}
if (rc == -ERESTARTSYS)
@@ -380,8 +359,6 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
smc_wr_rx_demultiplex(&wc[i]);
smc_wr_rx_post(link); /* refill WR RX */
} else {
- struct smc_link_group *lgr;
-
/* handle status errors */
switch (wc[i].status) {
case IB_WC_RETRY_EXC_ERR:
@@ -390,9 +367,7 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
/* terminate connections of this link group
* abnormally
*/
- lgr = container_of(link, struct smc_link_group,
- lnk[SMC_SINGLE_LINK]);
- smc_lgr_terminate(lgr);
+ smc_lgr_terminate(smc_get_lgr(link));
break;
default:
smc_wr_rx_post(link); /* refill WR RX */
diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h
index 210bec3c3ebe..1d85bb14fd6f 100644
--- a/net/smc/smc_wr.h
+++ b/net/smc/smc_wr.h
@@ -63,7 +63,6 @@ static inline void smc_wr_tx_set_wr_id(atomic_long_t *wr_tx_id, long val)
/* post a new receive work request to fill a completed old work request entry */
static inline int smc_wr_rx_post(struct smc_link *link)
{
- struct ib_recv_wr *bad_recv_wr = NULL;
int rc;
u64 wr_id, temp_wr_id;
u32 index;
@@ -72,7 +71,7 @@ static inline int smc_wr_rx_post(struct smc_link *link)
temp_wr_id = wr_id;
index = do_div(temp_wr_id, link->wr_rx_cnt);
link->wr_rx_ibs[index].wr_id = wr_id;
- rc = ib_post_recv(link->roce_qp, &link->wr_rx_ibs[index], &bad_recv_wr);
+ rc = ib_post_recv(link->roce_qp, &link->wr_rx_ibs[index], NULL);
return rc;
}
diff --git a/net/socket.c b/net/socket.c
index 792f0313ea91..99c96851469f 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -252,7 +252,7 @@ static struct inode *sock_alloc_inode(struct super_block *sb)
init_waitqueue_head(&wq->wait);
wq->fasync_list = NULL;
wq->flags = 0;
- RCU_INIT_POINTER(ei->socket.wq, wq);
+ ei->socket.wq = wq;
ei->socket.state = SS_UNCONNECTED;
ei->socket.flags = 0;
@@ -266,11 +266,9 @@ static struct inode *sock_alloc_inode(struct super_block *sb)
static void sock_destroy_inode(struct inode *inode)
{
struct socket_alloc *ei;
- struct socket_wq *wq;
ei = container_of(inode, struct socket_alloc, vfs_inode);
- wq = rcu_dereference_protected(ei->socket.wq, 1);
- kfree_rcu(wq, rcu);
+ kfree_rcu(ei->socket.wq, rcu);
kmem_cache_free(sock_inode_cachep, ei);
}
@@ -585,7 +583,7 @@ static void __sock_release(struct socket *sock, struct inode *inode)
module_put(owner);
}
- if (rcu_dereference_protected(sock->wq, 1)->fasync_list)
+ if (sock->wq->fasync_list)
pr_err("%s: fasync list not empty!\n", __func__);
if (!sock->file) {
@@ -943,7 +941,8 @@ void dlci_ioctl_set(int (*hook) (unsigned int, void __user *))
EXPORT_SYMBOL(dlci_ioctl_set);
static long sock_do_ioctl(struct net *net, struct socket *sock,
- unsigned int cmd, unsigned long arg)
+ unsigned int cmd, unsigned long arg,
+ unsigned int ifreq_size)
{
int err;
void __user *argp = (void __user *)arg;
@@ -969,11 +968,11 @@ static long sock_do_ioctl(struct net *net, struct socket *sock,
} else {
struct ifreq ifr;
bool need_copyout;
- if (copy_from_user(&ifr, argp, sizeof(struct ifreq)))
+ if (copy_from_user(&ifr, argp, ifreq_size))
return -EFAULT;
err = dev_ioctl(net, cmd, &ifr, &need_copyout);
if (!err && need_copyout)
- if (copy_to_user(argp, &ifr, sizeof(struct ifreq)))
+ if (copy_to_user(argp, &ifr, ifreq_size))
return -EFAULT;
}
return err;
@@ -1072,7 +1071,8 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
err = open_related_ns(&net->ns, get_net_ns);
break;
default:
- err = sock_do_ioctl(net, sock, cmd, arg);
+ err = sock_do_ioctl(net, sock, cmd, arg,
+ sizeof(struct ifreq));
break;
}
return err;
@@ -1112,12 +1112,21 @@ EXPORT_SYMBOL(sock_create_lite);
static __poll_t sock_poll(struct file *file, poll_table *wait)
{
struct socket *sock = file->private_data;
- __poll_t events = poll_requested_events(wait);
+ __poll_t events = poll_requested_events(wait), flag = 0;
- sock_poll_busy_loop(sock, events);
if (!sock->ops->poll)
return 0;
- return sock->ops->poll(file, sock, wait) | sock_poll_busy_flag(sock);
+
+ if (sk_can_busy_loop(sock->sk)) {
+ /* poll once if requested by the syscall */
+ if (events & POLL_BUSY_LOOP)
+ sk_busy_loop(sock->sk, 1);
+
+ /* if this socket can poll_ll, tell the system call */
+ flag = POLL_BUSY_LOOP;
+ }
+
+ return sock->ops->poll(file, sock, wait) | flag;
}
static int sock_mmap(struct file *file, struct vm_area_struct *vma)
@@ -1154,7 +1163,7 @@ static int sock_fasync(int fd, struct file *filp, int on)
return -EINVAL;
lock_sock(sk);
- wq = rcu_dereference_protected(sock->wq, lockdep_sock_is_held(sk));
+ wq = sock->wq;
fasync_helper(fd, filp, on, &wq->fasync_list);
if (!wq->fasync_list)
@@ -1466,7 +1475,7 @@ int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen)
sock = sockfd_lookup_light(fd, &err, &fput_needed);
if (sock) {
err = move_addr_to_kernel(umyaddr, addrlen, &address);
- if (err >= 0) {
+ if (!err) {
err = security_socket_bind(sock,
(struct sockaddr *)&address,
addrlen);
@@ -2333,7 +2342,7 @@ SYSCALL_DEFINE3(recvmsg, int, fd, struct user_msghdr __user *, msg,
*/
int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
- unsigned int flags, struct timespec *timeout)
+ unsigned int flags, struct timespec64 *timeout)
{
int fput_needed, err, datagrams;
struct socket *sock;
@@ -2398,8 +2407,7 @@ int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
if (timeout) {
ktime_get_ts64(&timeout64);
- *timeout = timespec64_to_timespec(
- timespec64_sub(end_time, timeout64));
+ *timeout = timespec64_sub(end_time, timeout64);
if (timeout->tv_sec < 0) {
timeout->tv_sec = timeout->tv_nsec = 0;
break;
@@ -2445,10 +2453,10 @@ out_put:
static int do_sys_recvmmsg(int fd, struct mmsghdr __user *mmsg,
unsigned int vlen, unsigned int flags,
- struct timespec __user *timeout)
+ struct __kernel_timespec __user *timeout)
{
int datagrams;
- struct timespec timeout_sys;
+ struct timespec64 timeout_sys;
if (flags & MSG_CMSG_COMPAT)
return -EINVAL;
@@ -2456,13 +2464,12 @@ static int do_sys_recvmmsg(int fd, struct mmsghdr __user *mmsg,
if (!timeout)
return __sys_recvmmsg(fd, mmsg, vlen, flags, NULL);
- if (copy_from_user(&timeout_sys, timeout, sizeof(timeout_sys)))
+ if (get_timespec64(&timeout_sys, timeout))
return -EFAULT;
datagrams = __sys_recvmmsg(fd, mmsg, vlen, flags, &timeout_sys);
- if (datagrams > 0 &&
- copy_to_user(timeout, &timeout_sys, sizeof(timeout_sys)))
+ if (datagrams > 0 && put_timespec64(&timeout_sys, timeout))
datagrams = -EFAULT;
return datagrams;
@@ -2470,7 +2477,7 @@ static int do_sys_recvmmsg(int fd, struct mmsghdr __user *mmsg,
SYSCALL_DEFINE5(recvmmsg, int, fd, struct mmsghdr __user *, mmsg,
unsigned int, vlen, unsigned int, flags,
- struct timespec __user *, timeout)
+ struct __kernel_timespec __user *, timeout)
{
return do_sys_recvmmsg(fd, mmsg, vlen, flags, timeout);
}
@@ -2594,7 +2601,7 @@ SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args)
break;
case SYS_RECVMMSG:
err = do_sys_recvmmsg(a0, (struct mmsghdr __user *)a1, a[2],
- a[3], (struct timespec __user *)a[4]);
+ a[3], (struct __kernel_timespec __user *)a[4]);
break;
case SYS_ACCEPT4:
err = __sys_accept4(a0, (struct sockaddr __user *)a1,
@@ -2671,8 +2678,7 @@ EXPORT_SYMBOL(sock_unregister);
bool sock_is_registered(int family)
{
- return family < NPROTO &&
- rcu_access_pointer(net_families[array_index_nospec(family, NPROTO)]);
+ return family < NPROTO && rcu_access_pointer(net_families[family]);
}
static int __init sock_init(void)
@@ -2744,7 +2750,8 @@ static int do_siocgstamp(struct net *net, struct socket *sock,
int err;
set_fs(KERNEL_DS);
- err = sock_do_ioctl(net, sock, cmd, (unsigned long)&ktv);
+ err = sock_do_ioctl(net, sock, cmd, (unsigned long)&ktv,
+ sizeof(struct compat_ifreq));
set_fs(old_fs);
if (!err)
err = compat_put_timeval(&ktv, up);
@@ -2760,7 +2767,8 @@ static int do_siocgstampns(struct net *net, struct socket *sock,
int err;
set_fs(KERNEL_DS);
- err = sock_do_ioctl(net, sock, cmd, (unsigned long)&kts);
+ err = sock_do_ioctl(net, sock, cmd, (unsigned long)&kts,
+ sizeof(struct compat_ifreq));
set_fs(old_fs);
if (!err)
err = compat_put_timespec(&kts, up);
@@ -2865,9 +2873,14 @@ static int ethtool_ioctl(struct net *net, struct compat_ifreq __user *ifr32)
copy_in_user(&rxnfc->fs.ring_cookie,
&compat_rxnfc->fs.ring_cookie,
(void __user *)(&rxnfc->fs.location + 1) -
- (void __user *)&rxnfc->fs.ring_cookie) ||
- copy_in_user(&rxnfc->rule_cnt, &compat_rxnfc->rule_cnt,
- sizeof(rxnfc->rule_cnt)))
+ (void __user *)&rxnfc->fs.ring_cookie))
+ return -EFAULT;
+ if (ethcmd == ETHTOOL_GRXCLSRLALL) {
+ if (put_user(rule_cnt, &rxnfc->rule_cnt))
+ return -EFAULT;
+ } else if (copy_in_user(&rxnfc->rule_cnt,
+ &compat_rxnfc->rule_cnt,
+ sizeof(rxnfc->rule_cnt)))
return -EFAULT;
}
@@ -3066,7 +3079,8 @@ static int routing_ioctl(struct net *net, struct socket *sock,
}
set_fs(KERNEL_DS);
- ret = sock_do_ioctl(net, sock, cmd, (unsigned long) r);
+ ret = sock_do_ioctl(net, sock, cmd, (unsigned long) r,
+ sizeof(struct compat_ifreq));
set_fs(old_fs);
out:
@@ -3179,7 +3193,8 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock,
case SIOCBONDSETHWADDR:
case SIOCBONDCHANGEACTIVE:
case SIOCGIFNAME:
- return sock_do_ioctl(net, sock, cmd, arg);
+ return sock_do_ioctl(net, sock, cmd, arg,
+ sizeof(struct compat_ifreq));
}
return -ENOIOCTLCMD;
diff --git a/net/strparser/Kconfig b/net/strparser/Kconfig
index 6cff3f6d0c3a..94da19a2a220 100644
--- a/net/strparser/Kconfig
+++ b/net/strparser/Kconfig
@@ -1,4 +1,2 @@
-
config STREAM_PARSER
- tristate
- default n
+ def_bool n
diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c
index 625acb27efcc..da1a676860ca 100644
--- a/net/strparser/strparser.c
+++ b/net/strparser/strparser.c
@@ -140,11 +140,13 @@ static int __strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
/* We are going to append to the frags_list of head.
* Need to unshare the frag_list.
*/
- err = skb_unclone(head, GFP_ATOMIC);
- if (err) {
- STRP_STATS_INCR(strp->stats.mem_fail);
- desc->error = err;
- return 0;
+ if (skb_has_frag_list(head)) {
+ err = skb_unclone(head, GFP_ATOMIC);
+ if (err) {
+ STRP_STATS_INCR(strp->stats.mem_fail);
+ desc->error = err;
+ return 0;
+ }
}
if (unlikely(skb_shinfo(head)->frag_list)) {
@@ -201,14 +203,16 @@ static int __strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
memset(stm, 0, sizeof(*stm));
stm->strp.offset = orig_offset + eaten;
} else {
- /* Unclone since we may be appending to an skb that we
+ /* Unclone if we are appending to an skb that we
* already share a frag_list with.
*/
- err = skb_unclone(skb, GFP_ATOMIC);
- if (err) {
- STRP_STATS_INCR(strp->stats.mem_fail);
- desc->error = err;
- break;
+ if (skb_has_frag_list(skb)) {
+ err = skb_unclone(skb, GFP_ATOMIC);
+ if (err) {
+ STRP_STATS_INCR(strp->stats.mem_fail);
+ desc->error = err;
+ break;
+ }
}
stm = _strp_msg(head);
@@ -404,8 +408,6 @@ EXPORT_SYMBOL_GPL(strp_data_ready);
static void do_strp_work(struct strparser *strp)
{
- read_descriptor_t rd_desc;
-
/* We need the read lock to synchronize with strp_data_ready. We
* need the socket lock for calling strp_read_sock.
*/
@@ -417,8 +419,6 @@ static void do_strp_work(struct strparser *strp)
if (strp->paused)
goto out;
- rd_desc.arg.data = strp;
-
if (strp_read_sock(strp) == -ENOMEM)
queue_work(strp_wq, &strp->work);
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index d2623b9f23d6..305ecea92170 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -50,7 +50,7 @@ static int param_set_hashtbl_sz(const char *val, const struct kernel_param *kp)
if (!val)
goto out_inval;
ret = kstrtoul(val, 0, &num);
- if (ret == -EINVAL)
+ if (ret)
goto out_inval;
nbits = fls(num - 1);
if (nbits > MAX_HASHTABLE_BITS || nbits < 2)
@@ -253,7 +253,7 @@ rpcauth_list_flavors(rpc_authflavor_t *array, int size)
EXPORT_SYMBOL_GPL(rpcauth_list_flavors);
struct rpc_auth *
-rpcauth_create(struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
+rpcauth_create(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
{
struct rpc_auth *auth;
const struct rpc_authops *ops;
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index be8f103d22fd..21c0aa0a0d1d 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -284,7 +284,12 @@ err:
return p;
}
-#define UPCALL_BUF_LEN 128
+/* XXX: Need some documentation about why UPCALL_BUF_LEN is so small.
+ * Is user space expecting no more than UPCALL_BUF_LEN bytes?
+ * Note that there are now _two_ NI_MAXHOST sized data items
+ * being passed in this string.
+ */
+#define UPCALL_BUF_LEN 256
struct gss_upcall_msg {
refcount_t count;
@@ -456,18 +461,44 @@ static int gss_encode_v1_msg(struct gss_upcall_msg *gss_msg,
buflen -= len;
p += len;
gss_msg->msg.len = len;
+
+ /*
+ * target= is a full service principal that names the remote
+ * identity that we are authenticating to.
+ */
if (target_name) {
len = scnprintf(p, buflen, "target=%s ", target_name);
buflen -= len;
p += len;
gss_msg->msg.len += len;
}
- if (service_name != NULL) {
- len = scnprintf(p, buflen, "service=%s ", service_name);
+
+ /*
+ * gssd uses service= and srchost= to select a matching key from
+ * the system's keytab to use as the source principal.
+ *
+ * service= is the service name part of the source principal,
+ * or "*" (meaning choose any).
+ *
+ * srchost= is the hostname part of the source principal. When
+ * not provided, gssd uses the local hostname.
+ */
+ if (service_name) {
+ char *c = strchr(service_name, '@');
+
+ if (!c)
+ len = scnprintf(p, buflen, "service=%s ",
+ service_name);
+ else
+ len = scnprintf(p, buflen,
+ "service=%.*s srchost=%s ",
+ (int)(c - service_name),
+ service_name, c + 1);
buflen -= len;
p += len;
gss_msg->msg.len += len;
}
+
if (mech->gm_upcall_enctypes) {
len = scnprintf(p, buflen, "enctypes=%s ",
mech->gm_upcall_enctypes);
@@ -517,7 +548,7 @@ gss_alloc_msg(struct gss_auth *gss_auth,
err = gss_encode_v1_msg(gss_msg, service_name, gss_auth->target_name);
if (err)
goto err_put_pipe_version;
- };
+ }
kref_get(&gss_auth->kref);
return gss_msg;
err_put_pipe_version:
@@ -985,7 +1016,7 @@ static void gss_pipe_free(struct gss_pipe *p)
* parameters based on the input flavor (which must be a pseudoflavor)
*/
static struct gss_auth *
-gss_create_new(struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
+gss_create_new(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
{
rpc_authflavor_t flavor = args->pseudoflavor;
struct gss_auth *gss_auth;
@@ -1132,7 +1163,7 @@ gss_destroy(struct rpc_auth *auth)
* (which is guaranteed to last as long as any of its descendants).
*/
static struct gss_auth *
-gss_auth_find_or_add_hashed(struct rpc_auth_create_args *args,
+gss_auth_find_or_add_hashed(const struct rpc_auth_create_args *args,
struct rpc_clnt *clnt,
struct gss_auth *new)
{
@@ -1169,7 +1200,8 @@ out:
}
static struct gss_auth *
-gss_create_hashed(struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
+gss_create_hashed(const struct rpc_auth_create_args *args,
+ struct rpc_clnt *clnt)
{
struct gss_auth *gss_auth;
struct gss_auth *new;
@@ -1188,7 +1220,7 @@ out:
}
static struct rpc_auth *
-gss_create(struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
+gss_create(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
{
struct gss_auth *gss_auth;
struct rpc_xprt_switch *xps = rcu_access_pointer(clnt->cl_xpi.xpi_xpswitch);
@@ -1571,7 +1603,7 @@ static int gss_cred_is_negative_entry(struct rpc_cred *cred)
if (test_bit(RPCAUTH_CRED_NEGATIVE, &cred->cr_flags)) {
unsigned long now = jiffies;
unsigned long begin, expire;
- struct gss_cred *gss_cred;
+ struct gss_cred *gss_cred;
gss_cred = container_of(cred, struct gss_cred, gc_base);
begin = gss_cred->gc_upcall_timestamp;
diff --git a/net/sunrpc/auth_gss/gss_generic_token.c b/net/sunrpc/auth_gss/gss_generic_token.c
index 254defe446a7..fe97f3106536 100644
--- a/net/sunrpc/auth_gss/gss_generic_token.c
+++ b/net/sunrpc/auth_gss/gss_generic_token.c
@@ -231,4 +231,3 @@ g_verify_token_header(struct xdr_netobj *mech, int *body_size,
}
EXPORT_SYMBOL_GPL(g_verify_token_header);
-
diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c
index 8654494b4d0a..0220e1ca5280 100644
--- a/net/sunrpc/auth_gss/gss_krb5_crypto.c
+++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c
@@ -169,7 +169,7 @@ make_checksum_hmac_md5(struct krb5_ctx *kctx, char *header, int hdrlen,
struct scatterlist sg[1];
int err = -1;
u8 *checksumdata;
- u8 rc4salt[4];
+ u8 *rc4salt;
struct crypto_ahash *md5;
struct crypto_ahash *hmac_md5;
struct ahash_request *req;
@@ -183,14 +183,18 @@ make_checksum_hmac_md5(struct krb5_ctx *kctx, char *header, int hdrlen,
return GSS_S_FAILURE;
}
+ rc4salt = kmalloc_array(4, sizeof(*rc4salt), GFP_NOFS);
+ if (!rc4salt)
+ return GSS_S_FAILURE;
+
if (arcfour_hmac_md5_usage_to_salt(usage, rc4salt)) {
dprintk("%s: invalid usage value %u\n", __func__, usage);
- return GSS_S_FAILURE;
+ goto out_free_rc4salt;
}
checksumdata = kmalloc(GSS_KRB5_MAX_CKSUM_LEN, GFP_NOFS);
if (!checksumdata)
- return GSS_S_FAILURE;
+ goto out_free_rc4salt;
md5 = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC);
if (IS_ERR(md5))
@@ -258,6 +262,8 @@ out_free_md5:
crypto_free_ahash(md5);
out_free_cksum:
kfree(checksumdata);
+out_free_rc4salt:
+ kfree(rc4salt);
return err ? GSS_S_FAILURE : 0;
}
@@ -373,7 +379,6 @@ make_checksum_v2(struct krb5_ctx *kctx, char *header, int hdrlen,
struct scatterlist sg[1];
int err = -1;
u8 *checksumdata;
- unsigned int checksumlen;
if (kctx->gk5e->keyed_cksum == 0) {
dprintk("%s: expected keyed hash for %s\n",
@@ -393,7 +398,6 @@ make_checksum_v2(struct krb5_ctx *kctx, char *header, int hdrlen,
tfm = crypto_alloc_ahash(kctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC);
if (IS_ERR(tfm))
goto out_free_cksum;
- checksumlen = crypto_ahash_digestsize(tfm);
req = ahash_request_alloc(tfm, GFP_NOFS);
if (!req)
@@ -1077,4 +1081,3 @@ out_err:
dprintk("%s: returning %d\n", __func__, err);
return err;
}
-
diff --git a/net/sunrpc/auth_gss/gss_krb5_keys.c b/net/sunrpc/auth_gss/gss_krb5_keys.c
index 870133146026..f7fe2d2b851f 100644
--- a/net/sunrpc/auth_gss/gss_krb5_keys.c
+++ b/net/sunrpc/auth_gss/gss_krb5_keys.c
@@ -324,4 +324,3 @@ u32 gss_krb5_aes_make_key(const struct gss_krb5_enctype *gk5e,
err_out:
return ret;
}
-
diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c
index 94a2b3f082a8..eaad9bc7a0bd 100644
--- a/net/sunrpc/auth_gss/gss_krb5_seal.c
+++ b/net/sunrpc/auth_gss/gss_krb5_seal.c
@@ -229,4 +229,3 @@ gss_get_mic_kerberos(struct gss_ctx *gss_ctx, struct xdr_buf *text,
return gss_get_mic_v2(ctx, text, token);
}
}
-
diff --git a/net/sunrpc/auth_gss/gss_krb5_unseal.c b/net/sunrpc/auth_gss/gss_krb5_unseal.c
index b601a73cc9db..ef2b25b86d2f 100644
--- a/net/sunrpc/auth_gss/gss_krb5_unseal.c
+++ b/net/sunrpc/auth_gss/gss_krb5_unseal.c
@@ -225,4 +225,3 @@ gss_verify_mic_kerberos(struct gss_ctx *gss_ctx,
return gss_verify_mic_v2(ctx, message_buffer, read_token);
}
}
-
diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c
index a737c2da0837..39a2e672900b 100644
--- a/net/sunrpc/auth_gss/gss_krb5_wrap.c
+++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c
@@ -440,7 +440,6 @@ static u32
gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset,
struct xdr_buf *buf, struct page **pages)
{
- int blocksize;
u8 *ptr, *plainhdr;
s32 now;
u8 flags = 0x00;
@@ -473,7 +472,6 @@ gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset,
*ptr++ = 0xff;
be16ptr = (__be16 *)ptr;
- blocksize = crypto_skcipher_blocksize(kctx->acceptor_enc);
*be16ptr++ = 0;
/* "inner" token header always uses 0 for RRC */
*be16ptr++ = 0;
@@ -623,4 +621,3 @@ gss_unwrap_kerberos(struct gss_ctx *gctx, int offset, struct xdr_buf *buf)
return gss_unwrap_kerberos_v2(kctx, offset, buf);
}
}
-
diff --git a/net/sunrpc/auth_gss/gss_rpc_upcall.c b/net/sunrpc/auth_gss/gss_rpc_upcall.c
index 1c7c49dbf8ba..73dcda060335 100644
--- a/net/sunrpc/auth_gss/gss_rpc_upcall.c
+++ b/net/sunrpc/auth_gss/gss_rpc_upcall.c
@@ -234,6 +234,35 @@ static int gssp_alloc_receive_pages(struct gssx_arg_accept_sec_context *arg)
return 0;
}
+static char *gssp_stringify(struct xdr_netobj *netobj)
+{
+ return kstrndup(netobj->data, netobj->len, GFP_KERNEL);
+}
+
+static void gssp_hostbased_service(char **principal)
+{
+ char *c;
+
+ if (!*principal)
+ return;
+
+ /* terminate and remove realm part */
+ c = strchr(*principal, '@');
+ if (c) {
+ *c = '\0';
+
+ /* change service-hostname delimiter */
+ c = strchr(*principal, '/');
+ if (c)
+ *c = '@';
+ }
+ if (!c) {
+ /* not a service principal */
+ kfree(*principal);
+ *principal = NULL;
+ }
+}
+
/*
* Public functions
*/
@@ -262,6 +291,7 @@ int gssp_accept_sec_context_upcall(struct net *net,
*/
.exported_context_token.len = GSSX_max_output_handle_sz,
.mech.len = GSS_OID_MAX_LEN,
+ .targ_name.display_name.len = GSSX_max_princ_sz,
.src_name.display_name.len = GSSX_max_princ_sz
};
struct gssx_res_accept_sec_context res = {
@@ -275,6 +305,7 @@ int gssp_accept_sec_context_upcall(struct net *net,
.rpc_cred = NULL, /* FIXME ? */
};
struct xdr_netobj client_name = { 0 , NULL };
+ struct xdr_netobj target_name = { 0, NULL };
int ret;
if (data->in_handle.len != 0)
@@ -285,8 +316,6 @@ int gssp_accept_sec_context_upcall(struct net *net,
if (ret)
return ret;
- /* use nfs/ for targ_name ? */
-
ret = gssp_call(net, &msg);
gssp_free_receive_pages(&arg);
@@ -304,6 +333,7 @@ int gssp_accept_sec_context_upcall(struct net *net,
kfree(rctxh.mech.data);
}
client_name = rctxh.src_name.display_name;
+ target_name = rctxh.targ_name.display_name;
}
if (res.options.count == 1) {
@@ -325,32 +355,22 @@ int gssp_accept_sec_context_upcall(struct net *net,
}
/* convert to GSS_NT_HOSTBASED_SERVICE form and set into creds */
- if (data->found_creds && client_name.data != NULL) {
- char *c;
-
- data->creds.cr_raw_principal = kstrndup(client_name.data,
- client_name.len, GFP_KERNEL);
-
- data->creds.cr_principal = kstrndup(client_name.data,
- client_name.len, GFP_KERNEL);
- if (data->creds.cr_principal) {
- /* terminate and remove realm part */
- c = strchr(data->creds.cr_principal, '@');
- if (c) {
- *c = '\0';
-
- /* change service-hostname delimiter */
- c = strchr(data->creds.cr_principal, '/');
- if (c) *c = '@';
- }
- if (!c) {
- /* not a service principal */
- kfree(data->creds.cr_principal);
- data->creds.cr_principal = NULL;
- }
+ if (data->found_creds) {
+ if (client_name.data) {
+ data->creds.cr_raw_principal =
+ gssp_stringify(&client_name);
+ data->creds.cr_principal =
+ gssp_stringify(&client_name);
+ gssp_hostbased_service(&data->creds.cr_principal);
+ }
+ if (target_name.data) {
+ data->creds.cr_targ_princ =
+ gssp_stringify(&target_name);
+ gssp_hostbased_service(&data->creds.cr_targ_princ);
}
}
kfree(client_name.data);
+ kfree(target_name.data);
return ret;
}
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 5089dbb96d58..860f2a1bbb67 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -1389,7 +1389,7 @@ static void destroy_use_gss_proxy_proc_entry(struct net *net)
struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
if (sn->use_gssp_proc) {
- remove_proc_entry("use-gss-proxy", sn->proc_net_rpc);
+ remove_proc_entry("use-gss-proxy", sn->proc_net_rpc);
clear_gssp_clnt(sn);
}
}
diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c
index 75d72e109a04..4b48228ee8c7 100644
--- a/net/sunrpc/auth_null.c
+++ b/net/sunrpc/auth_null.c
@@ -19,7 +19,7 @@ static struct rpc_auth null_auth;
static struct rpc_cred null_cred;
static struct rpc_auth *
-nul_create(struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
+nul_create(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
{
atomic_inc(&null_auth.au_count);
return &null_auth;
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index dafd6b870ba3..185e56d4f9ae 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -30,7 +30,7 @@ static struct rpc_auth unix_auth;
static const struct rpc_credops unix_credops;
static struct rpc_auth *
-unx_create(struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
+unx_create(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
{
dprintk("RPC: creating UNIX authenticator for client %p\n",
clnt);
diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c
index c2c68a15b59d..3c15a99b9700 100644
--- a/net/sunrpc/backchannel_rqst.c
+++ b/net/sunrpc/backchannel_rqst.c
@@ -362,4 +362,3 @@ void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied)
wake_up(&bc_serv->sv_cb_waitq);
spin_unlock(&bc_serv->sv_cb_lock);
}
-
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index d839c33ae7d9..8ea2f5fadd96 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -892,7 +892,7 @@ rpc_free_client(struct rpc_clnt *clnt)
/*
* Free an RPC client
*/
-static struct rpc_clnt *
+static struct rpc_clnt *
rpc_free_auth(struct rpc_clnt *clnt)
{
if (clnt->cl_auth == NULL)
@@ -965,10 +965,20 @@ out:
}
EXPORT_SYMBOL_GPL(rpc_bind_new_program);
+void rpc_task_release_transport(struct rpc_task *task)
+{
+ struct rpc_xprt *xprt = task->tk_xprt;
+
+ if (xprt) {
+ task->tk_xprt = NULL;
+ xprt_put(xprt);
+ }
+}
+EXPORT_SYMBOL_GPL(rpc_task_release_transport);
+
void rpc_task_release_client(struct rpc_task *task)
{
struct rpc_clnt *clnt = task->tk_client;
- struct rpc_xprt *xprt = task->tk_xprt;
if (clnt != NULL) {
/* Remove from client task list */
@@ -979,12 +989,14 @@ void rpc_task_release_client(struct rpc_task *task)
rpc_release_client(clnt);
}
+ rpc_task_release_transport(task);
+}
- if (xprt != NULL) {
- task->tk_xprt = NULL;
-
- xprt_put(xprt);
- }
+static
+void rpc_task_set_transport(struct rpc_task *task, struct rpc_clnt *clnt)
+{
+ if (!task->tk_xprt)
+ task->tk_xprt = xprt_iter_get_next(&clnt->cl_xpi);
}
static
@@ -992,8 +1004,7 @@ void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt)
{
if (clnt != NULL) {
- if (task->tk_xprt == NULL)
- task->tk_xprt = xprt_iter_get_next(&clnt->cl_xpi);
+ rpc_task_set_transport(task, clnt);
task->tk_client = clnt;
atomic_inc(&clnt->cl_count);
if (clnt->cl_softrtry)
@@ -1512,6 +1523,7 @@ call_start(struct rpc_task *task)
clnt->cl_program->version[clnt->cl_vers]->counts[idx]++;
clnt->cl_stats->rpccnt++;
task->tk_action = call_reserve;
+ rpc_task_set_transport(task, clnt);
}
/*
diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index c526f8fb37c9..c7872bc13860 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -213,7 +213,7 @@ static void rpcb_set_local(struct net *net, struct rpc_clnt *clnt,
sn->rpcb_local_clnt = clnt;
sn->rpcb_local_clnt4 = clnt4;
sn->rpcb_is_af_local = is_af_local ? 1 : 0;
- smp_wmb();
+ smp_wmb();
sn->rpcb_users = 1;
dprintk("RPC: created new rpcb local clients (rpcb_local_clnt: "
"%p, rpcb_local_clnt4: %p) for net %x%s\n",
diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c
index f68aa46c9dd7..71166b393732 100644
--- a/net/sunrpc/stats.c
+++ b/net/sunrpc/stats.c
@@ -208,13 +208,39 @@ static void _print_name(struct seq_file *seq, unsigned int op,
seq_printf(seq, "\t%12u: ", op);
}
-void rpc_print_iostats(struct seq_file *seq, struct rpc_clnt *clnt)
+static void _add_rpc_iostats(struct rpc_iostats *a, struct rpc_iostats *b)
+{
+ a->om_ops += b->om_ops;
+ a->om_ntrans += b->om_ntrans;
+ a->om_timeouts += b->om_timeouts;
+ a->om_bytes_sent += b->om_bytes_sent;
+ a->om_bytes_recv += b->om_bytes_recv;
+ a->om_queue = ktime_add(a->om_queue, b->om_queue);
+ a->om_rtt = ktime_add(a->om_rtt, b->om_rtt);
+ a->om_execute = ktime_add(a->om_execute, b->om_execute);
+}
+
+static void _print_rpc_iostats(struct seq_file *seq, struct rpc_iostats *stats,
+ int op, const struct rpc_procinfo *procs)
+{
+ _print_name(seq, op, procs);
+ seq_printf(seq, "%lu %lu %lu %Lu %Lu %Lu %Lu %Lu\n",
+ stats->om_ops,
+ stats->om_ntrans,
+ stats->om_timeouts,
+ stats->om_bytes_sent,
+ stats->om_bytes_recv,
+ ktime_to_ms(stats->om_queue),
+ ktime_to_ms(stats->om_rtt),
+ ktime_to_ms(stats->om_execute));
+}
+
+void rpc_clnt_show_stats(struct seq_file *seq, struct rpc_clnt *clnt)
{
- struct rpc_iostats *stats = clnt->cl_metrics;
struct rpc_xprt *xprt;
unsigned int op, maxproc = clnt->cl_maxproc;
- if (!stats)
+ if (!clnt->cl_metrics)
return;
seq_printf(seq, "\tRPC iostats version: %s ", RPC_IOSTATS_VERS);
@@ -229,20 +255,18 @@ void rpc_print_iostats(struct seq_file *seq, struct rpc_clnt *clnt)
seq_printf(seq, "\tper-op statistics\n");
for (op = 0; op < maxproc; op++) {
- struct rpc_iostats *metrics = &stats[op];
- _print_name(seq, op, clnt->cl_procinfo);
- seq_printf(seq, "%lu %lu %lu %Lu %Lu %Lu %Lu %Lu\n",
- metrics->om_ops,
- metrics->om_ntrans,
- metrics->om_timeouts,
- metrics->om_bytes_sent,
- metrics->om_bytes_recv,
- ktime_to_ms(metrics->om_queue),
- ktime_to_ms(metrics->om_rtt),
- ktime_to_ms(metrics->om_execute));
+ struct rpc_iostats stats = {};
+ struct rpc_clnt *next = clnt;
+ do {
+ _add_rpc_iostats(&stats, &next->cl_metrics[op]);
+ if (next == next->cl_parent)
+ break;
+ next = next->cl_parent;
+ } while (next);
+ _print_rpc_iostats(seq, &stats, op, clnt->cl_procinfo);
}
}
-EXPORT_SYMBOL_GPL(rpc_print_iostats);
+EXPORT_SYMBOL_GPL(rpc_clnt_show_stats);
/*
* Register/unregister RPC proc files
@@ -310,4 +334,3 @@ void rpc_proc_exit(struct net *net)
dprintk("RPC: unregistering /proc/net/rpc\n");
remove_proc_entry("rpc", net->proc_net);
}
-
diff --git a/net/sunrpc/sunrpc.h b/net/sunrpc/sunrpc.h
index 09a0315ea77b..c9bacb3c930f 100644
--- a/net/sunrpc/sunrpc.h
+++ b/net/sunrpc/sunrpc.h
@@ -57,4 +57,3 @@ int svc_send_common(struct socket *sock, struct xdr_buf *xdr,
int rpc_clients_notifier_register(void);
void rpc_clients_notifier_unregister(void);
#endif /* _NET_SUNRPC_SUNRPC_H */
-
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 30a4226baf03..d13e05f1a990 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1537,16 +1537,16 @@ EXPORT_SYMBOL_GPL(svc_max_payload);
/**
* svc_fill_write_vector - Construct data argument for VFS write call
* @rqstp: svc_rqst to operate on
+ * @pages: list of pages containing data payload
* @first: buffer containing first section of write payload
* @total: total number of bytes of write payload
*
- * Returns the number of elements populated in the data argument array.
+ * Fills in rqstp::rq_vec, and returns the number of elements.
*/
-unsigned int svc_fill_write_vector(struct svc_rqst *rqstp, struct kvec *first,
- size_t total)
+unsigned int svc_fill_write_vector(struct svc_rqst *rqstp, struct page **pages,
+ struct kvec *first, size_t total)
{
struct kvec *vec = rqstp->rq_vec;
- struct page **pages;
unsigned int i;
/* Some types of transport can present the write payload
@@ -1560,14 +1560,11 @@ unsigned int svc_fill_write_vector(struct svc_rqst *rqstp, struct kvec *first,
++i;
}
- WARN_ON_ONCE(rqstp->rq_arg.page_base != 0);
- pages = rqstp->rq_arg.pages;
while (total) {
vec[i].iov_base = page_address(*pages);
vec[i].iov_len = min_t(size_t, total, PAGE_SIZE);
total -= vec[i].iov_len;
++i;
-
++pages;
}
@@ -1580,65 +1577,48 @@ EXPORT_SYMBOL_GPL(svc_fill_write_vector);
* svc_fill_symlink_pathname - Construct pathname argument for VFS symlink call
* @rqstp: svc_rqst to operate on
* @first: buffer containing first section of pathname
+ * @p: buffer containing remaining section of pathname
* @total: total length of the pathname argument
*
- * Returns pointer to a NUL-terminated string, or an ERR_PTR. The buffer is
- * released automatically when @rqstp is recycled.
+ * The VFS symlink API demands a NUL-terminated pathname in mapped memory.
+ * Returns pointer to a NUL-terminated string, or an ERR_PTR. Caller must free
+ * the returned string.
*/
char *svc_fill_symlink_pathname(struct svc_rqst *rqstp, struct kvec *first,
- size_t total)
+ void *p, size_t total)
{
- struct xdr_buf *arg = &rqstp->rq_arg;
- struct page **pages;
- char *result;
-
- /* VFS API demands a NUL-terminated pathname. This function
- * uses a page from @rqstp as the pathname buffer, to enable
- * direct placement. Thus the total buffer size is PAGE_SIZE.
- * Space in this buffer for NUL-termination requires that we
- * cap the size of the returned symlink pathname just a
- * little early.
- */
- if (total > PAGE_SIZE - 1)
- return ERR_PTR(-ENAMETOOLONG);
+ size_t len, remaining;
+ char *result, *dst;
- /* Some types of transport can present the pathname entirely
- * in rq_arg.pages. If not, then copy the pathname into one
- * page.
- */
- pages = arg->pages;
- WARN_ON_ONCE(arg->page_base != 0);
- if (first->iov_base == 0) {
- result = page_address(*pages);
- result[total] = '\0';
- } else {
- size_t len, remaining;
- char *dst;
+ result = kmalloc(total + 1, GFP_KERNEL);
+ if (!result)
+ return ERR_PTR(-ESERVERFAULT);
- result = page_address(*(rqstp->rq_next_page++));
- dst = result;
- remaining = total;
+ dst = result;
+ remaining = total;
- len = min_t(size_t, total, first->iov_len);
+ len = min_t(size_t, total, first->iov_len);
+ if (len) {
memcpy(dst, first->iov_base, len);
dst += len;
remaining -= len;
+ }
- /* No more than one page left */
- if (remaining) {
- len = min_t(size_t, remaining, PAGE_SIZE);
- memcpy(dst, page_address(*pages), len);
- dst += len;
- }
-
- *dst = '\0';
+ if (remaining) {
+ len = min_t(size_t, remaining, PAGE_SIZE);
+ memcpy(dst, p, len);
+ dst += len;
}
- /* Sanity check: we don't allow the pathname argument to
+ *dst = '\0';
+
+ /* Sanity check: Linux doesn't allow the pathname argument to
* contain a NUL byte.
*/
- if (strlen(result) != total)
+ if (strlen(result) != total) {
+ kfree(result);
return ERR_PTR(-EINVAL);
+ }
return result;
}
EXPORT_SYMBOL_GPL(svc_fill_symlink_pathname);
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 3fabf9f6a0f9..a8db2e3f8904 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -880,7 +880,7 @@ static void xprt_wait_on_pinned_rqst(struct rpc_rqst *req)
__must_hold(&req->rq_xprt->recv_lock)
{
struct rpc_task *task = req->rq_task;
-
+
if (task && test_bit(RPC_TASK_MSG_RECV, &task->tk_runstate)) {
spin_unlock(&req->rq_xprt->recv_lock);
set_bit(RPC_TASK_MSG_RECV_WAIT, &task->tk_runstate);
diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
index 17fb1e025654..0f7c465d9a5a 100644
--- a/net/sunrpc/xprtrdma/fmr_ops.c
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -279,9 +279,7 @@ out_maperr:
static int
fmr_op_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
{
- struct ib_send_wr *bad_wr;
-
- return ib_post_send(ia->ri_id->qp, &req->rl_sendctx->sc_wr, &bad_wr);
+ return ib_post_send(ia->ri_id->qp, &req->rl_sendctx->sc_wr, NULL);
}
/* Invalidate all memory regions that were registered for "req".
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index c040de196e13..1bb00dd6ccdb 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -464,7 +464,7 @@ out_mapmr_err:
static int
frwr_op_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
{
- struct ib_send_wr *post_wr, *bad_wr;
+ struct ib_send_wr *post_wr;
struct rpcrdma_mr *mr;
post_wr = &req->rl_sendctx->sc_wr;
@@ -486,7 +486,7 @@ frwr_op_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
/* If ib_post_send fails, the next ->send_request for
* @req will queue these MWs for recovery.
*/
- return ib_post_send(ia->ri_id->qp, post_wr, &bad_wr);
+ return ib_post_send(ia->ri_id->qp, post_wr, NULL);
}
/* Handle a remotely invalidated mr on the @mrs list
@@ -517,7 +517,8 @@ frwr_op_reminv(struct rpcrdma_rep *rep, struct list_head *mrs)
static void
frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs)
{
- struct ib_send_wr *first, **prev, *last, *bad_wr;
+ struct ib_send_wr *first, **prev, *last;
+ const struct ib_send_wr *bad_wr;
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
struct rpcrdma_frwr *frwr;
struct rpcrdma_mr *mr;
diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c
index 357ba90c382d..134bef6a451e 100644
--- a/net/sunrpc/xprtrdma/svc_rdma.c
+++ b/net/sunrpc/xprtrdma/svc_rdma.c
@@ -94,7 +94,6 @@ static int read_reset_stat(struct ctl_table *table, int write,
atomic_set(stat, 0);
else {
char str_buf[32];
- char *data;
int len = snprintf(str_buf, 32, "%d\n", atomic_read(stat));
if (len >= 32)
return -EFAULT;
@@ -103,7 +102,6 @@ static int read_reset_stat(struct ctl_table *table, int write,
*lenp = 0;
return 0;
}
- data = &str_buf[*ppos];
len -= *ppos;
if (len > *lenp)
len = *lenp;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 841fca143804..b24d5b8f2fee 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -229,11 +229,10 @@ void svc_rdma_recv_ctxt_put(struct svcxprt_rdma *rdma,
static int __svc_rdma_post_recv(struct svcxprt_rdma *rdma,
struct svc_rdma_recv_ctxt *ctxt)
{
- struct ib_recv_wr *bad_recv_wr;
int ret;
svc_xprt_get(&rdma->sc_xprt);
- ret = ib_post_recv(rdma->sc_qp, &ctxt->rc_recv_wr, &bad_recv_wr);
+ ret = ib_post_recv(rdma->sc_qp, &ctxt->rc_recv_wr, NULL);
trace_svcrdma_post_recv(&ctxt->rc_recv_wr, ret);
if (ret)
goto err_post;
@@ -366,9 +365,6 @@ static void svc_rdma_build_arg_xdr(struct svc_rqst *rqstp,
arg->page_base = 0;
arg->buflen = ctxt->rc_byte_len;
arg->len = ctxt->rc_byte_len;
-
- rqstp->rq_respages = &rqstp->rq_pages[0];
- rqstp->rq_next_page = rqstp->rq_respages + 1;
}
/* This accommodates the largest possible Write chunk,
@@ -730,6 +726,12 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
svc_rdma_build_arg_xdr(rqstp, ctxt);
+ /* Prevent svc_xprt_release from releasing pages in rq_pages
+ * if we return 0 or an error.
+ */
+ rqstp->rq_respages = rqstp->rq_pages;
+ rqstp->rq_next_page = rqstp->rq_respages;
+
p = (__be32 *)rqstp->rq_arg.head[0].iov_base;
ret = svc_rdma_xdr_decode_req(&rqstp->rq_arg);
if (ret < 0)
diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c
index ce3ea8419704..dc1951759a8e 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_rw.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c
@@ -307,7 +307,8 @@ static int svc_rdma_post_chunk_ctxt(struct svc_rdma_chunk_ctxt *cc)
{
struct svcxprt_rdma *rdma = cc->cc_rdma;
struct svc_xprt *xprt = &rdma->sc_xprt;
- struct ib_send_wr *first_wr, *bad_wr;
+ struct ib_send_wr *first_wr;
+ const struct ib_send_wr *bad_wr;
struct list_head *tmp;
struct ib_cqe *cqe;
int ret;
@@ -679,6 +680,7 @@ static int svc_rdma_build_read_chunk(struct svc_rqst *rqstp,
struct svc_rdma_read_info *info,
__be32 *p)
{
+ unsigned int i;
int ret;
ret = -EINVAL;
@@ -701,6 +703,12 @@ static int svc_rdma_build_read_chunk(struct svc_rqst *rqstp,
info->ri_chunklen += rs_length;
}
+ /* Pages under I/O have been copied to head->rc_pages.
+ * Prevent their premature release by svc_xprt_release() .
+ */
+ for (i = 0; i < info->ri_readctxt->rc_page_count; i++)
+ rqstp->rq_pages[i] = NULL;
+
return ret;
}
@@ -816,7 +824,6 @@ int svc_rdma_recv_read_chunk(struct svcxprt_rdma *rdma, struct svc_rqst *rqstp,
struct svc_rdma_recv_ctxt *head, __be32 *p)
{
struct svc_rdma_read_info *info;
- struct page **page;
int ret;
/* The request (with page list) is constructed in
@@ -843,27 +850,15 @@ int svc_rdma_recv_read_chunk(struct svcxprt_rdma *rdma, struct svc_rqst *rqstp,
ret = svc_rdma_build_normal_read_chunk(rqstp, info, p);
else
ret = svc_rdma_build_pz_read_chunk(rqstp, info, p);
-
- /* Mark the start of the pages that can be used for the reply */
- if (info->ri_pageoff > 0)
- info->ri_pageno++;
- rqstp->rq_respages = &rqstp->rq_pages[info->ri_pageno];
- rqstp->rq_next_page = rqstp->rq_respages + 1;
-
if (ret < 0)
- goto out;
+ goto out_err;
ret = svc_rdma_post_chunk_ctxt(&info->ri_cc);
-
-out:
- /* Read sink pages have been moved from rqstp->rq_pages to
- * head->rc_arg.pages. Force svc_recv to refill those slots
- * in rq_pages.
- */
- for (page = rqstp->rq_pages; page < rqstp->rq_respages; page++)
- *page = NULL;
-
if (ret < 0)
- svc_rdma_read_info_free(info);
+ goto out_err;
+ return 0;
+
+out_err:
+ svc_rdma_read_info_free(info);
return ret;
}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 4a3efaea277c..8602a5f1b515 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -291,7 +291,6 @@ static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
*/
int svc_rdma_send(struct svcxprt_rdma *rdma, struct ib_send_wr *wr)
{
- struct ib_send_wr *bad_wr;
int ret;
might_sleep();
@@ -311,7 +310,7 @@ int svc_rdma_send(struct svcxprt_rdma *rdma, struct ib_send_wr *wr)
}
svc_xprt_get(&rdma->sc_xprt);
- ret = ib_post_send(rdma->sc_qp, wr, &bad_wr);
+ ret = ib_post_send(rdma->sc_qp, wr, NULL);
trace_svcrdma_post_send(wr, ret);
if (ret) {
set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
@@ -657,7 +656,9 @@ static void svc_rdma_save_io_pages(struct svc_rqst *rqstp,
ctxt->sc_pages[i] = rqstp->rq_respages[i];
rqstp->rq_respages[i] = NULL;
}
- rqstp->rq_next_page = rqstp->rq_respages + 1;
+
+ /* Prevent svc_xprt_release from releasing pages in rq_pages */
+ rqstp->rq_next_page = rqstp->rq_respages;
}
/* Prepare the portion of the RPC Reply that will be transmitted
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index e9535a66bab0..2848cafd4a17 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -296,7 +296,6 @@ static int rdma_listen_handler(struct rdma_cm_id *cma_id,
struct rdma_cm_event *event)
{
struct sockaddr *sap = (struct sockaddr *)&cma_id->route.addr.src_addr;
- int ret = 0;
trace_svcrdma_cm_event(event, sap);
@@ -315,7 +314,7 @@ static int rdma_listen_handler(struct rdma_cm_id *cma_id,
break;
}
- return ret;
+ return 0;
}
static int rdma_cma_handler(struct rdma_cm_id *cma_id,
@@ -476,7 +475,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
/* Qualify the transport resource defaults with the
* capabilities of this particular device */
- newxprt->sc_max_send_sges = dev->attrs.max_sge;
+ newxprt->sc_max_send_sges = dev->attrs.max_send_sge;
/* transport hdr, head iovec, one page list entry, tail iovec */
if (newxprt->sc_max_send_sges < 4) {
pr_err("svcrdma: too few Send SGEs available (%d)\n",
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 16161a36dc73..956a5ea47b58 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -280,7 +280,6 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
++xprt->rx_xprt.connect_cookie;
connstate = -ECONNABORTED;
connected:
- xprt->rx_buf.rb_credits = 1;
ep->rep_connected = connstate;
rpcrdma_conn_func(ep);
wake_up_all(&ep->rep_connect_wait);
@@ -508,7 +507,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
unsigned int max_sge;
int rc;
- max_sge = min_t(unsigned int, ia->ri_device->attrs.max_sge,
+ max_sge = min_t(unsigned int, ia->ri_device->attrs.max_send_sge,
RPCRDMA_MAX_SEND_SGES);
if (max_sge < RPCRDMA_MIN_SEND_SGES) {
pr_warn("rpcrdma: HCA provides only %d send SGEs\n", max_sge);
@@ -755,6 +754,7 @@ retry:
}
ep->rep_connected = 0;
+ rpcrdma_post_recvs(r_xprt, true);
rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
if (rc) {
@@ -773,8 +773,6 @@ retry:
dprintk("RPC: %s: connected\n", __func__);
- rpcrdma_post_recvs(r_xprt, true);
-
out:
if (rc)
ep->rep_connected = rc;
@@ -1171,6 +1169,7 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
list_add(&req->rl_list, &buf->rb_send_bufs);
}
+ buf->rb_credits = 1;
buf->rb_posted_receives = 0;
INIT_LIST_HEAD(&buf->rb_recv_bufs);
@@ -1559,7 +1558,8 @@ rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp)
if (!count)
return;
- rc = ib_post_recv(r_xprt->rx_ia.ri_id->qp, wr, &bad_wr);
+ rc = ib_post_recv(r_xprt->rx_ia.ri_id->qp, wr,
+ (const struct ib_recv_wr **)&bad_wr);
if (rc) {
for (wr = bad_wr; wr; wr = wr->next) {
struct rpcrdma_rep *rep;
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 9e1c5024aba9..6b7539c0466e 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -3375,4 +3375,3 @@ module_param_named(tcp_max_slot_table_entries, xprt_max_tcp_slot_table_entries,
max_slot_table_size, 0644);
module_param_named(udp_slot_table_entries, xprt_udp_slot_table_entries,
slot_table_size, 0644);
-
diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
index f3711176be45..d8026543bf4c 100644
--- a/net/tipc/bcast.c
+++ b/net/tipc/bcast.c
@@ -51,12 +51,12 @@ const char tipc_bclink_name[] = "broadcast-link";
* struct tipc_bc_base - base structure for keeping broadcast send state
* @link: broadcast send link structure
* @inputq: data input queue; will only carry SOCK_WAKEUP messages
- * @dest: array keeping number of reachable destinations per bearer
+ * @dests: array keeping number of reachable destinations per bearer
* @primary_bearer: a bearer having links to all broadcast destinations, if any
* @bcast_support: indicates if primary bearer, if any, supports broadcast
* @rcast_support: indicates if all peer nodes support replicast
* @rc_ratio: dest count as percentage of cluster size where send method changes
- * @bc_threshold: calculated drom rc_ratio; if dests > threshold use broadcast
+ * @bc_threshold: calculated from rc_ratio; if dests > threshold use broadcast
*/
struct tipc_bc_base {
struct tipc_link *link;
@@ -512,7 +512,7 @@ int tipc_bcast_init(struct net *net)
struct tipc_bc_base *bb = NULL;
struct tipc_link *l = NULL;
- bb = kzalloc(sizeof(*bb), GFP_ATOMIC);
+ bb = kzalloc(sizeof(*bb), GFP_KERNEL);
if (!bb)
goto enomem;
tn->bcbase = bb;
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index 2dfb492a7c94..e65c3a8551e4 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -395,6 +395,7 @@ int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b,
tipc_net_init(net, node_id, 0);
}
if (!tipc_own_id(net)) {
+ dev_put(dev);
pr_warn("Failed to obtain node identity\n");
return -EINVAL;
}
@@ -576,7 +577,7 @@ static int tipc_l2_rcv_msg(struct sk_buff *skb, struct net_device *dev,
rcu_dereference_rtnl(orig_dev->tipc_ptr);
if (likely(b && test_bit(0, &b->up) &&
(skb->pkt_type <= PACKET_MULTICAST))) {
- skb->next = NULL;
+ skb_mark_not_on_list(skb);
tipc_rcv(dev_net(b->pt.dev), skb, b);
rcu_read_unlock();
return NET_RX_SUCCESS;
@@ -608,15 +609,18 @@ static int tipc_l2_device_event(struct notifier_block *nb, unsigned long evt,
switch (evt) {
case NETDEV_CHANGE:
- if (netif_carrier_ok(dev))
+ if (netif_carrier_ok(dev) && netif_oper_up(dev)) {
+ test_and_set_bit_lock(0, &b->up);
break;
- case NETDEV_UP:
- test_and_set_bit_lock(0, &b->up);
- break;
+ }
+ /* fall through */
case NETDEV_GOING_DOWN:
clear_bit_unlock(0, &b->up);
tipc_reset_bearer(net, b);
break;
+ case NETDEV_UP:
+ test_and_set_bit_lock(0, &b->up);
+ break;
case NETDEV_CHANGEMTU:
if (tipc_mtu_bad(dev, 0)) {
bearer_disable(net, b);
diff --git a/net/tipc/diag.c b/net/tipc/diag.c
index aaabb0b776dd..73137f4aeb68 100644
--- a/net/tipc/diag.c
+++ b/net/tipc/diag.c
@@ -84,7 +84,9 @@ static int tipc_sock_diag_handler_dump(struct sk_buff *skb,
if (h->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
+ .start = tipc_dump_start,
.dump = tipc_diag_dump,
+ .done = tipc_dump_done,
};
netlink_dump_start(net->diag_nlsk, skb, h, &c);
return 0;
diff --git a/net/tipc/group.c b/net/tipc/group.c
index d7a7befeddd4..06fee142f09f 100644
--- a/net/tipc/group.c
+++ b/net/tipc/group.c
@@ -159,11 +159,6 @@ u32 tipc_group_exclude(struct tipc_group *grp)
return 0;
}
-int tipc_group_size(struct tipc_group *grp)
-{
- return grp->member_cnt;
-}
-
struct tipc_group *tipc_group_create(struct net *net, u32 portid,
struct tipc_group_req *mreq,
bool *group_is_open)
@@ -232,8 +227,8 @@ void tipc_group_delete(struct net *net, struct tipc_group *grp)
kfree(grp);
}
-struct tipc_member *tipc_group_find_member(struct tipc_group *grp,
- u32 node, u32 port)
+static struct tipc_member *tipc_group_find_member(struct tipc_group *grp,
+ u32 node, u32 port)
{
struct rb_node *n = grp->members.rb_node;
u64 nkey, key = (u64)node << 32 | port;
@@ -671,6 +666,7 @@ static void tipc_group_create_event(struct tipc_group *grp,
struct sk_buff *skb;
struct tipc_msg *hdr;
+ memset(&evt, 0, sizeof(evt));
evt.event = event;
evt.found_lower = m->instance;
evt.found_upper = m->instance;
@@ -918,3 +914,35 @@ void tipc_group_member_evt(struct tipc_group *grp,
}
*sk_rcvbuf = tipc_group_rcvbuf_limit(grp);
}
+
+int tipc_group_fill_sock_diag(struct tipc_group *grp, struct sk_buff *skb)
+{
+ struct nlattr *group = nla_nest_start(skb, TIPC_NLA_SOCK_GROUP);
+
+ if (nla_put_u32(skb, TIPC_NLA_SOCK_GROUP_ID,
+ grp->type) ||
+ nla_put_u32(skb, TIPC_NLA_SOCK_GROUP_INSTANCE,
+ grp->instance) ||
+ nla_put_u32(skb, TIPC_NLA_SOCK_GROUP_BC_SEND_NEXT,
+ grp->bc_snd_nxt))
+ goto group_msg_cancel;
+
+ if (grp->scope == TIPC_NODE_SCOPE)
+ if (nla_put_flag(skb, TIPC_NLA_SOCK_GROUP_NODE_SCOPE))
+ goto group_msg_cancel;
+
+ if (grp->scope == TIPC_CLUSTER_SCOPE)
+ if (nla_put_flag(skb, TIPC_NLA_SOCK_GROUP_CLUSTER_SCOPE))
+ goto group_msg_cancel;
+
+ if (*grp->open)
+ if (nla_put_flag(skb, TIPC_NLA_SOCK_GROUP_OPEN))
+ goto group_msg_cancel;
+
+ nla_nest_end(skb, group);
+ return 0;
+
+group_msg_cancel:
+ nla_nest_cancel(skb, group);
+ return -1;
+}
diff --git a/net/tipc/group.h b/net/tipc/group.h
index 5996af6e9f1d..76b4e5a7b39d 100644
--- a/net/tipc/group.h
+++ b/net/tipc/group.h
@@ -72,4 +72,5 @@ void tipc_group_update_rcv_win(struct tipc_group *grp, int blks, u32 node,
u32 port, struct sk_buff_head *xmitq);
u16 tipc_group_bc_snd_nxt(struct tipc_group *grp);
void tipc_group_update_member(struct tipc_member *m, int len);
+int tipc_group_fill_sock_diag(struct tipc_group *grp, struct sk_buff *skb);
#endif
diff --git a/net/tipc/link.c b/net/tipc/link.c
index 695acb783969..201c3b5bc96b 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -106,7 +106,8 @@ struct tipc_stats {
* @backlogq: queue for messages waiting to be sent
* @snt_nxt: next sequence number to use for outbound messages
* @last_retransmitted: sequence number of most recently retransmitted message
- * @stale_count: # of identical retransmit requests made by peer
+ * @stale_cnt: counter for number of identical retransmit attempts
+ * @stale_limit: time when repeated identical retransmits must force link reset
* @ackers: # of peers that needs to ack each packet before it can be released
* @acked: # last packet acked by a certain peer. Used for broadcast.
* @rcv_nxt: next sequence number to expect for inbound messages
@@ -127,14 +128,17 @@ struct tipc_link {
struct net *net;
/* Management and link supervision data */
- u32 peer_session;
- u32 session;
+ u16 peer_session;
+ u16 session;
+ u16 snd_nxt_state;
+ u16 rcv_nxt_state;
u32 peer_bearer_id;
u32 bearer_id;
u32 tolerance;
u32 abort_limit;
u32 state;
u16 peer_caps;
+ bool in_session;
bool active;
u32 silent_intv_cnt;
char if_name[TIPC_MAX_IF_NAME];
@@ -161,7 +165,8 @@ struct tipc_link {
u16 snd_nxt;
u16 last_retransm;
u16 window;
- u32 stale_count;
+ u16 stale_cnt;
+ unsigned long stale_limit;
/* Reception */
u16 rcv_nxt;
@@ -212,11 +217,6 @@ enum {
*/
#define TIPC_NACK_INTV (TIPC_MIN_LINK_WIN * 2)
-/* Wildcard value for link session numbers. When it is known that
- * peer endpoint is down, any session number must be accepted.
- */
-#define ANY_SESSION 0x10000
-
/* Link FSM states:
*/
enum {
@@ -297,11 +297,6 @@ static bool link_is_bc_rcvlink(struct tipc_link *l)
return ((l->bc_rcvlink == l) && !link_is_bc_sndlink(l));
}
-int tipc_link_is_active(struct tipc_link *l)
-{
- return l->active;
-}
-
void tipc_link_set_active(struct tipc_link *l, bool active)
{
l->active = active;
@@ -337,6 +332,11 @@ char tipc_link_plane(struct tipc_link *l)
return l->net_plane;
}
+void tipc_link_update_caps(struct tipc_link *l, u16 capabilities)
+{
+ l->peer_caps = capabilities;
+}
+
void tipc_link_add_bc_peer(struct tipc_link *snd_l,
struct tipc_link *uc_l,
struct sk_buff_head *xmitq)
@@ -373,7 +373,7 @@ int tipc_link_bc_peers(struct tipc_link *l)
return l->ackers;
}
-u16 link_bc_rcv_gap(struct tipc_link *l)
+static u16 link_bc_rcv_gap(struct tipc_link *l)
{
struct sk_buff *skb = skb_peek(&l->deferdq);
u16 gap = 0;
@@ -410,6 +410,11 @@ char *tipc_link_name(struct tipc_link *l)
return l->name;
}
+u32 tipc_link_state(struct tipc_link *l)
+{
+ return l->state;
+}
+
/**
* tipc_link_create - create a new link
* @n: pointer to associated node
@@ -469,9 +474,11 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id,
l->addr = peer;
l->peer_caps = peer_caps;
l->net = net;
- l->peer_session = ANY_SESSION;
+ l->in_session = false;
l->bearer_id = bearer_id;
l->tolerance = tolerance;
+ if (bc_rcvlink)
+ bc_rcvlink->tolerance = tolerance;
l->net_plane = net_plane;
l->advertised_mtu = mtu;
l->mtu = mtu;
@@ -820,7 +827,7 @@ static int link_schedule_user(struct tipc_link *l, struct tipc_msg *hdr)
* Wake up a number of waiting users, as permitted by available space
* in the send queue
*/
-void link_prepare_wakeup(struct tipc_link *l)
+static void link_prepare_wakeup(struct tipc_link *l)
{
struct sk_buff *skb, *tmp;
int imp, i = 0;
@@ -838,12 +845,24 @@ void link_prepare_wakeup(struct tipc_link *l)
void tipc_link_reset(struct tipc_link *l)
{
- l->peer_session = ANY_SESSION;
+ struct sk_buff_head list;
+
+ __skb_queue_head_init(&list);
+
+ l->in_session = false;
l->session++;
l->mtu = l->advertised_mtu;
+
+ spin_lock_bh(&l->wakeupq.lock);
+ skb_queue_splice_init(&l->wakeupq, &list);
+ spin_unlock_bh(&l->wakeupq.lock);
+
+ spin_lock_bh(&l->inputq->lock);
+ skb_queue_splice_init(&list, l->inputq);
+ spin_unlock_bh(&l->inputq->lock);
+
__skb_queue_purge(&l->transmq);
__skb_queue_purge(&l->deferdq);
- skb_queue_splice_init(&l->wakeupq, l->inputq);
__skb_queue_purge(&l->backlogq);
l->backlog[TIPC_LOW_IMPORTANCE].len = 0;
l->backlog[TIPC_MEDIUM_IMPORTANCE].len = 0;
@@ -857,10 +876,12 @@ void tipc_link_reset(struct tipc_link *l)
l->rcv_unacked = 0;
l->snd_nxt = 1;
l->rcv_nxt = 1;
+ l->snd_nxt_state = 1;
+ l->rcv_nxt_state = 1;
l->acked = 0;
l->silent_intv_cnt = 0;
l->rst_cnt = 0;
- l->stale_count = 0;
+ l->stale_cnt = 0;
l->bc_peer_is_up = false;
memset(&l->mon_state, 0, sizeof(l->mon_state));
tipc_link_reset_stats(l);
@@ -954,7 +975,8 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list,
return rc;
}
-void tipc_link_advance_backlog(struct tipc_link *l, struct sk_buff_head *xmitq)
+static void tipc_link_advance_backlog(struct tipc_link *l,
+ struct sk_buff_head *xmitq)
{
struct sk_buff *skb, *_skb;
struct tipc_msg *hdr;
@@ -997,39 +1019,42 @@ static void link_retransmit_failure(struct tipc_link *l, struct sk_buff *skb)
msg_seqno(hdr), msg_prevnode(hdr), msg_orignode(hdr));
}
-int tipc_link_retrans(struct tipc_link *l, struct tipc_link *nacker,
- u16 from, u16 to, struct sk_buff_head *xmitq)
+/* tipc_link_retrans() - retransmit one or more packets
+ * @l: the link to transmit on
+ * @r: the receiving link ordering the retransmit. Same as l if unicast
+ * @from: retransmit from (inclusive) this sequence number
+ * @to: retransmit to (inclusive) this sequence number
+ * xmitq: queue for accumulating the retransmitted packets
+ */
+static int tipc_link_retrans(struct tipc_link *l, struct tipc_link *r,
+ u16 from, u16 to, struct sk_buff_head *xmitq)
{
struct sk_buff *_skb, *skb = skb_peek(&l->transmq);
- struct tipc_msg *hdr;
- u16 ack = l->rcv_nxt - 1;
u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1;
+ u16 ack = l->rcv_nxt - 1;
+ struct tipc_msg *hdr;
if (!skb)
return 0;
/* Detect repeated retransmit failures on same packet */
- if (nacker->last_retransm != buf_seqno(skb)) {
- nacker->last_retransm = buf_seqno(skb);
- nacker->stale_count = 1;
- } else if (++nacker->stale_count > 100) {
+ if (r->last_retransm != buf_seqno(skb)) {
+ r->last_retransm = buf_seqno(skb);
+ r->stale_limit = jiffies + msecs_to_jiffies(r->tolerance);
+ r->stale_cnt = 0;
+ } else if (++r->stale_cnt > 99 && time_after(jiffies, r->stale_limit)) {
link_retransmit_failure(l, skb);
- nacker->stale_count = 0;
if (link_is_bc_sndlink(l))
return TIPC_LINK_DOWN_EVT;
return tipc_link_fsm_evt(l, LINK_FAILURE_EVT);
}
- /* Move forward to where retransmission should start */
skb_queue_walk(&l->transmq, skb) {
- if (!less(buf_seqno(skb), from))
- break;
- }
-
- skb_queue_walk_from(&l->transmq, skb) {
- if (more(buf_seqno(skb), to))
- break;
hdr = buf_msg(skb);
+ if (less(msg_seqno(hdr), from))
+ continue;
+ if (more(msg_seqno(hdr), to))
+ break;
_skb = __pskb_copy(skb, MIN_H_SIZE, GFP_ATOMIC);
if (!_skb)
return 0;
@@ -1063,6 +1088,7 @@ static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb,
skb_queue_tail(mc_inputq, skb);
return true;
}
+ /* else: fall through */
case CONN_MANAGER:
skb_queue_tail(inputq, skb);
return true;
@@ -1271,6 +1297,7 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb,
/* Forward queues and wake up waiting users */
if (likely(tipc_link_release_pkts(l, msg_ack(hdr)))) {
+ l->stale_cnt = 0;
tipc_link_advance_backlog(l, xmitq);
if (unlikely(!skb_queue_empty(&l->wakeupq)))
link_prepare_wakeup(l);
@@ -1347,6 +1374,8 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe,
msg_set_seqno(hdr, l->snd_nxt + U16_MAX / 2);
if (mtyp == STATE_MSG) {
+ if (l->peer_caps & TIPC_LINK_PROTO_SEQNO)
+ msg_set_seqno(hdr, l->snd_nxt_state++);
msg_set_seq_gap(hdr, rcvgap);
msg_set_bc_gap(hdr, link_bc_rcv_gap(bcl));
msg_set_probe(hdr, probe);
@@ -1371,6 +1400,36 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe,
__skb_queue_tail(xmitq, skb);
}
+void tipc_link_create_dummy_tnl_msg(struct tipc_link *l,
+ struct sk_buff_head *xmitq)
+{
+ u32 onode = tipc_own_addr(l->net);
+ struct tipc_msg *hdr, *ihdr;
+ struct sk_buff_head tnlq;
+ struct sk_buff *skb;
+ u32 dnode = l->addr;
+
+ skb_queue_head_init(&tnlq);
+ skb = tipc_msg_create(TUNNEL_PROTOCOL, FAILOVER_MSG,
+ INT_H_SIZE, BASIC_H_SIZE,
+ dnode, onode, 0, 0, 0);
+ if (!skb) {
+ pr_warn("%sunable to create tunnel packet\n", link_co_err);
+ return;
+ }
+
+ hdr = buf_msg(skb);
+ msg_set_msgcnt(hdr, 1);
+ msg_set_bearer_id(hdr, l->peer_bearer_id);
+
+ ihdr = (struct tipc_msg *)msg_data(hdr);
+ tipc_msg_init(onode, ihdr, TIPC_LOW_IMPORTANCE, TIPC_DIRECT_MSG,
+ BASIC_H_SIZE, dnode);
+ msg_set_errcode(ihdr, TIPC_ERR_NO_PORT);
+ __skb_queue_tail(&tnlq, skb);
+ tipc_link_xmit(l, &tnlq, xmitq);
+}
+
/* tipc_link_tnl_prepare(): prepare and return a list of tunnel packets
* with contents of the link's transmit and backlog queues.
*/
@@ -1438,6 +1497,47 @@ tnl:
}
}
+/* tipc_link_validate_msg(): validate message against current link state
+ * Returns true if message should be accepted, otherwise false
+ */
+bool tipc_link_validate_msg(struct tipc_link *l, struct tipc_msg *hdr)
+{
+ u16 curr_session = l->peer_session;
+ u16 session = msg_session(hdr);
+ int mtyp = msg_type(hdr);
+
+ if (msg_user(hdr) != LINK_PROTOCOL)
+ return true;
+
+ switch (mtyp) {
+ case RESET_MSG:
+ if (!l->in_session)
+ return true;
+ /* Accept only RESET with new session number */
+ return more(session, curr_session);
+ case ACTIVATE_MSG:
+ if (!l->in_session)
+ return true;
+ /* Accept only ACTIVATE with new or current session number */
+ return !less(session, curr_session);
+ case STATE_MSG:
+ /* Accept only STATE with current session number */
+ if (!l->in_session)
+ return false;
+ if (session != curr_session)
+ return false;
+ /* Extra sanity check */
+ if (!link_is_up(l) && msg_ack(hdr))
+ return false;
+ if (!(l->peer_caps & TIPC_LINK_PROTO_SEQNO))
+ return true;
+ /* Accept only STATE with new sequence number */
+ return !less(msg_seqno(hdr), l->rcv_nxt_state);
+ default:
+ return false;
+ }
+}
+
/* tipc_link_proto_rcv(): receive link level protocol message :
* Note that network plane id propagates through the network, and may
* change at any time. The node with lowest numerical id determines
@@ -1471,17 +1571,12 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb,
hdr = buf_msg(skb);
data = msg_data(hdr);
+ if (!tipc_link_validate_msg(l, hdr))
+ goto exit;
+
switch (mtyp) {
case RESET_MSG:
-
- /* Ignore duplicate RESET with old session number */
- if ((less_eq(msg_session(hdr), l->peer_session)) &&
- (l->peer_session != ANY_SESSION))
- break;
- /* fall thru' */
-
case ACTIVATE_MSG:
-
/* Complete own link name with peer's interface name */
if_name = strrchr(l->name, ':') + 1;
if (sizeof(l->name) - (if_name - l->name) <= TIPC_MAX_IF_NAME)
@@ -1491,9 +1586,10 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb,
strncpy(if_name, data, TIPC_MAX_IF_NAME);
/* Update own tolerance if peer indicates a non-zero value */
- if (in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL))
+ if (in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL)) {
l->tolerance = peers_tol;
-
+ l->bc_rcvlink->tolerance = peers_tol;
+ }
/* Update own priority if peer's priority is higher */
if (in_range(peers_prio, l->priority + 1, TIPC_MAX_LINK_PRI))
l->priority = peers_prio;
@@ -1509,17 +1605,20 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb,
rc = TIPC_LINK_UP_EVT;
l->peer_session = msg_session(hdr);
+ l->in_session = true;
l->peer_bearer_id = msg_bearer_id(hdr);
if (l->mtu > msg_max_pkt(hdr))
l->mtu = msg_max_pkt(hdr);
break;
case STATE_MSG:
+ l->rcv_nxt_state = msg_seqno(hdr) + 1;
/* Update own tolerance if peer indicates a non-zero value */
- if (in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL))
+ if (in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL)) {
l->tolerance = peers_tol;
-
+ l->bc_rcvlink->tolerance = peers_tol;
+ }
/* Update own prio if peer indicates a different value */
if ((peers_prio != l->priority) &&
in_range(peers_prio, 1, TIPC_MAX_LINK_PRI)) {
@@ -2136,6 +2235,8 @@ void tipc_link_set_tolerance(struct tipc_link *l, u32 tol,
struct sk_buff_head *xmitq)
{
l->tolerance = tol;
+ if (l->bc_rcvlink)
+ l->bc_rcvlink->tolerance = tol;
if (link_is_up(l))
tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, tol, 0, xmitq);
}
diff --git a/net/tipc/link.h b/net/tipc/link.h
index ec59348a81e8..90488c538a4e 100644
--- a/net/tipc/link.h
+++ b/net/tipc/link.h
@@ -88,6 +88,8 @@ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer,
struct tipc_link **link);
void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl,
int mtyp, struct sk_buff_head *xmitq);
+void tipc_link_create_dummy_tnl_msg(struct tipc_link *tnl,
+ struct sk_buff_head *xmitq);
void tipc_link_build_reset_msg(struct tipc_link *l, struct sk_buff_head *xmitq);
int tipc_link_fsm_evt(struct tipc_link *l, int evt);
bool tipc_link_is_up(struct tipc_link *l);
@@ -107,9 +109,12 @@ u16 tipc_link_rcv_nxt(struct tipc_link *l);
u16 tipc_link_acked(struct tipc_link *l);
u32 tipc_link_id(struct tipc_link *l);
char *tipc_link_name(struct tipc_link *l);
+u32 tipc_link_state(struct tipc_link *l);
char tipc_link_plane(struct tipc_link *l);
int tipc_link_prio(struct tipc_link *l);
int tipc_link_window(struct tipc_link *l);
+void tipc_link_update_caps(struct tipc_link *l, u16 capabilities);
+bool tipc_link_validate_msg(struct tipc_link *l, struct tipc_msg *hdr);
unsigned long tipc_link_tolerance(struct tipc_link *l);
void tipc_link_set_tolerance(struct tipc_link *l, u32 tol,
struct sk_buff_head *xmitq);
diff --git a/net/tipc/monitor.c b/net/tipc/monitor.c
index 5453e564da82..67f69389ec17 100644
--- a/net/tipc/monitor.c
+++ b/net/tipc/monitor.c
@@ -684,7 +684,8 @@ int tipc_nl_monitor_get_threshold(struct net *net)
return tn->mon_threshold;
}
-int __tipc_nl_add_monitor_peer(struct tipc_peer *peer, struct tipc_nl_msg *msg)
+static int __tipc_nl_add_monitor_peer(struct tipc_peer *peer,
+ struct tipc_nl_msg *msg)
{
struct tipc_mon_domain *dom = peer->domain;
struct nlattr *attrs;
diff --git a/net/tipc/msg.c b/net/tipc/msg.c
index b6c45dccba3d..f48e5857210f 100644
--- a/net/tipc/msg.c
+++ b/net/tipc/msg.c
@@ -416,26 +416,31 @@ bool tipc_msg_bundle(struct sk_buff *skb, struct tipc_msg *msg, u32 mtu)
*/
bool tipc_msg_extract(struct sk_buff *skb, struct sk_buff **iskb, int *pos)
{
- struct tipc_msg *msg;
- int imsz, offset;
+ struct tipc_msg *hdr, *ihdr;
+ int imsz;
*iskb = NULL;
if (unlikely(skb_linearize(skb)))
goto none;
- msg = buf_msg(skb);
- offset = msg_hdr_sz(msg) + *pos;
- if (unlikely(offset > (msg_size(msg) - MIN_H_SIZE)))
+ hdr = buf_msg(skb);
+ if (unlikely(*pos > (msg_data_sz(hdr) - MIN_H_SIZE)))
goto none;
- *iskb = skb_clone(skb, GFP_ATOMIC);
- if (unlikely(!*iskb))
+ ihdr = (struct tipc_msg *)(msg_data(hdr) + *pos);
+ imsz = msg_size(ihdr);
+
+ if ((*pos + imsz) > msg_data_sz(hdr))
goto none;
- skb_pull(*iskb, offset);
- imsz = msg_size(buf_msg(*iskb));
- skb_trim(*iskb, imsz);
+
+ *iskb = tipc_buf_acquire(imsz, GFP_ATOMIC);
+ if (!*iskb)
+ goto none;
+
+ skb_copy_to_linear_data(*iskb, ihdr, imsz);
if (unlikely(!tipc_msg_validate(iskb)))
goto none;
+
*pos += align(imsz);
return true;
none:
@@ -494,60 +499,56 @@ bool tipc_msg_make_bundle(struct sk_buff **skb, struct tipc_msg *msg,
/**
* tipc_msg_reverse(): swap source and destination addresses and add error code
* @own_node: originating node id for reversed message
- * @skb: buffer containing message to be reversed; may be replaced.
+ * @skb: buffer containing message to be reversed; will be consumed
* @err: error code to be set in message, if any
- * Consumes buffer at failure
+ * Replaces consumed buffer with new one when successful
* Returns true if success, otherwise false
*/
bool tipc_msg_reverse(u32 own_node, struct sk_buff **skb, int err)
{
struct sk_buff *_skb = *skb;
- struct tipc_msg *hdr;
- struct tipc_msg ohdr;
- int dlen;
+ struct tipc_msg *_hdr, *hdr;
+ int hlen, dlen;
if (skb_linearize(_skb))
goto exit;
- hdr = buf_msg(_skb);
- dlen = min_t(uint, msg_data_sz(hdr), MAX_FORWARD_SIZE);
- if (msg_dest_droppable(hdr))
+ _hdr = buf_msg(_skb);
+ dlen = min_t(uint, msg_data_sz(_hdr), MAX_FORWARD_SIZE);
+ hlen = msg_hdr_sz(_hdr);
+
+ if (msg_dest_droppable(_hdr))
goto exit;
- if (msg_errcode(hdr))
+ if (msg_errcode(_hdr))
goto exit;
- /* Take a copy of original header before altering message */
- memcpy(&ohdr, hdr, msg_hdr_sz(hdr));
-
- /* Never return SHORT header; expand by replacing buffer if necessary */
- if (msg_short(hdr)) {
- *skb = tipc_buf_acquire(BASIC_H_SIZE + dlen, GFP_ATOMIC);
- if (!*skb)
- goto exit;
- memcpy((*skb)->data + BASIC_H_SIZE, msg_data(hdr), dlen);
- kfree_skb(_skb);
- _skb = *skb;
- hdr = buf_msg(_skb);
- memcpy(hdr, &ohdr, BASIC_H_SIZE);
- msg_set_hdr_sz(hdr, BASIC_H_SIZE);
- }
+ /* Never return SHORT header */
+ if (hlen == SHORT_H_SIZE)
+ hlen = BASIC_H_SIZE;
- if (skb_cloned(_skb) &&
- pskb_expand_head(_skb, BUF_HEADROOM, BUF_TAILROOM, GFP_ATOMIC))
+ /* Don't return data along with SYN+, - sender has a clone */
+ if (msg_is_syn(_hdr) && err == TIPC_ERR_OVERLOAD)
+ dlen = 0;
+
+ /* Allocate new buffer to return */
+ *skb = tipc_buf_acquire(hlen + dlen, GFP_ATOMIC);
+ if (!*skb)
goto exit;
+ memcpy((*skb)->data, _skb->data, msg_hdr_sz(_hdr));
+ memcpy((*skb)->data + hlen, msg_data(_hdr), dlen);
- /* reassign after skb header modifications */
- hdr = buf_msg(_skb);
- /* Now reverse the concerned fields */
+ /* Build reverse header in new buffer */
+ hdr = buf_msg(*skb);
+ msg_set_hdr_sz(hdr, hlen);
msg_set_errcode(hdr, err);
msg_set_non_seq(hdr, 0);
- msg_set_origport(hdr, msg_destport(&ohdr));
- msg_set_destport(hdr, msg_origport(&ohdr));
- msg_set_destnode(hdr, msg_prevnode(&ohdr));
+ msg_set_origport(hdr, msg_destport(_hdr));
+ msg_set_destport(hdr, msg_origport(_hdr));
+ msg_set_destnode(hdr, msg_prevnode(_hdr));
msg_set_prevnode(hdr, own_node);
msg_set_orignode(hdr, own_node);
- msg_set_size(hdr, msg_hdr_sz(hdr) + dlen);
- skb_trim(_skb, msg_size(hdr));
+ msg_set_size(hdr, hlen + dlen);
skb_orphan(_skb);
+ kfree_skb(_skb);
return true;
exit:
kfree_skb(_skb);
@@ -555,6 +556,22 @@ exit:
return false;
}
+bool tipc_msg_skb_clone(struct sk_buff_head *msg, struct sk_buff_head *cpy)
+{
+ struct sk_buff *skb, *_skb;
+
+ skb_queue_walk(msg, skb) {
+ _skb = skb_clone(skb, GFP_ATOMIC);
+ if (!_skb) {
+ __skb_queue_purge(cpy);
+ pr_err_ratelimited("Failed to clone buffer chain\n");
+ return false;
+ }
+ __skb_queue_tail(cpy, _skb);
+ }
+ return true;
+}
+
/**
* tipc_msg_lookup_dest(): try to find new destination for named message
* @skb: the buffer containing the message.
@@ -595,10 +612,6 @@ bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err)
if (!skb_cloned(skb))
return true;
- /* Unclone buffer in case it was bundled */
- if (pskb_expand_head(skb, BUF_HEADROOM, BUF_TAILROOM, GFP_ATOMIC))
- return false;
-
return true;
}
diff --git a/net/tipc/msg.h b/net/tipc/msg.h
index a4e944d59394..a2879e6ec5b6 100644
--- a/net/tipc/msg.h
+++ b/net/tipc/msg.h
@@ -216,6 +216,16 @@ static inline void msg_set_non_seq(struct tipc_msg *m, u32 n)
msg_set_bits(m, 0, 20, 1, n);
}
+static inline int msg_is_syn(struct tipc_msg *m)
+{
+ return msg_bits(m, 0, 17, 1);
+}
+
+static inline void msg_set_syn(struct tipc_msg *m, u32 d)
+{
+ msg_set_bits(m, 0, 17, 1, d);
+}
+
static inline int msg_dest_droppable(struct tipc_msg *m)
{
return msg_bits(m, 0, 19, 1);
@@ -970,6 +980,7 @@ bool tipc_msg_pskb_copy(u32 dst, struct sk_buff_head *msg,
struct sk_buff_head *cpy);
void __tipc_skb_queue_sorted(struct sk_buff_head *list, u16 seqno,
struct sk_buff *skb);
+bool tipc_msg_skb_clone(struct sk_buff_head *msg, struct sk_buff_head *cpy);
static inline u16 buf_seqno(struct sk_buff *skb)
{
diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c
index 51b4b96f89db..61219f0b9677 100644
--- a/net/tipc/name_distr.c
+++ b/net/tipc/name_distr.c
@@ -94,8 +94,9 @@ struct sk_buff *tipc_named_publish(struct net *net, struct publication *publ)
list_add_tail_rcu(&publ->binding_node, &nt->node_scope);
return NULL;
}
- list_add_tail_rcu(&publ->binding_node, &nt->cluster_scope);
-
+ write_lock_bh(&nt->cluster_scope_lock);
+ list_add_tail(&publ->binding_node, &nt->cluster_scope);
+ write_unlock_bh(&nt->cluster_scope_lock);
skb = named_prepare_buf(net, PUBLICATION, ITEM_SIZE, 0);
if (!skb) {
pr_warn("Publication distribution failure\n");
@@ -112,11 +113,13 @@ struct sk_buff *tipc_named_publish(struct net *net, struct publication *publ)
*/
struct sk_buff *tipc_named_withdraw(struct net *net, struct publication *publ)
{
+ struct name_table *nt = tipc_name_table(net);
struct sk_buff *buf;
struct distr_item *item;
+ write_lock_bh(&nt->cluster_scope_lock);
list_del(&publ->binding_node);
-
+ write_unlock_bh(&nt->cluster_scope_lock);
if (publ->scope == TIPC_NODE_SCOPE)
return NULL;
@@ -189,11 +192,10 @@ void tipc_named_node_up(struct net *net, u32 dnode)
__skb_queue_head_init(&head);
- rcu_read_lock();
+ read_lock_bh(&nt->cluster_scope_lock);
named_distribute(net, &head, dnode, &nt->cluster_scope);
- rcu_read_unlock();
-
tipc_node_xmit(net, &head, dnode, 0);
+ read_unlock_bh(&nt->cluster_scope_lock);
}
/**
diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index bebe88cae07b..bff241f03525 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -735,7 +735,7 @@ int tipc_nametbl_init(struct net *net)
struct name_table *nt;
int i;
- nt = kzalloc(sizeof(*nt), GFP_ATOMIC);
+ nt = kzalloc(sizeof(*nt), GFP_KERNEL);
if (!nt)
return -ENOMEM;
@@ -744,6 +744,7 @@ int tipc_nametbl_init(struct net *net)
INIT_LIST_HEAD(&nt->node_scope);
INIT_LIST_HEAD(&nt->cluster_scope);
+ rwlock_init(&nt->cluster_scope_lock);
tn->nametbl = nt;
spin_lock_init(&tn->nametbl_lock);
return 0;
@@ -980,20 +981,17 @@ int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb)
struct tipc_dest *tipc_dest_find(struct list_head *l, u32 node, u32 port)
{
- u64 value = (u64)node << 32 | port;
struct tipc_dest *dst;
list_for_each_entry(dst, l, list) {
- if (dst->value != value)
- continue;
- return dst;
+ if (dst->node == node && dst->port == port)
+ return dst;
}
return NULL;
}
bool tipc_dest_push(struct list_head *l, u32 node, u32 port)
{
- u64 value = (u64)node << 32 | port;
struct tipc_dest *dst;
if (tipc_dest_find(l, node, port))
@@ -1002,7 +1000,8 @@ bool tipc_dest_push(struct list_head *l, u32 node, u32 port)
dst = kmalloc(sizeof(*dst), GFP_ATOMIC);
if (unlikely(!dst))
return false;
- dst->value = value;
+ dst->node = node;
+ dst->port = port;
list_add(&dst->list, l);
return true;
}
diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h
index 0febba41da86..f79066334cc8 100644
--- a/net/tipc/name_table.h
+++ b/net/tipc/name_table.h
@@ -100,6 +100,7 @@ struct name_table {
struct hlist_head services[TIPC_NAMETBL_SIZE];
struct list_head node_scope;
struct list_head cluster_scope;
+ rwlock_t cluster_scope_lock;
u32 local_publ_count;
};
@@ -133,13 +134,8 @@ void tipc_nametbl_stop(struct net *net);
struct tipc_dest {
struct list_head list;
- union {
- struct {
- u32 port;
- u32 node;
- };
- u64 value;
- };
+ u32 port;
+ u32 node;
};
struct tipc_dest *tipc_dest_find(struct list_head *l, u32 node, u32 port);
diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c
index 6ff2254088f6..99ee419210ba 100644
--- a/net/tipc/netlink.c
+++ b/net/tipc/netlink.c
@@ -167,7 +167,9 @@ static const struct genl_ops tipc_genl_v2_ops[] = {
},
{
.cmd = TIPC_NL_SOCK_GET,
+ .start = tipc_dump_start,
.dumpit = tipc_nl_sk_dump,
+ .done = tipc_dump_done,
.policy = tipc_nl_policy,
},
{
diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c
index a2f76743c73a..6376467e78f8 100644
--- a/net/tipc/netlink_compat.c
+++ b/net/tipc/netlink_compat.c
@@ -185,6 +185,10 @@ static int __tipc_nl_compat_dumpit(struct tipc_nl_compat_cmd_dump *cmd,
return -ENOMEM;
buf->sk = msg->dst_sk;
+ if (__tipc_dump_start(&cb, msg->net)) {
+ kfree_skb(buf);
+ return -ENOMEM;
+ }
do {
int rem;
@@ -216,6 +220,7 @@ static int __tipc_nl_compat_dumpit(struct tipc_nl_compat_cmd_dump *cmd,
err = 0;
err_out:
+ tipc_dump_done(&cb);
kfree_skb(buf);
if (err == -EMSGSIZE) {
diff --git a/net/tipc/node.c b/net/tipc/node.c
index 0453bd451ce8..2afc4f8c37a7 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -45,6 +45,7 @@
#include "netlink.h"
#define INVALID_NODE_SIG 0x10000
+#define NODE_CLEANUP_AFTER 300000
/* Flags used to take different actions according to flag type
* TIPC_NOTIFY_NODE_DOWN: notify node is down
@@ -96,6 +97,7 @@ struct tipc_bclink_entry {
* @link_id: local and remote bearer ids of changing link, if any
* @publ_list: list of publications
* @rcu: rcu struct for tipc_node
+ * @delete_at: indicates the time for deleting a down node
*/
struct tipc_node {
u32 addr;
@@ -109,6 +111,7 @@ struct tipc_node {
int action_flags;
struct list_head list;
int state;
+ bool failover_sent;
u16 sync_point;
int link_cnt;
u16 working_links;
@@ -121,6 +124,7 @@ struct tipc_node {
unsigned long keepalive_intv;
struct timer_list timer;
struct rcu_head rcu;
+ unsigned long delete_at;
};
/* Node FSM states and events:
@@ -160,6 +164,7 @@ static struct tipc_node *tipc_node_find(struct net *net, u32 addr);
static struct tipc_node *tipc_node_find_by_id(struct net *net, u8 *id);
static void tipc_node_put(struct tipc_node *node);
static bool node_is_up(struct tipc_node *n);
+static void tipc_node_delete_from_list(struct tipc_node *node);
struct tipc_sock_conn {
u32 port;
@@ -359,13 +364,24 @@ static struct tipc_node *tipc_node_create(struct net *net, u32 addr,
{
struct tipc_net *tn = net_generic(net, tipc_net_id);
struct tipc_node *n, *temp_node;
+ struct tipc_link *l;
+ int bearer_id;
int i;
spin_lock_bh(&tn->node_list_lock);
n = tipc_node_find(net, addr);
if (n) {
+ if (n->capabilities == capabilities)
+ goto exit;
/* Same node may come back with new capabilities */
+ write_lock_bh(&n->lock);
n->capabilities = capabilities;
+ for (bearer_id = 0; bearer_id < MAX_BEARERS; bearer_id++) {
+ l = n->links[bearer_id].link;
+ if (l)
+ tipc_link_update_caps(l, capabilities);
+ }
+ write_unlock_bh(&n->lock);
goto exit;
}
n = kzalloc(sizeof(*n), GFP_ATOMIC);
@@ -390,6 +406,7 @@ static struct tipc_node *tipc_node_create(struct net *net, u32 addr,
for (i = 0; i < MAX_BEARERS; i++)
spin_lock_init(&n->links[i].lock);
n->state = SELF_DOWN_PEER_LEAVING;
+ n->delete_at = jiffies + msecs_to_jiffies(NODE_CLEANUP_AFTER);
n->signature = INVALID_NODE_SIG;
n->active_links[0] = INVALID_BEARER_ID;
n->active_links[1] = INVALID_BEARER_ID;
@@ -433,11 +450,16 @@ static void tipc_node_calculate_timer(struct tipc_node *n, struct tipc_link *l)
tipc_link_set_abort_limit(l, tol / n->keepalive_intv);
}
-static void tipc_node_delete(struct tipc_node *node)
+static void tipc_node_delete_from_list(struct tipc_node *node)
{
list_del_rcu(&node->list);
hlist_del_rcu(&node->hash);
tipc_node_put(node);
+}
+
+static void tipc_node_delete(struct tipc_node *node)
+{
+ tipc_node_delete_from_list(node);
del_timer_sync(&node->timer);
tipc_node_put(node);
@@ -544,6 +566,42 @@ void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port)
tipc_node_put(node);
}
+static void tipc_node_clear_links(struct tipc_node *node)
+{
+ int i;
+
+ for (i = 0; i < MAX_BEARERS; i++) {
+ struct tipc_link_entry *le = &node->links[i];
+
+ if (le->link) {
+ kfree(le->link);
+ le->link = NULL;
+ node->link_cnt--;
+ }
+ }
+}
+
+/* tipc_node_cleanup - delete nodes that does not
+ * have active links for NODE_CLEANUP_AFTER time
+ */
+static int tipc_node_cleanup(struct tipc_node *peer)
+{
+ struct tipc_net *tn = tipc_net(peer->net);
+ bool deleted = false;
+
+ spin_lock_bh(&tn->node_list_lock);
+ tipc_node_write_lock(peer);
+
+ if (!node_is_up(peer) && time_after(jiffies, peer->delete_at)) {
+ tipc_node_clear_links(peer);
+ tipc_node_delete_from_list(peer);
+ deleted = true;
+ }
+ tipc_node_write_unlock(peer);
+ spin_unlock_bh(&tn->node_list_lock);
+ return deleted;
+}
+
/* tipc_node_timeout - handle expiration of node timer
*/
static void tipc_node_timeout(struct timer_list *t)
@@ -551,21 +609,29 @@ static void tipc_node_timeout(struct timer_list *t)
struct tipc_node *n = from_timer(n, t, timer);
struct tipc_link_entry *le;
struct sk_buff_head xmitq;
+ int remains = n->link_cnt;
int bearer_id;
int rc = 0;
+ if (!node_is_up(n) && tipc_node_cleanup(n)) {
+ /*Removing the reference of Timer*/
+ tipc_node_put(n);
+ return;
+ }
+
__skb_queue_head_init(&xmitq);
- for (bearer_id = 0; bearer_id < MAX_BEARERS; bearer_id++) {
+ for (bearer_id = 0; remains && (bearer_id < MAX_BEARERS); bearer_id++) {
tipc_node_read_lock(n);
le = &n->links[bearer_id];
- spin_lock_bh(&le->lock);
if (le->link) {
+ spin_lock_bh(&le->lock);
/* Link tolerance may change asynchronously: */
tipc_node_calculate_timer(n, le->link);
rc = tipc_link_timeout(le->link, &xmitq);
+ spin_unlock_bh(&le->lock);
+ remains--;
}
- spin_unlock_bh(&le->lock);
tipc_node_read_unlock(n);
tipc_bearer_xmit(n->net, bearer_id, &xmitq, &le->maddr);
if (rc & TIPC_LINK_DOWN_EVT)
@@ -615,6 +681,7 @@ static void __tipc_node_link_up(struct tipc_node *n, int bearer_id,
*slot0 = bearer_id;
*slot1 = bearer_id;
tipc_node_fsm_evt(n, SELF_ESTABL_CONTACT_EVT);
+ n->failover_sent = false;
n->action_flags |= TIPC_NOTIFY_NODE_UP;
tipc_link_set_active(nl, true);
tipc_bcast_add_peer(n->net, nl, xmitq);
@@ -846,6 +913,7 @@ void tipc_node_check_dest(struct net *net, u32 addr,
bool reset = true;
char *if_name;
unsigned long intv;
+ u16 session;
*dupl_addr = false;
*respond = false;
@@ -932,9 +1000,10 @@ void tipc_node_check_dest(struct net *net, u32 addr,
goto exit;
if_name = strchr(b->name, ':') + 1;
+ get_random_bytes(&session, sizeof(u16));
if (!tipc_link_create(net, if_name, b->identity, b->tolerance,
b->net_plane, b->mtu, b->priority,
- b->window, mod(tipc_net(net)->random),
+ b->window, session,
tipc_own_addr(net), addr, peer_id,
n->capabilities,
tipc_bc_sndlink(n->net), n->bc_entry.link,
@@ -1174,6 +1243,7 @@ static void node_lost_contact(struct tipc_node *n,
uint i;
pr_debug("Lost contact with %x\n", n->addr);
+ n->delete_at = jiffies + msecs_to_jiffies(NODE_CLEANUP_AFTER);
/* Clean up broadcast state */
tipc_bcast_remove_peer(n->net, n->bc_entry.link);
@@ -1481,7 +1551,7 @@ static void tipc_node_bc_rcv(struct net *net, struct sk_buff *skb, int bearer_id
* tipc_node_check_state - check and if necessary update node state
* @skb: TIPC packet
* @bearer_id: identity of bearer delivering the packet
- * Returns true if state is ok, otherwise consumes buffer and returns false
+ * Returns true if state and msg are ok, otherwise false
*/
static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb,
int bearer_id, struct sk_buff_head *xmitq)
@@ -1515,6 +1585,9 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb,
}
}
+ if (!tipc_link_validate_msg(l, hdr))
+ return false;
+
/* Check and update node accesibility if applicable */
if (state == SELF_UP_PEER_COMING) {
if (!tipc_link_is_up(l))
@@ -1546,6 +1619,14 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb,
tipc_skb_queue_splice_tail_init(tipc_link_inputq(pl),
tipc_link_inputq(l));
}
+ /* If parallel link was already down, and this happened before
+ * the tunnel link came up, FAILOVER was never sent. Ensure that
+ * FAILOVER is sent to get peer out of NODE_FAILINGOVER state.
+ */
+ if (n->state != NODE_FAILINGOVER && !n->failover_sent) {
+ tipc_link_create_dummy_tnl_msg(l, xmitq);
+ n->failover_sent = true;
+ }
/* If pkts arrive out of order, use lowest calculated syncpt */
if (less(syncpt, n->sync_point))
n->sync_point = syncpt;
@@ -1743,7 +1824,6 @@ int tipc_nl_peer_rm(struct sk_buff *skb, struct genl_info *info)
struct tipc_node *peer;
u32 addr;
int err;
- int i;
/* We identify the peer by its net */
if (!info->attrs[TIPC_NLA_NET])
@@ -1778,15 +1858,7 @@ int tipc_nl_peer_rm(struct sk_buff *skb, struct genl_info *info)
goto err_out;
}
- for (i = 0; i < MAX_BEARERS; i++) {
- struct tipc_link_entry *le = &peer->links[i];
-
- if (le->link) {
- kfree(le->link);
- le->link = NULL;
- peer->link_cnt--;
- }
- }
+ tipc_node_clear_links(peer);
tipc_node_write_unlock(peer);
tipc_node_delete(peer);
diff --git a/net/tipc/node.h b/net/tipc/node.h
index 846c8f240872..03f5efb62cfb 100644
--- a/net/tipc/node.h
+++ b/net/tipc/node.h
@@ -45,18 +45,22 @@
/* Optional capabilities supported by this code version
*/
enum {
+ TIPC_SYN_BIT = (1),
TIPC_BCAST_SYNCH = (1 << 1),
TIPC_BCAST_STATE_NACK = (1 << 2),
TIPC_BLOCK_FLOWCTL = (1 << 3),
TIPC_BCAST_RCAST = (1 << 4),
- TIPC_NODE_ID128 = (1 << 5)
+ TIPC_NODE_ID128 = (1 << 5),
+ TIPC_LINK_PROTO_SEQNO = (1 << 6)
};
-#define TIPC_NODE_CAPABILITIES (TIPC_BCAST_SYNCH | \
- TIPC_BCAST_STATE_NACK | \
- TIPC_BCAST_RCAST | \
- TIPC_BLOCK_FLOWCTL | \
- TIPC_NODE_ID128)
+#define TIPC_NODE_CAPABILITIES (TIPC_SYN_BIT | \
+ TIPC_BCAST_SYNCH | \
+ TIPC_BCAST_STATE_NACK | \
+ TIPC_BCAST_RCAST | \
+ TIPC_BLOCK_FLOWCTL | \
+ TIPC_NODE_ID128 | \
+ TIPC_LINK_PROTO_SEQNO)
#define INVALID_BEARER_ID -1
void tipc_node_stop(struct net *net);
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 930852c54d7a..636e6131769d 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -47,7 +47,7 @@
#include "netlink.h"
#include "group.h"
-#define CONN_TIMEOUT_DEFAULT 8000 /* default connect timeout = 8s */
+#define CONN_TIMEOUT_DEFAULT 8000 /* default connect timeout = 8s */
#define CONN_PROBING_INTV msecs_to_jiffies(3600000) /* [ms] => 1 h */
#define TIPC_FWD_MSG 1
#define TIPC_MAX_PORT 0xffffffff
@@ -80,7 +80,6 @@ struct sockaddr_pair {
* @publications: list of publications for port
* @blocking_link: address of the congested link we are currently sleeping on
* @pub_count: total # of publications port has made during its lifetime
- * @probing_state:
* @conn_timeout: the time we can wait for an unresponded setup request
* @dupl_rcvcnt: number of bytes counted twice, in both backlog and rcv queue
* @cong_link_cnt: number of congested links
@@ -102,8 +101,8 @@ struct tipc_sock {
struct list_head cong_links;
struct list_head publications;
u32 pub_count;
- uint conn_timeout;
atomic_t dupl_rcvcnt;
+ u16 conn_timeout;
bool probe_unacked;
u16 cong_link_cnt;
u16 snt_unacked;
@@ -411,7 +410,6 @@ static int tipc_sk_sock_err(struct socket *sock, long *timeout)
static int tipc_sk_create(struct net *net, struct socket *sock,
int protocol, int kern)
{
- struct tipc_net *tn;
const struct proto_ops *ops;
struct sock *sk;
struct tipc_sock *tsk;
@@ -446,7 +444,6 @@ static int tipc_sk_create(struct net *net, struct socket *sock,
INIT_LIST_HEAD(&tsk->publications);
INIT_LIST_HEAD(&tsk->cong_links);
msg = &tsk->phdr;
- tn = net_generic(sock_net(sk), tipc_net_id);
/* Finish initializing socket data structures */
sock->ops = ops;
@@ -509,6 +506,9 @@ static void __tipc_shutdown(struct socket *sock, int error)
tipc_wait_for_cond(sock, &timeout, (!tsk->cong_link_cnt &&
!tsk_conn_cong(tsk)));
+ /* Remove any pending SYN message */
+ __skb_queue_purge(&sk->sk_write_queue);
+
/* Reject all unreceived messages, except on an active connection
* (which disconnects locally & sends a 'FIN+' to peer).
*/
@@ -578,6 +578,7 @@ static int tipc_release(struct socket *sock)
sk_stop_timer(sk, &sk->sk_timer);
tipc_sk_remove(tsk);
+ sock_orphan(sk);
/* Reject any messages that accumulated in backlog queue */
release_sock(sk);
tipc_dest_list_purge(&tsk->cong_links);
@@ -716,7 +717,7 @@ static __poll_t tipc_poll(struct file *file, struct socket *sock,
struct tipc_sock *tsk = tipc_sk(sk);
__poll_t revents = 0;
- sock_poll_wait(file, sk_sleep(sk), wait);
+ sock_poll_wait(file, sock, wait);
if (sk->sk_shutdown & RCV_SHUTDOWN)
revents |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
@@ -1117,7 +1118,7 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq,
u32 self = tipc_own_addr(net);
u32 type, lower, upper, scope;
struct sk_buff *skb, *_skb;
- u32 portid, oport, onode;
+ u32 portid, onode;
struct sk_buff_head tmpq;
struct list_head dports;
struct tipc_msg *hdr;
@@ -1133,7 +1134,6 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq,
user = msg_user(hdr);
mtyp = msg_type(hdr);
hlen = skb_headroom(skb) + msg_hdr_sz(hdr);
- oport = msg_origport(hdr);
onode = msg_orignode(hdr);
type = msg_nametype(hdr);
@@ -1198,6 +1198,7 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq,
* @skb: pointer to message buffer.
*/
static void tipc_sk_conn_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb,
+ struct sk_buff_head *inputq,
struct sk_buff_head *xmitq)
{
struct tipc_msg *hdr = buf_msg(skb);
@@ -1215,7 +1216,16 @@ static void tipc_sk_conn_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb,
tipc_node_remove_conn(sock_net(sk), tsk_peer_node(tsk),
tsk_peer_port(tsk));
sk->sk_state_change(sk);
- goto exit;
+
+ /* State change is ignored if socket already awake,
+ * - convert msg to abort msg and add to inqueue
+ */
+ msg_set_user(hdr, TIPC_CRITICAL_IMPORTANCE);
+ msg_set_type(hdr, TIPC_CONN_MSG);
+ msg_set_size(hdr, BASIC_H_SIZE);
+ msg_set_hdr_sz(hdr, BASIC_H_SIZE);
+ __skb_queue_tail(inputq, skb);
+ return;
}
tsk->probe_unacked = false;
@@ -1321,6 +1331,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen)
tsk->conn_type = dest->addr.name.name.type;
tsk->conn_instance = dest->addr.name.name.instance;
}
+ msg_set_syn(hdr, 1);
}
seq = &dest->addr.nameseq;
@@ -1363,6 +1374,8 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen)
rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts);
if (unlikely(rc != dlen))
return rc;
+ if (unlikely(syn && !tipc_msg_skb_clone(&pkts, &sk->sk_write_queue)))
+ return -ENOMEM;
rc = tipc_node_xmit(net, &pkts, dnode, tsk->portid);
if (unlikely(rc == -ELINKCONG)) {
@@ -1421,8 +1434,10 @@ static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dlen)
/* Handle implicit connection setup */
if (unlikely(dest)) {
rc = __tipc_sendmsg(sock, m, dlen);
- if (dlen && (dlen == rc))
+ if (dlen && dlen == rc) {
+ tsk->peer_caps = tipc_node_get_capabilities(net, dnode);
tsk->snt_unacked = tsk_inc(tsk, dlen + msg_hdr_sz(hdr));
+ }
return rc;
}
@@ -1480,6 +1495,7 @@ static void tipc_sk_finish_conn(struct tipc_sock *tsk, u32 peer_port,
struct net *net = sock_net(sk);
struct tipc_msg *msg = &tsk->phdr;
+ msg_set_syn(msg, 0);
msg_set_destnode(msg, peer_node);
msg_set_destport(msg, peer_port);
msg_set_type(msg, TIPC_CONN_MSG);
@@ -1491,6 +1507,7 @@ static void tipc_sk_finish_conn(struct tipc_sock *tsk, u32 peer_port,
tipc_node_add_conn(net, peer_node, tsk->portid, peer_port);
tsk->max_pkt = tipc_node_get_mtu(net, peer_node, tsk->portid);
tsk->peer_caps = tipc_node_get_capabilities(net, peer_node);
+ __skb_queue_purge(&sk->sk_write_queue);
if (tsk->peer_caps & TIPC_BLOCK_FLOWCTL)
return;
@@ -1936,7 +1953,7 @@ static void tipc_sk_proto_rcv(struct sock *sk,
switch (msg_user(hdr)) {
case CONN_MANAGER:
- tipc_sk_conn_proto_rcv(tsk, skb, xmitq);
+ tipc_sk_conn_proto_rcv(tsk, skb, inputq, xmitq);
return;
case SOCK_WAKEUP:
tipc_dest_del(&tsk->cong_links, msg_orignode(hdr), 0);
@@ -1961,91 +1978,90 @@ static void tipc_sk_proto_rcv(struct sock *sk,
}
/**
- * tipc_filter_connect - Handle incoming message for a connection-based socket
+ * tipc_sk_filter_connect - check incoming message for a connection-based socket
* @tsk: TIPC socket
- * @skb: pointer to message buffer. Set to NULL if buffer is consumed
- *
- * Returns true if everything ok, false otherwise
+ * @skb: pointer to message buffer.
+ * Returns true if message should be added to receive queue, false otherwise
*/
static bool tipc_sk_filter_connect(struct tipc_sock *tsk, struct sk_buff *skb)
{
struct sock *sk = &tsk->sk;
struct net *net = sock_net(sk);
struct tipc_msg *hdr = buf_msg(skb);
- u32 pport = msg_origport(hdr);
- u32 pnode = msg_orignode(hdr);
+ bool con_msg = msg_connected(hdr);
+ u32 pport = tsk_peer_port(tsk);
+ u32 pnode = tsk_peer_node(tsk);
+ u32 oport = msg_origport(hdr);
+ u32 onode = msg_orignode(hdr);
+ int err = msg_errcode(hdr);
+ unsigned long delay;
if (unlikely(msg_mcast(hdr)))
return false;
switch (sk->sk_state) {
case TIPC_CONNECTING:
- /* Accept only ACK or NACK message */
- if (unlikely(!msg_connected(hdr))) {
- if (pport != tsk_peer_port(tsk) ||
- pnode != tsk_peer_node(tsk))
- return false;
-
- tipc_set_sk_state(sk, TIPC_DISCONNECTING);
- sk->sk_err = ECONNREFUSED;
- sk->sk_state_change(sk);
- return true;
- }
-
- if (unlikely(msg_errcode(hdr))) {
- tipc_set_sk_state(sk, TIPC_DISCONNECTING);
- sk->sk_err = ECONNREFUSED;
- sk->sk_state_change(sk);
- return true;
- }
-
- if (unlikely(!msg_isdata(hdr))) {
- tipc_set_sk_state(sk, TIPC_DISCONNECTING);
- sk->sk_err = EINVAL;
- sk->sk_state_change(sk);
- return true;
+ /* Setup ACK */
+ if (likely(con_msg)) {
+ if (err)
+ break;
+ tipc_sk_finish_conn(tsk, oport, onode);
+ msg_set_importance(&tsk->phdr, msg_importance(hdr));
+ /* ACK+ message with data is added to receive queue */
+ if (msg_data_sz(hdr))
+ return true;
+ /* Empty ACK-, - wake up sleeping connect() and drop */
+ sk->sk_data_ready(sk);
+ msg_set_dest_droppable(hdr, 1);
+ return false;
}
+ /* Ignore connectionless message if not from listening socket */
+ if (oport != pport || onode != pnode)
+ return false;
- tipc_sk_finish_conn(tsk, msg_origport(hdr), msg_orignode(hdr));
- msg_set_importance(&tsk->phdr, msg_importance(hdr));
-
- /* If 'ACK+' message, add to socket receive queue */
- if (msg_data_sz(hdr))
- return true;
-
- /* If empty 'ACK-' message, wake up sleeping connect() */
- sk->sk_data_ready(sk);
+ /* Rejected SYN */
+ if (err != TIPC_ERR_OVERLOAD)
+ break;
- /* 'ACK-' message is neither accepted nor rejected: */
- msg_set_dest_droppable(hdr, 1);
+ /* Prepare for new setup attempt if we have a SYN clone */
+ if (skb_queue_empty(&sk->sk_write_queue))
+ break;
+ get_random_bytes(&delay, 2);
+ delay %= (tsk->conn_timeout / 4);
+ delay = msecs_to_jiffies(delay + 100);
+ sk_reset_timer(sk, &sk->sk_timer, jiffies + delay);
return false;
-
case TIPC_OPEN:
case TIPC_DISCONNECTING:
- break;
+ return false;
case TIPC_LISTEN:
/* Accept only SYN message */
- if (!msg_connected(hdr) && !(msg_errcode(hdr)))
+ if (!msg_is_syn(hdr) &&
+ tipc_node_get_capabilities(net, onode) & TIPC_SYN_BIT)
+ return false;
+ if (!con_msg && !err)
return true;
- break;
+ return false;
case TIPC_ESTABLISHED:
/* Accept only connection-based messages sent by peer */
- if (unlikely(!tsk_peer_msg(tsk, hdr)))
+ if (likely(con_msg && !err && pport == oport && pnode == onode))
+ return true;
+ if (!tsk_peer_msg(tsk, hdr))
return false;
-
- if (unlikely(msg_errcode(hdr))) {
- tipc_set_sk_state(sk, TIPC_DISCONNECTING);
- /* Let timer expire on it's own */
- tipc_node_remove_conn(net, tsk_peer_node(tsk),
- tsk->portid);
- sk->sk_state_change(sk);
- }
+ if (!err)
+ return true;
+ tipc_set_sk_state(sk, TIPC_DISCONNECTING);
+ tipc_node_remove_conn(net, pnode, tsk->portid);
+ sk->sk_state_change(sk);
return true;
default:
pr_err("Unknown sk_state %u\n", sk->sk_state);
}
-
- return false;
+ /* Abort connection setup attempt */
+ tipc_set_sk_state(sk, TIPC_DISCONNECTING);
+ sk->sk_err = ECONNREFUSED;
+ sk->sk_state_change(sk);
+ return true;
}
/**
@@ -2547,43 +2563,78 @@ static int tipc_shutdown(struct socket *sock, int how)
return res;
}
+static void tipc_sk_check_probing_state(struct sock *sk,
+ struct sk_buff_head *list)
+{
+ struct tipc_sock *tsk = tipc_sk(sk);
+ u32 pnode = tsk_peer_node(tsk);
+ u32 pport = tsk_peer_port(tsk);
+ u32 self = tsk_own_node(tsk);
+ u32 oport = tsk->portid;
+ struct sk_buff *skb;
+
+ if (tsk->probe_unacked) {
+ tipc_set_sk_state(sk, TIPC_DISCONNECTING);
+ sk->sk_err = ECONNABORTED;
+ tipc_node_remove_conn(sock_net(sk), pnode, pport);
+ sk->sk_state_change(sk);
+ return;
+ }
+ /* Prepare new probe */
+ skb = tipc_msg_create(CONN_MANAGER, CONN_PROBE, INT_H_SIZE, 0,
+ pnode, self, pport, oport, TIPC_OK);
+ if (skb)
+ __skb_queue_tail(list, skb);
+ tsk->probe_unacked = true;
+ sk_reset_timer(sk, &sk->sk_timer, jiffies + CONN_PROBING_INTV);
+}
+
+static void tipc_sk_retry_connect(struct sock *sk, struct sk_buff_head *list)
+{
+ struct tipc_sock *tsk = tipc_sk(sk);
+
+ /* Try again later if dest link is congested */
+ if (tsk->cong_link_cnt) {
+ sk_reset_timer(sk, &sk->sk_timer, msecs_to_jiffies(100));
+ return;
+ }
+ /* Prepare SYN for retransmit */
+ tipc_msg_skb_clone(&sk->sk_write_queue, list);
+}
+
static void tipc_sk_timeout(struct timer_list *t)
{
struct sock *sk = from_timer(sk, t, sk_timer);
struct tipc_sock *tsk = tipc_sk(sk);
- u32 peer_port = tsk_peer_port(tsk);
- u32 peer_node = tsk_peer_node(tsk);
- u32 own_node = tsk_own_node(tsk);
- u32 own_port = tsk->portid;
- struct net *net = sock_net(sk);
- struct sk_buff *skb = NULL;
+ u32 pnode = tsk_peer_node(tsk);
+ struct sk_buff_head list;
+ int rc = 0;
+ skb_queue_head_init(&list);
bh_lock_sock(sk);
- if (!tipc_sk_connected(sk))
- goto exit;
/* Try again later if socket is busy */
if (sock_owned_by_user(sk)) {
sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ / 20);
- goto exit;
+ bh_unlock_sock(sk);
+ return;
}
- if (tsk->probe_unacked) {
- tipc_set_sk_state(sk, TIPC_DISCONNECTING);
- tipc_node_remove_conn(net, peer_node, peer_port);
- sk->sk_state_change(sk);
- goto exit;
- }
- /* Send new probe */
- skb = tipc_msg_create(CONN_MANAGER, CONN_PROBE, INT_H_SIZE, 0,
- peer_node, own_node, peer_port, own_port,
- TIPC_OK);
- tsk->probe_unacked = true;
- sk_reset_timer(sk, &sk->sk_timer, jiffies + CONN_PROBING_INTV);
-exit:
+ if (sk->sk_state == TIPC_ESTABLISHED)
+ tipc_sk_check_probing_state(sk, &list);
+ else if (sk->sk_state == TIPC_CONNECTING)
+ tipc_sk_retry_connect(sk, &list);
+
bh_unlock_sock(sk);
- if (skb)
- tipc_node_xmit_skb(net, skb, peer_node, own_port);
+
+ if (!skb_queue_empty(&list))
+ rc = tipc_node_xmit(sock_net(sk), &list, pnode, tsk->portid);
+
+ /* SYN messages may cause link congestion */
+ if (rc == -ELINKCONG) {
+ tipc_dest_push(&tsk->cong_links, pnode, 0);
+ tsk->cong_link_cnt = 1;
+ }
sock_put(sk);
}
@@ -2675,6 +2726,8 @@ void tipc_sk_reinit(struct net *net)
rhashtable_walk_stop(&iter);
} while (tsk == ERR_PTR(-EAGAIN));
+
+ rhashtable_walk_exit(&iter);
}
static struct tipc_sock *tipc_sk_lookup(struct net *net, u32 portid)
@@ -3230,45 +3283,74 @@ int tipc_nl_sk_walk(struct sk_buff *skb, struct netlink_callback *cb,
struct netlink_callback *cb,
struct tipc_sock *tsk))
{
- struct net *net = sock_net(skb->sk);
- struct tipc_net *tn = tipc_net(net);
- const struct bucket_table *tbl;
- u32 prev_portid = cb->args[1];
- u32 tbl_id = cb->args[0];
- struct rhash_head *pos;
+ struct rhashtable_iter *iter = (void *)cb->args[4];
struct tipc_sock *tsk;
int err;
- rcu_read_lock();
- tbl = rht_dereference_rcu((&tn->sk_rht)->tbl, &tn->sk_rht);
- for (; tbl_id < tbl->size; tbl_id++) {
- rht_for_each_entry_rcu(tsk, pos, tbl, tbl_id, node) {
- spin_lock_bh(&tsk->sk.sk_lock.slock);
- if (prev_portid && prev_portid != tsk->portid) {
- spin_unlock_bh(&tsk->sk.sk_lock.slock);
+ rhashtable_walk_start(iter);
+ while ((tsk = rhashtable_walk_next(iter)) != NULL) {
+ if (IS_ERR(tsk)) {
+ err = PTR_ERR(tsk);
+ if (err == -EAGAIN) {
+ err = 0;
continue;
}
+ break;
+ }
- err = skb_handler(skb, cb, tsk);
- if (err) {
- prev_portid = tsk->portid;
- spin_unlock_bh(&tsk->sk.sk_lock.slock);
- goto out;
- }
-
- prev_portid = 0;
- spin_unlock_bh(&tsk->sk.sk_lock.slock);
+ sock_hold(&tsk->sk);
+ rhashtable_walk_stop(iter);
+ lock_sock(&tsk->sk);
+ err = skb_handler(skb, cb, tsk);
+ if (err) {
+ release_sock(&tsk->sk);
+ sock_put(&tsk->sk);
+ goto out;
}
+ release_sock(&tsk->sk);
+ rhashtable_walk_start(iter);
+ sock_put(&tsk->sk);
}
+ rhashtable_walk_stop(iter);
out:
- rcu_read_unlock();
- cb->args[0] = tbl_id;
- cb->args[1] = prev_portid;
-
return skb->len;
}
EXPORT_SYMBOL(tipc_nl_sk_walk);
+int tipc_dump_start(struct netlink_callback *cb)
+{
+ return __tipc_dump_start(cb, sock_net(cb->skb->sk));
+}
+EXPORT_SYMBOL(tipc_dump_start);
+
+int __tipc_dump_start(struct netlink_callback *cb, struct net *net)
+{
+ /* tipc_nl_name_table_dump() uses cb->args[0...3]. */
+ struct rhashtable_iter *iter = (void *)cb->args[4];
+ struct tipc_net *tn = tipc_net(net);
+
+ if (!iter) {
+ iter = kmalloc(sizeof(*iter), GFP_KERNEL);
+ if (!iter)
+ return -ENOMEM;
+
+ cb->args[4] = (long)iter;
+ }
+
+ rhashtable_walk_enter(&tn->sk_rht, iter);
+ return 0;
+}
+
+int tipc_dump_done(struct netlink_callback *cb)
+{
+ struct rhashtable_iter *hti = (void *)cb->args[4];
+
+ rhashtable_walk_exit(hti);
+ kfree(hti);
+ return 0;
+}
+EXPORT_SYMBOL(tipc_dump_done);
+
int tipc_sk_fill_sock_diag(struct sk_buff *skb, struct netlink_callback *cb,
struct tipc_sock *tsk, u32 sk_filter_state,
u64 (*tipc_diag_gen_cookie)(struct sock *sk))
@@ -3320,6 +3402,11 @@ int tipc_sk_fill_sock_diag(struct sk_buff *skb, struct netlink_callback *cb,
goto stat_msg_cancel;
nla_nest_end(skb, stat);
+
+ if (tsk->group)
+ if (tipc_group_fill_sock_diag(tsk->group, skb))
+ goto stat_msg_cancel;
+
nla_nest_end(skb, attrs);
return 0;
diff --git a/net/tipc/socket.h b/net/tipc/socket.h
index aff9b2ae5a1f..5e575f205afe 100644
--- a/net/tipc/socket.h
+++ b/net/tipc/socket.h
@@ -68,4 +68,7 @@ int tipc_nl_sk_walk(struct sk_buff *skb, struct netlink_callback *cb,
int (*skb_handler)(struct sk_buff *skb,
struct netlink_callback *cb,
struct tipc_sock *tsk));
+int tipc_dump_start(struct netlink_callback *cb);
+int __tipc_dump_start(struct netlink_callback *cb, struct net *net);
+int tipc_dump_done(struct netlink_callback *cb);
#endif
diff --git a/net/tipc/topsrv.c b/net/tipc/topsrv.c
index c8e34ef22c30..4bdea0057171 100644
--- a/net/tipc/topsrv.c
+++ b/net/tipc/topsrv.c
@@ -57,16 +57,12 @@
* @idr_lock: protect the connection identifier set
* @idr_in_use: amount of allocated identifier entry
* @net: network namspace instance
- * @rcvbuf_cache: memory cache of server receive buffer
+ * @awork: accept work item
* @rcv_wq: receive workqueue
* @send_wq: send workqueue
* @max_rcvbuf_size: maximum permitted receive message length
- * @tipc_conn_new: callback will be called when new connection is incoming
- * @tipc_conn_release: callback will be called before releasing the connection
- * @tipc_conn_recvmsg: callback will be called when message arrives
+ * @listener: topsrv listener socket
* @name: server name
- * @imp: message importance
- * @type: socket type
*/
struct tipc_topsrv {
struct idr conn_idr;
@@ -90,9 +86,7 @@ struct tipc_topsrv {
* @server: pointer to connected server
* @sub_list: lsit to all pertaing subscriptions
* @sub_lock: lock protecting the subscription list
- * @outqueue_lock: control access to the outqueue
* @rwork: receive work item
- * @rx_action: what to do when connection socket is active
* @outqueue: pointer to first outbound message in queue
* @outqueue_lock: control access to the outqueue
* @swork: send work item
@@ -313,8 +307,8 @@ static void tipc_conn_send_work(struct work_struct *work)
conn_put(con);
}
-/* tipc_conn_queue_evt() - interrupt level call from a subscription instance
- * The queued work is launched into tipc_send_work()->tipc_send_to_sock()
+/* tipc_topsrv_queue_evt() - interrupt level call from a subscription instance
+ * The queued work is launched into tipc_conn_send_work()->tipc_conn_send_to_sock()
*/
void tipc_topsrv_queue_evt(struct net *net, int conid,
u32 event, struct tipc_event *evt)
@@ -657,7 +651,7 @@ int tipc_topsrv_start(struct net *net)
srv->max_rcvbuf_size = sizeof(struct tipc_subscr);
INIT_WORK(&srv->awork, tipc_topsrv_accept);
- strncpy(srv->name, name, strlen(name) + 1);
+ strscpy(srv->name, name, sizeof(srv->name));
tn->topsrv = srv;
atomic_set(&tn->subscription_count, 0);
diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c
index 9783101bc4a9..10dc59ce9c82 100644
--- a/net/tipc/udp_media.c
+++ b/net/tipc/udp_media.c
@@ -650,6 +650,7 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b,
struct udp_tunnel_sock_cfg tuncfg = {NULL};
struct nlattr *opts[TIPC_NLA_UDP_MAX + 1];
u8 node_id[NODE_ID_LEN] = {0,};
+ int rmcast = 0;
ub = kzalloc(sizeof(*ub), GFP_ATOMIC);
if (!ub)
@@ -680,6 +681,9 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b,
if (err)
goto err;
+ /* Checking remote ip address */
+ rmcast = tipc_udp_is_mcast_addr(&remote);
+
/* Autoconfigure own node identity if needed */
if (!tipc_own_id(net)) {
memcpy(node_id, local.ipv6.in6_u.u6_addr8, 16);
@@ -705,7 +709,12 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b,
goto err;
}
udp_conf.family = AF_INET;
- udp_conf.local_ip.s_addr = htonl(INADDR_ANY);
+
+ /* Switch to use ANY to receive packets from group */
+ if (rmcast)
+ udp_conf.local_ip.s_addr = htonl(INADDR_ANY);
+ else
+ udp_conf.local_ip.s_addr = local.ipv4.s_addr;
udp_conf.use_udp_checksums = false;
ub->ifindex = dev->ifindex;
if (tipc_mtu_bad(dev, sizeof(struct iphdr) +
@@ -719,7 +728,10 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b,
udp_conf.family = AF_INET6;
udp_conf.use_udp6_tx_checksums = true;
udp_conf.use_udp6_rx_checksums = true;
- udp_conf.local_ip6 = in6addr_any;
+ if (rmcast)
+ udp_conf.local_ip6 = in6addr_any;
+ else
+ udp_conf.local_ip6 = local.ipv6;
b->mtu = 1280;
#endif
} else {
@@ -741,7 +753,7 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b,
* is used if it's a multicast address.
*/
memcpy(&b->bcast_addr.value, &remote, sizeof(remote));
- if (tipc_udp_is_mcast_addr(&remote))
+ if (rmcast)
err = enable_mcast(ub, &remote);
else
err = tipc_udp_rcast_add(b, &remote);
diff --git a/net/tls/Kconfig b/net/tls/Kconfig
index 73f05ece53d0..99c1a19c17b1 100644
--- a/net/tls/Kconfig
+++ b/net/tls/Kconfig
@@ -8,6 +8,7 @@ config TLS
select CRYPTO_AES
select CRYPTO_GCM
select STREAM_PARSER
+ select NET_SOCK_MSG
default n
---help---
Enable kernel support for TLS protocol. This allows symmetric
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index a7a8f8e20ff3..276edbc04f38 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -52,9 +52,12 @@ static DEFINE_SPINLOCK(tls_device_lock);
static void tls_device_free_ctx(struct tls_context *ctx)
{
- struct tls_offload_context *offload_ctx = tls_offload_ctx(ctx);
+ if (ctx->tx_conf == TLS_HW)
+ kfree(tls_offload_ctx_tx(ctx));
+
+ if (ctx->rx_conf == TLS_HW)
+ kfree(tls_offload_ctx_rx(ctx));
- kfree(offload_ctx);
kfree(ctx);
}
@@ -71,10 +74,11 @@ static void tls_device_gc_task(struct work_struct *work)
list_for_each_entry_safe(ctx, tmp, &gc_list, list) {
struct net_device *netdev = ctx->netdev;
- if (netdev) {
+ if (netdev && ctx->tx_conf == TLS_HW) {
netdev->tlsdev_ops->tls_dev_del(netdev, ctx,
TLS_OFFLOAD_CTX_DIR_TX);
dev_put(netdev);
+ ctx->netdev = NULL;
}
list_del(&ctx->list);
@@ -82,6 +86,22 @@ static void tls_device_gc_task(struct work_struct *work)
}
}
+static void tls_device_attach(struct tls_context *ctx, struct sock *sk,
+ struct net_device *netdev)
+{
+ if (sk->sk_destruct != tls_device_sk_destruct) {
+ refcount_set(&ctx->refcount, 1);
+ dev_hold(netdev);
+ ctx->netdev = netdev;
+ spin_lock_irq(&tls_device_lock);
+ list_add_tail(&ctx->list, &tls_device_list);
+ spin_unlock_irq(&tls_device_lock);
+
+ ctx->sk_destruct = sk->sk_destruct;
+ sk->sk_destruct = tls_device_sk_destruct;
+ }
+}
+
static void tls_device_queue_ctx_destruction(struct tls_context *ctx)
{
unsigned long flags;
@@ -125,7 +145,7 @@ static void destroy_record(struct tls_record_info *record)
kfree(record);
}
-static void delete_all_records(struct tls_offload_context *offload_ctx)
+static void delete_all_records(struct tls_offload_context_tx *offload_ctx)
{
struct tls_record_info *info, *temp;
@@ -141,14 +161,14 @@ static void tls_icsk_clean_acked(struct sock *sk, u32 acked_seq)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_record_info *info, *temp;
- struct tls_offload_context *ctx;
+ struct tls_offload_context_tx *ctx;
u64 deleted_records = 0;
unsigned long flags;
if (!tls_ctx)
return;
- ctx = tls_offload_ctx(tls_ctx);
+ ctx = tls_offload_ctx_tx(tls_ctx);
spin_lock_irqsave(&ctx->lock, flags);
info = ctx->retransmit_hint;
@@ -179,15 +199,17 @@ static void tls_icsk_clean_acked(struct sock *sk, u32 acked_seq)
void tls_device_sk_destruct(struct sock *sk)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
- struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx);
+ struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx);
- if (ctx->open_record)
- destroy_record(ctx->open_record);
+ tls_ctx->sk_destruct(sk);
- delete_all_records(ctx);
- crypto_free_aead(ctx->aead_send);
- ctx->sk_destruct(sk);
- clean_acked_data_disable(inet_csk(sk));
+ if (tls_ctx->tx_conf == TLS_HW) {
+ if (ctx->open_record)
+ destroy_record(ctx->open_record);
+ delete_all_records(ctx);
+ crypto_free_aead(ctx->aead_send);
+ clean_acked_data_disable(inet_csk(sk));
+ }
if (refcount_dec_and_test(&tls_ctx->refcount))
tls_device_queue_ctx_destruction(tls_ctx);
@@ -219,7 +241,7 @@ static void tls_append_frag(struct tls_record_info *record,
static int tls_push_record(struct sock *sk,
struct tls_context *ctx,
- struct tls_offload_context *offload_ctx,
+ struct tls_offload_context_tx *offload_ctx,
struct tls_record_info *record,
struct page_frag *pfrag,
int flags,
@@ -264,7 +286,7 @@ static int tls_push_record(struct sock *sk,
return tls_push_sg(sk, ctx, offload_ctx->sg_tx_data, 0, flags);
}
-static int tls_create_new_record(struct tls_offload_context *offload_ctx,
+static int tls_create_new_record(struct tls_offload_context_tx *offload_ctx,
struct page_frag *pfrag,
size_t prepend_size)
{
@@ -290,7 +312,7 @@ static int tls_create_new_record(struct tls_offload_context *offload_ctx,
}
static int tls_do_allocation(struct sock *sk,
- struct tls_offload_context *offload_ctx,
+ struct tls_offload_context_tx *offload_ctx,
struct page_frag *pfrag,
size_t prepend_size)
{
@@ -324,7 +346,7 @@ static int tls_push_data(struct sock *sk,
unsigned char record_type)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
- struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx);
+ struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx);
int tls_push_record_flags = flags | MSG_SENDPAGE_NOTLAST;
int more = flags & (MSG_SENDPAGE_NOTLAST | MSG_MORE);
struct tls_record_info *record = ctx->open_record;
@@ -399,7 +421,7 @@ last_record:
tls_push_record_flags = flags;
if (more) {
tls_ctx->pending_open_record_frags =
- record->num_frags;
+ !!record->num_frags;
break;
}
@@ -477,7 +499,7 @@ out:
return rc;
}
-struct tls_record_info *tls_get_record(struct tls_offload_context *context,
+struct tls_record_info *tls_get_record(struct tls_offload_context_tx *context,
u32 seq, u64 *p_record_sn)
{
u64 record_sn = context->hint_record_sn;
@@ -520,11 +542,123 @@ static int tls_device_push_pending_record(struct sock *sk, int flags)
return tls_push_data(sk, &msg_iter, 0, flags, TLS_RECORD_TYPE_DATA);
}
+void handle_device_resync(struct sock *sk, u32 seq, u64 rcd_sn)
+{
+ struct tls_context *tls_ctx = tls_get_ctx(sk);
+ struct net_device *netdev = tls_ctx->netdev;
+ struct tls_offload_context_rx *rx_ctx;
+ u32 is_req_pending;
+ s64 resync_req;
+ u32 req_seq;
+
+ if (tls_ctx->rx_conf != TLS_HW)
+ return;
+
+ rx_ctx = tls_offload_ctx_rx(tls_ctx);
+ resync_req = atomic64_read(&rx_ctx->resync_req);
+ req_seq = ntohl(resync_req >> 32) - ((u32)TLS_HEADER_SIZE - 1);
+ is_req_pending = resync_req;
+
+ if (unlikely(is_req_pending) && req_seq == seq &&
+ atomic64_try_cmpxchg(&rx_ctx->resync_req, &resync_req, 0))
+ netdev->tlsdev_ops->tls_dev_resync_rx(netdev, sk,
+ seq + TLS_HEADER_SIZE - 1,
+ rcd_sn);
+}
+
+static int tls_device_reencrypt(struct sock *sk, struct sk_buff *skb)
+{
+ struct strp_msg *rxm = strp_msg(skb);
+ int err = 0, offset = rxm->offset, copy, nsg;
+ struct sk_buff *skb_iter, *unused;
+ struct scatterlist sg[1];
+ char *orig_buf, *buf;
+
+ orig_buf = kmalloc(rxm->full_len + TLS_HEADER_SIZE +
+ TLS_CIPHER_AES_GCM_128_IV_SIZE, sk->sk_allocation);
+ if (!orig_buf)
+ return -ENOMEM;
+ buf = orig_buf;
+
+ nsg = skb_cow_data(skb, 0, &unused);
+ if (unlikely(nsg < 0)) {
+ err = nsg;
+ goto free_buf;
+ }
+
+ sg_init_table(sg, 1);
+ sg_set_buf(&sg[0], buf,
+ rxm->full_len + TLS_HEADER_SIZE +
+ TLS_CIPHER_AES_GCM_128_IV_SIZE);
+ skb_copy_bits(skb, offset, buf,
+ TLS_HEADER_SIZE + TLS_CIPHER_AES_GCM_128_IV_SIZE);
+
+ /* We are interested only in the decrypted data not the auth */
+ err = decrypt_skb(sk, skb, sg);
+ if (err != -EBADMSG)
+ goto free_buf;
+ else
+ err = 0;
+
+ copy = min_t(int, skb_pagelen(skb) - offset,
+ rxm->full_len - TLS_CIPHER_AES_GCM_128_TAG_SIZE);
+
+ if (skb->decrypted)
+ skb_store_bits(skb, offset, buf, copy);
+
+ offset += copy;
+ buf += copy;
+
+ skb_walk_frags(skb, skb_iter) {
+ copy = min_t(int, skb_iter->len,
+ rxm->full_len - offset + rxm->offset -
+ TLS_CIPHER_AES_GCM_128_TAG_SIZE);
+
+ if (skb_iter->decrypted)
+ skb_store_bits(skb_iter, offset, buf, copy);
+
+ offset += copy;
+ buf += copy;
+ }
+
+free_buf:
+ kfree(orig_buf);
+ return err;
+}
+
+int tls_device_decrypted(struct sock *sk, struct sk_buff *skb)
+{
+ struct tls_context *tls_ctx = tls_get_ctx(sk);
+ struct tls_offload_context_rx *ctx = tls_offload_ctx_rx(tls_ctx);
+ int is_decrypted = skb->decrypted;
+ int is_encrypted = !is_decrypted;
+ struct sk_buff *skb_iter;
+
+ /* Skip if it is already decrypted */
+ if (ctx->sw.decrypted)
+ return 0;
+
+ /* Check if all the data is decrypted already */
+ skb_walk_frags(skb, skb_iter) {
+ is_decrypted &= skb_iter->decrypted;
+ is_encrypted &= !skb_iter->decrypted;
+ }
+
+ ctx->sw.decrypted |= is_decrypted;
+
+ /* Return immedeatly if the record is either entirely plaintext or
+ * entirely ciphertext. Otherwise handle reencrypt partially decrypted
+ * record.
+ */
+ return (is_encrypted || is_decrypted) ? 0 :
+ tls_device_reencrypt(sk, skb);
+}
+
int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
{
u16 nonce_size, tag_size, iv_size, rec_seq_size;
struct tls_record_info *start_marker_record;
- struct tls_offload_context *offload_ctx;
+ struct tls_offload_context_tx *offload_ctx;
struct tls_crypto_info *crypto_info;
struct net_device *netdev;
char *iv, *rec_seq;
@@ -546,13 +680,13 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
goto out;
}
- offload_ctx = kzalloc(TLS_OFFLOAD_CONTEXT_SIZE, GFP_KERNEL);
+ offload_ctx = kzalloc(TLS_OFFLOAD_CONTEXT_SIZE_TX, GFP_KERNEL);
if (!offload_ctx) {
rc = -ENOMEM;
goto free_marker_record;
}
- crypto_info = &ctx->crypto_send;
+ crypto_info = &ctx->crypto_send.info;
switch (crypto_info->cipher_type) {
case TLS_CIPHER_AES_GCM_128:
nonce_size = TLS_CIPHER_AES_GCM_128_IV_SIZE;
@@ -582,12 +716,11 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
memcpy(ctx->tx.iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv, iv_size);
ctx->tx.rec_seq_size = rec_seq_size;
- ctx->tx.rec_seq = kmalloc(rec_seq_size, GFP_KERNEL);
+ ctx->tx.rec_seq = kmemdup(rec_seq, rec_seq_size, GFP_KERNEL);
if (!ctx->tx.rec_seq) {
rc = -ENOMEM;
goto free_iv;
}
- memcpy(ctx->tx.rec_seq, rec_seq, rec_seq_size);
rc = tls_sw_fallback_init(sk, offload_ctx, crypto_info);
if (rc)
@@ -609,7 +742,6 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
clean_acked_data_enable(inet_csk(sk), &tls_icsk_clean_acked);
ctx->push_pending_record = tls_device_push_pending_record;
- offload_ctx->sk_destruct = sk->sk_destruct;
/* TLS offload is greatly simplified if we don't send
* SKBs where only part of the payload needs to be encrypted.
@@ -619,8 +751,6 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
if (skb)
TCP_SKB_CB(skb)->eor = 1;
- refcount_set(&ctx->refcount, 1);
-
/* We support starting offload on multiple sockets
* concurrently, so we only need a read lock here.
* This lock must precede get_netdev_for_sock to prevent races between
@@ -650,24 +780,19 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
ctx->priv_ctx_tx = offload_ctx;
rc = netdev->tlsdev_ops->tls_dev_add(netdev, sk, TLS_OFFLOAD_CTX_DIR_TX,
- &ctx->crypto_send,
+ &ctx->crypto_send.info,
tcp_sk(sk)->write_seq);
if (rc)
goto release_netdev;
- ctx->netdev = netdev;
-
- spin_lock_irq(&tls_device_lock);
- list_add_tail(&ctx->list, &tls_device_list);
- spin_unlock_irq(&tls_device_lock);
+ tls_device_attach(ctx, sk, netdev);
- sk->sk_validate_xmit_skb = tls_validate_xmit_skb;
/* following this assignment tls_is_sk_tx_device_offloaded
* will return true and the context might be accessed
* by the netdev's xmit function.
*/
- smp_store_release(&sk->sk_destruct,
- &tls_device_sk_destruct);
+ smp_store_release(&sk->sk_validate_xmit_skb, tls_validate_xmit_skb);
+ dev_put(netdev);
up_read(&device_offload_lock);
goto out;
@@ -690,6 +815,105 @@ out:
return rc;
}
+int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx)
+{
+ struct tls_offload_context_rx *context;
+ struct net_device *netdev;
+ int rc = 0;
+
+ /* We support starting offload on multiple sockets
+ * concurrently, so we only need a read lock here.
+ * This lock must precede get_netdev_for_sock to prevent races between
+ * NETDEV_DOWN and setsockopt.
+ */
+ down_read(&device_offload_lock);
+ netdev = get_netdev_for_sock(sk);
+ if (!netdev) {
+ pr_err_ratelimited("%s: netdev not found\n", __func__);
+ rc = -EINVAL;
+ goto release_lock;
+ }
+
+ if (!(netdev->features & NETIF_F_HW_TLS_RX)) {
+ pr_err_ratelimited("%s: netdev %s with no TLS offload\n",
+ __func__, netdev->name);
+ rc = -ENOTSUPP;
+ goto release_netdev;
+ }
+
+ /* Avoid offloading if the device is down
+ * We don't want to offload new flows after
+ * the NETDEV_DOWN event
+ */
+ if (!(netdev->flags & IFF_UP)) {
+ rc = -EINVAL;
+ goto release_netdev;
+ }
+
+ context = kzalloc(TLS_OFFLOAD_CONTEXT_SIZE_RX, GFP_KERNEL);
+ if (!context) {
+ rc = -ENOMEM;
+ goto release_netdev;
+ }
+
+ ctx->priv_ctx_rx = context;
+ rc = tls_set_sw_offload(sk, ctx, 0);
+ if (rc)
+ goto release_ctx;
+
+ rc = netdev->tlsdev_ops->tls_dev_add(netdev, sk, TLS_OFFLOAD_CTX_DIR_RX,
+ &ctx->crypto_recv.info,
+ tcp_sk(sk)->copied_seq);
+ if (rc) {
+ pr_err_ratelimited("%s: The netdev has refused to offload this socket\n",
+ __func__);
+ goto free_sw_resources;
+ }
+
+ tls_device_attach(ctx, sk, netdev);
+ goto release_netdev;
+
+free_sw_resources:
+ tls_sw_free_resources_rx(sk);
+release_ctx:
+ ctx->priv_ctx_rx = NULL;
+release_netdev:
+ dev_put(netdev);
+release_lock:
+ up_read(&device_offload_lock);
+ return rc;
+}
+
+void tls_device_offload_cleanup_rx(struct sock *sk)
+{
+ struct tls_context *tls_ctx = tls_get_ctx(sk);
+ struct net_device *netdev;
+
+ down_read(&device_offload_lock);
+ netdev = tls_ctx->netdev;
+ if (!netdev)
+ goto out;
+
+ if (!(netdev->features & NETIF_F_HW_TLS_RX)) {
+ pr_err_ratelimited("%s: device is missing NETIF_F_HW_TLS_RX cap\n",
+ __func__);
+ goto out;
+ }
+
+ netdev->tlsdev_ops->tls_dev_del(netdev, tls_ctx,
+ TLS_OFFLOAD_CTX_DIR_RX);
+
+ if (tls_ctx->tx_conf != TLS_HW) {
+ dev_put(netdev);
+ tls_ctx->netdev = NULL;
+ }
+out:
+ up_read(&device_offload_lock);
+ kfree(tls_ctx->rx.rec_seq);
+ kfree(tls_ctx->rx.iv);
+ tls_sw_release_resources_rx(sk);
+}
+
static int tls_device_down(struct net_device *netdev)
{
struct tls_context *ctx, *tmp;
@@ -710,8 +934,12 @@ static int tls_device_down(struct net_device *netdev)
spin_unlock_irqrestore(&tls_device_lock, flags);
list_for_each_entry_safe(ctx, tmp, &list, list) {
- netdev->tlsdev_ops->tls_dev_del(netdev, ctx,
- TLS_OFFLOAD_CTX_DIR_TX);
+ if (ctx->tx_conf == TLS_HW)
+ netdev->tlsdev_ops->tls_dev_del(netdev, ctx,
+ TLS_OFFLOAD_CTX_DIR_TX);
+ if (ctx->rx_conf == TLS_HW)
+ netdev->tlsdev_ops->tls_dev_del(netdev, ctx,
+ TLS_OFFLOAD_CTX_DIR_RX);
ctx->netdev = NULL;
dev_put(netdev);
list_del_init(&ctx->list);
@@ -732,12 +960,16 @@ static int tls_dev_event(struct notifier_block *this, unsigned long event,
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
- if (!(dev->features & NETIF_F_HW_TLS_TX))
+ if (!(dev->features & (NETIF_F_HW_TLS_RX | NETIF_F_HW_TLS_TX)))
return NOTIFY_DONE;
switch (event) {
case NETDEV_REGISTER:
case NETDEV_FEAT_CHANGE:
+ if ((dev->features & NETIF_F_HW_TLS_RX) &&
+ !dev->tlsdev_ops->tls_dev_resync_rx)
+ return NOTIFY_BAD;
+
if (dev->tlsdev_ops &&
dev->tlsdev_ops->tls_dev_add &&
dev->tlsdev_ops->tls_dev_del)
diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c
index 748914abdb60..450a6dbc5a88 100644
--- a/net/tls/tls_device_fallback.c
+++ b/net/tls/tls_device_fallback.c
@@ -42,7 +42,7 @@ static void chain_to_walk(struct scatterlist *sg, struct scatter_walk *walk)
sg_set_page(sg, sg_page(src),
src->length - diff, walk->offset);
- scatterwalk_crypto_chain(sg, sg_next(src), 0, 2);
+ scatterwalk_crypto_chain(sg, sg_next(src), 2);
}
static int tls_enc_record(struct aead_request *aead_req,
@@ -214,7 +214,7 @@ static void complete_skb(struct sk_buff *nskb, struct sk_buff *skb, int headln)
static int fill_sg_in(struct scatterlist *sg_in,
struct sk_buff *skb,
- struct tls_offload_context *ctx,
+ struct tls_offload_context_tx *ctx,
u64 *rcd_sn,
s32 *sync_size,
int *resync_sgs)
@@ -299,7 +299,7 @@ static struct sk_buff *tls_enc_skb(struct tls_context *tls_ctx,
s32 sync_size, u64 rcd_sn)
{
int tcp_payload_offset = skb_transport_offset(skb) + tcp_hdrlen(skb);
- struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx);
+ struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx);
int payload_len = skb->len - tcp_payload_offset;
void *buf, *iv, *aad, *dummy_buf;
struct aead_request *aead_req;
@@ -320,7 +320,7 @@ static struct sk_buff *tls_enc_skb(struct tls_context *tls_ctx,
goto free_req;
iv = buf;
- memcpy(iv, tls_ctx->crypto_send_aes_gcm_128.salt,
+ memcpy(iv, tls_ctx->crypto_send.aes_gcm_128.salt,
TLS_CIPHER_AES_GCM_128_SALT_SIZE);
aad = buf + TLS_CIPHER_AES_GCM_128_SALT_SIZE +
TLS_CIPHER_AES_GCM_128_IV_SIZE;
@@ -361,7 +361,7 @@ static struct sk_buff *tls_sw_fallback(struct sock *sk, struct sk_buff *skb)
{
int tcp_payload_offset = skb_transport_offset(skb) + tcp_hdrlen(skb);
struct tls_context *tls_ctx = tls_get_ctx(sk);
- struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx);
+ struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx);
int payload_len = skb->len - tcp_payload_offset;
struct scatterlist *sg_in, sg_out[3];
struct sk_buff *nskb = NULL;
@@ -413,9 +413,10 @@ struct sk_buff *tls_validate_xmit_skb(struct sock *sk,
return tls_sw_fallback(sk, skb);
}
+EXPORT_SYMBOL_GPL(tls_validate_xmit_skb);
int tls_sw_fallback_init(struct sock *sk,
- struct tls_offload_context *offload_ctx,
+ struct tls_offload_context_tx *offload_ctx,
struct tls_crypto_info *crypto_info)
{
const u8 *key;
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index 301f22430469..311cec8e533d 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -45,21 +45,13 @@
MODULE_AUTHOR("Mellanox Technologies");
MODULE_DESCRIPTION("Transport Layer Security Support");
MODULE_LICENSE("Dual BSD/GPL");
+MODULE_ALIAS_TCP_ULP("tls");
enum {
TLSV4,
TLSV6,
TLS_NUM_PROTS,
};
-enum {
- TLS_BASE,
- TLS_SW,
-#ifdef CONFIG_TLS_DEVICE
- TLS_HW,
-#endif
- TLS_HW_RECORD,
- TLS_NUM_CONFIG,
-};
static struct proto *saved_tcpv6_prot;
static DEFINE_MUTEX(tcpv6_prot_mutex);
@@ -149,7 +141,6 @@ retry:
size = sg->length;
}
- clear_bit(TLS_PENDING_CLOSED_RECORD, &ctx->flags);
ctx->in_tcp_sendpages = false;
ctx->sk_write_space(sk);
@@ -201,15 +192,12 @@ int tls_proccess_cmsg(struct sock *sk, struct msghdr *msg,
return rc;
}
-int tls_push_pending_closed_record(struct sock *sk, struct tls_context *ctx,
- int flags, long *timeo)
+int tls_push_partial_record(struct sock *sk, struct tls_context *ctx,
+ int flags)
{
struct scatterlist *sg;
u16 offset;
- if (!tls_is_partially_sent_record(ctx))
- return ctx->push_pending_record(sk, flags);
-
sg = ctx->partially_sent_record;
offset = ctx->partially_sent_offset;
@@ -217,33 +205,53 @@ int tls_push_pending_closed_record(struct sock *sk, struct tls_context *ctx,
return tls_push_sg(sk, ctx, sg, offset, flags);
}
+int tls_push_pending_closed_record(struct sock *sk,
+ struct tls_context *tls_ctx,
+ int flags, long *timeo)
+{
+ struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
+
+ if (tls_is_partially_sent_record(tls_ctx) ||
+ !list_empty(&ctx->tx_list))
+ return tls_tx_records(sk, flags);
+ else
+ return tls_ctx->push_pending_record(sk, flags);
+}
+
static void tls_write_space(struct sock *sk)
{
struct tls_context *ctx = tls_get_ctx(sk);
+ struct tls_sw_context_tx *tx_ctx = tls_sw_ctx_tx(ctx);
- /* We are already sending pages, ignore notification */
- if (ctx->in_tcp_sendpages)
+ /* If in_tcp_sendpages call lower protocol write space handler
+ * to ensure we wake up any waiting operations there. For example
+ * if do_tcp_sendpages where to call sk_wait_event.
+ */
+ if (ctx->in_tcp_sendpages) {
+ ctx->sk_write_space(sk);
return;
+ }
- if (!sk->sk_write_pending && tls_is_pending_closed_record(ctx)) {
- gfp_t sk_allocation = sk->sk_allocation;
- int rc;
- long timeo = 0;
-
- sk->sk_allocation = GFP_ATOMIC;
- rc = tls_push_pending_closed_record(sk, ctx,
- MSG_DONTWAIT |
- MSG_NOSIGNAL,
- &timeo);
- sk->sk_allocation = sk_allocation;
-
- if (rc < 0)
- return;
+ /* Schedule the transmission if tx list is ready */
+ if (is_tx_ready(tx_ctx) && !sk->sk_write_pending) {
+ /* Schedule the transmission */
+ if (!test_and_set_bit(BIT_TX_SCHEDULED, &tx_ctx->tx_bitmask))
+ schedule_delayed_work(&tx_ctx->tx_work.work, 0);
}
ctx->sk_write_space(sk);
}
+static void tls_ctx_free(struct tls_context *ctx)
+{
+ if (!ctx)
+ return;
+
+ memzero_explicit(&ctx->crypto_send, sizeof(ctx->crypto_send));
+ memzero_explicit(&ctx->crypto_recv, sizeof(ctx->crypto_recv));
+ kfree(ctx);
+}
+
static void tls_sk_proto_close(struct sock *sk, long timeout)
{
struct tls_context *ctx = tls_get_ctx(sk);
@@ -263,19 +271,6 @@ static void tls_sk_proto_close(struct sock *sk, long timeout)
if (!tls_complete_pending_work(sk, ctx, 0, &timeo))
tls_handle_open_record(sk, 0);
- if (ctx->partially_sent_record) {
- struct scatterlist *sg = ctx->partially_sent_record;
-
- while (1) {
- put_page(sg_page(sg));
- sk_mem_uncharge(sk, sg->length);
-
- if (sg_is_last(sg))
- break;
- sg++;
- }
- }
-
/* We need these for tls_sw_fallback handling of other packets */
if (ctx->tx_conf == TLS_SW) {
kfree(ctx->tx.rec_seq);
@@ -290,11 +285,14 @@ static void tls_sk_proto_close(struct sock *sk, long timeout)
}
#ifdef CONFIG_TLS_DEVICE
- if (ctx->tx_conf != TLS_HW) {
+ if (ctx->rx_conf == TLS_HW)
+ tls_device_offload_cleanup_rx(sk);
+
+ if (ctx->tx_conf != TLS_HW && ctx->rx_conf != TLS_HW) {
#else
{
#endif
- kfree(ctx);
+ tls_ctx_free(ctx);
ctx = NULL;
}
@@ -305,7 +303,7 @@ skip_tx_cleanup:
* for sk->sk_prot->unhash [tls_hw_unhash]
*/
if (free_ctx)
- kfree(ctx);
+ tls_ctx_free(ctx);
}
static int do_tls_getsockopt_tx(struct sock *sk, char __user *optval,
@@ -330,7 +328,7 @@ static int do_tls_getsockopt_tx(struct sock *sk, char __user *optval,
}
/* get user crypto info */
- crypto_info = &ctx->crypto_send;
+ crypto_info = &ctx->crypto_send.info;
if (!TLS_CRYPTO_INFO_READY(crypto_info)) {
rc = -EBUSY;
@@ -417,9 +415,9 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval,
}
if (tx)
- crypto_info = &ctx->crypto_send;
+ crypto_info = &ctx->crypto_send.info;
else
- crypto_info = &ctx->crypto_recv;
+ crypto_info = &ctx->crypto_recv.info;
/* Currently we don't support set crypto info more than one time */
if (TLS_CRYPTO_INFO_READY(crypto_info)) {
@@ -470,8 +468,16 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval,
conf = TLS_SW;
}
} else {
- rc = tls_set_sw_offload(sk, ctx, 0);
- conf = TLS_SW;
+#ifdef CONFIG_TLS_DEVICE
+ rc = tls_set_device_offload_rx(sk, ctx);
+ conf = TLS_HW;
+ if (rc) {
+#else
+ {
+#endif
+ rc = tls_set_sw_offload(sk, ctx, 0);
+ conf = TLS_SW;
+ }
}
if (rc)
@@ -491,7 +497,7 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval,
goto out;
err_crypto_info:
- memset(crypto_info, 0, sizeof(*crypto_info));
+ memzero_explicit(crypto_info, sizeof(union tls_crypto_context));
out:
return rc;
}
@@ -614,12 +620,14 @@ static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG],
prot[TLS_SW][TLS_BASE].sendpage = tls_sw_sendpage;
prot[TLS_BASE][TLS_SW] = prot[TLS_BASE][TLS_BASE];
- prot[TLS_BASE][TLS_SW].recvmsg = tls_sw_recvmsg;
- prot[TLS_BASE][TLS_SW].close = tls_sk_proto_close;
+ prot[TLS_BASE][TLS_SW].recvmsg = tls_sw_recvmsg;
+ prot[TLS_BASE][TLS_SW].stream_memory_read = tls_sw_stream_read;
+ prot[TLS_BASE][TLS_SW].close = tls_sk_proto_close;
prot[TLS_SW][TLS_SW] = prot[TLS_SW][TLS_BASE];
- prot[TLS_SW][TLS_SW].recvmsg = tls_sw_recvmsg;
- prot[TLS_SW][TLS_SW].close = tls_sk_proto_close;
+ prot[TLS_SW][TLS_SW].recvmsg = tls_sw_recvmsg;
+ prot[TLS_SW][TLS_SW].stream_memory_read = tls_sw_stream_read;
+ prot[TLS_SW][TLS_SW].close = tls_sk_proto_close;
#ifdef CONFIG_TLS_DEVICE
prot[TLS_HW][TLS_BASE] = prot[TLS_BASE][TLS_BASE];
@@ -629,6 +637,12 @@ static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG],
prot[TLS_HW][TLS_SW] = prot[TLS_BASE][TLS_SW];
prot[TLS_HW][TLS_SW].sendmsg = tls_device_sendmsg;
prot[TLS_HW][TLS_SW].sendpage = tls_device_sendpage;
+
+ prot[TLS_BASE][TLS_HW] = prot[TLS_BASE][TLS_SW];
+
+ prot[TLS_SW][TLS_HW] = prot[TLS_SW][TLS_SW];
+
+ prot[TLS_HW][TLS_HW] = prot[TLS_HW][TLS_SW];
#endif
prot[TLS_HW_RECORD][TLS_HW_RECORD] = *base;
@@ -701,8 +715,6 @@ EXPORT_SYMBOL(tls_unregister_device);
static struct tcp_ulp_ops tcp_tls_ulp_ops __read_mostly = {
.name = "tls",
- .uid = TCP_ULP_TLS,
- .user_visible = true,
.owner = THIS_MODULE,
.init = tls_init,
};
@@ -712,7 +724,6 @@ static int __init tls_register(void)
build_protos(tls_prots[TLSV4], &tcp_prot);
tls_sw_proto_ops = inet_stream_ops;
- tls_sw_proto_ops.poll = tls_sw_poll;
tls_sw_proto_ops.splice_read = tls_sw_splice_read;
#ifdef CONFIG_TLS_DEVICE
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 1f3d9789af30..5cd88ba8acd1 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -4,6 +4,7 @@
* Copyright (c) 2016-2017, Lance Chao <lancerchao@fb.com>. All rights reserved.
* Copyright (c) 2016, Fridolin Pokorny <fridolin.pokorny@gmail.com>. All rights reserved.
* Copyright (c) 2016, Nikos Mavrogiannopoulos <nmav@gnutls.org>. All rights reserved.
+ * Copyright (c) 2018, Covalent IO, Inc. http://covalent.io
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -43,353 +44,794 @@
#define MAX_IV_SIZE TLS_CIPHER_AES_GCM_128_IV_SIZE
+static int __skb_nsg(struct sk_buff *skb, int offset, int len,
+ unsigned int recursion_level)
+{
+ int start = skb_headlen(skb);
+ int i, chunk = start - offset;
+ struct sk_buff *frag_iter;
+ int elt = 0;
+
+ if (unlikely(recursion_level >= 24))
+ return -EMSGSIZE;
+
+ if (chunk > 0) {
+ if (chunk > len)
+ chunk = len;
+ elt++;
+ len -= chunk;
+ if (len == 0)
+ return elt;
+ offset += chunk;
+ }
+
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
+ chunk = end - offset;
+ if (chunk > 0) {
+ if (chunk > len)
+ chunk = len;
+ elt++;
+ len -= chunk;
+ if (len == 0)
+ return elt;
+ offset += chunk;
+ }
+ start = end;
+ }
+
+ if (unlikely(skb_has_frag_list(skb))) {
+ skb_walk_frags(skb, frag_iter) {
+ int end, ret;
+
+ WARN_ON(start > offset + len);
+
+ end = start + frag_iter->len;
+ chunk = end - offset;
+ if (chunk > 0) {
+ if (chunk > len)
+ chunk = len;
+ ret = __skb_nsg(frag_iter, offset - start, chunk,
+ recursion_level + 1);
+ if (unlikely(ret < 0))
+ return ret;
+ elt += ret;
+ len -= chunk;
+ if (len == 0)
+ return elt;
+ offset += chunk;
+ }
+ start = end;
+ }
+ }
+ BUG_ON(len);
+ return elt;
+}
+
+/* Return the number of scatterlist elements required to completely map the
+ * skb, or -EMSGSIZE if the recursion depth is exceeded.
+ */
+static int skb_nsg(struct sk_buff *skb, int offset, int len)
+{
+ return __skb_nsg(skb, offset, len, 0);
+}
+
+static void tls_decrypt_done(struct crypto_async_request *req, int err)
+{
+ struct aead_request *aead_req = (struct aead_request *)req;
+ struct scatterlist *sgout = aead_req->dst;
+ struct tls_sw_context_rx *ctx;
+ struct tls_context *tls_ctx;
+ struct scatterlist *sg;
+ struct sk_buff *skb;
+ unsigned int pages;
+ int pending;
+
+ skb = (struct sk_buff *)req->data;
+ tls_ctx = tls_get_ctx(skb->sk);
+ ctx = tls_sw_ctx_rx(tls_ctx);
+ pending = atomic_dec_return(&ctx->decrypt_pending);
+
+ /* Propagate if there was an err */
+ if (err) {
+ ctx->async_wait.err = err;
+ tls_err_abort(skb->sk, err);
+ }
+
+ /* After using skb->sk to propagate sk through crypto async callback
+ * we need to NULL it again.
+ */
+ skb->sk = NULL;
+
+ /* Release the skb, pages and memory allocated for crypto req */
+ kfree_skb(skb);
+
+ /* Skip the first S/G entry as it points to AAD */
+ for_each_sg(sg_next(sgout), sg, UINT_MAX, pages) {
+ if (!sg)
+ break;
+ put_page(sg_page(sg));
+ }
+
+ kfree(aead_req);
+
+ if (!pending && READ_ONCE(ctx->async_notify))
+ complete(&ctx->async_wait.completion);
+}
+
static int tls_do_decryption(struct sock *sk,
+ struct sk_buff *skb,
struct scatterlist *sgin,
struct scatterlist *sgout,
char *iv_recv,
size_t data_len,
- struct sk_buff *skb,
- gfp_t flags)
+ struct aead_request *aead_req,
+ bool async)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
- struct strp_msg *rxm = strp_msg(skb);
- struct aead_request *aead_req;
-
int ret;
- unsigned int req_size = sizeof(struct aead_request) +
- crypto_aead_reqsize(ctx->aead_recv);
-
- aead_req = kzalloc(req_size, flags);
- if (!aead_req)
- return -ENOMEM;
aead_request_set_tfm(aead_req, ctx->aead_recv);
aead_request_set_ad(aead_req, TLS_AAD_SPACE_SIZE);
aead_request_set_crypt(aead_req, sgin, sgout,
data_len + tls_ctx->rx.tag_size,
(u8 *)iv_recv);
- aead_request_set_callback(aead_req, CRYPTO_TFM_REQ_MAY_BACKLOG,
- crypto_req_done, &ctx->async_wait);
- ret = crypto_wait_req(crypto_aead_decrypt(aead_req), &ctx->async_wait);
+ if (async) {
+ /* Using skb->sk to push sk through to crypto async callback
+ * handler. This allows propagating errors up to the socket
+ * if needed. It _must_ be cleared in the async handler
+ * before kfree_skb is called. We _know_ skb->sk is NULL
+ * because it is a clone from strparser.
+ */
+ skb->sk = sk;
+ aead_request_set_callback(aead_req,
+ CRYPTO_TFM_REQ_MAY_BACKLOG,
+ tls_decrypt_done, skb);
+ atomic_inc(&ctx->decrypt_pending);
+ } else {
+ aead_request_set_callback(aead_req,
+ CRYPTO_TFM_REQ_MAY_BACKLOG,
+ crypto_req_done, &ctx->async_wait);
+ }
- if (ret < 0)
- goto out;
+ ret = crypto_aead_decrypt(aead_req);
+ if (ret == -EINPROGRESS) {
+ if (async)
+ return ret;
- rxm->offset += tls_ctx->rx.prepend_size;
- rxm->full_len -= tls_ctx->rx.overhead_size;
- tls_advance_record_sn(sk, &tls_ctx->rx);
-
- ctx->decrypted = true;
+ ret = crypto_wait_req(ret, &ctx->async_wait);
+ }
- ctx->saved_data_ready(sk);
+ if (async)
+ atomic_dec(&ctx->decrypt_pending);
-out:
- kfree(aead_req);
return ret;
}
-static void trim_sg(struct sock *sk, struct scatterlist *sg,
- int *sg_num_elem, unsigned int *sg_size, int target_size)
+static void tls_trim_both_msgs(struct sock *sk, int target_size)
{
- int i = *sg_num_elem - 1;
- int trim = *sg_size - target_size;
-
- if (trim <= 0) {
- WARN_ON(trim < 0);
- return;
- }
+ struct tls_context *tls_ctx = tls_get_ctx(sk);
+ struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
+ struct tls_rec *rec = ctx->open_rec;
- *sg_size = target_size;
- while (trim >= sg[i].length) {
- trim -= sg[i].length;
- sk_mem_uncharge(sk, sg[i].length);
- put_page(sg_page(&sg[i]));
- i--;
+ sk_msg_trim(sk, &rec->msg_plaintext, target_size);
+ if (target_size > 0)
+ target_size += tls_ctx->tx.overhead_size;
+ sk_msg_trim(sk, &rec->msg_encrypted, target_size);
+}
- if (i < 0)
- goto out;
- }
+static int tls_alloc_encrypted_msg(struct sock *sk, int len)
+{
+ struct tls_context *tls_ctx = tls_get_ctx(sk);
+ struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
+ struct tls_rec *rec = ctx->open_rec;
+ struct sk_msg *msg_en = &rec->msg_encrypted;
- sg[i].length -= trim;
- sk_mem_uncharge(sk, trim);
+ return sk_msg_alloc(sk, msg_en, len, 0);
+}
-out:
- *sg_num_elem = i + 1;
+static int tls_clone_plaintext_msg(struct sock *sk, int required)
+{
+ struct tls_context *tls_ctx = tls_get_ctx(sk);
+ struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
+ struct tls_rec *rec = ctx->open_rec;
+ struct sk_msg *msg_pl = &rec->msg_plaintext;
+ struct sk_msg *msg_en = &rec->msg_encrypted;
+ int skip, len;
+
+ /* We add page references worth len bytes from encrypted sg
+ * at the end of plaintext sg. It is guaranteed that msg_en
+ * has enough required room (ensured by caller).
+ */
+ len = required - msg_pl->sg.size;
+
+ /* Skip initial bytes in msg_en's data to be able to use
+ * same offset of both plain and encrypted data.
+ */
+ skip = tls_ctx->tx.prepend_size + msg_pl->sg.size;
+
+ return sk_msg_clone(sk, msg_pl, msg_en, skip, len);
}
-static void trim_both_sgl(struct sock *sk, int target_size)
+static struct tls_rec *tls_get_rec(struct sock *sk)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
+ struct sk_msg *msg_pl, *msg_en;
+ struct tls_rec *rec;
+ int mem_size;
- trim_sg(sk, ctx->sg_plaintext_data,
- &ctx->sg_plaintext_num_elem,
- &ctx->sg_plaintext_size,
- target_size);
+ mem_size = sizeof(struct tls_rec) + crypto_aead_reqsize(ctx->aead_send);
- if (target_size > 0)
- target_size += tls_ctx->tx.overhead_size;
+ rec = kzalloc(mem_size, sk->sk_allocation);
+ if (!rec)
+ return NULL;
+
+ msg_pl = &rec->msg_plaintext;
+ msg_en = &rec->msg_encrypted;
+
+ sk_msg_init(msg_pl);
+ sk_msg_init(msg_en);
+
+ sg_init_table(rec->sg_aead_in, 2);
+ sg_set_buf(&rec->sg_aead_in[0], rec->aad_space,
+ sizeof(rec->aad_space));
+ sg_unmark_end(&rec->sg_aead_in[1]);
+
+ sg_init_table(rec->sg_aead_out, 2);
+ sg_set_buf(&rec->sg_aead_out[0], rec->aad_space,
+ sizeof(rec->aad_space));
+ sg_unmark_end(&rec->sg_aead_out[1]);
- trim_sg(sk, ctx->sg_encrypted_data,
- &ctx->sg_encrypted_num_elem,
- &ctx->sg_encrypted_size,
- target_size);
+ return rec;
}
-static int alloc_encrypted_sg(struct sock *sk, int len)
+static void tls_free_rec(struct sock *sk, struct tls_rec *rec)
+{
+ sk_msg_free(sk, &rec->msg_encrypted);
+ sk_msg_free(sk, &rec->msg_plaintext);
+ kfree(rec);
+}
+
+static void tls_free_open_rec(struct sock *sk)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
- int rc = 0;
+ struct tls_rec *rec = ctx->open_rec;
- rc = sk_alloc_sg(sk, len,
- ctx->sg_encrypted_data, 0,
- &ctx->sg_encrypted_num_elem,
- &ctx->sg_encrypted_size, 0);
-
- return rc;
+ if (rec) {
+ tls_free_rec(sk, rec);
+ ctx->open_rec = NULL;
+ }
}
-static int alloc_plaintext_sg(struct sock *sk, int len)
+int tls_tx_records(struct sock *sk, int flags)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
- int rc = 0;
+ struct tls_rec *rec, *tmp;
+ struct sk_msg *msg_en;
+ int tx_flags, rc = 0;
- rc = sk_alloc_sg(sk, len, ctx->sg_plaintext_data, 0,
- &ctx->sg_plaintext_num_elem, &ctx->sg_plaintext_size,
- tls_ctx->pending_open_record_frags);
+ if (tls_is_partially_sent_record(tls_ctx)) {
+ rec = list_first_entry(&ctx->tx_list,
+ struct tls_rec, list);
- return rc;
-}
+ if (flags == -1)
+ tx_flags = rec->tx_flags;
+ else
+ tx_flags = flags;
-static void free_sg(struct sock *sk, struct scatterlist *sg,
- int *sg_num_elem, unsigned int *sg_size)
-{
- int i, n = *sg_num_elem;
+ rc = tls_push_partial_record(sk, tls_ctx, tx_flags);
+ if (rc)
+ goto tx_err;
- for (i = 0; i < n; ++i) {
- sk_mem_uncharge(sk, sg[i].length);
- put_page(sg_page(&sg[i]));
+ /* Full record has been transmitted.
+ * Remove the head of tx_list
+ */
+ list_del(&rec->list);
+ sk_msg_free(sk, &rec->msg_plaintext);
+ kfree(rec);
}
- *sg_num_elem = 0;
- *sg_size = 0;
+
+ /* Tx all ready records */
+ list_for_each_entry_safe(rec, tmp, &ctx->tx_list, list) {
+ if (READ_ONCE(rec->tx_ready)) {
+ if (flags == -1)
+ tx_flags = rec->tx_flags;
+ else
+ tx_flags = flags;
+
+ msg_en = &rec->msg_encrypted;
+ rc = tls_push_sg(sk, tls_ctx,
+ &msg_en->sg.data[msg_en->sg.curr],
+ 0, tx_flags);
+ if (rc)
+ goto tx_err;
+
+ list_del(&rec->list);
+ sk_msg_free(sk, &rec->msg_plaintext);
+ kfree(rec);
+ } else {
+ break;
+ }
+ }
+
+tx_err:
+ if (rc < 0 && rc != -EAGAIN)
+ tls_err_abort(sk, EBADMSG);
+
+ return rc;
}
-static void tls_free_both_sg(struct sock *sk)
+static void tls_encrypt_done(struct crypto_async_request *req, int err)
{
+ struct aead_request *aead_req = (struct aead_request *)req;
+ struct sock *sk = req->data;
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
+ struct scatterlist *sge;
+ struct sk_msg *msg_en;
+ struct tls_rec *rec;
+ bool ready = false;
+ int pending;
+
+ rec = container_of(aead_req, struct tls_rec, aead_req);
+ msg_en = &rec->msg_encrypted;
- free_sg(sk, ctx->sg_encrypted_data, &ctx->sg_encrypted_num_elem,
- &ctx->sg_encrypted_size);
+ sge = sk_msg_elem(msg_en, msg_en->sg.curr);
+ sge->offset -= tls_ctx->tx.prepend_size;
+ sge->length += tls_ctx->tx.prepend_size;
- free_sg(sk, ctx->sg_plaintext_data, &ctx->sg_plaintext_num_elem,
- &ctx->sg_plaintext_size);
+ /* Check if error is previously set on socket */
+ if (err || sk->sk_err) {
+ rec = NULL;
+
+ /* If err is already set on socket, return the same code */
+ if (sk->sk_err) {
+ ctx->async_wait.err = sk->sk_err;
+ } else {
+ ctx->async_wait.err = err;
+ tls_err_abort(sk, err);
+ }
+ }
+
+ if (rec) {
+ struct tls_rec *first_rec;
+
+ /* Mark the record as ready for transmission */
+ smp_store_mb(rec->tx_ready, true);
+
+ /* If received record is at head of tx_list, schedule tx */
+ first_rec = list_first_entry(&ctx->tx_list,
+ struct tls_rec, list);
+ if (rec == first_rec)
+ ready = true;
+ }
+
+ pending = atomic_dec_return(&ctx->encrypt_pending);
+
+ if (!pending && READ_ONCE(ctx->async_notify))
+ complete(&ctx->async_wait.completion);
+
+ if (!ready)
+ return;
+
+ /* Schedule the transmission */
+ if (!test_and_set_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask))
+ schedule_delayed_work(&ctx->tx_work.work, 1);
}
-static int tls_do_encryption(struct tls_context *tls_ctx,
+static int tls_do_encryption(struct sock *sk,
+ struct tls_context *tls_ctx,
struct tls_sw_context_tx *ctx,
struct aead_request *aead_req,
- size_t data_len)
+ size_t data_len, u32 start)
{
+ struct tls_rec *rec = ctx->open_rec;
+ struct sk_msg *msg_en = &rec->msg_encrypted;
+ struct scatterlist *sge = sk_msg_elem(msg_en, start);
int rc;
- ctx->sg_encrypted_data[0].offset += tls_ctx->tx.prepend_size;
- ctx->sg_encrypted_data[0].length -= tls_ctx->tx.prepend_size;
+ sge->offset += tls_ctx->tx.prepend_size;
+ sge->length -= tls_ctx->tx.prepend_size;
+
+ msg_en->sg.curr = start;
aead_request_set_tfm(aead_req, ctx->aead_send);
aead_request_set_ad(aead_req, TLS_AAD_SPACE_SIZE);
- aead_request_set_crypt(aead_req, ctx->sg_aead_in, ctx->sg_aead_out,
+ aead_request_set_crypt(aead_req, rec->sg_aead_in,
+ rec->sg_aead_out,
data_len, tls_ctx->tx.iv);
aead_request_set_callback(aead_req, CRYPTO_TFM_REQ_MAY_BACKLOG,
- crypto_req_done, &ctx->async_wait);
+ tls_encrypt_done, sk);
+
+ /* Add the record in tx_list */
+ list_add_tail((struct list_head *)&rec->list, &ctx->tx_list);
+ atomic_inc(&ctx->encrypt_pending);
- rc = crypto_wait_req(crypto_aead_encrypt(aead_req), &ctx->async_wait);
+ rc = crypto_aead_encrypt(aead_req);
+ if (!rc || rc != -EINPROGRESS) {
+ atomic_dec(&ctx->encrypt_pending);
+ sge->offset -= tls_ctx->tx.prepend_size;
+ sge->length += tls_ctx->tx.prepend_size;
+ }
- ctx->sg_encrypted_data[0].offset -= tls_ctx->tx.prepend_size;
- ctx->sg_encrypted_data[0].length += tls_ctx->tx.prepend_size;
+ if (!rc) {
+ WRITE_ONCE(rec->tx_ready, true);
+ } else if (rc != -EINPROGRESS) {
+ list_del(&rec->list);
+ return rc;
+ }
+ /* Unhook the record from context if encryption is not failure */
+ ctx->open_rec = NULL;
+ tls_advance_record_sn(sk, &tls_ctx->tx);
return rc;
}
-static int tls_push_record(struct sock *sk, int flags,
- unsigned char record_type)
+static int tls_split_open_record(struct sock *sk, struct tls_rec *from,
+ struct tls_rec **to, struct sk_msg *msg_opl,
+ struct sk_msg *msg_oen, u32 split_point,
+ u32 tx_overhead_size, u32 *orig_end)
{
- struct tls_context *tls_ctx = tls_get_ctx(sk);
- struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
- struct aead_request *req;
- int rc;
+ u32 i, j, bytes = 0, apply = msg_opl->apply_bytes;
+ struct scatterlist *sge, *osge, *nsge;
+ u32 orig_size = msg_opl->sg.size;
+ struct scatterlist tmp = { };
+ struct sk_msg *msg_npl;
+ struct tls_rec *new;
+ int ret;
- req = kzalloc(sizeof(struct aead_request) +
- crypto_aead_reqsize(ctx->aead_send), sk->sk_allocation);
- if (!req)
+ new = tls_get_rec(sk);
+ if (!new)
return -ENOMEM;
+ ret = sk_msg_alloc(sk, &new->msg_encrypted, msg_opl->sg.size +
+ tx_overhead_size, 0);
+ if (ret < 0) {
+ tls_free_rec(sk, new);
+ return ret;
+ }
- sg_mark_end(ctx->sg_plaintext_data + ctx->sg_plaintext_num_elem - 1);
- sg_mark_end(ctx->sg_encrypted_data + ctx->sg_encrypted_num_elem - 1);
-
- tls_make_aad(ctx->aad_space, ctx->sg_plaintext_size,
- tls_ctx->tx.rec_seq, tls_ctx->tx.rec_seq_size,
- record_type);
-
- tls_fill_prepend(tls_ctx,
- page_address(sg_page(&ctx->sg_encrypted_data[0])) +
- ctx->sg_encrypted_data[0].offset,
- ctx->sg_plaintext_size, record_type);
-
- tls_ctx->pending_open_record_frags = 0;
- set_bit(TLS_PENDING_CLOSED_RECORD, &tls_ctx->flags);
+ *orig_end = msg_opl->sg.end;
+ i = msg_opl->sg.start;
+ sge = sk_msg_elem(msg_opl, i);
+ while (apply && sge->length) {
+ if (sge->length > apply) {
+ u32 len = sge->length - apply;
+
+ get_page(sg_page(sge));
+ sg_set_page(&tmp, sg_page(sge), len,
+ sge->offset + apply);
+ sge->length = apply;
+ bytes += apply;
+ apply = 0;
+ } else {
+ apply -= sge->length;
+ bytes += sge->length;
+ }
- rc = tls_do_encryption(tls_ctx, ctx, req, ctx->sg_plaintext_size);
- if (rc < 0) {
- /* If we are called from write_space and
- * we fail, we need to set this SOCK_NOSPACE
- * to trigger another write_space in the future.
- */
- set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
- goto out_req;
+ sk_msg_iter_var_next(i);
+ if (i == msg_opl->sg.end)
+ break;
+ sge = sk_msg_elem(msg_opl, i);
}
- free_sg(sk, ctx->sg_plaintext_data, &ctx->sg_plaintext_num_elem,
- &ctx->sg_plaintext_size);
+ msg_opl->sg.end = i;
+ msg_opl->sg.curr = i;
+ msg_opl->sg.copybreak = 0;
+ msg_opl->apply_bytes = 0;
+ msg_opl->sg.size = bytes;
+
+ msg_npl = &new->msg_plaintext;
+ msg_npl->apply_bytes = apply;
+ msg_npl->sg.size = orig_size - bytes;
+
+ j = msg_npl->sg.start;
+ nsge = sk_msg_elem(msg_npl, j);
+ if (tmp.length) {
+ memcpy(nsge, &tmp, sizeof(*nsge));
+ sk_msg_iter_var_next(j);
+ nsge = sk_msg_elem(msg_npl, j);
+ }
- ctx->sg_encrypted_num_elem = 0;
- ctx->sg_encrypted_size = 0;
+ osge = sk_msg_elem(msg_opl, i);
+ while (osge->length) {
+ memcpy(nsge, osge, sizeof(*nsge));
+ sg_unmark_end(nsge);
+ sk_msg_iter_var_next(i);
+ sk_msg_iter_var_next(j);
+ if (i == *orig_end)
+ break;
+ osge = sk_msg_elem(msg_opl, i);
+ nsge = sk_msg_elem(msg_npl, j);
+ }
- /* Only pass through MSG_DONTWAIT and MSG_NOSIGNAL flags */
- rc = tls_push_sg(sk, tls_ctx, ctx->sg_encrypted_data, 0, flags);
- if (rc < 0 && rc != -EAGAIN)
- tls_err_abort(sk, EBADMSG);
+ msg_npl->sg.end = j;
+ msg_npl->sg.curr = j;
+ msg_npl->sg.copybreak = 0;
- tls_advance_record_sn(sk, &tls_ctx->tx);
-out_req:
- kfree(req);
- return rc;
+ *to = new;
+ return 0;
}
-static int tls_sw_push_pending_record(struct sock *sk, int flags)
+static void tls_merge_open_record(struct sock *sk, struct tls_rec *to,
+ struct tls_rec *from, u32 orig_end)
{
- return tls_push_record(sk, flags, TLS_RECORD_TYPE_DATA);
+ struct sk_msg *msg_npl = &from->msg_plaintext;
+ struct sk_msg *msg_opl = &to->msg_plaintext;
+ struct scatterlist *osge, *nsge;
+ u32 i, j;
+
+ i = msg_opl->sg.end;
+ sk_msg_iter_var_prev(i);
+ j = msg_npl->sg.start;
+
+ osge = sk_msg_elem(msg_opl, i);
+ nsge = sk_msg_elem(msg_npl, j);
+
+ if (sg_page(osge) == sg_page(nsge) &&
+ osge->offset + osge->length == nsge->offset) {
+ osge->length += nsge->length;
+ put_page(sg_page(nsge));
+ }
+
+ msg_opl->sg.end = orig_end;
+ msg_opl->sg.curr = orig_end;
+ msg_opl->sg.copybreak = 0;
+ msg_opl->apply_bytes = msg_opl->sg.size + msg_npl->sg.size;
+ msg_opl->sg.size += msg_npl->sg.size;
+
+ sk_msg_free(sk, &to->msg_encrypted);
+ sk_msg_xfer_full(&to->msg_encrypted, &from->msg_encrypted);
+
+ kfree(from);
}
-static int zerocopy_from_iter(struct sock *sk, struct iov_iter *from,
- int length, int *pages_used,
- unsigned int *size_used,
- struct scatterlist *to, int to_max_pages,
- bool charge)
+static int tls_push_record(struct sock *sk, int flags,
+ unsigned char record_type)
{
- struct page *pages[MAX_SKB_FRAGS];
+ struct tls_context *tls_ctx = tls_get_ctx(sk);
+ struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
+ struct tls_rec *rec = ctx->open_rec, *tmp = NULL;
+ u32 i, split_point, uninitialized_var(orig_end);
+ struct sk_msg *msg_pl, *msg_en;
+ struct aead_request *req;
+ bool split;
+ int rc;
- size_t offset;
- ssize_t copied, use;
- int i = 0;
- unsigned int size = *size_used;
- int num_elem = *pages_used;
- int rc = 0;
- int maxpages;
+ if (!rec)
+ return 0;
- while (length > 0) {
- i = 0;
- maxpages = to_max_pages - num_elem;
- if (maxpages == 0) {
- rc = -EFAULT;
- goto out;
- }
- copied = iov_iter_get_pages(from, pages,
- length,
- maxpages, &offset);
- if (copied <= 0) {
- rc = -EFAULT;
- goto out;
- }
+ msg_pl = &rec->msg_plaintext;
+ msg_en = &rec->msg_encrypted;
+
+ split_point = msg_pl->apply_bytes;
+ split = split_point && split_point < msg_pl->sg.size;
+ if (split) {
+ rc = tls_split_open_record(sk, rec, &tmp, msg_pl, msg_en,
+ split_point, tls_ctx->tx.overhead_size,
+ &orig_end);
+ if (rc < 0)
+ return rc;
+ sk_msg_trim(sk, msg_en, msg_pl->sg.size +
+ tls_ctx->tx.overhead_size);
+ }
- iov_iter_advance(from, copied);
+ rec->tx_flags = flags;
+ req = &rec->aead_req;
- length -= copied;
- size += copied;
- while (copied) {
- use = min_t(int, copied, PAGE_SIZE - offset);
+ i = msg_pl->sg.end;
+ sk_msg_iter_var_prev(i);
+ sg_mark_end(sk_msg_elem(msg_pl, i));
- sg_set_page(&to[num_elem],
- pages[i], use, offset);
- sg_unmark_end(&to[num_elem]);
- if (charge)
- sk_mem_charge(sk, use);
+ i = msg_pl->sg.start;
+ sg_chain(rec->sg_aead_in, 2, rec->inplace_crypto ?
+ &msg_en->sg.data[i] : &msg_pl->sg.data[i]);
- offset = 0;
- copied -= use;
+ i = msg_en->sg.end;
+ sk_msg_iter_var_prev(i);
+ sg_mark_end(sk_msg_elem(msg_en, i));
+
+ i = msg_en->sg.start;
+ sg_chain(rec->sg_aead_out, 2, &msg_en->sg.data[i]);
+
+ tls_make_aad(rec->aad_space, msg_pl->sg.size,
+ tls_ctx->tx.rec_seq, tls_ctx->tx.rec_seq_size,
+ record_type);
+
+ tls_fill_prepend(tls_ctx,
+ page_address(sg_page(&msg_en->sg.data[i])) +
+ msg_en->sg.data[i].offset, msg_pl->sg.size,
+ record_type);
+
+ tls_ctx->pending_open_record_frags = false;
- ++i;
- ++num_elem;
+ rc = tls_do_encryption(sk, tls_ctx, ctx, req, msg_pl->sg.size, i);
+ if (rc < 0) {
+ if (rc != -EINPROGRESS) {
+ tls_err_abort(sk, EBADMSG);
+ if (split) {
+ tls_ctx->pending_open_record_frags = true;
+ tls_merge_open_record(sk, rec, tmp, orig_end);
+ }
}
+ return rc;
+ } else if (split) {
+ msg_pl = &tmp->msg_plaintext;
+ msg_en = &tmp->msg_encrypted;
+ sk_msg_trim(sk, msg_en, msg_pl->sg.size +
+ tls_ctx->tx.overhead_size);
+ tls_ctx->pending_open_record_frags = true;
+ ctx->open_rec = tmp;
}
-out:
- *size_used = size;
- *pages_used = num_elem;
-
- return rc;
+ return tls_tx_records(sk, flags);
}
-static int memcopy_from_iter(struct sock *sk, struct iov_iter *from,
- int bytes)
+static int bpf_exec_tx_verdict(struct sk_msg *msg, struct sock *sk,
+ bool full_record, u8 record_type,
+ size_t *copied, int flags)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
- struct scatterlist *sg = ctx->sg_plaintext_data;
- int copy, i, rc = 0;
-
- for (i = tls_ctx->pending_open_record_frags;
- i < ctx->sg_plaintext_num_elem; ++i) {
- copy = sg[i].length;
- if (copy_from_iter(
- page_address(sg_page(&sg[i])) + sg[i].offset,
- copy, from) != copy) {
- rc = -EFAULT;
- goto out;
+ struct sk_msg msg_redir = { };
+ struct sk_psock *psock;
+ struct sock *sk_redir;
+ struct tls_rec *rec;
+ int err = 0, send;
+ bool enospc;
+
+ psock = sk_psock_get(sk);
+ if (!psock)
+ return tls_push_record(sk, flags, record_type);
+more_data:
+ enospc = sk_msg_full(msg);
+ if (psock->eval == __SK_NONE)
+ psock->eval = sk_psock_msg_verdict(sk, psock, msg);
+ if (msg->cork_bytes && msg->cork_bytes > msg->sg.size &&
+ !enospc && !full_record) {
+ err = -ENOSPC;
+ goto out_err;
+ }
+ msg->cork_bytes = 0;
+ send = msg->sg.size;
+ if (msg->apply_bytes && msg->apply_bytes < send)
+ send = msg->apply_bytes;
+
+ switch (psock->eval) {
+ case __SK_PASS:
+ err = tls_push_record(sk, flags, record_type);
+ if (err < 0) {
+ *copied -= sk_msg_free(sk, msg);
+ tls_free_open_rec(sk);
+ goto out_err;
+ }
+ break;
+ case __SK_REDIRECT:
+ sk_redir = psock->sk_redir;
+ memcpy(&msg_redir, msg, sizeof(*msg));
+ if (msg->apply_bytes < send)
+ msg->apply_bytes = 0;
+ else
+ msg->apply_bytes -= send;
+ sk_msg_return_zero(sk, msg, send);
+ msg->sg.size -= send;
+ release_sock(sk);
+ err = tcp_bpf_sendmsg_redir(sk_redir, &msg_redir, send, flags);
+ lock_sock(sk);
+ if (err < 0) {
+ *copied -= sk_msg_free_nocharge(sk, &msg_redir);
+ msg->sg.size = 0;
}
- bytes -= copy;
+ if (msg->sg.size == 0)
+ tls_free_open_rec(sk);
+ break;
+ case __SK_DROP:
+ default:
+ sk_msg_free_partial(sk, msg, send);
+ if (msg->apply_bytes < send)
+ msg->apply_bytes = 0;
+ else
+ msg->apply_bytes -= send;
+ if (msg->sg.size == 0)
+ tls_free_open_rec(sk);
+ *copied -= send;
+ err = -EACCES;
+ }
- ++tls_ctx->pending_open_record_frags;
+ if (likely(!err)) {
+ bool reset_eval = !ctx->open_rec;
- if (!bytes)
- break;
+ rec = ctx->open_rec;
+ if (rec) {
+ msg = &rec->msg_plaintext;
+ if (!msg->apply_bytes)
+ reset_eval = true;
+ }
+ if (reset_eval) {
+ psock->eval = __SK_NONE;
+ if (psock->sk_redir) {
+ sock_put(psock->sk_redir);
+ psock->sk_redir = NULL;
+ }
+ }
+ if (rec)
+ goto more_data;
}
+ out_err:
+ sk_psock_put(sk, psock);
+ return err;
+}
-out:
- return rc;
+static int tls_sw_push_pending_record(struct sock *sk, int flags)
+{
+ struct tls_context *tls_ctx = tls_get_ctx(sk);
+ struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
+ struct tls_rec *rec = ctx->open_rec;
+ struct sk_msg *msg_pl;
+ size_t copied;
+
+ if (!rec)
+ return 0;
+
+ msg_pl = &rec->msg_plaintext;
+ copied = msg_pl->sg.size;
+ if (!copied)
+ return 0;
+
+ return bpf_exec_tx_verdict(msg_pl, sk, true, TLS_RECORD_TYPE_DATA,
+ &copied, flags);
}
int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
{
+ long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
- int ret = 0;
- int required_size;
- long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
+ struct crypto_tfm *tfm = crypto_aead_tfm(ctx->aead_send);
+ bool async_capable = tfm->__crt_alg->cra_flags & CRYPTO_ALG_ASYNC;
+ unsigned char record_type = TLS_RECORD_TYPE_DATA;
+ bool is_kvec = msg->msg_iter.type & ITER_KVEC;
bool eor = !(msg->msg_flags & MSG_MORE);
size_t try_to_copy, copied = 0;
- unsigned char record_type = TLS_RECORD_TYPE_DATA;
- int record_room;
+ struct sk_msg *msg_pl, *msg_en;
+ struct tls_rec *rec;
+ int required_size;
+ int num_async = 0;
bool full_record;
+ int record_room;
+ int num_zc = 0;
int orig_size;
+ int ret = 0;
if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL))
return -ENOTSUPP;
lock_sock(sk);
- if (tls_complete_pending_work(sk, tls_ctx, msg->msg_flags, &timeo))
- goto send_end;
+ /* Wait till there is any pending write on socket */
+ if (unlikely(sk->sk_write_pending)) {
+ ret = wait_on_pending_writer(sk, &timeo);
+ if (unlikely(ret))
+ goto send_end;
+ }
if (unlikely(msg->msg_controllen)) {
ret = tls_proccess_cmsg(sk, msg, &record_type);
- if (ret)
- goto send_end;
+ if (ret) {
+ if (ret == -EINPROGRESS)
+ num_async++;
+ else if (ret != -EAGAIN)
+ goto send_end;
+ }
}
while (msg_data_left(msg)) {
@@ -398,22 +840,35 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
goto send_end;
}
- orig_size = ctx->sg_plaintext_size;
+ if (ctx->open_rec)
+ rec = ctx->open_rec;
+ else
+ rec = ctx->open_rec = tls_get_rec(sk);
+ if (!rec) {
+ ret = -ENOMEM;
+ goto send_end;
+ }
+
+ msg_pl = &rec->msg_plaintext;
+ msg_en = &rec->msg_encrypted;
+
+ orig_size = msg_pl->sg.size;
full_record = false;
try_to_copy = msg_data_left(msg);
- record_room = TLS_MAX_PAYLOAD_SIZE - ctx->sg_plaintext_size;
+ record_room = TLS_MAX_PAYLOAD_SIZE - msg_pl->sg.size;
if (try_to_copy >= record_room) {
try_to_copy = record_room;
full_record = true;
}
- required_size = ctx->sg_plaintext_size + try_to_copy +
+ required_size = msg_pl->sg.size + try_to_copy +
tls_ctx->tx.overhead_size;
if (!sk_stream_memory_free(sk))
goto wait_for_sndbuf;
+
alloc_encrypted:
- ret = alloc_encrypted_sg(sk, required_size);
+ ret = tls_alloc_encrypted_msg(sk, required_size);
if (ret) {
if (ret != -ENOSPC)
goto wait_for_memory;
@@ -422,71 +877,88 @@ alloc_encrypted:
* actually allocated. The difference is due
* to max sg elements limit
*/
- try_to_copy -= required_size - ctx->sg_encrypted_size;
+ try_to_copy -= required_size - msg_en->sg.size;
full_record = true;
}
- if (full_record || eor) {
- ret = zerocopy_from_iter(sk, &msg->msg_iter,
- try_to_copy, &ctx->sg_plaintext_num_elem,
- &ctx->sg_plaintext_size,
- ctx->sg_plaintext_data,
- ARRAY_SIZE(ctx->sg_plaintext_data),
- true);
+ if (!is_kvec && (full_record || eor) && !async_capable) {
+ u32 first = msg_pl->sg.end;
+
+ ret = sk_msg_zerocopy_from_iter(sk, &msg->msg_iter,
+ msg_pl, try_to_copy);
if (ret)
goto fallback_to_reg_send;
+ rec->inplace_crypto = 0;
+
+ num_zc++;
copied += try_to_copy;
- ret = tls_push_record(sk, msg->msg_flags, record_type);
- if (!ret)
- continue;
- if (ret < 0)
- goto send_end;
+ sk_msg_sg_copy_set(msg_pl, first);
+ ret = bpf_exec_tx_verdict(msg_pl, sk, full_record,
+ record_type, &copied,
+ msg->msg_flags);
+ if (ret) {
+ if (ret == -EINPROGRESS)
+ num_async++;
+ else if (ret == -ENOMEM)
+ goto wait_for_memory;
+ else if (ret == -ENOSPC)
+ goto rollback_iter;
+ else if (ret != -EAGAIN)
+ goto send_end;
+ }
+ continue;
+rollback_iter:
copied -= try_to_copy;
-fallback_to_reg_send:
+ sk_msg_sg_copy_clear(msg_pl, first);
iov_iter_revert(&msg->msg_iter,
- ctx->sg_plaintext_size - orig_size);
- trim_sg(sk, ctx->sg_plaintext_data,
- &ctx->sg_plaintext_num_elem,
- &ctx->sg_plaintext_size,
- orig_size);
+ msg_pl->sg.size - orig_size);
+fallback_to_reg_send:
+ sk_msg_trim(sk, msg_pl, orig_size);
}
- required_size = ctx->sg_plaintext_size + try_to_copy;
-alloc_plaintext:
- ret = alloc_plaintext_sg(sk, required_size);
+ required_size = msg_pl->sg.size + try_to_copy;
+
+ ret = tls_clone_plaintext_msg(sk, required_size);
if (ret) {
if (ret != -ENOSPC)
- goto wait_for_memory;
+ goto send_end;
/* Adjust try_to_copy according to the amount that was
* actually allocated. The difference is due
* to max sg elements limit
*/
- try_to_copy -= required_size - ctx->sg_plaintext_size;
+ try_to_copy -= required_size - msg_pl->sg.size;
full_record = true;
-
- trim_sg(sk, ctx->sg_encrypted_data,
- &ctx->sg_encrypted_num_elem,
- &ctx->sg_encrypted_size,
- ctx->sg_plaintext_size +
- tls_ctx->tx.overhead_size);
+ sk_msg_trim(sk, msg_en, msg_pl->sg.size +
+ tls_ctx->tx.overhead_size);
}
- ret = memcopy_from_iter(sk, &msg->msg_iter, try_to_copy);
- if (ret)
+ ret = sk_msg_memcopy_from_iter(sk, &msg->msg_iter, msg_pl,
+ try_to_copy);
+ if (ret < 0)
goto trim_sgl;
+ /* Open records defined only if successfully copied, otherwise
+ * we would trim the sg but not reset the open record frags.
+ */
+ tls_ctx->pending_open_record_frags = true;
copied += try_to_copy;
if (full_record || eor) {
-push_record:
- ret = tls_push_record(sk, msg->msg_flags, record_type);
+ ret = bpf_exec_tx_verdict(msg_pl, sk, full_record,
+ record_type, &copied,
+ msg->msg_flags);
if (ret) {
- if (ret == -ENOMEM)
+ if (ret == -EINPROGRESS)
+ num_async++;
+ else if (ret == -ENOMEM)
goto wait_for_memory;
-
- goto send_end;
+ else if (ret != -EAGAIN) {
+ if (ret == -ENOSPC)
+ ret = 0;
+ goto send_end;
+ }
}
}
@@ -498,17 +970,37 @@ wait_for_memory:
ret = sk_stream_wait_memory(sk, &timeo);
if (ret) {
trim_sgl:
- trim_both_sgl(sk, orig_size);
+ tls_trim_both_msgs(sk, orig_size);
goto send_end;
}
- if (tls_is_pending_closed_record(tls_ctx))
- goto push_record;
-
- if (ctx->sg_encrypted_size < required_size)
+ if (msg_en->sg.size < required_size)
goto alloc_encrypted;
+ }
+
+ if (!num_async) {
+ goto send_end;
+ } else if (num_zc) {
+ /* Wait for pending encryptions to get completed */
+ smp_store_mb(ctx->async_notify, true);
- goto alloc_plaintext;
+ if (atomic_read(&ctx->encrypt_pending))
+ crypto_wait_req(-EINPROGRESS, &ctx->async_wait);
+ else
+ reinit_completion(&ctx->async_wait.completion);
+
+ WRITE_ONCE(ctx->async_notify, false);
+
+ if (ctx->async_wait.err) {
+ ret = ctx->async_wait.err;
+ copied = 0;
+ }
+ }
+
+ /* Transmit if any encryptions have completed */
+ if (test_and_clear_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask)) {
+ cancel_delayed_work(&ctx->tx_work.work);
+ tls_tx_records(sk, msg->msg_flags);
}
send_end:
@@ -521,16 +1013,18 @@ send_end:
int tls_sw_sendpage(struct sock *sk, struct page *page,
int offset, size_t size, int flags)
{
+ long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
- int ret = 0;
- long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
- bool eor;
- size_t orig_size = size;
unsigned char record_type = TLS_RECORD_TYPE_DATA;
- struct scatterlist *sg;
+ struct sk_msg *msg_pl;
+ struct tls_rec *rec;
+ int num_async = 0;
+ size_t copied = 0;
bool full_record;
int record_room;
+ int ret = 0;
+ bool eor;
if (flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL |
MSG_SENDPAGE_NOTLAST))
@@ -543,8 +1037,12 @@ int tls_sw_sendpage(struct sock *sk, struct page *page,
sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
- if (tls_complete_pending_work(sk, tls_ctx, flags, &timeo))
- goto sendpage_end;
+ /* Wait till there is any pending write on socket */
+ if (unlikely(sk->sk_write_pending)) {
+ ret = wait_on_pending_writer(sk, &timeo);
+ if (unlikely(ret))
+ goto sendpage_end;
+ }
/* Call the sk_stream functions to manage the sndbuf mem. */
while (size > 0) {
@@ -555,20 +1053,33 @@ int tls_sw_sendpage(struct sock *sk, struct page *page,
goto sendpage_end;
}
+ if (ctx->open_rec)
+ rec = ctx->open_rec;
+ else
+ rec = ctx->open_rec = tls_get_rec(sk);
+ if (!rec) {
+ ret = -ENOMEM;
+ goto sendpage_end;
+ }
+
+ msg_pl = &rec->msg_plaintext;
+
full_record = false;
- record_room = TLS_MAX_PAYLOAD_SIZE - ctx->sg_plaintext_size;
+ record_room = TLS_MAX_PAYLOAD_SIZE - msg_pl->sg.size;
+ copied = 0;
copy = size;
if (copy >= record_room) {
copy = record_room;
full_record = true;
}
- required_size = ctx->sg_plaintext_size + copy +
- tls_ctx->tx.overhead_size;
+
+ required_size = msg_pl->sg.size + copy +
+ tls_ctx->tx.overhead_size;
if (!sk_stream_memory_free(sk))
goto wait_for_sndbuf;
alloc_payload:
- ret = alloc_encrypted_sg(sk, required_size);
+ ret = tls_alloc_encrypted_msg(sk, required_size);
if (ret) {
if (ret != -ENOSPC)
goto wait_for_memory;
@@ -577,33 +1088,32 @@ alloc_payload:
* actually allocated. The difference is due
* to max sg elements limit
*/
- copy -= required_size - ctx->sg_plaintext_size;
+ copy -= required_size - msg_pl->sg.size;
full_record = true;
}
- get_page(page);
- sg = ctx->sg_plaintext_data + ctx->sg_plaintext_num_elem;
- sg_set_page(sg, page, copy, offset);
- sg_unmark_end(sg);
-
- ctx->sg_plaintext_num_elem++;
-
+ sk_msg_page_add(msg_pl, page, copy, offset);
sk_mem_charge(sk, copy);
+
offset += copy;
size -= copy;
- ctx->sg_plaintext_size += copy;
- tls_ctx->pending_open_record_frags = ctx->sg_plaintext_num_elem;
-
- if (full_record || eor ||
- ctx->sg_plaintext_num_elem ==
- ARRAY_SIZE(ctx->sg_plaintext_data)) {
-push_record:
- ret = tls_push_record(sk, flags, record_type);
+ copied += copy;
+
+ tls_ctx->pending_open_record_frags = true;
+ if (full_record || eor || sk_msg_full(msg_pl)) {
+ rec->inplace_crypto = 0;
+ ret = bpf_exec_tx_verdict(msg_pl, sk, full_record,
+ record_type, &copied, flags);
if (ret) {
- if (ret == -ENOMEM)
+ if (ret == -EINPROGRESS)
+ num_async++;
+ else if (ret == -ENOMEM)
goto wait_for_memory;
-
- goto sendpage_end;
+ else if (ret != -EAGAIN) {
+ if (ret == -ENOSPC)
+ ret = 0;
+ goto sendpage_end;
+ }
}
}
continue;
@@ -612,35 +1122,35 @@ wait_for_sndbuf:
wait_for_memory:
ret = sk_stream_wait_memory(sk, &timeo);
if (ret) {
- trim_both_sgl(sk, ctx->sg_plaintext_size);
+ tls_trim_both_msgs(sk, msg_pl->sg.size);
goto sendpage_end;
}
- if (tls_is_pending_closed_record(tls_ctx))
- goto push_record;
-
goto alloc_payload;
}
+ if (num_async) {
+ /* Transmit if any encryptions have completed */
+ if (test_and_clear_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask)) {
+ cancel_delayed_work(&ctx->tx_work.work);
+ tls_tx_records(sk, flags);
+ }
+ }
sendpage_end:
- if (orig_size > size)
- ret = orig_size - size;
- else
- ret = sk_stream_error(sk, flags, ret);
-
+ ret = sk_stream_error(sk, flags, ret);
release_sock(sk);
- return ret;
+ return copied ? copied : ret;
}
-static struct sk_buff *tls_wait_data(struct sock *sk, int flags,
- long timeo, int *err)
+static struct sk_buff *tls_wait_data(struct sock *sk, struct sk_psock *psock,
+ int flags, long timeo, int *err)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
struct sk_buff *skb;
DEFINE_WAIT_FUNC(wait, woken_wake_function);
- while (!(skb = ctx->recv_pkt)) {
+ while (!(skb = ctx->recv_pkt) && sk_psock_queue_empty(psock)) {
if (sk->sk_err) {
*err = sock_error(sk);
return NULL;
@@ -659,7 +1169,10 @@ static struct sk_buff *tls_wait_data(struct sock *sk, int flags,
add_wait_queue(sk_sleep(sk), &wait);
sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
- sk_wait_event(sk, &timeo, ctx->recv_pkt != skb, &wait);
+ sk_wait_event(sk, &timeo,
+ ctx->recv_pkt != skb ||
+ !sk_psock_queue_empty(psock),
+ &wait);
sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
remove_wait_queue(sk_sleep(sk), &wait);
@@ -673,57 +1186,234 @@ static struct sk_buff *tls_wait_data(struct sock *sk, int flags,
return skb;
}
-static int decrypt_skb(struct sock *sk, struct sk_buff *skb,
- struct scatterlist *sgout)
+static int tls_setup_from_iter(struct sock *sk, struct iov_iter *from,
+ int length, int *pages_used,
+ unsigned int *size_used,
+ struct scatterlist *to,
+ int to_max_pages)
+{
+ int rc = 0, i = 0, num_elem = *pages_used, maxpages;
+ struct page *pages[MAX_SKB_FRAGS];
+ unsigned int size = *size_used;
+ ssize_t copied, use;
+ size_t offset;
+
+ while (length > 0) {
+ i = 0;
+ maxpages = to_max_pages - num_elem;
+ if (maxpages == 0) {
+ rc = -EFAULT;
+ goto out;
+ }
+ copied = iov_iter_get_pages(from, pages,
+ length,
+ maxpages, &offset);
+ if (copied <= 0) {
+ rc = -EFAULT;
+ goto out;
+ }
+
+ iov_iter_advance(from, copied);
+
+ length -= copied;
+ size += copied;
+ while (copied) {
+ use = min_t(int, copied, PAGE_SIZE - offset);
+
+ sg_set_page(&to[num_elem],
+ pages[i], use, offset);
+ sg_unmark_end(&to[num_elem]);
+ /* We do not uncharge memory from this API */
+
+ offset = 0;
+ copied -= use;
+
+ i++;
+ num_elem++;
+ }
+ }
+ /* Mark the end in the last sg entry if newly added */
+ if (num_elem > *pages_used)
+ sg_mark_end(&to[num_elem - 1]);
+out:
+ if (rc)
+ iov_iter_revert(from, size - *size_used);
+ *size_used = size;
+ *pages_used = num_elem;
+
+ return rc;
+}
+
+/* This function decrypts the input skb into either out_iov or in out_sg
+ * or in skb buffers itself. The input parameter 'zc' indicates if
+ * zero-copy mode needs to be tried or not. With zero-copy mode, either
+ * out_iov or out_sg must be non-NULL. In case both out_iov and out_sg are
+ * NULL, then the decryption happens inside skb buffers itself, i.e.
+ * zero-copy gets disabled and 'zc' is updated.
+ */
+
+static int decrypt_internal(struct sock *sk, struct sk_buff *skb,
+ struct iov_iter *out_iov,
+ struct scatterlist *out_sg,
+ int *chunk, bool *zc)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
- char iv[TLS_CIPHER_AES_GCM_128_SALT_SIZE + MAX_IV_SIZE];
- struct scatterlist sgin_arr[MAX_SKB_FRAGS + 2];
- struct scatterlist *sgin = &sgin_arr[0];
struct strp_msg *rxm = strp_msg(skb);
- int ret, nsg = ARRAY_SIZE(sgin_arr);
+ int n_sgin, n_sgout, nsg, mem_size, aead_size, err, pages = 0;
+ struct aead_request *aead_req;
struct sk_buff *unused;
+ u8 *aad, *iv, *mem = NULL;
+ struct scatterlist *sgin = NULL;
+ struct scatterlist *sgout = NULL;
+ const int data_len = rxm->full_len - tls_ctx->rx.overhead_size;
+
+ if (*zc && (out_iov || out_sg)) {
+ if (out_iov)
+ n_sgout = iov_iter_npages(out_iov, INT_MAX) + 1;
+ else
+ n_sgout = sg_nents(out_sg);
+ n_sgin = skb_nsg(skb, rxm->offset + tls_ctx->rx.prepend_size,
+ rxm->full_len - tls_ctx->rx.prepend_size);
+ } else {
+ n_sgout = 0;
+ *zc = false;
+ n_sgin = skb_cow_data(skb, 0, &unused);
+ }
- ret = skb_copy_bits(skb, rxm->offset + TLS_HEADER_SIZE,
+ if (n_sgin < 1)
+ return -EBADMSG;
+
+ /* Increment to accommodate AAD */
+ n_sgin = n_sgin + 1;
+
+ nsg = n_sgin + n_sgout;
+
+ aead_size = sizeof(*aead_req) + crypto_aead_reqsize(ctx->aead_recv);
+ mem_size = aead_size + (nsg * sizeof(struct scatterlist));
+ mem_size = mem_size + TLS_AAD_SPACE_SIZE;
+ mem_size = mem_size + crypto_aead_ivsize(ctx->aead_recv);
+
+ /* Allocate a single block of memory which contains
+ * aead_req || sgin[] || sgout[] || aad || iv.
+ * This order achieves correct alignment for aead_req, sgin, sgout.
+ */
+ mem = kmalloc(mem_size, sk->sk_allocation);
+ if (!mem)
+ return -ENOMEM;
+
+ /* Segment the allocated memory */
+ aead_req = (struct aead_request *)mem;
+ sgin = (struct scatterlist *)(mem + aead_size);
+ sgout = sgin + n_sgin;
+ aad = (u8 *)(sgout + n_sgout);
+ iv = aad + TLS_AAD_SPACE_SIZE;
+
+ /* Prepare IV */
+ err = skb_copy_bits(skb, rxm->offset + TLS_HEADER_SIZE,
iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE,
tls_ctx->rx.iv_size);
- if (ret < 0)
- return ret;
-
- memcpy(iv, tls_ctx->rx.iv, TLS_CIPHER_AES_GCM_128_SALT_SIZE);
- if (!sgout) {
- nsg = skb_cow_data(skb, 0, &unused) + 1;
- sgin = kmalloc_array(nsg, sizeof(*sgin), sk->sk_allocation);
- sgout = sgin;
+ if (err < 0) {
+ kfree(mem);
+ return err;
}
+ memcpy(iv, tls_ctx->rx.iv, TLS_CIPHER_AES_GCM_128_SALT_SIZE);
- sg_init_table(sgin, nsg);
- sg_set_buf(&sgin[0], ctx->rx_aad_ciphertext, TLS_AAD_SPACE_SIZE);
+ /* Prepare AAD */
+ tls_make_aad(aad, rxm->full_len - tls_ctx->rx.overhead_size,
+ tls_ctx->rx.rec_seq, tls_ctx->rx.rec_seq_size,
+ ctx->control);
- nsg = skb_to_sgvec(skb, &sgin[1],
+ /* Prepare sgin */
+ sg_init_table(sgin, n_sgin);
+ sg_set_buf(&sgin[0], aad, TLS_AAD_SPACE_SIZE);
+ err = skb_to_sgvec(skb, &sgin[1],
rxm->offset + tls_ctx->rx.prepend_size,
rxm->full_len - tls_ctx->rx.prepend_size);
- if (nsg < 0) {
- ret = nsg;
- goto out;
+ if (err < 0) {
+ kfree(mem);
+ return err;
}
- tls_make_aad(ctx->rx_aad_ciphertext,
- rxm->full_len - tls_ctx->rx.overhead_size,
- tls_ctx->rx.rec_seq,
- tls_ctx->rx.rec_seq_size,
- ctx->control);
+ if (n_sgout) {
+ if (out_iov) {
+ sg_init_table(sgout, n_sgout);
+ sg_set_buf(&sgout[0], aad, TLS_AAD_SPACE_SIZE);
- ret = tls_do_decryption(sk, sgin, sgout, iv,
- rxm->full_len - tls_ctx->rx.overhead_size,
- skb, sk->sk_allocation);
+ *chunk = 0;
+ err = tls_setup_from_iter(sk, out_iov, data_len,
+ &pages, chunk, &sgout[1],
+ (n_sgout - 1));
+ if (err < 0)
+ goto fallback_to_reg_recv;
+ } else if (out_sg) {
+ memcpy(sgout, out_sg, n_sgout * sizeof(*sgout));
+ } else {
+ goto fallback_to_reg_recv;
+ }
+ } else {
+fallback_to_reg_recv:
+ sgout = sgin;
+ pages = 0;
+ *chunk = 0;
+ *zc = false;
+ }
-out:
- if (sgin != &sgin_arr[0])
- kfree(sgin);
+ /* Prepare and submit AEAD request */
+ err = tls_do_decryption(sk, skb, sgin, sgout, iv,
+ data_len, aead_req, *zc);
+ if (err == -EINPROGRESS)
+ return err;
- return ret;
+ /* Release the pages in case iov was mapped to pages */
+ for (; pages > 0; pages--)
+ put_page(sg_page(&sgout[pages]));
+
+ kfree(mem);
+ return err;
+}
+
+static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb,
+ struct iov_iter *dest, int *chunk, bool *zc)
+{
+ struct tls_context *tls_ctx = tls_get_ctx(sk);
+ struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
+ struct strp_msg *rxm = strp_msg(skb);
+ int err = 0;
+
+#ifdef CONFIG_TLS_DEVICE
+ err = tls_device_decrypted(sk, skb);
+ if (err < 0)
+ return err;
+#endif
+ if (!ctx->decrypted) {
+ err = decrypt_internal(sk, skb, dest, NULL, chunk, zc);
+ if (err < 0) {
+ if (err == -EINPROGRESS)
+ tls_advance_record_sn(sk, &tls_ctx->rx);
+
+ return err;
+ }
+ } else {
+ *zc = false;
+ }
+
+ rxm->offset += tls_ctx->rx.prepend_size;
+ rxm->full_len -= tls_ctx->rx.overhead_size;
+ tls_advance_record_sn(sk, &tls_ctx->rx);
+ ctx->decrypted = true;
+ ctx->saved_data_ready(sk);
+
+ return err;
+}
+
+int decrypt_skb(struct sock *sk, struct sk_buff *skb,
+ struct scatterlist *sgout)
+{
+ bool zc = true;
+ int chunk;
+
+ return decrypt_internal(sk, skb, NULL, sgout, &chunk, &zc);
}
static bool tls_sw_advance_skb(struct sock *sk, struct sk_buff *skb,
@@ -731,18 +1421,20 @@ static bool tls_sw_advance_skb(struct sock *sk, struct sk_buff *skb,
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
- struct strp_msg *rxm = strp_msg(skb);
- if (len < rxm->full_len) {
- rxm->offset += len;
- rxm->full_len -= len;
+ if (skb) {
+ struct strp_msg *rxm = strp_msg(skb);
- return false;
+ if (len < rxm->full_len) {
+ rxm->offset += len;
+ rxm->full_len -= len;
+ return false;
+ }
+ kfree_skb(skb);
}
/* Finished with message */
ctx->recv_pkt = NULL;
- kfree_skb(skb);
__strp_unpause(&ctx->strp);
return true;
@@ -757,6 +1449,7 @@ int tls_sw_recvmsg(struct sock *sk,
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
+ struct sk_psock *psock;
unsigned char control;
struct strp_msg *rxm;
struct sk_buff *skb;
@@ -764,25 +1457,41 @@ int tls_sw_recvmsg(struct sock *sk,
bool cmsg = false;
int target, err = 0;
long timeo;
+ bool is_kvec = msg->msg_iter.type & ITER_KVEC;
+ int num_async = 0;
flags |= nonblock;
if (unlikely(flags & MSG_ERRQUEUE))
return sock_recv_errqueue(sk, msg, len, SOL_IP, IP_RECVERR);
+ psock = sk_psock_get(sk);
lock_sock(sk);
target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
do {
bool zc = false;
+ bool async = false;
int chunk = 0;
- skb = tls_wait_data(sk, flags, timeo, &err);
- if (!skb)
+ skb = tls_wait_data(sk, psock, flags, timeo, &err);
+ if (!skb) {
+ if (psock) {
+ int ret = __tcp_bpf_recvmsg(sk, psock,
+ msg, len, flags);
+
+ if (ret > 0) {
+ copied += ret;
+ len -= ret;
+ continue;
+ }
+ }
goto recv_end;
+ }
rxm = strp_msg(skb);
+
if (!cmsg) {
int cerr;
@@ -801,60 +1510,47 @@ int tls_sw_recvmsg(struct sock *sk,
}
if (!ctx->decrypted) {
- int page_count;
- int to_copy;
-
- page_count = iov_iter_npages(&msg->msg_iter,
- MAX_SKB_FRAGS);
- to_copy = rxm->full_len - tls_ctx->rx.overhead_size;
- if (to_copy <= len && page_count < MAX_SKB_FRAGS &&
- likely(!(flags & MSG_PEEK))) {
- struct scatterlist sgin[MAX_SKB_FRAGS + 1];
- int pages = 0;
+ int to_copy = rxm->full_len - tls_ctx->rx.overhead_size;
+ if (!is_kvec && to_copy <= len &&
+ likely(!(flags & MSG_PEEK)))
zc = true;
- sg_init_table(sgin, MAX_SKB_FRAGS + 1);
- sg_set_buf(&sgin[0], ctx->rx_aad_plaintext,
- TLS_AAD_SPACE_SIZE);
-
- err = zerocopy_from_iter(sk, &msg->msg_iter,
- to_copy, &pages,
- &chunk, &sgin[1],
- MAX_SKB_FRAGS, false);
- if (err < 0)
- goto fallback_to_reg_recv;
-
- err = decrypt_skb(sk, skb, sgin);
- for (; pages > 0; pages--)
- put_page(sg_page(&sgin[pages]));
- if (err < 0) {
- tls_err_abort(sk, EBADMSG);
- goto recv_end;
- }
- } else {
-fallback_to_reg_recv:
- err = decrypt_skb(sk, skb, NULL);
- if (err < 0) {
- tls_err_abort(sk, EBADMSG);
- goto recv_end;
- }
+
+ err = decrypt_skb_update(sk, skb, &msg->msg_iter,
+ &chunk, &zc);
+ if (err < 0 && err != -EINPROGRESS) {
+ tls_err_abort(sk, EBADMSG);
+ goto recv_end;
}
+
+ if (err == -EINPROGRESS) {
+ async = true;
+ num_async++;
+ goto pick_next_record;
+ }
+
ctx->decrypted = true;
}
if (!zc) {
chunk = min_t(unsigned int, rxm->full_len, len);
+
err = skb_copy_datagram_msg(skb, rxm->offset, msg,
chunk);
if (err < 0)
goto recv_end;
}
+pick_next_record:
copied += chunk;
len -= chunk;
if (likely(!(flags & MSG_PEEK))) {
u8 control = ctx->control;
+ /* For async, drop current skb reference */
+ if (async)
+ skb = NULL;
+
if (tls_sw_advance_skb(sk, skb, chunk)) {
/* Return full control message to
* userspace before trying to parse
@@ -863,15 +1559,43 @@ fallback_to_reg_recv:
msg->msg_flags |= MSG_EOR;
if (control != TLS_RECORD_TYPE_DATA)
goto recv_end;
+ } else {
+ break;
}
+ } else {
+ /* MSG_PEEK right now cannot look beyond current skb
+ * from strparser, meaning we cannot advance skb here
+ * and thus unpause strparser since we'd loose original
+ * one.
+ */
+ break;
}
+
/* If we have a new message from strparser, continue now. */
if (copied >= target && !ctx->recv_pkt)
break;
} while (len);
recv_end:
+ if (num_async) {
+ /* Wait for all previously submitted records to be decrypted */
+ smp_store_mb(ctx->async_notify, true);
+ if (atomic_read(&ctx->decrypt_pending)) {
+ err = crypto_wait_req(-EINPROGRESS, &ctx->async_wait);
+ if (err) {
+ /* one of async decrypt failed */
+ tls_err_abort(sk, err);
+ copied = 0;
+ }
+ } else {
+ reinit_completion(&ctx->async_wait.completion);
+ }
+ WRITE_ONCE(ctx->async_notify, false);
+ }
+
release_sock(sk);
+ if (psock)
+ sk_psock_put(sk, psock);
return copied ? : err;
}
@@ -888,12 +1612,13 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos,
int err = 0;
long timeo;
int chunk;
+ bool zc = false;
lock_sock(sk);
timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
- skb = tls_wait_data(sk, flags, timeo, &err);
+ skb = tls_wait_data(sk, NULL, flags, timeo, &err);
if (!skb)
goto splice_read_end;
@@ -904,7 +1629,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos,
}
if (!ctx->decrypted) {
- err = decrypt_skb(sk, skb, NULL);
+ err = decrypt_skb_update(sk, skb, NULL, &chunk, &zc);
if (err < 0) {
tls_err_abort(sk, EBADMSG);
@@ -927,30 +1652,27 @@ splice_read_end:
return copied ? : err;
}
-unsigned int tls_sw_poll(struct file *file, struct socket *sock,
- struct poll_table_struct *wait)
+bool tls_sw_stream_read(const struct sock *sk)
{
- unsigned int ret;
- struct sock *sk = sock->sk;
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
+ bool ingress_empty = true;
+ struct sk_psock *psock;
- /* Grab POLLOUT and POLLHUP from the underlying socket */
- ret = ctx->sk_poll(file, sock, wait);
-
- /* Clear POLLIN bits, and set based on recv_pkt */
- ret &= ~(POLLIN | POLLRDNORM);
- if (ctx->recv_pkt)
- ret |= POLLIN | POLLRDNORM;
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (psock)
+ ingress_empty = list_empty(&psock->ingress_msg);
+ rcu_read_unlock();
- return ret;
+ return !ingress_empty || ctx->recv_pkt;
}
static int tls_read_size(struct strparser *strp, struct sk_buff *skb)
{
struct tls_context *tls_ctx = tls_get_ctx(strp->sk);
struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
- char header[tls_ctx->rx.prepend_size];
+ char header[TLS_HEADER_SIZE + MAX_IV_SIZE];
struct strp_msg *rxm = strp_msg(skb);
size_t cipher_overhead;
size_t data_len = 0;
@@ -960,6 +1682,12 @@ static int tls_read_size(struct strparser *strp, struct sk_buff *skb)
if (rxm->offset + tls_ctx->rx.prepend_size > skb->len)
return 0;
+ /* Sanity-check size of on-stack buffer. */
+ if (WARN_ON(tls_ctx->rx.prepend_size > sizeof(header))) {
+ ret = -EINVAL;
+ goto read_failure;
+ }
+
/* Linearize header to local buffer */
ret = skb_copy_bits(skb, rxm->offset, header, tls_ctx->rx.prepend_size);
@@ -981,12 +1709,16 @@ static int tls_read_size(struct strparser *strp, struct sk_buff *skb)
goto read_failure;
}
- if (header[1] != TLS_VERSION_MINOR(tls_ctx->crypto_recv.version) ||
- header[2] != TLS_VERSION_MAJOR(tls_ctx->crypto_recv.version)) {
+ if (header[1] != TLS_VERSION_MINOR(tls_ctx->crypto_recv.info.version) ||
+ header[2] != TLS_VERSION_MAJOR(tls_ctx->crypto_recv.info.version)) {
ret = -EINVAL;
goto read_failure;
}
+#ifdef CONFIG_TLS_DEVICE
+ handle_device_resync(strp->sk, TCP_SKB_CB(skb)->seq + rxm->offset,
+ *(u64*)tls_ctx->rx.rec_seq);
+#endif
return data_len + TLS_HEADER_SIZE;
read_failure:
@@ -999,48 +1731,91 @@ static void tls_queue(struct strparser *strp, struct sk_buff *skb)
{
struct tls_context *tls_ctx = tls_get_ctx(strp->sk);
struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
- struct strp_msg *rxm;
-
- rxm = strp_msg(skb);
ctx->decrypted = false;
ctx->recv_pkt = skb;
strp_pause(strp);
- strp->sk->sk_state_change(strp->sk);
+ ctx->saved_data_ready(strp->sk);
}
static void tls_data_ready(struct sock *sk)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
+ struct sk_psock *psock;
strp_data_ready(&ctx->strp);
+
+ psock = sk_psock_get(sk);
+ if (psock && !list_empty(&psock->ingress_msg)) {
+ ctx->saved_data_ready(sk);
+ sk_psock_put(sk, psock);
+ }
}
void tls_sw_free_resources_tx(struct sock *sk)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
+ struct tls_rec *rec, *tmp;
+
+ /* Wait for any pending async encryptions to complete */
+ smp_store_mb(ctx->async_notify, true);
+ if (atomic_read(&ctx->encrypt_pending))
+ crypto_wait_req(-EINPROGRESS, &ctx->async_wait);
+
+ cancel_delayed_work_sync(&ctx->tx_work.work);
+
+ /* Tx whatever records we can transmit and abandon the rest */
+ tls_tx_records(sk, -1);
+
+ /* Free up un-sent records in tx_list. First, free
+ * the partially sent record if any at head of tx_list.
+ */
+ if (tls_ctx->partially_sent_record) {
+ struct scatterlist *sg = tls_ctx->partially_sent_record;
+
+ while (1) {
+ put_page(sg_page(sg));
+ sk_mem_uncharge(sk, sg->length);
+
+ if (sg_is_last(sg))
+ break;
+ sg++;
+ }
+
+ tls_ctx->partially_sent_record = NULL;
+
+ rec = list_first_entry(&ctx->tx_list,
+ struct tls_rec, list);
+ list_del(&rec->list);
+ sk_msg_free(sk, &rec->msg_plaintext);
+ kfree(rec);
+ }
+
+ list_for_each_entry_safe(rec, tmp, &ctx->tx_list, list) {
+ list_del(&rec->list);
+ sk_msg_free(sk, &rec->msg_encrypted);
+ sk_msg_free(sk, &rec->msg_plaintext);
+ kfree(rec);
+ }
- if (ctx->aead_send)
- crypto_free_aead(ctx->aead_send);
- tls_free_both_sg(sk);
+ crypto_free_aead(ctx->aead_send);
+ tls_free_open_rec(sk);
kfree(ctx);
}
-void tls_sw_free_resources_rx(struct sock *sk)
+void tls_sw_release_resources_rx(struct sock *sk)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
if (ctx->aead_recv) {
- if (ctx->recv_pkt) {
- kfree_skb(ctx->recv_pkt);
- ctx->recv_pkt = NULL;
- }
+ kfree_skb(ctx->recv_pkt);
+ ctx->recv_pkt = NULL;
crypto_free_aead(ctx->aead_recv);
strp_stop(&ctx->strp);
write_lock_bh(&sk->sk_callback_lock);
@@ -1050,13 +1825,38 @@ void tls_sw_free_resources_rx(struct sock *sk)
strp_done(&ctx->strp);
lock_sock(sk);
}
+}
+
+void tls_sw_free_resources_rx(struct sock *sk)
+{
+ struct tls_context *tls_ctx = tls_get_ctx(sk);
+ struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
+
+ tls_sw_release_resources_rx(sk);
kfree(ctx);
}
+/* The work handler to transmitt the encrypted records in tx_list */
+static void tx_work_handler(struct work_struct *work)
+{
+ struct delayed_work *delayed_work = to_delayed_work(work);
+ struct tx_work *tx_work = container_of(delayed_work,
+ struct tx_work, work);
+ struct sock *sk = tx_work->sk;
+ struct tls_context *tls_ctx = tls_get_ctx(sk);
+ struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
+
+ if (!test_and_clear_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask))
+ return;
+
+ lock_sock(sk);
+ tls_tx_records(sk, -1);
+ release_sock(sk);
+}
+
int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
{
- char keyval[TLS_CIPHER_AES_GCM_128_KEY_SIZE];
struct tls_crypto_info *crypto_info;
struct tls12_crypto_info_aes_gcm_128 *gcm_128_info;
struct tls_sw_context_tx *sw_ctx_tx = NULL;
@@ -1074,29 +1874,42 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
}
if (tx) {
- sw_ctx_tx = kzalloc(sizeof(*sw_ctx_tx), GFP_KERNEL);
- if (!sw_ctx_tx) {
- rc = -ENOMEM;
- goto out;
+ if (!ctx->priv_ctx_tx) {
+ sw_ctx_tx = kzalloc(sizeof(*sw_ctx_tx), GFP_KERNEL);
+ if (!sw_ctx_tx) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ ctx->priv_ctx_tx = sw_ctx_tx;
+ } else {
+ sw_ctx_tx =
+ (struct tls_sw_context_tx *)ctx->priv_ctx_tx;
}
- crypto_init_wait(&sw_ctx_tx->async_wait);
- ctx->priv_ctx_tx = sw_ctx_tx;
} else {
- sw_ctx_rx = kzalloc(sizeof(*sw_ctx_rx), GFP_KERNEL);
- if (!sw_ctx_rx) {
- rc = -ENOMEM;
- goto out;
+ if (!ctx->priv_ctx_rx) {
+ sw_ctx_rx = kzalloc(sizeof(*sw_ctx_rx), GFP_KERNEL);
+ if (!sw_ctx_rx) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ ctx->priv_ctx_rx = sw_ctx_rx;
+ } else {
+ sw_ctx_rx =
+ (struct tls_sw_context_rx *)ctx->priv_ctx_rx;
}
- crypto_init_wait(&sw_ctx_rx->async_wait);
- ctx->priv_ctx_rx = sw_ctx_rx;
}
if (tx) {
- crypto_info = &ctx->crypto_send;
+ crypto_init_wait(&sw_ctx_tx->async_wait);
+ crypto_info = &ctx->crypto_send.info;
cctx = &ctx->tx;
aead = &sw_ctx_tx->aead_send;
+ INIT_LIST_HEAD(&sw_ctx_tx->tx_list);
+ INIT_DELAYED_WORK(&sw_ctx_tx->tx_work.work, tx_work_handler);
+ sw_ctx_tx->tx_work.sk = sk;
} else {
- crypto_info = &ctx->crypto_recv;
+ crypto_init_wait(&sw_ctx_rx->async_wait);
+ crypto_info = &ctx->crypto_recv.info;
cctx = &ctx->rx;
aead = &sw_ctx_rx->aead_recv;
}
@@ -1120,7 +1933,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
}
/* Sanity-check the IV size for stack allocations. */
- if (iv_size > MAX_IV_SIZE) {
+ if (iv_size > MAX_IV_SIZE || nonce_size > MAX_IV_SIZE) {
rc = -EINVAL;
goto free_priv;
}
@@ -1138,32 +1951,11 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
memcpy(cctx->iv, gcm_128_info->salt, TLS_CIPHER_AES_GCM_128_SALT_SIZE);
memcpy(cctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv, iv_size);
cctx->rec_seq_size = rec_seq_size;
- cctx->rec_seq = kmalloc(rec_seq_size, GFP_KERNEL);
+ cctx->rec_seq = kmemdup(rec_seq, rec_seq_size, GFP_KERNEL);
if (!cctx->rec_seq) {
rc = -ENOMEM;
goto free_iv;
}
- memcpy(cctx->rec_seq, rec_seq, rec_seq_size);
-
- if (sw_ctx_tx) {
- sg_init_table(sw_ctx_tx->sg_encrypted_data,
- ARRAY_SIZE(sw_ctx_tx->sg_encrypted_data));
- sg_init_table(sw_ctx_tx->sg_plaintext_data,
- ARRAY_SIZE(sw_ctx_tx->sg_plaintext_data));
-
- sg_init_table(sw_ctx_tx->sg_aead_in, 2);
- sg_set_buf(&sw_ctx_tx->sg_aead_in[0], sw_ctx_tx->aad_space,
- sizeof(sw_ctx_tx->aad_space));
- sg_unmark_end(&sw_ctx_tx->sg_aead_in[1]);
- sg_chain(sw_ctx_tx->sg_aead_in, 2,
- sw_ctx_tx->sg_plaintext_data);
- sg_init_table(sw_ctx_tx->sg_aead_out, 2);
- sg_set_buf(&sw_ctx_tx->sg_aead_out[0], sw_ctx_tx->aad_space,
- sizeof(sw_ctx_tx->aad_space));
- sg_unmark_end(&sw_ctx_tx->sg_aead_out[1]);
- sg_chain(sw_ctx_tx->sg_aead_out, 2,
- sw_ctx_tx->sg_encrypted_data);
- }
if (!*aead) {
*aead = crypto_alloc_aead("gcm(aes)", 0, 0);
@@ -1176,9 +1968,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
ctx->push_pending_record = tls_sw_push_pending_record;
- memcpy(keyval, gcm_128_info->key, TLS_CIPHER_AES_GCM_128_KEY_SIZE);
-
- rc = crypto_aead_setkey(*aead, keyval,
+ rc = crypto_aead_setkey(*aead, gcm_128_info->key,
TLS_CIPHER_AES_GCM_128_KEY_SIZE);
if (rc)
goto free_aead;
@@ -1200,8 +1990,6 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
sk->sk_data_ready = tls_data_ready;
write_unlock_bh(&sk->sk_callback_lock);
- sw_ctx_rx->sk_poll = sk->sk_socket->ops->poll;
-
strp_check_rcv(&sw_ctx_rx->strp);
}
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index e5473c03d667..74d1eed7cbd4 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -225,6 +225,8 @@ static inline void unix_release_addr(struct unix_address *addr)
static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
{
+ *hashp = 0;
+
if (len <= sizeof(short) || len > sizeof(*sunaddr))
return -EINVAL;
if (!sunaddr || sunaddr->sun_family != AF_UNIX)
@@ -430,7 +432,12 @@ static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
connected = unix_dgram_peer_wake_connect(sk, other);
- if (unix_recvq_full(other))
+ /* If other is SOCK_DEAD, we want to make sure we signal
+ * POLLOUT, such that a subsequent write() can get a
+ * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
+ * to other and its full, we will hang waiting for POLLOUT.
+ */
+ if (unix_recvq_full(other) && !sock_flag(other, SOCK_DEAD))
return 1;
if (connected)
@@ -2635,7 +2642,7 @@ static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wa
struct sock *sk = sock->sk;
__poll_t mask;
- sock_poll_wait(file, sk_sleep(sk), wait);
+ sock_poll_wait(file, sock, wait);
mask = 0;
/* exceptional events? */
@@ -2672,7 +2679,7 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
unsigned int writable;
__poll_t mask;
- sock_poll_wait(file, sk_sleep(sk), wait);
+ sock_poll_wait(file, sock, wait);
mask = 0;
/* exceptional events? */
diff --git a/net/wimax/Makefile b/net/wimax/Makefile
index eb2db0d3b880..c2a71ae487ac 100644
--- a/net/wimax/Makefile
+++ b/net/wimax/Makefile
@@ -11,5 +11,3 @@ wimax-y := \
stack.o
wimax-$(CONFIG_DEBUG_FS) += debugfs.o
-
-
diff --git a/net/wimax/debugfs.c b/net/wimax/debugfs.c
index 6c9bedb7431e..24514840746e 100644
--- a/net/wimax/debugfs.c
+++ b/net/wimax/debugfs.c
@@ -76,5 +76,3 @@ void wimax_debugfs_rm(struct wimax_dev *wimax_dev)
{
debugfs_remove_recursive(wimax_dev->debugfs_dentry);
}
-
-
diff --git a/net/wimax/op-msg.c b/net/wimax/op-msg.c
index 54aa146930bd..101b2fa3f32e 100644
--- a/net/wimax/op-msg.c
+++ b/net/wimax/op-msg.c
@@ -404,4 +404,3 @@ error_no_wimax_dev:
d_fnend(3, NULL, "(skb %p info %p) = %d\n", skb, info, result);
return result;
}
-
diff --git a/net/wimax/stack.c b/net/wimax/stack.c
index 5db731512014..a6307813b6d5 100644
--- a/net/wimax/stack.c
+++ b/net/wimax/stack.c
@@ -486,7 +486,8 @@ int wimax_dev_add(struct wimax_dev *wimax_dev, struct net_device *net_dev)
d_fnstart(3, dev, "(wimax_dev %p net_dev %p)\n", wimax_dev, net_dev);
/* Do the RFKILL setup before locking, as RFKILL will call
- * into our functions. */
+ * into our functions.
+ */
wimax_dev->net_dev = net_dev;
result = wimax_rfkill_add(wimax_dev);
if (result < 0)
@@ -629,4 +630,3 @@ module_exit(wimax_subsys_exit);
MODULE_AUTHOR("Intel Corporation <linux-wimax@intel.com>");
MODULE_DESCRIPTION("Linux WiMAX stack");
MODULE_LICENSE("GPL");
-
diff --git a/net/wireless/core.c b/net/wireless/core.c
index 48e8097339ab..5bd01058b9e6 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -3,7 +3,7 @@
*
* Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
- * Copyright 2015 Intel Deutschland GmbH
+ * Copyright 2015-2017 Intel Deutschland GmbH
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -744,6 +744,8 @@ int wiphy_register(struct wiphy *wiphy)
/* sanity check supported bands/channels */
for (band = 0; band < NUM_NL80211_BANDS; band++) {
+ u16 types = 0;
+
sband = wiphy->bands[band];
if (!sband)
continue;
@@ -788,6 +790,23 @@ int wiphy_register(struct wiphy *wiphy)
sband->channels[i].band = band;
}
+ for (i = 0; i < sband->n_iftype_data; i++) {
+ const struct ieee80211_sband_iftype_data *iftd;
+
+ iftd = &sband->iftype_data[i];
+
+ if (WARN_ON(!iftd->types_mask))
+ return -EINVAL;
+ if (WARN_ON(types & iftd->types_mask))
+ return -EINVAL;
+
+ /* at least one piece of information must be present */
+ if (WARN_ON(!iftd->he_cap.has_he))
+ return -EINVAL;
+
+ types |= iftd->types_mask;
+ }
+
have_band = true;
}
@@ -1000,36 +1019,49 @@ void cfg80211_cqm_config_free(struct wireless_dev *wdev)
wdev->cqm_config = NULL;
}
-void cfg80211_unregister_wdev(struct wireless_dev *wdev)
+static void __cfg80211_unregister_wdev(struct wireless_dev *wdev, bool sync)
{
struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
ASSERT_RTNL();
- if (WARN_ON(wdev->netdev))
- return;
-
nl80211_notify_iface(rdev, wdev, NL80211_CMD_DEL_INTERFACE);
list_del_rcu(&wdev->list);
- synchronize_rcu();
+ if (sync)
+ synchronize_rcu();
rdev->devlist_generation++;
+ cfg80211_mlme_purge_registrations(wdev);
+
switch (wdev->iftype) {
case NL80211_IFTYPE_P2P_DEVICE:
- cfg80211_mlme_purge_registrations(wdev);
cfg80211_stop_p2p_device(rdev, wdev);
break;
case NL80211_IFTYPE_NAN:
cfg80211_stop_nan(rdev, wdev);
break;
default:
- WARN_ON_ONCE(1);
break;
}
+#ifdef CONFIG_CFG80211_WEXT
+ kzfree(wdev->wext.keys);
+#endif
+ /* only initialized if we have a netdev */
+ if (wdev->netdev)
+ flush_work(&wdev->disconnect_wk);
+
cfg80211_cqm_config_free(wdev);
}
+
+void cfg80211_unregister_wdev(struct wireless_dev *wdev)
+{
+ if (WARN_ON(wdev->netdev))
+ return;
+
+ __cfg80211_unregister_wdev(wdev, true);
+}
EXPORT_SYMBOL(cfg80211_unregister_wdev);
static const struct device_type wiphy_type = {
@@ -1134,6 +1166,30 @@ void cfg80211_stop_iface(struct wiphy *wiphy, struct wireless_dev *wdev,
}
EXPORT_SYMBOL(cfg80211_stop_iface);
+void cfg80211_init_wdev(struct cfg80211_registered_device *rdev,
+ struct wireless_dev *wdev)
+{
+ mutex_init(&wdev->mtx);
+ INIT_LIST_HEAD(&wdev->event_list);
+ spin_lock_init(&wdev->event_lock);
+ INIT_LIST_HEAD(&wdev->mgmt_registrations);
+ spin_lock_init(&wdev->mgmt_registrations_lock);
+
+ /*
+ * We get here also when the interface changes network namespaces,
+ * as it's registered into the new one, but we don't want it to
+ * change ID in that case. Checking if the ID is already assigned
+ * works, because 0 isn't considered a valid ID and the memory is
+ * 0-initialized.
+ */
+ if (!wdev->identifier)
+ wdev->identifier = ++rdev->wdev_id;
+ list_add_rcu(&wdev->list, &rdev->wiphy.wdev_list);
+ rdev->devlist_generation++;
+
+ nl80211_notify_iface(rdev, wdev, NL80211_CMD_NEW_INTERFACE);
+}
+
static int cfg80211_netdev_notifier_call(struct notifier_block *nb,
unsigned long state, void *ptr)
{
@@ -1159,23 +1215,6 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb,
* called within code protected by it when interfaces
* are added with nl80211.
*/
- mutex_init(&wdev->mtx);
- INIT_LIST_HEAD(&wdev->event_list);
- spin_lock_init(&wdev->event_lock);
- INIT_LIST_HEAD(&wdev->mgmt_registrations);
- spin_lock_init(&wdev->mgmt_registrations_lock);
-
- /*
- * We get here also when the interface changes network namespaces,
- * as it's registered into the new one, but we don't want it to
- * change ID in that case. Checking if the ID is already assigned
- * works, because 0 isn't considered a valid ID and the memory is
- * 0-initialized.
- */
- if (!wdev->identifier)
- wdev->identifier = ++rdev->wdev_id;
- list_add_rcu(&wdev->list, &rdev->wiphy.wdev_list);
- rdev->devlist_generation++;
/* can only change netns with wiphy */
dev->features |= NETIF_F_NETNS_LOCAL;
@@ -1204,7 +1243,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb,
INIT_WORK(&wdev->disconnect_wk, cfg80211_autodisconnect_wk);
- nl80211_notify_iface(rdev, wdev, NL80211_CMD_NEW_INTERFACE);
+ cfg80211_init_wdev(rdev, wdev);
break;
case NETDEV_GOING_DOWN:
cfg80211_leave(rdev, wdev);
@@ -1219,7 +1258,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb,
list_for_each_entry_safe(pos, tmp,
&rdev->sched_scan_req_list, list) {
- if (WARN_ON(pos && pos->dev == wdev->netdev))
+ if (WARN_ON(pos->dev == wdev->netdev))
cfg80211_stop_sched_scan_req(rdev, pos, false);
}
@@ -1283,17 +1322,8 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb,
* remove and clean it up.
*/
if (!list_empty(&wdev->list)) {
- nl80211_notify_iface(rdev, wdev,
- NL80211_CMD_DEL_INTERFACE);
+ __cfg80211_unregister_wdev(wdev, false);
sysfs_remove_link(&dev->dev.kobj, "phy80211");
- list_del_rcu(&wdev->list);
- rdev->devlist_generation++;
- cfg80211_mlme_purge_registrations(wdev);
-#ifdef CONFIG_CFG80211_WEXT
- kzfree(wdev->wext.keys);
-#endif
- flush_work(&wdev->disconnect_wk);
- cfg80211_cqm_config_free(wdev);
}
/*
* synchronise (so that we won't find this netdev
diff --git a/net/wireless/core.h b/net/wireless/core.h
index 63eb1b5fdd04..c61dbba8bf47 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -66,6 +66,7 @@ struct cfg80211_registered_device {
/* protected by RTNL only */
int num_running_ifaces;
int num_running_monitor_ifaces;
+ u64 cookie_counter;
/* BSSes/scanning */
spinlock_t bss_lock;
@@ -76,7 +77,7 @@ struct cfg80211_registered_device {
struct cfg80211_scan_request *scan_req; /* protected by RTNL */
struct sk_buff *scan_msg;
struct list_head sched_scan_req_list;
- unsigned long suspend_at;
+ time64_t suspend_at;
struct work_struct scan_done_wk;
struct genl_info *cur_cmd_info;
@@ -133,6 +134,16 @@ cfg80211_rdev_free_wowlan(struct cfg80211_registered_device *rdev)
#endif
}
+static inline u64 cfg80211_assign_cookie(struct cfg80211_registered_device *rdev)
+{
+ u64 r = ++rdev->cookie_counter;
+
+ if (WARN_ON(r == 0))
+ r = ++rdev->cookie_counter;
+
+ return r;
+}
+
extern struct workqueue_struct *cfg80211_wq;
extern struct list_head cfg80211_rdev_list;
extern int cfg80211_rdev_list_generation;
@@ -187,6 +198,9 @@ struct wiphy *wiphy_idx_to_wiphy(int wiphy_idx);
int cfg80211_switch_netns(struct cfg80211_registered_device *rdev,
struct net *net);
+void cfg80211_init_wdev(struct cfg80211_registered_device *rdev,
+ struct wireless_dev *wdev);
+
static inline void wdev_lock(struct wireless_dev *wdev)
__acquires(wdev)
{
diff --git a/net/wireless/lib80211_crypt_tkip.c b/net/wireless/lib80211_crypt_tkip.c
index ba0a1f398ce5..b5e235573c8a 100644
--- a/net/wireless/lib80211_crypt_tkip.c
+++ b/net/wireless/lib80211_crypt_tkip.c
@@ -30,7 +30,7 @@
#include <net/iw_handler.h>
#include <crypto/hash.h>
-#include <crypto/skcipher.h>
+#include <linux/crypto.h>
#include <linux/crc32.h>
#include <net/lib80211.h>
@@ -64,10 +64,10 @@ struct lib80211_tkip_data {
int key_idx;
- struct crypto_skcipher *rx_tfm_arc4;
- struct crypto_ahash *rx_tfm_michael;
- struct crypto_skcipher *tx_tfm_arc4;
- struct crypto_ahash *tx_tfm_michael;
+ struct crypto_cipher *rx_tfm_arc4;
+ struct crypto_shash *rx_tfm_michael;
+ struct crypto_cipher *tx_tfm_arc4;
+ struct crypto_shash *tx_tfm_michael;
/* scratch buffers for virt_to_page() (crypto API) */
u8 rx_hdr[16], tx_hdr[16];
@@ -99,29 +99,25 @@ static void *lib80211_tkip_init(int key_idx)
priv->key_idx = key_idx;
- priv->tx_tfm_arc4 = crypto_alloc_skcipher("ecb(arc4)", 0,
- CRYPTO_ALG_ASYNC);
+ priv->tx_tfm_arc4 = crypto_alloc_cipher("arc4", 0, CRYPTO_ALG_ASYNC);
if (IS_ERR(priv->tx_tfm_arc4)) {
priv->tx_tfm_arc4 = NULL;
goto fail;
}
- priv->tx_tfm_michael = crypto_alloc_ahash("michael_mic", 0,
- CRYPTO_ALG_ASYNC);
+ priv->tx_tfm_michael = crypto_alloc_shash("michael_mic", 0, 0);
if (IS_ERR(priv->tx_tfm_michael)) {
priv->tx_tfm_michael = NULL;
goto fail;
}
- priv->rx_tfm_arc4 = crypto_alloc_skcipher("ecb(arc4)", 0,
- CRYPTO_ALG_ASYNC);
+ priv->rx_tfm_arc4 = crypto_alloc_cipher("arc4", 0, CRYPTO_ALG_ASYNC);
if (IS_ERR(priv->rx_tfm_arc4)) {
priv->rx_tfm_arc4 = NULL;
goto fail;
}
- priv->rx_tfm_michael = crypto_alloc_ahash("michael_mic", 0,
- CRYPTO_ALG_ASYNC);
+ priv->rx_tfm_michael = crypto_alloc_shash("michael_mic", 0, 0);
if (IS_ERR(priv->rx_tfm_michael)) {
priv->rx_tfm_michael = NULL;
goto fail;
@@ -131,10 +127,10 @@ static void *lib80211_tkip_init(int key_idx)
fail:
if (priv) {
- crypto_free_ahash(priv->tx_tfm_michael);
- crypto_free_skcipher(priv->tx_tfm_arc4);
- crypto_free_ahash(priv->rx_tfm_michael);
- crypto_free_skcipher(priv->rx_tfm_arc4);
+ crypto_free_shash(priv->tx_tfm_michael);
+ crypto_free_cipher(priv->tx_tfm_arc4);
+ crypto_free_shash(priv->rx_tfm_michael);
+ crypto_free_cipher(priv->rx_tfm_arc4);
kfree(priv);
}
@@ -145,10 +141,10 @@ static void lib80211_tkip_deinit(void *priv)
{
struct lib80211_tkip_data *_priv = priv;
if (_priv) {
- crypto_free_ahash(_priv->tx_tfm_michael);
- crypto_free_skcipher(_priv->tx_tfm_arc4);
- crypto_free_ahash(_priv->rx_tfm_michael);
- crypto_free_skcipher(_priv->rx_tfm_arc4);
+ crypto_free_shash(_priv->tx_tfm_michael);
+ crypto_free_cipher(_priv->tx_tfm_arc4);
+ crypto_free_shash(_priv->rx_tfm_michael);
+ crypto_free_cipher(_priv->rx_tfm_arc4);
}
kfree(priv);
}
@@ -346,12 +342,10 @@ static int lib80211_tkip_hdr(struct sk_buff *skb, int hdr_len,
static int lib80211_tkip_encrypt(struct sk_buff *skb, int hdr_len, void *priv)
{
struct lib80211_tkip_data *tkey = priv;
- SKCIPHER_REQUEST_ON_STACK(req, tkey->tx_tfm_arc4);
int len;
u8 rc4key[16], *pos, *icv;
u32 crc;
- struct scatterlist sg;
- int err;
+ int i;
if (tkey->flags & IEEE80211_CRYPTO_TKIP_COUNTERMEASURES) {
struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
@@ -376,14 +370,10 @@ static int lib80211_tkip_encrypt(struct sk_buff *skb, int hdr_len, void *priv)
icv[2] = crc >> 16;
icv[3] = crc >> 24;
- crypto_skcipher_setkey(tkey->tx_tfm_arc4, rc4key, 16);
- sg_init_one(&sg, pos, len + 4);
- skcipher_request_set_tfm(req, tkey->tx_tfm_arc4);
- skcipher_request_set_callback(req, 0, NULL, NULL);
- skcipher_request_set_crypt(req, &sg, &sg, len + 4, NULL);
- err = crypto_skcipher_encrypt(req);
- skcipher_request_zero(req);
- return err;
+ crypto_cipher_setkey(tkey->tx_tfm_arc4, rc4key, 16);
+ for (i = 0; i < len + 4; i++)
+ crypto_cipher_encrypt_one(tkey->tx_tfm_arc4, pos + i, pos + i);
+ return 0;
}
/*
@@ -402,7 +392,6 @@ static inline int tkip_replay_check(u32 iv32_n, u16 iv16_n,
static int lib80211_tkip_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
{
struct lib80211_tkip_data *tkey = priv;
- SKCIPHER_REQUEST_ON_STACK(req, tkey->rx_tfm_arc4);
u8 rc4key[16];
u8 keyidx, *pos;
u32 iv32;
@@ -410,9 +399,8 @@ static int lib80211_tkip_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
struct ieee80211_hdr *hdr;
u8 icv[4];
u32 crc;
- struct scatterlist sg;
int plen;
- int err;
+ int i;
hdr = (struct ieee80211_hdr *)skb->data;
@@ -465,18 +453,9 @@ static int lib80211_tkip_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
plen = skb->len - hdr_len - 12;
- crypto_skcipher_setkey(tkey->rx_tfm_arc4, rc4key, 16);
- sg_init_one(&sg, pos, plen + 4);
- skcipher_request_set_tfm(req, tkey->rx_tfm_arc4);
- skcipher_request_set_callback(req, 0, NULL, NULL);
- skcipher_request_set_crypt(req, &sg, &sg, plen + 4, NULL);
- err = crypto_skcipher_decrypt(req);
- skcipher_request_zero(req);
- if (err) {
- net_dbg_ratelimited("TKIP: failed to decrypt received packet from %pM\n",
- hdr->addr2);
- return -7;
- }
+ crypto_cipher_setkey(tkey->rx_tfm_arc4, rc4key, 16);
+ for (i = 0; i < plen + 4; i++)
+ crypto_cipher_decrypt_one(tkey->rx_tfm_arc4, pos + i, pos + i);
crc = ~crc32_le(~0, pos, plen);
icv[0] = crc;
@@ -510,29 +489,36 @@ static int lib80211_tkip_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
return keyidx;
}
-static int michael_mic(struct crypto_ahash *tfm_michael, u8 * key, u8 * hdr,
- u8 * data, size_t data_len, u8 * mic)
+static int michael_mic(struct crypto_shash *tfm_michael, u8 *key, u8 *hdr,
+ u8 *data, size_t data_len, u8 *mic)
{
- AHASH_REQUEST_ON_STACK(req, tfm_michael);
- struct scatterlist sg[2];
+ SHASH_DESC_ON_STACK(desc, tfm_michael);
int err;
if (tfm_michael == NULL) {
pr_warn("%s(): tfm_michael == NULL\n", __func__);
return -1;
}
- sg_init_table(sg, 2);
- sg_set_buf(&sg[0], hdr, 16);
- sg_set_buf(&sg[1], data, data_len);
- if (crypto_ahash_setkey(tfm_michael, key, 8))
+ desc->tfm = tfm_michael;
+ desc->flags = 0;
+
+ if (crypto_shash_setkey(tfm_michael, key, 8))
return -1;
- ahash_request_set_tfm(req, tfm_michael);
- ahash_request_set_callback(req, 0, NULL, NULL);
- ahash_request_set_crypt(req, sg, mic, data_len + 16);
- err = crypto_ahash_digest(req);
- ahash_request_zero(req);
+ err = crypto_shash_init(desc);
+ if (err)
+ goto out;
+ err = crypto_shash_update(desc, hdr, 16);
+ if (err)
+ goto out;
+ err = crypto_shash_update(desc, data, data_len);
+ if (err)
+ goto out;
+ err = crypto_shash_final(desc, mic);
+
+out:
+ shash_desc_zero(desc);
return err;
}
@@ -654,10 +640,10 @@ static int lib80211_tkip_set_key(void *key, int len, u8 * seq, void *priv)
{
struct lib80211_tkip_data *tkey = priv;
int keyidx;
- struct crypto_ahash *tfm = tkey->tx_tfm_michael;
- struct crypto_skcipher *tfm2 = tkey->tx_tfm_arc4;
- struct crypto_ahash *tfm3 = tkey->rx_tfm_michael;
- struct crypto_skcipher *tfm4 = tkey->rx_tfm_arc4;
+ struct crypto_shash *tfm = tkey->tx_tfm_michael;
+ struct crypto_cipher *tfm2 = tkey->tx_tfm_arc4;
+ struct crypto_shash *tfm3 = tkey->rx_tfm_michael;
+ struct crypto_cipher *tfm4 = tkey->rx_tfm_arc4;
keyidx = tkey->key_idx;
memset(tkey, 0, sizeof(*tkey));
diff --git a/net/wireless/lib80211_crypt_wep.c b/net/wireless/lib80211_crypt_wep.c
index d05f58b0fd04..6015f6b542a6 100644
--- a/net/wireless/lib80211_crypt_wep.c
+++ b/net/wireless/lib80211_crypt_wep.c
@@ -22,7 +22,7 @@
#include <net/lib80211.h>
-#include <crypto/skcipher.h>
+#include <linux/crypto.h>
#include <linux/crc32.h>
MODULE_AUTHOR("Jouni Malinen");
@@ -35,8 +35,8 @@ struct lib80211_wep_data {
u8 key[WEP_KEY_LEN + 1];
u8 key_len;
u8 key_idx;
- struct crypto_skcipher *tx_tfm;
- struct crypto_skcipher *rx_tfm;
+ struct crypto_cipher *tx_tfm;
+ struct crypto_cipher *rx_tfm;
};
static void *lib80211_wep_init(int keyidx)
@@ -48,13 +48,13 @@ static void *lib80211_wep_init(int keyidx)
goto fail;
priv->key_idx = keyidx;
- priv->tx_tfm = crypto_alloc_skcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC);
+ priv->tx_tfm = crypto_alloc_cipher("arc4", 0, CRYPTO_ALG_ASYNC);
if (IS_ERR(priv->tx_tfm)) {
priv->tx_tfm = NULL;
goto fail;
}
- priv->rx_tfm = crypto_alloc_skcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC);
+ priv->rx_tfm = crypto_alloc_cipher("arc4", 0, CRYPTO_ALG_ASYNC);
if (IS_ERR(priv->rx_tfm)) {
priv->rx_tfm = NULL;
goto fail;
@@ -66,8 +66,8 @@ static void *lib80211_wep_init(int keyidx)
fail:
if (priv) {
- crypto_free_skcipher(priv->tx_tfm);
- crypto_free_skcipher(priv->rx_tfm);
+ crypto_free_cipher(priv->tx_tfm);
+ crypto_free_cipher(priv->rx_tfm);
kfree(priv);
}
return NULL;
@@ -77,8 +77,8 @@ static void lib80211_wep_deinit(void *priv)
{
struct lib80211_wep_data *_priv = priv;
if (_priv) {
- crypto_free_skcipher(_priv->tx_tfm);
- crypto_free_skcipher(_priv->rx_tfm);
+ crypto_free_cipher(_priv->tx_tfm);
+ crypto_free_cipher(_priv->rx_tfm);
}
kfree(priv);
}
@@ -129,12 +129,10 @@ static int lib80211_wep_build_iv(struct sk_buff *skb, int hdr_len,
static int lib80211_wep_encrypt(struct sk_buff *skb, int hdr_len, void *priv)
{
struct lib80211_wep_data *wep = priv;
- SKCIPHER_REQUEST_ON_STACK(req, wep->tx_tfm);
u32 crc, klen, len;
u8 *pos, *icv;
- struct scatterlist sg;
u8 key[WEP_KEY_LEN + 3];
- int err;
+ int i;
/* other checks are in lib80211_wep_build_iv */
if (skb_tailroom(skb) < 4)
@@ -162,14 +160,12 @@ static int lib80211_wep_encrypt(struct sk_buff *skb, int hdr_len, void *priv)
icv[2] = crc >> 16;
icv[3] = crc >> 24;
- crypto_skcipher_setkey(wep->tx_tfm, key, klen);
- sg_init_one(&sg, pos, len + 4);
- skcipher_request_set_tfm(req, wep->tx_tfm);
- skcipher_request_set_callback(req, 0, NULL, NULL);
- skcipher_request_set_crypt(req, &sg, &sg, len + 4, NULL);
- err = crypto_skcipher_encrypt(req);
- skcipher_request_zero(req);
- return err;
+ crypto_cipher_setkey(wep->tx_tfm, key, klen);
+
+ for (i = 0; i < len + 4; i++)
+ crypto_cipher_encrypt_one(wep->tx_tfm, pos + i, pos + i);
+
+ return 0;
}
/* Perform WEP decryption on given buffer. Buffer includes whole WEP part of
@@ -182,12 +178,10 @@ static int lib80211_wep_encrypt(struct sk_buff *skb, int hdr_len, void *priv)
static int lib80211_wep_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
{
struct lib80211_wep_data *wep = priv;
- SKCIPHER_REQUEST_ON_STACK(req, wep->rx_tfm);
u32 crc, klen, plen;
u8 key[WEP_KEY_LEN + 3];
u8 keyidx, *pos, icv[4];
- struct scatterlist sg;
- int err;
+ int i;
if (skb->len < hdr_len + 8)
return -1;
@@ -208,15 +202,9 @@ static int lib80211_wep_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
/* Apply RC4 to data and compute CRC32 over decrypted data */
plen = skb->len - hdr_len - 8;
- crypto_skcipher_setkey(wep->rx_tfm, key, klen);
- sg_init_one(&sg, pos, plen + 4);
- skcipher_request_set_tfm(req, wep->rx_tfm);
- skcipher_request_set_callback(req, 0, NULL, NULL);
- skcipher_request_set_crypt(req, &sg, &sg, plen + 4, NULL);
- err = crypto_skcipher_decrypt(req);
- skcipher_request_zero(req);
- if (err)
- return -7;
+ crypto_cipher_setkey(wep->rx_tfm, key, klen);
+ for (i = 0; i < plen + 4; i++)
+ crypto_cipher_decrypt_one(wep->rx_tfm, pos + i, pos + i);
crc = ~crc32_le(~0, pos, plen);
icv[0] = crc;
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 80bc986c79e5..744b5851bbf9 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -200,7 +200,46 @@ cfg80211_get_dev_from_info(struct net *netns, struct genl_info *info)
return __cfg80211_rdev_from_attrs(netns, info->attrs);
}
+static int validate_ie_attr(const struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ const u8 *pos;
+ int len;
+
+ pos = nla_data(attr);
+ len = nla_len(attr);
+
+ while (len) {
+ u8 elemlen;
+
+ if (len < 2)
+ goto error;
+ len -= 2;
+
+ elemlen = pos[1];
+ if (elemlen > len)
+ goto error;
+
+ len -= elemlen;
+ pos += 2 + elemlen;
+ }
+
+ return 0;
+error:
+ NL_SET_ERR_MSG_ATTR(extack, attr, "malformed information elements");
+ return -EINVAL;
+}
+
/* policy for the attributes */
+static const struct nla_policy
+nl80211_ftm_responder_policy[NL80211_FTM_RESP_ATTR_MAX + 1] = {
+ [NL80211_FTM_RESP_ATTR_ENABLED] = { .type = NLA_FLAG, },
+ [NL80211_FTM_RESP_ATTR_LCI] = { .type = NLA_BINARY,
+ .len = U8_MAX },
+ [NL80211_FTM_RESP_ATTR_CIVICLOC] = { .type = NLA_BINARY,
+ .len = U8_MAX },
+};
+
static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
[NL80211_ATTR_WIPHY] = { .type = NLA_U32 },
[NL80211_ATTR_WIPHY_NAME] = { .type = NLA_NUL_STRING,
@@ -213,14 +252,14 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
[NL80211_ATTR_CENTER_FREQ1] = { .type = NLA_U32 },
[NL80211_ATTR_CENTER_FREQ2] = { .type = NLA_U32 },
- [NL80211_ATTR_WIPHY_RETRY_SHORT] = { .type = NLA_U8 },
- [NL80211_ATTR_WIPHY_RETRY_LONG] = { .type = NLA_U8 },
+ [NL80211_ATTR_WIPHY_RETRY_SHORT] = NLA_POLICY_MIN(NLA_U8, 1),
+ [NL80211_ATTR_WIPHY_RETRY_LONG] = NLA_POLICY_MIN(NLA_U8, 1),
[NL80211_ATTR_WIPHY_FRAG_THRESHOLD] = { .type = NLA_U32 },
[NL80211_ATTR_WIPHY_RTS_THRESHOLD] = { .type = NLA_U32 },
[NL80211_ATTR_WIPHY_COVERAGE_CLASS] = { .type = NLA_U8 },
[NL80211_ATTR_WIPHY_DYN_ACK] = { .type = NLA_FLAG },
- [NL80211_ATTR_IFTYPE] = { .type = NLA_U32 },
+ [NL80211_ATTR_IFTYPE] = NLA_POLICY_MAX(NLA_U32, NL80211_IFTYPE_MAX),
[NL80211_ATTR_IFINDEX] = { .type = NLA_U32 },
[NL80211_ATTR_IFNAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ-1 },
@@ -230,24 +269,28 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
[NL80211_ATTR_KEY] = { .type = NLA_NESTED, },
[NL80211_ATTR_KEY_DATA] = { .type = NLA_BINARY,
.len = WLAN_MAX_KEY_LEN },
- [NL80211_ATTR_KEY_IDX] = { .type = NLA_U8 },
+ [NL80211_ATTR_KEY_IDX] = NLA_POLICY_MAX(NLA_U8, 5),
[NL80211_ATTR_KEY_CIPHER] = { .type = NLA_U32 },
[NL80211_ATTR_KEY_DEFAULT] = { .type = NLA_FLAG },
[NL80211_ATTR_KEY_SEQ] = { .type = NLA_BINARY, .len = 16 },
- [NL80211_ATTR_KEY_TYPE] = { .type = NLA_U32 },
+ [NL80211_ATTR_KEY_TYPE] =
+ NLA_POLICY_MAX(NLA_U32, NUM_NL80211_KEYTYPES),
[NL80211_ATTR_BEACON_INTERVAL] = { .type = NLA_U32 },
[NL80211_ATTR_DTIM_PERIOD] = { .type = NLA_U32 },
[NL80211_ATTR_BEACON_HEAD] = { .type = NLA_BINARY,
.len = IEEE80211_MAX_DATA_LEN },
- [NL80211_ATTR_BEACON_TAIL] = { .type = NLA_BINARY,
- .len = IEEE80211_MAX_DATA_LEN },
- [NL80211_ATTR_STA_AID] = { .type = NLA_U16 },
+ [NL80211_ATTR_BEACON_TAIL] =
+ NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_ie_attr,
+ IEEE80211_MAX_DATA_LEN),
+ [NL80211_ATTR_STA_AID] =
+ NLA_POLICY_RANGE(NLA_U16, 1, IEEE80211_MAX_AID),
[NL80211_ATTR_STA_FLAGS] = { .type = NLA_NESTED },
[NL80211_ATTR_STA_LISTEN_INTERVAL] = { .type = NLA_U16 },
[NL80211_ATTR_STA_SUPPORTED_RATES] = { .type = NLA_BINARY,
.len = NL80211_MAX_SUPP_RATES },
- [NL80211_ATTR_STA_PLINK_ACTION] = { .type = NLA_U8 },
+ [NL80211_ATTR_STA_PLINK_ACTION] =
+ NLA_POLICY_MAX(NLA_U8, NUM_NL80211_PLINK_ACTIONS - 1),
[NL80211_ATTR_STA_VLAN] = { .type = NLA_U32 },
[NL80211_ATTR_MNTR_FLAGS] = { /* NLA_NESTED can't be empty */ },
[NL80211_ATTR_MESH_ID] = { .type = NLA_BINARY,
@@ -270,8 +313,9 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
[NL80211_ATTR_HT_CAPABILITY] = { .len = NL80211_HT_CAPABILITY_LEN },
[NL80211_ATTR_MGMT_SUBTYPE] = { .type = NLA_U8 },
- [NL80211_ATTR_IE] = { .type = NLA_BINARY,
- .len = IEEE80211_MAX_DATA_LEN },
+ [NL80211_ATTR_IE] = NLA_POLICY_VALIDATE_FN(NLA_BINARY,
+ validate_ie_attr,
+ IEEE80211_MAX_DATA_LEN),
[NL80211_ATTR_SCAN_FREQUENCIES] = { .type = NLA_NESTED },
[NL80211_ATTR_SCAN_SSIDS] = { .type = NLA_NESTED },
@@ -281,7 +325,9 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
[NL80211_ATTR_REASON_CODE] = { .type = NLA_U16 },
[NL80211_ATTR_FREQ_FIXED] = { .type = NLA_FLAG },
[NL80211_ATTR_TIMED_OUT] = { .type = NLA_FLAG },
- [NL80211_ATTR_USE_MFP] = { .type = NLA_U32 },
+ [NL80211_ATTR_USE_MFP] = NLA_POLICY_RANGE(NLA_U32,
+ NL80211_MFP_NO,
+ NL80211_MFP_OPTIONAL),
[NL80211_ATTR_STA_FLAGS2] = {
.len = sizeof(struct nl80211_sta_flag_update),
},
@@ -301,7 +347,9 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
[NL80211_ATTR_FRAME] = { .type = NLA_BINARY,
.len = IEEE80211_MAX_DATA_LEN },
[NL80211_ATTR_FRAME_MATCH] = { .type = NLA_BINARY, },
- [NL80211_ATTR_PS_STATE] = { .type = NLA_U32 },
+ [NL80211_ATTR_PS_STATE] = NLA_POLICY_RANGE(NLA_U32,
+ NL80211_PS_DISABLED,
+ NL80211_PS_ENABLED),
[NL80211_ATTR_CQM] = { .type = NLA_NESTED, },
[NL80211_ATTR_LOCAL_STATE_CHANGE] = { .type = NLA_FLAG },
[NL80211_ATTR_AP_ISOLATE] = { .type = NLA_U8 },
@@ -314,15 +362,23 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
[NL80211_ATTR_OFFCHANNEL_TX_OK] = { .type = NLA_FLAG },
[NL80211_ATTR_KEY_DEFAULT_TYPES] = { .type = NLA_NESTED },
[NL80211_ATTR_WOWLAN_TRIGGERS] = { .type = NLA_NESTED },
- [NL80211_ATTR_STA_PLINK_STATE] = { .type = NLA_U8 },
+ [NL80211_ATTR_STA_PLINK_STATE] =
+ NLA_POLICY_MAX(NLA_U8, NUM_NL80211_PLINK_STATES - 1),
+ [NL80211_ATTR_MESH_PEER_AID] =
+ NLA_POLICY_RANGE(NLA_U16, 1, IEEE80211_MAX_AID),
[NL80211_ATTR_SCHED_SCAN_INTERVAL] = { .type = NLA_U32 },
[NL80211_ATTR_REKEY_DATA] = { .type = NLA_NESTED },
[NL80211_ATTR_SCAN_SUPP_RATES] = { .type = NLA_NESTED },
- [NL80211_ATTR_HIDDEN_SSID] = { .type = NLA_U32 },
- [NL80211_ATTR_IE_PROBE_RESP] = { .type = NLA_BINARY,
- .len = IEEE80211_MAX_DATA_LEN },
- [NL80211_ATTR_IE_ASSOC_RESP] = { .type = NLA_BINARY,
- .len = IEEE80211_MAX_DATA_LEN },
+ [NL80211_ATTR_HIDDEN_SSID] =
+ NLA_POLICY_RANGE(NLA_U32,
+ NL80211_HIDDEN_SSID_NOT_IN_USE,
+ NL80211_HIDDEN_SSID_ZERO_CONTENTS),
+ [NL80211_ATTR_IE_PROBE_RESP] =
+ NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_ie_attr,
+ IEEE80211_MAX_DATA_LEN),
+ [NL80211_ATTR_IE_ASSOC_RESP] =
+ NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_ie_attr,
+ IEEE80211_MAX_DATA_LEN),
[NL80211_ATTR_ROAM_SUPPORT] = { .type = NLA_FLAG },
[NL80211_ATTR_SCHED_SCAN_MATCH] = { .type = NLA_NESTED },
[NL80211_ATTR_TX_NO_CCK_RATE] = { .type = NLA_FLAG },
@@ -348,9 +404,12 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
[NL80211_ATTR_AUTH_DATA] = { .type = NLA_BINARY, },
[NL80211_ATTR_VHT_CAPABILITY] = { .len = NL80211_VHT_CAPABILITY_LEN },
[NL80211_ATTR_SCAN_FLAGS] = { .type = NLA_U32 },
- [NL80211_ATTR_P2P_CTWINDOW] = { .type = NLA_U8 },
- [NL80211_ATTR_P2P_OPPPS] = { .type = NLA_U8 },
- [NL80211_ATTR_LOCAL_MESH_POWER_MODE] = {. type = NLA_U32 },
+ [NL80211_ATTR_P2P_CTWINDOW] = NLA_POLICY_MAX(NLA_U8, 127),
+ [NL80211_ATTR_P2P_OPPPS] = NLA_POLICY_MAX(NLA_U8, 1),
+ [NL80211_ATTR_LOCAL_MESH_POWER_MODE] =
+ NLA_POLICY_RANGE(NLA_U32,
+ NL80211_MESH_POWER_UNKNOWN + 1,
+ NL80211_MESH_POWER_MAX),
[NL80211_ATTR_ACL_POLICY] = {. type = NLA_U32 },
[NL80211_ATTR_MAC_ADDRS] = { .type = NLA_NESTED },
[NL80211_ATTR_STA_CAPABILITY] = { .type = NLA_U16 },
@@ -363,7 +422,8 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
[NL80211_ATTR_MDID] = { .type = NLA_U16 },
[NL80211_ATTR_IE_RIC] = { .type = NLA_BINARY,
.len = IEEE80211_MAX_DATA_LEN },
- [NL80211_ATTR_PEER_AID] = { .type = NLA_U16 },
+ [NL80211_ATTR_PEER_AID] =
+ NLA_POLICY_RANGE(NLA_U16, 1, IEEE80211_MAX_AID),
[NL80211_ATTR_CH_SWITCH_COUNT] = { .type = NLA_U32 },
[NL80211_ATTR_CH_SWITCH_BLOCK_TX] = { .type = NLA_FLAG },
[NL80211_ATTR_CSA_IES] = { .type = NLA_NESTED },
@@ -384,8 +444,9 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
[NL80211_ATTR_SOCKET_OWNER] = { .type = NLA_FLAG },
[NL80211_ATTR_CSA_C_OFFSETS_TX] = { .type = NLA_BINARY },
[NL80211_ATTR_USE_RRM] = { .type = NLA_FLAG },
- [NL80211_ATTR_TSID] = { .type = NLA_U8 },
- [NL80211_ATTR_USER_PRIO] = { .type = NLA_U8 },
+ [NL80211_ATTR_TSID] = NLA_POLICY_MAX(NLA_U8, IEEE80211_NUM_TIDS - 1),
+ [NL80211_ATTR_USER_PRIO] =
+ NLA_POLICY_MAX(NLA_U8, IEEE80211_NUM_UPS - 1),
[NL80211_ATTR_ADMITTED_TIME] = { .type = NLA_U16 },
[NL80211_ATTR_SMPS_MODE] = { .type = NLA_U8 },
[NL80211_ATTR_MAC_MASK] = { .len = ETH_ALEN },
@@ -395,12 +456,13 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
[NL80211_ATTR_REG_INDOOR] = { .type = NLA_FLAG },
[NL80211_ATTR_PBSS] = { .type = NLA_FLAG },
[NL80211_ATTR_BSS_SELECT] = { .type = NLA_NESTED },
- [NL80211_ATTR_STA_SUPPORT_P2P_PS] = { .type = NLA_U8 },
+ [NL80211_ATTR_STA_SUPPORT_P2P_PS] =
+ NLA_POLICY_MAX(NLA_U8, NUM_NL80211_P2P_PS_STATUS - 1),
[NL80211_ATTR_MU_MIMO_GROUP_DATA] = {
.len = VHT_MUMIMO_GROUPS_DATA_LEN
},
[NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR] = { .len = ETH_ALEN },
- [NL80211_ATTR_NAN_MASTER_PREF] = { .type = NLA_U8 },
+ [NL80211_ATTR_NAN_MASTER_PREF] = NLA_POLICY_MIN(NLA_U8, 1),
[NL80211_ATTR_BANDS] = { .type = NLA_U32 },
[NL80211_ATTR_NAN_FUNC] = { .type = NLA_NESTED },
[NL80211_ATTR_FILS_KEK] = { .type = NLA_BINARY,
@@ -428,6 +490,13 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
[NL80211_ATTR_TXQ_LIMIT] = { .type = NLA_U32 },
[NL80211_ATTR_TXQ_MEMORY_LIMIT] = { .type = NLA_U32 },
[NL80211_ATTR_TXQ_QUANTUM] = { .type = NLA_U32 },
+ [NL80211_ATTR_HE_CAPABILITY] = { .type = NLA_BINARY,
+ .len = NL80211_HE_MAX_CAPABILITY_LEN },
+
+ [NL80211_ATTR_FTM_RESPONDER] = {
+ .type = NLA_NESTED,
+ .validation_data = nl80211_ftm_responder_policy,
+ },
};
/* policy for the key attributes */
@@ -438,7 +507,7 @@ static const struct nla_policy nl80211_key_policy[NL80211_KEY_MAX + 1] = {
[NL80211_KEY_SEQ] = { .type = NLA_BINARY, .len = 16 },
[NL80211_KEY_DEFAULT] = { .type = NLA_FLAG },
[NL80211_KEY_DEFAULT_MGMT] = { .type = NLA_FLAG },
- [NL80211_KEY_TYPE] = { .type = NLA_U32 },
+ [NL80211_KEY_TYPE] = NLA_POLICY_MAX(NLA_U32, NUM_NL80211_KEYTYPES - 1),
[NL80211_KEY_DEFAULT_TYPES] = { .type = NLA_NESTED },
};
@@ -489,7 +558,10 @@ nl80211_wowlan_tcp_policy[NUM_NL80211_WOWLAN_TCP] = {
static const struct nla_policy
nl80211_coalesce_policy[NUM_NL80211_ATTR_COALESCE_RULE] = {
[NL80211_ATTR_COALESCE_RULE_DELAY] = { .type = NLA_U32 },
- [NL80211_ATTR_COALESCE_RULE_CONDITION] = { .type = NLA_U32 },
+ [NL80211_ATTR_COALESCE_RULE_CONDITION] =
+ NLA_POLICY_RANGE(NLA_U32,
+ NL80211_COALESCE_CONDITION_MATCH,
+ NL80211_COALESCE_CONDITION_NO_MATCH),
[NL80211_ATTR_COALESCE_RULE_PKT_PATTERN] = { .type = NLA_NESTED },
};
@@ -565,8 +637,7 @@ nl80211_packet_pattern_policy[MAX_NL80211_PKTPAT + 1] = {
[NL80211_PKTPAT_OFFSET] = { .type = NLA_U32 },
};
-static int nl80211_prepare_wdev_dump(struct sk_buff *skb,
- struct netlink_callback *cb,
+static int nl80211_prepare_wdev_dump(struct netlink_callback *cb,
struct cfg80211_registered_device **rdev,
struct wireless_dev **wdev)
{
@@ -580,7 +651,7 @@ static int nl80211_prepare_wdev_dump(struct sk_buff *skb,
return err;
*wdev = __cfg80211_wdev_from_attrs(
- sock_net(skb->sk),
+ sock_net(cb->skb->sk),
genl_family_attrbuf(&nl80211_fam));
if (IS_ERR(*wdev))
return PTR_ERR(*wdev);
@@ -612,36 +683,6 @@ static int nl80211_prepare_wdev_dump(struct sk_buff *skb,
return 0;
}
-/* IE validation */
-static bool is_valid_ie_attr(const struct nlattr *attr)
-{
- const u8 *pos;
- int len;
-
- if (!attr)
- return true;
-
- pos = nla_data(attr);
- len = nla_len(attr);
-
- while (len) {
- u8 elemlen;
-
- if (len < 2)
- return false;
- len -= 2;
-
- elemlen = pos[1];
- if (elemlen > len)
- return false;
-
- len -= elemlen;
- pos += 2 + elemlen;
- }
-
- return true;
-}
-
/* message building helper */
static inline void *nl80211hdr_put(struct sk_buff *skb, u32 portid, u32 seq,
int flags, u8 cmd)
@@ -667,13 +708,13 @@ static int nl80211_msg_put_wmm_rules(struct sk_buff *msg,
goto nla_put_failure;
if (nla_put_u16(msg, NL80211_WMMR_CW_MIN,
- rule->wmm_rule->client[j].cw_min) ||
+ rule->wmm_rule.client[j].cw_min) ||
nla_put_u16(msg, NL80211_WMMR_CW_MAX,
- rule->wmm_rule->client[j].cw_max) ||
+ rule->wmm_rule.client[j].cw_max) ||
nla_put_u8(msg, NL80211_WMMR_AIFSN,
- rule->wmm_rule->client[j].aifsn) ||
- nla_put_u8(msg, NL80211_WMMR_TXOP,
- rule->wmm_rule->client[j].cot))
+ rule->wmm_rule.client[j].aifsn) ||
+ nla_put_u16(msg, NL80211_WMMR_TXOP,
+ rule->wmm_rule.client[j].cot))
goto nla_put_failure;
nla_nest_end(msg, nl_wmm_rule);
@@ -764,9 +805,9 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, struct wiphy *wiphy,
if (large) {
const struct ieee80211_reg_rule *rule =
- freq_reg_info(wiphy, chan->center_freq);
+ freq_reg_info(wiphy, MHZ_TO_KHZ(chan->center_freq));
- if (!IS_ERR(rule) && rule->wmm_rule) {
+ if (!IS_ERR_OR_NULL(rule) && rule->has_wmm) {
if (nl80211_msg_put_wmm_rules(msg, rule))
goto nla_put_failure;
}
@@ -856,12 +897,8 @@ static int nl80211_parse_key_new(struct genl_info *info, struct nlattr *key,
if (tb[NL80211_KEY_CIPHER])
k->p.cipher = nla_get_u32(tb[NL80211_KEY_CIPHER]);
- if (tb[NL80211_KEY_TYPE]) {
+ if (tb[NL80211_KEY_TYPE])
k->type = nla_get_u32(tb[NL80211_KEY_TYPE]);
- if (k->type < 0 || k->type >= NUM_NL80211_KEYTYPES)
- return genl_err_attr(info, -EINVAL,
- tb[NL80211_KEY_TYPE]);
- }
if (tb[NL80211_KEY_DEFAULT_TYPES]) {
struct nlattr *kdt[NUM_NL80211_KEY_DEFAULT_TYPES];
@@ -908,13 +945,8 @@ static int nl80211_parse_key_old(struct genl_info *info, struct key_parse *k)
if (k->defmgmt)
k->def_multi = true;
- if (info->attrs[NL80211_ATTR_KEY_TYPE]) {
+ if (info->attrs[NL80211_ATTR_KEY_TYPE])
k->type = nla_get_u32(info->attrs[NL80211_ATTR_KEY_TYPE]);
- if (k->type < 0 || k->type >= NUM_NL80211_KEYTYPES) {
- GENL_SET_ERR_MSG(info, "key type out of range");
- return -EINVAL;
- }
- }
if (info->attrs[NL80211_ATTR_KEY_DEFAULT_TYPES]) {
struct nlattr *kdt[NUM_NL80211_KEY_DEFAULT_TYPES];
@@ -1324,6 +1356,34 @@ static int nl80211_send_coalesce(struct sk_buff *msg,
return 0;
}
+static int
+nl80211_send_iftype_data(struct sk_buff *msg,
+ const struct ieee80211_sband_iftype_data *iftdata)
+{
+ const struct ieee80211_sta_he_cap *he_cap = &iftdata->he_cap;
+
+ if (nl80211_put_iftypes(msg, NL80211_BAND_IFTYPE_ATTR_IFTYPES,
+ iftdata->types_mask))
+ return -ENOBUFS;
+
+ if (he_cap->has_he) {
+ if (nla_put(msg, NL80211_BAND_IFTYPE_ATTR_HE_CAP_MAC,
+ sizeof(he_cap->he_cap_elem.mac_cap_info),
+ he_cap->he_cap_elem.mac_cap_info) ||
+ nla_put(msg, NL80211_BAND_IFTYPE_ATTR_HE_CAP_PHY,
+ sizeof(he_cap->he_cap_elem.phy_cap_info),
+ he_cap->he_cap_elem.phy_cap_info) ||
+ nla_put(msg, NL80211_BAND_IFTYPE_ATTR_HE_CAP_MCS_SET,
+ sizeof(he_cap->he_mcs_nss_supp),
+ &he_cap->he_mcs_nss_supp) ||
+ nla_put(msg, NL80211_BAND_IFTYPE_ATTR_HE_CAP_PPE,
+ sizeof(he_cap->ppe_thres), he_cap->ppe_thres))
+ return -ENOBUFS;
+ }
+
+ return 0;
+}
+
static int nl80211_send_band_rateinfo(struct sk_buff *msg,
struct ieee80211_supported_band *sband)
{
@@ -1353,6 +1413,32 @@ static int nl80211_send_band_rateinfo(struct sk_buff *msg,
sband->vht_cap.cap)))
return -ENOBUFS;
+ if (sband->n_iftype_data) {
+ struct nlattr *nl_iftype_data =
+ nla_nest_start(msg, NL80211_BAND_ATTR_IFTYPE_DATA);
+ int err;
+
+ if (!nl_iftype_data)
+ return -ENOBUFS;
+
+ for (i = 0; i < sband->n_iftype_data; i++) {
+ struct nlattr *iftdata;
+
+ iftdata = nla_nest_start(msg, i + 1);
+ if (!iftdata)
+ return -ENOBUFS;
+
+ err = nl80211_send_iftype_data(msg,
+ &sband->iftype_data[i]);
+ if (err)
+ return err;
+
+ nla_nest_end(msg, iftdata);
+ }
+
+ nla_nest_end(msg, nl_iftype_data);
+ }
+
/* add bitrates */
nl_rates = nla_nest_start(msg, NL80211_BAND_ATTR_RATES);
if (!nl_rates)
@@ -2236,12 +2322,14 @@ static int nl80211_parse_chandef(struct cfg80211_registered_device *rdev,
struct genl_info *info,
struct cfg80211_chan_def *chandef)
{
+ struct netlink_ext_ack *extack = info->extack;
+ struct nlattr **attrs = info->attrs;
u32 control_freq;
- if (!info->attrs[NL80211_ATTR_WIPHY_FREQ])
+ if (!attrs[NL80211_ATTR_WIPHY_FREQ])
return -EINVAL;
- control_freq = nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ]);
+ control_freq = nla_get_u32(attrs[NL80211_ATTR_WIPHY_FREQ]);
chandef->chan = ieee80211_get_channel(&rdev->wiphy, control_freq);
chandef->width = NL80211_CHAN_WIDTH_20_NOHT;
@@ -2249,14 +2337,16 @@ static int nl80211_parse_chandef(struct cfg80211_registered_device *rdev,
chandef->center_freq2 = 0;
/* Primary channel not allowed */
- if (!chandef->chan || chandef->chan->flags & IEEE80211_CHAN_DISABLED)
+ if (!chandef->chan || chandef->chan->flags & IEEE80211_CHAN_DISABLED) {
+ NL_SET_ERR_MSG_ATTR(extack, attrs[NL80211_ATTR_WIPHY_FREQ],
+ "Channel is disabled");
return -EINVAL;
+ }
- if (info->attrs[NL80211_ATTR_WIPHY_CHANNEL_TYPE]) {
+ if (attrs[NL80211_ATTR_WIPHY_CHANNEL_TYPE]) {
enum nl80211_channel_type chantype;
- chantype = nla_get_u32(
- info->attrs[NL80211_ATTR_WIPHY_CHANNEL_TYPE]);
+ chantype = nla_get_u32(attrs[NL80211_ATTR_WIPHY_CHANNEL_TYPE]);
switch (chantype) {
case NL80211_CHAN_NO_HT:
@@ -2266,42 +2356,56 @@ static int nl80211_parse_chandef(struct cfg80211_registered_device *rdev,
cfg80211_chandef_create(chandef, chandef->chan,
chantype);
/* user input for center_freq is incorrect */
- if (info->attrs[NL80211_ATTR_CENTER_FREQ1] &&
- chandef->center_freq1 != nla_get_u32(
- info->attrs[NL80211_ATTR_CENTER_FREQ1]))
+ if (attrs[NL80211_ATTR_CENTER_FREQ1] &&
+ chandef->center_freq1 != nla_get_u32(attrs[NL80211_ATTR_CENTER_FREQ1])) {
+ NL_SET_ERR_MSG_ATTR(extack,
+ attrs[NL80211_ATTR_CENTER_FREQ1],
+ "bad center frequency 1");
return -EINVAL;
+ }
/* center_freq2 must be zero */
- if (info->attrs[NL80211_ATTR_CENTER_FREQ2] &&
- nla_get_u32(info->attrs[NL80211_ATTR_CENTER_FREQ2]))
+ if (attrs[NL80211_ATTR_CENTER_FREQ2] &&
+ nla_get_u32(attrs[NL80211_ATTR_CENTER_FREQ2])) {
+ NL_SET_ERR_MSG_ATTR(extack,
+ attrs[NL80211_ATTR_CENTER_FREQ2],
+ "center frequency 2 can't be used");
return -EINVAL;
+ }
break;
default:
+ NL_SET_ERR_MSG_ATTR(extack,
+ attrs[NL80211_ATTR_WIPHY_CHANNEL_TYPE],
+ "invalid channel type");
return -EINVAL;
}
- } else if (info->attrs[NL80211_ATTR_CHANNEL_WIDTH]) {
+ } else if (attrs[NL80211_ATTR_CHANNEL_WIDTH]) {
chandef->width =
- nla_get_u32(info->attrs[NL80211_ATTR_CHANNEL_WIDTH]);
- if (info->attrs[NL80211_ATTR_CENTER_FREQ1])
+ nla_get_u32(attrs[NL80211_ATTR_CHANNEL_WIDTH]);
+ if (attrs[NL80211_ATTR_CENTER_FREQ1])
chandef->center_freq1 =
- nla_get_u32(
- info->attrs[NL80211_ATTR_CENTER_FREQ1]);
- if (info->attrs[NL80211_ATTR_CENTER_FREQ2])
+ nla_get_u32(attrs[NL80211_ATTR_CENTER_FREQ1]);
+ if (attrs[NL80211_ATTR_CENTER_FREQ2])
chandef->center_freq2 =
- nla_get_u32(
- info->attrs[NL80211_ATTR_CENTER_FREQ2]);
+ nla_get_u32(attrs[NL80211_ATTR_CENTER_FREQ2]);
}
- if (!cfg80211_chandef_valid(chandef))
+ if (!cfg80211_chandef_valid(chandef)) {
+ NL_SET_ERR_MSG(extack, "invalid channel definition");
return -EINVAL;
+ }
if (!cfg80211_chandef_usable(&rdev->wiphy, chandef,
- IEEE80211_CHAN_DISABLED))
+ IEEE80211_CHAN_DISABLED)) {
+ NL_SET_ERR_MSG(extack, "(extension) channel is disabled");
return -EINVAL;
+ }
if ((chandef->width == NL80211_CHAN_WIDTH_5 ||
chandef->width == NL80211_CHAN_WIDTH_10) &&
- !(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_5_10_MHZ))
+ !(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_5_10_MHZ)) {
+ NL_SET_ERR_MSG(extack, "5/10 MHz not supported");
return -EINVAL;
+ }
return 0;
}
@@ -2561,8 +2665,6 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
if (info->attrs[NL80211_ATTR_WIPHY_RETRY_SHORT]) {
retry_short = nla_get_u8(
info->attrs[NL80211_ATTR_WIPHY_RETRY_SHORT]);
- if (retry_short == 0)
- return -EINVAL;
changed |= WIPHY_PARAM_RETRY_SHORT;
}
@@ -2570,8 +2672,6 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
if (info->attrs[NL80211_ATTR_WIPHY_RETRY_LONG]) {
retry_long = nla_get_u8(
info->attrs[NL80211_ATTR_WIPHY_RETRY_LONG]);
- if (retry_long == 0)
- return -EINVAL;
changed |= WIPHY_PARAM_RETRY_LONG;
}
@@ -2757,7 +2857,8 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag
nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, wdev_address(wdev)) ||
nla_put_u32(msg, NL80211_ATTR_GENERATION,
rdev->devlist_generation ^
- (cfg80211_rdev_list_generation << 2)))
+ (cfg80211_rdev_list_generation << 2)) ||
+ nla_put_u8(msg, NL80211_ATTR_4ADDR, wdev->use_4addr))
goto nla_put_failure;
if (rdev->ops->get_channel) {
@@ -3062,8 +3163,6 @@ static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info)
ntype = nla_get_u32(info->attrs[NL80211_ATTR_IFTYPE]);
if (otype != ntype)
change = true;
- if (ntype > NL80211_IFTYPE_MAX)
- return -EINVAL;
}
if (info->attrs[NL80211_ATTR_MESH_ID]) {
@@ -3128,11 +3227,8 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info)
if (!info->attrs[NL80211_ATTR_IFNAME])
return -EINVAL;
- if (info->attrs[NL80211_ATTR_IFTYPE]) {
+ if (info->attrs[NL80211_ATTR_IFTYPE])
type = nla_get_u32(info->attrs[NL80211_ATTR_IFTYPE]);
- if (type > NL80211_IFTYPE_MAX)
- return -EINVAL;
- }
if (!rdev->ops->add_virtual_intf ||
!(rdev->wiphy.interface_modes & (1 << type)))
@@ -3195,15 +3291,7 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info)
* P2P Device and NAN do not have a netdev, so don't go
* through the netdev notifier and must be added here
*/
- mutex_init(&wdev->mtx);
- INIT_LIST_HEAD(&wdev->event_list);
- spin_lock_init(&wdev->event_lock);
- INIT_LIST_HEAD(&wdev->mgmt_registrations);
- spin_lock_init(&wdev->mgmt_registrations_lock);
-
- wdev->identifier = ++rdev->wdev_id;
- list_add_rcu(&wdev->list, &rdev->wiphy.wdev_list);
- rdev->devlist_generation++;
+ cfg80211_init_wdev(rdev, wdev);
break;
default:
break;
@@ -3215,15 +3303,6 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info)
return -ENOBUFS;
}
- /*
- * For wdevs which have no associated netdev object (e.g. of type
- * NL80211_IFTYPE_P2P_DEVICE), emit the NEW_INTERFACE event here.
- * For all other types, the event will be generated from the
- * netdev notifier
- */
- if (!wdev->netdev)
- nl80211_notify_iface(rdev, wdev, NL80211_CMD_NEW_INTERFACE);
-
return genlmsg_reply(msg, info);
}
@@ -3302,7 +3381,7 @@ static void get_key_callback(void *c, struct key_params *params)
params->cipher)))
goto nla_put_failure;
- if (nla_put_u8(cookie->msg, NL80211_ATTR_KEY_IDX, cookie->idx))
+ if (nla_put_u8(cookie->msg, NL80211_KEY_IDX, cookie->idx))
goto nla_put_failure;
nla_nest_end(cookie->msg, key);
@@ -3329,9 +3408,6 @@ static int nl80211_get_key(struct sk_buff *skb, struct genl_info *info)
if (info->attrs[NL80211_ATTR_KEY_IDX])
key_idx = nla_get_u8(info->attrs[NL80211_ATTR_KEY_IDX]);
- if (key_idx > 5)
- return -EINVAL;
-
if (info->attrs[NL80211_ATTR_MAC])
mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]);
@@ -3339,8 +3415,6 @@ static int nl80211_get_key(struct sk_buff *skb, struct genl_info *info)
if (info->attrs[NL80211_ATTR_KEY_TYPE]) {
u32 kt = nla_get_u32(info->attrs[NL80211_ATTR_KEY_TYPE]);
- if (kt >= NUM_NL80211_KEYTYPES)
- return -EINVAL;
if (kt != NL80211_KEYTYPE_GROUP &&
kt != NL80211_KEYTYPE_PAIRWISE)
return -EINVAL;
@@ -3699,6 +3773,7 @@ static bool ht_rateset_to_mask(struct ieee80211_supported_band *sband,
return false;
/* check availability */
+ ridx = array_index_nospec(ridx, IEEE80211_HT_MCS_MASK_LEN);
if (sband->ht_cap.mcs.rx_mask[ridx] & rbit)
mcs[ridx] |= rbit;
else
@@ -3940,16 +4015,12 @@ static int validate_beacon_tx_rate(struct cfg80211_registered_device *rdev,
return 0;
}
-static int nl80211_parse_beacon(struct nlattr *attrs[],
+static int nl80211_parse_beacon(struct cfg80211_registered_device *rdev,
+ struct nlattr *attrs[],
struct cfg80211_beacon_data *bcn)
{
bool haveinfo = false;
-
- if (!is_valid_ie_attr(attrs[NL80211_ATTR_BEACON_TAIL]) ||
- !is_valid_ie_attr(attrs[NL80211_ATTR_IE]) ||
- !is_valid_ie_attr(attrs[NL80211_ATTR_IE_PROBE_RESP]) ||
- !is_valid_ie_attr(attrs[NL80211_ATTR_IE_ASSOC_RESP]))
- return -EINVAL;
+ int err;
memset(bcn, 0, sizeof(*bcn));
@@ -3994,6 +4065,35 @@ static int nl80211_parse_beacon(struct nlattr *attrs[],
bcn->probe_resp_len = nla_len(attrs[NL80211_ATTR_PROBE_RESP]);
}
+ if (attrs[NL80211_ATTR_FTM_RESPONDER]) {
+ struct nlattr *tb[NL80211_FTM_RESP_ATTR_MAX + 1];
+
+ err = nla_parse_nested(tb, NL80211_FTM_RESP_ATTR_MAX,
+ attrs[NL80211_ATTR_FTM_RESPONDER],
+ NULL, NULL);
+ if (err)
+ return err;
+
+ if (tb[NL80211_FTM_RESP_ATTR_ENABLED] &&
+ wiphy_ext_feature_isset(&rdev->wiphy,
+ NL80211_EXT_FEATURE_ENABLE_FTM_RESPONDER))
+ bcn->ftm_responder = 1;
+ else
+ return -EOPNOTSUPP;
+
+ if (tb[NL80211_FTM_RESP_ATTR_LCI]) {
+ bcn->lci = nla_data(tb[NL80211_FTM_RESP_ATTR_LCI]);
+ bcn->lci_len = nla_len(tb[NL80211_FTM_RESP_ATTR_LCI]);
+ }
+
+ if (tb[NL80211_FTM_RESP_ATTR_CIVICLOC]) {
+ bcn->civicloc = nla_data(tb[NL80211_FTM_RESP_ATTR_CIVICLOC]);
+ bcn->civicloc_len = nla_len(tb[NL80211_FTM_RESP_ATTR_CIVICLOC]);
+ }
+ } else {
+ bcn->ftm_responder = -1;
+ }
+
return 0;
}
@@ -4038,6 +4138,9 @@ static void nl80211_calculate_ap_params(struct cfg80211_ap_settings *params)
cap = cfg80211_find_ie(WLAN_EID_VHT_CAPABILITY, ies, ies_len);
if (cap && cap[1] >= sizeof(*params->vht_cap))
params->vht_cap = (void *)(cap + 2);
+ cap = cfg80211_find_ext_ie(WLAN_EID_EXT_HE_CAPABILITY, ies, ies_len);
+ if (cap && cap[1] >= sizeof(*params->he_cap) + 1)
+ params->he_cap = (void *)(cap + 3);
}
static bool nl80211_get_ap_channel(struct cfg80211_registered_device *rdev,
@@ -4137,7 +4240,7 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
!info->attrs[NL80211_ATTR_BEACON_HEAD])
return -EINVAL;
- err = nl80211_parse_beacon(info->attrs, &params.beacon);
+ err = nl80211_parse_beacon(rdev, info->attrs, &params.beacon);
if (err)
return err;
@@ -4167,14 +4270,9 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
return -EINVAL;
}
- if (info->attrs[NL80211_ATTR_HIDDEN_SSID]) {
+ if (info->attrs[NL80211_ATTR_HIDDEN_SSID])
params.hidden_ssid = nla_get_u32(
info->attrs[NL80211_ATTR_HIDDEN_SSID]);
- if (params.hidden_ssid != NL80211_HIDDEN_SSID_NOT_IN_USE &&
- params.hidden_ssid != NL80211_HIDDEN_SSID_ZERO_LEN &&
- params.hidden_ssid != NL80211_HIDDEN_SSID_ZERO_CONTENTS)
- return -EINVAL;
- }
params.privacy = !!info->attrs[NL80211_ATTR_PRIVACY];
@@ -4204,8 +4302,6 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
return -EINVAL;
params.p2p_ctwindow =
nla_get_u8(info->attrs[NL80211_ATTR_P2P_CTWINDOW]);
- if (params.p2p_ctwindow > 127)
- return -EINVAL;
if (params.p2p_ctwindow != 0 &&
!(rdev->wiphy.features & NL80211_FEATURE_P2P_GO_CTWIN))
return -EINVAL;
@@ -4217,8 +4313,6 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO)
return -EINVAL;
tmp = nla_get_u8(info->attrs[NL80211_ATTR_P2P_OPPPS]);
- if (tmp > 1)
- return -EINVAL;
params.p2p_opp_ps = tmp;
if (params.p2p_opp_ps != 0 &&
!(rdev->wiphy.features & NL80211_FEATURE_P2P_GO_OPPPS))
@@ -4321,7 +4415,7 @@ static int nl80211_set_beacon(struct sk_buff *skb, struct genl_info *info)
if (!wdev->beacon_interval)
return -EINVAL;
- err = nl80211_parse_beacon(info->attrs, &params);
+ err = nl80211_parse_beacon(rdev, info->attrs, &params);
if (err)
return err;
@@ -4472,6 +4566,9 @@ static bool nl80211_put_sta_rate(struct sk_buff *msg, struct rate_info *info,
case RATE_INFO_BW_160:
rate_flg = NL80211_RATE_INFO_160_MHZ_WIDTH;
break;
+ case RATE_INFO_BW_HE_RU:
+ rate_flg = 0;
+ WARN_ON(!(info->flags & RATE_INFO_FLAGS_HE_MCS));
}
if (rate_flg && nla_put_flag(msg, rate_flg))
@@ -4491,6 +4588,19 @@ static bool nl80211_put_sta_rate(struct sk_buff *msg, struct rate_info *info,
if (info->flags & RATE_INFO_FLAGS_SHORT_GI &&
nla_put_flag(msg, NL80211_RATE_INFO_SHORT_GI))
return false;
+ } else if (info->flags & RATE_INFO_FLAGS_HE_MCS) {
+ if (nla_put_u8(msg, NL80211_RATE_INFO_HE_MCS, info->mcs))
+ return false;
+ if (nla_put_u8(msg, NL80211_RATE_INFO_HE_NSS, info->nss))
+ return false;
+ if (nla_put_u8(msg, NL80211_RATE_INFO_HE_GI, info->he_gi))
+ return false;
+ if (nla_put_u8(msg, NL80211_RATE_INFO_HE_DCM, info->he_dcm))
+ return false;
+ if (info->bw == RATE_INFO_BW_HE_RU &&
+ nla_put_u8(msg, NL80211_RATE_INFO_HE_RU_ALLOC,
+ info->he_ru_alloc))
+ return false;
}
nla_nest_end(msg, rate);
@@ -4547,13 +4657,13 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid,
#define PUT_SINFO(attr, memb, type) do { \
BUILD_BUG_ON(sizeof(type) == sizeof(u64)); \
- if (sinfo->filled & (1ULL << NL80211_STA_INFO_ ## attr) && \
+ if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_ ## attr) && \
nla_put_ ## type(msg, NL80211_STA_INFO_ ## attr, \
sinfo->memb)) \
goto nla_put_failure; \
} while (0)
#define PUT_SINFO_U64(attr, memb) do { \
- if (sinfo->filled & (1ULL << NL80211_STA_INFO_ ## attr) && \
+ if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_ ## attr) && \
nla_put_u64_64bit(msg, NL80211_STA_INFO_ ## attr, \
sinfo->memb, NL80211_STA_INFO_PAD)) \
goto nla_put_failure; \
@@ -4562,14 +4672,14 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid,
PUT_SINFO(CONNECTED_TIME, connected_time, u32);
PUT_SINFO(INACTIVE_TIME, inactive_time, u32);
- if (sinfo->filled & (BIT(NL80211_STA_INFO_RX_BYTES) |
- BIT(NL80211_STA_INFO_RX_BYTES64)) &&
+ if (sinfo->filled & (BIT_ULL(NL80211_STA_INFO_RX_BYTES) |
+ BIT_ULL(NL80211_STA_INFO_RX_BYTES64)) &&
nla_put_u32(msg, NL80211_STA_INFO_RX_BYTES,
(u32)sinfo->rx_bytes))
goto nla_put_failure;
- if (sinfo->filled & (BIT(NL80211_STA_INFO_TX_BYTES) |
- BIT(NL80211_STA_INFO_TX_BYTES64)) &&
+ if (sinfo->filled & (BIT_ULL(NL80211_STA_INFO_TX_BYTES) |
+ BIT_ULL(NL80211_STA_INFO_TX_BYTES64)) &&
nla_put_u32(msg, NL80211_STA_INFO_TX_BYTES,
(u32)sinfo->tx_bytes))
goto nla_put_failure;
@@ -4589,24 +4699,24 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid,
default:
break;
}
- if (sinfo->filled & BIT(NL80211_STA_INFO_CHAIN_SIGNAL)) {
+ if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL)) {
if (!nl80211_put_signal(msg, sinfo->chains,
sinfo->chain_signal,
NL80211_STA_INFO_CHAIN_SIGNAL))
goto nla_put_failure;
}
- if (sinfo->filled & BIT(NL80211_STA_INFO_CHAIN_SIGNAL_AVG)) {
+ if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL_AVG)) {
if (!nl80211_put_signal(msg, sinfo->chains,
sinfo->chain_signal_avg,
NL80211_STA_INFO_CHAIN_SIGNAL_AVG))
goto nla_put_failure;
}
- if (sinfo->filled & BIT(NL80211_STA_INFO_TX_BITRATE)) {
+ if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_BITRATE)) {
if (!nl80211_put_sta_rate(msg, &sinfo->txrate,
NL80211_STA_INFO_TX_BITRATE))
goto nla_put_failure;
}
- if (sinfo->filled & BIT(NL80211_STA_INFO_RX_BITRATE)) {
+ if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_BITRATE)) {
if (!nl80211_put_sta_rate(msg, &sinfo->rxrate,
NL80211_STA_INFO_RX_BITRATE))
goto nla_put_failure;
@@ -4622,7 +4732,7 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid,
PUT_SINFO(PEER_PM, peer_pm, u32);
PUT_SINFO(NONPEER_PM, nonpeer_pm, u32);
- if (sinfo->filled & BIT(NL80211_STA_INFO_BSS_PARAM)) {
+ if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_BSS_PARAM)) {
bss_param = nla_nest_start(msg, NL80211_STA_INFO_BSS_PARAM);
if (!bss_param)
goto nla_put_failure;
@@ -4641,7 +4751,7 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid,
nla_nest_end(msg, bss_param);
}
- if ((sinfo->filled & BIT(NL80211_STA_INFO_STA_FLAGS)) &&
+ if ((sinfo->filled & BIT_ULL(NL80211_STA_INFO_STA_FLAGS)) &&
nla_put(msg, NL80211_STA_INFO_STA_FLAGS,
sizeof(struct nl80211_sta_flag_update),
&sinfo->sta_flags))
@@ -4651,10 +4761,13 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid,
PUT_SINFO_U64(RX_DROP_MISC, rx_dropped_misc);
PUT_SINFO_U64(BEACON_RX, rx_beacon);
PUT_SINFO(BEACON_SIGNAL_AVG, rx_beacon_signal_avg, u8);
- PUT_SINFO(ACK_SIGNAL, ack_signal, u8);
+ PUT_SINFO(RX_MPDUS, rx_mpdu_count, u32);
+ PUT_SINFO(FCS_ERROR_COUNT, fcs_err_count, u32);
if (wiphy_ext_feature_isset(&rdev->wiphy,
- NL80211_EXT_FEATURE_DATA_ACK_SIGNAL_SUPPORT))
- PUT_SINFO(DATA_ACK_SIGNAL_AVG, avg_ack_signal, s8);
+ NL80211_EXT_FEATURE_ACK_SIGNAL_SUPPORT)) {
+ PUT_SINFO(ACK_SIGNAL, ack_signal, u8);
+ PUT_SINFO(ACK_SIGNAL_AVG, avg_ack_signal, s8);
+ }
#undef PUT_SINFO
#undef PUT_SINFO_U64
@@ -4733,7 +4846,7 @@ static int nl80211_dump_station(struct sk_buff *skb,
int err;
rtnl_lock();
- err = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev);
+ err = nl80211_prepare_wdev_dump(cb, &rdev, &wdev);
if (err)
goto out_err;
@@ -4887,7 +5000,8 @@ int cfg80211_check_station_change(struct wiphy *wiphy,
return -EINVAL;
if (params->supported_rates)
return -EINVAL;
- if (params->ext_capab || params->ht_capa || params->vht_capa)
+ if (params->ext_capab || params->ht_capa || params->vht_capa ||
+ params->he_capa)
return -EINVAL;
}
@@ -5093,6 +5207,15 @@ static int nl80211_set_station_tdls(struct genl_info *info,
if (info->attrs[NL80211_ATTR_VHT_CAPABILITY])
params->vht_capa =
nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY]);
+ if (info->attrs[NL80211_ATTR_HE_CAPABILITY]) {
+ params->he_capa =
+ nla_data(info->attrs[NL80211_ATTR_HE_CAPABILITY]);
+ params->he_capa_len =
+ nla_len(info->attrs[NL80211_ATTR_HE_CAPABILITY]);
+
+ if (params->he_capa_len < NL80211_HE_MIN_CAPABILITY_LEN)
+ return -EINVAL;
+ }
err = nl80211_parse_sta_channel_info(info, params);
if (err)
@@ -5128,17 +5251,11 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info)
else
params.listen_interval = -1;
- if (info->attrs[NL80211_ATTR_STA_SUPPORT_P2P_PS]) {
- u8 tmp;
-
- tmp = nla_get_u8(info->attrs[NL80211_ATTR_STA_SUPPORT_P2P_PS]);
- if (tmp >= NUM_NL80211_P2P_PS_STATUS)
- return -EINVAL;
-
- params.support_p2p_ps = tmp;
- } else {
+ if (info->attrs[NL80211_ATTR_STA_SUPPORT_P2P_PS])
+ params.support_p2p_ps =
+ nla_get_u8(info->attrs[NL80211_ATTR_STA_SUPPORT_P2P_PS]);
+ else
params.support_p2p_ps = -1;
- }
if (!info->attrs[NL80211_ATTR_MAC])
return -EINVAL;
@@ -5168,38 +5285,23 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info)
if (parse_station_flags(info, dev->ieee80211_ptr->iftype, &params))
return -EINVAL;
- if (info->attrs[NL80211_ATTR_STA_PLINK_ACTION]) {
+ if (info->attrs[NL80211_ATTR_STA_PLINK_ACTION])
params.plink_action =
nla_get_u8(info->attrs[NL80211_ATTR_STA_PLINK_ACTION]);
- if (params.plink_action >= NUM_NL80211_PLINK_ACTIONS)
- return -EINVAL;
- }
if (info->attrs[NL80211_ATTR_STA_PLINK_STATE]) {
params.plink_state =
nla_get_u8(info->attrs[NL80211_ATTR_STA_PLINK_STATE]);
- if (params.plink_state >= NUM_NL80211_PLINK_STATES)
- return -EINVAL;
- if (info->attrs[NL80211_ATTR_MESH_PEER_AID]) {
+ if (info->attrs[NL80211_ATTR_MESH_PEER_AID])
params.peer_aid = nla_get_u16(
info->attrs[NL80211_ATTR_MESH_PEER_AID]);
- if (params.peer_aid > IEEE80211_MAX_AID)
- return -EINVAL;
- }
params.sta_modify_mask |= STATION_PARAM_APPLY_PLINK_STATE;
}
- if (info->attrs[NL80211_ATTR_LOCAL_MESH_POWER_MODE]) {
- enum nl80211_mesh_power_mode pm = nla_get_u32(
+ if (info->attrs[NL80211_ATTR_LOCAL_MESH_POWER_MODE])
+ params.local_pm = nla_get_u32(
info->attrs[NL80211_ATTR_LOCAL_MESH_POWER_MODE]);
- if (pm <= NL80211_MESH_POWER_UNKNOWN ||
- pm > NL80211_MESH_POWER_MAX)
- return -EINVAL;
-
- params.local_pm = pm;
- }
-
if (info->attrs[NL80211_ATTR_OPMODE_NOTIF]) {
params.opmode_notif_used = true;
params.opmode_notif =
@@ -5276,13 +5378,8 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
nla_get_u16(info->attrs[NL80211_ATTR_STA_LISTEN_INTERVAL]);
if (info->attrs[NL80211_ATTR_STA_SUPPORT_P2P_PS]) {
- u8 tmp;
-
- tmp = nla_get_u8(info->attrs[NL80211_ATTR_STA_SUPPORT_P2P_PS]);
- if (tmp >= NUM_NL80211_P2P_PS_STATUS)
- return -EINVAL;
-
- params.support_p2p_ps = tmp;
+ params.support_p2p_ps =
+ nla_get_u8(info->attrs[NL80211_ATTR_STA_SUPPORT_P2P_PS]);
} else {
/*
* if not specified, assume it's supported for P2P GO interface,
@@ -5296,8 +5393,6 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
params.aid = nla_get_u16(info->attrs[NL80211_ATTR_PEER_AID]);
else
params.aid = nla_get_u16(info->attrs[NL80211_ATTR_STA_AID]);
- if (!params.aid || params.aid > IEEE80211_MAX_AID)
- return -EINVAL;
if (info->attrs[NL80211_ATTR_STA_CAPABILITY]) {
params.capability =
@@ -5320,18 +5415,26 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
params.vht_capa =
nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY]);
+ if (info->attrs[NL80211_ATTR_HE_CAPABILITY]) {
+ params.he_capa =
+ nla_data(info->attrs[NL80211_ATTR_HE_CAPABILITY]);
+ params.he_capa_len =
+ nla_len(info->attrs[NL80211_ATTR_HE_CAPABILITY]);
+
+ /* max len is validated in nla policy */
+ if (params.he_capa_len < NL80211_HE_MIN_CAPABILITY_LEN)
+ return -EINVAL;
+ }
+
if (info->attrs[NL80211_ATTR_OPMODE_NOTIF]) {
params.opmode_notif_used = true;
params.opmode_notif =
nla_get_u8(info->attrs[NL80211_ATTR_OPMODE_NOTIF]);
}
- if (info->attrs[NL80211_ATTR_STA_PLINK_ACTION]) {
+ if (info->attrs[NL80211_ATTR_STA_PLINK_ACTION])
params.plink_action =
nla_get_u8(info->attrs[NL80211_ATTR_STA_PLINK_ACTION]);
- if (params.plink_action >= NUM_NL80211_PLINK_ACTIONS)
- return -EINVAL;
- }
err = nl80211_parse_sta_channel_info(info, &params);
if (err)
@@ -5352,6 +5455,10 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
if (!(params.sta_flags_set & BIT(NL80211_STA_FLAG_WME))) {
params.ht_capa = NULL;
params.vht_capa = NULL;
+
+ /* HE requires WME */
+ if (params.he_capa_len)
+ return -EINVAL;
}
/* When you run into this, adjust the code below for the new flag */
@@ -5559,7 +5666,7 @@ static int nl80211_dump_mpath(struct sk_buff *skb,
int err;
rtnl_lock();
- err = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev);
+ err = nl80211_prepare_wdev_dump(cb, &rdev, &wdev);
if (err)
goto out_err;
@@ -5755,7 +5862,7 @@ static int nl80211_dump_mpp(struct sk_buff *skb,
int err;
rtnl_lock();
- err = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev);
+ err = nl80211_prepare_wdev_dump(cb, &rdev, &wdev);
if (err)
goto out_err;
@@ -5837,9 +5944,7 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info)
if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO)
return -EINVAL;
params.p2p_ctwindow =
- nla_get_s8(info->attrs[NL80211_ATTR_P2P_CTWINDOW]);
- if (params.p2p_ctwindow < 0)
- return -EINVAL;
+ nla_get_u8(info->attrs[NL80211_ATTR_P2P_CTWINDOW]);
if (params.p2p_ctwindow != 0 &&
!(rdev->wiphy.features & NL80211_FEATURE_P2P_GO_CTWIN))
return -EINVAL;
@@ -5851,8 +5956,6 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info)
if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO)
return -EINVAL;
tmp = nla_get_u8(info->attrs[NL80211_ATTR_P2P_OPPPS]);
- if (tmp > 1)
- return -EINVAL;
params.p2p_opp_ps = tmp;
if (params.p2p_opp_ps &&
!(rdev->wiphy.features & NL80211_FEATURE_P2P_GO_OPPPS))
@@ -6031,33 +6134,49 @@ static int nl80211_get_mesh_config(struct sk_buff *skb,
return -ENOBUFS;
}
-static const struct nla_policy nl80211_meshconf_params_policy[NL80211_MESHCONF_ATTR_MAX+1] = {
- [NL80211_MESHCONF_RETRY_TIMEOUT] = { .type = NLA_U16 },
- [NL80211_MESHCONF_CONFIRM_TIMEOUT] = { .type = NLA_U16 },
- [NL80211_MESHCONF_HOLDING_TIMEOUT] = { .type = NLA_U16 },
- [NL80211_MESHCONF_MAX_PEER_LINKS] = { .type = NLA_U16 },
- [NL80211_MESHCONF_MAX_RETRIES] = { .type = NLA_U8 },
- [NL80211_MESHCONF_TTL] = { .type = NLA_U8 },
- [NL80211_MESHCONF_ELEMENT_TTL] = { .type = NLA_U8 },
- [NL80211_MESHCONF_AUTO_OPEN_PLINKS] = { .type = NLA_U8 },
- [NL80211_MESHCONF_SYNC_OFFSET_MAX_NEIGHBOR] = { .type = NLA_U32 },
+static const struct nla_policy
+nl80211_meshconf_params_policy[NL80211_MESHCONF_ATTR_MAX+1] = {
+ [NL80211_MESHCONF_RETRY_TIMEOUT] =
+ NLA_POLICY_RANGE(NLA_U16, 1, 255),
+ [NL80211_MESHCONF_CONFIRM_TIMEOUT] =
+ NLA_POLICY_RANGE(NLA_U16, 1, 255),
+ [NL80211_MESHCONF_HOLDING_TIMEOUT] =
+ NLA_POLICY_RANGE(NLA_U16, 1, 255),
+ [NL80211_MESHCONF_MAX_PEER_LINKS] =
+ NLA_POLICY_RANGE(NLA_U16, 0, 255),
+ [NL80211_MESHCONF_MAX_RETRIES] = NLA_POLICY_MAX(NLA_U8, 16),
+ [NL80211_MESHCONF_TTL] = NLA_POLICY_MIN(NLA_U8, 1),
+ [NL80211_MESHCONF_ELEMENT_TTL] = NLA_POLICY_MIN(NLA_U8, 1),
+ [NL80211_MESHCONF_AUTO_OPEN_PLINKS] = NLA_POLICY_MAX(NLA_U8, 1),
+ [NL80211_MESHCONF_SYNC_OFFSET_MAX_NEIGHBOR] =
+ NLA_POLICY_RANGE(NLA_U32, 1, 255),
[NL80211_MESHCONF_HWMP_MAX_PREQ_RETRIES] = { .type = NLA_U8 },
[NL80211_MESHCONF_PATH_REFRESH_TIME] = { .type = NLA_U32 },
- [NL80211_MESHCONF_MIN_DISCOVERY_TIMEOUT] = { .type = NLA_U16 },
+ [NL80211_MESHCONF_MIN_DISCOVERY_TIMEOUT] = NLA_POLICY_MIN(NLA_U16, 1),
[NL80211_MESHCONF_HWMP_ACTIVE_PATH_TIMEOUT] = { .type = NLA_U32 },
- [NL80211_MESHCONF_HWMP_PREQ_MIN_INTERVAL] = { .type = NLA_U16 },
- [NL80211_MESHCONF_HWMP_PERR_MIN_INTERVAL] = { .type = NLA_U16 },
- [NL80211_MESHCONF_HWMP_NET_DIAM_TRVS_TIME] = { .type = NLA_U16 },
- [NL80211_MESHCONF_HWMP_ROOTMODE] = { .type = NLA_U8 },
- [NL80211_MESHCONF_HWMP_RANN_INTERVAL] = { .type = NLA_U16 },
- [NL80211_MESHCONF_GATE_ANNOUNCEMENTS] = { .type = NLA_U8 },
- [NL80211_MESHCONF_FORWARDING] = { .type = NLA_U8 },
- [NL80211_MESHCONF_RSSI_THRESHOLD] = { .type = NLA_U32 },
+ [NL80211_MESHCONF_HWMP_PREQ_MIN_INTERVAL] =
+ NLA_POLICY_MIN(NLA_U16, 1),
+ [NL80211_MESHCONF_HWMP_PERR_MIN_INTERVAL] =
+ NLA_POLICY_MIN(NLA_U16, 1),
+ [NL80211_MESHCONF_HWMP_NET_DIAM_TRVS_TIME] =
+ NLA_POLICY_MIN(NLA_U16, 1),
+ [NL80211_MESHCONF_HWMP_ROOTMODE] = NLA_POLICY_MAX(NLA_U8, 4),
+ [NL80211_MESHCONF_HWMP_RANN_INTERVAL] =
+ NLA_POLICY_MIN(NLA_U16, 1),
+ [NL80211_MESHCONF_GATE_ANNOUNCEMENTS] = NLA_POLICY_MAX(NLA_U8, 1),
+ [NL80211_MESHCONF_FORWARDING] = NLA_POLICY_MAX(NLA_U8, 1),
+ [NL80211_MESHCONF_RSSI_THRESHOLD] =
+ NLA_POLICY_RANGE(NLA_S32, -255, 0),
[NL80211_MESHCONF_HT_OPMODE] = { .type = NLA_U16 },
[NL80211_MESHCONF_HWMP_PATH_TO_ROOT_TIMEOUT] = { .type = NLA_U32 },
- [NL80211_MESHCONF_HWMP_ROOT_INTERVAL] = { .type = NLA_U16 },
- [NL80211_MESHCONF_HWMP_CONFIRMATION_INTERVAL] = { .type = NLA_U16 },
- [NL80211_MESHCONF_POWER_MODE] = { .type = NLA_U32 },
+ [NL80211_MESHCONF_HWMP_ROOT_INTERVAL] =
+ NLA_POLICY_MIN(NLA_U16, 1),
+ [NL80211_MESHCONF_HWMP_CONFIRMATION_INTERVAL] =
+ NLA_POLICY_MIN(NLA_U16, 1),
+ [NL80211_MESHCONF_POWER_MODE] =
+ NLA_POLICY_RANGE(NLA_U32,
+ NL80211_MESH_POWER_ACTIVE,
+ NL80211_MESH_POWER_MAX),
[NL80211_MESHCONF_AWAKE_WINDOW] = { .type = NLA_U16 },
[NL80211_MESHCONF_PLINK_TIMEOUT] = { .type = NLA_U32 },
};
@@ -6070,68 +6189,12 @@ static const struct nla_policy
[NL80211_MESH_SETUP_USERSPACE_AUTH] = { .type = NLA_FLAG },
[NL80211_MESH_SETUP_AUTH_PROTOCOL] = { .type = NLA_U8 },
[NL80211_MESH_SETUP_USERSPACE_MPM] = { .type = NLA_FLAG },
- [NL80211_MESH_SETUP_IE] = { .type = NLA_BINARY,
- .len = IEEE80211_MAX_DATA_LEN },
+ [NL80211_MESH_SETUP_IE] =
+ NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_ie_attr,
+ IEEE80211_MAX_DATA_LEN),
[NL80211_MESH_SETUP_USERSPACE_AMPE] = { .type = NLA_FLAG },
};
-static int nl80211_check_bool(const struct nlattr *nla, u8 min, u8 max, bool *out)
-{
- u8 val = nla_get_u8(nla);
- if (val < min || val > max)
- return -EINVAL;
- *out = val;
- return 0;
-}
-
-static int nl80211_check_u8(const struct nlattr *nla, u8 min, u8 max, u8 *out)
-{
- u8 val = nla_get_u8(nla);
- if (val < min || val > max)
- return -EINVAL;
- *out = val;
- return 0;
-}
-
-static int nl80211_check_u16(const struct nlattr *nla, u16 min, u16 max, u16 *out)
-{
- u16 val = nla_get_u16(nla);
- if (val < min || val > max)
- return -EINVAL;
- *out = val;
- return 0;
-}
-
-static int nl80211_check_u32(const struct nlattr *nla, u32 min, u32 max, u32 *out)
-{
- u32 val = nla_get_u32(nla);
- if (val < min || val > max)
- return -EINVAL;
- *out = val;
- return 0;
-}
-
-static int nl80211_check_s32(const struct nlattr *nla, s32 min, s32 max, s32 *out)
-{
- s32 val = nla_get_s32(nla);
- if (val < min || val > max)
- return -EINVAL;
- *out = val;
- return 0;
-}
-
-static int nl80211_check_power_mode(const struct nlattr *nla,
- enum nl80211_mesh_power_mode min,
- enum nl80211_mesh_power_mode max,
- enum nl80211_mesh_power_mode *out)
-{
- u32 val = nla_get_u32(nla);
- if (val < min || val > max)
- return -EINVAL;
- *out = val;
- return 0;
-}
-
static int nl80211_parse_mesh_config(struct genl_info *info,
struct mesh_config *cfg,
u32 *mask_out)
@@ -6140,13 +6203,12 @@ static int nl80211_parse_mesh_config(struct genl_info *info,
u32 mask = 0;
u16 ht_opmode;
-#define FILL_IN_MESH_PARAM_IF_SET(tb, cfg, param, min, max, mask, attr, fn) \
-do { \
- if (tb[attr]) { \
- if (fn(tb[attr], min, max, &cfg->param)) \
- return -EINVAL; \
- mask |= (1 << (attr - 1)); \
- } \
+#define FILL_IN_MESH_PARAM_IF_SET(tb, cfg, param, mask, attr, fn) \
+do { \
+ if (tb[attr]) { \
+ cfg->param = fn(tb[attr]); \
+ mask |= BIT((attr) - 1); \
+ } \
} while (0)
if (!info->attrs[NL80211_ATTR_MESH_CONFIG])
@@ -6161,75 +6223,73 @@ do { \
BUILD_BUG_ON(NL80211_MESHCONF_ATTR_MAX > 32);
/* Fill in the params struct */
- FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshRetryTimeout, 1, 255,
- mask, NL80211_MESHCONF_RETRY_TIMEOUT,
- nl80211_check_u16);
- FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshConfirmTimeout, 1, 255,
- mask, NL80211_MESHCONF_CONFIRM_TIMEOUT,
- nl80211_check_u16);
- FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHoldingTimeout, 1, 255,
- mask, NL80211_MESHCONF_HOLDING_TIMEOUT,
- nl80211_check_u16);
- FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshMaxPeerLinks, 0, 255,
- mask, NL80211_MESHCONF_MAX_PEER_LINKS,
- nl80211_check_u16);
- FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshMaxRetries, 0, 16,
- mask, NL80211_MESHCONF_MAX_RETRIES,
- nl80211_check_u8);
- FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshTTL, 1, 255,
- mask, NL80211_MESHCONF_TTL, nl80211_check_u8);
- FILL_IN_MESH_PARAM_IF_SET(tb, cfg, element_ttl, 1, 255,
- mask, NL80211_MESHCONF_ELEMENT_TTL,
- nl80211_check_u8);
- FILL_IN_MESH_PARAM_IF_SET(tb, cfg, auto_open_plinks, 0, 1,
- mask, NL80211_MESHCONF_AUTO_OPEN_PLINKS,
- nl80211_check_bool);
+ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshRetryTimeout, mask,
+ NL80211_MESHCONF_RETRY_TIMEOUT, nla_get_u16);
+ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshConfirmTimeout, mask,
+ NL80211_MESHCONF_CONFIRM_TIMEOUT,
+ nla_get_u16);
+ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHoldingTimeout, mask,
+ NL80211_MESHCONF_HOLDING_TIMEOUT,
+ nla_get_u16);
+ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshMaxPeerLinks, mask,
+ NL80211_MESHCONF_MAX_PEER_LINKS,
+ nla_get_u16);
+ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshMaxRetries, mask,
+ NL80211_MESHCONF_MAX_RETRIES, nla_get_u8);
+ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshTTL, mask,
+ NL80211_MESHCONF_TTL, nla_get_u8);
+ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, element_ttl, mask,
+ NL80211_MESHCONF_ELEMENT_TTL, nla_get_u8);
+ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, auto_open_plinks, mask,
+ NL80211_MESHCONF_AUTO_OPEN_PLINKS,
+ nla_get_u8);
FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshNbrOffsetMaxNeighbor,
- 1, 255, mask,
+ mask,
NL80211_MESHCONF_SYNC_OFFSET_MAX_NEIGHBOR,
- nl80211_check_u32);
- FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPmaxPREQretries, 0, 255,
- mask, NL80211_MESHCONF_HWMP_MAX_PREQ_RETRIES,
- nl80211_check_u8);
- FILL_IN_MESH_PARAM_IF_SET(tb, cfg, path_refresh_time, 1, 65535,
- mask, NL80211_MESHCONF_PATH_REFRESH_TIME,
- nl80211_check_u32);
- FILL_IN_MESH_PARAM_IF_SET(tb, cfg, min_discovery_timeout, 1, 65535,
- mask, NL80211_MESHCONF_MIN_DISCOVERY_TIMEOUT,
- nl80211_check_u16);
+ nla_get_u32);
+ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPmaxPREQretries, mask,
+ NL80211_MESHCONF_HWMP_MAX_PREQ_RETRIES,
+ nla_get_u8);
+ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, path_refresh_time, mask,
+ NL80211_MESHCONF_PATH_REFRESH_TIME,
+ nla_get_u32);
+ if (mask & BIT(NL80211_MESHCONF_PATH_REFRESH_TIME) &&
+ (cfg->path_refresh_time < 1 || cfg->path_refresh_time > 65535))
+ return -EINVAL;
+ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, min_discovery_timeout, mask,
+ NL80211_MESHCONF_MIN_DISCOVERY_TIMEOUT,
+ nla_get_u16);
FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPactivePathTimeout,
- 1, 65535, mask,
+ mask,
NL80211_MESHCONF_HWMP_ACTIVE_PATH_TIMEOUT,
- nl80211_check_u32);
- FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPpreqMinInterval,
- 1, 65535, mask,
+ nla_get_u32);
+ if (mask & BIT(NL80211_MESHCONF_HWMP_ACTIVE_PATH_TIMEOUT) &&
+ (cfg->dot11MeshHWMPactivePathTimeout < 1 ||
+ cfg->dot11MeshHWMPactivePathTimeout > 65535))
+ return -EINVAL;
+ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPpreqMinInterval, mask,
NL80211_MESHCONF_HWMP_PREQ_MIN_INTERVAL,
- nl80211_check_u16);
- FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPperrMinInterval,
- 1, 65535, mask,
+ nla_get_u16);
+ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPperrMinInterval, mask,
NL80211_MESHCONF_HWMP_PERR_MIN_INTERVAL,
- nl80211_check_u16);
+ nla_get_u16);
FILL_IN_MESH_PARAM_IF_SET(tb, cfg,
- dot11MeshHWMPnetDiameterTraversalTime,
- 1, 65535, mask,
+ dot11MeshHWMPnetDiameterTraversalTime, mask,
NL80211_MESHCONF_HWMP_NET_DIAM_TRVS_TIME,
- nl80211_check_u16);
- FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPRootMode, 0, 4,
- mask, NL80211_MESHCONF_HWMP_ROOTMODE,
- nl80211_check_u8);
- FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPRannInterval, 1, 65535,
- mask, NL80211_MESHCONF_HWMP_RANN_INTERVAL,
- nl80211_check_u16);
- FILL_IN_MESH_PARAM_IF_SET(tb, cfg,
- dot11MeshGateAnnouncementProtocol, 0, 1,
+ nla_get_u16);
+ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPRootMode, mask,
+ NL80211_MESHCONF_HWMP_ROOTMODE, nla_get_u8);
+ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPRannInterval, mask,
+ NL80211_MESHCONF_HWMP_RANN_INTERVAL,
+ nla_get_u16);
+ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshGateAnnouncementProtocol,
mask, NL80211_MESHCONF_GATE_ANNOUNCEMENTS,
- nl80211_check_bool);
- FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshForwarding, 0, 1,
- mask, NL80211_MESHCONF_FORWARDING,
- nl80211_check_bool);
- FILL_IN_MESH_PARAM_IF_SET(tb, cfg, rssi_threshold, -255, 0,
- mask, NL80211_MESHCONF_RSSI_THRESHOLD,
- nl80211_check_s32);
+ nla_get_u8);
+ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshForwarding, mask,
+ NL80211_MESHCONF_FORWARDING, nla_get_u8);
+ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, rssi_threshold, mask,
+ NL80211_MESHCONF_RSSI_THRESHOLD,
+ nla_get_s32);
/*
* Check HT operation mode based on
* IEEE 802.11-2016 9.4.2.57 HT Operation element.
@@ -6248,29 +6308,27 @@ do { \
cfg->ht_opmode = ht_opmode;
mask |= (1 << (NL80211_MESHCONF_HT_OPMODE - 1));
}
- FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPactivePathToRootTimeout,
- 1, 65535, mask,
- NL80211_MESHCONF_HWMP_PATH_TO_ROOT_TIMEOUT,
- nl80211_check_u32);
- FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMProotInterval, 1, 65535,
- mask, NL80211_MESHCONF_HWMP_ROOT_INTERVAL,
- nl80211_check_u16);
FILL_IN_MESH_PARAM_IF_SET(tb, cfg,
- dot11MeshHWMPconfirmationInterval,
- 1, 65535, mask,
+ dot11MeshHWMPactivePathToRootTimeout, mask,
+ NL80211_MESHCONF_HWMP_PATH_TO_ROOT_TIMEOUT,
+ nla_get_u32);
+ if (mask & BIT(NL80211_MESHCONF_HWMP_PATH_TO_ROOT_TIMEOUT) &&
+ (cfg->dot11MeshHWMPactivePathToRootTimeout < 1 ||
+ cfg->dot11MeshHWMPactivePathToRootTimeout > 65535))
+ return -EINVAL;
+ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMProotInterval, mask,
+ NL80211_MESHCONF_HWMP_ROOT_INTERVAL,
+ nla_get_u16);
+ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPconfirmationInterval,
+ mask,
NL80211_MESHCONF_HWMP_CONFIRMATION_INTERVAL,
- nl80211_check_u16);
- FILL_IN_MESH_PARAM_IF_SET(tb, cfg, power_mode,
- NL80211_MESH_POWER_ACTIVE,
- NL80211_MESH_POWER_MAX,
- mask, NL80211_MESHCONF_POWER_MODE,
- nl80211_check_power_mode);
- FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshAwakeWindowDuration,
- 0, 65535, mask,
- NL80211_MESHCONF_AWAKE_WINDOW, nl80211_check_u16);
- FILL_IN_MESH_PARAM_IF_SET(tb, cfg, plink_timeout, 0, 0xffffffff,
- mask, NL80211_MESHCONF_PLINK_TIMEOUT,
- nl80211_check_u32);
+ nla_get_u16);
+ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, power_mode, mask,
+ NL80211_MESHCONF_POWER_MODE, nla_get_u32);
+ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshAwakeWindowDuration, mask,
+ NL80211_MESHCONF_AWAKE_WINDOW, nla_get_u16);
+ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, plink_timeout, mask,
+ NL80211_MESHCONF_PLINK_TIMEOUT, nla_get_u32);
if (mask_out)
*mask_out = mask;
@@ -6313,8 +6371,6 @@ static int nl80211_parse_mesh_setup(struct genl_info *info,
if (tb[NL80211_MESH_SETUP_IE]) {
struct nlattr *ieattr =
tb[NL80211_MESH_SETUP_IE];
- if (!is_valid_ie_attr(ieattr))
- return -EINVAL;
setup->ie = nla_data(ieattr);
setup->ie_len = nla_len(ieattr);
}
@@ -6849,6 +6905,16 @@ static bool cfg80211_off_channel_oper_allowed(struct wireless_dev *wdev)
return regulatory_pre_cac_allowed(wdev->wiphy);
}
+static bool nl80211_check_scan_feat(struct wiphy *wiphy, u32 flags, u32 flag,
+ enum nl80211_ext_feature_index feat)
+{
+ if (!(flags & flag))
+ return true;
+ if (wiphy_ext_feature_isset(wiphy, feat))
+ return true;
+ return false;
+}
+
static int
nl80211_check_scan_flags(struct wiphy *wiphy, struct wireless_dev *wdev,
void *request, struct nlattr **attrs,
@@ -6883,15 +6949,33 @@ nl80211_check_scan_flags(struct wiphy *wiphy, struct wireless_dev *wdev,
if (((*flags & NL80211_SCAN_FLAG_LOW_PRIORITY) &&
!(wiphy->features & NL80211_FEATURE_LOW_PRIORITY_SCAN)) ||
- ((*flags & NL80211_SCAN_FLAG_LOW_SPAN) &&
- !wiphy_ext_feature_isset(wiphy,
- NL80211_EXT_FEATURE_LOW_SPAN_SCAN)) ||
- ((*flags & NL80211_SCAN_FLAG_LOW_POWER) &&
- !wiphy_ext_feature_isset(wiphy,
- NL80211_EXT_FEATURE_LOW_POWER_SCAN)) ||
- ((*flags & NL80211_SCAN_FLAG_HIGH_ACCURACY) &&
- !wiphy_ext_feature_isset(wiphy,
- NL80211_EXT_FEATURE_HIGH_ACCURACY_SCAN)))
+ !nl80211_check_scan_feat(wiphy, *flags,
+ NL80211_SCAN_FLAG_LOW_SPAN,
+ NL80211_EXT_FEATURE_LOW_SPAN_SCAN) ||
+ !nl80211_check_scan_feat(wiphy, *flags,
+ NL80211_SCAN_FLAG_LOW_POWER,
+ NL80211_EXT_FEATURE_LOW_POWER_SCAN) ||
+ !nl80211_check_scan_feat(wiphy, *flags,
+ NL80211_SCAN_FLAG_HIGH_ACCURACY,
+ NL80211_EXT_FEATURE_HIGH_ACCURACY_SCAN) ||
+ !nl80211_check_scan_feat(wiphy, *flags,
+ NL80211_SCAN_FLAG_FILS_MAX_CHANNEL_TIME,
+ NL80211_EXT_FEATURE_FILS_MAX_CHANNEL_TIME) ||
+ !nl80211_check_scan_feat(wiphy, *flags,
+ NL80211_SCAN_FLAG_ACCEPT_BCAST_PROBE_RESP,
+ NL80211_EXT_FEATURE_ACCEPT_BCAST_PROBE_RESP) ||
+ !nl80211_check_scan_feat(wiphy, *flags,
+ NL80211_SCAN_FLAG_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION,
+ NL80211_EXT_FEATURE_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION) ||
+ !nl80211_check_scan_feat(wiphy, *flags,
+ NL80211_SCAN_FLAG_OCE_PROBE_REQ_HIGH_TX_RATE,
+ NL80211_EXT_FEATURE_OCE_PROBE_REQ_HIGH_TX_RATE) ||
+ !nl80211_check_scan_feat(wiphy, *flags,
+ NL80211_SCAN_FLAG_RANDOM_SN,
+ NL80211_EXT_FEATURE_SCAN_RANDOM_SN) ||
+ !nl80211_check_scan_feat(wiphy, *flags,
+ NL80211_SCAN_FLAG_MIN_PREQ_CONTENT,
+ NL80211_EXT_FEATURE_SCAN_MIN_PREQ_CONTENT))
return -EOPNOTSUPP;
if (*flags & NL80211_SCAN_FLAG_RANDOM_ADDR) {
@@ -6906,26 +6990,6 @@ nl80211_check_scan_flags(struct wiphy *wiphy, struct wireless_dev *wdev,
return err;
}
- if ((*flags & NL80211_SCAN_FLAG_FILS_MAX_CHANNEL_TIME) &&
- !wiphy_ext_feature_isset(wiphy,
- NL80211_EXT_FEATURE_FILS_MAX_CHANNEL_TIME))
- return -EOPNOTSUPP;
-
- if ((*flags & NL80211_SCAN_FLAG_ACCEPT_BCAST_PROBE_RESP) &&
- !wiphy_ext_feature_isset(wiphy,
- NL80211_EXT_FEATURE_ACCEPT_BCAST_PROBE_RESP))
- return -EOPNOTSUPP;
-
- if ((*flags & NL80211_SCAN_FLAG_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION) &&
- !wiphy_ext_feature_isset(wiphy,
- NL80211_EXT_FEATURE_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION))
- return -EOPNOTSUPP;
-
- if ((*flags & NL80211_SCAN_FLAG_OCE_PROBE_REQ_HIGH_TX_RATE) &&
- !wiphy_ext_feature_isset(wiphy,
- NL80211_EXT_FEATURE_OCE_PROBE_REQ_HIGH_TX_RATE))
- return -EOPNOTSUPP;
-
return 0;
}
@@ -6939,9 +7003,6 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
int err, tmp, n_ssids = 0, n_channels, i;
size_t ie_len;
- if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE]))
- return -EINVAL;
-
wiphy = &rdev->wiphy;
if (wdev->iftype == NL80211_IFTYPE_NAN)
@@ -7295,9 +7356,6 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev,
struct nlattr *tb[NL80211_SCHED_SCAN_MATCH_ATTR_MAX + 1];
s32 default_match_rssi = NL80211_SCAN_RSSI_THOLD_OFF;
- if (!is_valid_ie_attr(attrs[NL80211_ATTR_IE]))
- return ERR_PTR(-EINVAL);
-
if (attrs[NL80211_ATTR_SCAN_FREQUENCIES]) {
n_channels = validate_scan_freqs(
attrs[NL80211_ATTR_SCAN_FREQUENCIES]);
@@ -7657,7 +7715,7 @@ static int nl80211_start_sched_scan(struct sk_buff *skb,
*/
if (want_multi && rdev->wiphy.max_sched_scan_reqs > 1) {
while (!sched_scan_req->reqid)
- sched_scan_req->reqid = rdev->wiphy.cookie_counter++;
+ sched_scan_req->reqid = cfg80211_assign_cookie(rdev);
}
err = rdev_sched_scan_start(rdev, dev, sched_scan_req);
@@ -7833,7 +7891,7 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info)
if (!need_new_beacon)
goto skip_beacons;
- err = nl80211_parse_beacon(info->attrs, &params.beacon_after);
+ err = nl80211_parse_beacon(rdev, info->attrs, &params.beacon_after);
if (err)
return err;
@@ -7843,7 +7901,7 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info)
if (err)
return err;
- err = nl80211_parse_beacon(csa_attrs, &params.beacon_csa);
+ err = nl80211_parse_beacon(rdev, csa_attrs, &params.beacon_csa);
if (err)
return err;
@@ -8080,7 +8138,7 @@ static int nl80211_dump_scan(struct sk_buff *skb, struct netlink_callback *cb)
int err;
rtnl_lock();
- err = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev);
+ err = nl80211_prepare_wdev_dump(cb, &rdev, &wdev);
if (err) {
rtnl_unlock();
return err;
@@ -8201,7 +8259,7 @@ static int nl80211_dump_survey(struct sk_buff *skb, struct netlink_callback *cb)
bool radio_stats;
rtnl_lock();
- res = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev);
+ res = nl80211_prepare_wdev_dump(cb, &rdev, &wdev);
if (res)
goto out_err;
@@ -8265,9 +8323,6 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info)
struct key_parse key;
bool local_state_change;
- if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE]))
- return -EINVAL;
-
if (!info->attrs[NL80211_ATTR_MAC])
return -EINVAL;
@@ -8506,9 +8561,6 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info)
dev->ieee80211_ptr->conn_owner_nlportid != info->snd_portid)
return -EPERM;
- if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE]))
- return -EINVAL;
-
if (!info->attrs[NL80211_ATTR_MAC] ||
!info->attrs[NL80211_ATTR_SSID] ||
!info->attrs[NL80211_ATTR_WIPHY_FREQ])
@@ -8632,9 +8684,6 @@ static int nl80211_deauthenticate(struct sk_buff *skb, struct genl_info *info)
dev->ieee80211_ptr->conn_owner_nlportid != info->snd_portid)
return -EPERM;
- if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE]))
- return -EINVAL;
-
if (!info->attrs[NL80211_ATTR_MAC])
return -EINVAL;
@@ -8683,9 +8732,6 @@ static int nl80211_disassociate(struct sk_buff *skb, struct genl_info *info)
dev->ieee80211_ptr->conn_owner_nlportid != info->snd_portid)
return -EPERM;
- if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE]))
- return -EINVAL;
-
if (!info->attrs[NL80211_ATTR_MAC])
return -EINVAL;
@@ -8760,9 +8806,6 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info)
memset(&ibss, 0, sizeof(ibss));
- if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE]))
- return -EINVAL;
-
if (!info->attrs[NL80211_ATTR_SSID] ||
!nla_len(info->attrs[NL80211_ATTR_SSID]))
return -EINVAL;
@@ -9200,9 +9243,6 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info)
memset(&connect, 0, sizeof(connect));
- if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE]))
- return -EINVAL;
-
if (!info->attrs[NL80211_ATTR_SSID] ||
!nla_len(info->attrs[NL80211_ATTR_SSID]))
return -EINVAL;
@@ -9261,11 +9301,6 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info)
!wiphy_ext_feature_isset(&rdev->wiphy,
NL80211_EXT_FEATURE_MFP_OPTIONAL))
return -EOPNOTSUPP;
-
- if (connect.mfp != NL80211_MFP_REQUIRED &&
- connect.mfp != NL80211_MFP_NO &&
- connect.mfp != NL80211_MFP_OPTIONAL)
- return -EINVAL;
} else {
connect.mfp = NL80211_MFP_NO;
}
@@ -9438,8 +9473,6 @@ static int nl80211_update_connect_params(struct sk_buff *skb,
return -EOPNOTSUPP;
if (info->attrs[NL80211_ATTR_IE]) {
- if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE]))
- return -EINVAL;
connect.ie = nla_data(info->attrs[NL80211_ATTR_IE]);
connect.ie_len = nla_len(info->attrs[NL80211_ATTR_IE]);
changed |= UPDATE_ASSOC_IES;
@@ -10024,9 +10057,6 @@ static int nl80211_set_power_save(struct sk_buff *skb, struct genl_info *info)
ps_state = nla_get_u32(info->attrs[NL80211_ATTR_PS_STATE]);
- if (ps_state != NL80211_PS_DISABLED && ps_state != NL80211_PS_ENABLED)
- return -EINVAL;
-
wdev = dev->ieee80211_ptr;
if (!rdev->ops->set_power_mgmt)
@@ -10124,7 +10154,7 @@ static int cfg80211_cqm_rssi_update(struct cfg80211_registered_device *rdev,
struct wireless_dev *wdev = dev->ieee80211_ptr;
s32 last, low, high;
u32 hyst;
- int i, n;
+ int i, n, low_index;
int err;
/* RSSI reporting disabled? */
@@ -10148,7 +10178,7 @@ static int cfg80211_cqm_rssi_update(struct cfg80211_registered_device *rdev,
if (err)
return err;
- if (sinfo.filled & BIT(NL80211_STA_INFO_BEACON_SIGNAL_AVG))
+ if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_BEACON_SIGNAL_AVG))
wdev->cqm_config->last_rssi_event_value =
(s8) sinfo.rx_beacon_signal_avg;
}
@@ -10161,10 +10191,19 @@ static int cfg80211_cqm_rssi_update(struct cfg80211_registered_device *rdev,
if (last < wdev->cqm_config->rssi_thresholds[i])
break;
- low = i > 0 ?
- (wdev->cqm_config->rssi_thresholds[i - 1] - hyst) : S32_MIN;
- high = i < n ?
- (wdev->cqm_config->rssi_thresholds[i] + hyst - 1) : S32_MAX;
+ low_index = i - 1;
+ if (low_index >= 0) {
+ low_index = array_index_nospec(low_index, n);
+ low = wdev->cqm_config->rssi_thresholds[low_index] - hyst;
+ } else {
+ low = S32_MIN;
+ }
+ if (i < n) {
+ i = array_index_nospec(i, n);
+ high = wdev->cqm_config->rssi_thresholds[i] + hyst - 1;
+ } else {
+ high = S32_MAX;
+ }
return rdev_set_cqm_rssi_range_config(rdev, dev, low, high);
}
@@ -10580,8 +10619,7 @@ static int nl80211_send_wowlan_nd(struct sk_buff *msg,
if (!scan_plan)
return -ENOBUFS;
- if (!scan_plan ||
- nla_put_u32(msg, NL80211_SCHED_SCAN_PLAN_INTERVAL,
+ if (nla_put_u32(msg, NL80211_SCHED_SCAN_PLAN_INTERVAL,
req->scan_plans[i].interval) ||
(req->scan_plans[i].iterations &&
nla_put_u32(msg, NL80211_SCHED_SCAN_PLAN_ITERATIONS,
@@ -11179,9 +11217,6 @@ static int nl80211_parse_coalesce_rule(struct cfg80211_registered_device *rdev,
if (tb[NL80211_ATTR_COALESCE_RULE_CONDITION])
new_rule->condition =
nla_get_u32(tb[NL80211_ATTR_COALESCE_RULE_CONDITION]);
- if (new_rule->condition != NL80211_COALESCE_CONDITION_MATCH &&
- new_rule->condition != NL80211_COALESCE_CONDITION_NO_MATCH)
- return -EINVAL;
if (!tb[NL80211_ATTR_COALESCE_RULE_PKT_PATTERN])
return -EINVAL;
@@ -11534,8 +11569,6 @@ static int nl80211_start_nan(struct sk_buff *skb, struct genl_info *info)
conf.master_pref =
nla_get_u8(info->attrs[NL80211_ATTR_NAN_MASTER_PREF]);
- if (!conf.master_pref)
- return -EINVAL;
if (info->attrs[NL80211_ATTR_BANDS]) {
u32 bands = nla_get_u32(info->attrs[NL80211_ATTR_BANDS]);
@@ -11653,7 +11686,7 @@ static int nl80211_nan_add_func(struct sk_buff *skb,
if (!func)
return -ENOMEM;
- func->cookie = wdev->wiphy->cookie_counter++;
+ func->cookie = cfg80211_assign_cookie(rdev);
if (!tb[NL80211_NAN_FUNC_TYPE] ||
nla_get_u8(tb[NL80211_NAN_FUNC_TYPE]) > NL80211_NAN_FUNC_MAX_TYPE) {
@@ -12099,7 +12132,7 @@ static int nl80211_update_ft_ies(struct sk_buff *skb, struct genl_info *info)
return -EOPNOTSUPP;
if (!info->attrs[NL80211_ATTR_MDID] ||
- !is_valid_ie_attr(info->attrs[NL80211_ATTR_IE]))
+ !info->attrs[NL80211_ATTR_IE])
return -EINVAL;
memset(&ft_params, 0, sizeof(ft_params));
@@ -12519,12 +12552,7 @@ static int nl80211_add_tx_ts(struct sk_buff *skb, struct genl_info *info)
return -EINVAL;
tsid = nla_get_u8(info->attrs[NL80211_ATTR_TSID]);
- if (tsid >= IEEE80211_NUM_TIDS)
- return -EINVAL;
-
up = nla_get_u8(info->attrs[NL80211_ATTR_USER_PRIO]);
- if (up >= IEEE80211_NUM_UPS)
- return -EINVAL;
/* WMM uses TIDs 0-7 even for TSPEC */
if (tsid >= IEEE80211_FIRST_TSPEC_TSID) {
@@ -12882,6 +12910,76 @@ static int nl80211_tx_control_port(struct sk_buff *skb, struct genl_info *info)
return err;
}
+static int nl80211_get_ftm_responder_stats(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct cfg80211_ftm_responder_stats ftm_stats = {};
+ struct sk_buff *msg;
+ void *hdr;
+ struct nlattr *ftm_stats_attr;
+ int err;
+
+ if (wdev->iftype != NL80211_IFTYPE_AP || !wdev->beacon_interval)
+ return -EOPNOTSUPP;
+
+ err = rdev_get_ftm_responder_stats(rdev, dev, &ftm_stats);
+ if (err)
+ return err;
+
+ if (!ftm_stats.filled)
+ return -ENODATA;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ hdr = nl80211hdr_put(msg, info->snd_portid, info->snd_seq, 0,
+ NL80211_CMD_GET_FTM_RESPONDER_STATS);
+ if (!hdr)
+ return -ENOBUFS;
+
+ if (nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex))
+ goto nla_put_failure;
+
+ ftm_stats_attr = nla_nest_start(msg, NL80211_ATTR_FTM_RESPONDER_STATS);
+ if (!ftm_stats_attr)
+ goto nla_put_failure;
+
+#define SET_FTM(field, name, type) \
+ do { if ((ftm_stats.filled & BIT(NL80211_FTM_STATS_ ## name)) && \
+ nla_put_ ## type(msg, NL80211_FTM_STATS_ ## name, \
+ ftm_stats.field)) \
+ goto nla_put_failure; } while (0)
+#define SET_FTM_U64(field, name) \
+ do { if ((ftm_stats.filled & BIT(NL80211_FTM_STATS_ ## name)) && \
+ nla_put_u64_64bit(msg, NL80211_FTM_STATS_ ## name, \
+ ftm_stats.field, NL80211_FTM_STATS_PAD)) \
+ goto nla_put_failure; } while (0)
+
+ SET_FTM(success_num, SUCCESS_NUM, u32);
+ SET_FTM(partial_num, PARTIAL_NUM, u32);
+ SET_FTM(failed_num, FAILED_NUM, u32);
+ SET_FTM(asap_num, ASAP_NUM, u32);
+ SET_FTM(non_asap_num, NON_ASAP_NUM, u32);
+ SET_FTM_U64(total_duration_ms, TOTAL_DURATION_MSEC);
+ SET_FTM(unknown_triggers_num, UNKNOWN_TRIGGERS_NUM, u32);
+ SET_FTM(reschedule_requests_num, RESCHEDULE_REQUESTS_NUM, u32);
+ SET_FTM(out_of_window_triggers_num, OUT_OF_WINDOW_TRIGGERS_NUM, u32);
+#undef SET_FTM
+
+ nla_nest_end(msg, ftm_stats_attr);
+
+ genlmsg_end(msg, hdr);
+ return genlmsg_reply(msg, info);
+
+nla_put_failure:
+ nlmsg_free(msg);
+ return -ENOBUFS;
+}
+
#define NL80211_FLAG_NEED_WIPHY 0x01
#define NL80211_FLAG_NEED_NETDEV 0x02
#define NL80211_FLAG_NEED_RTNL 0x04
@@ -13793,6 +13891,13 @@ static const struct genl_ops nl80211_ops[] = {
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
NL80211_FLAG_NEED_RTNL,
},
+ {
+ .cmd = NL80211_CMD_GET_FTM_RESPONDER_STATS,
+ .doit = nl80211_get_ftm_responder_stats,
+ .policy = nl80211_policy,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV |
+ NL80211_FLAG_NEED_RTNL,
+ },
};
static struct genl_family nl80211_fam __ro_after_init = {
diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h
index 364f5d67f05b..51380b5c32f2 100644
--- a/net/wireless/rdev-ops.h
+++ b/net/wireless/rdev-ops.h
@@ -1232,4 +1232,19 @@ rdev_external_auth(struct cfg80211_registered_device *rdev,
return ret;
}
+static inline int
+rdev_get_ftm_responder_stats(struct cfg80211_registered_device *rdev,
+ struct net_device *dev,
+ struct cfg80211_ftm_responder_stats *ftm_stats)
+{
+ int ret = -EOPNOTSUPP;
+
+ trace_rdev_get_ftm_responder_stats(&rdev->wiphy, dev, ftm_stats);
+ if (rdev->ops->get_ftm_responder_stats)
+ ret = rdev->ops->get_ftm_responder_stats(&rdev->wiphy, dev,
+ ftm_stats);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
#endif /* __CFG80211_RDEV_OPS */
diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index 4fc66a117b7d..ecfb1a06dbb2 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -425,36 +425,23 @@ static const struct ieee80211_regdomain *
reg_copy_regd(const struct ieee80211_regdomain *src_regd)
{
struct ieee80211_regdomain *regd;
- int size_of_regd, size_of_wmms;
+ int size_of_regd;
unsigned int i;
- struct ieee80211_wmm_rule *d_wmm, *s_wmm;
size_of_regd =
sizeof(struct ieee80211_regdomain) +
src_regd->n_reg_rules * sizeof(struct ieee80211_reg_rule);
- size_of_wmms = src_regd->n_wmm_rules *
- sizeof(struct ieee80211_wmm_rule);
- regd = kzalloc(size_of_regd + size_of_wmms, GFP_KERNEL);
+ regd = kzalloc(size_of_regd, GFP_KERNEL);
if (!regd)
return ERR_PTR(-ENOMEM);
memcpy(regd, src_regd, sizeof(struct ieee80211_regdomain));
- d_wmm = (struct ieee80211_wmm_rule *)((u8 *)regd + size_of_regd);
- s_wmm = (struct ieee80211_wmm_rule *)((u8 *)src_regd + size_of_regd);
- memcpy(d_wmm, s_wmm, size_of_wmms);
-
- for (i = 0; i < src_regd->n_reg_rules; i++) {
+ for (i = 0; i < src_regd->n_reg_rules; i++)
memcpy(&regd->reg_rules[i], &src_regd->reg_rules[i],
sizeof(struct ieee80211_reg_rule));
- if (!src_regd->reg_rules[i].wmm_rule)
- continue;
- regd->reg_rules[i].wmm_rule = d_wmm +
- (src_regd->reg_rules[i].wmm_rule - s_wmm) /
- sizeof(struct ieee80211_wmm_rule);
- }
return regd;
}
@@ -860,27 +847,44 @@ static bool valid_regdb(const u8 *data, unsigned int size)
return true;
}
-static void set_wmm_rule(struct ieee80211_wmm_rule *rule,
- struct fwdb_wmm_rule *wmm)
+static void set_wmm_rule(const struct fwdb_header *db,
+ const struct fwdb_country *country,
+ const struct fwdb_rule *rule,
+ struct ieee80211_reg_rule *rrule)
{
- unsigned int i;
+ struct ieee80211_wmm_rule *wmm_rule = &rrule->wmm_rule;
+ struct fwdb_wmm_rule *wmm;
+ unsigned int i, wmm_ptr;
+
+ wmm_ptr = be16_to_cpu(rule->wmm_ptr) << 2;
+ wmm = (void *)((u8 *)db + wmm_ptr);
+
+ if (!valid_wmm(wmm)) {
+ pr_err("Invalid regulatory WMM rule %u-%u in domain %c%c\n",
+ be32_to_cpu(rule->start), be32_to_cpu(rule->end),
+ country->alpha2[0], country->alpha2[1]);
+ return;
+ }
for (i = 0; i < IEEE80211_NUM_ACS; i++) {
- rule->client[i].cw_min =
+ wmm_rule->client[i].cw_min =
ecw2cw((wmm->client[i].ecw & 0xf0) >> 4);
- rule->client[i].cw_max = ecw2cw(wmm->client[i].ecw & 0x0f);
- rule->client[i].aifsn = wmm->client[i].aifsn;
- rule->client[i].cot = 1000 * be16_to_cpu(wmm->client[i].cot);
- rule->ap[i].cw_min = ecw2cw((wmm->ap[i].ecw & 0xf0) >> 4);
- rule->ap[i].cw_max = ecw2cw(wmm->ap[i].ecw & 0x0f);
- rule->ap[i].aifsn = wmm->ap[i].aifsn;
- rule->ap[i].cot = 1000 * be16_to_cpu(wmm->ap[i].cot);
+ wmm_rule->client[i].cw_max = ecw2cw(wmm->client[i].ecw & 0x0f);
+ wmm_rule->client[i].aifsn = wmm->client[i].aifsn;
+ wmm_rule->client[i].cot =
+ 1000 * be16_to_cpu(wmm->client[i].cot);
+ wmm_rule->ap[i].cw_min = ecw2cw((wmm->ap[i].ecw & 0xf0) >> 4);
+ wmm_rule->ap[i].cw_max = ecw2cw(wmm->ap[i].ecw & 0x0f);
+ wmm_rule->ap[i].aifsn = wmm->ap[i].aifsn;
+ wmm_rule->ap[i].cot = 1000 * be16_to_cpu(wmm->ap[i].cot);
}
+
+ rrule->has_wmm = true;
}
static int __regdb_query_wmm(const struct fwdb_header *db,
const struct fwdb_country *country, int freq,
- u32 *dbptr, struct ieee80211_wmm_rule *rule)
+ struct ieee80211_reg_rule *rrule)
{
unsigned int ptr = be16_to_cpu(country->coll_ptr) << 2;
struct fwdb_collection *coll = (void *)((u8 *)db + ptr);
@@ -889,20 +893,14 @@ static int __regdb_query_wmm(const struct fwdb_header *db,
for (i = 0; i < coll->n_rules; i++) {
__be16 *rules_ptr = (void *)((u8 *)coll + ALIGN(coll->len, 2));
unsigned int rule_ptr = be16_to_cpu(rules_ptr[i]) << 2;
- struct fwdb_rule *rrule = (void *)((u8 *)db + rule_ptr);
- struct fwdb_wmm_rule *wmm;
- unsigned int wmm_ptr;
+ struct fwdb_rule *rule = (void *)((u8 *)db + rule_ptr);
- if (rrule->len < offsetofend(struct fwdb_rule, wmm_ptr))
+ if (rule->len < offsetofend(struct fwdb_rule, wmm_ptr))
continue;
- if (freq >= KHZ_TO_MHZ(be32_to_cpu(rrule->start)) &&
- freq <= KHZ_TO_MHZ(be32_to_cpu(rrule->end))) {
- wmm_ptr = be16_to_cpu(rrule->wmm_ptr) << 2;
- wmm = (void *)((u8 *)db + wmm_ptr);
- set_wmm_rule(rule, wmm);
- if (dbptr)
- *dbptr = wmm_ptr;
+ if (freq >= KHZ_TO_MHZ(be32_to_cpu(rule->start)) &&
+ freq <= KHZ_TO_MHZ(be32_to_cpu(rule->end))) {
+ set_wmm_rule(db, country, rule, rrule);
return 0;
}
}
@@ -910,8 +908,7 @@ static int __regdb_query_wmm(const struct fwdb_header *db,
return -ENODATA;
}
-int reg_query_regdb_wmm(char *alpha2, int freq, u32 *dbptr,
- struct ieee80211_wmm_rule *rule)
+int reg_query_regdb_wmm(char *alpha2, int freq, struct ieee80211_reg_rule *rule)
{
const struct fwdb_header *hdr = regdb;
const struct fwdb_country *country;
@@ -925,8 +922,7 @@ int reg_query_regdb_wmm(char *alpha2, int freq, u32 *dbptr,
country = &hdr->country[0];
while (country->coll_ptr) {
if (alpha2_equal(alpha2, country->alpha2))
- return __regdb_query_wmm(regdb, country, freq, dbptr,
- rule);
+ return __regdb_query_wmm(regdb, country, freq, rule);
country++;
}
@@ -935,32 +931,13 @@ int reg_query_regdb_wmm(char *alpha2, int freq, u32 *dbptr,
}
EXPORT_SYMBOL(reg_query_regdb_wmm);
-struct wmm_ptrs {
- struct ieee80211_wmm_rule *rule;
- u32 ptr;
-};
-
-static struct ieee80211_wmm_rule *find_wmm_ptr(struct wmm_ptrs *wmm_ptrs,
- u32 wmm_ptr, int n_wmms)
-{
- int i;
-
- for (i = 0; i < n_wmms; i++) {
- if (wmm_ptrs[i].ptr == wmm_ptr)
- return wmm_ptrs[i].rule;
- }
- return NULL;
-}
-
static int regdb_query_country(const struct fwdb_header *db,
const struct fwdb_country *country)
{
unsigned int ptr = be16_to_cpu(country->coll_ptr) << 2;
struct fwdb_collection *coll = (void *)((u8 *)db + ptr);
struct ieee80211_regdomain *regdom;
- struct ieee80211_regdomain *tmp_rd;
- unsigned int size_of_regd, i, n_wmms = 0;
- struct wmm_ptrs *wmm_ptrs;
+ unsigned int size_of_regd, i;
size_of_regd = sizeof(struct ieee80211_regdomain) +
coll->n_rules * sizeof(struct ieee80211_reg_rule);
@@ -969,12 +946,6 @@ static int regdb_query_country(const struct fwdb_header *db,
if (!regdom)
return -ENOMEM;
- wmm_ptrs = kcalloc(coll->n_rules, sizeof(*wmm_ptrs), GFP_KERNEL);
- if (!wmm_ptrs) {
- kfree(regdom);
- return -ENOMEM;
- }
-
regdom->n_reg_rules = coll->n_rules;
regdom->alpha2[0] = country->alpha2[0];
regdom->alpha2[1] = country->alpha2[1];
@@ -1011,39 +982,9 @@ static int regdb_query_country(const struct fwdb_header *db,
if (rule->len >= offsetofend(struct fwdb_rule, cac_timeout))
rrule->dfs_cac_ms =
1000 * be16_to_cpu(rule->cac_timeout);
- if (rule->len >= offsetofend(struct fwdb_rule, wmm_ptr)) {
- u32 wmm_ptr = be16_to_cpu(rule->wmm_ptr) << 2;
- struct ieee80211_wmm_rule *wmm_pos =
- find_wmm_ptr(wmm_ptrs, wmm_ptr, n_wmms);
- struct fwdb_wmm_rule *wmm;
- struct ieee80211_wmm_rule *wmm_rule;
-
- if (wmm_pos) {
- rrule->wmm_rule = wmm_pos;
- continue;
- }
- wmm = (void *)((u8 *)db + wmm_ptr);
- tmp_rd = krealloc(regdom, size_of_regd + (n_wmms + 1) *
- sizeof(struct ieee80211_wmm_rule),
- GFP_KERNEL);
-
- if (!tmp_rd) {
- kfree(regdom);
- kfree(wmm_ptrs);
- return -ENOMEM;
- }
- regdom = tmp_rd;
-
- wmm_rule = (struct ieee80211_wmm_rule *)
- ((u8 *)regdom + size_of_regd + n_wmms *
- sizeof(struct ieee80211_wmm_rule));
-
- set_wmm_rule(wmm_rule, wmm);
- wmm_ptrs[n_wmms].ptr = wmm_ptr;
- wmm_ptrs[n_wmms++].rule = wmm_rule;
- }
+ if (rule->len >= offsetofend(struct fwdb_rule, wmm_ptr))
+ set_wmm_rule(db, country, rule, rrule);
}
- kfree(wmm_ptrs);
return reg_schedule_apply(regdom);
}
@@ -2726,11 +2667,12 @@ static void reg_process_hint(struct regulatory_request *reg_request)
{
struct wiphy *wiphy = NULL;
enum reg_request_treatment treatment;
+ enum nl80211_reg_initiator initiator = reg_request->initiator;
if (reg_request->wiphy_idx != WIPHY_IDX_INVALID)
wiphy = wiphy_idx_to_wiphy(reg_request->wiphy_idx);
- switch (reg_request->initiator) {
+ switch (initiator) {
case NL80211_REGDOM_SET_BY_CORE:
treatment = reg_process_hint_core(reg_request);
break;
@@ -2748,7 +2690,7 @@ static void reg_process_hint(struct regulatory_request *reg_request)
treatment = reg_process_hint_country_ie(wiphy, reg_request);
break;
default:
- WARN(1, "invalid initiator %d\n", reg_request->initiator);
+ WARN(1, "invalid initiator %d\n", initiator);
goto out_free;
}
@@ -2763,7 +2705,7 @@ static void reg_process_hint(struct regulatory_request *reg_request)
*/
if (treatment == REG_REQ_ALREADY_SET && wiphy &&
wiphy->regulatory_flags & REGULATORY_STRICT_REG) {
- wiphy_update_regulatory(wiphy, reg_request->initiator);
+ wiphy_update_regulatory(wiphy, initiator);
wiphy_all_share_dfs_chan_state(wiphy);
reg_check_channels();
}
@@ -2932,6 +2874,7 @@ static int regulatory_hint_core(const char *alpha2)
request->alpha2[0] = alpha2[0];
request->alpha2[1] = alpha2[1];
request->initiator = NL80211_REGDOM_SET_BY_CORE;
+ request->wiphy_idx = WIPHY_IDX_INVALID;
queue_regulatory_request(request);
@@ -3249,13 +3192,59 @@ static void restore_regulatory_settings(bool reset_user)
schedule_work(&reg_work);
}
+static bool is_wiphy_all_set_reg_flag(enum ieee80211_regulatory_flags flag)
+{
+ struct cfg80211_registered_device *rdev;
+ struct wireless_dev *wdev;
+
+ list_for_each_entry(rdev, &cfg80211_rdev_list, list) {
+ list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) {
+ wdev_lock(wdev);
+ if (!(wdev->wiphy->regulatory_flags & flag)) {
+ wdev_unlock(wdev);
+ return false;
+ }
+ wdev_unlock(wdev);
+ }
+ }
+
+ return true;
+}
+
void regulatory_hint_disconnect(void)
{
+ /* Restore of regulatory settings is not required when wiphy(s)
+ * ignore IE from connected access point but clearance of beacon hints
+ * is required when wiphy(s) supports beacon hints.
+ */
+ if (is_wiphy_all_set_reg_flag(REGULATORY_COUNTRY_IE_IGNORE)) {
+ struct reg_beacon *reg_beacon, *btmp;
+
+ if (is_wiphy_all_set_reg_flag(REGULATORY_DISABLE_BEACON_HINTS))
+ return;
+
+ spin_lock_bh(&reg_pending_beacons_lock);
+ list_for_each_entry_safe(reg_beacon, btmp,
+ &reg_pending_beacons, list) {
+ list_del(&reg_beacon->list);
+ kfree(reg_beacon);
+ }
+ spin_unlock_bh(&reg_pending_beacons_lock);
+
+ list_for_each_entry_safe(reg_beacon, btmp,
+ &reg_beacon_list, list) {
+ list_del(&reg_beacon->list);
+ kfree(reg_beacon);
+ }
+
+ return;
+ }
+
pr_debug("All devices are disconnected, going to restore regulatory settings\n");
restore_regulatory_settings(false);
}
-static bool freq_is_chan_12_13_14(u16 freq)
+static bool freq_is_chan_12_13_14(u32 freq)
{
if (freq == ieee80211_channel_to_frequency(12, NL80211_BAND_2GHZ) ||
freq == ieee80211_channel_to_frequency(13, NL80211_BAND_2GHZ) ||
@@ -3842,6 +3831,15 @@ static int __init regulatory_init_db(void)
{
int err;
+ /*
+ * It's possible that - due to other bugs/issues - cfg80211
+ * never called regulatory_init() below, or that it failed;
+ * in that case, don't try to do any further work here as
+ * it's doomed to lead to crashes.
+ */
+ if (IS_ERR_OR_NULL(reg_pdev))
+ return -EINVAL;
+
err = load_builtin_regdb_keys();
if (err)
return err;
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index d36c3eb7b931..d0e7472dd9fd 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -1058,13 +1058,23 @@ cfg80211_bss_update(struct cfg80211_registered_device *rdev,
return NULL;
}
+/*
+ * Update RX channel information based on the available frame payload
+ * information. This is mainly for the 2.4 GHz band where frames can be received
+ * from neighboring channels and the Beacon frames use the DSSS Parameter Set
+ * element to indicate the current (transmitting) channel, but this might also
+ * be needed on other bands if RX frequency does not match with the actual
+ * operating channel of a BSS.
+ */
static struct ieee80211_channel *
cfg80211_get_bss_channel(struct wiphy *wiphy, const u8 *ie, size_t ielen,
- struct ieee80211_channel *channel)
+ struct ieee80211_channel *channel,
+ enum nl80211_bss_scan_width scan_width)
{
const u8 *tmp;
u32 freq;
int channel_number = -1;
+ struct ieee80211_channel *alt_channel;
tmp = cfg80211_find_ie(WLAN_EID_DS_PARAMS, ie, ielen);
if (tmp && tmp[1] == 1) {
@@ -1078,16 +1088,45 @@ cfg80211_get_bss_channel(struct wiphy *wiphy, const u8 *ie, size_t ielen,
}
}
- if (channel_number < 0)
+ if (channel_number < 0) {
+ /* No channel information in frame payload */
return channel;
+ }
freq = ieee80211_channel_to_frequency(channel_number, channel->band);
- channel = ieee80211_get_channel(wiphy, freq);
- if (!channel)
- return NULL;
- if (channel->flags & IEEE80211_CHAN_DISABLED)
+ alt_channel = ieee80211_get_channel(wiphy, freq);
+ if (!alt_channel) {
+ if (channel->band == NL80211_BAND_2GHZ) {
+ /*
+ * Better not allow unexpected channels when that could
+ * be going beyond the 1-11 range (e.g., discovering
+ * BSS on channel 12 when radio is configured for
+ * channel 11.
+ */
+ return NULL;
+ }
+
+ /* No match for the payload channel number - ignore it */
+ return channel;
+ }
+
+ if (scan_width == NL80211_BSS_CHAN_WIDTH_10 ||
+ scan_width == NL80211_BSS_CHAN_WIDTH_5) {
+ /*
+ * Ignore channel number in 5 and 10 MHz channels where there
+ * may not be an n:1 or 1:n mapping between frequencies and
+ * channel numbers.
+ */
+ return channel;
+ }
+
+ /*
+ * Use the channel determined through the payload channel number
+ * instead of the RX channel reported by the driver.
+ */
+ if (alt_channel->flags & IEEE80211_CHAN_DISABLED)
return NULL;
- return channel;
+ return alt_channel;
}
/* Returned bss is reference counted and must be cleaned up appropriately. */
@@ -1112,7 +1151,8 @@ cfg80211_inform_bss_data(struct wiphy *wiphy,
(data->signal < 0 || data->signal > 100)))
return NULL;
- channel = cfg80211_get_bss_channel(wiphy, ie, ielen, data->chan);
+ channel = cfg80211_get_bss_channel(wiphy, ie, ielen, data->chan,
+ data->scan_width);
if (!channel)
return NULL;
@@ -1210,7 +1250,7 @@ cfg80211_inform_bss_frame_data(struct wiphy *wiphy,
return NULL;
channel = cfg80211_get_bss_channel(wiphy, mgmt->u.beacon.variable,
- ielen, data->chan);
+ ielen, data->chan, data->scan_width);
if (!channel)
return NULL;
diff --git a/net/wireless/sysfs.c b/net/wireless/sysfs.c
index 570a2b67ca10..6ab32f6a1961 100644
--- a/net/wireless/sysfs.c
+++ b/net/wireless/sysfs.c
@@ -102,7 +102,7 @@ static int wiphy_suspend(struct device *dev)
struct cfg80211_registered_device *rdev = dev_to_rdev(dev);
int ret = 0;
- rdev->suspend_at = get_seconds();
+ rdev->suspend_at = ktime_get_boottime_seconds();
rtnl_lock();
if (rdev->wiphy.registered) {
@@ -130,7 +130,7 @@ static int wiphy_resume(struct device *dev)
int ret = 0;
/* Age scan results with time spent in suspend */
- cfg80211_bss_age(rdev, get_seconds() - rdev->suspend_at);
+ cfg80211_bss_age(rdev, ktime_get_boottime_seconds() - rdev->suspend_at);
rtnl_lock();
if (rdev->wiphy.registered && rdev->ops->resume)
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index 7c73510b161f..c6a9446b4e6b 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -112,7 +112,7 @@
} while (0)
#define CHAN_ENTRY __field(enum nl80211_band, band) \
- __field(u16, center_freq)
+ __field(u32, center_freq)
#define CHAN_ASSIGN(chan) \
do { \
if (chan) { \
@@ -2368,6 +2368,140 @@ TRACE_EVENT(rdev_external_auth,
__entry->bssid, __entry->ssid, __entry->status)
);
+TRACE_EVENT(rdev_start_radar_detection,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+ struct cfg80211_chan_def *chandef,
+ u32 cac_time_ms),
+ TP_ARGS(wiphy, netdev, chandef, cac_time_ms),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ CHAN_DEF_ENTRY
+ __field(u32, cac_time_ms)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ CHAN_DEF_ASSIGN(chandef);
+ __entry->cac_time_ms = cac_time_ms;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT
+ ", cac_time_ms=%u",
+ WIPHY_PR_ARG, NETDEV_PR_ARG, CHAN_DEF_PR_ARG,
+ __entry->cac_time_ms)
+);
+
+TRACE_EVENT(rdev_set_mcast_rate,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+ int *mcast_rate),
+ TP_ARGS(wiphy, netdev, mcast_rate),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ __array(int, mcast_rate, NUM_NL80211_BANDS)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ memcpy(__entry->mcast_rate, mcast_rate,
+ sizeof(int) * NUM_NL80211_BANDS);
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", "
+ "mcast_rates [2.4GHz=0x%x, 5.2GHz=0x%x, 60GHz=0x%x]",
+ WIPHY_PR_ARG, NETDEV_PR_ARG,
+ __entry->mcast_rate[NL80211_BAND_2GHZ],
+ __entry->mcast_rate[NL80211_BAND_5GHZ],
+ __entry->mcast_rate[NL80211_BAND_60GHZ])
+);
+
+TRACE_EVENT(rdev_set_coalesce,
+ TP_PROTO(struct wiphy *wiphy, struct cfg80211_coalesce *coalesce),
+ TP_ARGS(wiphy, coalesce),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ __field(int, n_rules)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ __entry->n_rules = coalesce ? coalesce->n_rules : 0;
+ ),
+ TP_printk(WIPHY_PR_FMT ", n_rules=%d",
+ WIPHY_PR_ARG, __entry->n_rules)
+);
+
+DEFINE_EVENT(wiphy_wdev_evt, rdev_abort_scan,
+ TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
+ TP_ARGS(wiphy, wdev)
+);
+
+TRACE_EVENT(rdev_set_multicast_to_unicast,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+ const bool enabled),
+ TP_ARGS(wiphy, netdev, enabled),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ __field(bool, enabled)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ __entry->enabled = enabled;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", unicast: %s",
+ WIPHY_PR_ARG, NETDEV_PR_ARG,
+ BOOL_TO_STR(__entry->enabled))
+);
+
+DEFINE_EVENT(wiphy_wdev_evt, rdev_get_txq_stats,
+ TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
+ TP_ARGS(wiphy, wdev)
+);
+
+TRACE_EVENT(rdev_get_ftm_responder_stats,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+ struct cfg80211_ftm_responder_stats *ftm_stats),
+
+ TP_ARGS(wiphy, netdev, ftm_stats),
+
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ __field(u64, timestamp)
+ __field(u32, success_num)
+ __field(u32, partial_num)
+ __field(u32, failed_num)
+ __field(u32, asap_num)
+ __field(u32, non_asap_num)
+ __field(u64, duration)
+ __field(u32, unknown_triggers)
+ __field(u32, reschedule)
+ __field(u32, out_of_window)
+ ),
+
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ __entry->success_num = ftm_stats->success_num;
+ __entry->partial_num = ftm_stats->partial_num;
+ __entry->failed_num = ftm_stats->failed_num;
+ __entry->asap_num = ftm_stats->asap_num;
+ __entry->non_asap_num = ftm_stats->non_asap_num;
+ __entry->duration = ftm_stats->total_duration_ms;
+ __entry->unknown_triggers = ftm_stats->unknown_triggers_num;
+ __entry->reschedule = ftm_stats->reschedule_requests_num;
+ __entry->out_of_window = ftm_stats->out_of_window_triggers_num;
+ ),
+
+ TP_printk(WIPHY_PR_FMT "Ftm responder stats: success %u, partial %u, "
+ "failed %u, asap %u, non asap %u, total duration %llu, unknown "
+ "triggers %u, rescheduled %u, out of window %u", WIPHY_PR_ARG,
+ __entry->success_num, __entry->partial_num, __entry->failed_num,
+ __entry->asap_num, __entry->non_asap_num, __entry->duration,
+ __entry->unknown_triggers, __entry->reschedule,
+ __entry->out_of_window)
+);
+
/*************************************************************
* cfg80211 exported functions traces *
*************************************************************/
@@ -3160,105 +3294,6 @@ TRACE_EVENT(cfg80211_stop_iface,
TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT,
WIPHY_PR_ARG, WDEV_PR_ARG)
);
-
-TRACE_EVENT(rdev_start_radar_detection,
- TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
- struct cfg80211_chan_def *chandef,
- u32 cac_time_ms),
- TP_ARGS(wiphy, netdev, chandef, cac_time_ms),
- TP_STRUCT__entry(
- WIPHY_ENTRY
- NETDEV_ENTRY
- CHAN_DEF_ENTRY
- __field(u32, cac_time_ms)
- ),
- TP_fast_assign(
- WIPHY_ASSIGN;
- NETDEV_ASSIGN;
- CHAN_DEF_ASSIGN(chandef);
- __entry->cac_time_ms = cac_time_ms;
- ),
- TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT
- ", cac_time_ms=%u",
- WIPHY_PR_ARG, NETDEV_PR_ARG, CHAN_DEF_PR_ARG,
- __entry->cac_time_ms)
-);
-
-TRACE_EVENT(rdev_set_mcast_rate,
- TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
- int *mcast_rate),
- TP_ARGS(wiphy, netdev, mcast_rate),
- TP_STRUCT__entry(
- WIPHY_ENTRY
- NETDEV_ENTRY
- __array(int, mcast_rate, NUM_NL80211_BANDS)
- ),
- TP_fast_assign(
- WIPHY_ASSIGN;
- NETDEV_ASSIGN;
- memcpy(__entry->mcast_rate, mcast_rate,
- sizeof(int) * NUM_NL80211_BANDS);
- ),
- TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", "
- "mcast_rates [2.4GHz=0x%x, 5.2GHz=0x%x, 60GHz=0x%x]",
- WIPHY_PR_ARG, NETDEV_PR_ARG,
- __entry->mcast_rate[NL80211_BAND_2GHZ],
- __entry->mcast_rate[NL80211_BAND_5GHZ],
- __entry->mcast_rate[NL80211_BAND_60GHZ])
-);
-
-TRACE_EVENT(rdev_set_coalesce,
- TP_PROTO(struct wiphy *wiphy, struct cfg80211_coalesce *coalesce),
- TP_ARGS(wiphy, coalesce),
- TP_STRUCT__entry(
- WIPHY_ENTRY
- __field(int, n_rules)
- ),
- TP_fast_assign(
- WIPHY_ASSIGN;
- __entry->n_rules = coalesce ? coalesce->n_rules : 0;
- ),
- TP_printk(WIPHY_PR_FMT ", n_rules=%d",
- WIPHY_PR_ARG, __entry->n_rules)
-);
-
-DEFINE_EVENT(wiphy_wdev_evt, rdev_abort_scan,
- TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
- TP_ARGS(wiphy, wdev)
-);
-
-TRACE_EVENT(rdev_set_multicast_to_unicast,
- TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
- const bool enabled),
- TP_ARGS(wiphy, netdev, enabled),
- TP_STRUCT__entry(
- WIPHY_ENTRY
- NETDEV_ENTRY
- __field(bool, enabled)
- ),
- TP_fast_assign(
- WIPHY_ASSIGN;
- NETDEV_ASSIGN;
- __entry->enabled = enabled;
- ),
- TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", unicast: %s",
- WIPHY_PR_ARG, NETDEV_PR_ARG,
- BOOL_TO_STR(__entry->enabled))
-);
-
-TRACE_EVENT(rdev_get_txq_stats,
- TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
- TP_ARGS(wiphy, wdev),
- TP_STRUCT__entry(
- WIPHY_ENTRY
- WDEV_ENTRY
- ),
- TP_fast_assign(
- WIPHY_ASSIGN;
- WDEV_ASSIGN;
- ),
- TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT, WIPHY_PR_ARG, WDEV_PR_ARG)
-);
#endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */
#undef TRACE_INCLUDE_PATH
diff --git a/net/wireless/util.c b/net/wireless/util.c
index 3c654cd7ba56..ef14d80ca03e 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -4,17 +4,21 @@
*
* Copyright 2007-2009 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
+ * Copyright 2017 Intel Deutschland GmbH
+ * Copyright (C) 2018 Intel Corporation
*/
#include <linux/export.h>
#include <linux/bitops.h>
#include <linux/etherdevice.h>
#include <linux/slab.h>
+#include <linux/ieee80211.h>
#include <net/cfg80211.h>
#include <net/ip.h>
#include <net/dsfield.h>
#include <linux/if_vlan.h>
#include <linux/mpls.h>
#include <linux/gcd.h>
+#include <linux/bitfield.h>
#include "core.h"
#include "rdev-ops.h"
@@ -87,7 +91,7 @@ int ieee80211_channel_to_frequency(int chan, enum nl80211_band band)
return 5000 + chan * 5;
break;
case NL80211_BAND_60GHZ:
- if (chan < 5)
+ if (chan < 7)
return 56160 + chan * 2160;
break;
default:
@@ -108,7 +112,7 @@ int ieee80211_frequency_to_channel(int freq)
return (freq - 4000) / 5;
else if (freq <= 45000) /* DMG band lower limit */
return (freq - 5000) / 5;
- else if (freq >= 58320 && freq <= 64800)
+ else if (freq >= 58320 && freq <= 70200)
return (freq - 56160) / 2160;
else
return 0;
@@ -1142,6 +1146,85 @@ static u32 cfg80211_calculate_bitrate_vht(struct rate_info *rate)
return 0;
}
+static u32 cfg80211_calculate_bitrate_he(struct rate_info *rate)
+{
+#define SCALE 2048
+ u16 mcs_divisors[12] = {
+ 34133, /* 16.666666... */
+ 17067, /* 8.333333... */
+ 11378, /* 5.555555... */
+ 8533, /* 4.166666... */
+ 5689, /* 2.777777... */
+ 4267, /* 2.083333... */
+ 3923, /* 1.851851... */
+ 3413, /* 1.666666... */
+ 2844, /* 1.388888... */
+ 2560, /* 1.250000... */
+ 2276, /* 1.111111... */
+ 2048, /* 1.000000... */
+ };
+ u32 rates_160M[3] = { 960777777, 907400000, 816666666 };
+ u32 rates_969[3] = { 480388888, 453700000, 408333333 };
+ u32 rates_484[3] = { 229411111, 216666666, 195000000 };
+ u32 rates_242[3] = { 114711111, 108333333, 97500000 };
+ u32 rates_106[3] = { 40000000, 37777777, 34000000 };
+ u32 rates_52[3] = { 18820000, 17777777, 16000000 };
+ u32 rates_26[3] = { 9411111, 8888888, 8000000 };
+ u64 tmp;
+ u32 result;
+
+ if (WARN_ON_ONCE(rate->mcs > 11))
+ return 0;
+
+ if (WARN_ON_ONCE(rate->he_gi > NL80211_RATE_INFO_HE_GI_3_2))
+ return 0;
+ if (WARN_ON_ONCE(rate->he_ru_alloc >
+ NL80211_RATE_INFO_HE_RU_ALLOC_2x996))
+ return 0;
+ if (WARN_ON_ONCE(rate->nss < 1 || rate->nss > 8))
+ return 0;
+
+ if (rate->bw == RATE_INFO_BW_160)
+ result = rates_160M[rate->he_gi];
+ else if (rate->bw == RATE_INFO_BW_80 ||
+ (rate->bw == RATE_INFO_BW_HE_RU &&
+ rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_996))
+ result = rates_969[rate->he_gi];
+ else if (rate->bw == RATE_INFO_BW_40 ||
+ (rate->bw == RATE_INFO_BW_HE_RU &&
+ rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_484))
+ result = rates_484[rate->he_gi];
+ else if (rate->bw == RATE_INFO_BW_20 ||
+ (rate->bw == RATE_INFO_BW_HE_RU &&
+ rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_242))
+ result = rates_242[rate->he_gi];
+ else if (rate->bw == RATE_INFO_BW_HE_RU &&
+ rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_106)
+ result = rates_106[rate->he_gi];
+ else if (rate->bw == RATE_INFO_BW_HE_RU &&
+ rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_52)
+ result = rates_52[rate->he_gi];
+ else if (rate->bw == RATE_INFO_BW_HE_RU &&
+ rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_26)
+ result = rates_26[rate->he_gi];
+ else if (WARN(1, "invalid HE MCS: bw:%d, ru:%d\n",
+ rate->bw, rate->he_ru_alloc))
+ return 0;
+
+ /* now scale to the appropriate MCS */
+ tmp = result;
+ tmp *= SCALE;
+ do_div(tmp, mcs_divisors[rate->mcs]);
+ result = tmp;
+
+ /* and take NSS, DCM into account */
+ result = (result * rate->nss) / 8;
+ if (rate->he_dcm)
+ result /= 2;
+
+ return result;
+}
+
u32 cfg80211_calculate_bitrate(struct rate_info *rate)
{
if (rate->flags & RATE_INFO_FLAGS_MCS)
@@ -1150,6 +1233,8 @@ u32 cfg80211_calculate_bitrate(struct rate_info *rate)
return cfg80211_calculate_bitrate_60g(rate);
if (rate->flags & RATE_INFO_FLAGS_VHT_MCS)
return cfg80211_calculate_bitrate_vht(rate);
+ if (rate->flags & RATE_INFO_FLAGS_HE_MCS)
+ return cfg80211_calculate_bitrate_he(rate);
return rate->legacy;
}
@@ -1374,7 +1459,7 @@ bool ieee80211_chandef_to_operating_class(struct cfg80211_chan_def *chandef,
u8 *op_class)
{
u8 vht_opclass;
- u16 freq = chandef->center_freq1;
+ u32 freq = chandef->center_freq1;
if (freq >= 2412 && freq <= 2472) {
if (chandef->width > NL80211_CHAN_WIDTH_40)
@@ -1486,7 +1571,7 @@ bool ieee80211_chandef_to_operating_class(struct cfg80211_chan_def *chandef,
}
/* 56.16 GHz, channel 1..4 */
- if (freq >= 56160 + 2160 * 1 && freq <= 56160 + 2160 * 4) {
+ if (freq >= 56160 + 2160 * 1 && freq <= 56160 + 2160 * 6) {
if (chandef->width >= NL80211_CHAN_WIDTH_40)
return false;
@@ -1791,8 +1876,9 @@ bool cfg80211_does_bw_fit_range(const struct ieee80211_freq_range *freq_range,
int cfg80211_sinfo_alloc_tid_stats(struct station_info *sinfo, gfp_t gfp)
{
- sinfo->pertid = kcalloc(sizeof(*(sinfo->pertid)),
- IEEE80211_NUM_TIDS + 1, gfp);
+ sinfo->pertid = kcalloc(IEEE80211_NUM_TIDS + 1,
+ sizeof(*(sinfo->pertid)),
+ gfp);
if (!sinfo->pertid)
return -ENOMEM;
@@ -1810,3 +1896,154 @@ EXPORT_SYMBOL(rfc1042_header);
const unsigned char bridge_tunnel_header[] __aligned(2) =
{ 0xaa, 0xaa, 0x03, 0x00, 0x00, 0xf8 };
EXPORT_SYMBOL(bridge_tunnel_header);
+
+/* Layer 2 Update frame (802.2 Type 1 LLC XID Update response) */
+struct iapp_layer2_update {
+ u8 da[ETH_ALEN]; /* broadcast */
+ u8 sa[ETH_ALEN]; /* STA addr */
+ __be16 len; /* 6 */
+ u8 dsap; /* 0 */
+ u8 ssap; /* 0 */
+ u8 control;
+ u8 xid_info[3];
+} __packed;
+
+void cfg80211_send_layer2_update(struct net_device *dev, const u8 *addr)
+{
+ struct iapp_layer2_update *msg;
+ struct sk_buff *skb;
+
+ /* Send Level 2 Update Frame to update forwarding tables in layer 2
+ * bridge devices */
+
+ skb = dev_alloc_skb(sizeof(*msg));
+ if (!skb)
+ return;
+ msg = skb_put(skb, sizeof(*msg));
+
+ /* 802.2 Type 1 Logical Link Control (LLC) Exchange Identifier (XID)
+ * Update response frame; IEEE Std 802.2-1998, 5.4.1.2.1 */
+
+ eth_broadcast_addr(msg->da);
+ ether_addr_copy(msg->sa, addr);
+ msg->len = htons(6);
+ msg->dsap = 0;
+ msg->ssap = 0x01; /* NULL LSAP, CR Bit: Response */
+ msg->control = 0xaf; /* XID response lsb.1111F101.
+ * F=0 (no poll command; unsolicited frame) */
+ msg->xid_info[0] = 0x81; /* XID format identifier */
+ msg->xid_info[1] = 1; /* LLC types/classes: Type 1 LLC */
+ msg->xid_info[2] = 0; /* XID sender's receive window size (RW) */
+
+ skb->dev = dev;
+ skb->protocol = eth_type_trans(skb, dev);
+ memset(skb->cb, 0, sizeof(skb->cb));
+ netif_rx_ni(skb);
+}
+EXPORT_SYMBOL(cfg80211_send_layer2_update);
+
+int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap,
+ enum ieee80211_vht_chanwidth bw,
+ int mcs, bool ext_nss_bw_capable)
+{
+ u16 map = le16_to_cpu(cap->supp_mcs.rx_mcs_map);
+ int max_vht_nss = 0;
+ int ext_nss_bw;
+ int supp_width;
+ int i, mcs_encoding;
+
+ if (map == 0xffff)
+ return 0;
+
+ if (WARN_ON(mcs > 9))
+ return 0;
+ if (mcs <= 7)
+ mcs_encoding = 0;
+ else if (mcs == 8)
+ mcs_encoding = 1;
+ else
+ mcs_encoding = 2;
+
+ /* find max_vht_nss for the given MCS */
+ for (i = 7; i >= 0; i--) {
+ int supp = (map >> (2 * i)) & 3;
+
+ if (supp == 3)
+ continue;
+
+ if (supp >= mcs_encoding) {
+ max_vht_nss = i;
+ break;
+ }
+ }
+
+ if (!(cap->supp_mcs.tx_mcs_map &
+ cpu_to_le16(IEEE80211_VHT_EXT_NSS_BW_CAPABLE)))
+ return max_vht_nss;
+
+ ext_nss_bw = le32_get_bits(cap->vht_cap_info,
+ IEEE80211_VHT_CAP_EXT_NSS_BW_MASK);
+ supp_width = le32_get_bits(cap->vht_cap_info,
+ IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK);
+
+ /* if not capable, treat ext_nss_bw as 0 */
+ if (!ext_nss_bw_capable)
+ ext_nss_bw = 0;
+
+ /* This is invalid */
+ if (supp_width == 3)
+ return 0;
+
+ /* This is an invalid combination so pretend nothing is supported */
+ if (supp_width == 2 && (ext_nss_bw == 1 || ext_nss_bw == 2))
+ return 0;
+
+ /*
+ * Cover all the special cases according to IEEE 802.11-2016
+ * Table 9-250. All other cases are either factor of 1 or not
+ * valid/supported.
+ */
+ switch (bw) {
+ case IEEE80211_VHT_CHANWIDTH_USE_HT:
+ case IEEE80211_VHT_CHANWIDTH_80MHZ:
+ if ((supp_width == 1 || supp_width == 2) &&
+ ext_nss_bw == 3)
+ return 2 * max_vht_nss;
+ break;
+ case IEEE80211_VHT_CHANWIDTH_160MHZ:
+ if (supp_width == 0 &&
+ (ext_nss_bw == 1 || ext_nss_bw == 2))
+ return DIV_ROUND_UP(max_vht_nss, 2);
+ if (supp_width == 0 &&
+ ext_nss_bw == 3)
+ return DIV_ROUND_UP(3 * max_vht_nss, 4);
+ if (supp_width == 1 &&
+ ext_nss_bw == 3)
+ return 2 * max_vht_nss;
+ break;
+ case IEEE80211_VHT_CHANWIDTH_80P80MHZ:
+ if (supp_width == 0 &&
+ (ext_nss_bw == 1 || ext_nss_bw == 2))
+ return 0; /* not possible */
+ if (supp_width == 0 &&
+ ext_nss_bw == 2)
+ return DIV_ROUND_UP(max_vht_nss, 2);
+ if (supp_width == 0 &&
+ ext_nss_bw == 3)
+ return DIV_ROUND_UP(3 * max_vht_nss, 4);
+ if (supp_width == 1 &&
+ ext_nss_bw == 0)
+ return 0; /* not possible */
+ if (supp_width == 1 &&
+ ext_nss_bw == 1)
+ return DIV_ROUND_UP(max_vht_nss, 2);
+ if (supp_width == 1 &&
+ ext_nss_bw == 2)
+ return DIV_ROUND_UP(3 * max_vht_nss, 4);
+ break;
+ }
+
+ /* not covered or invalid combination received */
+ return max_vht_nss;
+}
+EXPORT_SYMBOL(ieee80211_get_vht_max_nss);
diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c
index 05186a47878f..06943d9c9835 100644
--- a/net/wireless/wext-compat.c
+++ b/net/wireless/wext-compat.c
@@ -1278,12 +1278,16 @@ static int cfg80211_wext_giwrate(struct net_device *dev,
if (err)
return err;
- if (!(sinfo.filled & BIT(NL80211_STA_INFO_TX_BITRATE)))
- return -EOPNOTSUPP;
+ if (!(sinfo.filled & BIT_ULL(NL80211_STA_INFO_TX_BITRATE))) {
+ err = -EOPNOTSUPP;
+ goto free;
+ }
rate->value = 100000 * cfg80211_calculate_bitrate(&sinfo.txrate);
- return 0;
+free:
+ cfg80211_sinfo_release_content(&sinfo);
+ return err;
}
/* Get wireless statistics. Called by /proc/net/wireless and by SIOCGIWSTATS */
@@ -1293,7 +1297,7 @@ static struct iw_statistics *cfg80211_wireless_stats(struct net_device *dev)
struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
/* we are under RTNL - globally locked - so can use static structs */
static struct iw_statistics wstats;
- static struct station_info sinfo;
+ static struct station_info sinfo = {};
u8 bssid[ETH_ALEN];
if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION)
@@ -1320,7 +1324,7 @@ static struct iw_statistics *cfg80211_wireless_stats(struct net_device *dev)
switch (rdev->wiphy.signal_type) {
case CFG80211_SIGNAL_TYPE_MBM:
- if (sinfo.filled & BIT(NL80211_STA_INFO_SIGNAL)) {
+ if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_SIGNAL)) {
int sig = sinfo.signal;
wstats.qual.updated |= IW_QUAL_LEVEL_UPDATED;
wstats.qual.updated |= IW_QUAL_QUAL_UPDATED;
@@ -1334,7 +1338,7 @@ static struct iw_statistics *cfg80211_wireless_stats(struct net_device *dev)
break;
}
case CFG80211_SIGNAL_TYPE_UNSPEC:
- if (sinfo.filled & BIT(NL80211_STA_INFO_SIGNAL)) {
+ if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_SIGNAL)) {
wstats.qual.updated |= IW_QUAL_LEVEL_UPDATED;
wstats.qual.updated |= IW_QUAL_QUAL_UPDATED;
wstats.qual.level = sinfo.signal;
@@ -1347,11 +1351,13 @@ static struct iw_statistics *cfg80211_wireless_stats(struct net_device *dev)
}
wstats.qual.updated |= IW_QUAL_NOISE_INVALID;
- if (sinfo.filled & BIT(NL80211_STA_INFO_RX_DROP_MISC))
+ if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_RX_DROP_MISC))
wstats.discard.misc = sinfo.rx_dropped_misc;
- if (sinfo.filled & BIT(NL80211_STA_INFO_TX_FAILED))
+ if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_TX_FAILED))
wstats.discard.retries = sinfo.tx_failed;
+ cfg80211_sinfo_release_content(&sinfo);
+
return &wstats;
}
diff --git a/net/x25/Kconfig b/net/x25/Kconfig
index e2fa133f9fba..59fcb41fc5e6 100644
--- a/net/x25/Kconfig
+++ b/net/x25/Kconfig
@@ -31,5 +31,3 @@ config X25
To compile this driver as a module, choose M here: the module
will be called x25. If unsure, say N.
-
-
diff --git a/net/x25/x25_subr.c b/net/x25/x25_subr.c
index 9c214ec681ac..743103786652 100644
--- a/net/x25/x25_subr.c
+++ b/net/x25/x25_subr.c
@@ -381,4 +381,3 @@ void x25_check_rbuf(struct sock *sk)
x25_stop_timer(sk);
}
}
-
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index f47abb46c587..a264cf2accd0 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -11,6 +11,8 @@
#include <linux/slab.h>
#include <linux/bpf.h>
#include <linux/mm.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
#include "xdp_umem.h"
#include "xsk_queue.h"
@@ -30,22 +32,49 @@ void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
{
unsigned long flags;
- if (xs->dev) {
- spin_lock_irqsave(&umem->xsk_list_lock, flags);
- list_del_rcu(&xs->list);
- spin_unlock_irqrestore(&umem->xsk_list_lock, flags);
+ spin_lock_irqsave(&umem->xsk_list_lock, flags);
+ list_del_rcu(&xs->list);
+ spin_unlock_irqrestore(&umem->xsk_list_lock, flags);
+}
- if (umem->zc)
- synchronize_net();
- }
+/* The umem is stored both in the _rx struct and the _tx struct as we do
+ * not know if the device has more tx queues than rx, or the opposite.
+ * This might also change during run time.
+ */
+static void xdp_reg_umem_at_qid(struct net_device *dev, struct xdp_umem *umem,
+ u16 queue_id)
+{
+ if (queue_id < dev->real_num_rx_queues)
+ dev->_rx[queue_id].umem = umem;
+ if (queue_id < dev->real_num_tx_queues)
+ dev->_tx[queue_id].umem = umem;
+}
+
+struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev,
+ u16 queue_id)
+{
+ if (queue_id < dev->real_num_rx_queues)
+ return dev->_rx[queue_id].umem;
+ if (queue_id < dev->real_num_tx_queues)
+ return dev->_tx[queue_id].umem;
+
+ return NULL;
+}
+
+static void xdp_clear_umem_at_qid(struct net_device *dev, u16 queue_id)
+{
+ if (queue_id < dev->real_num_rx_queues)
+ dev->_rx[queue_id].umem = NULL;
+ if (queue_id < dev->real_num_tx_queues)
+ dev->_tx[queue_id].umem = NULL;
}
int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
- u32 queue_id, u16 flags)
+ u16 queue_id, u16 flags)
{
bool force_zc, force_copy;
struct netdev_bpf bpf;
- int err;
+ int err = 0;
force_zc = flags & XDP_ZEROCOPY;
force_copy = flags & XDP_COPY;
@@ -53,44 +82,45 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
if (force_zc && force_copy)
return -EINVAL;
- if (force_copy)
- return 0;
-
- dev_hold(dev);
-
- if (dev->netdev_ops->ndo_bpf && dev->netdev_ops->ndo_xsk_async_xmit) {
- bpf.command = XDP_QUERY_XSK_UMEM;
-
- rtnl_lock();
- err = dev->netdev_ops->ndo_bpf(dev, &bpf);
- rtnl_unlock();
+ rtnl_lock();
+ if (xdp_get_umem_from_qid(dev, queue_id)) {
+ err = -EBUSY;
+ goto out_rtnl_unlock;
+ }
- if (err) {
- dev_put(dev);
- return force_zc ? -ENOTSUPP : 0;
- }
+ xdp_reg_umem_at_qid(dev, umem, queue_id);
+ umem->dev = dev;
+ umem->queue_id = queue_id;
+ if (force_copy)
+ /* For copy-mode, we are done. */
+ goto out_rtnl_unlock;
- bpf.command = XDP_SETUP_XSK_UMEM;
- bpf.xsk.umem = umem;
- bpf.xsk.queue_id = queue_id;
+ if (!dev->netdev_ops->ndo_bpf ||
+ !dev->netdev_ops->ndo_xsk_async_xmit) {
+ err = -EOPNOTSUPP;
+ goto err_unreg_umem;
+ }
- rtnl_lock();
- err = dev->netdev_ops->ndo_bpf(dev, &bpf);
- rtnl_unlock();
+ bpf.command = XDP_SETUP_XSK_UMEM;
+ bpf.xsk.umem = umem;
+ bpf.xsk.queue_id = queue_id;
- if (err) {
- dev_put(dev);
- return force_zc ? err : 0; /* fail or fallback */
- }
+ err = dev->netdev_ops->ndo_bpf(dev, &bpf);
+ if (err)
+ goto err_unreg_umem;
+ rtnl_unlock();
- umem->dev = dev;
- umem->queue_id = queue_id;
- umem->zc = true;
- return 0;
- }
+ dev_hold(dev);
+ umem->zc = true;
+ return 0;
- dev_put(dev);
- return force_zc ? -ENOTSUPP : 0; /* fail or fallback */
+err_unreg_umem:
+ xdp_clear_umem_at_qid(dev, queue_id);
+ if (!force_zc)
+ err = 0; /* fallback to copy mode */
+out_rtnl_unlock:
+ rtnl_unlock();
+ return err;
}
static void xdp_umem_clear_dev(struct xdp_umem *umem)
@@ -98,7 +128,7 @@ static void xdp_umem_clear_dev(struct xdp_umem *umem)
struct netdev_bpf bpf;
int err;
- if (umem->dev) {
+ if (umem->zc) {
bpf.command = XDP_SETUP_XSK_UMEM;
bpf.xsk.umem = NULL;
bpf.xsk.queue_id = umem->queue_id;
@@ -109,9 +139,17 @@ static void xdp_umem_clear_dev(struct xdp_umem *umem)
if (err)
WARN(1, "failed to disable umem!\n");
+ }
+ if (umem->dev) {
+ rtnl_lock();
+ xdp_clear_umem_at_qid(umem->dev, umem->queue_id);
+ rtnl_unlock();
+ }
+
+ if (umem->zc) {
dev_put(umem->dev);
- umem->dev = NULL;
+ umem->zc = false;
}
}
@@ -155,6 +193,8 @@ static void xdp_umem_release(struct xdp_umem *umem)
umem->cq = NULL;
}
+ xsk_reuseq_destroy(umem);
+
xdp_umem_unpin_pages(umem);
task = get_pid_task(umem->pid, PIDTYPE_PID);
@@ -302,8 +342,8 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
umem->pid = get_task_pid(current, PIDTYPE_PID);
umem->address = (unsigned long)addr;
- umem->props.chunk_mask = ~((u64)chunk_size - 1);
- umem->props.size = size;
+ umem->chunk_mask = ~((u64)chunk_size - 1);
+ umem->size = size;
umem->headroom = headroom;
umem->chunk_size_nohr = chunk_size - headroom;
umem->npgs = size / PAGE_SIZE;
diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h
index f11560334f88..27603227601b 100644
--- a/net/xdp/xdp_umem.h
+++ b/net/xdp/xdp_umem.h
@@ -8,18 +8,8 @@
#include <net/xdp_sock.h>
-static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr)
-{
- return umem->pages[addr >> PAGE_SHIFT].addr + (addr & (PAGE_SIZE - 1));
-}
-
-static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr)
-{
- return umem->pages[addr >> PAGE_SHIFT].dma + (addr & (PAGE_SIZE - 1));
-}
-
int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
- u32 queue_id, u16 flags);
+ u16 queue_id, u16 flags);
bool xdp_umem_validate_queues(struct xdp_umem *umem);
void xdp_get_umem(struct xdp_umem *umem);
void xdp_put_umem(struct xdp_umem *umem);
diff --git a/net/xdp/xdp_umem_props.h b/net/xdp/xdp_umem_props.h
deleted file mode 100644
index 40eab10dfc49..000000000000
--- a/net/xdp/xdp_umem_props.h
+++ /dev/null
@@ -1,14 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/* XDP user-space packet buffer
- * Copyright(c) 2018 Intel Corporation.
- */
-
-#ifndef XDP_UMEM_PROPS_H_
-#define XDP_UMEM_PROPS_H_
-
-struct xdp_umem_props {
- u64 chunk_mask;
- u64 size;
-};
-
-#endif /* XDP_UMEM_PROPS_H_ */
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 4e937cd7c17d..07156f43d295 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -55,20 +55,30 @@ EXPORT_SYMBOL(xsk_umem_discard_addr);
static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
- void *buffer;
+ void *to_buf, *from_buf;
+ u32 metalen;
u64 addr;
int err;
if (!xskq_peek_addr(xs->umem->fq, &addr) ||
- len > xs->umem->chunk_size_nohr) {
+ len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
xs->rx_dropped++;
return -ENOSPC;
}
addr += xs->umem->headroom;
- buffer = xdp_umem_get_data(xs->umem, addr);
- memcpy(buffer, xdp->data, len);
+ if (unlikely(xdp_data_meta_unsupported(xdp))) {
+ from_buf = xdp->data;
+ metalen = 0;
+ } else {
+ from_buf = xdp->data_meta;
+ metalen = xdp->data - xdp->data_meta;
+ }
+
+ to_buf = xdp_umem_get_data(xs->umem, addr);
+ memcpy(to_buf, from_buf, len + metalen);
+ addr += metalen;
err = xskq_produce_batch_desc(xs->rx, addr, len);
if (!err) {
xskq_discard_addr(xs->umem->fq);
@@ -111,6 +121,7 @@ void xsk_flush(struct xdp_sock *xs)
int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
+ u32 metalen = xdp->data - xdp->data_meta;
u32 len = xdp->data_end - xdp->data;
void *buffer;
u64 addr;
@@ -120,7 +131,7 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
return -EINVAL;
if (!xskq_peek_addr(xs->umem->fq, &addr) ||
- len > xs->umem->chunk_size_nohr) {
+ len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
xs->rx_dropped++;
return -ENOSPC;
}
@@ -128,7 +139,8 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
addr += xs->umem->headroom;
buffer = xdp_umem_get_data(xs->umem, addr);
- memcpy(buffer, xdp->data, len);
+ memcpy(buffer, xdp->data_meta, len + metalen);
+ addr += metalen;
err = xskq_produce_batch_desc(xs->rx, addr, len);
if (!err) {
xskq_discard_addr(xs->umem->fq);
@@ -343,12 +355,18 @@ static int xsk_release(struct socket *sock)
local_bh_enable();
if (xs->dev) {
+ struct net_device *dev = xs->dev;
+
/* Wait for driver to stop using the xdp socket. */
- synchronize_net();
- dev_put(xs->dev);
+ xdp_del_sk_umem(xs->umem, xs);
xs->dev = NULL;
+ synchronize_net();
+ dev_put(dev);
}
+ xskq_destroy(xs->rx);
+ xskq_destroy(xs->tx);
+
sock_orphan(sk);
sock->sk = NULL;
@@ -407,13 +425,6 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
}
qid = sxdp->sxdp_queue_id;
-
- if ((xs->rx && qid >= dev->real_num_rx_queues) ||
- (xs->tx && qid >= dev->real_num_tx_queues)) {
- err = -EINVAL;
- goto out_unlock;
- }
-
flags = sxdp->sxdp_flags;
if (flags & XDP_SHARED_UMEM) {
@@ -458,8 +469,10 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
goto out_unlock;
} else {
/* This xsk has its own umem. */
- xskq_set_umem(xs->umem->fq, &xs->umem->props);
- xskq_set_umem(xs->umem->cq, &xs->umem->props);
+ xskq_set_umem(xs->umem->fq, xs->umem->size,
+ xs->umem->chunk_mask);
+ xskq_set_umem(xs->umem->cq, xs->umem->size,
+ xs->umem->chunk_mask);
err = xdp_umem_assign_dev(xs->umem, dev, qid, flags);
if (err)
@@ -469,8 +482,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
xs->dev = dev;
xs->zc = xs->umem->zc;
xs->queue_id = qid;
- xskq_set_umem(xs->rx, &xs->umem->props);
- xskq_set_umem(xs->tx, &xs->umem->props);
+ xskq_set_umem(xs->rx, xs->umem->size, xs->umem->chunk_mask);
+ xskq_set_umem(xs->tx, xs->umem->size, xs->umem->chunk_mask);
xdp_add_sk_umem(xs->umem, xs);
out_unlock:
@@ -707,9 +720,6 @@ static void xsk_destruct(struct sock *sk)
if (!sock_flag(sk, SOCK_DEAD))
return;
- xskq_destroy(xs->rx);
- xskq_destroy(xs->tx);
- xdp_del_sk_umem(xs->umem, xs);
xdp_put_umem(xs->umem);
sk_refcnt_debug_dec(sk);
@@ -744,6 +754,8 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol,
sk->sk_destruct = xsk_destruct;
sk_refcnt_debug_inc(sk);
+ sock_set_flag(sk, SOCK_RCU_FREE);
+
xs = xdp_sk(sk);
mutex_init(&xs->mutex);
spin_lock_init(&xs->tx_completion_lock);
diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c
index 6c32e92e98fc..b66504592d9b 100644
--- a/net/xdp/xsk_queue.c
+++ b/net/xdp/xsk_queue.c
@@ -3,16 +3,19 @@
* Copyright(c) 2018 Intel Corporation.
*/
+#include <linux/log2.h>
#include <linux/slab.h>
+#include <linux/overflow.h>
#include "xsk_queue.h"
-void xskq_set_umem(struct xsk_queue *q, struct xdp_umem_props *umem_props)
+void xskq_set_umem(struct xsk_queue *q, u64 size, u64 chunk_mask)
{
if (!q)
return;
- q->umem_props = *umem_props;
+ q->size = size;
+ q->chunk_mask = chunk_mask;
}
static u32 xskq_umem_get_ring_size(struct xsk_queue *q)
@@ -61,3 +64,56 @@ void xskq_destroy(struct xsk_queue *q)
page_frag_free(q->ring);
kfree(q);
}
+
+struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries)
+{
+ struct xdp_umem_fq_reuse *newq;
+
+ /* Check for overflow */
+ if (nentries > (u32)roundup_pow_of_two(nentries))
+ return NULL;
+ nentries = roundup_pow_of_two(nentries);
+
+ newq = kvmalloc(struct_size(newq, handles, nentries), GFP_KERNEL);
+ if (!newq)
+ return NULL;
+ memset(newq, 0, offsetof(typeof(*newq), handles));
+
+ newq->nentries = nentries;
+ return newq;
+}
+EXPORT_SYMBOL_GPL(xsk_reuseq_prepare);
+
+struct xdp_umem_fq_reuse *xsk_reuseq_swap(struct xdp_umem *umem,
+ struct xdp_umem_fq_reuse *newq)
+{
+ struct xdp_umem_fq_reuse *oldq = umem->fq_reuse;
+
+ if (!oldq) {
+ umem->fq_reuse = newq;
+ return NULL;
+ }
+
+ if (newq->nentries < oldq->length)
+ return newq;
+
+ memcpy(newq->handles, oldq->handles,
+ array_size(oldq->length, sizeof(u64)));
+ newq->length = oldq->length;
+
+ umem->fq_reuse = newq;
+ return oldq;
+}
+EXPORT_SYMBOL_GPL(xsk_reuseq_swap);
+
+void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq)
+{
+ kvfree(rq);
+}
+EXPORT_SYMBOL_GPL(xsk_reuseq_free);
+
+void xsk_reuseq_destroy(struct xdp_umem *umem)
+{
+ xsk_reuseq_free(umem->fq_reuse);
+ umem->fq_reuse = NULL;
+}
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 8a64b150be54..bcb5cbb40419 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -31,7 +31,8 @@ struct xdp_umem_ring {
};
struct xsk_queue {
- struct xdp_umem_props umem_props;
+ u64 chunk_mask;
+ u64 size;
u32 ring_mask;
u32 nentries;
u32 prod_head;
@@ -78,7 +79,7 @@ static inline u32 xskq_nb_free(struct xsk_queue *q, u32 producer, u32 dcnt)
static inline bool xskq_is_valid_addr(struct xsk_queue *q, u64 addr)
{
- if (addr >= q->umem_props.size) {
+ if (addr >= q->size) {
q->invalid_descs++;
return false;
}
@@ -92,7 +93,7 @@ static inline u64 *xskq_validate_addr(struct xsk_queue *q, u64 *addr)
struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
unsigned int idx = q->cons_tail & q->ring_mask;
- *addr = READ_ONCE(ring->desc[idx]) & q->umem_props.chunk_mask;
+ *addr = READ_ONCE(ring->desc[idx]) & q->chunk_mask;
if (xskq_is_valid_addr(q, *addr))
return addr;
@@ -173,8 +174,8 @@ static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d)
if (!xskq_is_valid_addr(q, d->addr))
return false;
- if (((d->addr + d->len) & q->umem_props.chunk_mask) !=
- (d->addr & q->umem_props.chunk_mask)) {
+ if (((d->addr + d->len) & q->chunk_mask) !=
+ (d->addr & q->chunk_mask)) {
q->invalid_descs++;
return false;
}
@@ -253,8 +254,11 @@ static inline bool xskq_empty_desc(struct xsk_queue *q)
return xskq_nb_free(q, q->prod_tail, q->nentries) == q->nentries;
}
-void xskq_set_umem(struct xsk_queue *q, struct xdp_umem_props *umem_props);
+void xskq_set_umem(struct xsk_queue *q, u64 size, u64 chunk_mask);
struct xsk_queue *xskq_create(u32 nentries, bool umem_queue);
void xskq_destroy(struct xsk_queue *q_ops);
+/* Executed by the core when the entire UMEM gets freed */
+void xsk_reuseq_destroy(struct xdp_umem *umem);
+
#endif /* _LINUX_XSK_QUEUE_H */
diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig
index 286ed25c1a69..4a9ee2d83158 100644
--- a/net/xfrm/Kconfig
+++ b/net/xfrm/Kconfig
@@ -25,6 +25,14 @@ config XFRM_USER
If unsure, say Y.
+config XFRM_INTERFACE
+ tristate "Transformation virtual interface"
+ depends on XFRM && IPV6
+ ---help---
+ This provides a virtual interface to route IPsec traffic.
+
+ If unsure, say N.
+
config XFRM_SUB_POLICY
bool "Transformation sub policy support"
depends on XFRM
@@ -87,4 +95,3 @@ config NET_KEY_MIGRATE
<draft-sugimoto-mip6-pfkey-migrate>.
If unsure, say N.
-
diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile
index 0bd2465a8c5a..fbc4552d17b8 100644
--- a/net/xfrm/Makefile
+++ b/net/xfrm/Makefile
@@ -10,3 +10,4 @@ obj-$(CONFIG_XFRM_STATISTICS) += xfrm_proc.o
obj-$(CONFIG_XFRM_ALGO) += xfrm_algo.o
obj-$(CONFIG_XFRM_USER) += xfrm_user.o
obj-$(CONFIG_XFRM_IPCOMP) += xfrm_ipcomp.o
+obj-$(CONFIG_XFRM_INTERFACE) += xfrm_interface.o
diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
index 175941e15a6e..144c137886b1 100644
--- a/net/xfrm/xfrm_device.c
+++ b/net/xfrm/xfrm_device.c
@@ -56,7 +56,7 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur
if (skb_is_gso(skb)) {
struct net_device *dev = skb->dev;
- if (unlikely(!x->xso.offload_handle || (x->xso.dev != dev))) {
+ if (unlikely(x->xso.dev != dev)) {
struct sk_buff *segs;
/* Packet got rerouted, fixup features and segment it. */
@@ -99,7 +99,7 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur
do {
struct sk_buff *nskb = skb2->next;
- skb2->next = NULL;
+ skb_mark_not_on_list(skb2);
xo = xfrm_offload(skb2);
xo->flags |= XFRM_DEV_RESUME;
@@ -162,7 +162,8 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
}
dst = __xfrm_dst_lookup(net, 0, 0, saddr, daddr,
- x->props.family, x->props.output_mark);
+ x->props.family,
+ xfrm_smark_get(0, x));
if (IS_ERR(dst))
return 0;
@@ -191,9 +192,13 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
err = dev->xfrmdev_ops->xdo_dev_state_add(x);
if (err) {
+ xso->num_exthdrs = 0;
+ xso->flags = 0;
xso->dev = NULL;
dev_put(dev);
- return err;
+
+ if (err != -EOPNOTSUPP)
+ return err;
}
return 0;
@@ -210,8 +215,8 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x)
if (!x->type_offload || x->encap)
return false;
- if ((!dev || (x->xso.offload_handle && (dev == xfrm_dst_path(dst)->dev))) &&
- (!xdst->child->xfrm && x->type->get_mtu)) {
+ if ((!dev || (dev == xfrm_dst_path(dst)->dev)) &&
+ (!xdst->child->xfrm && x->type->get_mtu)) {
mtu = x->type->get_mtu(x, xdst->child_mtu_cached);
if (skb->len <= mtu)
@@ -306,12 +311,6 @@ static int xfrm_dev_register(struct net_device *dev)
return xfrm_api_check(dev);
}
-static int xfrm_dev_unregister(struct net_device *dev)
-{
- xfrm_policy_cache_flush();
- return NOTIFY_DONE;
-}
-
static int xfrm_dev_feat_change(struct net_device *dev)
{
return xfrm_api_check(dev);
@@ -322,7 +321,6 @@ static int xfrm_dev_down(struct net_device *dev)
if (dev->features & NETIF_F_HW_ESP)
xfrm_dev_state_flush(dev_net(dev), dev, true);
- xfrm_policy_cache_flush();
return NOTIFY_DONE;
}
@@ -334,9 +332,6 @@ static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void
case NETDEV_REGISTER:
return xfrm_dev_register(dev);
- case NETDEV_UNREGISTER:
- return xfrm_dev_unregister(dev);
-
case NETDEV_FEAT_CHANGE:
return xfrm_dev_feat_change(dev);
diff --git a/net/xfrm/xfrm_hash.h b/net/xfrm/xfrm_hash.h
index 61be810389d8..ce66323102f9 100644
--- a/net/xfrm/xfrm_hash.h
+++ b/net/xfrm/xfrm_hash.h
@@ -13,7 +13,7 @@ static inline unsigned int __xfrm4_addr_hash(const xfrm_address_t *addr)
static inline unsigned int __xfrm6_addr_hash(const xfrm_address_t *addr)
{
- return ntohl(addr->a6[2] ^ addr->a6[3]);
+ return jhash2((__force u32 *)addr->a6, 4, 0);
}
static inline unsigned int __xfrm4_daddr_saddr_hash(const xfrm_address_t *daddr,
@@ -26,8 +26,7 @@ static inline unsigned int __xfrm4_daddr_saddr_hash(const xfrm_address_t *daddr,
static inline unsigned int __xfrm6_daddr_saddr_hash(const xfrm_address_t *daddr,
const xfrm_address_t *saddr)
{
- return ntohl(daddr->a6[2] ^ daddr->a6[3] ^
- saddr->a6[2] ^ saddr->a6[3]);
+ return __xfrm6_addr_hash(daddr) ^ __xfrm6_addr_hash(saddr);
}
static inline u32 __bits2mask32(__u8 bits)
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index 352abca2605f..684c0bc01e2c 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -131,7 +131,7 @@ struct sec_path *secpath_dup(struct sec_path *src)
sp->len = 0;
sp->olen = 0;
- memset(sp->ovec, 0, sizeof(sp->ovec[XFRM_MAX_OFFLOAD_DEPTH]));
+ memset(sp->ovec, 0, sizeof(sp->ovec));
if (src) {
int i;
@@ -320,6 +320,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
seq = 0;
if (!spi && (err = xfrm_parse_spi(skb, nexthdr, &spi, &seq)) != 0) {
+ secpath_reset(skb);
XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR);
goto drop;
}
@@ -328,17 +329,21 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
XFRM_SPI_SKB_CB(skb)->daddroff);
do {
if (skb->sp->len == XFRM_MAX_DEPTH) {
+ secpath_reset(skb);
XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR);
goto drop;
}
x = xfrm_state_lookup(net, mark, daddr, spi, nexthdr, family);
if (x == NULL) {
+ secpath_reset(skb);
XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOSTATES);
xfrm_audit_state_notfound(skb, family, spi, seq);
goto drop;
}
+ skb->mark = xfrm_smark_get(skb->mark, x);
+
skb->sp->xvec[skb->sp->len++] = x;
lock:
@@ -453,6 +458,7 @@ resume:
XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR);
goto drop;
}
+ crypto_done = false;
} while (!err);
err = xfrm_rcv_cb(skb, family, x->type->proto, 0);
diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c
new file mode 100644
index 000000000000..d679fa0f44b3
--- /dev/null
+++ b/net/xfrm/xfrm_interface.c
@@ -0,0 +1,975 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * XFRM virtual interface
+ *
+ * Copyright (C) 2018 secunet Security Networks AG
+ *
+ * Author:
+ * Steffen Klassert <steffen.klassert@secunet.com>
+ */
+
+#include <linux/module.h>
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/sockios.h>
+#include <linux/icmp.h>
+#include <linux/if.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/net.h>
+#include <linux/in6.h>
+#include <linux/netdevice.h>
+#include <linux/if_link.h>
+#include <linux/if_arp.h>
+#include <linux/icmpv6.h>
+#include <linux/init.h>
+#include <linux/route.h>
+#include <linux/rtnetlink.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/slab.h>
+#include <linux/hash.h>
+
+#include <linux/uaccess.h>
+#include <linux/atomic.h>
+
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#include <net/addrconf.h>
+#include <net/xfrm.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <linux/etherdevice.h>
+
+static int xfrmi_dev_init(struct net_device *dev);
+static void xfrmi_dev_setup(struct net_device *dev);
+static struct rtnl_link_ops xfrmi_link_ops __read_mostly;
+static unsigned int xfrmi_net_id __read_mostly;
+
+struct xfrmi_net {
+ /* lists for storing interfaces in use */
+ struct xfrm_if __rcu *xfrmi[1];
+};
+
+#define for_each_xfrmi_rcu(start, xi) \
+ for (xi = rcu_dereference(start); xi; xi = rcu_dereference(xi->next))
+
+static struct xfrm_if *xfrmi_lookup(struct net *net, struct xfrm_state *x)
+{
+ struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id);
+ struct xfrm_if *xi;
+
+ for_each_xfrmi_rcu(xfrmn->xfrmi[0], xi) {
+ if (x->if_id == xi->p.if_id &&
+ (xi->dev->flags & IFF_UP))
+ return xi;
+ }
+
+ return NULL;
+}
+
+static struct xfrm_if *xfrmi_decode_session(struct sk_buff *skb)
+{
+ struct xfrmi_net *xfrmn;
+ int ifindex;
+ struct xfrm_if *xi;
+
+ if (!skb->dev)
+ return NULL;
+
+ xfrmn = net_generic(dev_net(skb->dev), xfrmi_net_id);
+ ifindex = skb->dev->ifindex;
+
+ for_each_xfrmi_rcu(xfrmn->xfrmi[0], xi) {
+ if (ifindex == xi->dev->ifindex &&
+ (xi->dev->flags & IFF_UP))
+ return xi;
+ }
+
+ return NULL;
+}
+
+static void xfrmi_link(struct xfrmi_net *xfrmn, struct xfrm_if *xi)
+{
+ struct xfrm_if __rcu **xip = &xfrmn->xfrmi[0];
+
+ rcu_assign_pointer(xi->next , rtnl_dereference(*xip));
+ rcu_assign_pointer(*xip, xi);
+}
+
+static void xfrmi_unlink(struct xfrmi_net *xfrmn, struct xfrm_if *xi)
+{
+ struct xfrm_if __rcu **xip;
+ struct xfrm_if *iter;
+
+ for (xip = &xfrmn->xfrmi[0];
+ (iter = rtnl_dereference(*xip)) != NULL;
+ xip = &iter->next) {
+ if (xi == iter) {
+ rcu_assign_pointer(*xip, xi->next);
+ break;
+ }
+ }
+}
+
+static void xfrmi_dev_free(struct net_device *dev)
+{
+ struct xfrm_if *xi = netdev_priv(dev);
+
+ gro_cells_destroy(&xi->gro_cells);
+ free_percpu(dev->tstats);
+}
+
+static int xfrmi_create2(struct net_device *dev)
+{
+ struct xfrm_if *xi = netdev_priv(dev);
+ struct net *net = dev_net(dev);
+ struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id);
+ int err;
+
+ dev->rtnl_link_ops = &xfrmi_link_ops;
+ err = register_netdevice(dev);
+ if (err < 0)
+ goto out;
+
+ strcpy(xi->p.name, dev->name);
+
+ dev_hold(dev);
+ xfrmi_link(xfrmn, xi);
+
+ return 0;
+
+out:
+ return err;
+}
+
+static struct xfrm_if *xfrmi_create(struct net *net, struct xfrm_if_parms *p)
+{
+ struct net_device *dev;
+ struct xfrm_if *xi;
+ char name[IFNAMSIZ];
+ int err;
+
+ if (p->name[0]) {
+ strlcpy(name, p->name, IFNAMSIZ);
+ } else {
+ err = -EINVAL;
+ goto failed;
+ }
+
+ dev = alloc_netdev(sizeof(*xi), name, NET_NAME_UNKNOWN, xfrmi_dev_setup);
+ if (!dev) {
+ err = -EAGAIN;
+ goto failed;
+ }
+
+ dev_net_set(dev, net);
+
+ xi = netdev_priv(dev);
+ xi->p = *p;
+ xi->net = net;
+ xi->dev = dev;
+ xi->phydev = dev_get_by_index(net, p->link);
+ if (!xi->phydev) {
+ err = -ENODEV;
+ goto failed_free;
+ }
+
+ err = xfrmi_create2(dev);
+ if (err < 0)
+ goto failed_dev_put;
+
+ return xi;
+
+failed_dev_put:
+ dev_put(xi->phydev);
+failed_free:
+ free_netdev(dev);
+failed:
+ return ERR_PTR(err);
+}
+
+static struct xfrm_if *xfrmi_locate(struct net *net, struct xfrm_if_parms *p,
+ int create)
+{
+ struct xfrm_if __rcu **xip;
+ struct xfrm_if *xi;
+ struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id);
+
+ for (xip = &xfrmn->xfrmi[0];
+ (xi = rtnl_dereference(*xip)) != NULL;
+ xip = &xi->next) {
+ if (xi->p.if_id == p->if_id) {
+ if (create)
+ return ERR_PTR(-EEXIST);
+
+ return xi;
+ }
+ }
+ if (!create)
+ return ERR_PTR(-ENODEV);
+ return xfrmi_create(net, p);
+}
+
+static void xfrmi_dev_uninit(struct net_device *dev)
+{
+ struct xfrm_if *xi = netdev_priv(dev);
+ struct xfrmi_net *xfrmn = net_generic(xi->net, xfrmi_net_id);
+
+ xfrmi_unlink(xfrmn, xi);
+ dev_put(xi->phydev);
+ dev_put(dev);
+}
+
+static void xfrmi_scrub_packet(struct sk_buff *skb, bool xnet)
+{
+ skb->tstamp = 0;
+ skb->pkt_type = PACKET_HOST;
+ skb->skb_iif = 0;
+ skb->ignore_df = 0;
+ skb_dst_drop(skb);
+ nf_reset(skb);
+ nf_reset_trace(skb);
+
+ if (!xnet)
+ return;
+
+ ipvs_reset(skb);
+ secpath_reset(skb);
+ skb_orphan(skb);
+ skb->mark = 0;
+}
+
+static int xfrmi_rcv_cb(struct sk_buff *skb, int err)
+{
+ struct pcpu_sw_netstats *tstats;
+ struct xfrm_mode *inner_mode;
+ struct net_device *dev;
+ struct xfrm_state *x;
+ struct xfrm_if *xi;
+ bool xnet;
+
+ if (err && !skb->sp)
+ return 0;
+
+ x = xfrm_input_state(skb);
+
+ xi = xfrmi_lookup(xs_net(x), x);
+ if (!xi)
+ return 1;
+
+ dev = xi->dev;
+ skb->dev = dev;
+
+ if (err) {
+ dev->stats.rx_errors++;
+ dev->stats.rx_dropped++;
+
+ return 0;
+ }
+
+ xnet = !net_eq(xi->net, dev_net(skb->dev));
+
+ if (xnet) {
+ inner_mode = x->inner_mode;
+
+ if (x->sel.family == AF_UNSPEC) {
+ inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol);
+ if (inner_mode == NULL) {
+ XFRM_INC_STATS(dev_net(skb->dev),
+ LINUX_MIB_XFRMINSTATEMODEERROR);
+ return -EINVAL;
+ }
+ }
+
+ if (!xfrm_policy_check(NULL, XFRM_POLICY_IN, skb,
+ inner_mode->afinfo->family))
+ return -EPERM;
+ }
+
+ xfrmi_scrub_packet(skb, xnet);
+
+ tstats = this_cpu_ptr(dev->tstats);
+
+ u64_stats_update_begin(&tstats->syncp);
+ tstats->rx_packets++;
+ tstats->rx_bytes += skb->len;
+ u64_stats_update_end(&tstats->syncp);
+
+ return 0;
+}
+
+static int
+xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl)
+{
+ struct xfrm_if *xi = netdev_priv(dev);
+ struct net_device_stats *stats = &xi->dev->stats;
+ struct dst_entry *dst = skb_dst(skb);
+ unsigned int length = skb->len;
+ struct net_device *tdev;
+ struct xfrm_state *x;
+ int err = -1;
+ int mtu;
+
+ if (!dst)
+ goto tx_err_link_failure;
+
+ dst_hold(dst);
+ dst = xfrm_lookup_with_ifid(xi->net, dst, fl, NULL, 0, xi->p.if_id);
+ if (IS_ERR(dst)) {
+ err = PTR_ERR(dst);
+ dst = NULL;
+ goto tx_err_link_failure;
+ }
+
+ x = dst->xfrm;
+ if (!x)
+ goto tx_err_link_failure;
+
+ if (x->if_id != xi->p.if_id)
+ goto tx_err_link_failure;
+
+ tdev = dst->dev;
+
+ if (tdev == dev) {
+ stats->collisions++;
+ net_warn_ratelimited("%s: Local routing loop detected!\n",
+ xi->p.name);
+ goto tx_err_dst_release;
+ }
+
+ mtu = dst_mtu(dst);
+ if (!skb->ignore_df && skb->len > mtu) {
+ skb_dst_update_pmtu(skb, mtu);
+
+ if (skb->protocol == htons(ETH_P_IPV6)) {
+ if (mtu < IPV6_MIN_MTU)
+ mtu = IPV6_MIN_MTU;
+
+ icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ } else {
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
+ htonl(mtu));
+ }
+
+ dst_release(dst);
+ return -EMSGSIZE;
+ }
+
+ xfrmi_scrub_packet(skb, !net_eq(xi->net, dev_net(dev)));
+ skb_dst_set(skb, dst);
+ skb->dev = tdev;
+
+ err = dst_output(xi->net, skb->sk, skb);
+ if (net_xmit_eval(err) == 0) {
+ struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats);
+
+ u64_stats_update_begin(&tstats->syncp);
+ tstats->tx_bytes += length;
+ tstats->tx_packets++;
+ u64_stats_update_end(&tstats->syncp);
+ } else {
+ stats->tx_errors++;
+ stats->tx_aborted_errors++;
+ }
+
+ return 0;
+tx_err_link_failure:
+ stats->tx_carrier_errors++;
+ dst_link_failure(skb);
+tx_err_dst_release:
+ dst_release(dst);
+ return err;
+}
+
+static netdev_tx_t xfrmi_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct xfrm_if *xi = netdev_priv(dev);
+ struct net_device_stats *stats = &xi->dev->stats;
+ struct flowi fl;
+ int ret;
+
+ memset(&fl, 0, sizeof(fl));
+
+ switch (skb->protocol) {
+ case htons(ETH_P_IPV6):
+ xfrm_decode_session(skb, &fl, AF_INET6);
+ memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));
+ break;
+ case htons(ETH_P_IP):
+ xfrm_decode_session(skb, &fl, AF_INET);
+ memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
+ break;
+ default:
+ goto tx_err;
+ }
+
+ fl.flowi_oif = xi->phydev->ifindex;
+
+ ret = xfrmi_xmit2(skb, dev, &fl);
+ if (ret < 0)
+ goto tx_err;
+
+ return NETDEV_TX_OK;
+
+tx_err:
+ stats->tx_errors++;
+ stats->tx_dropped++;
+ kfree_skb(skb);
+ return NETDEV_TX_OK;
+}
+
+static int xfrmi4_err(struct sk_buff *skb, u32 info)
+{
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
+ struct net *net = dev_net(skb->dev);
+ int protocol = iph->protocol;
+ struct ip_comp_hdr *ipch;
+ struct ip_esp_hdr *esph;
+ struct ip_auth_hdr *ah ;
+ struct xfrm_state *x;
+ struct xfrm_if *xi;
+ __be32 spi;
+
+ switch (protocol) {
+ case IPPROTO_ESP:
+ esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2));
+ spi = esph->spi;
+ break;
+ case IPPROTO_AH:
+ ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2));
+ spi = ah->spi;
+ break;
+ case IPPROTO_COMP:
+ ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2));
+ spi = htonl(ntohs(ipch->cpi));
+ break;
+ default:
+ return 0;
+ }
+
+ switch (icmp_hdr(skb)->type) {
+ case ICMP_DEST_UNREACH:
+ if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
+ return 0;
+ case ICMP_REDIRECT:
+ break;
+ default:
+ return 0;
+ }
+
+ x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
+ spi, protocol, AF_INET);
+ if (!x)
+ return 0;
+
+ xi = xfrmi_lookup(net, x);
+ if (!xi) {
+ xfrm_state_put(x);
+ return -1;
+ }
+
+ if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
+ ipv4_update_pmtu(skb, net, info, 0, protocol);
+ else
+ ipv4_redirect(skb, net, 0, protocol);
+ xfrm_state_put(x);
+
+ return 0;
+}
+
+static int xfrmi6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, __be32 info)
+{
+ const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data;
+ struct net *net = dev_net(skb->dev);
+ int protocol = iph->nexthdr;
+ struct ip_comp_hdr *ipch;
+ struct ip_esp_hdr *esph;
+ struct ip_auth_hdr *ah;
+ struct xfrm_state *x;
+ struct xfrm_if *xi;
+ __be32 spi;
+
+ switch (protocol) {
+ case IPPROTO_ESP:
+ esph = (struct ip_esp_hdr *)(skb->data + offset);
+ spi = esph->spi;
+ break;
+ case IPPROTO_AH:
+ ah = (struct ip_auth_hdr *)(skb->data + offset);
+ spi = ah->spi;
+ break;
+ case IPPROTO_COMP:
+ ipch = (struct ip_comp_hdr *)(skb->data + offset);
+ spi = htonl(ntohs(ipch->cpi));
+ break;
+ default:
+ return 0;
+ }
+
+ if (type != ICMPV6_PKT_TOOBIG &&
+ type != NDISC_REDIRECT)
+ return 0;
+
+ x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
+ spi, protocol, AF_INET6);
+ if (!x)
+ return 0;
+
+ xi = xfrmi_lookup(net, x);
+ if (!xi) {
+ xfrm_state_put(x);
+ return -1;
+ }
+
+ if (type == NDISC_REDIRECT)
+ ip6_redirect(skb, net, skb->dev->ifindex, 0,
+ sock_net_uid(net, NULL));
+ else
+ ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL));
+ xfrm_state_put(x);
+
+ return 0;
+}
+
+static int xfrmi_change(struct xfrm_if *xi, const struct xfrm_if_parms *p)
+{
+ if (xi->p.link != p->link)
+ return -EINVAL;
+
+ xi->p.if_id = p->if_id;
+
+ return 0;
+}
+
+static int xfrmi_update(struct xfrm_if *xi, struct xfrm_if_parms *p)
+{
+ struct net *net = dev_net(xi->dev);
+ struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id);
+ int err;
+
+ xfrmi_unlink(xfrmn, xi);
+ synchronize_net();
+ err = xfrmi_change(xi, p);
+ xfrmi_link(xfrmn, xi);
+ netdev_state_change(xi->dev);
+ return err;
+}
+
+static void xfrmi_get_stats64(struct net_device *dev,
+ struct rtnl_link_stats64 *s)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct pcpu_sw_netstats *stats;
+ struct pcpu_sw_netstats tmp;
+ int start;
+
+ stats = per_cpu_ptr(dev->tstats, cpu);
+ do {
+ start = u64_stats_fetch_begin_irq(&stats->syncp);
+ tmp.rx_packets = stats->rx_packets;
+ tmp.rx_bytes = stats->rx_bytes;
+ tmp.tx_packets = stats->tx_packets;
+ tmp.tx_bytes = stats->tx_bytes;
+ } while (u64_stats_fetch_retry_irq(&stats->syncp, start));
+
+ s->rx_packets += tmp.rx_packets;
+ s->rx_bytes += tmp.rx_bytes;
+ s->tx_packets += tmp.tx_packets;
+ s->tx_bytes += tmp.tx_bytes;
+ }
+
+ s->rx_dropped = dev->stats.rx_dropped;
+ s->tx_dropped = dev->stats.tx_dropped;
+}
+
+static int xfrmi_get_iflink(const struct net_device *dev)
+{
+ struct xfrm_if *xi = netdev_priv(dev);
+
+ return xi->phydev->ifindex;
+}
+
+
+static const struct net_device_ops xfrmi_netdev_ops = {
+ .ndo_init = xfrmi_dev_init,
+ .ndo_uninit = xfrmi_dev_uninit,
+ .ndo_start_xmit = xfrmi_xmit,
+ .ndo_get_stats64 = xfrmi_get_stats64,
+ .ndo_get_iflink = xfrmi_get_iflink,
+};
+
+static void xfrmi_dev_setup(struct net_device *dev)
+{
+ dev->netdev_ops = &xfrmi_netdev_ops;
+ dev->type = ARPHRD_NONE;
+ dev->hard_header_len = ETH_HLEN;
+ dev->min_header_len = ETH_HLEN;
+ dev->mtu = ETH_DATA_LEN;
+ dev->min_mtu = ETH_MIN_MTU;
+ dev->max_mtu = ETH_DATA_LEN;
+ dev->addr_len = ETH_ALEN;
+ dev->flags = IFF_NOARP;
+ dev->needs_free_netdev = true;
+ dev->priv_destructor = xfrmi_dev_free;
+ netif_keep_dst(dev);
+}
+
+static int xfrmi_dev_init(struct net_device *dev)
+{
+ struct xfrm_if *xi = netdev_priv(dev);
+ struct net_device *phydev = xi->phydev;
+ int err;
+
+ dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
+ if (!dev->tstats)
+ return -ENOMEM;
+
+ err = gro_cells_init(&xi->gro_cells, dev);
+ if (err) {
+ free_percpu(dev->tstats);
+ return err;
+ }
+
+ dev->features |= NETIF_F_LLTX;
+
+ dev->needed_headroom = phydev->needed_headroom;
+ dev->needed_tailroom = phydev->needed_tailroom;
+
+ if (is_zero_ether_addr(dev->dev_addr))
+ eth_hw_addr_inherit(dev, phydev);
+ if (is_zero_ether_addr(dev->broadcast))
+ memcpy(dev->broadcast, phydev->broadcast, dev->addr_len);
+
+ return 0;
+}
+
+static int xfrmi_validate(struct nlattr *tb[], struct nlattr *data[],
+ struct netlink_ext_ack *extack)
+{
+ return 0;
+}
+
+static void xfrmi_netlink_parms(struct nlattr *data[],
+ struct xfrm_if_parms *parms)
+{
+ memset(parms, 0, sizeof(*parms));
+
+ if (!data)
+ return;
+
+ if (data[IFLA_XFRM_LINK])
+ parms->link = nla_get_u32(data[IFLA_XFRM_LINK]);
+
+ if (data[IFLA_XFRM_IF_ID])
+ parms->if_id = nla_get_u32(data[IFLA_XFRM_IF_ID]);
+}
+
+static int xfrmi_newlink(struct net *src_net, struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[],
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = dev_net(dev);
+ struct xfrm_if_parms *p;
+ struct xfrm_if *xi;
+
+ xi = netdev_priv(dev);
+ p = &xi->p;
+
+ xfrmi_netlink_parms(data, p);
+
+ if (!tb[IFLA_IFNAME])
+ return -EINVAL;
+
+ nla_strlcpy(p->name, tb[IFLA_IFNAME], IFNAMSIZ);
+
+ xi = xfrmi_locate(net, p, 1);
+ return PTR_ERR_OR_ZERO(xi);
+}
+
+static void xfrmi_dellink(struct net_device *dev, struct list_head *head)
+{
+ unregister_netdevice_queue(dev, head);
+}
+
+static int xfrmi_changelink(struct net_device *dev, struct nlattr *tb[],
+ struct nlattr *data[],
+ struct netlink_ext_ack *extack)
+{
+ struct xfrm_if *xi = netdev_priv(dev);
+ struct net *net = dev_net(dev);
+
+ xfrmi_netlink_parms(data, &xi->p);
+
+ xi = xfrmi_locate(net, &xi->p, 0);
+
+ if (IS_ERR_OR_NULL(xi)) {
+ xi = netdev_priv(dev);
+ } else {
+ if (xi->dev != dev)
+ return -EEXIST;
+ }
+
+ return xfrmi_update(xi, &xi->p);
+}
+
+static size_t xfrmi_get_size(const struct net_device *dev)
+{
+ return
+ /* IFLA_XFRM_LINK */
+ nla_total_size(4) +
+ /* IFLA_XFRM_IF_ID */
+ nla_total_size(4) +
+ 0;
+}
+
+static int xfrmi_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+ struct xfrm_if *xi = netdev_priv(dev);
+ struct xfrm_if_parms *parm = &xi->p;
+
+ if (nla_put_u32(skb, IFLA_XFRM_LINK, parm->link) ||
+ nla_put_u32(skb, IFLA_XFRM_IF_ID, parm->if_id))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static struct net *xfrmi_get_link_net(const struct net_device *dev)
+{
+ struct xfrm_if *xi = netdev_priv(dev);
+
+ return dev_net(xi->phydev);
+}
+
+static const struct nla_policy xfrmi_policy[IFLA_XFRM_MAX + 1] = {
+ [IFLA_XFRM_LINK] = { .type = NLA_U32 },
+ [IFLA_XFRM_IF_ID] = { .type = NLA_U32 },
+};
+
+static struct rtnl_link_ops xfrmi_link_ops __read_mostly = {
+ .kind = "xfrm",
+ .maxtype = IFLA_XFRM_MAX,
+ .policy = xfrmi_policy,
+ .priv_size = sizeof(struct xfrm_if),
+ .setup = xfrmi_dev_setup,
+ .validate = xfrmi_validate,
+ .newlink = xfrmi_newlink,
+ .dellink = xfrmi_dellink,
+ .changelink = xfrmi_changelink,
+ .get_size = xfrmi_get_size,
+ .fill_info = xfrmi_fill_info,
+ .get_link_net = xfrmi_get_link_net,
+};
+
+static void __net_exit xfrmi_destroy_interfaces(struct xfrmi_net *xfrmn)
+{
+ struct xfrm_if *xi;
+ LIST_HEAD(list);
+
+ xi = rtnl_dereference(xfrmn->xfrmi[0]);
+ if (!xi)
+ return;
+
+ unregister_netdevice_queue(xi->dev, &list);
+ unregister_netdevice_many(&list);
+}
+
+static int __net_init xfrmi_init_net(struct net *net)
+{
+ return 0;
+}
+
+static void __net_exit xfrmi_exit_net(struct net *net)
+{
+ struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id);
+
+ rtnl_lock();
+ xfrmi_destroy_interfaces(xfrmn);
+ rtnl_unlock();
+}
+
+static struct pernet_operations xfrmi_net_ops = {
+ .init = xfrmi_init_net,
+ .exit = xfrmi_exit_net,
+ .id = &xfrmi_net_id,
+ .size = sizeof(struct xfrmi_net),
+};
+
+static struct xfrm6_protocol xfrmi_esp6_protocol __read_mostly = {
+ .handler = xfrm6_rcv,
+ .cb_handler = xfrmi_rcv_cb,
+ .err_handler = xfrmi6_err,
+ .priority = 10,
+};
+
+static struct xfrm6_protocol xfrmi_ah6_protocol __read_mostly = {
+ .handler = xfrm6_rcv,
+ .cb_handler = xfrmi_rcv_cb,
+ .err_handler = xfrmi6_err,
+ .priority = 10,
+};
+
+static struct xfrm6_protocol xfrmi_ipcomp6_protocol __read_mostly = {
+ .handler = xfrm6_rcv,
+ .cb_handler = xfrmi_rcv_cb,
+ .err_handler = xfrmi6_err,
+ .priority = 10,
+};
+
+static struct xfrm4_protocol xfrmi_esp4_protocol __read_mostly = {
+ .handler = xfrm4_rcv,
+ .input_handler = xfrm_input,
+ .cb_handler = xfrmi_rcv_cb,
+ .err_handler = xfrmi4_err,
+ .priority = 10,
+};
+
+static struct xfrm4_protocol xfrmi_ah4_protocol __read_mostly = {
+ .handler = xfrm4_rcv,
+ .input_handler = xfrm_input,
+ .cb_handler = xfrmi_rcv_cb,
+ .err_handler = xfrmi4_err,
+ .priority = 10,
+};
+
+static struct xfrm4_protocol xfrmi_ipcomp4_protocol __read_mostly = {
+ .handler = xfrm4_rcv,
+ .input_handler = xfrm_input,
+ .cb_handler = xfrmi_rcv_cb,
+ .err_handler = xfrmi4_err,
+ .priority = 10,
+};
+
+static int __init xfrmi4_init(void)
+{
+ int err;
+
+ err = xfrm4_protocol_register(&xfrmi_esp4_protocol, IPPROTO_ESP);
+ if (err < 0)
+ goto xfrm_proto_esp_failed;
+ err = xfrm4_protocol_register(&xfrmi_ah4_protocol, IPPROTO_AH);
+ if (err < 0)
+ goto xfrm_proto_ah_failed;
+ err = xfrm4_protocol_register(&xfrmi_ipcomp4_protocol, IPPROTO_COMP);
+ if (err < 0)
+ goto xfrm_proto_comp_failed;
+
+ return 0;
+
+xfrm_proto_comp_failed:
+ xfrm4_protocol_deregister(&xfrmi_ah4_protocol, IPPROTO_AH);
+xfrm_proto_ah_failed:
+ xfrm4_protocol_deregister(&xfrmi_esp4_protocol, IPPROTO_ESP);
+xfrm_proto_esp_failed:
+ return err;
+}
+
+static void xfrmi4_fini(void)
+{
+ xfrm4_protocol_deregister(&xfrmi_ipcomp4_protocol, IPPROTO_COMP);
+ xfrm4_protocol_deregister(&xfrmi_ah4_protocol, IPPROTO_AH);
+ xfrm4_protocol_deregister(&xfrmi_esp4_protocol, IPPROTO_ESP);
+}
+
+static int __init xfrmi6_init(void)
+{
+ int err;
+
+ err = xfrm6_protocol_register(&xfrmi_esp6_protocol, IPPROTO_ESP);
+ if (err < 0)
+ goto xfrm_proto_esp_failed;
+ err = xfrm6_protocol_register(&xfrmi_ah6_protocol, IPPROTO_AH);
+ if (err < 0)
+ goto xfrm_proto_ah_failed;
+ err = xfrm6_protocol_register(&xfrmi_ipcomp6_protocol, IPPROTO_COMP);
+ if (err < 0)
+ goto xfrm_proto_comp_failed;
+
+ return 0;
+
+xfrm_proto_comp_failed:
+ xfrm6_protocol_deregister(&xfrmi_ah6_protocol, IPPROTO_AH);
+xfrm_proto_ah_failed:
+ xfrm6_protocol_deregister(&xfrmi_esp6_protocol, IPPROTO_ESP);
+xfrm_proto_esp_failed:
+ return err;
+}
+
+static void xfrmi6_fini(void)
+{
+ xfrm6_protocol_deregister(&xfrmi_ipcomp6_protocol, IPPROTO_COMP);
+ xfrm6_protocol_deregister(&xfrmi_ah6_protocol, IPPROTO_AH);
+ xfrm6_protocol_deregister(&xfrmi_esp6_protocol, IPPROTO_ESP);
+}
+
+static const struct xfrm_if_cb xfrm_if_cb = {
+ .decode_session = xfrmi_decode_session,
+};
+
+static int __init xfrmi_init(void)
+{
+ const char *msg;
+ int err;
+
+ pr_info("IPsec XFRM device driver\n");
+
+ msg = "tunnel device";
+ err = register_pernet_device(&xfrmi_net_ops);
+ if (err < 0)
+ goto pernet_dev_failed;
+
+ msg = "xfrm4 protocols";
+ err = xfrmi4_init();
+ if (err < 0)
+ goto xfrmi4_failed;
+
+ msg = "xfrm6 protocols";
+ err = xfrmi6_init();
+ if (err < 0)
+ goto xfrmi6_failed;
+
+
+ msg = "netlink interface";
+ err = rtnl_link_register(&xfrmi_link_ops);
+ if (err < 0)
+ goto rtnl_link_failed;
+
+ xfrm_if_register_cb(&xfrm_if_cb);
+
+ return err;
+
+rtnl_link_failed:
+ xfrmi6_fini();
+xfrmi6_failed:
+ xfrmi4_fini();
+xfrmi4_failed:
+ unregister_pernet_device(&xfrmi_net_ops);
+pernet_dev_failed:
+ pr_err("xfrmi init: failed to register %s\n", msg);
+ return err;
+}
+
+static void __exit xfrmi_fini(void)
+{
+ xfrm_if_unregister_cb();
+ rtnl_link_unregister(&xfrmi_link_ops);
+ xfrmi4_fini();
+ xfrmi6_fini();
+ unregister_pernet_device(&xfrmi_net_ops);
+}
+
+module_init(xfrmi_init);
+module_exit(xfrmi_fini);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_RTNL_LINK("xfrm");
+MODULE_ALIAS_NETDEV("xfrm0");
+MODULE_AUTHOR("Steffen Klassert");
+MODULE_DESCRIPTION("XFRM virtual interface");
diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c
index 89b178a78dc7..4ae87c5ce2e3 100644
--- a/net/xfrm/xfrm_output.c
+++ b/net/xfrm/xfrm_output.c
@@ -66,8 +66,7 @@ static int xfrm_output_one(struct sk_buff *skb, int err)
goto error_nolock;
}
- if (x->props.output_mark)
- skb->mark = x->props.output_mark;
+ skb->mark = xfrm_smark_get(skb->mark, x);
err = x->outer_mode->output(x, skb);
if (err) {
@@ -101,6 +100,10 @@ static int xfrm_output_one(struct sk_buff *skb, int err)
spin_unlock_bh(&x->lock);
skb_dst_force(skb);
+ if (!skb_dst(skb)) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR);
+ goto error_nolock;
+ }
if (xfrm_offload(skb)) {
x->type_offload->encap(x, skb);
@@ -190,7 +193,7 @@ static int xfrm_output_gso(struct net *net, struct sock *sk, struct sk_buff *skb
struct sk_buff *nskb = segs->next;
int err;
- segs->next = NULL;
+ skb_mark_not_on_list(segs);
err = xfrm_output2(net, sk, segs);
if (unlikely(err)) {
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 7c5e8978aeaa..119a427d9b2b 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -45,8 +45,9 @@ struct xfrm_flo {
u8 flags;
};
-static DEFINE_PER_CPU(struct xfrm_dst *, xfrm_last_dst);
-static struct work_struct *xfrm_pcpu_work __read_mostly;
+static DEFINE_SPINLOCK(xfrm_if_cb_lock);
+static struct xfrm_if_cb const __rcu *xfrm_if_cb __read_mostly;
+
static DEFINE_SPINLOCK(xfrm_policy_afinfo_lock);
static struct xfrm_policy_afinfo const __rcu *xfrm_policy_afinfo[AF_INET6 + 1]
__read_mostly;
@@ -119,6 +120,12 @@ static const struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short fa
return afinfo;
}
+/* Called with rcu_read_lock(). */
+static const struct xfrm_if_cb *xfrm_if_get_cb(void)
+{
+ return rcu_dereference(xfrm_if_cb);
+}
+
struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, int oif,
const xfrm_address_t *saddr,
const xfrm_address_t *daddr,
@@ -182,8 +189,8 @@ static inline unsigned long make_jiffies(long secs)
static void xfrm_policy_timer(struct timer_list *t)
{
struct xfrm_policy *xp = from_timer(xp, t, timer);
- unsigned long now = get_seconds();
- long next = LONG_MAX;
+ time64_t now = ktime_get_real_seconds();
+ time64_t next = TIME64_MAX;
int warn = 0;
int dir;
@@ -195,7 +202,7 @@ static void xfrm_policy_timer(struct timer_list *t)
dir = xfrm_policy_id2dir(xp->index);
if (xp->lft.hard_add_expires_seconds) {
- long tmo = xp->lft.hard_add_expires_seconds +
+ time64_t tmo = xp->lft.hard_add_expires_seconds +
xp->curlft.add_time - now;
if (tmo <= 0)
goto expired;
@@ -203,7 +210,7 @@ static void xfrm_policy_timer(struct timer_list *t)
next = tmo;
}
if (xp->lft.hard_use_expires_seconds) {
- long tmo = xp->lft.hard_use_expires_seconds +
+ time64_t tmo = xp->lft.hard_use_expires_seconds +
(xp->curlft.use_time ? : xp->curlft.add_time) - now;
if (tmo <= 0)
goto expired;
@@ -211,7 +218,7 @@ static void xfrm_policy_timer(struct timer_list *t)
next = tmo;
}
if (xp->lft.soft_add_expires_seconds) {
- long tmo = xp->lft.soft_add_expires_seconds +
+ time64_t tmo = xp->lft.soft_add_expires_seconds +
xp->curlft.add_time - now;
if (tmo <= 0) {
warn = 1;
@@ -221,7 +228,7 @@ static void xfrm_policy_timer(struct timer_list *t)
next = tmo;
}
if (xp->lft.soft_use_expires_seconds) {
- long tmo = xp->lft.soft_use_expires_seconds +
+ time64_t tmo = xp->lft.soft_use_expires_seconds +
(xp->curlft.use_time ? : xp->curlft.add_time) - now;
if (tmo <= 0) {
warn = 1;
@@ -233,7 +240,7 @@ static void xfrm_policy_timer(struct timer_list *t)
if (warn)
km_policy_expired(xp, dir, 0, 0);
- if (next != LONG_MAX &&
+ if (next != TIME64_MAX &&
!mod_timer(&xp->timer, jiffies + make_jiffies(next)))
xfrm_pol_hold(xp);
@@ -625,9 +632,9 @@ static void xfrm_hash_rebuild(struct work_struct *work)
break;
}
if (newpos)
- hlist_add_behind(&policy->bydst, newpos);
+ hlist_add_behind_rcu(&policy->bydst, newpos);
else
- hlist_add_head(&policy->bydst, chain);
+ hlist_add_head_rcu(&policy->bydst, chain);
}
spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
@@ -747,6 +754,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
newpos = NULL;
hlist_for_each_entry(pol, chain, bydst) {
if (pol->type == policy->type &&
+ pol->if_id == policy->if_id &&
!selector_cmp(&pol->selector, &policy->selector) &&
xfrm_policy_mark_match(policy, pol) &&
xfrm_sec_ctx_match(pol->security, policy->security) &&
@@ -766,9 +774,9 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
break;
}
if (newpos)
- hlist_add_behind(&policy->bydst, newpos);
+ hlist_add_behind_rcu(&policy->bydst, newpos);
else
- hlist_add_head(&policy->bydst, chain);
+ hlist_add_head_rcu(&policy->bydst, chain);
__xfrm_policy_link(policy, dir);
/* After previous checking, family can either be AF_INET or AF_INET6 */
@@ -783,7 +791,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
}
policy->index = delpol ? delpol->index : xfrm_gen_index(net, dir, policy->index);
hlist_add_head(&policy->byidx, net->xfrm.policy_byidx+idx_hash(net, policy->index));
- policy->curlft.add_time = get_seconds();
+ policy->curlft.add_time = ktime_get_real_seconds();
policy->curlft.use_time = 0;
if (!mod_timer(&policy->timer, jiffies + HZ))
xfrm_pol_hold(policy);
@@ -798,8 +806,9 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
}
EXPORT_SYMBOL(xfrm_policy_insert);
-struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type,
- int dir, struct xfrm_selector *sel,
+struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u32 if_id,
+ u8 type, int dir,
+ struct xfrm_selector *sel,
struct xfrm_sec_ctx *ctx, int delete,
int *err)
{
@@ -812,6 +821,7 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type,
ret = NULL;
hlist_for_each_entry(pol, chain, bydst) {
if (pol->type == type &&
+ pol->if_id == if_id &&
(mark & pol->mark.m) == pol->mark.v &&
!selector_cmp(sel, &pol->selector) &&
xfrm_sec_ctx_match(ctx, pol->security)) {
@@ -837,8 +847,9 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type,
}
EXPORT_SYMBOL(xfrm_policy_bysel_ctx);
-struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type,
- int dir, u32 id, int delete, int *err)
+struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u32 if_id,
+ u8 type, int dir, u32 id, int delete,
+ int *err)
{
struct xfrm_policy *pol, *ret;
struct hlist_head *chain;
@@ -853,6 +864,7 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type,
ret = NULL;
hlist_for_each_entry(pol, chain, byidx) {
if (pol->type == type && pol->index == id &&
+ pol->if_id == if_id &&
(mark & pol->mark.m) == pol->mark.v) {
xfrm_pol_hold(pol);
if (delete) {
@@ -1056,13 +1068,14 @@ EXPORT_SYMBOL(xfrm_policy_walk_done);
*/
static int xfrm_policy_match(const struct xfrm_policy *pol,
const struct flowi *fl,
- u8 type, u16 family, int dir)
+ u8 type, u16 family, int dir, u32 if_id)
{
const struct xfrm_selector *sel = &pol->selector;
int ret = -ESRCH;
bool match;
if (pol->family != family ||
+ pol->if_id != if_id ||
(fl->flowi_mark & pol->mark.m) != pol->mark.v ||
pol->type != type)
return ret;
@@ -1077,7 +1090,8 @@ static int xfrm_policy_match(const struct xfrm_policy *pol,
static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
const struct flowi *fl,
- u16 family, u8 dir)
+ u16 family, u8 dir,
+ u32 if_id)
{
int err;
struct xfrm_policy *pol, *ret;
@@ -1101,7 +1115,7 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
priority = ~0U;
ret = NULL;
hlist_for_each_entry_rcu(pol, chain, bydst) {
- err = xfrm_policy_match(pol, fl, type, family, dir);
+ err = xfrm_policy_match(pol, fl, type, family, dir, if_id);
if (err) {
if (err == -ESRCH)
continue;
@@ -1120,7 +1134,7 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
if ((pol->priority >= priority) && ret)
break;
- err = xfrm_policy_match(pol, fl, type, family, dir);
+ err = xfrm_policy_match(pol, fl, type, family, dir, if_id);
if (err) {
if (err == -ESRCH)
continue;
@@ -1145,21 +1159,25 @@ fail:
return ret;
}
-static struct xfrm_policy *
-xfrm_policy_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir)
+static struct xfrm_policy *xfrm_policy_lookup(struct net *net,
+ const struct flowi *fl,
+ u16 family, u8 dir, u32 if_id)
{
#ifdef CONFIG_XFRM_SUB_POLICY
struct xfrm_policy *pol;
- pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family, dir);
+ pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family,
+ dir, if_id);
if (pol != NULL)
return pol;
#endif
- return xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, dir);
+ return xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family,
+ dir, if_id);
}
static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir,
- const struct flowi *fl, u16 family)
+ const struct flowi *fl,
+ u16 family, u32 if_id)
{
struct xfrm_policy *pol;
@@ -1177,7 +1195,8 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir,
match = xfrm_selector_match(&pol->selector, fl, family);
if (match) {
- if ((sk->sk_mark & pol->mark.m) != pol->mark.v) {
+ if ((sk->sk_mark & pol->mark.m) != pol->mark.v ||
+ pol->if_id != if_id) {
pol = NULL;
goto out;
}
@@ -1268,7 +1287,7 @@ int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol)
old_pol = rcu_dereference_protected(sk->sk_policy[dir],
lockdep_is_held(&net->xfrm.xfrm_policy_lock));
if (pol) {
- pol->curlft.add_time = get_seconds();
+ pol->curlft.add_time = ktime_get_real_seconds();
pol->index = xfrm_gen_index(net, XFRM_POLICY_MAX+dir, 0);
xfrm_sk_policy_link(pol, dir);
}
@@ -1305,6 +1324,7 @@ static struct xfrm_policy *clone_policy(const struct xfrm_policy *old, int dir)
newp->lft = old->lft;
newp->curlft = old->curlft;
newp->mark = old->mark;
+ newp->if_id = old->if_id;
newp->action = old->action;
newp->flags = old->flags;
newp->xfrm_nr = old->xfrm_nr;
@@ -1390,7 +1410,8 @@ xfrm_tmpl_resolve_one(struct xfrm_policy *policy, const struct flowi *fl,
}
}
- x = xfrm_state_find(remote, local, fl, tmpl, policy, &error, family);
+ x = xfrm_state_find(remote, local, fl, tmpl, policy, &error,
+ family, policy->if_id);
if (x && x->km.state == XFRM_STATE_VALID) {
xfrm[nx++] = x;
@@ -1607,10 +1628,11 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
dst_copy_metrics(dst1, dst);
if (xfrm[i]->props.mode != XFRM_MODE_TRANSPORT) {
+ __u32 mark = xfrm_smark_get(fl->flowi_mark, xfrm[i]);
+
family = xfrm[i]->props.family;
dst = xfrm_dst_lookup(xfrm[i], tos, fl->flowi_oif,
- &saddr, &daddr, family,
- xfrm[i]->props.output_mark);
+ &saddr, &daddr, family, mark);
err = PTR_ERR(dst);
if (IS_ERR(dst))
goto put_states;
@@ -1692,7 +1714,8 @@ static int xfrm_expand_policies(const struct flowi *fl, u16 family,
pols[1] = xfrm_policy_lookup_bytype(xp_net(pols[0]),
XFRM_POLICY_TYPE_MAIN,
fl, family,
- XFRM_POLICY_OUT);
+ XFRM_POLICY_OUT,
+ pols[0]->if_id);
if (pols[1]) {
if (IS_ERR(pols[1])) {
xfrm_pols_put(pols, *num_pols);
@@ -1714,108 +1737,6 @@ static int xfrm_expand_policies(const struct flowi *fl, u16 family,
}
-static void xfrm_last_dst_update(struct xfrm_dst *xdst, struct xfrm_dst *old)
-{
- this_cpu_write(xfrm_last_dst, xdst);
- if (old)
- dst_release(&old->u.dst);
-}
-
-static void __xfrm_pcpu_work_fn(void)
-{
- struct xfrm_dst *old;
-
- old = this_cpu_read(xfrm_last_dst);
- if (old && !xfrm_bundle_ok(old))
- xfrm_last_dst_update(NULL, old);
-}
-
-static void xfrm_pcpu_work_fn(struct work_struct *work)
-{
- local_bh_disable();
- rcu_read_lock();
- __xfrm_pcpu_work_fn();
- rcu_read_unlock();
- local_bh_enable();
-}
-
-void xfrm_policy_cache_flush(void)
-{
- struct xfrm_dst *old;
- bool found = false;
- int cpu;
-
- might_sleep();
-
- local_bh_disable();
- rcu_read_lock();
- for_each_possible_cpu(cpu) {
- old = per_cpu(xfrm_last_dst, cpu);
- if (old && !xfrm_bundle_ok(old)) {
- if (smp_processor_id() == cpu) {
- __xfrm_pcpu_work_fn();
- continue;
- }
- found = true;
- break;
- }
- }
-
- rcu_read_unlock();
- local_bh_enable();
-
- if (!found)
- return;
-
- get_online_cpus();
-
- for_each_possible_cpu(cpu) {
- bool bundle_release;
-
- rcu_read_lock();
- old = per_cpu(xfrm_last_dst, cpu);
- bundle_release = old && !xfrm_bundle_ok(old);
- rcu_read_unlock();
-
- if (!bundle_release)
- continue;
-
- if (cpu_online(cpu)) {
- schedule_work_on(cpu, &xfrm_pcpu_work[cpu]);
- continue;
- }
-
- rcu_read_lock();
- old = per_cpu(xfrm_last_dst, cpu);
- if (old && !xfrm_bundle_ok(old)) {
- per_cpu(xfrm_last_dst, cpu) = NULL;
- dst_release(&old->u.dst);
- }
- rcu_read_unlock();
- }
-
- put_online_cpus();
-}
-
-static bool xfrm_xdst_can_reuse(struct xfrm_dst *xdst,
- struct xfrm_state * const xfrm[],
- int num)
-{
- const struct dst_entry *dst = &xdst->u.dst;
- int i;
-
- if (xdst->num_xfrms != num)
- return false;
-
- for (i = 0; i < num; i++) {
- if (!dst || dst->xfrm != xfrm[i])
- return false;
- dst = xfrm_dst_child(dst);
- }
-
- return xfrm_bundle_ok(xdst);
-}
-
static struct xfrm_dst *
xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols,
const struct flowi *fl, u16 family,
@@ -1824,34 +1745,21 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols,
struct net *net = xp_net(pols[0]);
struct xfrm_state *xfrm[XFRM_MAX_DEPTH];
struct xfrm_dst *bundle[XFRM_MAX_DEPTH];
- struct xfrm_dst *xdst, *old;
+ struct xfrm_dst *xdst;
struct dst_entry *dst;
int err;
/* Try to instantiate a bundle */
err = xfrm_tmpl_resolve(pols, num_pols, fl, xfrm, family);
if (err <= 0) {
- if (err != 0 && err != -EAGAIN)
+ if (err == 0)
+ return NULL;
+
+ if (err != -EAGAIN)
XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
return ERR_PTR(err);
}
- xdst = this_cpu_read(xfrm_last_dst);
- if (xdst &&
- xdst->u.dst.dev == dst_orig->dev &&
- xdst->num_pols == num_pols &&
- memcmp(xdst->pols, pols,
- sizeof(struct xfrm_policy *) * num_pols) == 0 &&
- xfrm_xdst_can_reuse(xdst, xfrm, err)) {
- dst_hold(&xdst->u.dst);
- xfrm_pols_put(pols, num_pols);
- while (err > 0)
- xfrm_state_put(xfrm[--err]);
- return xdst;
- }
-
- old = xdst;
-
dst = xfrm_bundle_create(pols[0], xfrm, bundle, err, fl, dst_orig);
if (IS_ERR(dst)) {
XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLEGENERROR);
@@ -1864,9 +1772,6 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols,
memcpy(xdst->pols, pols, sizeof(struct xfrm_policy *) * num_pols);
xdst->policy_genid = atomic_read(&pols[0]->genid);
- atomic_set(&xdst->u.dst.__refcnt, 2);
- xfrm_last_dst_update(xdst, old);
-
return xdst;
}
@@ -2047,8 +1952,10 @@ free_dst:
goto out;
}
-static struct xfrm_dst *
-xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir, struct xfrm_flo *xflo)
+static struct xfrm_dst *xfrm_bundle_lookup(struct net *net,
+ const struct flowi *fl,
+ u16 family, u8 dir,
+ struct xfrm_flo *xflo, u32 if_id)
{
struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
int num_pols = 0, num_xfrms = 0, err;
@@ -2057,7 +1964,7 @@ xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir,
/* Resolve policies to use if we couldn't get them from
* previous cache entry */
num_pols = 1;
- pols[0] = xfrm_policy_lookup(net, fl, family, dir);
+ pols[0] = xfrm_policy_lookup(net, fl, family, dir, if_id);
err = xfrm_expand_policies(fl, family, pols,
&num_pols, &num_xfrms);
if (err < 0)
@@ -2067,13 +1974,15 @@ xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir,
if (num_xfrms <= 0)
goto make_dummy_bundle;
- local_bh_disable();
xdst = xfrm_resolve_and_create_bundle(pols, num_pols, fl, family,
xflo->dst_orig);
- local_bh_enable();
-
if (IS_ERR(xdst)) {
err = PTR_ERR(xdst);
+ if (err == -EREMOTE) {
+ xfrm_pols_put(pols, num_pols);
+ return NULL;
+ }
+
if (err != -EAGAIN)
goto error;
goto make_dummy_bundle;
@@ -2123,14 +2032,19 @@ static struct dst_entry *make_blackhole(struct net *net, u16 family,
return ret;
}
-/* Main function: finds/creates a bundle for given flow.
+/* Finds/creates a bundle for given flow and if_id
*
* At the moment we eat a raw IP route. Mostly to speed up lookups
* on interfaces with disabled IPsec.
+ *
+ * xfrm_lookup uses an if_id of 0 by default, and is provided for
+ * compatibility
*/
-struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
- const struct flowi *fl,
- const struct sock *sk, int flags)
+struct dst_entry *xfrm_lookup_with_ifid(struct net *net,
+ struct dst_entry *dst_orig,
+ const struct flowi *fl,
+ const struct sock *sk,
+ int flags, u32 if_id)
{
struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
struct xfrm_dst *xdst;
@@ -2146,7 +2060,8 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
sk = sk_const_to_full_sk(sk);
if (sk && sk->sk_policy[XFRM_POLICY_OUT]) {
num_pols = 1;
- pols[0] = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl, family);
+ pols[0] = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl, family,
+ if_id);
err = xfrm_expand_policies(fl, family, pols,
&num_pols, &num_xfrms);
if (err < 0)
@@ -2158,15 +2073,16 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
goto no_transform;
}
- local_bh_disable();
xdst = xfrm_resolve_and_create_bundle(
pols, num_pols, fl,
family, dst_orig);
- local_bh_enable();
if (IS_ERR(xdst)) {
xfrm_pols_put(pols, num_pols);
err = PTR_ERR(xdst);
+ if (err == -EREMOTE)
+ goto nopol;
+
goto dropdst;
} else if (xdst == NULL) {
num_xfrms = 0;
@@ -2189,7 +2105,7 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
!net->xfrm.policy_count[XFRM_POLICY_OUT])
goto nopol;
- xdst = xfrm_bundle_lookup(net, fl, family, dir, &xflo);
+ xdst = xfrm_bundle_lookup(net, fl, family, dir, &xflo, if_id);
if (xdst == NULL)
goto nopol;
if (IS_ERR(xdst)) {
@@ -2234,7 +2150,7 @@ no_transform:
}
for (i = 0; i < num_pols; i++)
- pols[i]->curlft.use_time = get_seconds();
+ pols[i]->curlft.use_time = ktime_get_real_seconds();
if (num_xfrms < 0) {
/* Prohibit the flow */
@@ -2270,6 +2186,19 @@ dropdst:
xfrm_pols_put(pols, drop_pols);
return ERR_PTR(err);
}
+EXPORT_SYMBOL(xfrm_lookup_with_ifid);
+
+/* Main function: finds/creates a bundle for given flow.
+ *
+ * At the moment we eat a raw IP route. Mostly to speed up lookups
+ * on interfaces with disabled IPsec.
+ */
+struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
+ const struct flowi *fl, const struct sock *sk,
+ int flags)
+{
+ return xfrm_lookup_with_ifid(net, dst_orig, fl, sk, flags, 0);
+}
EXPORT_SYMBOL(xfrm_lookup);
/* Callers of xfrm_lookup_route() must ensure a call to dst_output().
@@ -2368,6 +2297,7 @@ int __xfrm_decode_session(struct sk_buff *skb, struct flowi *fl,
return -EAFNOSUPPORT;
afinfo->decode_session(skb, fl, reverse);
+
err = security_xfrm_decode_session(skb, &fl->flowi_secid);
rcu_read_unlock();
return err;
@@ -2398,6 +2328,19 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
int reverse;
struct flowi fl;
int xerr_idx = -1;
+ const struct xfrm_if_cb *ifcb;
+ struct xfrm_if *xi;
+ u32 if_id = 0;
+
+ rcu_read_lock();
+ ifcb = xfrm_if_get_cb();
+
+ if (ifcb) {
+ xi = ifcb->decode_session(skb);
+ if (xi)
+ if_id = xi->p.if_id;
+ }
+ rcu_read_unlock();
reverse = dir & ~XFRM_POLICY_MASK;
dir &= XFRM_POLICY_MASK;
@@ -2425,7 +2368,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
pol = NULL;
sk = sk_to_full_sk(sk);
if (sk && sk->sk_policy[dir]) {
- pol = xfrm_sk_policy_lookup(sk, dir, &fl, family);
+ pol = xfrm_sk_policy_lookup(sk, dir, &fl, family, if_id);
if (IS_ERR(pol)) {
XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
return 0;
@@ -2433,7 +2376,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
}
if (!pol)
- pol = xfrm_policy_lookup(net, &fl, family, dir);
+ pol = xfrm_policy_lookup(net, &fl, family, dir, if_id);
if (IS_ERR(pol)) {
XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
@@ -2449,7 +2392,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
return 1;
}
- pol->curlft.use_time = get_seconds();
+ pol->curlft.use_time = ktime_get_real_seconds();
pols[0] = pol;
npols++;
@@ -2457,13 +2400,13 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
if (pols[0]->type != XFRM_POLICY_TYPE_MAIN) {
pols[1] = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN,
&fl, family,
- XFRM_POLICY_IN);
+ XFRM_POLICY_IN, if_id);
if (pols[1]) {
if (IS_ERR(pols[1])) {
XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
return 0;
}
- pols[1]->curlft.use_time = get_seconds();
+ pols[1]->curlft.use_time = ktime_get_real_seconds();
npols++;
}
}
@@ -2548,6 +2491,10 @@ int __xfrm_route_forward(struct sk_buff *skb, unsigned short family)
}
skb_dst_force(skb);
+ if (!skb_dst(skb)) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMFWDHDRERROR);
+ return 0;
+ }
dst = xfrm_lookup(net, skb_dst(skb), &fl, NULL, XFRM_LOOKUP_QUEUE);
if (IS_ERR(dst)) {
@@ -2822,6 +2769,21 @@ void xfrm_policy_unregister_afinfo(const struct xfrm_policy_afinfo *afinfo)
}
EXPORT_SYMBOL(xfrm_policy_unregister_afinfo);
+void xfrm_if_register_cb(const struct xfrm_if_cb *ifcb)
+{
+ spin_lock(&xfrm_if_cb_lock);
+ rcu_assign_pointer(xfrm_if_cb, ifcb);
+ spin_unlock(&xfrm_if_cb_lock);
+}
+EXPORT_SYMBOL(xfrm_if_register_cb);
+
+void xfrm_if_unregister_cb(void)
+{
+ RCU_INIT_POINTER(xfrm_if_cb, NULL);
+ synchronize_rcu();
+}
+EXPORT_SYMBOL(xfrm_if_unregister_cb);
+
#ifdef CONFIG_XFRM_STATISTICS
static int __net_init xfrm_statistics_init(struct net *net)
{
@@ -2989,19 +2951,13 @@ static struct pernet_operations __net_initdata xfrm_net_ops = {
void __init xfrm_init(void)
{
- int i;
-
- xfrm_pcpu_work = kmalloc_array(NR_CPUS, sizeof(*xfrm_pcpu_work),
- GFP_KERNEL);
- BUG_ON(!xfrm_pcpu_work);
-
- for (i = 0; i < NR_CPUS; i++)
- INIT_WORK(&xfrm_pcpu_work[i], xfrm_pcpu_work_fn);
-
register_pernet_subsys(&xfrm_net_ops);
xfrm_dev_init();
seqcount_init(&xfrm_policy_hash_generation);
xfrm_input_init();
+
+ RCU_INIT_POINTER(xfrm_if_cb, NULL);
+ synchronize_rcu();
}
#ifdef CONFIG_AUDITSYSCALL
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 8308281f3253..b669262682c9 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -475,8 +475,8 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me)
{
struct tasklet_hrtimer *thr = container_of(me, struct tasklet_hrtimer, timer);
struct xfrm_state *x = container_of(thr, struct xfrm_state, mtimer);
- unsigned long now = get_seconds();
- long next = LONG_MAX;
+ time64_t now = ktime_get_real_seconds();
+ time64_t next = TIME64_MAX;
int warn = 0;
int err = 0;
@@ -537,7 +537,7 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me)
if (warn)
km_state_expired(x, 0, 0);
resched:
- if (next != LONG_MAX) {
+ if (next != TIME64_MAX) {
tasklet_hrtimer_start(&x->mtimer, ktime_set(next, 0), HRTIMER_MODE_REL);
}
@@ -577,7 +577,7 @@ struct xfrm_state *xfrm_state_alloc(struct net *net)
tasklet_hrtimer_init(&x->mtimer, xfrm_timer_handler,
CLOCK_BOOTTIME, HRTIMER_MODE_ABS);
timer_setup(&x->rtimer, xfrm_replay_timer_handler, 0);
- x->curlft.add_time = get_seconds();
+ x->curlft.add_time = ktime_get_real_seconds();
x->lft.soft_byte_limit = XFRM_INF;
x->lft.soft_packet_limit = XFRM_INF;
x->lft.hard_byte_limit = XFRM_INF;
@@ -735,10 +735,9 @@ restart:
}
out:
spin_unlock_bh(&net->xfrm.xfrm_state_lock);
- if (cnt) {
+ if (cnt)
err = 0;
- xfrm_policy_cache_flush();
- }
+
return err;
}
EXPORT_SYMBOL(xfrm_state_flush);
@@ -931,7 +930,7 @@ struct xfrm_state *
xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr,
const struct flowi *fl, struct xfrm_tmpl *tmpl,
struct xfrm_policy *pol, int *err,
- unsigned short family)
+ unsigned short family, u32 if_id)
{
static xfrm_address_t saddr_wildcard = { };
struct net *net = xp_net(pol);
@@ -955,6 +954,7 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr,
if (x->props.family == encap_family &&
x->props.reqid == tmpl->reqid &&
(mark & x->mark.m) == x->mark.v &&
+ x->if_id == if_id &&
!(x->props.flags & XFRM_STATE_WILDRECV) &&
xfrm_state_addr_check(x, daddr, saddr, encap_family) &&
tmpl->mode == x->props.mode &&
@@ -971,6 +971,7 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr,
if (x->props.family == encap_family &&
x->props.reqid == tmpl->reqid &&
(mark & x->mark.m) == x->mark.v &&
+ x->if_id == if_id &&
!(x->props.flags & XFRM_STATE_WILDRECV) &&
xfrm_addr_equal(&x->id.daddr, daddr, encap_family) &&
tmpl->mode == x->props.mode &&
@@ -1010,6 +1011,7 @@ found:
* to current session. */
xfrm_init_tempstate(x, fl, tmpl, daddr, saddr, family);
memcpy(&x->mark, &pol->mark, sizeof(x->mark));
+ x->if_id = if_id;
error = security_xfrm_state_alloc_acquire(x, pol->security, fl->flowi_secid);
if (error) {
@@ -1067,7 +1069,7 @@ out:
}
struct xfrm_state *
-xfrm_stateonly_find(struct net *net, u32 mark,
+xfrm_stateonly_find(struct net *net, u32 mark, u32 if_id,
xfrm_address_t *daddr, xfrm_address_t *saddr,
unsigned short family, u8 mode, u8 proto, u32 reqid)
{
@@ -1080,6 +1082,7 @@ xfrm_stateonly_find(struct net *net, u32 mark,
if (x->props.family == family &&
x->props.reqid == reqid &&
(mark & x->mark.m) == x->mark.v &&
+ x->if_id == if_id &&
!(x->props.flags & XFRM_STATE_WILDRECV) &&
xfrm_state_addr_check(x, daddr, saddr, family) &&
mode == x->props.mode &&
@@ -1160,11 +1163,13 @@ static void __xfrm_state_bump_genids(struct xfrm_state *xnew)
struct xfrm_state *x;
unsigned int h;
u32 mark = xnew->mark.v & xnew->mark.m;
+ u32 if_id = xnew->if_id;
h = xfrm_dst_hash(net, &xnew->id.daddr, &xnew->props.saddr, reqid, family);
hlist_for_each_entry(x, net->xfrm.state_bydst+h, bydst) {
if (x->props.family == family &&
x->props.reqid == reqid &&
+ x->if_id == if_id &&
(mark & x->mark.m) == x->mark.v &&
xfrm_addr_equal(&x->id.daddr, &xnew->id.daddr, family) &&
xfrm_addr_equal(&x->props.saddr, &xnew->props.saddr, family))
@@ -1187,7 +1192,7 @@ EXPORT_SYMBOL(xfrm_state_insert);
static struct xfrm_state *__find_acq_core(struct net *net,
const struct xfrm_mark *m,
unsigned short family, u8 mode,
- u32 reqid, u8 proto,
+ u32 reqid, u32 if_id, u8 proto,
const xfrm_address_t *daddr,
const xfrm_address_t *saddr,
int create)
@@ -1242,6 +1247,7 @@ static struct xfrm_state *__find_acq_core(struct net *net,
x->props.family = family;
x->props.mode = mode;
x->props.reqid = reqid;
+ x->if_id = if_id;
x->mark.v = m->v;
x->mark.m = m->m;
x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires;
@@ -1296,7 +1302,7 @@ int xfrm_state_add(struct xfrm_state *x)
if (use_spi && !x1)
x1 = __find_acq_core(net, &x->mark, family, x->props.mode,
- x->props.reqid, x->id.proto,
+ x->props.reqid, x->if_id, x->id.proto,
&x->id.daddr, &x->props.saddr, 0);
__xfrm_state_bump_genids(x);
@@ -1395,6 +1401,7 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig,
x->props.flags = orig->props.flags;
x->props.extra_flags = orig->props.extra_flags;
+ x->if_id = orig->if_id;
x->tfcpad = orig->tfcpad;
x->replay_maxdiff = orig->replay_maxdiff;
x->replay_maxage = orig->replay_maxage;
@@ -1554,6 +1561,19 @@ out:
if (x1->curlft.use_time)
xfrm_state_check_expire(x1);
+ if (x->props.smark.m || x->props.smark.v || x->if_id) {
+ spin_lock_bh(&net->xfrm.xfrm_state_lock);
+
+ if (x->props.smark.m || x->props.smark.v)
+ x1->props.smark = x->props.smark;
+
+ if (x->if_id)
+ x1->if_id = x->if_id;
+
+ __xfrm_state_bump_genids(x1);
+ spin_unlock_bh(&net->xfrm.xfrm_state_lock);
+ }
+
err = 0;
x->km.state = XFRM_STATE_DEAD;
__xfrm_state_put(x);
@@ -1571,7 +1591,7 @@ EXPORT_SYMBOL(xfrm_state_update);
int xfrm_state_check_expire(struct xfrm_state *x)
{
if (!x->curlft.use_time)
- x->curlft.use_time = get_seconds();
+ x->curlft.use_time = ktime_get_real_seconds();
if (x->curlft.bytes >= x->lft.hard_byte_limit ||
x->curlft.packets >= x->lft.hard_packet_limit) {
@@ -1619,13 +1639,13 @@ EXPORT_SYMBOL(xfrm_state_lookup_byaddr);
struct xfrm_state *
xfrm_find_acq(struct net *net, const struct xfrm_mark *mark, u8 mode, u32 reqid,
- u8 proto, const xfrm_address_t *daddr,
+ u32 if_id, u8 proto, const xfrm_address_t *daddr,
const xfrm_address_t *saddr, int create, unsigned short family)
{
struct xfrm_state *x;
spin_lock_bh(&net->xfrm.xfrm_state_lock);
- x = __find_acq_core(net, mark, family, mode, reqid, proto, daddr, saddr, create);
+ x = __find_acq_core(net, mark, family, mode, reqid, if_id, proto, daddr, saddr, create);
spin_unlock_bh(&net->xfrm.xfrm_state_lock);
return x;
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 33878e6e0d0a..ca7a207b81a9 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -151,10 +151,16 @@ static int verify_newsa_info(struct xfrm_usersa_info *p,
err = -EINVAL;
switch (p->family) {
case AF_INET:
+ if (p->sel.prefixlen_d > 32 || p->sel.prefixlen_s > 32)
+ goto out;
+
break;
case AF_INET6:
#if IS_ENABLED(CONFIG_IPV6)
+ if (p->sel.prefixlen_d > 128 || p->sel.prefixlen_s > 128)
+ goto out;
+
break;
#else
err = -EAFNOSUPPORT;
@@ -527,6 +533,19 @@ static void xfrm_update_ae_params(struct xfrm_state *x, struct nlattr **attrs,
x->replay_maxdiff = nla_get_u32(rt);
}
+static void xfrm_smark_init(struct nlattr **attrs, struct xfrm_mark *m)
+{
+ if (attrs[XFRMA_SET_MARK]) {
+ m->v = nla_get_u32(attrs[XFRMA_SET_MARK]);
+ if (attrs[XFRMA_SET_MARK_MASK])
+ m->m = nla_get_u32(attrs[XFRMA_SET_MARK_MASK]);
+ else
+ m->m = 0xffffffff;
+ } else {
+ m->v = m->m = 0;
+ }
+}
+
static struct xfrm_state *xfrm_state_construct(struct net *net,
struct xfrm_usersa_info *p,
struct nlattr **attrs,
@@ -579,8 +598,10 @@ static struct xfrm_state *xfrm_state_construct(struct net *net,
xfrm_mark_get(attrs, &x->mark);
- if (attrs[XFRMA_OUTPUT_MARK])
- x->props.output_mark = nla_get_u32(attrs[XFRMA_OUTPUT_MARK]);
+ xfrm_smark_init(attrs, &x->props.smark);
+
+ if (attrs[XFRMA_IF_ID])
+ x->if_id = nla_get_u32(attrs[XFRMA_IF_ID]);
err = __xfrm_init_state(x, false, attrs[XFRMA_OFFLOAD_DEV]);
if (err)
@@ -824,6 +845,18 @@ static int copy_to_user_auth(struct xfrm_algo_auth *auth, struct sk_buff *skb)
return 0;
}
+static int xfrm_smark_put(struct sk_buff *skb, struct xfrm_mark *m)
+{
+ int ret = 0;
+
+ if (m->v | m->m) {
+ ret = nla_put_u32(skb, XFRMA_SET_MARK, m->v);
+ if (!ret)
+ ret = nla_put_u32(skb, XFRMA_SET_MARK_MASK, m->m);
+ }
+ return ret;
+}
+
/* Don't change this without updating xfrm_sa_len! */
static int copy_to_user_state_extra(struct xfrm_state *x,
struct xfrm_usersa_info *p,
@@ -887,6 +920,11 @@ static int copy_to_user_state_extra(struct xfrm_state *x,
ret = xfrm_mark_put(skb, &x->mark);
if (ret)
goto out;
+
+ ret = xfrm_smark_put(skb, &x->props.smark);
+ if (ret)
+ goto out;
+
if (x->replay_esn)
ret = nla_put(skb, XFRMA_REPLAY_ESN_VAL,
xfrm_replay_state_esn_len(x->replay_esn),
@@ -900,8 +938,8 @@ static int copy_to_user_state_extra(struct xfrm_state *x,
ret = copy_user_offload(&x->xso, skb);
if (ret)
goto out;
- if (x->props.output_mark) {
- ret = nla_put_u32(skb, XFRMA_OUTPUT_MARK, x->props.output_mark);
+ if (x->if_id) {
+ ret = nla_put_u32(skb, XFRMA_IF_ID, x->if_id);
if (ret)
goto out;
}
@@ -969,7 +1007,7 @@ static int xfrm_dump_sa(struct sk_buff *skb, struct netlink_callback *cb)
int err;
err = nlmsg_parse(cb->nlh, 0, attrs, XFRMA_MAX, xfrma_policy,
- NULL);
+ cb->extack);
if (err < 0)
return err;
@@ -1255,6 +1293,7 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh,
int err;
u32 mark;
struct xfrm_mark m;
+ u32 if_id = 0;
p = nlmsg_data(nlh);
err = verify_spi_info(p->info.id.proto, p->min, p->max);
@@ -1267,6 +1306,10 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh,
x = NULL;
mark = xfrm_mark_get(attrs, &m);
+
+ if (attrs[XFRMA_IF_ID])
+ if_id = nla_get_u32(attrs[XFRMA_IF_ID]);
+
if (p->info.seq) {
x = xfrm_find_acq_byseq(net, mark, p->info.seq);
if (x && !xfrm_addr_equal(&x->id.daddr, daddr, family)) {
@@ -1277,7 +1320,7 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh,
if (!x)
x = xfrm_find_acq(net, &m, p->info.mode, p->info.reqid,
- p->info.id.proto, daddr,
+ if_id, p->info.id.proto, daddr,
&p->info.saddr, 1,
family);
err = -ENOENT;
@@ -1359,10 +1402,16 @@ static int verify_newpolicy_info(struct xfrm_userpolicy_info *p)
switch (p->sel.family) {
case AF_INET:
+ if (p->sel.prefixlen_d > 32 || p->sel.prefixlen_s > 32)
+ return -EINVAL;
+
break;
case AF_INET6:
#if IS_ENABLED(CONFIG_IPV6)
+ if (p->sel.prefixlen_d > 128 || p->sel.prefixlen_s > 128)
+ return -EINVAL;
+
break;
#else
return -EAFNOSUPPORT;
@@ -1443,6 +1492,9 @@ static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family)
(ut[i].family != prev_family))
return -EINVAL;
+ if (ut[i].mode >= XFRM_MODE_MAX)
+ return -EINVAL;
+
prev_family = ut[i].family;
switch (ut[i].family) {
@@ -1565,6 +1617,9 @@ static struct xfrm_policy *xfrm_policy_construct(struct net *net, struct xfrm_us
xfrm_mark_get(attrs, &xp->mark);
+ if (attrs[XFRMA_IF_ID])
+ xp->if_id = nla_get_u32(attrs[XFRMA_IF_ID]);
+
return xp;
error:
*errp = err;
@@ -1712,6 +1767,8 @@ static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr
err = copy_to_user_policy_type(xp->type, skb);
if (!err)
err = xfrm_mark_put(skb, &xp->mark);
+ if (!err)
+ err = xfrm_if_id_put(skb, xp->if_id);
if (err) {
nlmsg_cancel(skb, nlh);
return err;
@@ -1793,6 +1850,7 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh,
int delete;
struct xfrm_mark m;
u32 mark = xfrm_mark_get(attrs, &m);
+ u32 if_id = 0;
p = nlmsg_data(nlh);
delete = nlh->nlmsg_type == XFRM_MSG_DELPOLICY;
@@ -1805,8 +1863,11 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh,
if (err)
return err;
+ if (attrs[XFRMA_IF_ID])
+ if_id = nla_get_u32(attrs[XFRMA_IF_ID]);
+
if (p->index)
- xp = xfrm_policy_byid(net, mark, type, p->dir, p->index, delete, &err);
+ xp = xfrm_policy_byid(net, mark, if_id, type, p->dir, p->index, delete, &err);
else {
struct nlattr *rt = attrs[XFRMA_SEC_CTX];
struct xfrm_sec_ctx *ctx;
@@ -1823,7 +1884,7 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh,
if (err)
return err;
}
- xp = xfrm_policy_bysel_ctx(net, mark, type, p->dir, &p->sel,
+ xp = xfrm_policy_bysel_ctx(net, mark, if_id, type, p->dir, &p->sel,
ctx, delete, &err);
security_xfrm_policy_free(ctx);
}
@@ -1946,6 +2007,10 @@ static int build_aevent(struct sk_buff *skb, struct xfrm_state *x, const struct
if (err)
goto out_cancel;
+ err = xfrm_if_id_put(skb, x->if_id);
+ if (err)
+ goto out_cancel;
+
nlmsg_end(skb, nlh);
return 0;
@@ -2088,6 +2153,7 @@ static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh,
int err = -ENOENT;
struct xfrm_mark m;
u32 mark = xfrm_mark_get(attrs, &m);
+ u32 if_id = 0;
err = copy_from_user_policy_type(&type, attrs);
if (err)
@@ -2097,8 +2163,11 @@ static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh,
if (err)
return err;
+ if (attrs[XFRMA_IF_ID])
+ if_id = nla_get_u32(attrs[XFRMA_IF_ID]);
+
if (p->index)
- xp = xfrm_policy_byid(net, mark, type, p->dir, p->index, 0, &err);
+ xp = xfrm_policy_byid(net, mark, if_id, type, p->dir, p->index, 0, &err);
else {
struct nlattr *rt = attrs[XFRMA_SEC_CTX];
struct xfrm_sec_ctx *ctx;
@@ -2115,7 +2184,7 @@ static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh,
if (err)
return err;
}
- xp = xfrm_policy_bysel_ctx(net, mark, type, p->dir,
+ xp = xfrm_policy_bysel_ctx(net, mark, if_id, type, p->dir,
&p->sel, ctx, 0, &err);
security_xfrm_policy_free(ctx);
}
@@ -2497,7 +2566,9 @@ static const struct nla_policy xfrma_policy[XFRMA_MAX+1] = {
[XFRMA_PROTO] = { .type = NLA_U8 },
[XFRMA_ADDRESS_FILTER] = { .len = sizeof(struct xfrm_address_filter) },
[XFRMA_OFFLOAD_DEV] = { .len = sizeof(struct xfrm_user_offload) },
- [XFRMA_OUTPUT_MARK] = { .type = NLA_U32 },
+ [XFRMA_SET_MARK] = { .type = NLA_U32 },
+ [XFRMA_SET_MARK_MASK] = { .type = NLA_U32 },
+ [XFRMA_IF_ID] = { .type = NLA_U32 },
};
static const struct nla_policy xfrma_spd_policy[XFRMA_SPD_MAX+1] = {
@@ -2629,6 +2700,10 @@ static int build_expire(struct sk_buff *skb, struct xfrm_state *x, const struct
if (err)
return err;
+ err = xfrm_if_id_put(skb, x->if_id);
+ if (err)
+ return err;
+
nlmsg_end(skb, nlh);
return 0;
}
@@ -2723,8 +2798,12 @@ static inline unsigned int xfrm_sa_len(struct xfrm_state *x)
l += nla_total_size(sizeof(x->props.extra_flags));
if (x->xso.dev)
l += nla_total_size(sizeof(x->xso));
- if (x->props.output_mark)
- l += nla_total_size(sizeof(x->props.output_mark));
+ if (x->props.smark.v | x->props.smark.m) {
+ l += nla_total_size(sizeof(x->props.smark.v));
+ l += nla_total_size(sizeof(x->props.smark.m));
+ }
+ if (x->if_id)
+ l += nla_total_size(sizeof(x->if_id));
/* Must count x->lastused as it may become non-zero behind our back. */
l += nla_total_size_64bit(sizeof(u64));
@@ -2854,6 +2933,8 @@ static int build_acquire(struct sk_buff *skb, struct xfrm_state *x,
err = copy_to_user_policy_type(xp->type, skb);
if (!err)
err = xfrm_mark_put(skb, &xp->mark);
+ if (!err)
+ err = xfrm_if_id_put(skb, xp->if_id);
if (err) {
nlmsg_cancel(skb, nlh);
return err;
@@ -2970,6 +3051,8 @@ static int build_polexpire(struct sk_buff *skb, struct xfrm_policy *xp,
err = copy_to_user_policy_type(xp->type, skb);
if (!err)
err = xfrm_mark_put(skb, &xp->mark);
+ if (!err)
+ err = xfrm_if_id_put(skb, xp->if_id);
if (err) {
nlmsg_cancel(skb, nlh);
return err;
@@ -3051,6 +3134,8 @@ static int xfrm_notify_policy(struct xfrm_policy *xp, int dir, const struct km_e
err = copy_to_user_policy_type(xp->type, skb);
if (!err)
err = xfrm_mark_put(skb, &xp->mark);
+ if (!err)
+ err = xfrm_if_id_put(skb, xp->if_id);
if (err)
goto out_free_skb;
@@ -3284,4 +3369,3 @@ module_init(xfrm_user_init);
module_exit(xfrm_user_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_XFRM);
-
OpenPOWER on IntegriCloud