summaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/802/mrp.c6
-rw-r--r--net/8021q/vlan.c1
-rw-r--r--net/8021q/vlan.h1
-rw-r--r--net/8021q/vlan_dev.c40
-rw-r--r--net/8021q/vlan_netlink.c19
-rw-r--r--net/9p/client.c1
-rw-r--r--net/9p/trans_rdma.c6
-rw-r--r--net/Kconfig43
-rw-r--r--net/Makefile3
-rw-r--r--net/appletalk/ddp.c9
-rw-r--r--net/atm/atm_sysfs.c22
-rw-r--r--net/atm/clip.c6
-rw-r--r--net/atm/common.c2
-rw-r--r--net/atm/lec.c78
-rw-r--r--net/atm/mpoa_caches.c6
-rw-r--r--net/atm/mpoa_proc.c17
-rw-r--r--net/atm/pppoatm.c4
-rw-r--r--net/atm/proc.c11
-rw-r--r--net/atm/signaling.c2
-rw-r--r--net/atm/svc.c2
-rw-r--r--net/ax25/af_ax25.c6
-rw-r--r--net/ax25/ax25_in.c2
-rw-r--r--net/batman-adv/Kconfig13
-rw-r--r--net/batman-adv/Makefile2
-rw-r--r--net/batman-adv/bat_algo.c2
-rw-r--r--net/batman-adv/bat_algo.h2
-rw-r--r--net/batman-adv/bat_iv_ogm.c83
-rw-r--r--net/batman-adv/bat_iv_ogm.h2
-rw-r--r--net/batman-adv/bat_v.c8
-rw-r--r--net/batman-adv/bat_v.h2
-rw-r--r--net/batman-adv/bat_v_elp.c15
-rw-r--r--net/batman-adv/bat_v_elp.h2
-rw-r--r--net/batman-adv/bat_v_ogm.c236
-rw-r--r--net/batman-adv/bat_v_ogm.h5
-rw-r--r--net/batman-adv/bitarray.c2
-rw-r--r--net/batman-adv/bitarray.h2
-rw-r--r--net/batman-adv/bridge_loop_avoidance.c4
-rw-r--r--net/batman-adv/bridge_loop_avoidance.h2
-rw-r--r--net/batman-adv/debugfs.c2
-rw-r--r--net/batman-adv/debugfs.h2
-rw-r--r--net/batman-adv/distributed-arp-table.c14
-rw-r--r--net/batman-adv/distributed-arp-table.h2
-rw-r--r--net/batman-adv/fragmentation.c2
-rw-r--r--net/batman-adv/fragmentation.h2
-rw-r--r--net/batman-adv/gateway_client.c2
-rw-r--r--net/batman-adv/gateway_client.h2
-rw-r--r--net/batman-adv/gateway_common.c2
-rw-r--r--net/batman-adv/gateway_common.h2
-rw-r--r--net/batman-adv/hard-interface.c4
-rw-r--r--net/batman-adv/hard-interface.h2
-rw-r--r--net/batman-adv/hash.c2
-rw-r--r--net/batman-adv/hash.h2
-rw-r--r--net/batman-adv/icmp_socket.c2
-rw-r--r--net/batman-adv/icmp_socket.h2
-rw-r--r--net/batman-adv/log.c2
-rw-r--r--net/batman-adv/log.h12
-rw-r--r--net/batman-adv/main.c4
-rw-r--r--net/batman-adv/main.h4
-rw-r--r--net/batman-adv/multicast.c4
-rw-r--r--net/batman-adv/multicast.h2
-rw-r--r--net/batman-adv/netlink.c4
-rw-r--r--net/batman-adv/netlink.h2
-rw-r--r--net/batman-adv/network-coding.c2
-rw-r--r--net/batman-adv/network-coding.h2
-rw-r--r--net/batman-adv/originator.c2
-rw-r--r--net/batman-adv/originator.h2
-rw-r--r--net/batman-adv/routing.c2
-rw-r--r--net/batman-adv/routing.h2
-rw-r--r--net/batman-adv/send.c2
-rw-r--r--net/batman-adv/send.h2
-rw-r--r--net/batman-adv/soft-interface.c49
-rw-r--r--net/batman-adv/soft-interface.h2
-rw-r--r--net/batman-adv/sysfs.c4
-rw-r--r--net/batman-adv/sysfs.h2
-rw-r--r--net/batman-adv/tp_meter.c2
-rw-r--r--net/batman-adv/tp_meter.h2
-rw-r--r--net/batman-adv/trace.c2
-rw-r--r--net/batman-adv/trace.h2
-rw-r--r--net/batman-adv/translation-table.c2
-rw-r--r--net/batman-adv/translation-table.h2
-rw-r--r--net/batman-adv/tvlv.c2
-rw-r--r--net/batman-adv/tvlv.h2
-rw-r--r--net/batman-adv/types.h22
-rw-r--r--net/bluetooth/6lowpan.c10
-rw-r--r--net/bluetooth/Kconfig5
-rw-r--r--net/bluetooth/af_bluetooth.c8
-rw-r--r--net/bluetooth/bnep/netdev.c2
-rw-r--r--net/bluetooth/hci_conn.c12
-rw-r--r--net/bluetooth/hci_core.c139
-rw-r--r--net/bluetooth/hci_debugfs.c78
-rw-r--r--net/bluetooth/hci_event.c46
-rw-r--r--net/bluetooth/hci_request.c48
-rw-r--r--net/bluetooth/hci_sock.c42
-rw-r--r--net/bluetooth/hidp/core.c4
-rw-r--r--net/bluetooth/l2cap_core.c68
-rw-r--r--net/bluetooth/lib.c16
-rw-r--r--net/bluetooth/mgmt.c96
-rw-r--r--net/bluetooth/rfcomm/sock.c14
-rw-r--r--net/bluetooth/smp.c220
-rw-r--r--net/bpf/test_run.c137
-rw-r--r--net/bpfilter/Makefile2
-rw-r--r--net/bridge/Makefile2
-rw-r--r--net/bridge/br.c2
-rw-r--r--net/bridge/br_device.c53
-rw-r--r--net/bridge/br_fdb.c157
-rw-r--r--net/bridge/br_forward.c2
-rw-r--r--net/bridge/br_input.c14
-rw-r--r--net/bridge/br_mdb.c179
-rw-r--r--net/bridge/br_multicast.c32
-rw-r--r--net/bridge/br_netfilter_hooks.c7
-rw-r--r--net/bridge/br_netlink.c74
-rw-r--r--net/bridge/br_nf_core.c3
-rw-r--r--net/bridge/br_private.h178
-rw-r--r--net/bridge/br_stp.c15
-rw-r--r--net/bridge/br_stp_bpdu.c4
-rw-r--r--net/bridge/br_switchdev.c12
-rw-r--r--net/bridge/br_vlan.c557
-rw-r--r--net/bridge/br_vlan_options.c160
-rw-r--r--net/bridge/netfilter/ebt_802_3.c8
-rw-r--r--net/bridge/netfilter/ebt_dnat.c19
-rw-r--r--net/bridge/netfilter/ebtables.c41
-rw-r--r--net/bridge/netfilter/nf_conntrack_bridge.c20
-rw-r--r--net/bridge/netfilter/nft_meta_bridge.c2
-rw-r--r--net/caif/Kconfig10
-rw-r--r--net/caif/caif_socket.c2
-rw-r--r--net/caif/caif_usb.c2
-rw-r--r--net/can/Kconfig13
-rw-r--r--net/can/Makefile2
-rw-r--r--net/can/af_can.c382
-rw-r--r--net/can/af_can.h25
-rw-r--r--net/can/bcm.c170
-rw-r--r--net/can/gw.c511
-rw-r--r--net/can/j1939/Kconfig15
-rw-r--r--net/can/j1939/Makefile10
-rw-r--r--net/can/j1939/address-claim.c230
-rw-r--r--net/can/j1939/bus.c333
-rw-r--r--net/can/j1939/j1939-priv.h338
-rw-r--r--net/can/j1939/main.c412
-rw-r--r--net/can/j1939/socket.c1223
-rw-r--r--net/can/j1939/transport.c2063
-rw-r--r--net/can/proc.c164
-rw-r--r--net/can/raw.c44
-rw-r--r--net/ceph/Makefile2
-rw-r--r--net/ceph/ceph_common.c453
-rw-r--r--net/ceph/ceph_fs.c104
-rw-r--r--net/ceph/crypto.c6
-rw-r--r--net/ceph/messenger.c8
-rw-r--r--net/ceph/mon_client.c10
-rw-r--r--net/ceph/osd_client.c92
-rw-r--r--net/ceph/osdmap.c69
-rw-r--r--net/compat.c2
-rw-r--r--net/core/Makefile2
-rw-r--r--net/core/bpf_sk_storage.c109
-rw-r--r--net/core/datagram.c39
-rw-r--r--net/core/dev.c1234
-rw-r--r--net/core/dev_addr_lists.c12
-rw-r--r--net/core/dev_ioctl.c1
-rw-r--r--net/core/devlink.c1735
-rw-r--r--net/core/drop_monitor.c1343
-rw-r--r--net/core/dst.c4
-rw-r--r--net/core/fib_notifier.c116
-rw-r--r--net/core/fib_rules.c23
-rw-r--r--net/core/filter.c525
-rw-r--r--net/core/flow_dissector.c277
-rw-r--r--net/core/flow_offload.c240
-rw-r--r--net/core/gen_estimator.c4
-rw-r--r--net/core/gen_stats.c12
-rw-r--r--net/core/lwt_bpf.c11
-rw-r--r--net/core/neighbour.c14
-rw-r--r--net/core/net-procfs.c4
-rw-r--r--net/core/net-sysfs.c39
-rw-r--r--net/core/net_namespace.c120
-rw-r--r--net/core/netpoll.c6
-rw-r--r--net/core/netprio_cgroup.c8
-rw-r--r--net/core/page_pool.c281
-rw-r--r--net/core/pktgen.c53
-rw-r--r--net/core/request_sock.c2
-rw-r--r--net/core/rtnetlink.c282
-rw-r--r--net/core/scm.c6
-rw-r--r--net/core/skbuff.c275
-rw-r--r--net/core/skmsg.c38
-rw-r--r--net/core/sock.c113
-rw-r--r--net/core/sock_map.c40
-rw-r--r--net/core/sock_reuseport.c20
-rw-r--r--net/core/stream.c16
-rw-r--r--net/core/sysctl_net_core.c10
-rw-r--r--net/core/timestamping.c20
-rw-r--r--net/core/tso.c8
-rw-r--r--net/core/utils.c20
-rw-r--r--net/core/xdp.c128
-rw-r--r--net/dccp/ipv4.c6
-rw-r--r--net/dccp/ipv6.c11
-rw-r--r--net/dccp/proto.c4
-rw-r--r--net/decnet/af_decnet.c6
-rw-r--r--net/decnet/dn_nsp_in.c2
-rw-r--r--net/decnet/dn_route.c6
-rw-r--r--net/dsa/Kconfig27
-rw-r--r--net/dsa/Makefile4
-rw-r--r--net/dsa/dsa.c93
-rw-r--r--net/dsa/dsa2.c547
-rw-r--r--net/dsa/dsa_priv.h42
-rw-r--r--net/dsa/master.c101
-rw-r--r--net/dsa/port.c84
-rw-r--r--net/dsa/slave.c180
-rw-r--r--net/dsa/switch.c139
-rw-r--r--net/dsa/tag_8021q.c118
-rw-r--r--net/dsa/tag_ar9331.c96
-rw-r--r--net/dsa/tag_gswip.c2
-rw-r--r--net/dsa/tag_ksz.c60
-rw-r--r--net/dsa/tag_ocelot.c241
-rw-r--r--net/dsa/tag_qca.c5
-rw-r--r--net/dsa/tag_sja1105.c33
-rw-r--r--net/ethernet/eth.c23
-rw-r--r--net/ethtool/Makefile8
-rw-r--r--net/ethtool/bitset.c735
-rw-r--r--net/ethtool/bitset.h28
-rw-r--r--net/ethtool/common.c259
-rw-r--r--net/ethtool/common.h31
-rw-r--r--net/ethtool/debug.c134
-rw-r--r--net/ethtool/ioctl.c (renamed from net/core/ethtool.c)179
-rw-r--r--net/ethtool/linkinfo.c167
-rw-r--r--net/ethtool/linkmodes.c375
-rw-r--r--net/ethtool/linkstate.c74
-rw-r--r--net/ethtool/netlink.c729
-rw-r--r--net/ethtool/netlink.h345
-rw-r--r--net/ethtool/strset.c437
-rw-r--r--net/ethtool/wol.c177
-rw-r--r--net/hsr/hsr_debugfs.c52
-rw-r--r--net/hsr/hsr_device.c37
-rw-r--r--net/hsr/hsr_framereg.c74
-rw-r--r--net/hsr/hsr_framereg.h6
-rw-r--r--net/hsr/hsr_main.c7
-rw-r--r--net/hsr/hsr_main.h22
-rw-r--r--net/hsr/hsr_netlink.c1
-rw-r--r--net/hsr/hsr_slave.c2
-rw-r--r--net/ieee802154/6lowpan/core.c8
-rw-r--r--net/ieee802154/core.c7
-rw-r--r--net/ieee802154/nl802154.c39
-rw-r--r--net/ieee802154/socket.c5
-rw-r--r--net/ife/Kconfig2
-rw-r--r--net/ipv4/Kconfig233
-rw-r--r--net/ipv4/Makefile4
-rw-r--r--net/ipv4/af_inet.c9
-rw-r--r--net/ipv4/bpf_tcp_ca.c252
-rw-r--r--net/ipv4/datagram.c4
-rw-r--r--net/ipv4/devinet.c5
-rw-r--r--net/ipv4/esp4.c264
-rw-r--r--net/ipv4/esp4_offload.c2
-rw-r--r--net/ipv4/fib_frontend.c15
-rw-r--r--net/ipv4/fib_lookup.h8
-rw-r--r--net/ipv4/fib_notifier.c13
-rw-r--r--net/ipv4/fib_rules.c5
-rw-r--r--net/ipv4/fib_semantics.c50
-rw-r--r--net/ipv4/fib_trie.c242
-rw-r--r--net/ipv4/fou.c4
-rw-r--r--net/ipv4/gre_demux.c2
-rw-r--r--net/ipv4/gre_offload.c2
-rw-r--r--net/ipv4/icmp.c57
-rw-r--r--net/ipv4/igmp.c6
-rw-r--r--net/ipv4/inet_connection_sock.c34
-rw-r--r--net/ipv4/inet_diag.c20
-rw-r--r--net/ipv4/inet_hashtables.c18
-rw-r--r--net/ipv4/inetpeer.c12
-rw-r--r--net/ipv4/ip_forward.c2
-rw-r--r--net/ipv4/ip_gre.c15
-rw-r--r--net/ipv4/ip_input.c40
-rw-r--r--net/ipv4/ip_output.c45
-rw-r--r--net/ipv4/ip_tunnel.c6
-rw-r--r--net/ipv4/ip_tunnel_core.c440
-rw-r--r--net/ipv4/ip_vti.c19
-rw-r--r--net/ipv4/ipconfig.c23
-rw-r--r--net/ipv4/ipmr.c24
-rw-r--r--net/ipv4/ipmr_base.c30
-rw-r--r--net/ipv4/netfilter/Kconfig8
-rw-r--r--net/ipv4/netfilter/Makefile2
-rw-r--r--net/ipv4/netfilter/arp_tables.c46
-rw-r--r--net/ipv4/netfilter/ipt_CLUSTERIP.c16
-rw-r--r--net/ipv4/netfilter/ipt_SYNPROXY.c4
-rw-r--r--net/ipv4/netfilter/nf_dup_ipv4.c2
-rw-r--r--net/ipv4/netfilter/nf_flow_table_ipv4.c2
-rw-r--r--net/ipv4/netfilter/nf_socket_ipv4.c10
-rw-r--r--net/ipv4/nexthop.c7
-rw-r--r--net/ipv4/ping.c2
-rw-r--r--net/ipv4/proc.c2
-rw-r--r--net/ipv4/raw.c6
-rw-r--r--net/ipv4/route.c181
-rw-r--r--net/ipv4/syncookies.c8
-rw-r--r--net/ipv4/sysctl_net_ipv4.c24
-rw-r--r--net/ipv4/tcp.c229
-rw-r--r--net/ipv4/tcp_bbr.c20
-rw-r--r--net/ipv4/tcp_bpf.c19
-rw-r--r--net/ipv4/tcp_cong.c22
-rw-r--r--net/ipv4/tcp_cubic.c83
-rw-r--r--net/ipv4/tcp_diag.c61
-rw-r--r--net/ipv4/tcp_fastopen.c7
-rw-r--r--net/ipv4/tcp_input.c185
-rw-r--r--net/ipv4/tcp_ipv4.c208
-rw-r--r--net/ipv4/tcp_metrics.c13
-rw-r--r--net/ipv4/tcp_minisocks.c22
-rw-r--r--net/ipv4/tcp_output.c143
-rw-r--r--net/ipv4/tcp_timer.c43
-rw-r--r--net/ipv4/tcp_ulp.c9
-rw-r--r--net/ipv4/udp.c91
-rw-r--r--net/ipv4/udp_offload.c106
-rw-r--r--net/ipv4/xfrm4_output.c2
-rw-r--r--net/ipv4/xfrm4_policy.c6
-rw-r--r--net/ipv4/xfrm4_protocol.c9
-rw-r--r--net/ipv6/addrconf.c54
-rw-r--r--net/ipv6/addrconf_core.c12
-rw-r--r--net/ipv6/af_inet6.c6
-rw-r--r--net/ipv6/datagram.c4
-rw-r--r--net/ipv6/esp6_offload.c2
-rw-r--r--net/ipv6/exthdrs_core.c4
-rw-r--r--net/ipv6/fib6_notifier.c11
-rw-r--r--net/ipv6/fib6_rules.c8
-rw-r--r--net/ipv6/icmp.c22
-rw-r--r--net/ipv6/inet6_connection_sock.c8
-rw-r--r--net/ipv6/inet6_hashtables.c2
-rw-r--r--net/ipv6/ip6_fib.c155
-rw-r--r--net/ipv6/ip6_gre.c16
-rw-r--r--net/ipv6/ip6_icmp.c34
-rw-r--r--net/ipv6/ip6_input.c45
-rw-r--r--net/ipv6/ip6_output.c22
-rw-r--r--net/ipv6/ip6_tunnel.c76
-rw-r--r--net/ipv6/ip6_vti.c15
-rw-r--r--net/ipv6/ip6mr.c17
-rw-r--r--net/ipv6/ipv6_sockglue.c4
-rw-r--r--net/ipv6/mcast.c5
-rw-r--r--net/ipv6/ndisc.c1
-rw-r--r--net/ipv6/netfilter.c7
-rw-r--r--net/ipv6/netfilter/Kconfig38
-rw-r--r--net/ipv6/netfilter/ip6t_SYNPROXY.c4
-rw-r--r--net/ipv6/netfilter/ip6t_ipv6header.c4
-rw-r--r--net/ipv6/netfilter/nf_dup_ipv6.c2
-rw-r--r--net/ipv6/netfilter/nf_flow_table_ipv6.c2
-rw-r--r--net/ipv6/netfilter/nf_log_ipv6.c4
-rw-r--r--net/ipv6/netfilter/nf_socket_ipv6.c1
-rw-r--r--net/ipv6/netfilter/nf_tproxy_ipv6.c2
-rw-r--r--net/ipv6/ping.c2
-rw-r--r--net/ipv6/raw.c8
-rw-r--r--net/ipv6/route.c179
-rw-r--r--net/ipv6/seg6_local.c37
-rw-r--r--net/ipv6/sit.c2
-rw-r--r--net/ipv6/syncookies.c5
-rw-r--r--net/ipv6/tcp_ipv6.c187
-rw-r--r--net/ipv6/udp.c22
-rw-r--r--net/ipv6/udp_offload.c29
-rw-r--r--net/ipv6/xfrm6_output.c2
-rw-r--r--net/ipv6/xfrm6_policy.c5
-rw-r--r--net/iucv/af_iucv.c2
-rw-r--r--net/kcm/kcmsock.c16
-rw-r--r--net/l2tp/l2tp_core.c11
-rw-r--r--net/l2tp/l2tp_eth.c3
-rw-r--r--net/l2tp/l2tp_ip.c2
-rw-r--r--net/l2tp/l2tp_ip6.c4
-rw-r--r--net/llc/af_llc.c38
-rw-r--r--net/llc/llc_c_ac.c8
-rw-r--r--net/llc/llc_conn.c69
-rw-r--r--net/llc/llc_if.c12
-rw-r--r--net/llc/llc_s_ac.c12
-rw-r--r--net/llc/llc_sap.c23
-rw-r--r--net/llc/llc_station.c4
-rw-r--r--net/mac80211/Makefile3
-rw-r--r--net/mac80211/agg-rx.c72
-rw-r--r--net/mac80211/agg-tx.c9
-rw-r--r--net/mac80211/airtime.c597
-rw-r--r--net/mac80211/cfg.c55
-rw-r--r--net/mac80211/debugfs.c91
-rw-r--r--net/mac80211/debugfs_netdev.c11
-rw-r--r--net/mac80211/debugfs_sta.c96
-rw-r--r--net/mac80211/driver-ops.h8
-rw-r--r--net/mac80211/he.c40
-rw-r--r--net/mac80211/ht.c2
-rw-r--r--net/mac80211/ibss.c17
-rw-r--r--net/mac80211/ieee80211_i.h28
-rw-r--r--net/mac80211/iface.c2
-rw-r--r--net/mac80211/key.c64
-rw-r--r--net/mac80211/key.h4
-rw-r--r--net/mac80211/main.c27
-rw-r--r--net/mac80211/mesh.c62
-rw-r--r--net/mac80211/mesh.h4
-rw-r--r--net/mac80211/mesh_hwmp.c3
-rw-r--r--net/mac80211/mesh_plink.c12
-rw-r--r--net/mac80211/mlme.c134
-rw-r--r--net/mac80211/offchannel.c5
-rw-r--r--net/mac80211/rate.h9
-rw-r--r--net/mac80211/rc80211_minstrel.c48
-rw-r--r--net/mac80211/rc80211_minstrel.h58
-rw-r--r--net/mac80211/rc80211_minstrel_debugfs.c8
-rw-r--r--net/mac80211/rc80211_minstrel_ht.c340
-rw-r--r--net/mac80211/rc80211_minstrel_ht.h14
-rw-r--r--net/mac80211/rc80211_minstrel_ht_debugfs.c8
-rw-r--r--net/mac80211/rx.c17
-rw-r--r--net/mac80211/scan.c30
-rw-r--r--net/mac80211/sta_info.c66
-rw-r--r--net/mac80211/sta_info.h13
-rw-r--r--net/mac80211/status.c234
-rw-r--r--net/mac80211/tkip.c18
-rw-r--r--net/mac80211/trace.h35
-rw-r--r--net/mac80211/tx.c164
-rw-r--r--net/mac80211/util.c118
-rw-r--r--net/mac80211/vht.c10
-rw-r--r--net/mac80211/wpa.c6
-rw-r--r--net/mpls/af_mpls.c7
-rw-r--r--net/mpls/mpls_iptunnel.c8
-rw-r--r--net/mptcp/Kconfig28
-rw-r--r--net/mptcp/Makefile4
-rw-r--r--net/mptcp/crypto.c152
-rw-r--r--net/mptcp/ctrl.c130
-rw-r--r--net/mptcp/options.c586
-rw-r--r--net/mptcp/protocol.c1252
-rw-r--r--net/mptcp/protocol.h240
-rw-r--r--net/mptcp/subflow.c865
-rw-r--r--net/mptcp/token.c195
-rw-r--r--net/ncsi/internal.h27
-rw-r--r--net/ncsi/ncsi-cmd.c23
-rw-r--r--net/ncsi/ncsi-manage.c170
-rw-r--r--net/ncsi/ncsi-pkt.h5
-rw-r--r--net/ncsi/ncsi-rsp.c32
-rw-r--r--net/netfilter/Kconfig10
-rw-r--r--net/netfilter/Makefile8
-rw-r--r--net/netfilter/core.c20
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_gen.h6
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_ip.c32
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_ipmac.c24
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_port.c47
-rw-r--r--net/netfilter/ipset/ip_set_core.c305
-rw-r--r--net/netfilter/ipset/ip_set_getport.c28
-rw-r--r--net/netfilter/ipset/ip_set_hash_gen.h8
-rw-r--r--net/netfilter/ipset/ip_set_hash_ip.c10
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipmac.c10
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipmark.c8
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipport.c8
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipportip.c8
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipportnet.c24
-rw-r--r--net/netfilter/ipset/ip_set_hash_mac.c6
-rw-r--r--net/netfilter/ipset/ip_set_hash_net.c25
-rw-r--r--net/netfilter/ipset/ip_set_hash_netiface.c47
-rw-r--r--net/netfilter/ipset/ip_set_hash_netnet.c29
-rw-r--r--net/netfilter/ipset/ip_set_hash_netport.c24
-rw-r--r--net/netfilter/ipset/ip_set_hash_netportnet.c28
-rw-r--r--net/netfilter/ipset/ip_set_list_set.c6
-rw-r--r--net/netfilter/ipvs/Kconfig6
-rw-r--r--net/netfilter/ipvs/ip_vs_app.c12
-rw-r--r--net/netfilter/ipvs/ip_vs_core.c49
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c116
-rw-r--r--net/netfilter/ipvs/ip_vs_lblcr.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_mh.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_ovf.c2
-rw-r--r--net/netfilter/ipvs/ip_vs_pe.c3
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_tcp.c2
-rw-r--r--net/netfilter/ipvs/ip_vs_sched.c3
-rw-r--r--net/netfilter/ipvs/ip_vs_sync.c15
-rw-r--r--net/netfilter/ipvs/ip_vs_xmit.c22
-rw-r--r--net/netfilter/nf_conntrack_core.c20
-rw-r--r--net/netfilter/nf_conntrack_ecache.c24
-rw-r--r--net/netfilter/nf_conntrack_expect.c2
-rw-r--r--net/netfilter/nf_conntrack_extend.c22
-rw-r--r--net/netfilter/nf_conntrack_ftp.c4
-rw-r--r--net/netfilter/nf_conntrack_helper.c5
-rw-r--r--net/netfilter/nf_conntrack_labels.c3
-rw-r--r--net/netfilter/nf_conntrack_netlink.c78
-rw-r--r--net/netfilter/nf_conntrack_proto_dccp.c3
-rw-r--r--net/netfilter/nf_conntrack_proto_icmp.c6
-rw-r--r--net/netfilter/nf_conntrack_proto_icmpv6.c1
-rw-r--r--net/netfilter/nf_conntrack_proto_sctp.c9
-rw-r--r--net/netfilter/nf_conntrack_proto_tcp.c2
-rw-r--r--net/netfilter/nf_conntrack_standalone.c40
-rw-r--r--net/netfilter/nf_conntrack_timeout.c1
-rw-r--r--net/netfilter/nf_dup_netdev.c21
-rw-r--r--net/netfilter/nf_flow_table_core.c200
-rw-r--r--net/netfilter/nf_flow_table_inet.c25
-rw-r--r--net/netfilter/nf_flow_table_ip.c28
-rw-r--r--net/netfilter/nf_flow_table_offload.c905
-rw-r--r--net/netfilter/nf_nat_core.c6
-rw-r--r--net/netfilter/nf_nat_proto.c17
-rw-r--r--net/netfilter/nf_queue.c2
-rw-r--r--net/netfilter/nf_synproxy_core.c8
-rw-r--r--net/netfilter/nf_tables_api.c1194
-rw-r--r--net/netfilter/nf_tables_offload.c480
-rw-r--r--net/netfilter/nf_tables_set_core.c2
-rw-r--r--net/netfilter/nfnetlink.c6
-rw-r--r--net/netfilter/nfnetlink_cthelper.c2
-rw-r--r--net/netfilter/nfnetlink_log.c61
-rw-r--r--net/netfilter/nfnetlink_queue.c12
-rw-r--r--net/netfilter/nft_bitwise.c238
-rw-r--r--net/netfilter/nft_byteorder.c9
-rw-r--r--net/netfilter/nft_chain_filter.c45
-rw-r--r--net/netfilter/nft_cmp.c14
-rw-r--r--net/netfilter/nft_connlimit.c7
-rw-r--r--net/netfilter/nft_ct.c12
-rw-r--r--net/netfilter/nft_dup_netdev.c12
-rw-r--r--net/netfilter/nft_dynset.c8
-rw-r--r--net/netfilter/nft_fib_netdev.c3
-rw-r--r--net/netfilter/nft_flow_offload.c36
-rw-r--r--net/netfilter/nft_fwd_netdev.c12
-rw-r--r--net/netfilter/nft_immediate.c24
-rw-r--r--net/netfilter/nft_lookup.c3
-rw-r--r--net/netfilter/nft_masq.c2
-rw-r--r--net/netfilter/nft_meta.c482
-rw-r--r--net/netfilter/nft_nat.c6
-rw-r--r--net/netfilter/nft_osf.c3
-rw-r--r--net/netfilter/nft_payload.c132
-rw-r--r--net/netfilter/nft_quota.c29
-rw-r--r--net/netfilter/nft_range.c10
-rw-r--r--net/netfilter/nft_redir.c2
-rw-r--r--net/netfilter/nft_set_bitmap.c6
-rw-r--r--net/netfilter/nft_set_hash.c23
-rw-r--r--net/netfilter/nft_set_pipapo.c2102
-rw-r--r--net/netfilter/nft_set_rbtree.c26
-rw-r--r--net/netfilter/nft_socket.c6
-rw-r--r--net/netfilter/nft_synproxy.c147
-rw-r--r--net/netfilter/nft_tproxy.c8
-rw-r--r--net/netfilter/nft_tunnel.c57
-rw-r--r--net/netfilter/x_tables.c4
-rw-r--r--net/netfilter/xt_HMARK.c6
-rw-r--r--net/netfilter/xt_IDLETIMER.c2
-rw-r--r--net/netfilter/xt_RATEEST.c2
-rw-r--r--net/netfilter/xt_connlimit.c2
-rw-r--r--net/netfilter/xt_hashlimit.c29
-rw-r--r--net/netfilter/xt_nfacct.c36
-rw-r--r--net/netfilter/xt_physdev.c11
-rw-r--r--net/netfilter/xt_recent.c17
-rw-r--r--net/netfilter/xt_set.c1
-rw-r--r--net/netfilter/xt_time.c19
-rw-r--r--net/netlabel/netlabel_kapi.c2
-rw-r--r--net/netlink/af_netlink.c2
-rw-r--r--net/netlink/genetlink.c303
-rw-r--r--net/netrom/af_netrom.c23
-rw-r--r--net/nfc/hci/Kconfig14
-rw-r--r--net/nfc/llcp_sock.c18
-rw-r--r--net/nfc/nci/spi.c6
-rw-r--r--net/nfc/nci/uart.c2
-rw-r--r--net/nfc/netlink.c25
-rw-r--r--net/openvswitch/actions.c33
-rw-r--r--net/openvswitch/conntrack.c52
-rw-r--r--net/openvswitch/datapath.c210
-rw-r--r--net/openvswitch/datapath.h14
-rw-r--r--net/openvswitch/flow.c193
-rw-r--r--net/openvswitch/flow.h15
-rw-r--r--net/openvswitch/flow_netlink.c121
-rw-r--r--net/openvswitch/flow_table.c381
-rw-r--r--net/openvswitch/flow_table.h19
-rw-r--r--net/openvswitch/vport-internal_dev.c13
-rw-r--r--net/openvswitch/vport.c5
-rw-r--r--net/packet/af_packet.c74
-rw-r--r--net/phonet/pn_dev.c2
-rw-r--r--net/phonet/socket.c4
-rw-r--r--net/psample/psample.c24
-rw-r--r--net/qrtr/qrtr.c320
-rw-r--r--net/qrtr/tun.c11
-rw-r--r--net/rds/Kconfig4
-rw-r--r--net/rds/af_rds.c99
-rw-r--r--net/rds/bind.c49
-rw-r--r--net/rds/ib.c39
-rw-r--r--net/rds/ib.h19
-rw-r--r--net/rds/ib_cm.c193
-rw-r--r--net/rds/ib_mr.h7
-rw-r--r--net/rds/ib_rdma.c84
-rw-r--r--net/rds/ib_recv.c36
-rw-r--r--net/rds/ib_send.c63
-rw-r--r--net/rds/ib_stats.c2
-rw-r--r--net/rds/rdma.c157
-rw-r--r--net/rds/rdma_transport.c10
-rw-r--r--net/rds/rds.h24
-rw-r--r--net/rds/recv.c27
-rw-r--r--net/rds/send.c16
-rw-r--r--net/rds/stats.c3
-rw-r--r--net/rfkill/core.c18
-rw-r--r--net/rose/af_rose.c29
-rw-r--r--net/rose/rose_route.c1
-rw-r--r--net/rxrpc/Kconfig2
-rw-r--r--net/rxrpc/af_rxrpc.c7
-rw-r--r--net/rxrpc/ar-internal.h44
-rw-r--r--net/rxrpc/call_accept.c65
-rw-r--r--net/rxrpc/call_event.c8
-rw-r--r--net/rxrpc/call_object.c89
-rw-r--r--net/rxrpc/conn_client.c56
-rw-r--r--net/rxrpc/conn_event.c52
-rw-r--r--net/rxrpc/conn_object.c18
-rw-r--r--net/rxrpc/conn_service.c6
-rw-r--r--net/rxrpc/input.c328
-rw-r--r--net/rxrpc/insecure.c5
-rw-r--r--net/rxrpc/local_event.c4
-rw-r--r--net/rxrpc/local_object.c28
-rw-r--r--net/rxrpc/output.c33
-rw-r--r--net/rxrpc/peer_event.c61
-rw-r--r--net/rxrpc/peer_object.c18
-rw-r--r--net/rxrpc/protocol.h9
-rw-r--r--net/rxrpc/recvmsg.c71
-rw-r--r--net/rxrpc/rxkad.c140
-rw-r--r--net/rxrpc/security.c70
-rw-r--r--net/rxrpc/sendmsg.c16
-rw-r--r--net/rxrpc/skbuff.c40
-rw-r--r--net/sched/Kconfig186
-rw-r--r--net/sched/Makefile2
-rw-r--r--net/sched/act_api.c101
-rw-r--r--net/sched/act_bpf.c7
-rw-r--r--net/sched/act_connmark.c6
-rw-r--r--net/sched/act_csum.c16
-rw-r--r--net/sched/act_ct.c41
-rw-r--r--net/sched/act_ctinfo.c21
-rw-r--r--net/sched/act_gact.c23
-rw-r--r--net/sched/act_ife.c16
-rw-r--r--net/sched/act_ipt.c23
-rw-r--r--net/sched/act_mirred.c74
-rw-r--r--net/sched/act_mpls.c25
-rw-r--r--net/sched/act_nat.c10
-rw-r--r--net/sched/act_pedit.c19
-rw-r--r--net/sched/act_police.c49
-rw-r--r--net/sched/act_sample.c40
-rw-r--r--net/sched/act_simple.c9
-rw-r--r--net/sched/act_skbedit.c10
-rw-r--r--net/sched/act_skbmod.c6
-rw-r--r--net/sched/act_tunnel_key.c222
-rw-r--r--net/sched/act_vlan.c26
-rw-r--r--net/sched/cls_api.c847
-rw-r--r--net/sched/cls_basic.c11
-rw-r--r--net/sched/cls_bpf.c55
-rw-r--r--net/sched/cls_flower.c522
-rw-r--r--net/sched/cls_fw.c11
-rw-r--r--net/sched/cls_matchall.c45
-rw-r--r--net/sched/cls_route.c11
-rw-r--r--net/sched/cls_rsvp.h17
-rw-r--r--net/sched/cls_tcindex.c54
-rw-r--r--net/sched/cls_u32.c40
-rw-r--r--net/sched/em_meta.c8
-rw-r--r--net/sched/ematch.c5
-rw-r--r--net/sched/sch_api.c45
-rw-r--r--net/sched/sch_cake.c66
-rw-r--r--net/sched/sch_cbq.c43
-rw-r--r--net/sched/sch_cbs.c51
-rw-r--r--net/sched/sch_choke.c2
-rw-r--r--net/sched/sch_dsmark.c2
-rw-r--r--net/sched/sch_etf.c2
-rw-r--r--net/sched/sch_ets.c828
-rw-r--r--net/sched/sch_fq.c26
-rw-r--r--net/sched/sch_fq_codel.c15
-rw-r--r--net/sched/sch_fq_pie.c562
-rw-r--r--net/sched/sch_generic.c70
-rw-r--r--net/sched/sch_hhf.c10
-rw-r--r--net/sched/sch_htb.c4
-rw-r--r--net/sched/sch_mq.c4
-rw-r--r--net/sched/sch_mqprio.c7
-rw-r--r--net/sched/sch_multiq.c25
-rw-r--r--net/sched/sch_netem.c15
-rw-r--r--net/sched/sch_pie.c315
-rw-r--r--net/sched/sch_prio.c12
-rw-r--r--net/sched/sch_sfb.c20
-rw-r--r--net/sched/sch_sfq.c14
-rw-r--r--net/sched/sch_taprio.c552
-rw-r--r--net/sched/sch_tbf.c60
-rw-r--r--net/sctp/associola.c80
-rw-r--r--net/sctp/auth.c101
-rw-r--r--net/sctp/chunk.c42
-rw-r--r--net/sctp/diag.c6
-rw-r--r--net/sctp/endpointola.c74
-rw-r--r--net/sctp/input.c27
-rw-r--r--net/sctp/ipv6.c6
-rw-r--r--net/sctp/output.c2
-rw-r--r--net/sctp/outqueue.c13
-rw-r--r--net/sctp/protocol.c16
-rw-r--r--net/sctp/sm_make_chunk.c53
-rw-r--r--net/sctp/sm_sideeffect.c49
-rw-r--r--net/sctp/sm_statefuns.c30
-rw-r--r--net/sctp/sm_statetable.c28
-rw-r--r--net/sctp/socket.c892
-rw-r--r--net/sctp/stream.c25
-rw-r--r--net/sctp/stream_interleave.c23
-rw-r--r--net/sctp/sysctl.c29
-rw-r--r--net/sctp/transport.c6
-rw-r--r--net/sctp/ulpevent.c57
-rw-r--r--net/sctp/ulpqueue.c15
-rw-r--r--net/smc/af_smc.c56
-rw-r--r--net/smc/smc.h1
-rw-r--r--net/smc/smc_cdc.c7
-rw-r--r--net/smc/smc_clc.c6
-rw-r--r--net/smc/smc_close.c97
-rw-r--r--net/smc/smc_close.h2
-rw-r--r--net/smc/smc_core.c450
-rw-r--r--net/smc/smc_core.h16
-rw-r--r--net/smc/smc_diag.c5
-rw-r--r--net/smc/smc_ib.c24
-rw-r--r--net/smc/smc_ib.h4
-rw-r--r--net/smc/smc_ism.c27
-rw-r--r--net/smc/smc_llc.c11
-rw-r--r--net/smc/smc_pnet.c11
-rw-r--r--net/smc/smc_rx.c37
-rw-r--r--net/smc/smc_tx.c34
-rw-r--r--net/smc/smc_wr.c45
-rw-r--r--net/smc/smc_wr.h10
-rw-r--r--net/socket.c297
-rw-r--r--net/sunrpc/addr.c2
-rw-r--r--net/sunrpc/auth.c49
-rw-r--r--net/sunrpc/auth_gss/auth_gss.c3
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_keys.c9
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_mech.c12
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_seal.c8
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_unseal.c6
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_wrap.c16
-rw-r--r--net/sunrpc/auth_gss/gss_mech_switch.c35
-rw-r--r--net/sunrpc/auth_gss/svcauth_gss.c112
-rw-r--r--net/sunrpc/backchannel_rqst.c7
-rw-r--r--net/sunrpc/cache.c130
-rw-r--r--net/sunrpc/clnt.c95
-rw-r--r--net/sunrpc/rpc_pipe.c5
-rw-r--r--net/sunrpc/sched.c39
-rw-r--r--net/sunrpc/stats.c21
-rw-r--r--net/sunrpc/svc.c6
-rw-r--r--net/sunrpc/svcauth.c2
-rw-r--r--net/sunrpc/svcauth_unix.c10
-rw-r--r--net/sunrpc/xdr.c78
-rw-r--r--net/sunrpc/xprt.c95
-rw-r--r--net/sunrpc/xprtrdma/backchannel.c12
-rw-r--r--net/sunrpc/xprtrdma/frwr_ops.c266
-rw-r--r--net/sunrpc/xprtrdma/rpc_rdma.c488
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma.c7
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_backchannel.c1
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_recvfrom.c24
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_sendto.c8
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_transport.c14
-rw-r--r--net/sunrpc/xprtrdma/transport.c65
-rw-r--r--net/sunrpc/xprtrdma/verbs.c636
-rw-r--r--net/sunrpc/xprtrdma/xprt_rdma.h91
-rw-r--r--net/sunrpc/xprtsock.c30
-rw-r--r--net/tipc/Kconfig15
-rw-r--r--net/tipc/Makefile5
-rw-r--r--net/tipc/bcast.c55
-rw-r--r--net/tipc/bcast.h2
-rw-r--r--net/tipc/bearer.c124
-rw-r--r--net/tipc/bearer.h22
-rw-r--r--net/tipc/core.c66
-rw-r--r--net/tipc/core.h23
-rw-r--r--net/tipc/crypto.c1983
-rw-r--r--net/tipc/crypto.h167
-rw-r--r--net/tipc/discover.c4
-rw-r--r--net/tipc/eth_media.c3
-rw-r--r--net/tipc/group.c4
-rw-r--r--net/tipc/ib_media.c5
-rw-r--r--net/tipc/link.c453
-rw-r--r--net/tipc/link.h10
-rw-r--r--net/tipc/monitor.c15
-rw-r--r--net/tipc/monitor.h1
-rw-r--r--net/tipc/msg.c283
-rw-r--r--net/tipc/msg.h105
-rw-r--r--net/tipc/name_distr.c7
-rw-r--r--net/tipc/name_table.c330
-rw-r--r--net/tipc/name_table.h4
-rw-r--r--net/tipc/net.c58
-rw-r--r--net/tipc/net.h1
-rw-r--r--net/tipc/netlink.c45
-rw-r--r--net/tipc/netlink.h1
-rw-r--r--net/tipc/netlink_compat.c32
-rw-r--r--net/tipc/node.c525
-rw-r--r--net/tipc/node.h27
-rw-r--r--net/tipc/socket.c254
-rw-r--r--net/tipc/sysctl.c11
-rw-r--r--net/tipc/topsrv.c2
-rw-r--r--net/tipc/udp_media.c19
-rw-r--r--net/tls/Kconfig10
-rw-r--r--net/tls/Makefile5
-rw-r--r--net/tls/tls_device.c297
-rw-r--r--net/tls/tls_device_fallback.c2
-rw-r--r--net/tls/tls_main.c318
-rw-r--r--net/tls/tls_proc.c49
-rw-r--r--net/tls/tls_sw.c138
-rw-r--r--net/tls/tls_toe.c139
-rw-r--r--net/tls/trace.c10
-rw-r--r--net/tls/trace.h202
-rw-r--r--net/unix/af_unix.c103
-rw-r--r--net/vmw_vsock/Kconfig12
-rw-r--r--net/vmw_vsock/Makefile1
-rw-r--r--net/vmw_vsock/af_vsock.c450
-rw-r--r--net/vmw_vsock/hyperv_transport.c199
-rw-r--r--net/vmw_vsock/virtio_transport.c185
-rw-r--r--net/vmw_vsock/virtio_transport_common.c370
-rw-r--r--net/vmw_vsock/vmci_transport.c144
-rw-r--r--net/vmw_vsock/vmci_transport.h3
-rw-r--r--net/vmw_vsock/vmci_transport_notify.h1
-rw-r--r--net/vmw_vsock/vsock_loopback.c180
-rw-r--r--net/wimax/debugfs.c42
-rw-r--r--net/wimax/stack.c11
-rw-r--r--net/wimax/wimax-internal.h7
-rw-r--r--net/wireless/Kconfig2
-rw-r--r--net/wireless/chan.c167
-rw-r--r--net/wireless/core.c31
-rw-r--r--net/wireless/core.h4
-rw-r--r--net/wireless/ethtool.c8
-rw-r--r--net/wireless/ibss.c16
-rw-r--r--net/wireless/lib80211_crypt_ccmp.c197
-rw-r--r--net/wireless/nl80211.c310
-rw-r--r--net/wireless/rdev-ops.h14
-rw-r--r--net/wireless/reg.c62
-rw-r--r--net/wireless/reg.h10
-rw-r--r--net/wireless/scan.c299
-rw-r--r--net/wireless/sme.c6
-rw-r--r--net/wireless/trace.h14
-rw-r--r--net/wireless/util.c85
-rw-r--r--net/wireless/wext-compat.c7
-rw-r--r--net/wireless/wext-core.c3
-rw-r--r--net/wireless/wext-sme.c8
-rw-r--r--net/x25/af_x25.c18
-rw-r--r--net/x25/x25_dev.c2
-rw-r--r--net/x25/x25_in.c32
-rw-r--r--net/xdp/xdp_umem.c93
-rw-r--r--net/xdp/xsk.c483
-rw-r--r--net/xdp/xsk.h13
-rw-r--r--net/xdp/xsk_diag.c5
-rw-r--r--net/xdp/xsk_queue.c15
-rw-r--r--net/xdp/xsk_queue.h344
-rw-r--r--net/xfrm/Kconfig12
-rw-r--r--net/xfrm/Makefile1
-rw-r--r--net/xfrm/espintcp.c509
-rw-r--r--net/xfrm/xfrm_algo.c4
-rw-r--r--net/xfrm/xfrm_device.c15
-rw-r--r--net/xfrm/xfrm_input.c26
-rw-r--r--net/xfrm/xfrm_interface.c121
-rw-r--r--net/xfrm/xfrm_ipcomp.c2
-rw-r--r--net/xfrm/xfrm_output.c11
-rw-r--r--net/xfrm/xfrm_policy.c21
-rw-r--r--net/xfrm/xfrm_state.c5
821 files changed, 52608 insertions, 14028 deletions
diff --git a/net/802/mrp.c b/net/802/mrp.c
index 2cfdfbfbb2ed..bea6e43d45a0 100644
--- a/net/802/mrp.c
+++ b/net/802/mrp.c
@@ -523,7 +523,7 @@ int mrp_request_join(const struct net_device *dev,
struct mrp_attr *attr;
if (sizeof(struct mrp_skb_cb) + len >
- FIELD_SIZEOF(struct sk_buff, cb))
+ sizeof_field(struct sk_buff, cb))
return -ENOMEM;
spin_lock_bh(&app->lock);
@@ -548,7 +548,7 @@ void mrp_request_leave(const struct net_device *dev,
struct mrp_attr *attr;
if (sizeof(struct mrp_skb_cb) + len >
- FIELD_SIZEOF(struct sk_buff, cb))
+ sizeof_field(struct sk_buff, cb))
return;
spin_lock_bh(&app->lock);
@@ -692,7 +692,7 @@ static int mrp_pdu_parse_vecattr(struct mrp_applicant *app,
* advance to the next event in its Vector.
*/
if (sizeof(struct mrp_skb_cb) + mrp_cb(skb)->mh->attrlen >
- FIELD_SIZEOF(struct sk_buff, cb))
+ sizeof_field(struct sk_buff, cb))
return -1;
if (skb_copy_bits(skb, *offset, mrp_cb(skb)->attrvalue,
mrp_cb(skb)->mh->attrlen) < 0)
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 54728d2eda18..d4bcfd8f95bf 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -172,7 +172,6 @@ int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack)
if (err < 0)
goto out_uninit_mvrp;
- vlan->nest_level = dev_get_nest_level(real_dev) + 1;
err = register_netdevice(dev);
if (err < 0)
goto out_uninit_mvrp;
diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h
index c46daf09a501..bb7ec1a3915d 100644
--- a/net/8021q/vlan.h
+++ b/net/8021q/vlan.h
@@ -126,6 +126,7 @@ int vlan_check_real_dev(struct net_device *real_dev,
void vlan_setup(struct net_device *dev);
int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack);
void unregister_vlan_dev(struct net_device *dev, struct list_head *head);
+void vlan_dev_uninit(struct net_device *dev);
bool vlan_dev_inherit_address(struct net_device *dev,
struct net_device *real_dev);
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 93eadf179123..990b9fde28c6 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -489,36 +489,6 @@ static void vlan_dev_set_rx_mode(struct net_device *vlan_dev)
dev_uc_sync(vlan_dev_priv(vlan_dev)->real_dev, vlan_dev);
}
-/*
- * vlan network devices have devices nesting below it, and are a special
- * "super class" of normal network devices; split their locks off into a
- * separate class since they always nest.
- */
-static struct lock_class_key vlan_netdev_xmit_lock_key;
-static struct lock_class_key vlan_netdev_addr_lock_key;
-
-static void vlan_dev_set_lockdep_one(struct net_device *dev,
- struct netdev_queue *txq,
- void *_subclass)
-{
- lockdep_set_class_and_subclass(&txq->_xmit_lock,
- &vlan_netdev_xmit_lock_key,
- *(int *)_subclass);
-}
-
-static void vlan_dev_set_lockdep_class(struct net_device *dev, int subclass)
-{
- lockdep_set_class_and_subclass(&dev->addr_list_lock,
- &vlan_netdev_addr_lock_key,
- subclass);
- netdev_for_each_tx_queue(dev, vlan_dev_set_lockdep_one, &subclass);
-}
-
-static int vlan_dev_get_lock_subclass(struct net_device *dev)
-{
- return vlan_dev_priv(dev)->nest_level;
-}
-
static const struct header_ops vlan_header_ops = {
.create = vlan_dev_hard_header,
.parse = eth_header_parse,
@@ -609,8 +579,6 @@ static int vlan_dev_init(struct net_device *dev)
SET_NETDEV_DEVTYPE(dev, &vlan_type);
- vlan_dev_set_lockdep_class(dev, vlan_dev_get_lock_subclass(dev));
-
vlan->vlan_pcpu_stats = netdev_alloc_pcpu_stats(struct vlan_pcpu_stats);
if (!vlan->vlan_pcpu_stats)
return -ENOMEM;
@@ -618,7 +586,8 @@ static int vlan_dev_init(struct net_device *dev)
return 0;
}
-static void vlan_dev_uninit(struct net_device *dev)
+/* Note: this function might be called multiple times for the same device. */
+void vlan_dev_uninit(struct net_device *dev)
{
struct vlan_priority_tci_mapping *pm;
struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
@@ -678,8 +647,8 @@ static int vlan_ethtool_get_ts_info(struct net_device *dev,
const struct ethtool_ops *ops = vlan->real_dev->ethtool_ops;
struct phy_device *phydev = vlan->real_dev->phydev;
- if (phydev && phydev->drv && phydev->drv->ts_info) {
- return phydev->drv->ts_info(phydev, info);
+ if (phy_has_tsinfo(phydev)) {
+ return phy_ts_info(phydev, info);
} else if (ops->get_ts_info) {
return ops->get_ts_info(vlan->real_dev, info);
} else {
@@ -812,7 +781,6 @@ static const struct net_device_ops vlan_netdev_ops = {
.ndo_netpoll_cleanup = vlan_dev_netpoll_cleanup,
#endif
.ndo_fix_features = vlan_dev_fix_features,
- .ndo_get_lock_subclass = vlan_dev_get_lock_subclass,
.ndo_get_iflink = vlan_dev_get_iflink,
};
diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c
index c482a6fe9393..0db85aeb119b 100644
--- a/net/8021q/vlan_netlink.c
+++ b/net/8021q/vlan_netlink.c
@@ -108,11 +108,13 @@ static int vlan_changelink(struct net_device *dev, struct nlattr *tb[],
struct ifla_vlan_flags *flags;
struct ifla_vlan_qos_mapping *m;
struct nlattr *attr;
- int rem;
+ int rem, err;
if (data[IFLA_VLAN_FLAGS]) {
flags = nla_data(data[IFLA_VLAN_FLAGS]);
- vlan_dev_change_flags(dev, flags->flags, flags->mask);
+ err = vlan_dev_change_flags(dev, flags->flags, flags->mask);
+ if (err)
+ return err;
}
if (data[IFLA_VLAN_INGRESS_QOS]) {
nla_for_each_nested(attr, data[IFLA_VLAN_INGRESS_QOS], rem) {
@@ -123,7 +125,9 @@ static int vlan_changelink(struct net_device *dev, struct nlattr *tb[],
if (data[IFLA_VLAN_EGRESS_QOS]) {
nla_for_each_nested(attr, data[IFLA_VLAN_EGRESS_QOS], rem) {
m = nla_data(attr);
- vlan_dev_set_egress_priority(dev, m->from, m->to);
+ err = vlan_dev_set_egress_priority(dev, m->from, m->to);
+ if (err)
+ return err;
}
}
return 0;
@@ -179,10 +183,11 @@ static int vlan_newlink(struct net *src_net, struct net_device *dev,
return -EINVAL;
err = vlan_changelink(dev, tb, data, extack);
- if (err < 0)
- return err;
-
- return register_vlan_dev(dev, extack);
+ if (!err)
+ err = register_vlan_dev(dev, extack);
+ if (err)
+ vlan_dev_uninit(dev);
+ return err;
}
static inline size_t vlan_qos_map_size(unsigned int n)
diff --git a/net/9p/client.c b/net/9p/client.c
index 9622f3e469f6..1d48afc7033c 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -281,6 +281,7 @@ p9_tag_alloc(struct p9_client *c, int8_t type, unsigned int max_size)
p9pdu_reset(&req->tc);
p9pdu_reset(&req->rc);
+ req->t_err = 0;
req->status = REQ_STATUS_ALLOC;
init_waitqueue_head(&req->wq);
INIT_LIST_HEAD(&req->req_list);
diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c
index bac8dad5dd69..b21c3c209815 100644
--- a/net/9p/trans_rdma.c
+++ b/net/9p/trans_rdma.c
@@ -685,9 +685,9 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args)
goto error;
/* Create the Completion Queue */
- rdma->cq = ib_alloc_cq(rdma->cm_id->device, client,
- opts.sq_depth + opts.rq_depth + 1,
- 0, IB_POLL_SOFTIRQ);
+ rdma->cq = ib_alloc_cq_any(rdma->cm_id->device, client,
+ opts.sq_depth + opts.rq_depth + 1,
+ IB_POLL_SOFTIRQ);
if (IS_ERR(rdma->cq))
goto error;
diff --git a/net/Kconfig b/net/Kconfig
index 57f51a279ad6..b0937a700f01 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -91,6 +91,7 @@ if INET
source "net/ipv4/Kconfig"
source "net/ipv6/Kconfig"
source "net/netlabel/Kconfig"
+source "net/mptcp/Kconfig"
endif # if INET
@@ -108,9 +109,10 @@ config NETWORK_PHY_TIMESTAMPING
bool "Timestamping in PHY devices"
select NET_PTP_CLASSIFY
help
- This allows timestamping of network packets by PHYs with
- hardware timestamping capabilities. This option adds some
- overhead in the transmit and receive paths.
+ This allows timestamping of network packets by PHYs (or
+ other MII bus snooping devices) with hardware timestamping
+ capabilities. This option adds some overhead in the transmit
+ and receive paths.
If you are unsure how to answer this question, answer N.
@@ -258,7 +260,7 @@ config XPS
default y
config HWBM
- bool
+ bool
config CGROUP_NET_PRIO
bool "Network priority cgroup"
@@ -309,12 +311,12 @@ config BPF_STREAM_PARSER
select STREAM_PARSER
select NET_SOCK_MSG
---help---
- Enabling this allows a stream parser to be used with
- BPF_MAP_TYPE_SOCKMAP.
+ Enabling this allows a stream parser to be used with
+ BPF_MAP_TYPE_SOCKMAP.
- BPF_MAP_TYPE_SOCKMAP provides a map type to use with network sockets.
- It can be used to enforce socket policy, implement socket redirects,
- etc.
+ BPF_MAP_TYPE_SOCKMAP provides a map type to use with network sockets.
+ It can be used to enforce socket policy, implement socket redirects,
+ etc.
config NET_FLOW_LIMIT
bool
@@ -349,12 +351,12 @@ config NET_DROP_MONITOR
tristate "Network packet drop alerting service"
depends on INET && TRACEPOINTS
---help---
- This feature provides an alerting service to userspace in the
- event that packets are discarded in the network stack. Alerts
- are broadcast via netlink socket to any listening user space
- process. If you don't need network drop alerts, or if you are ok
- just checking the various proc files and other utilities for
- drop statistics, say N here.
+ This feature provides an alerting service to userspace in the
+ event that packets are discarded in the network stack. Alerts
+ are broadcast via netlink socket to any listening user space
+ process. If you don't need network drop alerts, or if you are ok
+ just checking the various proc files and other utilities for
+ drop statistics, say N here.
endmenu
@@ -430,9 +432,10 @@ config NET_SOCK_MSG
config NET_DEVLINK
bool
default n
+ imply NET_DROP_MONITOR
config PAGE_POOL
- bool
+ bool
config FAILOVER
tristate "Generic failover module"
@@ -447,6 +450,14 @@ config FAILOVER
migration of VMs with direct attached VFs by failing over to the
paravirtual datapath when the VF is unplugged.
+config ETHTOOL_NETLINK
+ bool "Netlink interface for ethtool"
+ default y
+ help
+ An alternative userspace interface for ethtool based on generic
+ netlink. It provides better extensibility and some new features,
+ e.g. notification messages.
+
endif # if NET
# Used by archs to tell that they support BPF JIT compiler plus which flavour.
diff --git a/net/Makefile b/net/Makefile
index 449fc0b221f8..07ea48160874 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -13,7 +13,7 @@ obj-$(CONFIG_NET) += $(tmp-y)
# LLC has to be linked before the files in net/802/
obj-$(CONFIG_LLC) += llc/
-obj-$(CONFIG_NET) += ethernet/ 802/ sched/ netlink/ bpf/
+obj-$(CONFIG_NET) += ethernet/ 802/ sched/ netlink/ bpf/ ethtool/
obj-$(CONFIG_NETFILTER) += netfilter/
obj-$(CONFIG_INET) += ipv4/
obj-$(CONFIG_TLS) += tls/
@@ -87,3 +87,4 @@ endif
obj-$(CONFIG_QRTR) += qrtr/
obj-$(CONFIG_NET_NCSI) += ncsi/
obj-$(CONFIG_XDP_SOCKETS) += xdp/
+obj-$(CONFIG_MPTCP) += mptcp/
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index a8cb6b2e20c1..b41375d4d295 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -953,8 +953,8 @@ static unsigned long atalk_sum_skb(const struct sk_buff *skb, int offset,
if (copy > len)
copy = len;
vaddr = kmap_atomic(skb_frag_page(frag));
- sum = atalk_sum_partial(vaddr + frag->page_offset +
- offset - start, copy, sum);
+ sum = atalk_sum_partial(vaddr + skb_frag_off(frag) +
+ offset - start, copy, sum);
kunmap_atomic(vaddr);
if (!(len -= copy))
@@ -1023,6 +1023,11 @@ static int atalk_create(struct net *net, struct socket *sock, int protocol,
*/
if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
goto out;
+
+ rc = -EPERM;
+ if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW))
+ goto out;
+
rc = -ENOMEM;
sk = sk_alloc(net, PF_APPLETALK, GFP_KERNEL, &ddp_proto, kern);
if (!sk)
diff --git a/net/atm/atm_sysfs.c b/net/atm/atm_sysfs.c
index 39b94ca5f65d..aa1b57161f3b 100644
--- a/net/atm/atm_sysfs.c
+++ b/net/atm/atm_sysfs.c
@@ -33,23 +33,17 @@ static ssize_t show_atmaddress(struct device *cdev,
unsigned long flags;
struct atm_dev *adev = to_atm_dev(cdev);
struct atm_dev_addr *aaddr;
- int bin[] = { 1, 2, 10, 6, 1 }, *fmt = bin;
- int i, j, count = 0;
+ int count = 0;
spin_lock_irqsave(&adev->lock, flags);
list_for_each_entry(aaddr, &adev->local, entry) {
- for (i = 0, j = 0; i < ATM_ESA_LEN; ++i, ++j) {
- if (j == *fmt) {
- count += scnprintf(buf + count,
- PAGE_SIZE - count, ".");
- ++fmt;
- j = 0;
- }
- count += scnprintf(buf + count,
- PAGE_SIZE - count, "%02x",
- aaddr->addr.sas_addr.prv[i]);
- }
- count += scnprintf(buf + count, PAGE_SIZE - count, "\n");
+ count += scnprintf(buf + count, PAGE_SIZE - count,
+ "%1phN.%2phN.%10phN.%6phN.%1phN\n",
+ &aaddr->addr.sas_addr.prv[0],
+ &aaddr->addr.sas_addr.prv[1],
+ &aaddr->addr.sas_addr.prv[3],
+ &aaddr->addr.sas_addr.prv[13],
+ &aaddr->addr.sas_addr.prv[19]);
}
spin_unlock_irqrestore(&adev->lock, flags);
diff --git a/net/atm/clip.c b/net/atm/clip.c
index a7972da7235d..294cb9efe3d3 100644
--- a/net/atm/clip.c
+++ b/net/atm/clip.c
@@ -89,7 +89,7 @@ static void unlink_clip_vcc(struct clip_vcc *clip_vcc)
struct clip_vcc **walk;
if (!entry) {
- pr_crit("!clip_vcc->entry (clip_vcc %p)\n", clip_vcc);
+ pr_err("!clip_vcc->entry (clip_vcc %p)\n", clip_vcc);
return;
}
netif_tx_lock_bh(entry->neigh->dev); /* block clip_start_xmit() */
@@ -109,10 +109,10 @@ static void unlink_clip_vcc(struct clip_vcc *clip_vcc)
error = neigh_update(entry->neigh, NULL, NUD_NONE,
NEIGH_UPDATE_F_ADMIN, 0);
if (error)
- pr_crit("neigh_update failed with %d\n", error);
+ pr_err("neigh_update failed with %d\n", error);
goto out;
}
- pr_crit("ATMARP: failed (entry %p, vcc 0x%p)\n", entry, clip_vcc);
+ pr_err("ATMARP: failed (entry %p, vcc 0x%p)\n", entry, clip_vcc);
out:
netif_tx_unlock_bh(entry->neigh->dev);
}
diff --git a/net/atm/common.c b/net/atm/common.c
index b7528e77997c..0ce530af534d 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -668,7 +668,7 @@ __poll_t vcc_poll(struct file *file, struct socket *sock, poll_table *wait)
mask |= EPOLLHUP;
/* readable? */
- if (!skb_queue_empty(&sk->sk_receive_queue))
+ if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
mask |= EPOLLIN | EPOLLRDNORM;
/* writable? */
diff --git a/net/atm/lec.c b/net/atm/lec.c
index 5a77c235a212..25fa3a7b72bd 100644
--- a/net/atm/lec.c
+++ b/net/atm/lec.c
@@ -194,7 +194,7 @@ lec_send(struct atm_vcc *vcc, struct sk_buff *skb)
dev->stats.tx_bytes += skb->len;
}
-static void lec_tx_timeout(struct net_device *dev)
+static void lec_tx_timeout(struct net_device *dev, unsigned int txqueue)
{
pr_info("%s\n", dev->name);
netif_trans_update(dev);
@@ -799,14 +799,9 @@ static const char *lec_arp_get_status_string(unsigned char status)
static void lec_info(struct seq_file *seq, struct lec_arp_table *entry)
{
- int i;
-
- for (i = 0; i < ETH_ALEN; i++)
- seq_printf(seq, "%2.2x", entry->mac_addr[i] & 0xff);
- seq_printf(seq, " ");
- for (i = 0; i < ATM_ESA_LEN; i++)
- seq_printf(seq, "%2.2x", entry->atm_addr[i] & 0xff);
- seq_printf(seq, " %s %4.4x", lec_arp_get_status_string(entry->status),
+ seq_printf(seq, "%pM ", entry->mac_addr);
+ seq_printf(seq, "%*phN ", ATM_ESA_LEN, entry->atm_addr);
+ seq_printf(seq, "%s %4.4x", lec_arp_get_status_string(entry->status),
entry->flags & 0xffff);
if (entry->vcc)
seq_printf(seq, "%3d %3d ", entry->vcc->vpi, entry->vcc->vci);
@@ -1354,7 +1349,7 @@ static void dump_arp_table(struct lec_priv *priv)
{
struct lec_arp_table *rulla;
char buf[256];
- int i, j, offset;
+ int i, offset;
pr_info("Dump %p:\n", priv);
for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) {
@@ -1362,14 +1357,10 @@ static void dump_arp_table(struct lec_priv *priv)
&priv->lec_arp_tables[i], next) {
offset = 0;
offset += sprintf(buf, "%d: %p\n", i, rulla);
- offset += sprintf(buf + offset, "Mac: %pM",
+ offset += sprintf(buf + offset, "Mac: %pM ",
rulla->mac_addr);
- offset += sprintf(buf + offset, " Atm:");
- for (j = 0; j < ATM_ESA_LEN; j++) {
- offset += sprintf(buf + offset,
- "%2.2x ",
- rulla->atm_addr[j] & 0xff);
- }
+ offset += sprintf(buf + offset, "Atm: %*ph ", ATM_ESA_LEN,
+ rulla->atm_addr);
offset += sprintf(buf + offset,
"Vcc vpi:%d vci:%d, Recv_vcc vpi:%d vci:%d Last_used:%lx, Timestamp:%lx, No_tries:%d ",
rulla->vcc ? rulla->vcc->vpi : 0,
@@ -1392,12 +1383,9 @@ static void dump_arp_table(struct lec_priv *priv)
pr_info("No forward\n");
hlist_for_each_entry(rulla, &priv->lec_no_forward, next) {
offset = 0;
- offset += sprintf(buf + offset, "Mac: %pM", rulla->mac_addr);
- offset += sprintf(buf + offset, " Atm:");
- for (j = 0; j < ATM_ESA_LEN; j++) {
- offset += sprintf(buf + offset, "%2.2x ",
- rulla->atm_addr[j] & 0xff);
- }
+ offset += sprintf(buf + offset, "Mac: %pM ", rulla->mac_addr);
+ offset += sprintf(buf + offset, "Atm: %*ph ", ATM_ESA_LEN,
+ rulla->atm_addr);
offset += sprintf(buf + offset,
"Vcc vpi:%d vci:%d, Recv_vcc vpi:%d vci:%d Last_used:%lx, Timestamp:%lx, No_tries:%d ",
rulla->vcc ? rulla->vcc->vpi : 0,
@@ -1417,12 +1405,9 @@ static void dump_arp_table(struct lec_priv *priv)
pr_info("Empty ones\n");
hlist_for_each_entry(rulla, &priv->lec_arp_empty_ones, next) {
offset = 0;
- offset += sprintf(buf + offset, "Mac: %pM", rulla->mac_addr);
- offset += sprintf(buf + offset, " Atm:");
- for (j = 0; j < ATM_ESA_LEN; j++) {
- offset += sprintf(buf + offset, "%2.2x ",
- rulla->atm_addr[j] & 0xff);
- }
+ offset += sprintf(buf + offset, "Mac: %pM ", rulla->mac_addr);
+ offset += sprintf(buf + offset, "Atm: %*ph ", ATM_ESA_LEN,
+ rulla->atm_addr);
offset += sprintf(buf + offset,
"Vcc vpi:%d vci:%d, Recv_vcc vpi:%d vci:%d Last_used:%lx, Timestamp:%lx, No_tries:%d ",
rulla->vcc ? rulla->vcc->vpi : 0,
@@ -1442,12 +1427,9 @@ static void dump_arp_table(struct lec_priv *priv)
pr_info("Multicast Forward VCCs\n");
hlist_for_each_entry(rulla, &priv->mcast_fwds, next) {
offset = 0;
- offset += sprintf(buf + offset, "Mac: %pM", rulla->mac_addr);
- offset += sprintf(buf + offset, " Atm:");
- for (j = 0; j < ATM_ESA_LEN; j++) {
- offset += sprintf(buf + offset, "%2.2x ",
- rulla->atm_addr[j] & 0xff);
- }
+ offset += sprintf(buf + offset, "Mac: %pM ", rulla->mac_addr);
+ offset += sprintf(buf + offset, "Atm: %*ph ", ATM_ESA_LEN,
+ rulla->atm_addr);
offset += sprintf(buf + offset,
"Vcc vpi:%d vci:%d, Recv_vcc vpi:%d vci:%d Last_used:%lx, Timestamp:%lx, No_tries:%d ",
rulla->vcc ? rulla->vcc->vpi : 0,
@@ -1973,17 +1955,8 @@ lec_vcc_added(struct lec_priv *priv, const struct atmlec_ioc *ioc_data,
* Vcc which we don't want to make default vcc,
* attach it anyway.
*/
- pr_debug("LEC_ARP:Attaching data direct, not default: %2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x\n",
- ioc_data->atm_addr[0], ioc_data->atm_addr[1],
- ioc_data->atm_addr[2], ioc_data->atm_addr[3],
- ioc_data->atm_addr[4], ioc_data->atm_addr[5],
- ioc_data->atm_addr[6], ioc_data->atm_addr[7],
- ioc_data->atm_addr[8], ioc_data->atm_addr[9],
- ioc_data->atm_addr[10], ioc_data->atm_addr[11],
- ioc_data->atm_addr[12], ioc_data->atm_addr[13],
- ioc_data->atm_addr[14], ioc_data->atm_addr[15],
- ioc_data->atm_addr[16], ioc_data->atm_addr[17],
- ioc_data->atm_addr[18], ioc_data->atm_addr[19]);
+ pr_debug("LEC_ARP:Attaching data direct, not default: %*phN\n",
+ ATM_ESA_LEN, ioc_data->atm_addr);
entry = make_entry(priv, bus_mac);
if (entry == NULL)
goto out;
@@ -1999,17 +1972,8 @@ lec_vcc_added(struct lec_priv *priv, const struct atmlec_ioc *ioc_data,
dump_arp_table(priv);
goto out;
}
- pr_debug("LEC_ARP:Attaching data direct, default: %2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x\n",
- ioc_data->atm_addr[0], ioc_data->atm_addr[1],
- ioc_data->atm_addr[2], ioc_data->atm_addr[3],
- ioc_data->atm_addr[4], ioc_data->atm_addr[5],
- ioc_data->atm_addr[6], ioc_data->atm_addr[7],
- ioc_data->atm_addr[8], ioc_data->atm_addr[9],
- ioc_data->atm_addr[10], ioc_data->atm_addr[11],
- ioc_data->atm_addr[12], ioc_data->atm_addr[13],
- ioc_data->atm_addr[14], ioc_data->atm_addr[15],
- ioc_data->atm_addr[16], ioc_data->atm_addr[17],
- ioc_data->atm_addr[18], ioc_data->atm_addr[19]);
+ pr_debug("LEC_ARP:Attaching data direct, default: %*phN\n",
+ ATM_ESA_LEN, ioc_data->atm_addr);
for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) {
hlist_for_each_entry(entry,
&priv->lec_arp_tables[i], next) {
diff --git a/net/atm/mpoa_caches.c b/net/atm/mpoa_caches.c
index 4bb418313720..3286f9d527d3 100644
--- a/net/atm/mpoa_caches.c
+++ b/net/atm/mpoa_caches.c
@@ -180,8 +180,7 @@ static int cache_hit(in_cache_entry *entry, struct mpoa_client *mpc)
static void in_cache_put(in_cache_entry *entry)
{
if (refcount_dec_and_test(&entry->use)) {
- memset(entry, 0, sizeof(in_cache_entry));
- kfree(entry);
+ kzfree(entry);
}
}
@@ -416,8 +415,7 @@ static eg_cache_entry *eg_cache_get_by_src_ip(__be32 ipaddr,
static void eg_cache_put(eg_cache_entry *entry)
{
if (refcount_dec_and_test(&entry->use)) {
- memset(entry, 0, sizeof(eg_cache_entry));
- kfree(entry);
+ kzfree(entry);
}
}
diff --git a/net/atm/mpoa_proc.c b/net/atm/mpoa_proc.c
index 46d6cd9a36ae..829db9eba0cb 100644
--- a/net/atm/mpoa_proc.c
+++ b/net/atm/mpoa_proc.c
@@ -53,15 +53,12 @@ static ssize_t proc_mpc_write(struct file *file, const char __user *buff,
static int parse_qos(const char *buff);
-/*
- * Define allowed FILE OPERATIONS
- */
-static const struct file_operations mpc_file_operations = {
- .open = proc_mpc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .write = proc_mpc_write,
- .release = seq_release,
+static const struct proc_ops mpc_proc_ops = {
+ .proc_open = proc_mpc_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_write = proc_mpc_write,
+ .proc_release = seq_release,
};
/*
@@ -290,7 +287,7 @@ int mpc_proc_init(void)
{
struct proc_dir_entry *p;
- p = proc_create(STAT_FILE_NAME, 0, atm_proc_root, &mpc_file_operations);
+ p = proc_create(STAT_FILE_NAME, 0, atm_proc_root, &mpc_proc_ops);
if (!p) {
pr_err("Unable to initialize /proc/atm/%s\n", STAT_FILE_NAME);
return -ENOMEM;
diff --git a/net/atm/pppoatm.c b/net/atm/pppoatm.c
index bd3da9af5ef6..45d8e1d5d033 100644
--- a/net/atm/pppoatm.c
+++ b/net/atm/pppoatm.c
@@ -216,9 +216,7 @@ static void pppoatm_push(struct atm_vcc *atmvcc, struct sk_buff *skb)
pvcc->chan.mtu += LLC_LEN;
break;
}
- pr_debug("Couldn't autodetect yet (skb: %02X %02X %02X %02X %02X %02X)\n",
- skb->data[0], skb->data[1], skb->data[2],
- skb->data[3], skb->data[4], skb->data[5]);
+ pr_debug("Couldn't autodetect yet (skb: %6ph)\n", skb->data);
goto error;
case e_vc:
break;
diff --git a/net/atm/proc.c b/net/atm/proc.c
index d79221fd4dae..4369ffa3302a 100644
--- a/net/atm/proc.c
+++ b/net/atm/proc.c
@@ -36,9 +36,9 @@
static ssize_t proc_dev_atm_read(struct file *file, char __user *buf,
size_t count, loff_t *pos);
-static const struct file_operations proc_atm_dev_ops = {
- .read = proc_dev_atm_read,
- .llseek = noop_llseek,
+static const struct proc_ops atm_dev_proc_ops = {
+ .proc_read = proc_dev_atm_read,
+ .proc_lseek = noop_llseek,
};
static void add_stats(struct seq_file *seq, const char *aal,
@@ -134,8 +134,7 @@ static void vcc_seq_stop(struct seq_file *seq, void *v)
static void *vcc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
v = vcc_walk(seq, 1);
- if (v)
- (*pos)++;
+ (*pos)++;
return v;
}
@@ -360,7 +359,7 @@ int atm_proc_dev_register(struct atm_dev *dev)
goto err_out;
dev->proc_entry = proc_create_data(dev->proc_name, 0, atm_proc_root,
- &proc_atm_dev_ops, dev);
+ &atm_dev_proc_ops, dev);
if (!dev->proc_entry)
goto err_free_name;
return 0;
diff --git a/net/atm/signaling.c b/net/atm/signaling.c
index 6c11cdf4dd4c..fbd0c5e7b299 100644
--- a/net/atm/signaling.c
+++ b/net/atm/signaling.c
@@ -109,7 +109,7 @@ static int sigd_send(struct atm_vcc *vcc, struct sk_buff *skb)
dev_kfree_skb(skb);
goto as_indicate_complete;
}
- sk->sk_ack_backlog++;
+ sk_acceptq_added(sk);
skb_queue_tail(&sk->sk_receive_queue, skb);
pr_debug("waking sk_sleep(sk) 0x%p\n", sk_sleep(sk));
sk->sk_state_change(sk);
diff --git a/net/atm/svc.c b/net/atm/svc.c
index 908cbb8654f5..ba144d035e3d 100644
--- a/net/atm/svc.c
+++ b/net/atm/svc.c
@@ -381,7 +381,7 @@ static int svc_accept(struct socket *sock, struct socket *newsock, int flags,
msg->pvc.sap_addr.vpi,
msg->pvc.sap_addr.vci);
dev_kfree_skb(skb);
- sk->sk_ack_backlog--;
+ sk_acceptq_removed(sk);
if (error) {
sigd_enq2(NULL, as_reject, old_vcc, NULL, NULL,
&old_vcc->qos, error);
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index ca5207767dc2..ff57ea89c27e 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -808,7 +808,7 @@ static int ax25_create(struct net *net, struct socket *sock, int protocol,
struct sock *sk;
ax25_cb *ax25;
- if (protocol < 0 || protocol > SK_PROTOCOL_MAX)
+ if (protocol < 0 || protocol > U8_MAX)
return -EINVAL;
if (!net_eq(net, &init_net))
@@ -855,6 +855,8 @@ static int ax25_create(struct net *net, struct socket *sock, int protocol,
break;
case SOCK_RAW:
+ if (!capable(CAP_NET_RAW))
+ return -EPERM;
break;
default:
return -ESOCKTNOSUPPORT;
@@ -1382,7 +1384,7 @@ static int ax25_accept(struct socket *sock, struct socket *newsock, int flags,
/* Now attach up the new socket */
kfree_skb(skb);
- sk->sk_ack_backlog--;
+ sk_acceptq_removed(sk);
newsock->state = SS_CONNECTED;
out:
diff --git a/net/ax25/ax25_in.c b/net/ax25/ax25_in.c
index dcdbaeeb2358..cd6afe895db9 100644
--- a/net/ax25/ax25_in.c
+++ b/net/ax25/ax25_in.c
@@ -356,7 +356,7 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev,
make->sk_state = TCP_ESTABLISHED;
- sk->sk_ack_backlog++;
+ sk_acceptq_added(sk);
bh_unlock_sock(sk);
} else {
if (!mine)
diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig
index a3d188dfbe75..c762758a4649 100644
--- a/net/batman-adv/Kconfig
+++ b/net/batman-adv/Kconfig
@@ -1,5 +1,5 @@
# SPDX-License-Identifier: GPL-2.0
-# Copyright (C) 2007-2019 B.A.T.M.A.N. contributors:
+# Copyright (C) 2007-2020 B.A.T.M.A.N. contributors:
#
# Marek Lindner, Simon Wunderlich
@@ -12,11 +12,11 @@ config BATMAN_ADV
depends on NET
select LIBCRC32C
help
- B.A.T.M.A.N. (better approach to mobile ad-hoc networking) is
- a routing protocol for multi-hop ad-hoc mesh networks. The
- networks may be wired or wireless. See
- https://www.open-mesh.org/ for more information and user space
- tools.
+ B.A.T.M.A.N. (better approach to mobile ad-hoc networking) is
+ a routing protocol for multi-hop ad-hoc mesh networks. The
+ networks may be wired or wireless. See
+ https://www.open-mesh.org/ for more information and user space
+ tools.
config BATMAN_ADV_BATMAN_V
bool "B.A.T.M.A.N. V protocol"
@@ -100,7 +100,6 @@ config BATMAN_ADV_DEBUG
config BATMAN_ADV_SYSFS
bool "batman-adv sysfs entries"
depends on BATMAN_ADV
- default y
help
Say Y here if you want to enable batman-adv device configuration and
status interface through sysfs attributes. It is replaced by the
diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile
index fd63e116d9ff..daa49af7ff40 100644
--- a/net/batman-adv/Makefile
+++ b/net/batman-adv/Makefile
@@ -1,5 +1,5 @@
# SPDX-License-Identifier: GPL-2.0
-# Copyright (C) 2007-2019 B.A.T.M.A.N. contributors:
+# Copyright (C) 2007-2020 B.A.T.M.A.N. contributors:
#
# Marek Lindner, Simon Wunderlich
diff --git a/net/batman-adv/bat_algo.c b/net/batman-adv/bat_algo.c
index fa39eaaab9d7..382fbe51fd34 100644
--- a/net/batman-adv/bat_algo.c
+++ b/net/batman-adv/bat_algo.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*/
diff --git a/net/batman-adv/bat_algo.h b/net/batman-adv/bat_algo.h
index 37898da8ad48..686a60bc9492 100644
--- a/net/batman-adv/bat_algo.h
+++ b/net/batman-adv/bat_algo.h
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2011-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2011-2020 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Linus LĂĽssing
*/
diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index 240ed70912d6..f0209505e41a 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*/
@@ -22,6 +22,8 @@
#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/list.h>
+#include <linux/lockdep.h>
+#include <linux/mutex.h>
#include <linux/netdevice.h>
#include <linux/netlink.h>
#include <linux/pkt_sched.h>
@@ -193,14 +195,18 @@ static int batadv_iv_ogm_iface_enable(struct batadv_hard_iface *hard_iface)
unsigned char *ogm_buff;
u32 random_seqno;
+ mutex_lock(&hard_iface->bat_iv.ogm_buff_mutex);
+
/* randomize initial seqno to avoid collision */
get_random_bytes(&random_seqno, sizeof(random_seqno));
atomic_set(&hard_iface->bat_iv.ogm_seqno, random_seqno);
hard_iface->bat_iv.ogm_buff_len = BATADV_OGM_HLEN;
ogm_buff = kmalloc(hard_iface->bat_iv.ogm_buff_len, GFP_ATOMIC);
- if (!ogm_buff)
+ if (!ogm_buff) {
+ mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex);
return -ENOMEM;
+ }
hard_iface->bat_iv.ogm_buff = ogm_buff;
@@ -212,35 +218,59 @@ static int batadv_iv_ogm_iface_enable(struct batadv_hard_iface *hard_iface)
batadv_ogm_packet->reserved = 0;
batadv_ogm_packet->tq = BATADV_TQ_MAX_VALUE;
+ mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex);
+
return 0;
}
static void batadv_iv_ogm_iface_disable(struct batadv_hard_iface *hard_iface)
{
+ mutex_lock(&hard_iface->bat_iv.ogm_buff_mutex);
+
kfree(hard_iface->bat_iv.ogm_buff);
hard_iface->bat_iv.ogm_buff = NULL;
+
+ mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex);
}
static void batadv_iv_ogm_iface_update_mac(struct batadv_hard_iface *hard_iface)
{
struct batadv_ogm_packet *batadv_ogm_packet;
- unsigned char *ogm_buff = hard_iface->bat_iv.ogm_buff;
+ void *ogm_buff;
- batadv_ogm_packet = (struct batadv_ogm_packet *)ogm_buff;
+ mutex_lock(&hard_iface->bat_iv.ogm_buff_mutex);
+
+ ogm_buff = hard_iface->bat_iv.ogm_buff;
+ if (!ogm_buff)
+ goto unlock;
+
+ batadv_ogm_packet = ogm_buff;
ether_addr_copy(batadv_ogm_packet->orig,
hard_iface->net_dev->dev_addr);
ether_addr_copy(batadv_ogm_packet->prev_sender,
hard_iface->net_dev->dev_addr);
+
+unlock:
+ mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex);
}
static void
batadv_iv_ogm_primary_iface_set(struct batadv_hard_iface *hard_iface)
{
struct batadv_ogm_packet *batadv_ogm_packet;
- unsigned char *ogm_buff = hard_iface->bat_iv.ogm_buff;
+ void *ogm_buff;
- batadv_ogm_packet = (struct batadv_ogm_packet *)ogm_buff;
+ mutex_lock(&hard_iface->bat_iv.ogm_buff_mutex);
+
+ ogm_buff = hard_iface->bat_iv.ogm_buff;
+ if (!ogm_buff)
+ goto unlock;
+
+ batadv_ogm_packet = ogm_buff;
batadv_ogm_packet->ttl = BATADV_TTL;
+
+unlock:
+ mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex);
}
/* when do we schedule our own ogm to be sent */
@@ -277,17 +307,23 @@ static u8 batadv_hop_penalty(u8 tq, const struct batadv_priv *bat_priv)
* batadv_iv_ogm_aggr_packet() - checks if there is another OGM attached
* @buff_pos: current position in the skb
* @packet_len: total length of the skb
- * @tvlv_len: tvlv length of the previously considered OGM
+ * @ogm_packet: potential OGM in buffer
*
* Return: true if there is enough space for another OGM, false otherwise.
*/
-static bool batadv_iv_ogm_aggr_packet(int buff_pos, int packet_len,
- __be16 tvlv_len)
+static bool
+batadv_iv_ogm_aggr_packet(int buff_pos, int packet_len,
+ const struct batadv_ogm_packet *ogm_packet)
{
int next_buff_pos = 0;
- next_buff_pos += buff_pos + BATADV_OGM_HLEN;
- next_buff_pos += ntohs(tvlv_len);
+ /* check if there is enough space for the header */
+ next_buff_pos += buff_pos + sizeof(*ogm_packet);
+ if (next_buff_pos > packet_len)
+ return false;
+
+ /* check if there is enough space for the optional TVLV */
+ next_buff_pos += ntohs(ogm_packet->tvlv_len);
return (next_buff_pos <= packet_len) &&
(next_buff_pos <= BATADV_MAX_AGGREGATION_BYTES);
@@ -315,7 +351,7 @@ static void batadv_iv_ogm_send_to_if(struct batadv_forw_packet *forw_packet,
/* adjust all flags and log packets */
while (batadv_iv_ogm_aggr_packet(buff_pos, forw_packet->packet_len,
- batadv_ogm_packet->tvlv_len)) {
+ batadv_ogm_packet)) {
/* we might have aggregated direct link packets with an
* ordinary base packet
*/
@@ -736,7 +772,11 @@ batadv_iv_ogm_slide_own_bcast_window(struct batadv_hard_iface *hard_iface)
}
}
-static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface)
+/**
+ * batadv_iv_ogm_schedule_buff() - schedule submission of hardif ogm buffer
+ * @hard_iface: interface whose ogm buffer should be transmitted
+ */
+static void batadv_iv_ogm_schedule_buff(struct batadv_hard_iface *hard_iface)
{
struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
unsigned char **ogm_buff = &hard_iface->bat_iv.ogm_buff;
@@ -747,9 +787,7 @@ static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface)
u16 tvlv_len = 0;
unsigned long send_time;
- if (hard_iface->if_status == BATADV_IF_NOT_IN_USE ||
- hard_iface->if_status == BATADV_IF_TO_BE_REMOVED)
- return;
+ lockdep_assert_held(&hard_iface->bat_iv.ogm_buff_mutex);
/* the interface gets activated here to avoid race conditions between
* the moment of activating the interface in
@@ -817,6 +855,17 @@ out:
batadv_hardif_put(primary_if);
}
+static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface)
+{
+ if (hard_iface->if_status == BATADV_IF_NOT_IN_USE ||
+ hard_iface->if_status == BATADV_IF_TO_BE_REMOVED)
+ return;
+
+ mutex_lock(&hard_iface->bat_iv.ogm_buff_mutex);
+ batadv_iv_ogm_schedule_buff(hard_iface);
+ mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex);
+}
+
/**
* batadv_iv_orig_ifinfo_sum() - Get bcast_own sum for originator over iterface
* @orig_node: originator which reproadcasted the OGMs directly
@@ -1704,7 +1753,7 @@ static int batadv_iv_ogm_receive(struct sk_buff *skb,
/* unpack the aggregated packets and process them one by one */
while (batadv_iv_ogm_aggr_packet(ogm_offset, skb_headlen(skb),
- ogm_packet->tvlv_len)) {
+ ogm_packet)) {
batadv_iv_ogm_process(skb, ogm_offset, if_incoming);
ogm_offset += BATADV_OGM_HLEN;
diff --git a/net/batman-adv/bat_iv_ogm.h b/net/batman-adv/bat_iv_ogm.h
index c7a9ba305bfc..0c57c1000c64 100644
--- a/net/batman-adv/bat_iv_ogm.h
+++ b/net/batman-adv/bat_iv_ogm.h
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*/
diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c
index 22672cb3e25d..0ecaf1bb0068 100644
--- a/net/batman-adv/bat_v.c
+++ b/net/batman-adv/bat_v.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2013-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2013-2020 B.A.T.M.A.N. contributors:
*
* Linus LĂĽssing, Marek Lindner
*/
@@ -79,6 +79,7 @@ static int batadv_v_iface_enable(struct batadv_hard_iface *hard_iface)
static void batadv_v_iface_disable(struct batadv_hard_iface *hard_iface)
{
+ batadv_v_ogm_iface_disable(hard_iface);
batadv_v_elp_iface_disable(hard_iface);
}
@@ -1081,6 +1082,11 @@ void batadv_v_hardif_init(struct batadv_hard_iface *hard_iface)
*/
atomic_set(&hard_iface->bat_v.throughput_override, 0);
atomic_set(&hard_iface->bat_v.elp_interval, 500);
+
+ hard_iface->bat_v.aggr_len = 0;
+ skb_queue_head_init(&hard_iface->bat_v.aggr_list);
+ INIT_DELAYED_WORK(&hard_iface->bat_v.aggr_wq,
+ batadv_v_ogm_aggr_work);
}
/**
diff --git a/net/batman-adv/bat_v.h b/net/batman-adv/bat_v.h
index 37833db098e6..5e0be10bc84e 100644
--- a/net/batman-adv/bat_v.h
+++ b/net/batman-adv/bat_v.h
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2011-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2011-2020 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Linus LĂĽssing
*/
diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c
index 2614a9caee00..1e3172db7492 100644
--- a/net/batman-adv/bat_v_elp.c
+++ b/net/batman-adv/bat_v_elp.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2011-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2011-2020 B.A.T.M.A.N. contributors:
*
* Linus LĂĽssing, Marek Lindner
*/
@@ -107,10 +107,17 @@ static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh)
}
if (ret)
goto default_throughput;
- if (!(sinfo.filled & BIT(NL80211_STA_INFO_EXPECTED_THROUGHPUT)))
- goto default_throughput;
- return sinfo.expected_throughput / 100;
+ if (sinfo.filled & BIT(NL80211_STA_INFO_EXPECTED_THROUGHPUT))
+ return sinfo.expected_throughput / 100;
+
+ /* try to estimate the expected throughput based on reported tx
+ * rates
+ */
+ if (sinfo.filled & BIT(NL80211_STA_INFO_TX_BITRATE))
+ return cfg80211_calculate_bitrate(&sinfo.txrate) / 3;
+
+ goto default_throughput;
}
/* if not a wifi interface, check if this device provides data via
diff --git a/net/batman-adv/bat_v_elp.h b/net/batman-adv/bat_v_elp.h
index 1a29505f4f66..4358d436be2a 100644
--- a/net/batman-adv/bat_v_elp.h
+++ b/net/batman-adv/bat_v_elp.h
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2013-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2013-2020 B.A.T.M.A.N. contributors:
*
* Linus LĂĽssing, Marek Lindner
*/
diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c
index fad95ef64e01..969466218999 100644
--- a/net/batman-adv/bat_v_ogm.c
+++ b/net/batman-adv/bat_v_ogm.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2013-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2013-2020 B.A.T.M.A.N. contributors:
*
* Antonio Quartulli
*/
@@ -17,12 +17,15 @@
#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/list.h>
+#include <linux/lockdep.h>
+#include <linux/mutex.h>
#include <linux/netdevice.h>
#include <linux/random.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
+#include <linux/spinlock.h>
#include <linux/stddef.h>
#include <linux/string.h>
#include <linux/types.h>
@@ -77,6 +80,20 @@ struct batadv_orig_node *batadv_v_ogm_orig_get(struct batadv_priv *bat_priv,
}
/**
+ * batadv_v_ogm_start_queue_timer() - restart the OGM aggregation timer
+ * @hard_iface: the interface to use to send the OGM
+ */
+static void batadv_v_ogm_start_queue_timer(struct batadv_hard_iface *hard_iface)
+{
+ unsigned int msecs = BATADV_MAX_AGGREGATION_MS * 1000;
+
+ /* msecs * [0.9, 1.1] */
+ msecs += prandom_u32() % (msecs / 5) - (msecs / 10);
+ queue_delayed_work(batadv_event_workqueue, &hard_iface->bat_v.aggr_wq,
+ msecs_to_jiffies(msecs / 1000));
+}
+
+/**
* batadv_v_ogm_start_timer() - restart the OGM sending timer
* @bat_priv: the bat priv with all the soft interface information
*/
@@ -116,14 +133,132 @@ static void batadv_v_ogm_send_to_if(struct sk_buff *skb,
}
/**
- * batadv_v_ogm_send() - periodic worker broadcasting the own OGM
- * @work: work queue item
+ * batadv_v_ogm_len() - OGMv2 packet length
+ * @skb: the OGM to check
+ *
+ * Return: Length of the given OGMv2 packet, including tvlv length, excluding
+ * ethernet header length.
*/
-static void batadv_v_ogm_send(struct work_struct *work)
+static unsigned int batadv_v_ogm_len(struct sk_buff *skb)
+{
+ struct batadv_ogm2_packet *ogm_packet;
+
+ ogm_packet = (struct batadv_ogm2_packet *)skb->data;
+ return BATADV_OGM2_HLEN + ntohs(ogm_packet->tvlv_len);
+}
+
+/**
+ * batadv_v_ogm_queue_left() - check if given OGM still fits aggregation queue
+ * @skb: the OGM to check
+ * @hard_iface: the interface to use to send the OGM
+ *
+ * Caller needs to hold the hard_iface->bat_v.aggr_list.lock.
+ *
+ * Return: True, if the given OGMv2 packet still fits, false otherwise.
+ */
+static bool batadv_v_ogm_queue_left(struct sk_buff *skb,
+ struct batadv_hard_iface *hard_iface)
+{
+ unsigned int max = min_t(unsigned int, hard_iface->net_dev->mtu,
+ BATADV_MAX_AGGREGATION_BYTES);
+ unsigned int ogm_len = batadv_v_ogm_len(skb);
+
+ lockdep_assert_held(&hard_iface->bat_v.aggr_list.lock);
+
+ return hard_iface->bat_v.aggr_len + ogm_len <= max;
+}
+
+/**
+ * batadv_v_ogm_aggr_list_free - free all elements in an aggregation queue
+ * @hard_iface: the interface holding the aggregation queue
+ *
+ * Empties the OGMv2 aggregation queue and frees all the skbs it contained.
+ *
+ * Caller needs to hold the hard_iface->bat_v.aggr_list.lock.
+ */
+static void batadv_v_ogm_aggr_list_free(struct batadv_hard_iface *hard_iface)
+{
+ lockdep_assert_held(&hard_iface->bat_v.aggr_list.lock);
+
+ __skb_queue_purge(&hard_iface->bat_v.aggr_list);
+ hard_iface->bat_v.aggr_len = 0;
+}
+
+/**
+ * batadv_v_ogm_aggr_send() - flush & send aggregation queue
+ * @hard_iface: the interface with the aggregation queue to flush
+ *
+ * Aggregates all OGMv2 packets currently in the aggregation queue into a
+ * single OGMv2 packet and transmits this aggregate.
+ *
+ * The aggregation queue is empty after this call.
+ *
+ * Caller needs to hold the hard_iface->bat_v.aggr_list.lock.
+ */
+static void batadv_v_ogm_aggr_send(struct batadv_hard_iface *hard_iface)
+{
+ unsigned int aggr_len = hard_iface->bat_v.aggr_len;
+ struct sk_buff *skb_aggr;
+ unsigned int ogm_len;
+ struct sk_buff *skb;
+
+ lockdep_assert_held(&hard_iface->bat_v.aggr_list.lock);
+
+ if (!aggr_len)
+ return;
+
+ skb_aggr = dev_alloc_skb(aggr_len + ETH_HLEN + NET_IP_ALIGN);
+ if (!skb_aggr) {
+ batadv_v_ogm_aggr_list_free(hard_iface);
+ return;
+ }
+
+ skb_reserve(skb_aggr, ETH_HLEN + NET_IP_ALIGN);
+ skb_reset_network_header(skb_aggr);
+
+ while ((skb = __skb_dequeue(&hard_iface->bat_v.aggr_list))) {
+ hard_iface->bat_v.aggr_len -= batadv_v_ogm_len(skb);
+
+ ogm_len = batadv_v_ogm_len(skb);
+ skb_put_data(skb_aggr, skb->data, ogm_len);
+
+ consume_skb(skb);
+ }
+
+ batadv_v_ogm_send_to_if(skb_aggr, hard_iface);
+}
+
+/**
+ * batadv_v_ogm_queue_on_if() - queue a batman ogm on a given interface
+ * @skb: the OGM to queue
+ * @hard_iface: the interface to queue the OGM on
+ */
+static void batadv_v_ogm_queue_on_if(struct sk_buff *skb,
+ struct batadv_hard_iface *hard_iface)
+{
+ struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
+
+ if (!atomic_read(&bat_priv->aggregated_ogms)) {
+ batadv_v_ogm_send_to_if(skb, hard_iface);
+ return;
+ }
+
+ spin_lock_bh(&hard_iface->bat_v.aggr_list.lock);
+ if (!batadv_v_ogm_queue_left(skb, hard_iface))
+ batadv_v_ogm_aggr_send(hard_iface);
+
+ hard_iface->bat_v.aggr_len += batadv_v_ogm_len(skb);
+ __skb_queue_tail(&hard_iface->bat_v.aggr_list, skb);
+ spin_unlock_bh(&hard_iface->bat_v.aggr_list.lock);
+}
+
+/**
+ * batadv_v_ogm_send_softif() - periodic worker broadcasting the own OGM
+ * @bat_priv: the bat priv with all the soft interface information
+ */
+static void batadv_v_ogm_send_softif(struct batadv_priv *bat_priv)
{
struct batadv_hard_iface *hard_iface;
- struct batadv_priv_bat_v *bat_v;
- struct batadv_priv *bat_priv;
struct batadv_ogm2_packet *ogm_packet;
struct sk_buff *skb, *skb_tmp;
unsigned char *ogm_buff;
@@ -131,8 +266,7 @@ static void batadv_v_ogm_send(struct work_struct *work)
u16 tvlv_len = 0;
int ret;
- bat_v = container_of(work, struct batadv_priv_bat_v, ogm_wq.work);
- bat_priv = container_of(bat_v, struct batadv_priv, bat_v);
+ lockdep_assert_held(&bat_priv->bat_v.ogm_buff_mutex);
if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING)
goto out;
@@ -210,7 +344,7 @@ static void batadv_v_ogm_send(struct work_struct *work)
break;
}
- batadv_v_ogm_send_to_if(skb_tmp, hard_iface);
+ batadv_v_ogm_queue_on_if(skb_tmp, hard_iface);
batadv_hardif_put(hard_iface);
}
rcu_read_unlock();
@@ -224,6 +358,44 @@ out:
}
/**
+ * batadv_v_ogm_send() - periodic worker broadcasting the own OGM
+ * @work: work queue item
+ */
+static void batadv_v_ogm_send(struct work_struct *work)
+{
+ struct batadv_priv_bat_v *bat_v;
+ struct batadv_priv *bat_priv;
+
+ bat_v = container_of(work, struct batadv_priv_bat_v, ogm_wq.work);
+ bat_priv = container_of(bat_v, struct batadv_priv, bat_v);
+
+ mutex_lock(&bat_priv->bat_v.ogm_buff_mutex);
+ batadv_v_ogm_send_softif(bat_priv);
+ mutex_unlock(&bat_priv->bat_v.ogm_buff_mutex);
+}
+
+/**
+ * batadv_v_ogm_aggr_work() - OGM queue periodic task per interface
+ * @work: work queue item
+ *
+ * Emits aggregated OGM message in regular intervals.
+ */
+void batadv_v_ogm_aggr_work(struct work_struct *work)
+{
+ struct batadv_hard_iface_bat_v *batv;
+ struct batadv_hard_iface *hard_iface;
+
+ batv = container_of(work, struct batadv_hard_iface_bat_v, aggr_wq.work);
+ hard_iface = container_of(batv, struct batadv_hard_iface, bat_v);
+
+ spin_lock_bh(&hard_iface->bat_v.aggr_list.lock);
+ batadv_v_ogm_aggr_send(hard_iface);
+ spin_unlock_bh(&hard_iface->bat_v.aggr_list.lock);
+
+ batadv_v_ogm_start_queue_timer(hard_iface);
+}
+
+/**
* batadv_v_ogm_iface_enable() - prepare an interface for B.A.T.M.A.N. V
* @hard_iface: the interface to prepare
*
@@ -235,12 +407,26 @@ int batadv_v_ogm_iface_enable(struct batadv_hard_iface *hard_iface)
{
struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
+ batadv_v_ogm_start_queue_timer(hard_iface);
batadv_v_ogm_start_timer(bat_priv);
return 0;
}
/**
+ * batadv_v_ogm_iface_disable() - release OGM interface private resources
+ * @hard_iface: interface for which the resources have to be released
+ */
+void batadv_v_ogm_iface_disable(struct batadv_hard_iface *hard_iface)
+{
+ cancel_delayed_work_sync(&hard_iface->bat_v.aggr_wq);
+
+ spin_lock_bh(&hard_iface->bat_v.aggr_list.lock);
+ batadv_v_ogm_aggr_list_free(hard_iface);
+ spin_unlock_bh(&hard_iface->bat_v.aggr_list.lock);
+}
+
+/**
* batadv_v_ogm_primary_iface_set() - set a new primary interface
* @primary_iface: the new primary interface
*/
@@ -249,11 +435,15 @@ void batadv_v_ogm_primary_iface_set(struct batadv_hard_iface *primary_iface)
struct batadv_priv *bat_priv = netdev_priv(primary_iface->soft_iface);
struct batadv_ogm2_packet *ogm_packet;
+ mutex_lock(&bat_priv->bat_v.ogm_buff_mutex);
if (!bat_priv->bat_v.ogm_buff)
- return;
+ goto unlock;
ogm_packet = (struct batadv_ogm2_packet *)bat_priv->bat_v.ogm_buff;
ether_addr_copy(ogm_packet->orig, primary_iface->net_dev->dev_addr);
+
+unlock:
+ mutex_unlock(&bat_priv->bat_v.ogm_buff_mutex);
}
/**
@@ -382,7 +572,7 @@ static void batadv_v_ogm_forward(struct batadv_priv *bat_priv,
if_outgoing->net_dev->name, ntohl(ogm_forward->throughput),
ogm_forward->ttl, if_incoming->net_dev->name);
- batadv_v_ogm_send_to_if(skb, if_outgoing);
+ batadv_v_ogm_queue_on_if(skb, if_outgoing);
out:
if (orig_ifinfo)
@@ -631,17 +821,23 @@ batadv_v_ogm_process_per_outif(struct batadv_priv *bat_priv,
* batadv_v_ogm_aggr_packet() - checks if there is another OGM aggregated
* @buff_pos: current position in the skb
* @packet_len: total length of the skb
- * @tvlv_len: tvlv length of the previously considered OGM
+ * @ogm2_packet: potential OGM2 in buffer
*
* Return: true if there is enough space for another OGM, false otherwise.
*/
-static bool batadv_v_ogm_aggr_packet(int buff_pos, int packet_len,
- __be16 tvlv_len)
+static bool
+batadv_v_ogm_aggr_packet(int buff_pos, int packet_len,
+ const struct batadv_ogm2_packet *ogm2_packet)
{
int next_buff_pos = 0;
- next_buff_pos += buff_pos + BATADV_OGM2_HLEN;
- next_buff_pos += ntohs(tvlv_len);
+ /* check if there is enough space for the header */
+ next_buff_pos += buff_pos + sizeof(*ogm2_packet);
+ if (next_buff_pos > packet_len)
+ return false;
+
+ /* check if there is enough space for the optional TVLV */
+ next_buff_pos += ntohs(ogm2_packet->tvlv_len);
return (next_buff_pos <= packet_len) &&
(next_buff_pos <= BATADV_MAX_AGGREGATION_BYTES);
@@ -818,7 +1014,7 @@ int batadv_v_ogm_packet_recv(struct sk_buff *skb,
ogm_packet = (struct batadv_ogm2_packet *)skb->data;
while (batadv_v_ogm_aggr_packet(ogm_offset, skb_headlen(skb),
- ogm_packet->tvlv_len)) {
+ ogm_packet)) {
batadv_v_ogm_process(skb, ogm_offset, if_incoming);
ogm_offset += BATADV_OGM2_HLEN;
@@ -869,6 +1065,8 @@ int batadv_v_ogm_init(struct batadv_priv *bat_priv)
atomic_set(&bat_priv->bat_v.ogm_seqno, random_seqno);
INIT_DELAYED_WORK(&bat_priv->bat_v.ogm_wq, batadv_v_ogm_send);
+ mutex_init(&bat_priv->bat_v.ogm_buff_mutex);
+
return 0;
}
@@ -880,7 +1078,11 @@ void batadv_v_ogm_free(struct batadv_priv *bat_priv)
{
cancel_delayed_work_sync(&bat_priv->bat_v.ogm_wq);
+ mutex_lock(&bat_priv->bat_v.ogm_buff_mutex);
+
kfree(bat_priv->bat_v.ogm_buff);
bat_priv->bat_v.ogm_buff = NULL;
bat_priv->bat_v.ogm_buff_len = 0;
+
+ mutex_unlock(&bat_priv->bat_v.ogm_buff_mutex);
}
diff --git a/net/batman-adv/bat_v_ogm.h b/net/batman-adv/bat_v_ogm.h
index 2a50df7fc2bf..0ae2575f70bb 100644
--- a/net/batman-adv/bat_v_ogm.h
+++ b/net/batman-adv/bat_v_ogm.h
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2013-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2013-2020 B.A.T.M.A.N. contributors:
*
* Antonio Quartulli
*/
@@ -11,10 +11,13 @@
#include <linux/skbuff.h>
#include <linux/types.h>
+#include <linux/workqueue.h>
int batadv_v_ogm_init(struct batadv_priv *bat_priv);
void batadv_v_ogm_free(struct batadv_priv *bat_priv);
+void batadv_v_ogm_aggr_work(struct work_struct *work);
int batadv_v_ogm_iface_enable(struct batadv_hard_iface *hard_iface);
+void batadv_v_ogm_iface_disable(struct batadv_hard_iface *hard_iface);
struct batadv_orig_node *batadv_v_ogm_orig_get(struct batadv_priv *bat_priv,
const u8 *addr);
void batadv_v_ogm_primary_iface_set(struct batadv_hard_iface *primary_iface);
diff --git a/net/batman-adv/bitarray.c b/net/batman-adv/bitarray.c
index 7f04a6acf14e..4bc695cda397 100644
--- a/net/batman-adv/bitarray.c
+++ b/net/batman-adv/bitarray.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2006-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2006-2020 B.A.T.M.A.N. contributors:
*
* Simon Wunderlich, Marek Lindner
*/
diff --git a/net/batman-adv/bitarray.h b/net/batman-adv/bitarray.h
index 84ad2d2b6ac9..533c6d44cb58 100644
--- a/net/batman-adv/bitarray.h
+++ b/net/batman-adv/bitarray.h
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2006-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2006-2020 B.A.T.M.A.N. contributors:
*
* Simon Wunderlich, Marek Lindner
*/
diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c
index 663a53b6d36e..41cc87f06b14 100644
--- a/net/batman-adv/bridge_loop_avoidance.c
+++ b/net/batman-adv/bridge_loop_avoidance.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2011-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2011-2020 B.A.T.M.A.N. contributors:
*
* Simon Wunderlich
*/
@@ -844,7 +844,7 @@ static bool batadv_handle_announce(struct batadv_priv *bat_priv, u8 *an_addr,
/* handle as ANNOUNCE frame */
backbone_gw->lasttime = jiffies;
- crc = ntohs(*((__be16 *)(&an_addr[4])));
+ crc = ntohs(*((__force __be16 *)(&an_addr[4])));
batadv_dbg(BATADV_DBG_BLA, bat_priv,
"%s(): ANNOUNCE vid %d (sent by %pM)... CRC = %#.4x\n",
diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h
index 02b24a861a85..41edb2c4a327 100644
--- a/net/batman-adv/bridge_loop_avoidance.h
+++ b/net/batman-adv/bridge_loop_avoidance.h
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2011-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2011-2020 B.A.T.M.A.N. contributors:
*
* Simon Wunderlich
*/
diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c
index 38c4d8e51155..452856c27d20 100644
--- a/net/batman-adv/debugfs.c
+++ b/net/batman-adv/debugfs.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2010-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2010-2020 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*/
diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h
index 1c5afd301ce9..7e2e8f586f42 100644
--- a/net/batman-adv/debugfs.h
+++ b/net/batman-adv/debugfs.h
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2010-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2010-2020 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*/
diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c
index b0af3a11d406..3d21dd83f8cc 100644
--- a/net/batman-adv/distributed-arp-table.c
+++ b/net/batman-adv/distributed-arp-table.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2011-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2011-2020 B.A.T.M.A.N. contributors:
*
* Antonio Quartulli
*/
@@ -246,7 +246,7 @@ static u8 *batadv_arp_hw_src(struct sk_buff *skb, int hdr_size)
*/
static __be32 batadv_arp_ip_src(struct sk_buff *skb, int hdr_size)
{
- return *(__be32 *)(batadv_arp_hw_src(skb, hdr_size) + ETH_ALEN);
+ return *(__force __be32 *)(batadv_arp_hw_src(skb, hdr_size) + ETH_ALEN);
}
/**
@@ -270,7 +270,9 @@ static u8 *batadv_arp_hw_dst(struct sk_buff *skb, int hdr_size)
*/
static __be32 batadv_arp_ip_dst(struct sk_buff *skb, int hdr_size)
{
- return *(__be32 *)(batadv_arp_hw_src(skb, hdr_size) + ETH_ALEN * 2 + 4);
+ u8 *dst = batadv_arp_hw_src(skb, hdr_size) + ETH_ALEN * 2 + 4;
+
+ return *(__force __be32 *)dst;
}
/**
@@ -285,16 +287,18 @@ static u32 batadv_hash_dat(const void *data, u32 size)
u32 hash = 0;
const struct batadv_dat_entry *dat = data;
const unsigned char *key;
+ __be16 vid;
u32 i;
- key = (const unsigned char *)&dat->ip;
+ key = (__force const unsigned char *)&dat->ip;
for (i = 0; i < sizeof(dat->ip); i++) {
hash += key[i];
hash += (hash << 10);
hash ^= (hash >> 6);
}
- key = (const unsigned char *)&dat->vid;
+ vid = htons(dat->vid);
+ key = (__force const unsigned char *)&vid;
for (i = 0; i < sizeof(dat->vid); i++) {
hash += key[i];
hash += (hash << 10);
diff --git a/net/batman-adv/distributed-arp-table.h b/net/batman-adv/distributed-arp-table.h
index 67c7729add55..2bff2f4a325c 100644
--- a/net/batman-adv/distributed-arp-table.h
+++ b/net/batman-adv/distributed-arp-table.h
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2011-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2011-2020 B.A.T.M.A.N. contributors:
*
* Antonio Quartulli
*/
diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c
index 385fccdcf69d..7cad97644d05 100644
--- a/net/batman-adv/fragmentation.c
+++ b/net/batman-adv/fragmentation.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2013-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2013-2020 B.A.T.M.A.N. contributors:
*
* Martin Hundebøll <martin@hundeboll.net>
*/
diff --git a/net/batman-adv/fragmentation.h b/net/batman-adv/fragmentation.h
index abfe8c6556de..881ef328b6cd 100644
--- a/net/batman-adv/fragmentation.h
+++ b/net/batman-adv/fragmentation.h
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2013-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2013-2020 B.A.T.M.A.N. contributors:
*
* Martin Hundebøll <martin@hundeboll.net>
*/
diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c
index 47df4c678988..e22e49289677 100644
--- a/net/batman-adv/gateway_client.c
+++ b/net/batman-adv/gateway_client.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2009-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2009-2020 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*/
diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h
index 0be8e7178ec7..88b5dba84354 100644
--- a/net/batman-adv/gateway_client.h
+++ b/net/batman-adv/gateway_client.h
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2009-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2009-2020 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*/
diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c
index fc55750542e4..16cd9450ceb1 100644
--- a/net/batman-adv/gateway_common.c
+++ b/net/batman-adv/gateway_common.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2009-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2009-2020 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*/
diff --git a/net/batman-adv/gateway_common.h b/net/batman-adv/gateway_common.h
index 211b14b37db8..c3a0c5a7f7e9 100644
--- a/net/batman-adv/gateway_common.h
+++ b/net/batman-adv/gateway_common.h
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2009-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2009-2020 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*/
diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c
index c90e47342bb0..c7e98a40dd33 100644
--- a/net/batman-adv/hard-interface.c
+++ b/net/batman-adv/hard-interface.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*/
@@ -18,6 +18,7 @@
#include <linux/kref.h>
#include <linux/limits.h>
#include <linux/list.h>
+#include <linux/mutex.h>
#include <linux/netdevice.h>
#include <linux/printk.h>
#include <linux/rculist.h>
@@ -929,6 +930,7 @@ batadv_hardif_add_interface(struct net_device *net_dev)
INIT_LIST_HEAD(&hard_iface->list);
INIT_HLIST_HEAD(&hard_iface->neigh_list);
+ mutex_init(&hard_iface->bat_iv.ogm_buff_mutex);
spin_lock_init(&hard_iface->neigh_list_lock);
kref_init(&hard_iface->refcount);
diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h
index bbb8a6f18d6b..bad2e50135e8 100644
--- a/net/batman-adv/hard-interface.h
+++ b/net/batman-adv/hard-interface.h
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*/
diff --git a/net/batman-adv/hash.c b/net/batman-adv/hash.c
index a9d4e176f4de..68638e0450a6 100644
--- a/net/batman-adv/hash.c
+++ b/net/batman-adv/hash.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2006-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2006-2020 B.A.T.M.A.N. contributors:
*
* Simon Wunderlich, Marek Lindner
*/
diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h
index 57877f0b78e0..91ae9f32b580 100644
--- a/net/batman-adv/hash.h
+++ b/net/batman-adv/hash.h
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2006-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2006-2020 B.A.T.M.A.N. contributors:
*
* Simon Wunderlich, Marek Lindner
*/
diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c
index 0a70b66e8770..ccb535c77e5d 100644
--- a/net/batman-adv/icmp_socket.c
+++ b/net/batman-adv/icmp_socket.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*/
diff --git a/net/batman-adv/icmp_socket.h b/net/batman-adv/icmp_socket.h
index 27fafff586df..6abd0f4742ef 100644
--- a/net/batman-adv/icmp_socket.h
+++ b/net/batman-adv/icmp_socket.h
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*/
diff --git a/net/batman-adv/log.c b/net/batman-adv/log.c
index 11941cf1adcc..a67b2b091447 100644
--- a/net/batman-adv/log.c
+++ b/net/batman-adv/log.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2010-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2010-2020 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*/
diff --git a/net/batman-adv/log.h b/net/batman-adv/log.h
index 741cfa3719ff..f9884dc56cf3 100644
--- a/net/batman-adv/log.h
+++ b/net/batman-adv/log.h
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*/
@@ -74,7 +74,7 @@ __printf(2, 3);
* @bat_priv: the bat priv with all the soft interface information
* @ratelimited: whether output should be rate limited
* @fmt: format string
- * @arg...: variable arguments
+ * @arg: variable arguments
*/
#define _batadv_dbg(type, bat_priv, ratelimited, fmt, arg...) \
do { \
@@ -98,7 +98,7 @@ static inline void _batadv_dbg(int type __always_unused,
* batadv_dbg() - Store debug output without ratelimiting
* @type: type of debug message
* @bat_priv: the bat priv with all the soft interface information
- * @arg...: format string and variable arguments
+ * @arg: format string and variable arguments
*/
#define batadv_dbg(type, bat_priv, arg...) \
_batadv_dbg(type, bat_priv, 0, ## arg)
@@ -107,7 +107,7 @@ static inline void _batadv_dbg(int type __always_unused,
* batadv_dbg_ratelimited() - Store debug output with ratelimiting
* @type: type of debug message
* @bat_priv: the bat priv with all the soft interface information
- * @arg...: format string and variable arguments
+ * @arg: format string and variable arguments
*/
#define batadv_dbg_ratelimited(type, bat_priv, arg...) \
_batadv_dbg(type, bat_priv, 1, ## arg)
@@ -116,7 +116,7 @@ static inline void _batadv_dbg(int type __always_unused,
* batadv_info() - Store message in debug buffer and print it to kmsg buffer
* @net_dev: the soft interface net device
* @fmt: format string
- * @arg...: variable arguments
+ * @arg: variable arguments
*/
#define batadv_info(net_dev, fmt, arg...) \
do { \
@@ -130,7 +130,7 @@ static inline void _batadv_dbg(int type __always_unused,
* batadv_err() - Store error in debug buffer and print it to kmsg buffer
* @net_dev: the soft interface net device
* @fmt: format string
- * @arg...: variable arguments
+ * @arg: variable arguments
*/
#define batadv_err(net_dev, fmt, arg...) \
do { \
diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
index 4a89177def64..d8a255c85e77 100644
--- a/net/batman-adv/main.c
+++ b/net/batman-adv/main.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*/
@@ -548,7 +548,7 @@ static void batadv_recv_handler_init(void)
BUILD_BUG_ON(sizeof(struct batadv_tvlv_tt_change) != 12);
BUILD_BUG_ON(sizeof(struct batadv_tvlv_roam_adv) != 8);
- i = FIELD_SIZEOF(struct sk_buff, cb);
+ i = sizeof_field(struct sk_buff, cb);
BUILD_BUG_ON(sizeof(struct batadv_skb_cb) > i);
/* broadcast packet */
diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index 3d4c04d87ff3..692306df7b6f 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*/
@@ -13,7 +13,7 @@
#define BATADV_DRIVER_DEVICE "batman-adv"
#ifndef BATADV_SOURCE_VERSION
-#define BATADV_SOURCE_VERSION "2019.3"
+#define BATADV_SOURCE_VERSION "2020.0"
#endif
/* B.A.T.M.A.N. parameters */
diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c
index 1d5bdf3a4b65..9ebdc1e864b9 100644
--- a/net/batman-adv/multicast.c
+++ b/net/batman-adv/multicast.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2014-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2014-2020 B.A.T.M.A.N. contributors:
*
* Linus LĂĽssing
*/
@@ -1421,7 +1421,7 @@ batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb,
if (*orig)
return BATADV_FORW_SINGLE;
- /* fall through */
+ fallthrough;
case 0:
return BATADV_FORW_NONE;
default:
diff --git a/net/batman-adv/multicast.h b/net/batman-adv/multicast.h
index 5d9e2bb29c97..ebf825991ecd 100644
--- a/net/batman-adv/multicast.h
+++ b/net/batman-adv/multicast.h
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2014-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2014-2020 B.A.T.M.A.N. contributors:
*
* Linus LĂĽssing
*/
diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
index 6f08fd122a8d..02ed073f95a9 100644
--- a/net/batman-adv/netlink.c
+++ b/net/batman-adv/netlink.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2016-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2016-2020 B.A.T.M.A.N. contributors:
*
* Matthias Schiffer
*/
@@ -164,7 +164,7 @@ batadv_netlink_get_ifindex(const struct nlmsghdr *nlh, int attrtype)
{
struct nlattr *attr = nlmsg_find_attr(nlh, GENL_HDRLEN, attrtype);
- return attr ? nla_get_u32(attr) : 0;
+ return (attr && nla_len(attr) == sizeof(u32)) ? nla_get_u32(attr) : 0;
}
/**
diff --git a/net/batman-adv/netlink.h b/net/batman-adv/netlink.h
index ddc674e47dbb..7ee48f916997 100644
--- a/net/batman-adv/netlink.h
+++ b/net/batman-adv/netlink.h
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2016-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2016-2020 B.A.T.M.A.N. contributors:
*
* Matthias Schiffer
*/
diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c
index 580609389f0f..8f0717c3f7b5 100644
--- a/net/batman-adv/network-coding.c
+++ b/net/batman-adv/network-coding.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2012-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2012-2020 B.A.T.M.A.N. contributors:
*
* Martin Hundebøll, Jeppe Ledet-Pedersen
*/
diff --git a/net/batman-adv/network-coding.h b/net/batman-adv/network-coding.h
index 753fa49723cf..334289084127 100644
--- a/net/batman-adv/network-coding.h
+++ b/net/batman-adv/network-coding.h
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2012-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2012-2020 B.A.T.M.A.N. contributors:
*
* Martin Hundebøll, Jeppe Ledet-Pedersen
*/
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index 38613487fb1b..5b0c2fffc214 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2009-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2009-2020 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*/
diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h
index 512a1f99dd75..7bc01c138b3a 100644
--- a/net/batman-adv/originator.h
+++ b/net/batman-adv/originator.h
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*/
diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c
index f0f864820dea..3632bd976c56 100644
--- a/net/batman-adv/routing.c
+++ b/net/batman-adv/routing.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*/
diff --git a/net/batman-adv/routing.h b/net/batman-adv/routing.h
index c20feac95107..2ed49db6eff5 100644
--- a/net/batman-adv/routing.h
+++ b/net/batman-adv/routing.h
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*/
diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c
index 3ce5f7bad369..7f8ade04e08e 100644
--- a/net/batman-adv/send.c
+++ b/net/batman-adv/send.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*/
diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h
index 5fc0fd1e5d08..0d36e15589f6 100644
--- a/net/batman-adv/send.h
+++ b/net/batman-adv/send.h
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*/
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index c7a2e77ca1da..5f05a728f347 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*/
@@ -22,7 +22,6 @@
#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/list.h>
-#include <linux/lockdep.h>
#include <linux/netdevice.h>
#include <linux/netlink.h>
#include <linux/percpu.h>
@@ -230,7 +229,7 @@ static netdev_tx_t batadv_interface_tx(struct sk_buff *skb,
break;
}
- /* fall through */
+ fallthrough;
case ETH_P_BATMAN:
goto dropped;
}
@@ -436,7 +435,7 @@ void batadv_interface_rx(struct net_device *soft_iface,
/* clean the netfilter state now that the batman-adv header has been
* removed
*/
- nf_reset(skb);
+ nf_reset_ct(skb);
if (unlikely(!pskb_may_pull(skb, ETH_HLEN)))
goto dropped;
@@ -455,7 +454,7 @@ void batadv_interface_rx(struct net_device *soft_iface,
if (vhdr->h_vlan_encapsulated_proto != htons(ETH_P_BATMAN))
break;
- /* fall through */
+ fallthrough;
case ETH_P_BATMAN:
goto dropped;
}
@@ -740,36 +739,6 @@ static int batadv_interface_kill_vid(struct net_device *dev, __be16 proto,
return 0;
}
-/* batman-adv network devices have devices nesting below it and are a special
- * "super class" of normal network devices; split their locks off into a
- * separate class since they always nest.
- */
-static struct lock_class_key batadv_netdev_xmit_lock_key;
-static struct lock_class_key batadv_netdev_addr_lock_key;
-
-/**
- * batadv_set_lockdep_class_one() - Set lockdep class for a single tx queue
- * @dev: device which owns the tx queue
- * @txq: tx queue to modify
- * @_unused: always NULL
- */
-static void batadv_set_lockdep_class_one(struct net_device *dev,
- struct netdev_queue *txq,
- void *_unused)
-{
- lockdep_set_class(&txq->_xmit_lock, &batadv_netdev_xmit_lock_key);
-}
-
-/**
- * batadv_set_lockdep_class() - Set txq and addr_list lockdep class
- * @dev: network device to modify
- */
-static void batadv_set_lockdep_class(struct net_device *dev)
-{
- lockdep_set_class(&dev->addr_list_lock, &batadv_netdev_addr_lock_key);
- netdev_for_each_tx_queue(dev, batadv_set_lockdep_class_one, NULL);
-}
-
/**
* batadv_softif_init_late() - late stage initialization of soft interface
* @dev: registered network device to modify
@@ -783,8 +752,6 @@ static int batadv_softif_init_late(struct net_device *dev)
int ret;
size_t cnt_len = sizeof(u64) * BATADV_CNT_NUM;
- batadv_set_lockdep_class(dev);
-
bat_priv = netdev_priv(dev);
bat_priv->soft_iface = dev;
@@ -943,10 +910,10 @@ static const struct net_device_ops batadv_netdev_ops = {
static void batadv_get_drvinfo(struct net_device *dev,
struct ethtool_drvinfo *info)
{
- strlcpy(info->driver, "B.A.T.M.A.N. advanced", sizeof(info->driver));
- strlcpy(info->version, BATADV_SOURCE_VERSION, sizeof(info->version));
- strlcpy(info->fw_version, "N/A", sizeof(info->fw_version));
- strlcpy(info->bus_info, "batman", sizeof(info->bus_info));
+ strscpy(info->driver, "B.A.T.M.A.N. advanced", sizeof(info->driver));
+ strscpy(info->version, BATADV_SOURCE_VERSION, sizeof(info->version));
+ strscpy(info->fw_version, "N/A", sizeof(info->fw_version));
+ strscpy(info->bus_info, "batman", sizeof(info->bus_info));
}
/* Inspired by drivers/net/ethernet/dlink/sundance.c:1702
diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/soft-interface.h
index 29139ad769fe..534e08d6ad91 100644
--- a/net/batman-adv/soft-interface.h
+++ b/net/batman-adv/soft-interface.h
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*/
diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c
index 1efcb97039cd..c45962d8527b 100644
--- a/net/batman-adv/sysfs.c
+++ b/net/batman-adv/sysfs.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2010-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2010-2020 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*/
@@ -1070,7 +1070,7 @@ static ssize_t batadv_store_mesh_iface(struct kobject *kobj,
dev_hold(net_dev);
INIT_WORK(&store_work->work, batadv_store_mesh_iface_work);
store_work->net_dev = net_dev;
- strlcpy(store_work->soft_iface_name, buff,
+ strscpy(store_work->soft_iface_name, buff,
sizeof(store_work->soft_iface_name));
queue_work(batadv_event_workqueue, &store_work->work);
diff --git a/net/batman-adv/sysfs.h b/net/batman-adv/sysfs.h
index 5e466093dfa5..d987f8b30a98 100644
--- a/net/batman-adv/sysfs.h
+++ b/net/batman-adv/sysfs.h
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2010-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2010-2020 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*/
diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c
index dd6a9a40dbb9..bd2ac570c42c 100644
--- a/net/batman-adv/tp_meter.c
+++ b/net/batman-adv/tp_meter.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2012-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2012-2020 B.A.T.M.A.N. contributors:
*
* Edo Monticelli, Antonio Quartulli
*/
diff --git a/net/batman-adv/tp_meter.h b/net/batman-adv/tp_meter.h
index 78d310da0ad3..140105215aa2 100644
--- a/net/batman-adv/tp_meter.h
+++ b/net/batman-adv/tp_meter.h
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2012-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2012-2020 B.A.T.M.A.N. contributors:
*
* Edo Monticelli, Antonio Quartulli
*/
diff --git a/net/batman-adv/trace.c b/net/batman-adv/trace.c
index 3cedd2c36528..3444d9e4e90d 100644
--- a/net/batman-adv/trace.c
+++ b/net/batman-adv/trace.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2010-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2010-2020 B.A.T.M.A.N. contributors:
*
* Sven Eckelmann
*/
diff --git a/net/batman-adv/trace.h b/net/batman-adv/trace.h
index d8f764521c0b..f631b1e01b89 100644
--- a/net/batman-adv/trace.h
+++ b/net/batman-adv/trace.h
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2010-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2010-2020 B.A.T.M.A.N. contributors:
*
* Sven Eckelmann
*/
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index 8a482c5ec67b..852932838ddc 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich, Antonio Quartulli
*/
diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h
index 4a98860d7f0e..b24d35b9226a 100644
--- a/net/batman-adv/translation-table.h
+++ b/net/batman-adv/translation-table.h
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich, Antonio Quartulli
*/
diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c
index aae63f0d21eb..0963a43ad996 100644
--- a/net/batman-adv/tvlv.c
+++ b/net/batman-adv/tvlv.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*/
diff --git a/net/batman-adv/tvlv.h b/net/batman-adv/tvlv.h
index 36985000a0a8..d509d00c7a23 100644
--- a/net/batman-adv/tvlv.h
+++ b/net/batman-adv/tvlv.h
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*/
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 6ae139d74e0f..4a17a66cc572 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*/
@@ -17,6 +17,7 @@
#include <linux/if.h>
#include <linux/if_ether.h>
#include <linux/kref.h>
+#include <linux/mutex.h>
#include <linux/netdevice.h>
#include <linux/netlink.h>
#include <linux/sched.h> /* for linux/wait.h */
@@ -81,6 +82,9 @@ struct batadv_hard_iface_bat_iv {
/** @ogm_seqno: OGM sequence number - used to identify each OGM */
atomic_t ogm_seqno;
+
+ /** @ogm_buff_mutex: lock protecting ogm_buff and ogm_buff_len */
+ struct mutex ogm_buff_mutex;
};
/**
@@ -117,6 +121,15 @@ struct batadv_hard_iface_bat_v {
/** @elp_wq: workqueue used to schedule ELP transmissions */
struct delayed_work elp_wq;
+ /** @aggr_wq: workqueue used to transmit queued OGM packets */
+ struct delayed_work aggr_wq;
+
+ /** @aggr_list: queue for to be aggregated OGM packets */
+ struct sk_buff_head aggr_list;
+
+ /** @aggr_len: size of the OGM aggregate (excluding ethernet header) */
+ unsigned int aggr_len;
+
/**
* @throughput_override: throughput override to disable link
* auto-detection
@@ -444,7 +457,7 @@ struct batadv_orig_node {
/**
* @tt_lock: prevents from updating the table while reading it. Table
* update is made up by two operations (data structure update and
- * metdata -CRC/TTVN-recalculation) and they have to be executed
+ * metadata -CRC/TTVN-recalculation) and they have to be executed
* atomically in order to avoid another thread to read the
* table/metadata between those.
*/
@@ -998,7 +1011,7 @@ struct batadv_priv_tt {
/**
* @commit_lock: prevents from executing a local TT commit while reading
* the local table. The local TT commit is made up by two operations
- * (data structure update and metdata -CRC/TTVN- recalculation) and
+ * (data structure update and metadata -CRC/TTVN- recalculation) and
* they have to be executed atomically in order to avoid another thread
* to read the table/metadata between those.
*/
@@ -1527,6 +1540,9 @@ struct batadv_priv_bat_v {
/** @ogm_seqno: OGM sequence number - used to identify each OGM */
atomic_t ogm_seqno;
+ /** @ogm_buff_mutex: lock protecting ogm_buff and ogm_buff_len */
+ struct mutex ogm_buff_mutex;
+
/** @ogm_wq: workqueue used to schedule OGM transmissions */
struct delayed_work ogm_wq;
};
diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c
index 9d41de1ec90f..4febc82a7c76 100644
--- a/net/bluetooth/6lowpan.c
+++ b/net/bluetooth/6lowpan.c
@@ -571,19 +571,11 @@ static netdev_tx_t bt_xmit(struct sk_buff *skb, struct net_device *netdev)
return err < 0 ? NET_XMIT_DROP : err;
}
-static int bt_dev_init(struct net_device *dev)
-{
- netdev_lockdep_set_classes(dev);
-
- return 0;
-}
-
static const struct net_device_ops netdev_ops = {
- .ndo_init = bt_dev_init,
.ndo_start_xmit = bt_xmit,
};
-static struct header_ops header_ops = {
+static const struct header_ops header_ops = {
.create = header_create,
};
diff --git a/net/bluetooth/Kconfig b/net/bluetooth/Kconfig
index 2efac049ad4c..165148c7c4ce 100644
--- a/net/bluetooth/Kconfig
+++ b/net/bluetooth/Kconfig
@@ -9,8 +9,9 @@ menuconfig BT
depends on RFKILL || !RFKILL
select CRC16
select CRYPTO
- select CRYPTO_BLKCIPHER
- select CRYPTO_AES
+ select CRYPTO_SKCIPHER
+ select CRYPTO_LIB_AES
+ imply CRYPTO_AES
select CRYPTO_CMAC
select CRYPTO_ECB
select CRYPTO_SHA256
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index 94ddf19998c7..3fd124927d4d 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -173,7 +173,7 @@ void bt_accept_enqueue(struct sock *parent, struct sock *sk, bool bh)
else
release_sock(sk);
- parent->sk_ack_backlog++;
+ sk_acceptq_added(parent);
}
EXPORT_SYMBOL(bt_accept_enqueue);
@@ -185,7 +185,7 @@ void bt_accept_unlink(struct sock *sk)
BT_DBG("sk %p state %d", sk, sk->sk_state);
list_del_init(&bt_sk(sk)->accept_q);
- bt_sk(sk)->parent->sk_ack_backlog--;
+ sk_acceptq_removed(bt_sk(sk)->parent);
bt_sk(sk)->parent = NULL;
sock_put(sk);
}
@@ -460,7 +460,7 @@ __poll_t bt_sock_poll(struct file *file, struct socket *sock,
if (sk->sk_state == BT_LISTEN)
return bt_accept_poll(sk);
- if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
+ if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
mask |= EPOLLERR |
(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
@@ -470,7 +470,7 @@ __poll_t bt_sock_poll(struct file *file, struct socket *sock,
if (sk->sk_shutdown == SHUTDOWN_MASK)
mask |= EPOLLHUP;
- if (!skb_queue_empty(&sk->sk_receive_queue))
+ if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
mask |= EPOLLIN | EPOLLRDNORM;
if (sk->sk_state == BT_CLOSED)
diff --git a/net/bluetooth/bnep/netdev.c b/net/bluetooth/bnep/netdev.c
index 1d4d7d415730..cc1cff63194f 100644
--- a/net/bluetooth/bnep/netdev.c
+++ b/net/bluetooth/bnep/netdev.c
@@ -112,7 +112,7 @@ static int bnep_net_set_mac_addr(struct net_device *dev, void *arg)
return 0;
}
-static void bnep_net_timeout(struct net_device *dev)
+static void bnep_net_timeout(struct net_device *dev, unsigned int txqueue)
{
BT_DBG("net_timeout");
netif_wake_queue(dev);
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index ad5b0ac1f9ce..87691404d0c6 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -934,6 +934,14 @@ static void hci_req_directed_advertising(struct hci_request *req,
return;
memset(&cp, 0, sizeof(cp));
+
+ /* Some controllers might reject command if intervals are not
+ * within range for undirected advertising.
+ * BCM20702A0 is known to be affected by this.
+ */
+ cp.min_interval = cpu_to_le16(0x0020);
+ cp.max_interval = cpu_to_le16(0x0020);
+
cp.type = LE_ADV_DIRECT_IND;
cp.own_address_type = own_addr_type;
cp.direct_addr_type = conn->dst_type;
@@ -1168,8 +1176,10 @@ struct hci_conn *hci_connect_le_scan(struct hci_dev *hdev, bdaddr_t *dst,
if (!conn)
return ERR_PTR(-ENOMEM);
- if (hci_explicit_conn_params_set(hdev, dst, dst_type) < 0)
+ if (hci_explicit_conn_params_set(hdev, dst, dst_type) < 0) {
+ hci_conn_del(conn);
return ERR_PTR(-EBUSY);
+ }
conn->state = BT_CONNECT;
set_bit(HCI_CONN_SCANNING, &conn->flags);
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 04bc79359a17..cbbc34a006d1 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -842,8 +842,8 @@ static int hci_init4_req(struct hci_request *req, unsigned long opt)
if (hdev->le_features[0] & HCI_LE_DATA_LEN_EXT) {
struct hci_cp_le_write_def_data_len cp;
- cp.tx_len = hdev->le_max_tx_len;
- cp.tx_time = hdev->le_max_tx_time;
+ cp.tx_len = cpu_to_le16(hdev->le_max_tx_len);
+ cp.tx_time = cpu_to_le16(hdev->le_max_tx_time);
hci_req_add(req, HCI_OP_LE_WRITE_DEF_DATA_LEN, sizeof(cp), &cp);
}
@@ -1444,11 +1444,20 @@ static int hci_dev_do_open(struct hci_dev *hdev)
if (hci_dev_test_flag(hdev, HCI_SETUP) ||
test_bit(HCI_QUIRK_NON_PERSISTENT_SETUP, &hdev->quirks)) {
+ bool invalid_bdaddr;
+
hci_sock_dev_event(hdev, HCI_DEV_SETUP);
if (hdev->setup)
ret = hdev->setup(hdev);
+ /* The transport driver can set the quirk to mark the
+ * BD_ADDR invalid before creating the HCI device or in
+ * its setup callback.
+ */
+ invalid_bdaddr = test_bit(HCI_QUIRK_INVALID_BDADDR,
+ &hdev->quirks);
+
if (ret)
goto setup_failed;
@@ -1457,20 +1466,33 @@ static int hci_dev_do_open(struct hci_dev *hdev)
hci_dev_get_bd_addr_from_property(hdev);
if (bacmp(&hdev->public_addr, BDADDR_ANY) &&
- hdev->set_bdaddr)
+ hdev->set_bdaddr) {
ret = hdev->set_bdaddr(hdev,
&hdev->public_addr);
+
+ /* If setting of the BD_ADDR from the device
+ * property succeeds, then treat the address
+ * as valid even if the invalid BD_ADDR
+ * quirk indicates otherwise.
+ */
+ if (!ret)
+ invalid_bdaddr = false;
+ }
}
setup_failed:
/* The transport driver can set these quirks before
* creating the HCI device or in its setup callback.
*
+ * For the invalid BD_ADDR quirk it is possible that
+ * it becomes a valid address if the bootloader does
+ * provide it (see above).
+ *
* In case any of them is set, the controller has to
* start up as unconfigured.
*/
if (test_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks) ||
- test_bit(HCI_QUIRK_INVALID_BDADDR, &hdev->quirks))
+ invalid_bdaddr)
hci_dev_set_flag(hdev, HCI_UNCONFIGURED);
/* For an unconfigured controller it is required to
@@ -2289,6 +2311,33 @@ void hci_smp_irks_clear(struct hci_dev *hdev)
}
}
+void hci_blocked_keys_clear(struct hci_dev *hdev)
+{
+ struct blocked_key *b;
+
+ list_for_each_entry_rcu(b, &hdev->blocked_keys, list) {
+ list_del_rcu(&b->list);
+ kfree_rcu(b, rcu);
+ }
+}
+
+bool hci_is_blocked_key(struct hci_dev *hdev, u8 type, u8 val[16])
+{
+ bool blocked = false;
+ struct blocked_key *b;
+
+ rcu_read_lock();
+ list_for_each_entry(b, &hdev->blocked_keys, list) {
+ if (b->type == type && !memcmp(b->val, val, sizeof(b->val))) {
+ blocked = true;
+ break;
+ }
+ }
+
+ rcu_read_unlock();
+ return blocked;
+}
+
struct link_key *hci_find_link_key(struct hci_dev *hdev, bdaddr_t *bdaddr)
{
struct link_key *k;
@@ -2297,6 +2346,16 @@ struct link_key *hci_find_link_key(struct hci_dev *hdev, bdaddr_t *bdaddr)
list_for_each_entry_rcu(k, &hdev->link_keys, list) {
if (bacmp(bdaddr, &k->bdaddr) == 0) {
rcu_read_unlock();
+
+ if (hci_is_blocked_key(hdev,
+ HCI_BLOCKED_KEY_TYPE_LINKKEY,
+ k->val)) {
+ bt_dev_warn_ratelimited(hdev,
+ "Link key blocked for %pMR",
+ &k->bdaddr);
+ return NULL;
+ }
+
return k;
}
}
@@ -2365,6 +2424,15 @@ struct smp_ltk *hci_find_ltk(struct hci_dev *hdev, bdaddr_t *bdaddr,
if (smp_ltk_is_sc(k) || ltk_role(k->type) == role) {
rcu_read_unlock();
+
+ if (hci_is_blocked_key(hdev, HCI_BLOCKED_KEY_TYPE_LTK,
+ k->val)) {
+ bt_dev_warn_ratelimited(hdev,
+ "LTK blocked for %pMR",
+ &k->bdaddr);
+ return NULL;
+ }
+
return k;
}
}
@@ -2375,31 +2443,42 @@ struct smp_ltk *hci_find_ltk(struct hci_dev *hdev, bdaddr_t *bdaddr,
struct smp_irk *hci_find_irk_by_rpa(struct hci_dev *hdev, bdaddr_t *rpa)
{
+ struct smp_irk *irk_to_return = NULL;
struct smp_irk *irk;
rcu_read_lock();
list_for_each_entry_rcu(irk, &hdev->identity_resolving_keys, list) {
if (!bacmp(&irk->rpa, rpa)) {
- rcu_read_unlock();
- return irk;
+ irk_to_return = irk;
+ goto done;
}
}
list_for_each_entry_rcu(irk, &hdev->identity_resolving_keys, list) {
if (smp_irk_matches(hdev, irk->val, rpa)) {
bacpy(&irk->rpa, rpa);
- rcu_read_unlock();
- return irk;
+ irk_to_return = irk;
+ goto done;
}
}
+
+done:
+ if (irk_to_return && hci_is_blocked_key(hdev, HCI_BLOCKED_KEY_TYPE_IRK,
+ irk_to_return->val)) {
+ bt_dev_warn_ratelimited(hdev, "Identity key blocked for %pMR",
+ &irk_to_return->bdaddr);
+ irk_to_return = NULL;
+ }
+
rcu_read_unlock();
- return NULL;
+ return irk_to_return;
}
struct smp_irk *hci_find_irk_by_addr(struct hci_dev *hdev, bdaddr_t *bdaddr,
u8 addr_type)
{
+ struct smp_irk *irk_to_return = NULL;
struct smp_irk *irk;
/* Identity Address must be public or static random */
@@ -2410,13 +2489,23 @@ struct smp_irk *hci_find_irk_by_addr(struct hci_dev *hdev, bdaddr_t *bdaddr,
list_for_each_entry_rcu(irk, &hdev->identity_resolving_keys, list) {
if (addr_type == irk->addr_type &&
bacmp(bdaddr, &irk->bdaddr) == 0) {
- rcu_read_unlock();
- return irk;
+ irk_to_return = irk;
+ goto done;
}
}
+
+done:
+
+ if (irk_to_return && hci_is_blocked_key(hdev, HCI_BLOCKED_KEY_TYPE_IRK,
+ irk_to_return->val)) {
+ bt_dev_warn_ratelimited(hdev, "Identity key blocked for %pMR",
+ &irk_to_return->bdaddr);
+ irk_to_return = NULL;
+ }
+
rcu_read_unlock();
- return NULL;
+ return irk_to_return;
}
struct link_key *hci_add_link_key(struct hci_dev *hdev, struct hci_conn *conn,
@@ -3222,6 +3311,7 @@ struct hci_dev *hci_alloc_dev(void)
INIT_LIST_HEAD(&hdev->pend_le_reports);
INIT_LIST_HEAD(&hdev->conn_hash.list);
INIT_LIST_HEAD(&hdev->adv_instances);
+ INIT_LIST_HEAD(&hdev->blocked_keys);
INIT_WORK(&hdev->rx_work, hci_rx_work);
INIT_WORK(&hdev->cmd_work, hci_cmd_work);
@@ -3421,6 +3511,7 @@ void hci_unregister_dev(struct hci_dev *hdev)
hci_bdaddr_list_clear(&hdev->le_resolv_list);
hci_conn_params_clear_all(hdev);
hci_discovery_filter_clear(hdev);
+ hci_blocked_keys_clear(hdev);
hci_dev_unlock(hdev);
hci_dev_put(hdev);
@@ -3474,7 +3565,8 @@ int hci_recv_frame(struct hci_dev *hdev, struct sk_buff *skb)
if (hci_skb_pkt_type(skb) != HCI_EVENT_PKT &&
hci_skb_pkt_type(skb) != HCI_ACLDATA_PKT &&
- hci_skb_pkt_type(skb) != HCI_SCODATA_PKT) {
+ hci_skb_pkt_type(skb) != HCI_SCODATA_PKT &&
+ hci_skb_pkt_type(skb) != HCI_ISODATA_PKT) {
kfree_skb(skb);
return -EINVAL;
}
@@ -4196,15 +4288,10 @@ static void hci_sched_le(struct hci_dev *hdev)
if (!hci_conn_num(hdev, LE_LINK))
return;
- if (!hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) {
- /* LE tx timeout must be longer than maximum
- * link supervision timeout (40.9 seconds) */
- if (!hdev->le_cnt && hdev->le_pkts &&
- time_after(jiffies, hdev->le_last_tx + HZ * 45))
- hci_link_tx_to(hdev, LE_LINK);
- }
-
cnt = hdev->le_pkts ? hdev->le_cnt : hdev->acl_cnt;
+
+ __check_timeout(hdev, cnt);
+
tmp = cnt;
while (cnt && (chan = hci_chan_sent(hdev, LE_LINK, &quote))) {
u32 priority = (skb_peek(&chan->data_q))->priority;
@@ -4440,7 +4527,14 @@ static void hci_rx_work(struct work_struct *work)
hci_send_to_sock(hdev, skb);
}
- if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) {
+ /* If the device has been opened in HCI_USER_CHANNEL,
+ * the userspace has exclusive access to device.
+ * When device is HCI_INIT, we still need to process
+ * the data packets to the driver in order
+ * to complete its setup().
+ */
+ if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL) &&
+ !test_bit(HCI_INIT, &hdev->flags)) {
kfree_skb(skb);
continue;
}
@@ -4450,6 +4544,7 @@ static void hci_rx_work(struct work_struct *work)
switch (hci_skb_pkt_type(skb)) {
case HCI_ACLDATA_PKT:
case HCI_SCODATA_PKT:
+ case HCI_ISODATA_PKT:
kfree_skb(skb);
continue;
}
diff --git a/net/bluetooth/hci_debugfs.c b/net/bluetooth/hci_debugfs.c
index 402e2cc54044..6b1314c738b8 100644
--- a/net/bluetooth/hci_debugfs.c
+++ b/net/bluetooth/hci_debugfs.c
@@ -26,6 +26,7 @@
#include <net/bluetooth/bluetooth.h>
#include <net/bluetooth/hci_core.h>
+#include "smp.h"
#include "hci_debugfs.h"
#define DEFINE_QUIRK_ATTRIBUTE(__name, __quirk) \
@@ -152,6 +153,21 @@ static int blacklist_show(struct seq_file *f, void *p)
DEFINE_SHOW_ATTRIBUTE(blacklist);
+static int blocked_keys_show(struct seq_file *f, void *p)
+{
+ struct hci_dev *hdev = f->private;
+ struct blocked_key *key;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(key, &hdev->blocked_keys, list)
+ seq_printf(f, "%u %*phN\n", key->type, 16, key->val);
+ rcu_read_unlock();
+
+ return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(blocked_keys);
+
static int uuids_show(struct seq_file *f, void *p)
{
struct hci_dev *hdev = f->private;
@@ -308,6 +324,8 @@ void hci_debugfs_create_common(struct hci_dev *hdev)
&device_list_fops);
debugfs_create_file("blacklist", 0444, hdev->debugfs, hdev,
&blacklist_fops);
+ debugfs_create_file("blocked_keys", 0444, hdev->debugfs, hdev,
+ &blocked_keys_fops);
debugfs_create_file("uuids", 0444, hdev->debugfs, hdev, &uuids_fops);
debugfs_create_file("remote_oob", 0400, hdev->debugfs, hdev,
&remote_oob_fops);
@@ -972,6 +990,62 @@ static int adv_max_interval_get(void *data, u64 *val)
DEFINE_SIMPLE_ATTRIBUTE(adv_max_interval_fops, adv_max_interval_get,
adv_max_interval_set, "%llu\n");
+static int min_key_size_set(void *data, u64 val)
+{
+ struct hci_dev *hdev = data;
+
+ if (val > hdev->le_max_key_size || val < SMP_MIN_ENC_KEY_SIZE)
+ return -EINVAL;
+
+ hci_dev_lock(hdev);
+ hdev->le_min_key_size = val;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+static int min_key_size_get(void *data, u64 *val)
+{
+ struct hci_dev *hdev = data;
+
+ hci_dev_lock(hdev);
+ *val = hdev->le_min_key_size;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(min_key_size_fops, min_key_size_get,
+ min_key_size_set, "%llu\n");
+
+static int max_key_size_set(void *data, u64 val)
+{
+ struct hci_dev *hdev = data;
+
+ if (val > SMP_MAX_ENC_KEY_SIZE || val < hdev->le_min_key_size)
+ return -EINVAL;
+
+ hci_dev_lock(hdev);
+ hdev->le_max_key_size = val;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+static int max_key_size_get(void *data, u64 *val)
+{
+ struct hci_dev *hdev = data;
+
+ hci_dev_lock(hdev);
+ *val = hdev->le_max_key_size;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(max_key_size_fops, max_key_size_get,
+ max_key_size_set, "%llu\n");
+
static int auth_payload_timeout_set(void *data, u64 val)
{
struct hci_dev *hdev = data;
@@ -1054,6 +1128,10 @@ void hci_debugfs_create_le(struct hci_dev *hdev)
&adv_max_interval_fops);
debugfs_create_u16("discov_interleaved_timeout", 0644, hdev->debugfs,
&hdev->discov_interleaved_timeout);
+ debugfs_create_file("min_key_size", 0644, hdev->debugfs, hdev,
+ &min_key_size_fops);
+ debugfs_create_file("max_key_size", 0644, hdev->debugfs, hdev,
+ &max_key_size_fops);
debugfs_create_file("auth_payload_timeout", 0644, hdev->debugfs, hdev,
&auth_payload_timeout_fops);
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index cdb00c2ef242..6ddc4a74a5e4 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -5451,7 +5451,7 @@ static void hci_le_adv_report_evt(struct hci_dev *hdev, struct sk_buff *skb)
hci_dev_unlock(hdev);
}
-static u8 ext_evt_type_to_legacy(u16 evt_type)
+static u8 ext_evt_type_to_legacy(struct hci_dev *hdev, u16 evt_type)
{
if (evt_type & LE_EXT_ADV_LEGACY_PDU) {
switch (evt_type) {
@@ -5468,10 +5468,7 @@ static u8 ext_evt_type_to_legacy(u16 evt_type)
return LE_ADV_SCAN_RSP;
}
- BT_ERR_RATELIMITED("Unknown advertising packet type: 0x%02x",
- evt_type);
-
- return LE_ADV_INVALID;
+ goto invalid;
}
if (evt_type & LE_EXT_ADV_CONN_IND) {
@@ -5491,8 +5488,9 @@ static u8 ext_evt_type_to_legacy(u16 evt_type)
evt_type & LE_EXT_ADV_DIRECT_IND)
return LE_ADV_NONCONN_IND;
- BT_ERR_RATELIMITED("Unknown advertising packet type: 0x%02x",
- evt_type);
+invalid:
+ bt_dev_err_ratelimited(hdev, "Unknown advertising packet type: 0x%02x",
+ evt_type);
return LE_ADV_INVALID;
}
@@ -5510,7 +5508,7 @@ static void hci_le_ext_adv_report_evt(struct hci_dev *hdev, struct sk_buff *skb)
u16 evt_type;
evt_type = __le16_to_cpu(ev->evt_type);
- legacy_evt_type = ext_evt_type_to_legacy(evt_type);
+ legacy_evt_type = ext_evt_type_to_legacy(hdev, evt_type);
if (legacy_evt_type != LE_ADV_INVALID) {
process_adv_report(hdev, legacy_evt_type, &ev->bdaddr,
ev->bdaddr_type, NULL, 0, ev->rssi,
@@ -5660,11 +5658,6 @@ static void hci_le_remote_conn_param_req_evt(struct hci_dev *hdev,
return send_conn_param_neg_reply(hdev, handle,
HCI_ERROR_UNKNOWN_CONN_ID);
- if (min < hcon->le_conn_min_interval ||
- max > hcon->le_conn_max_interval)
- return send_conn_param_neg_reply(hdev, handle,
- HCI_ERROR_INVALID_LL_PARAMS);
-
if (hci_check_conn_params(min, max, latency, timeout))
return send_conn_param_neg_reply(hdev, handle,
HCI_ERROR_INVALID_LL_PARAMS);
@@ -5725,6 +5718,29 @@ static void hci_le_direct_adv_report_evt(struct hci_dev *hdev,
hci_dev_unlock(hdev);
}
+static void hci_le_phy_update_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_ev_le_phy_update_complete *ev = (void *) skb->data;
+ struct hci_conn *conn;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, ev->status);
+
+ if (!ev->status)
+ return;
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle));
+ if (!conn)
+ goto unlock;
+
+ conn->le_tx_phy = ev->tx_phy;
+ conn->le_rx_phy = ev->rx_phy;
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
static void hci_le_meta_evt(struct hci_dev *hdev, struct sk_buff *skb)
{
struct hci_ev_le_meta *le_ev = (void *) skb->data;
@@ -5760,6 +5776,10 @@ static void hci_le_meta_evt(struct hci_dev *hdev, struct sk_buff *skb)
hci_le_direct_adv_report_evt(hdev, skb);
break;
+ case HCI_EV_LE_PHY_UPDATE_COMPLETE:
+ hci_le_phy_update_evt(hdev, skb);
+ break;
+
case HCI_EV_LE_EXT_ADV_REPORT:
hci_le_ext_adv_report_evt(hdev, skb);
break;
diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c
index 621f1a97d803..2a1b64dbf76e 100644
--- a/net/bluetooth/hci_request.c
+++ b/net/bluetooth/hci_request.c
@@ -904,9 +904,9 @@ static u8 get_adv_instance_scan_rsp_len(struct hci_dev *hdev, u8 instance)
{
struct adv_info *adv_instance;
- /* Ignore instance 0 */
+ /* Instance 0x00 always set local name */
if (instance == 0x00)
- return 0;
+ return 1;
adv_instance = hci_find_adv_instance(hdev, instance);
if (!adv_instance)
@@ -923,9 +923,9 @@ static u8 get_cur_adv_instance_scan_rsp_len(struct hci_dev *hdev)
u8 instance = hdev->cur_adv_instance;
struct adv_info *adv_instance;
- /* Ignore instance 0 */
+ /* Instance 0x00 always set local name */
if (instance == 0x00)
- return 0;
+ return 1;
adv_instance = hci_find_adv_instance(hdev, instance);
if (!adv_instance)
@@ -1054,6 +1054,7 @@ void __hci_req_enable_advertising(struct hci_request *req)
struct hci_cp_le_set_adv_param cp;
u8 own_addr_type, enable = 0x01;
bool connectable;
+ u16 adv_min_interval, adv_max_interval;
u32 flags;
flags = get_adv_instance_flags(hdev, hdev->cur_adv_instance);
@@ -1087,16 +1088,30 @@ void __hci_req_enable_advertising(struct hci_request *req)
return;
memset(&cp, 0, sizeof(cp));
- cp.min_interval = cpu_to_le16(hdev->le_adv_min_interval);
- cp.max_interval = cpu_to_le16(hdev->le_adv_max_interval);
- if (connectable)
+ if (connectable) {
cp.type = LE_ADV_IND;
- else if (get_cur_adv_instance_scan_rsp_len(hdev))
- cp.type = LE_ADV_SCAN_IND;
- else
- cp.type = LE_ADV_NONCONN_IND;
+ adv_min_interval = hdev->le_adv_min_interval;
+ adv_max_interval = hdev->le_adv_max_interval;
+ } else {
+ if (get_cur_adv_instance_scan_rsp_len(hdev))
+ cp.type = LE_ADV_SCAN_IND;
+ else
+ cp.type = LE_ADV_NONCONN_IND;
+
+ if (!hci_dev_test_flag(hdev, HCI_DISCOVERABLE) ||
+ hci_dev_test_flag(hdev, HCI_LIMITED_DISCOVERABLE)) {
+ adv_min_interval = DISCOV_LE_FAST_ADV_INT_MIN;
+ adv_max_interval = DISCOV_LE_FAST_ADV_INT_MAX;
+ } else {
+ adv_min_interval = hdev->le_adv_min_interval;
+ adv_max_interval = hdev->le_adv_max_interval;
+ }
+ }
+
+ cp.min_interval = cpu_to_le16(adv_min_interval);
+ cp.max_interval = cpu_to_le16(adv_max_interval);
cp.own_address_type = own_addr_type;
cp.channel_map = hdev->le_adv_channel_map;
@@ -1258,6 +1273,14 @@ static u8 create_instance_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr)
instance_flags = get_adv_instance_flags(hdev, instance);
+ /* If instance already has the flags set skip adding it once
+ * again.
+ */
+ if (adv_instance && eir_get_data(adv_instance->adv_data,
+ adv_instance->adv_data_len, EIR_FLAGS,
+ NULL))
+ goto skip_flags;
+
/* The Add Advertising command allows userspace to set both the general
* and limited discoverable flags.
*/
@@ -1290,6 +1313,7 @@ static u8 create_instance_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr)
}
}
+skip_flags:
if (adv_instance) {
memcpy(ptr, adv_instance->adv_data,
adv_instance->adv_data_len);
@@ -1675,7 +1699,7 @@ int __hci_req_enable_ext_advertising(struct hci_request *req, u8 instance)
* scheduling it.
*/
if (adv_instance && adv_instance->duration) {
- u16 duration = adv_instance->duration * MSEC_PER_SEC;
+ u16 duration = adv_instance->timeout * MSEC_PER_SEC;
/* Time = N * 10 ms */
adv_set->duration = cpu_to_le16(duration / 10);
diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
index d32077b28433..9c4a093f8960 100644
--- a/net/bluetooth/hci_sock.c
+++ b/net/bluetooth/hci_sock.c
@@ -23,7 +23,7 @@
*/
/* Bluetooth HCI sockets. */
-
+#include <linux/compat.h>
#include <linux/export.h>
#include <linux/utsname.h>
#include <linux/sched.h>
@@ -211,7 +211,8 @@ void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb)
if (hci_skb_pkt_type(skb) != HCI_COMMAND_PKT &&
hci_skb_pkt_type(skb) != HCI_EVENT_PKT &&
hci_skb_pkt_type(skb) != HCI_ACLDATA_PKT &&
- hci_skb_pkt_type(skb) != HCI_SCODATA_PKT)
+ hci_skb_pkt_type(skb) != HCI_SCODATA_PKT &&
+ hci_skb_pkt_type(skb) != HCI_ISODATA_PKT)
continue;
if (is_filtered_packet(sk, skb))
continue;
@@ -220,7 +221,8 @@ void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb)
continue;
if (hci_skb_pkt_type(skb) != HCI_EVENT_PKT &&
hci_skb_pkt_type(skb) != HCI_ACLDATA_PKT &&
- hci_skb_pkt_type(skb) != HCI_SCODATA_PKT)
+ hci_skb_pkt_type(skb) != HCI_SCODATA_PKT &&
+ hci_skb_pkt_type(skb) != HCI_ISODATA_PKT)
continue;
} else {
/* Don't send frame to other channel types */
@@ -324,6 +326,12 @@ void hci_send_to_monitor(struct hci_dev *hdev, struct sk_buff *skb)
else
opcode = cpu_to_le16(HCI_MON_SCO_TX_PKT);
break;
+ case HCI_ISODATA_PKT:
+ if (bt_cb(skb)->incoming)
+ opcode = cpu_to_le16(HCI_MON_ISO_RX_PKT);
+ else
+ opcode = cpu_to_le16(HCI_MON_ISO_TX_PKT);
+ break;
case HCI_DIAG_PKT:
opcode = cpu_to_le16(HCI_MON_VENDOR_DIAG);
break;
@@ -831,6 +839,8 @@ static int hci_sock_release(struct socket *sock)
if (!sk)
return 0;
+ lock_sock(sk);
+
switch (hci_pi(sk)->channel) {
case HCI_CHANNEL_MONITOR:
atomic_dec(&monitor_promisc);
@@ -878,6 +888,7 @@ static int hci_sock_release(struct socket *sock)
skb_queue_purge(&sk->sk_receive_queue);
skb_queue_purge(&sk->sk_write_queue);
+ release_sock(sk);
sock_put(sk);
return 0;
}
@@ -1054,6 +1065,22 @@ done:
return err;
}
+#ifdef CONFIG_COMPAT
+static int hci_sock_compat_ioctl(struct socket *sock, unsigned int cmd,
+ unsigned long arg)
+{
+ switch (cmd) {
+ case HCIDEVUP:
+ case HCIDEVDOWN:
+ case HCIDEVRESET:
+ case HCIDEVRESTAT:
+ return hci_sock_ioctl(sock, cmd, arg);
+ }
+
+ return hci_sock_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
+}
+#endif
+
static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
int addr_len)
{
@@ -1746,7 +1773,8 @@ static int hci_sock_sendmsg(struct socket *sock, struct msghdr *msg,
*/
if (hci_skb_pkt_type(skb) != HCI_COMMAND_PKT &&
hci_skb_pkt_type(skb) != HCI_ACLDATA_PKT &&
- hci_skb_pkt_type(skb) != HCI_SCODATA_PKT) {
+ hci_skb_pkt_type(skb) != HCI_SCODATA_PKT &&
+ hci_skb_pkt_type(skb) != HCI_ISODATA_PKT) {
err = -EINVAL;
goto drop;
}
@@ -1790,7 +1818,8 @@ static int hci_sock_sendmsg(struct socket *sock, struct msghdr *msg,
}
if (hci_skb_pkt_type(skb) != HCI_ACLDATA_PKT &&
- hci_skb_pkt_type(skb) != HCI_SCODATA_PKT) {
+ hci_skb_pkt_type(skb) != HCI_SCODATA_PKT &&
+ hci_skb_pkt_type(skb) != HCI_ISODATA_PKT) {
err = -EINVAL;
goto drop;
}
@@ -1974,6 +2003,9 @@ static const struct proto_ops hci_sock_ops = {
.sendmsg = hci_sock_sendmsg,
.recvmsg = hci_sock_recvmsg,
.ioctl = hci_sock_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = hci_sock_compat_ioctl,
+#endif
.poll = datagram_poll,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c
index 8d889969ae7e..bef84b95e2c4 100644
--- a/net/bluetooth/hidp/core.c
+++ b/net/bluetooth/hidp/core.c
@@ -267,7 +267,7 @@ static int hidp_get_raw_report(struct hid_device *hid,
set_bit(HIDP_WAITING_FOR_RETURN, &session->flags);
data[0] = report_number;
ret = hidp_send_ctrl_message(session, report_type, data, 1);
- if (ret)
+ if (ret < 0)
goto err;
/* Wait for the return of the report. The returned report
@@ -343,7 +343,7 @@ static int hidp_set_raw_report(struct hid_device *hid, unsigned char reportnum,
data[0] = reportnum;
set_bit(HIDP_WAITING_FOR_SEND_ACK, &session->flags);
ret = hidp_send_ctrl_message(session, report_type, data, count);
- if (ret)
+ if (ret < 0)
goto err;
/* Wait for the ACK from the device. */
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index dfc1edb168b7..195459a1e53e 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -1289,6 +1289,9 @@ static void l2cap_le_connect(struct l2cap_chan *chan)
if (test_and_set_bit(FLAG_LE_CONN_REQ_SENT, &chan->flags))
return;
+ if (!chan->imtu)
+ chan->imtu = chan->conn->mtu;
+
l2cap_le_flowctl_init(chan, 0);
req.psm = chan->psm;
@@ -3226,6 +3229,49 @@ static inline void l2cap_txwin_setup(struct l2cap_chan *chan)
chan->ack_win = chan->tx_win;
}
+static void l2cap_mtu_auto(struct l2cap_chan *chan)
+{
+ struct hci_conn *conn = chan->conn->hcon;
+
+ chan->imtu = L2CAP_DEFAULT_MIN_MTU;
+
+ /* The 2-DH1 packet has between 2 and 56 information bytes
+ * (including the 2-byte payload header)
+ */
+ if (!(conn->pkt_type & HCI_2DH1))
+ chan->imtu = 54;
+
+ /* The 3-DH1 packet has between 2 and 85 information bytes
+ * (including the 2-byte payload header)
+ */
+ if (!(conn->pkt_type & HCI_3DH1))
+ chan->imtu = 83;
+
+ /* The 2-DH3 packet has between 2 and 369 information bytes
+ * (including the 2-byte payload header)
+ */
+ if (!(conn->pkt_type & HCI_2DH3))
+ chan->imtu = 367;
+
+ /* The 3-DH3 packet has between 2 and 554 information bytes
+ * (including the 2-byte payload header)
+ */
+ if (!(conn->pkt_type & HCI_3DH3))
+ chan->imtu = 552;
+
+ /* The 2-DH5 packet has between 2 and 681 information bytes
+ * (including the 2-byte payload header)
+ */
+ if (!(conn->pkt_type & HCI_2DH5))
+ chan->imtu = 679;
+
+ /* The 3-DH5 packet has between 2 and 1023 information bytes
+ * (including the 2-byte payload header)
+ */
+ if (!(conn->pkt_type & HCI_3DH5))
+ chan->imtu = 1021;
+}
+
static int l2cap_build_conf_req(struct l2cap_chan *chan, void *data, size_t data_size)
{
struct l2cap_conf_req *req = data;
@@ -3255,8 +3301,12 @@ static int l2cap_build_conf_req(struct l2cap_chan *chan, void *data, size_t data
}
done:
- if (chan->imtu != L2CAP_DEFAULT_MTU)
- l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->imtu, endptr - ptr);
+ if (chan->imtu != L2CAP_DEFAULT_MTU) {
+ if (!chan->imtu)
+ l2cap_mtu_auto(chan);
+ l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->imtu,
+ endptr - ptr);
+ }
switch (chan->mode) {
case L2CAP_MODE_BASIC:
@@ -4936,10 +4986,8 @@ void __l2cap_physical_cfm(struct l2cap_chan *chan, int result)
BT_DBG("chan %p, result %d, local_amp_id %d, remote_amp_id %d",
chan, result, local_amp_id, remote_amp_id);
- if (chan->state == BT_DISCONN || chan->state == BT_CLOSED) {
- l2cap_chan_unlock(chan);
+ if (chan->state == BT_DISCONN || chan->state == BT_CLOSED)
return;
- }
if (chan->state != BT_CONNECTED) {
l2cap_do_create(chan, result, local_amp_id, remote_amp_id);
@@ -5033,7 +5081,6 @@ static inline int l2cap_move_channel_req(struct l2cap_conn *conn,
chan->move_role = L2CAP_MOVE_ROLE_RESPONDER;
l2cap_move_setup(chan);
chan->move_id = req->dest_amp_id;
- icid = chan->dcid;
if (req->dest_amp_id == AMP_ID_BREDR) {
/* Moving to BR/EDR */
@@ -5305,14 +5352,7 @@ static inline int l2cap_conn_param_update_req(struct l2cap_conn *conn,
memset(&rsp, 0, sizeof(rsp));
- if (min < hcon->le_conn_min_interval ||
- max > hcon->le_conn_max_interval) {
- BT_DBG("requested connection interval exceeds current bounds.");
- err = -EINVAL;
- } else {
- err = hci_check_conn_params(min, max, latency, to_multiplier);
- }
-
+ err = hci_check_conn_params(min, max, latency, to_multiplier);
if (err)
rsp.result = cpu_to_le16(L2CAP_CONN_PARAM_REJECTED);
else
diff --git a/net/bluetooth/lib.c b/net/bluetooth/lib.c
index 63e65d9b4b24..c09e0a3a0ed9 100644
--- a/net/bluetooth/lib.c
+++ b/net/bluetooth/lib.c
@@ -183,6 +183,22 @@ void bt_err(const char *format, ...)
}
EXPORT_SYMBOL(bt_err);
+void bt_warn_ratelimited(const char *format, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, format);
+
+ vaf.fmt = format;
+ vaf.va = &args;
+
+ pr_warn_ratelimited("%pV", &vaf);
+
+ va_end(args);
+}
+EXPORT_SYMBOL(bt_warn_ratelimited);
+
void bt_err_ratelimited(const char *format, ...)
{
struct va_format vaf;
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index 150114e33b20..3074363c68df 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -38,7 +38,7 @@
#include "mgmt_util.h"
#define MGMT_VERSION 1
-#define MGMT_REVISION 14
+#define MGMT_REVISION 15
static const u16 mgmt_commands[] = {
MGMT_OP_READ_INDEX_LIST,
@@ -106,6 +106,7 @@ static const u16 mgmt_commands[] = {
MGMT_OP_START_LIMITED_DISCOVERY,
MGMT_OP_READ_EXT_INFO,
MGMT_OP_SET_APPEARANCE,
+ MGMT_OP_SET_BLOCKED_KEYS,
};
static const u16 mgmt_events[] = {
@@ -175,7 +176,7 @@ static const u16 mgmt_untrusted_events[] = {
"\x00\x00\x00\x00\x00\x00\x00\x00"
/* HCI to MGMT error code conversion table */
-static u8 mgmt_status_table[] = {
+static const u8 mgmt_status_table[] = {
MGMT_STATUS_SUCCESS,
MGMT_STATUS_UNKNOWN_COMMAND, /* Unknown Command */
MGMT_STATUS_NOT_CONNECTED, /* No Connection */
@@ -2341,6 +2342,14 @@ static int load_link_keys(struct sock *sk, struct hci_dev *hdev, void *data,
for (i = 0; i < key_count; i++) {
struct mgmt_link_key_info *key = &cp->keys[i];
+ if (hci_is_blocked_key(hdev,
+ HCI_BLOCKED_KEY_TYPE_LINKKEY,
+ key->val)) {
+ bt_dev_warn(hdev, "Skipping blocked link key for %pMR",
+ &key->addr.bdaddr);
+ continue;
+ }
+
/* Always ignore debug keys and require a new pairing if
* the user wants to use them.
*/
@@ -2588,7 +2597,6 @@ static int get_connections(struct sock *sk, struct hci_dev *hdev, void *data,
{
struct mgmt_rp_get_connections *rp;
struct hci_conn *c;
- size_t rp_len;
int err;
u16 i;
@@ -2608,8 +2616,7 @@ static int get_connections(struct sock *sk, struct hci_dev *hdev, void *data,
i++;
}
- rp_len = sizeof(*rp) + (i * sizeof(struct mgmt_addr_info));
- rp = kmalloc(rp_len, GFP_KERNEL);
+ rp = kmalloc(struct_size(rp, addr, i), GFP_KERNEL);
if (!rp) {
err = -ENOMEM;
goto unlock;
@@ -2629,10 +2636,8 @@ static int get_connections(struct sock *sk, struct hci_dev *hdev, void *data,
rp->conn_count = cpu_to_le16(i);
/* Recalculate length in case of filtered SCO connections, etc */
- rp_len = sizeof(*rp) + (i * sizeof(struct mgmt_addr_info));
-
err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_CONNECTIONS, 0, rp,
- rp_len);
+ struct_size(rp, addr, i));
kfree(rp);
@@ -3286,7 +3291,7 @@ static int set_appearance(struct sock *sk, struct hci_dev *hdev, void *data,
u16 len)
{
struct mgmt_cp_set_appearance *cp = data;
- u16 apperance;
+ u16 appearance;
int err;
BT_DBG("");
@@ -3295,12 +3300,12 @@ static int set_appearance(struct sock *sk, struct hci_dev *hdev, void *data,
return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_APPEARANCE,
MGMT_STATUS_NOT_SUPPORTED);
- apperance = le16_to_cpu(cp->appearance);
+ appearance = le16_to_cpu(cp->appearance);
hci_dev_lock(hdev);
- if (hdev->appearance != apperance) {
- hdev->appearance = apperance;
+ if (hdev->appearance != appearance) {
+ hdev->appearance = appearance;
if (hci_dev_test_flag(hdev, HCI_LE_ADV))
adv_expire(hdev, MGMT_ADV_FLAG_APPEARANCE);
@@ -3535,6 +3540,55 @@ unlock:
return err;
}
+static int set_blocked_keys(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ int err = MGMT_STATUS_SUCCESS;
+ struct mgmt_cp_set_blocked_keys *keys = data;
+ const u16 max_key_count = ((U16_MAX - sizeof(*keys)) /
+ sizeof(struct mgmt_blocked_key_info));
+ u16 key_count, expected_len;
+ int i;
+
+ BT_DBG("request for %s", hdev->name);
+
+ key_count = __le16_to_cpu(keys->key_count);
+ if (key_count > max_key_count) {
+ bt_dev_err(hdev, "too big key_count value %u", key_count);
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_BLOCKED_KEYS,
+ MGMT_STATUS_INVALID_PARAMS);
+ }
+
+ expected_len = struct_size(keys, keys, key_count);
+ if (expected_len != len) {
+ bt_dev_err(hdev, "expected %u bytes, got %u bytes",
+ expected_len, len);
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_BLOCKED_KEYS,
+ MGMT_STATUS_INVALID_PARAMS);
+ }
+
+ hci_dev_lock(hdev);
+
+ hci_blocked_keys_clear(hdev);
+
+ for (i = 0; i < keys->key_count; ++i) {
+ struct blocked_key *b = kzalloc(sizeof(*b), GFP_KERNEL);
+
+ if (!b) {
+ err = MGMT_STATUS_NO_RESOURCES;
+ break;
+ }
+
+ b->type = keys->keys[i].type;
+ memcpy(b->val, keys->keys[i].val, sizeof(b->val));
+ list_add_rcu(&b->list, &hdev->blocked_keys);
+ }
+ hci_dev_unlock(hdev);
+
+ return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_BLOCKED_KEYS,
+ err, NULL, 0);
+}
+
static void read_local_oob_data_complete(struct hci_dev *hdev, u8 status,
u16 opcode, struct sk_buff *skb)
{
@@ -5055,6 +5109,14 @@ static int load_irks(struct sock *sk, struct hci_dev *hdev, void *cp_data,
for (i = 0; i < irk_count; i++) {
struct mgmt_irk_info *irk = &cp->irks[i];
+ if (hci_is_blocked_key(hdev,
+ HCI_BLOCKED_KEY_TYPE_IRK,
+ irk->val)) {
+ bt_dev_warn(hdev, "Skipping blocked IRK for %pMR",
+ &irk->addr.bdaddr);
+ continue;
+ }
+
hci_add_irk(hdev, &irk->addr.bdaddr,
le_addr_type(irk->addr.type), irk->val,
BDADDR_ANY);
@@ -5138,6 +5200,14 @@ static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev,
struct mgmt_ltk_info *key = &cp->keys[i];
u8 type, authenticated;
+ if (hci_is_blocked_key(hdev,
+ HCI_BLOCKED_KEY_TYPE_LTK,
+ key->val)) {
+ bt_dev_warn(hdev, "Skipping blocked LTK for %pMR",
+ &key->addr.bdaddr);
+ continue;
+ }
+
switch (key->type) {
case MGMT_LTK_UNAUTHENTICATED:
authenticated = 0x00;
@@ -6918,6 +6988,8 @@ static const struct hci_mgmt_handler mgmt_handlers[] = {
{ set_appearance, MGMT_SET_APPEARANCE_SIZE },
{ get_phy_configuration, MGMT_GET_PHY_CONFIGURATION_SIZE },
{ set_phy_configuration, MGMT_SET_PHY_CONFIGURATION_SIZE },
+ { set_blocked_keys, MGMT_OP_SET_BLOCKED_KEYS_SIZE,
+ HCI_MGMT_VAR_LEN },
};
void mgmt_index_added(struct hci_dev *hdev)
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index 90bb53aa4bee..b4eaf21360ef 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -24,7 +24,7 @@
/*
* RFCOMM sockets.
*/
-
+#include <linux/compat.h>
#include <linux/export.h>
#include <linux/debugfs.h>
#include <linux/sched/signal.h>
@@ -909,6 +909,13 @@ static int rfcomm_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned lon
return err;
}
+#ifdef CONFIG_COMPAT
+static int rfcomm_sock_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ return rfcomm_sock_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
+}
+#endif
+
static int rfcomm_sock_shutdown(struct socket *sock, int how)
{
struct sock *sk = sock->sk;
@@ -1042,7 +1049,10 @@ static const struct proto_ops rfcomm_sock_ops = {
.gettstamp = sock_gettstamp,
.poll = bt_sock_poll,
.socketpair = sock_no_socketpair,
- .mmap = sock_no_mmap
+ .mmap = sock_no_mmap,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = rfcomm_sock_compat_ioctl,
+#endif
};
static const struct net_proto_family rfcomm_sock_family_ops = {
diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c
index 6c2b4e6e87ba..204f14f8b507 100644
--- a/net/bluetooth/smp.c
+++ b/net/bluetooth/smp.c
@@ -23,6 +23,7 @@
#include <linux/debugfs.h>
#include <linux/scatterlist.h>
#include <linux/crypto.h>
+#include <crypto/aes.h>
#include <crypto/algapi.h>
#include <crypto/b128ops.h>
#include <crypto/hash.h>
@@ -88,7 +89,6 @@ struct smp_dev {
u8 local_rand[16];
bool debug_key;
- struct crypto_cipher *tfm_aes;
struct crypto_shash *tfm_cmac;
struct crypto_kpp *tfm_ecdh;
};
@@ -127,7 +127,6 @@ struct smp_chan {
u8 dhkey[32];
u8 mackey[16];
- struct crypto_cipher *tfm_aes;
struct crypto_shash *tfm_cmac;
struct crypto_kpp *tfm_ecdh;
};
@@ -377,22 +376,18 @@ static int smp_h7(struct crypto_shash *tfm_cmac, const u8 w[16],
* s1 and ah.
*/
-static int smp_e(struct crypto_cipher *tfm, const u8 *k, u8 *r)
+static int smp_e(const u8 *k, u8 *r)
{
+ struct crypto_aes_ctx ctx;
uint8_t tmp[16], data[16];
int err;
SMP_DBG("k %16phN r %16phN", k, r);
- if (!tfm) {
- BT_ERR("tfm %p", tfm);
- return -EINVAL;
- }
-
/* The most significant octet of key corresponds to k[0] */
swap_buf(k, tmp, 16);
- err = crypto_cipher_setkey(tfm, tmp, 16);
+ err = aes_expandkey(&ctx, tmp, 16);
if (err) {
BT_ERR("cipher setkey failed: %d", err);
return err;
@@ -401,17 +396,18 @@ static int smp_e(struct crypto_cipher *tfm, const u8 *k, u8 *r)
/* Most significant octet of plaintextData corresponds to data[0] */
swap_buf(r, data, 16);
- crypto_cipher_encrypt_one(tfm, data, data);
+ aes_encrypt(&ctx, data, data);
/* Most significant octet of encryptedData corresponds to data[0] */
swap_buf(data, r, 16);
SMP_DBG("r %16phN", r);
+ memzero_explicit(&ctx, sizeof (ctx));
return err;
}
-static int smp_c1(struct crypto_cipher *tfm_aes, const u8 k[16],
+static int smp_c1(const u8 k[16],
const u8 r[16], const u8 preq[7], const u8 pres[7], u8 _iat,
const bdaddr_t *ia, u8 _rat, const bdaddr_t *ra, u8 res[16])
{
@@ -436,7 +432,7 @@ static int smp_c1(struct crypto_cipher *tfm_aes, const u8 k[16],
u128_xor((u128 *) res, (u128 *) r, (u128 *) p1);
/* res = e(k, res) */
- err = smp_e(tfm_aes, k, res);
+ err = smp_e(k, res);
if (err) {
BT_ERR("Encrypt data error");
return err;
@@ -453,14 +449,14 @@ static int smp_c1(struct crypto_cipher *tfm_aes, const u8 k[16],
u128_xor((u128 *) res, (u128 *) res, (u128 *) p2);
/* res = e(k, res) */
- err = smp_e(tfm_aes, k, res);
+ err = smp_e(k, res);
if (err)
BT_ERR("Encrypt data error");
return err;
}
-static int smp_s1(struct crypto_cipher *tfm_aes, const u8 k[16],
+static int smp_s1(const u8 k[16],
const u8 r1[16], const u8 r2[16], u8 _r[16])
{
int err;
@@ -469,15 +465,14 @@ static int smp_s1(struct crypto_cipher *tfm_aes, const u8 k[16],
memcpy(_r, r2, 8);
memcpy(_r + 8, r1, 8);
- err = smp_e(tfm_aes, k, _r);
+ err = smp_e(k, _r);
if (err)
BT_ERR("Encrypt data error");
return err;
}
-static int smp_ah(struct crypto_cipher *tfm, const u8 irk[16],
- const u8 r[3], u8 res[3])
+static int smp_ah(const u8 irk[16], const u8 r[3], u8 res[3])
{
u8 _res[16];
int err;
@@ -486,7 +481,7 @@ static int smp_ah(struct crypto_cipher *tfm, const u8 irk[16],
memcpy(_res, r, 3);
memset(_res + 3, 0, 13);
- err = smp_e(tfm, irk, _res);
+ err = smp_e(irk, _res);
if (err) {
BT_ERR("Encrypt error");
return err;
@@ -507,18 +502,15 @@ bool smp_irk_matches(struct hci_dev *hdev, const u8 irk[16],
const bdaddr_t *bdaddr)
{
struct l2cap_chan *chan = hdev->smp_data;
- struct smp_dev *smp;
u8 hash[3];
int err;
if (!chan || !chan->data)
return false;
- smp = chan->data;
-
BT_DBG("RPA %pMR IRK %*phN", bdaddr, 16, irk);
- err = smp_ah(smp->tfm_aes, irk, &bdaddr->b[3], hash);
+ err = smp_ah(irk, &bdaddr->b[3], hash);
if (err)
return false;
@@ -528,20 +520,17 @@ bool smp_irk_matches(struct hci_dev *hdev, const u8 irk[16],
int smp_generate_rpa(struct hci_dev *hdev, const u8 irk[16], bdaddr_t *rpa)
{
struct l2cap_chan *chan = hdev->smp_data;
- struct smp_dev *smp;
int err;
if (!chan || !chan->data)
return -EOPNOTSUPP;
- smp = chan->data;
-
get_random_bytes(&rpa->b[3], 3);
rpa->b[5] &= 0x3f; /* Clear two most significant bits */
rpa->b[5] |= 0x40; /* Set second most significant bit */
- err = smp_ah(smp->tfm_aes, irk, &rpa->b[3], rpa->b);
+ err = smp_ah(irk, &rpa->b[3], rpa->b);
if (err < 0)
return err;
@@ -768,7 +757,6 @@ static void smp_chan_destroy(struct l2cap_conn *conn)
kzfree(smp->slave_csrk);
kzfree(smp->link_key);
- crypto_free_cipher(smp->tfm_aes);
crypto_free_shash(smp->tfm_cmac);
crypto_free_kpp(smp->tfm_ecdh);
@@ -957,7 +945,7 @@ static u8 smp_confirm(struct smp_chan *smp)
BT_DBG("conn %p", conn);
- ret = smp_c1(smp->tfm_aes, smp->tk, smp->prnd, smp->preq, smp->prsp,
+ ret = smp_c1(smp->tk, smp->prnd, smp->preq, smp->prsp,
conn->hcon->init_addr_type, &conn->hcon->init_addr,
conn->hcon->resp_addr_type, &conn->hcon->resp_addr,
cp.confirm_val);
@@ -983,12 +971,9 @@ static u8 smp_random(struct smp_chan *smp)
u8 confirm[16];
int ret;
- if (IS_ERR_OR_NULL(smp->tfm_aes))
- return SMP_UNSPECIFIED;
-
BT_DBG("conn %p %s", conn, conn->hcon->out ? "master" : "slave");
- ret = smp_c1(smp->tfm_aes, smp->tk, smp->rrnd, smp->preq, smp->prsp,
+ ret = smp_c1(smp->tk, smp->rrnd, smp->preq, smp->prsp,
hcon->init_addr_type, &hcon->init_addr,
hcon->resp_addr_type, &hcon->resp_addr, confirm);
if (ret)
@@ -1005,7 +990,7 @@ static u8 smp_random(struct smp_chan *smp)
__le64 rand = 0;
__le16 ediv = 0;
- smp_s1(smp->tfm_aes, smp->tk, smp->rrnd, smp->prnd, stk);
+ smp_s1(smp->tk, smp->rrnd, smp->prnd, stk);
if (test_and_set_bit(HCI_CONN_ENCRYPT_PEND, &hcon->flags))
return SMP_UNSPECIFIED;
@@ -1021,7 +1006,7 @@ static u8 smp_random(struct smp_chan *smp)
smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd),
smp->prnd);
- smp_s1(smp->tfm_aes, smp->tk, smp->prnd, smp->rrnd, stk);
+ smp_s1(smp->tk, smp->prnd, smp->rrnd, stk);
if (hcon->pending_sec_level == BT_SECURITY_HIGH)
auth = 1;
@@ -1389,16 +1374,10 @@ static struct smp_chan *smp_chan_create(struct l2cap_conn *conn)
if (!smp)
return NULL;
- smp->tfm_aes = crypto_alloc_cipher("aes", 0, 0);
- if (IS_ERR(smp->tfm_aes)) {
- BT_ERR("Unable to create AES crypto context");
- goto zfree_smp;
- }
-
smp->tfm_cmac = crypto_alloc_shash("cmac(aes)", 0, 0);
if (IS_ERR(smp->tfm_cmac)) {
BT_ERR("Unable to create CMAC crypto context");
- goto free_cipher;
+ goto zfree_smp;
}
smp->tfm_ecdh = crypto_alloc_kpp("ecdh", CRYPTO_ALG_INTERNAL, 0);
@@ -1420,8 +1399,6 @@ static struct smp_chan *smp_chan_create(struct l2cap_conn *conn)
free_shash:
crypto_free_shash(smp->tfm_cmac);
-free_cipher:
- crypto_free_cipher(smp->tfm_aes);
zfree_smp:
kzfree(smp);
return NULL;
@@ -2476,6 +2453,15 @@ static int smp_cmd_encrypt_info(struct l2cap_conn *conn, struct sk_buff *skb)
if (skb->len < sizeof(*rp))
return SMP_INVALID_PARAMS;
+ /* Pairing is aborted if any blocked keys are distributed */
+ if (hci_is_blocked_key(conn->hcon->hdev, HCI_BLOCKED_KEY_TYPE_LTK,
+ rp->ltk)) {
+ bt_dev_warn_ratelimited(conn->hcon->hdev,
+ "LTK blocked for %pMR",
+ &conn->hcon->dst);
+ return SMP_INVALID_PARAMS;
+ }
+
SMP_ALLOW_CMD(smp, SMP_CMD_MASTER_IDENT);
skb_pull(skb, sizeof(*rp));
@@ -2532,6 +2518,15 @@ static int smp_cmd_ident_info(struct l2cap_conn *conn, struct sk_buff *skb)
if (skb->len < sizeof(*info))
return SMP_INVALID_PARAMS;
+ /* Pairing is aborted if any blocked keys are distributed */
+ if (hci_is_blocked_key(conn->hcon->hdev, HCI_BLOCKED_KEY_TYPE_IRK,
+ info->irk)) {
+ bt_dev_warn_ratelimited(conn->hcon->hdev,
+ "Identity key blocked for %pMR",
+ &conn->hcon->dst);
+ return SMP_INVALID_PARAMS;
+ }
+
SMP_ALLOW_CMD(smp, SMP_CMD_IDENT_ADDR_INFO);
skb_pull(skb, sizeof(*info));
@@ -3232,7 +3227,6 @@ static struct l2cap_chan *smp_add_cid(struct hci_dev *hdev, u16 cid)
{
struct l2cap_chan *chan;
struct smp_dev *smp;
- struct crypto_cipher *tfm_aes;
struct crypto_shash *tfm_cmac;
struct crypto_kpp *tfm_ecdh;
@@ -3245,17 +3239,9 @@ static struct l2cap_chan *smp_add_cid(struct hci_dev *hdev, u16 cid)
if (!smp)
return ERR_PTR(-ENOMEM);
- tfm_aes = crypto_alloc_cipher("aes", 0, 0);
- if (IS_ERR(tfm_aes)) {
- BT_ERR("Unable to create AES crypto context");
- kzfree(smp);
- return ERR_CAST(tfm_aes);
- }
-
tfm_cmac = crypto_alloc_shash("cmac(aes)", 0, 0);
if (IS_ERR(tfm_cmac)) {
BT_ERR("Unable to create CMAC crypto context");
- crypto_free_cipher(tfm_aes);
kzfree(smp);
return ERR_CAST(tfm_cmac);
}
@@ -3264,13 +3250,11 @@ static struct l2cap_chan *smp_add_cid(struct hci_dev *hdev, u16 cid)
if (IS_ERR(tfm_ecdh)) {
BT_ERR("Unable to create ECDH crypto context");
crypto_free_shash(tfm_cmac);
- crypto_free_cipher(tfm_aes);
kzfree(smp);
return ERR_CAST(tfm_ecdh);
}
smp->local_oob = false;
- smp->tfm_aes = tfm_aes;
smp->tfm_cmac = tfm_cmac;
smp->tfm_ecdh = tfm_ecdh;
@@ -3278,7 +3262,6 @@ create_chan:
chan = l2cap_chan_create();
if (!chan) {
if (smp) {
- crypto_free_cipher(smp->tfm_aes);
crypto_free_shash(smp->tfm_cmac);
crypto_free_kpp(smp->tfm_ecdh);
kzfree(smp);
@@ -3326,7 +3309,6 @@ static void smp_del_chan(struct l2cap_chan *chan)
smp = chan->data;
if (smp) {
chan->data = NULL;
- crypto_free_cipher(smp->tfm_aes);
crypto_free_shash(smp->tfm_cmac);
crypto_free_kpp(smp->tfm_ecdh);
kzfree(smp);
@@ -3391,94 +3373,6 @@ static const struct file_operations force_bredr_smp_fops = {
.llseek = default_llseek,
};
-static ssize_t le_min_key_size_read(struct file *file,
- char __user *user_buf,
- size_t count, loff_t *ppos)
-{
- struct hci_dev *hdev = file->private_data;
- char buf[4];
-
- snprintf(buf, sizeof(buf), "%2u\n", hdev->le_min_key_size);
-
- return simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf));
-}
-
-static ssize_t le_min_key_size_write(struct file *file,
- const char __user *user_buf,
- size_t count, loff_t *ppos)
-{
- struct hci_dev *hdev = file->private_data;
- char buf[32];
- size_t buf_size = min(count, (sizeof(buf) - 1));
- u8 key_size;
-
- if (copy_from_user(buf, user_buf, buf_size))
- return -EFAULT;
-
- buf[buf_size] = '\0';
-
- sscanf(buf, "%hhu", &key_size);
-
- if (key_size > hdev->le_max_key_size ||
- key_size < SMP_MIN_ENC_KEY_SIZE)
- return -EINVAL;
-
- hdev->le_min_key_size = key_size;
-
- return count;
-}
-
-static const struct file_operations le_min_key_size_fops = {
- .open = simple_open,
- .read = le_min_key_size_read,
- .write = le_min_key_size_write,
- .llseek = default_llseek,
-};
-
-static ssize_t le_max_key_size_read(struct file *file,
- char __user *user_buf,
- size_t count, loff_t *ppos)
-{
- struct hci_dev *hdev = file->private_data;
- char buf[4];
-
- snprintf(buf, sizeof(buf), "%2u\n", hdev->le_max_key_size);
-
- return simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf));
-}
-
-static ssize_t le_max_key_size_write(struct file *file,
- const char __user *user_buf,
- size_t count, loff_t *ppos)
-{
- struct hci_dev *hdev = file->private_data;
- char buf[32];
- size_t buf_size = min(count, (sizeof(buf) - 1));
- u8 key_size;
-
- if (copy_from_user(buf, user_buf, buf_size))
- return -EFAULT;
-
- buf[buf_size] = '\0';
-
- sscanf(buf, "%hhu", &key_size);
-
- if (key_size > SMP_MAX_ENC_KEY_SIZE ||
- key_size < hdev->le_min_key_size)
- return -EINVAL;
-
- hdev->le_max_key_size = key_size;
-
- return count;
-}
-
-static const struct file_operations le_max_key_size_fops = {
- .open = simple_open,
- .read = le_max_key_size_read,
- .write = le_max_key_size_write,
- .llseek = default_llseek,
-};
-
int smp_register(struct hci_dev *hdev)
{
struct l2cap_chan *chan;
@@ -3503,11 +3397,6 @@ int smp_register(struct hci_dev *hdev)
hdev->smp_data = chan;
- debugfs_create_file("le_min_key_size", 0644, hdev->debugfs, hdev,
- &le_min_key_size_fops);
- debugfs_create_file("le_max_key_size", 0644, hdev->debugfs, hdev,
- &le_max_key_size_fops);
-
/* If the controller does not support BR/EDR Secure Connections
* feature, then the BR/EDR SMP channel shall not be present.
*
@@ -3582,7 +3471,7 @@ static int __init test_debug_key(struct crypto_kpp *tfm_ecdh)
return 0;
}
-static int __init test_ah(struct crypto_cipher *tfm_aes)
+static int __init test_ah(void)
{
const u8 irk[16] = {
0x9b, 0x7d, 0x39, 0x0a, 0xa6, 0x10, 0x10, 0x34,
@@ -3592,7 +3481,7 @@ static int __init test_ah(struct crypto_cipher *tfm_aes)
u8 res[3];
int err;
- err = smp_ah(tfm_aes, irk, r, res);
+ err = smp_ah(irk, r, res);
if (err)
return err;
@@ -3602,7 +3491,7 @@ static int __init test_ah(struct crypto_cipher *tfm_aes)
return 0;
}
-static int __init test_c1(struct crypto_cipher *tfm_aes)
+static int __init test_c1(void)
{
const u8 k[16] = {
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
@@ -3622,7 +3511,7 @@ static int __init test_c1(struct crypto_cipher *tfm_aes)
u8 res[16];
int err;
- err = smp_c1(tfm_aes, k, r, preq, pres, _iat, &ia, _rat, &ra, res);
+ err = smp_c1(k, r, preq, pres, _iat, &ia, _rat, &ra, res);
if (err)
return err;
@@ -3632,7 +3521,7 @@ static int __init test_c1(struct crypto_cipher *tfm_aes)
return 0;
}
-static int __init test_s1(struct crypto_cipher *tfm_aes)
+static int __init test_s1(void)
{
const u8 k[16] = {
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
@@ -3647,7 +3536,7 @@ static int __init test_s1(struct crypto_cipher *tfm_aes)
u8 res[16];
int err;
- err = smp_s1(tfm_aes, k, r1, r2, res);
+ err = smp_s1(k, r1, r2, res);
if (err)
return err;
@@ -3828,8 +3717,7 @@ static const struct file_operations test_smp_fops = {
.llseek = default_llseek,
};
-static int __init run_selftests(struct crypto_cipher *tfm_aes,
- struct crypto_shash *tfm_cmac,
+static int __init run_selftests(struct crypto_shash *tfm_cmac,
struct crypto_kpp *tfm_ecdh)
{
ktime_t calltime, delta, rettime;
@@ -3844,19 +3732,19 @@ static int __init run_selftests(struct crypto_cipher *tfm_aes,
goto done;
}
- err = test_ah(tfm_aes);
+ err = test_ah();
if (err) {
BT_ERR("smp_ah test failed");
goto done;
}
- err = test_c1(tfm_aes);
+ err = test_c1();
if (err) {
BT_ERR("smp_c1 test failed");
goto done;
}
- err = test_s1(tfm_aes);
+ err = test_s1();
if (err) {
BT_ERR("smp_s1 test failed");
goto done;
@@ -3913,21 +3801,13 @@ done:
int __init bt_selftest_smp(void)
{
- struct crypto_cipher *tfm_aes;
struct crypto_shash *tfm_cmac;
struct crypto_kpp *tfm_ecdh;
int err;
- tfm_aes = crypto_alloc_cipher("aes", 0, 0);
- if (IS_ERR(tfm_aes)) {
- BT_ERR("Unable to create AES crypto context");
- return PTR_ERR(tfm_aes);
- }
-
tfm_cmac = crypto_alloc_shash("cmac(aes)", 0, 0);
if (IS_ERR(tfm_cmac)) {
BT_ERR("Unable to create CMAC crypto context");
- crypto_free_cipher(tfm_aes);
return PTR_ERR(tfm_cmac);
}
@@ -3935,14 +3815,12 @@ int __init bt_selftest_smp(void)
if (IS_ERR(tfm_ecdh)) {
BT_ERR("Unable to create ECDH crypto context");
crypto_free_shash(tfm_cmac);
- crypto_free_cipher(tfm_aes);
return PTR_ERR(tfm_ecdh);
}
- err = run_selftests(tfm_aes, tfm_cmac, tfm_ecdh);
+ err = run_selftests(tfm_cmac, tfm_ecdh);
crypto_free_shash(tfm_cmac);
- crypto_free_cipher(tfm_aes);
crypto_free_kpp(tfm_ecdh);
return err;
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 80e6f3a6864d..d555c0d8657d 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -15,7 +15,7 @@
#include <trace/events/bpf_test_run.h>
static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat,
- u32 *retval, u32 *time)
+ u32 *retval, u32 *time, bool xdp)
{
struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = { NULL };
enum bpf_cgroup_storage_type stype;
@@ -41,7 +41,11 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat,
time_start = ktime_get_ns();
for (i = 0; i < repeat; i++) {
bpf_cgroup_storage_set(storage);
- *retval = BPF_PROG_RUN(prog, ctx);
+
+ if (xdp)
+ *retval = bpf_prog_run_xdp(prog, ctx);
+ else
+ *retval = BPF_PROG_RUN(prog, ctx);
if (signal_pending(current)) {
ret = -EINTR;
@@ -105,6 +109,40 @@ out:
return err;
}
+/* Integer types of various sizes and pointer combinations cover variety of
+ * architecture dependent calling conventions. 7+ can be supported in the
+ * future.
+ */
+int noinline bpf_fentry_test1(int a)
+{
+ return a + 1;
+}
+
+int noinline bpf_fentry_test2(int a, u64 b)
+{
+ return a + b;
+}
+
+int noinline bpf_fentry_test3(char a, int b, u64 c)
+{
+ return a + b + c;
+}
+
+int noinline bpf_fentry_test4(void *a, char b, int c, u64 d)
+{
+ return (long)a + b + c + d;
+}
+
+int noinline bpf_fentry_test5(u64 a, void *b, short c, int d, u64 e)
+{
+ return a + (long)b + c + d + e;
+}
+
+int noinline bpf_fentry_test6(u64 a, void *b, short c, int d, void *e, u64 f)
+{
+ return a + (long)b + c + d + (long)e + f;
+}
+
static void *bpf_test_init(const union bpf_attr *kattr, u32 size,
u32 headroom, u32 tailroom)
{
@@ -122,6 +160,15 @@ static void *bpf_test_init(const union bpf_attr *kattr, u32 size,
kfree(data);
return ERR_PTR(-EFAULT);
}
+ if (bpf_fentry_test1(1) != 2 ||
+ bpf_fentry_test2(2, 3) != 5 ||
+ bpf_fentry_test3(4, 5, 6) != 15 ||
+ bpf_fentry_test4((void *)7, 8, 9, 10) != 34 ||
+ bpf_fentry_test5(11, (void *)12, 13, 14, 15) != 65 ||
+ bpf_fentry_test6(16, (void *)17, 18, 19, (void *)20, 21) != 111) {
+ kfree(data);
+ return ERR_PTR(-EFAULT);
+ }
return data;
}
@@ -204,26 +251,53 @@ static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb)
return 0;
/* make sure the fields we don't use are zeroed */
- if (!range_is_zero(__skb, 0, offsetof(struct __sk_buff, priority)))
+ if (!range_is_zero(__skb, 0, offsetof(struct __sk_buff, mark)))
+ return -EINVAL;
+
+ /* mark is allowed */
+
+ if (!range_is_zero(__skb, offsetofend(struct __sk_buff, mark),
+ offsetof(struct __sk_buff, priority)))
return -EINVAL;
/* priority is allowed */
- if (!range_is_zero(__skb, offsetof(struct __sk_buff, priority) +
- FIELD_SIZEOF(struct __sk_buff, priority),
+ if (!range_is_zero(__skb, offsetofend(struct __sk_buff, priority),
offsetof(struct __sk_buff, cb)))
return -EINVAL;
/* cb is allowed */
- if (!range_is_zero(__skb, offsetof(struct __sk_buff, cb) +
- FIELD_SIZEOF(struct __sk_buff, cb),
+ if (!range_is_zero(__skb, offsetofend(struct __sk_buff, cb),
+ offsetof(struct __sk_buff, tstamp)))
+ return -EINVAL;
+
+ /* tstamp is allowed */
+ /* wire_len is allowed */
+ /* gso_segs is allowed */
+
+ if (!range_is_zero(__skb, offsetofend(struct __sk_buff, gso_segs),
sizeof(struct __sk_buff)))
return -EINVAL;
+ skb->mark = __skb->mark;
skb->priority = __skb->priority;
+ skb->tstamp = __skb->tstamp;
memcpy(&cb->data, __skb->cb, QDISC_CB_PRIV_LEN);
+ if (__skb->wire_len == 0) {
+ cb->pkt_len = skb->len;
+ } else {
+ if (__skb->wire_len < skb->len ||
+ __skb->wire_len > GSO_MAX_SIZE)
+ return -EINVAL;
+ cb->pkt_len = __skb->wire_len;
+ }
+
+ if (__skb->gso_segs > GSO_MAX_SEGS)
+ return -EINVAL;
+ skb_shinfo(skb)->gso_segs = __skb->gso_segs;
+
return 0;
}
@@ -234,8 +308,12 @@ static void convert_skb_to___skb(struct sk_buff *skb, struct __sk_buff *__skb)
if (!__skb)
return;
+ __skb->mark = skb->mark;
__skb->priority = skb->priority;
+ __skb->tstamp = skb->tstamp;
memcpy(__skb->cb, &cb->data, QDISC_CB_PRIV_LEN);
+ __skb->wire_len = cb->pkt_len;
+ __skb->gso_segs = skb_shinfo(skb)->gso_segs;
}
int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
@@ -307,7 +385,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
ret = convert___skb_to_skb(skb, ctx);
if (ret)
goto out;
- ret = bpf_test_run(prog, skb, repeat, &retval, &duration);
+ ret = bpf_test_run(prog, skb, repeat, &retval, &duration, false);
if (ret)
goto out;
if (!is_l2) {
@@ -364,8 +442,8 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
rxqueue = __netif_get_rx_queue(current->nsproxy->net_ns->loopback_dev, 0);
xdp.rxq = &rxqueue->xdp_rxq;
-
- ret = bpf_test_run(prog, &xdp, repeat, &retval, &duration);
+ bpf_prog_change_xdp(NULL, prog);
+ ret = bpf_test_run(prog, &xdp, repeat, &retval, &duration, true);
if (ret)
goto out;
if (xdp.data != data + XDP_PACKET_HEADROOM + NET_IP_ALIGN ||
@@ -373,10 +451,26 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
size = xdp.data_end - xdp.data;
ret = bpf_test_finish(kattr, uattr, xdp.data, size, retval, duration);
out:
+ bpf_prog_change_xdp(prog, NULL);
kfree(data);
return ret;
}
+static int verify_user_bpf_flow_keys(struct bpf_flow_keys *ctx)
+{
+ /* make sure the fields we don't use are zeroed */
+ if (!range_is_zero(ctx, 0, offsetof(struct bpf_flow_keys, flags)))
+ return -EINVAL;
+
+ /* flags is allowed */
+
+ if (!range_is_zero(ctx, offsetofend(struct bpf_flow_keys, flags),
+ sizeof(struct bpf_flow_keys)))
+ return -EINVAL;
+
+ return 0;
+}
+
int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
const union bpf_attr *kattr,
union bpf_attr __user *uattr)
@@ -384,9 +478,11 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
u32 size = kattr->test.data_size_in;
struct bpf_flow_dissector ctx = {};
u32 repeat = kattr->test.repeat;
+ struct bpf_flow_keys *user_ctx;
struct bpf_flow_keys flow_keys;
u64 time_start, time_spent = 0;
const struct ethhdr *eth;
+ unsigned int flags = 0;
u32 retval, duration;
void *data;
int ret;
@@ -395,9 +491,6 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
if (prog->type != BPF_PROG_TYPE_FLOW_DISSECTOR)
return -EINVAL;
- if (kattr->test.ctx_in || kattr->test.ctx_out)
- return -EINVAL;
-
if (size < ETH_HLEN)
return -EINVAL;
@@ -410,6 +503,18 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
if (!repeat)
repeat = 1;
+ user_ctx = bpf_ctx_init(kattr, sizeof(struct bpf_flow_keys));
+ if (IS_ERR(user_ctx)) {
+ kfree(data);
+ return PTR_ERR(user_ctx);
+ }
+ if (user_ctx) {
+ ret = verify_user_bpf_flow_keys(user_ctx);
+ if (ret)
+ goto out;
+ flags = user_ctx->flags;
+ }
+
ctx.flow_keys = &flow_keys;
ctx.data = data;
ctx.data_end = (__u8 *)data + size;
@@ -419,7 +524,7 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
time_start = ktime_get_ns();
for (i = 0; i < repeat; i++) {
retval = bpf_flow_dissect(prog, &ctx, eth->h_proto, ETH_HLEN,
- size);
+ size, flags);
if (signal_pending(current)) {
preempt_enable();
@@ -450,8 +555,12 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
ret = bpf_test_finish(kattr, uattr, &flow_keys, sizeof(flow_keys),
retval, duration);
+ if (!ret)
+ ret = bpf_ctx_finish(kattr, uattr, user_ctx,
+ sizeof(struct bpf_flow_keys));
out:
+ kfree(user_ctx);
kfree(data);
return ret;
}
diff --git a/net/bpfilter/Makefile b/net/bpfilter/Makefile
index aa945ab5b655..36580301da70 100644
--- a/net/bpfilter/Makefile
+++ b/net/bpfilter/Makefile
@@ -3,7 +3,7 @@
# Makefile for the Linux BPFILTER layer.
#
-hostprogs-y := bpfilter_umh
+hostprogs := bpfilter_umh
bpfilter_umh-objs := main.o
KBUILD_HOSTCFLAGS += -I $(srctree)/tools/include/ -I $(srctree)/tools/include/uapi
HOSTCC := $(CC)
diff --git a/net/bridge/Makefile b/net/bridge/Makefile
index ac9ef337f0fa..49da7ae6f077 100644
--- a/net/bridge/Makefile
+++ b/net/bridge/Makefile
@@ -20,7 +20,7 @@ obj-$(CONFIG_BRIDGE_NETFILTER) += br_netfilter.o
bridge-$(CONFIG_BRIDGE_IGMP_SNOOPING) += br_multicast.o br_mdb.o
-bridge-$(CONFIG_BRIDGE_VLAN_FILTERING) += br_vlan.o br_vlan_tunnel.o
+bridge-$(CONFIG_BRIDGE_VLAN_FILTERING) += br_vlan.o br_vlan_tunnel.o br_vlan_options.o
bridge-$(CONFIG_NET_SWITCHDEV) += br_switchdev.o
diff --git a/net/bridge/br.c b/net/bridge/br.c
index 8a8f9e5f264f..b6fe30e3768f 100644
--- a/net/bridge/br.c
+++ b/net/bridge/br.c
@@ -312,7 +312,7 @@ static int __init br_init(void)
{
int err;
- BUILD_BUG_ON(sizeof(struct br_input_skb_cb) > FIELD_SIZEOF(struct sk_buff, cb));
+ BUILD_BUG_ON(sizeof(struct br_input_skb_cb) > sizeof_field(struct sk_buff, cb));
err = stp_proto_register(&br_stp_proto);
if (err < 0) {
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 681b72862c16..dc3d2c1dd9d5 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -24,8 +24,6 @@
const struct nf_br_ops __rcu *nf_br_ops __read_mostly;
EXPORT_SYMBOL_GPL(nf_br_ops);
-static struct lock_class_key bridge_netdev_addr_lock_key;
-
/* net device transmit always called with BH disabled */
netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
{
@@ -34,6 +32,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
struct net_bridge_mdb_entry *mdst;
struct pcpu_sw_netstats *brstats = this_cpu_ptr(br->stats);
const struct nf_br_ops *nf_ops;
+ u8 state = BR_STATE_FORWARDING;
const unsigned char *dest;
struct ethhdr *eth;
u16 vid = 0;
@@ -58,7 +57,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
eth = eth_hdr(skb);
skb_pull(skb, ETH_HLEN);
- if (!br_allowed_ingress(br, br_vlan_group_rcu(br), skb, &vid))
+ if (!br_allowed_ingress(br, br_vlan_group_rcu(br), skb, &vid, &state))
goto out;
if (IS_ENABLED(CONFIG_INET) &&
@@ -108,11 +107,6 @@ out:
return NETDEV_TX_OK;
}
-static void br_set_lockdep_class(struct net_device *dev)
-{
- lockdep_set_class(&dev->addr_list_lock, &bridge_netdev_addr_lock_key);
-}
-
static int br_dev_init(struct net_device *dev)
{
struct net_bridge *br = netdev_priv(dev);
@@ -150,7 +144,6 @@ static int br_dev_init(struct net_device *dev)
br_mdb_hash_fini(br);
br_fdb_hash_fini(br);
}
- br_set_lockdep_class(dev);
return err;
}
@@ -253,6 +246,12 @@ static int br_set_mac_address(struct net_device *dev, void *p)
if (!is_valid_ether_addr(addr->sa_data))
return -EADDRNOTAVAIL;
+ /* dev_set_mac_addr() can be called by a master device on bridge's
+ * NETDEV_UNREGISTER, but since it's being destroyed do nothing
+ */
+ if (dev->reg_state != NETREG_REGISTERED)
+ return -EBUSY;
+
spin_lock_bh(&br->lock);
if (!ether_addr_equal(dev->dev_addr, addr->sa_data)) {
/* Mac address will be changed in br_stp_change_bridge_id(). */
@@ -271,6 +270,37 @@ static void br_getinfo(struct net_device *dev, struct ethtool_drvinfo *info)
strlcpy(info->bus_info, "N/A", sizeof(info->bus_info));
}
+static int br_get_link_ksettings(struct net_device *dev,
+ struct ethtool_link_ksettings *cmd)
+{
+ struct net_bridge *br = netdev_priv(dev);
+ struct net_bridge_port *p;
+
+ cmd->base.duplex = DUPLEX_UNKNOWN;
+ cmd->base.port = PORT_OTHER;
+ cmd->base.speed = SPEED_UNKNOWN;
+
+ list_for_each_entry(p, &br->port_list, list) {
+ struct ethtool_link_ksettings ecmd;
+ struct net_device *pdev = p->dev;
+
+ if (!netif_running(pdev) || !netif_oper_up(pdev))
+ continue;
+
+ if (__ethtool_get_link_ksettings(pdev, &ecmd))
+ continue;
+
+ if (ecmd.base.speed == (__u32)SPEED_UNKNOWN)
+ continue;
+
+ if (cmd->base.speed == (__u32)SPEED_UNKNOWN ||
+ cmd->base.speed < ecmd.base.speed)
+ cmd->base.speed = ecmd.base.speed;
+ }
+
+ return 0;
+}
+
static netdev_features_t br_fix_features(struct net_device *dev,
netdev_features_t features)
{
@@ -373,8 +403,9 @@ static int br_del_slave(struct net_device *dev, struct net_device *slave_dev)
}
static const struct ethtool_ops br_ethtool_ops = {
- .get_drvinfo = br_getinfo,
- .get_link = ethtool_op_get_link,
+ .get_drvinfo = br_getinfo,
+ .get_link = ethtool_op_get_link,
+ .get_link_ksettings = br_get_link_ksettings,
};
static const struct net_device_ops br_netdev_ops = {
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index b1d3248c0252..4877a0db16c6 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -75,8 +75,9 @@ static inline unsigned long hold_time(const struct net_bridge *br)
static inline int has_expired(const struct net_bridge *br,
const struct net_bridge_fdb_entry *fdb)
{
- return !fdb->is_static && !fdb->added_by_external_learn &&
- time_before_eq(fdb->updated + hold_time(br), jiffies);
+ return !test_bit(BR_FDB_STATIC, &fdb->flags) &&
+ !test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags) &&
+ time_before_eq(fdb->updated + hold_time(br), jiffies);
}
static void fdb_rcu_free(struct rcu_head *head)
@@ -197,7 +198,7 @@ static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f,
{
trace_fdb_delete(br, f);
- if (f->is_static)
+ if (test_bit(BR_FDB_STATIC, &f->flags))
fdb_del_hw_addr(br, f->key.addr.addr);
hlist_del_init_rcu(&f->fdb_node);
@@ -224,7 +225,7 @@ static void fdb_delete_local(struct net_bridge *br,
if (op != p && ether_addr_equal(op->dev->dev_addr, addr) &&
(!vid || br_vlan_find(vg, vid))) {
f->dst = op;
- f->added_by_user = 0;
+ clear_bit(BR_FDB_ADDED_BY_USER, &f->flags);
return;
}
}
@@ -235,7 +236,7 @@ static void fdb_delete_local(struct net_bridge *br,
if (p && ether_addr_equal(br->dev->dev_addr, addr) &&
(!vid || (v && br_vlan_should_use(v)))) {
f->dst = NULL;
- f->added_by_user = 0;
+ clear_bit(BR_FDB_ADDED_BY_USER, &f->flags);
return;
}
@@ -250,7 +251,8 @@ void br_fdb_find_delete_local(struct net_bridge *br,
spin_lock_bh(&br->hash_lock);
f = br_fdb_find(br, addr, vid);
- if (f && f->is_local && !f->added_by_user && f->dst == p)
+ if (f && test_bit(BR_FDB_LOCAL, &f->flags) &&
+ !test_bit(BR_FDB_ADDED_BY_USER, &f->flags) && f->dst == p)
fdb_delete_local(br, p, f);
spin_unlock_bh(&br->hash_lock);
}
@@ -265,7 +267,8 @@ void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr)
spin_lock_bh(&br->hash_lock);
vg = nbp_vlan_group(p);
hlist_for_each_entry(f, &br->fdb_list, fdb_node) {
- if (f->dst == p && f->is_local && !f->added_by_user) {
+ if (f->dst == p && test_bit(BR_FDB_LOCAL, &f->flags) &&
+ !test_bit(BR_FDB_ADDED_BY_USER, &f->flags)) {
/* delete old one */
fdb_delete_local(br, p, f);
@@ -306,7 +309,8 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr)
/* If old entry was unassociated with any port, then delete it. */
f = br_fdb_find(br, br->dev->dev_addr, 0);
- if (f && f->is_local && !f->dst && !f->added_by_user)
+ if (f && test_bit(BR_FDB_LOCAL, &f->flags) &&
+ !f->dst && !test_bit(BR_FDB_ADDED_BY_USER, &f->flags))
fdb_delete_local(br, NULL, f);
fdb_insert(br, NULL, newaddr, 0);
@@ -321,7 +325,8 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr)
if (!br_vlan_should_use(v))
continue;
f = br_fdb_find(br, br->dev->dev_addr, v->vid);
- if (f && f->is_local && !f->dst && !f->added_by_user)
+ if (f && test_bit(BR_FDB_LOCAL, &f->flags) &&
+ !f->dst && !test_bit(BR_FDB_ADDED_BY_USER, &f->flags))
fdb_delete_local(br, NULL, f);
fdb_insert(br, NULL, newaddr, v->vid);
}
@@ -346,7 +351,8 @@ void br_fdb_cleanup(struct work_struct *work)
hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) {
unsigned long this_timer;
- if (f->is_static || f->added_by_external_learn)
+ if (test_bit(BR_FDB_STATIC, &f->flags) ||
+ test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &f->flags))
continue;
this_timer = f->updated + delay;
if (time_after(this_timer, now)) {
@@ -373,7 +379,7 @@ void br_fdb_flush(struct net_bridge *br)
spin_lock_bh(&br->hash_lock);
hlist_for_each_entry_safe(f, tmp, &br->fdb_list, fdb_node) {
- if (!f->is_static)
+ if (!test_bit(BR_FDB_STATIC, &f->flags))
fdb_delete(br, f, true);
}
spin_unlock_bh(&br->hash_lock);
@@ -397,10 +403,11 @@ void br_fdb_delete_by_port(struct net_bridge *br,
continue;
if (!do_all)
- if (f->is_static || (vid && f->key.vlan_id != vid))
+ if (test_bit(BR_FDB_STATIC, &f->flags) ||
+ (vid && f->key.vlan_id != vid))
continue;
- if (f->is_local)
+ if (test_bit(BR_FDB_LOCAL, &f->flags))
fdb_delete_local(br, p, f);
else
fdb_delete(br, f, true);
@@ -469,8 +476,8 @@ int br_fdb_fillbuf(struct net_bridge *br, void *buf,
fe->port_no = f->dst->port_no;
fe->port_hi = f->dst->port_no >> 8;
- fe->is_local = f->is_local;
- if (!f->is_static)
+ fe->is_local = test_bit(BR_FDB_LOCAL, &f->flags);
+ if (!test_bit(BR_FDB_STATIC, &f->flags))
fe->ageing_timer_value = jiffies_delta_to_clock_t(jiffies - f->updated);
++fe;
++num;
@@ -484,8 +491,7 @@ static struct net_bridge_fdb_entry *fdb_create(struct net_bridge *br,
struct net_bridge_port *source,
const unsigned char *addr,
__u16 vid,
- unsigned char is_local,
- unsigned char is_static)
+ unsigned long flags)
{
struct net_bridge_fdb_entry *fdb;
@@ -494,12 +500,7 @@ static struct net_bridge_fdb_entry *fdb_create(struct net_bridge *br,
memcpy(fdb->key.addr.addr, addr, ETH_ALEN);
fdb->dst = source;
fdb->key.vlan_id = vid;
- fdb->is_local = is_local;
- fdb->is_static = is_static;
- fdb->added_by_user = 0;
- fdb->added_by_external_learn = 0;
- fdb->offloaded = 0;
- fdb->is_sticky = 0;
+ fdb->flags = flags;
fdb->updated = fdb->used = jiffies;
if (rhashtable_lookup_insert_fast(&br->fdb_hash_tbl,
&fdb->rhnode,
@@ -526,14 +527,15 @@ static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
/* it is okay to have multiple ports with same
* address, just use the first one.
*/
- if (fdb->is_local)
+ if (test_bit(BR_FDB_LOCAL, &fdb->flags))
return 0;
br_warn(br, "adding interface %s with same address as a received packet (addr:%pM, vlan:%u)\n",
source ? source->dev->name : br->dev->name, addr, vid);
fdb_delete(br, fdb, true);
}
- fdb = fdb_create(br, source, addr, vid, 1, 1);
+ fdb = fdb_create(br, source, addr, vid,
+ BIT(BR_FDB_LOCAL) | BIT(BR_FDB_STATIC));
if (!fdb)
return -ENOMEM;
@@ -555,7 +557,7 @@ int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
}
void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
- const unsigned char *addr, u16 vid, bool added_by_user)
+ const unsigned char *addr, u16 vid, unsigned long flags)
{
struct net_bridge_fdb_entry *fdb;
bool fdb_modified = false;
@@ -564,15 +566,10 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
if (hold_time(br) == 0)
return;
- /* ignore packets unless we are using this port */
- if (!(source->state == BR_STATE_LEARNING ||
- source->state == BR_STATE_FORWARDING))
- return;
-
fdb = fdb_find_rcu(&br->fdb_hash_tbl, addr, vid);
if (likely(fdb)) {
/* attempt to update an entry for a local interface */
- if (unlikely(fdb->is_local)) {
+ if (unlikely(test_bit(BR_FDB_LOCAL, &fdb->flags))) {
if (net_ratelimit())
br_warn(br, "received packet on %s with own address as source address (addr:%pM, vlan:%u)\n",
source->dev->name, addr, vid);
@@ -580,30 +577,30 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
unsigned long now = jiffies;
/* fastpath: update of existing entry */
- if (unlikely(source != fdb->dst && !fdb->is_sticky)) {
+ if (unlikely(source != fdb->dst &&
+ !test_bit(BR_FDB_STICKY, &fdb->flags))) {
fdb->dst = source;
fdb_modified = true;
/* Take over HW learned entry */
- if (unlikely(fdb->added_by_external_learn))
- fdb->added_by_external_learn = 0;
+ if (unlikely(test_bit(BR_FDB_ADDED_BY_EXT_LEARN,
+ &fdb->flags)))
+ clear_bit(BR_FDB_ADDED_BY_EXT_LEARN,
+ &fdb->flags);
}
if (now != fdb->updated)
fdb->updated = now;
- if (unlikely(added_by_user))
- fdb->added_by_user = 1;
+ if (unlikely(test_bit(BR_FDB_ADDED_BY_USER, &flags)))
+ set_bit(BR_FDB_ADDED_BY_USER, &fdb->flags);
if (unlikely(fdb_modified)) {
- trace_br_fdb_update(br, source, addr, vid, added_by_user);
+ trace_br_fdb_update(br, source, addr, vid, flags);
fdb_notify(br, fdb, RTM_NEWNEIGH, true);
}
}
} else {
spin_lock(&br->hash_lock);
- fdb = fdb_create(br, source, addr, vid, 0, 0);
+ fdb = fdb_create(br, source, addr, vid, flags);
if (fdb) {
- if (unlikely(added_by_user))
- fdb->added_by_user = 1;
- trace_br_fdb_update(br, source, addr, vid,
- added_by_user);
+ trace_br_fdb_update(br, source, addr, vid, flags);
fdb_notify(br, fdb, RTM_NEWNEIGH, true);
}
/* else we lose race and someone else inserts
@@ -616,9 +613,9 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
static int fdb_to_nud(const struct net_bridge *br,
const struct net_bridge_fdb_entry *fdb)
{
- if (fdb->is_local)
+ if (test_bit(BR_FDB_LOCAL, &fdb->flags))
return NUD_PERMANENT;
- else if (fdb->is_static)
+ else if (test_bit(BR_FDB_STATIC, &fdb->flags))
return NUD_NOARP;
else if (has_expired(br, fdb))
return NUD_STALE;
@@ -648,11 +645,11 @@ static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br,
ndm->ndm_ifindex = fdb->dst ? fdb->dst->dev->ifindex : br->dev->ifindex;
ndm->ndm_state = fdb_to_nud(br, fdb);
- if (fdb->offloaded)
+ if (test_bit(BR_FDB_OFFLOADED, &fdb->flags))
ndm->ndm_flags |= NTF_OFFLOADED;
- if (fdb->added_by_external_learn)
+ if (test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags))
ndm->ndm_flags |= NTF_EXT_LEARNED;
- if (fdb->is_sticky)
+ if (test_bit(BR_FDB_STICKY, &fdb->flags))
ndm->ndm_flags |= NTF_STICKY;
if (nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->key.addr))
@@ -799,7 +796,7 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source,
const u8 *addr, u16 state, u16 flags, u16 vid,
u8 ndm_flags)
{
- u8 is_sticky = !!(ndm_flags & NTF_STICKY);
+ bool is_sticky = !!(ndm_flags & NTF_STICKY);
struct net_bridge_fdb_entry *fdb;
bool modified = false;
@@ -823,7 +820,7 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source,
if (!(flags & NLM_F_CREATE))
return -ENOENT;
- fdb = fdb_create(br, source, addr, vid, 0, 0);
+ fdb = fdb_create(br, source, addr, vid, 0);
if (!fdb)
return -ENOMEM;
@@ -840,34 +837,28 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source,
if (fdb_to_nud(br, fdb) != state) {
if (state & NUD_PERMANENT) {
- fdb->is_local = 1;
- if (!fdb->is_static) {
- fdb->is_static = 1;
+ set_bit(BR_FDB_LOCAL, &fdb->flags);
+ if (!test_and_set_bit(BR_FDB_STATIC, &fdb->flags))
fdb_add_hw_addr(br, addr);
- }
} else if (state & NUD_NOARP) {
- fdb->is_local = 0;
- if (!fdb->is_static) {
- fdb->is_static = 1;
+ clear_bit(BR_FDB_LOCAL, &fdb->flags);
+ if (!test_and_set_bit(BR_FDB_STATIC, &fdb->flags))
fdb_add_hw_addr(br, addr);
- }
} else {
- fdb->is_local = 0;
- if (fdb->is_static) {
- fdb->is_static = 0;
+ clear_bit(BR_FDB_LOCAL, &fdb->flags);
+ if (test_and_clear_bit(BR_FDB_STATIC, &fdb->flags))
fdb_del_hw_addr(br, addr);
- }
}
modified = true;
}
- if (is_sticky != fdb->is_sticky) {
- fdb->is_sticky = is_sticky;
+ if (is_sticky != test_bit(BR_FDB_STICKY, &fdb->flags)) {
+ change_bit(BR_FDB_STICKY, &fdb->flags);
modified = true;
}
- fdb->added_by_user = 1;
+ set_bit(BR_FDB_ADDED_BY_USER, &fdb->flags);
fdb->used = jiffies;
if (modified) {
@@ -890,9 +881,12 @@ static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br,
br->dev->name);
return -EINVAL;
}
+ if (!nbp_state_should_learn(p))
+ return 0;
+
local_bh_disable();
rcu_read_lock();
- br_fdb_update(br, p, addr, vid, true);
+ br_fdb_update(br, p, addr, vid, BIT(BR_FDB_ADDED_BY_USER));
rcu_read_unlock();
local_bh_enable();
} else if (ndm->ndm_flags & NTF_EXT_LEARNED) {
@@ -1064,7 +1058,7 @@ int br_fdb_sync_static(struct net_bridge *br, struct net_bridge_port *p)
rcu_read_lock();
hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) {
/* We only care for static entries */
- if (!f->is_static)
+ if (!test_bit(BR_FDB_STATIC, &f->flags))
continue;
err = dev_uc_add(p->dev, f->key.addr.addr);
if (err)
@@ -1078,7 +1072,7 @@ done:
rollback:
hlist_for_each_entry_rcu(tmp, &br->fdb_list, fdb_node) {
/* We only care for static entries */
- if (!tmp->is_static)
+ if (!test_bit(BR_FDB_STATIC, &tmp->flags))
continue;
if (tmp == f)
break;
@@ -1097,7 +1091,7 @@ void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p)
rcu_read_lock();
hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) {
/* We only care for static entries */
- if (!f->is_static)
+ if (!test_bit(BR_FDB_STATIC, &f->flags))
continue;
dev_uc_del(p->dev, f->key.addr.addr);
@@ -1119,14 +1113,15 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
fdb = br_fdb_find(br, addr, vid);
if (!fdb) {
- fdb = fdb_create(br, p, addr, vid, 0, 0);
+ unsigned long flags = BIT(BR_FDB_ADDED_BY_EXT_LEARN);
+
+ if (swdev_notify)
+ flags |= BIT(BR_FDB_ADDED_BY_USER);
+ fdb = fdb_create(br, p, addr, vid, flags);
if (!fdb) {
err = -ENOMEM;
goto err_unlock;
}
- if (swdev_notify)
- fdb->added_by_user = 1;
- fdb->added_by_external_learn = 1;
fdb_notify(br, fdb, RTM_NEWNEIGH, swdev_notify);
} else {
fdb->updated = jiffies;
@@ -1136,17 +1131,17 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
modified = true;
}
- if (fdb->added_by_external_learn) {
+ if (test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags)) {
/* Refresh entry */
fdb->used = jiffies;
- } else if (!fdb->added_by_user) {
+ } else if (!test_bit(BR_FDB_ADDED_BY_USER, &fdb->flags)) {
/* Take over SW learned entry */
- fdb->added_by_external_learn = 1;
+ set_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags);
modified = true;
}
if (swdev_notify)
- fdb->added_by_user = 1;
+ set_bit(BR_FDB_ADDED_BY_USER, &fdb->flags);
if (modified)
fdb_notify(br, fdb, RTM_NEWNEIGH, swdev_notify);
@@ -1168,7 +1163,7 @@ int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p,
spin_lock_bh(&br->hash_lock);
fdb = br_fdb_find(br, addr, vid);
- if (fdb && fdb->added_by_external_learn)
+ if (fdb && test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags))
fdb_delete(br, fdb, swdev_notify);
else
err = -ENOENT;
@@ -1186,8 +1181,8 @@ void br_fdb_offloaded_set(struct net_bridge *br, struct net_bridge_port *p,
spin_lock_bh(&br->hash_lock);
fdb = br_fdb_find(br, addr, vid);
- if (fdb)
- fdb->offloaded = offloaded;
+ if (fdb && offloaded != test_bit(BR_FDB_OFFLOADED, &fdb->flags))
+ change_bit(BR_FDB_OFFLOADED, &fdb->flags);
spin_unlock_bh(&br->hash_lock);
}
@@ -1206,7 +1201,7 @@ void br_fdb_clear_offload(const struct net_device *dev, u16 vid)
spin_lock_bh(&p->br->hash_lock);
hlist_for_each_entry(f, &p->br->fdb_list, fdb_node) {
if (f->dst == p && f->key.vlan_id == vid)
- f->offloaded = 0;
+ clear_bit(BR_FDB_OFFLOADED, &f->flags);
}
spin_unlock_bh(&p->br->hash_lock);
}
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index 86637000f275..7629b63f6f30 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -25,7 +25,7 @@ static inline int should_deliver(const struct net_bridge_port *p,
vg = nbp_vlan_group_rcu(p);
return ((p->flags & BR_HAIRPIN_MODE) || skb->dev != p->dev) &&
- br_allowed_egress(vg, skb) && p->state == BR_STATE_FORWARDING &&
+ p->state == BR_STATE_FORWARDING && br_allowed_egress(vg, skb) &&
nbp_switchdev_allowed_egress(p, skb) &&
!br_skb_isolated(p, skb);
}
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 09b1dd8cd853..fcc260840028 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -76,11 +76,14 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
bool local_rcv, mcast_hit = false;
struct net_bridge *br;
u16 vid = 0;
+ u8 state;
if (!p || p->state == BR_STATE_DISABLED)
goto drop;
- if (!br_allowed_ingress(p->br, nbp_vlan_group_rcu(p), skb, &vid))
+ state = p->state;
+ if (!br_allowed_ingress(p->br, nbp_vlan_group_rcu(p), skb, &vid,
+ &state))
goto out;
nbp_switchdev_frame_mark(p, skb);
@@ -88,7 +91,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
/* insert into forwarding database after filtering to avoid spoofing */
br = p->br;
if (p->flags & BR_LEARNING)
- br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, false);
+ br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, 0);
local_rcv = !!(br->dev->flags & IFF_PROMISC);
if (is_multicast_ether_addr(eth_hdr(skb)->h_dest)) {
@@ -103,7 +106,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
}
}
- if (p->state == BR_STATE_LEARNING)
+ if (state == BR_STATE_LEARNING)
goto drop;
BR_INPUT_SKB_CB(skb)->brdev = br->dev;
@@ -151,7 +154,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
if (dst) {
unsigned long now = jiffies;
- if (dst->is_local)
+ if (test_bit(BR_FDB_LOCAL, &dst->flags))
return br_pass_frame_up(skb);
if (now != dst->used)
@@ -182,9 +185,10 @@ static void __br_handle_local_finish(struct sk_buff *skb)
/* check if vlan is allowed, to avoid spoofing */
if ((p->flags & BR_LEARNING) &&
+ nbp_state_should_learn(p) &&
!br_opt_get(p->br, BROPT_NO_LL_LEARN) &&
br_should_learn(p, skb, &vid))
- br_fdb_update(p->br, p, eth_hdr(skb)->h_source, vid, false);
+ br_fdb_update(p->br, p, eth_hdr(skb)->h_source, vid, 0);
}
/* note: already called with rcu_read_lock */
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index bf6acd34234d..da5ed4cf9233 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -60,6 +60,8 @@ static void __mdb_entry_fill_flags(struct br_mdb_entry *e, unsigned char flags)
e->flags = 0;
if (flags & MDB_PG_FLAGS_OFFLOAD)
e->flags |= MDB_FLAGS_OFFLOAD;
+ if (flags & MDB_PG_FLAGS_FAST_LEAVE)
+ e->flags |= MDB_FLAGS_FAST_LEAVE;
}
static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip)
@@ -75,6 +77,53 @@ static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip)
#endif
}
+static int __mdb_fill_info(struct sk_buff *skb,
+ struct net_bridge_mdb_entry *mp,
+ struct net_bridge_port_group *p)
+{
+ struct timer_list *mtimer;
+ struct nlattr *nest_ent;
+ struct br_mdb_entry e;
+ u8 flags = 0;
+ int ifindex;
+
+ memset(&e, 0, sizeof(e));
+ if (p) {
+ ifindex = p->port->dev->ifindex;
+ mtimer = &p->timer;
+ flags = p->flags;
+ } else {
+ ifindex = mp->br->dev->ifindex;
+ mtimer = &mp->timer;
+ }
+
+ __mdb_entry_fill_flags(&e, flags);
+ e.ifindex = ifindex;
+ e.vid = mp->addr.vid;
+ if (mp->addr.proto == htons(ETH_P_IP))
+ e.addr.u.ip4 = mp->addr.u.ip4;
+#if IS_ENABLED(CONFIG_IPV6)
+ if (mp->addr.proto == htons(ETH_P_IPV6))
+ e.addr.u.ip6 = mp->addr.u.ip6;
+#endif
+ e.addr.proto = mp->addr.proto;
+ nest_ent = nla_nest_start_noflag(skb,
+ MDBA_MDB_ENTRY_INFO);
+ if (!nest_ent)
+ return -EMSGSIZE;
+
+ if (nla_put_nohdr(skb, sizeof(e), &e) ||
+ nla_put_u32(skb,
+ MDBA_MDB_EATTR_TIMER,
+ br_timer_value(mtimer))) {
+ nla_nest_cancel(skb, nest_ent);
+ return -EMSGSIZE;
+ }
+ nla_nest_end(skb, nest_ent);
+
+ return 0;
+}
+
static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
struct net_device *dev)
{
@@ -93,7 +142,6 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
hlist_for_each_entry_rcu(mp, &br->mdb_list, mdb_node) {
struct net_bridge_port_group *p;
struct net_bridge_port_group __rcu **pp;
- struct net_bridge_port *port;
if (idx < s_idx)
goto skip;
@@ -104,43 +152,24 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
break;
}
+ if (mp->host_joined) {
+ err = __mdb_fill_info(skb, mp, NULL);
+ if (err) {
+ nla_nest_cancel(skb, nest2);
+ break;
+ }
+ }
+
for (pp = &mp->ports; (p = rcu_dereference(*pp)) != NULL;
pp = &p->next) {
- struct nlattr *nest_ent;
- struct br_mdb_entry e;
-
- port = p->port;
- if (!port)
+ if (!p->port)
continue;
- memset(&e, 0, sizeof(e));
- e.ifindex = port->dev->ifindex;
- e.vid = p->addr.vid;
- __mdb_entry_fill_flags(&e, p->flags);
- if (p->addr.proto == htons(ETH_P_IP))
- e.addr.u.ip4 = p->addr.u.ip4;
-#if IS_ENABLED(CONFIG_IPV6)
- if (p->addr.proto == htons(ETH_P_IPV6))
- e.addr.u.ip6 = p->addr.u.ip6;
-#endif
- e.addr.proto = p->addr.proto;
- nest_ent = nla_nest_start_noflag(skb,
- MDBA_MDB_ENTRY_INFO);
- if (!nest_ent) {
+ err = __mdb_fill_info(skb, mp, p);
+ if (err) {
nla_nest_cancel(skb, nest2);
- err = -EMSGSIZE;
goto out;
}
- if (nla_put_nohdr(skb, sizeof(e), &e) ||
- nla_put_u32(skb,
- MDBA_MDB_EATTR_TIMER,
- br_timer_value(&p->timer))) {
- nla_nest_cancel(skb, nest_ent);
- nla_nest_cancel(skb, nest2);
- err = -EMSGSIZE;
- goto out;
- }
- nla_nest_end(skb, nest_ent);
}
nla_nest_end(skb, nest2);
skip:
@@ -437,7 +466,7 @@ static int nlmsg_populate_rtr_fill(struct sk_buff *skb,
struct nlmsghdr *nlh;
struct nlattr *nest;
- nlh = nlmsg_put(skb, pid, seq, type, sizeof(*bpm), NLM_F_MULTI);
+ nlh = nlmsg_put(skb, pid, seq, type, sizeof(*bpm), 0);
if (!nlh)
return -EMSGSIZE;
@@ -587,6 +616,19 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port,
return err;
}
+ /* host join */
+ if (!port) {
+ /* don't allow any flags for host-joined groups */
+ if (state)
+ return -EINVAL;
+ if (mp->host_joined)
+ return -EEXIST;
+
+ br_multicast_host_join(mp, false);
+
+ return 0;
+ }
+
for (pp = &mp->ports;
(p = mlock_dereference(*pp, br)) != NULL;
pp = &p->next) {
@@ -611,19 +653,21 @@ static int __br_mdb_add(struct net *net, struct net_bridge *br,
{
struct br_ip ip;
struct net_device *dev;
- struct net_bridge_port *p;
+ struct net_bridge_port *p = NULL;
int ret;
if (!netif_running(br->dev) || !br_opt_get(br, BROPT_MULTICAST_ENABLED))
return -EINVAL;
- dev = __dev_get_by_index(net, entry->ifindex);
- if (!dev)
- return -ENODEV;
+ if (entry->ifindex != br->dev->ifindex) {
+ dev = __dev_get_by_index(net, entry->ifindex);
+ if (!dev)
+ return -ENODEV;
- p = br_port_get_rtnl(dev);
- if (!p || p->br != br || p->state == BR_STATE_DISABLED)
- return -EINVAL;
+ p = br_port_get_rtnl(dev);
+ if (!p || p->br != br || p->state == BR_STATE_DISABLED)
+ return -EINVAL;
+ }
__mdb_entry_to_br_ip(entry, &ip);
@@ -638,9 +682,9 @@ static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh,
{
struct net *net = sock_net(skb->sk);
struct net_bridge_vlan_group *vg;
+ struct net_bridge_port *p = NULL;
struct net_device *dev, *pdev;
struct br_mdb_entry *entry;
- struct net_bridge_port *p;
struct net_bridge_vlan *v;
struct net_bridge *br;
int err;
@@ -651,18 +695,22 @@ static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh,
br = netdev_priv(dev);
+ if (entry->ifindex != br->dev->ifindex) {
+ pdev = __dev_get_by_index(net, entry->ifindex);
+ if (!pdev)
+ return -ENODEV;
+
+ p = br_port_get_rtnl(pdev);
+ if (!p || p->br != br || p->state == BR_STATE_DISABLED)
+ return -EINVAL;
+ vg = nbp_vlan_group(p);
+ } else {
+ vg = br_vlan_group(br);
+ }
+
/* If vlan filtering is enabled and VLAN is not specified
* install mdb entry on all vlans configured on the port.
*/
- pdev = __dev_get_by_index(net, entry->ifindex);
- if (!pdev)
- return -ENODEV;
-
- p = br_port_get_rtnl(pdev);
- if (!p || p->br != br || p->state == BR_STATE_DISABLED)
- return -EINVAL;
-
- vg = nbp_vlan_group(p);
if (br_vlan_enabled(br->dev) && vg && entry->vid == 0) {
list_for_each_entry(v, &vg->vlan_list, vlist) {
entry->vid = v->vid;
@@ -698,6 +746,15 @@ static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry)
if (!mp)
goto unlock;
+ /* host leave */
+ if (entry->ifindex == mp->br->dev->ifindex && mp->host_joined) {
+ br_multicast_host_leave(mp, false);
+ err = 0;
+ if (!mp->ports && netif_running(br->dev))
+ mod_timer(&mp->timer, jiffies);
+ goto unlock;
+ }
+
for (pp = &mp->ports;
(p = mlock_dereference(*pp, br)) != NULL;
pp = &p->next) {
@@ -730,9 +787,9 @@ static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
{
struct net *net = sock_net(skb->sk);
struct net_bridge_vlan_group *vg;
+ struct net_bridge_port *p = NULL;
struct net_device *dev, *pdev;
struct br_mdb_entry *entry;
- struct net_bridge_port *p;
struct net_bridge_vlan *v;
struct net_bridge *br;
int err;
@@ -743,18 +800,22 @@ static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
br = netdev_priv(dev);
+ if (entry->ifindex != br->dev->ifindex) {
+ pdev = __dev_get_by_index(net, entry->ifindex);
+ if (!pdev)
+ return -ENODEV;
+
+ p = br_port_get_rtnl(pdev);
+ if (!p || p->br != br || p->state == BR_STATE_DISABLED)
+ return -EINVAL;
+ vg = nbp_vlan_group(p);
+ } else {
+ vg = br_vlan_group(br);
+ }
+
/* If vlan filtering is enabled and VLAN is not specified
* delete mdb entry on all vlans configured on the port.
*/
- pdev = __dev_get_by_index(net, entry->ifindex);
- if (!pdev)
- return -ENODEV;
-
- p = br_port_get_rtnl(pdev);
- if (!p || p->br != br || p->state == BR_STATE_DISABLED)
- return -EINVAL;
-
- vg = nbp_vlan_group(p);
if (br_vlan_enabled(br->dev) && vg && entry->vid == 0) {
list_for_each_entry(v, &vg->vlan_list, vlist) {
entry->vid = v->vid;
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index f8cac3702712..ad12fe3fca8c 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -148,8 +148,7 @@ static void br_multicast_group_expired(struct timer_list *t)
if (!netif_running(br->dev) || timer_pending(&mp->timer))
goto out;
- mp->host_joined = false;
- br_mdb_notify(br->dev, NULL, &mp->addr, RTM_DELMDB, 0);
+ br_multicast_host_leave(mp, true);
if (mp->ports)
goto out;
@@ -512,6 +511,27 @@ static bool br_port_group_equal(struct net_bridge_port_group *p,
return ether_addr_equal(src, p->eth_addr);
}
+void br_multicast_host_join(struct net_bridge_mdb_entry *mp, bool notify)
+{
+ if (!mp->host_joined) {
+ mp->host_joined = true;
+ if (notify)
+ br_mdb_notify(mp->br->dev, NULL, &mp->addr,
+ RTM_NEWMDB, 0);
+ }
+ mod_timer(&mp->timer, jiffies + mp->br->multicast_membership_interval);
+}
+
+void br_multicast_host_leave(struct net_bridge_mdb_entry *mp, bool notify)
+{
+ if (!mp->host_joined)
+ return;
+
+ mp->host_joined = false;
+ if (notify)
+ br_mdb_notify(mp->br->dev, NULL, &mp->addr, RTM_DELMDB, 0);
+}
+
static int br_multicast_add_group(struct net_bridge *br,
struct net_bridge_port *port,
struct br_ip *group,
@@ -534,11 +554,7 @@ static int br_multicast_add_group(struct net_bridge *br,
goto err;
if (!port) {
- if (!mp->host_joined) {
- mp->host_joined = true;
- br_mdb_notify(br->dev, NULL, &mp->addr, RTM_NEWMDB, 0);
- }
- mod_timer(&mp->timer, now + br->multicast_membership_interval);
+ br_multicast_host_join(mp, true);
goto out;
}
@@ -1396,7 +1412,7 @@ br_multicast_leave_group(struct net_bridge *br,
del_timer(&p->timer);
kfree_rcu(p, rcu);
br_mdb_notify(br->dev, port, group, RTM_DELMDB,
- p->flags);
+ p->flags | MDB_PG_FLAGS_FAST_LEAVE);
if (!mp->ports && !mp->host_joined &&
netif_running(br->dev))
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index d3f9592f4ff8..59980ecfc962 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -496,6 +496,10 @@ static unsigned int br_nf_pre_routing(void *priv,
if (!brnet->call_ip6tables &&
!br_opt_get(br, BROPT_NF_CALL_IP6TABLES))
return NF_ACCEPT;
+ if (!ipv6_mod_enabled()) {
+ pr_warn_once("Module ipv6 is disabled, so call_ip6tables is not supported.");
+ return NF_DROP;
+ }
nf_bridge_pull_encap_header_rcsum(skb);
return br_nf_pre_routing_ipv6(priv, skb, state);
@@ -658,6 +662,9 @@ static unsigned int br_nf_forward_arp(void *priv,
nf_bridge_pull_encap_header(skb);
}
+ if (unlikely(!pskb_may_pull(skb, sizeof(struct arphdr))))
+ return NF_DROP;
+
if (arp_hdr(skb)->ar_pln != 4) {
if (is_vlan_arp(skb, state->net))
nf_bridge_push_encap_header(skb);
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index a0a54482aabc..43dab4066f91 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -561,52 +561,73 @@ static int br_vlan_info(struct net_bridge *br, struct net_bridge_port *p,
return err;
}
-static int br_process_vlan_info(struct net_bridge *br,
- struct net_bridge_port *p, int cmd,
- struct bridge_vlan_info *vinfo_curr,
- struct bridge_vlan_info **vinfo_last,
- bool *changed,
- struct netlink_ext_ack *extack)
+int br_process_vlan_info(struct net_bridge *br,
+ struct net_bridge_port *p, int cmd,
+ struct bridge_vlan_info *vinfo_curr,
+ struct bridge_vlan_info **vinfo_last,
+ bool *changed,
+ struct netlink_ext_ack *extack)
{
- if (!vinfo_curr->vid || vinfo_curr->vid >= VLAN_VID_MASK)
+ int err, rtm_cmd;
+
+ if (!br_vlan_valid_id(vinfo_curr->vid, extack))
return -EINVAL;
+ /* needed for vlan-only NEWVLAN/DELVLAN notifications */
+ rtm_cmd = br_afspec_cmd_to_rtm(cmd);
+
if (vinfo_curr->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) {
- /* check if we are already processing a range */
- if (*vinfo_last)
+ if (!br_vlan_valid_range(vinfo_curr, *vinfo_last, extack))
return -EINVAL;
*vinfo_last = vinfo_curr;
- /* don't allow range of pvids */
- if ((*vinfo_last)->flags & BRIDGE_VLAN_INFO_PVID)
- return -EINVAL;
return 0;
}
if (*vinfo_last) {
struct bridge_vlan_info tmp_vinfo;
- int v, err;
-
- if (!(vinfo_curr->flags & BRIDGE_VLAN_INFO_RANGE_END))
- return -EINVAL;
+ int v, v_change_start = 0;
- if (vinfo_curr->vid <= (*vinfo_last)->vid)
+ if (!br_vlan_valid_range(vinfo_curr, *vinfo_last, extack))
return -EINVAL;
memcpy(&tmp_vinfo, *vinfo_last,
sizeof(struct bridge_vlan_info));
for (v = (*vinfo_last)->vid; v <= vinfo_curr->vid; v++) {
+ bool curr_change = false;
+
tmp_vinfo.vid = v;
- err = br_vlan_info(br, p, cmd, &tmp_vinfo, changed,
+ err = br_vlan_info(br, p, cmd, &tmp_vinfo, &curr_change,
extack);
if (err)
break;
+ if (curr_change) {
+ *changed = curr_change;
+ if (!v_change_start)
+ v_change_start = v;
+ } else {
+ /* nothing to notify yet */
+ if (!v_change_start)
+ continue;
+ br_vlan_notify(br, p, v_change_start,
+ v - 1, rtm_cmd);
+ v_change_start = 0;
+ }
}
+ /* v_change_start is set only if the last/whole range changed */
+ if (v_change_start)
+ br_vlan_notify(br, p, v_change_start,
+ v - 1, rtm_cmd);
+
*vinfo_last = NULL;
return err;
}
- return br_vlan_info(br, p, cmd, vinfo_curr, changed, extack);
+ err = br_vlan_info(br, p, cmd, vinfo_curr, changed, extack);
+ if (*changed)
+ br_vlan_notify(br, p, vinfo_curr->vid, 0, rtm_cmd);
+
+ return err;
}
static int br_afspec(struct net_bridge *br,
@@ -1607,6 +1628,19 @@ static int br_fill_linkxstats(struct sk_buff *skb,
br_multicast_get_stats(br, p, nla_data(nla));
}
#endif
+
+ if (p) {
+ nla = nla_reserve_64bit(skb, BRIDGE_XSTATS_STP,
+ sizeof(p->stp_xstats),
+ BRIDGE_XSTATS_PAD);
+ if (!nla)
+ goto nla_put_failure;
+
+ spin_lock_bh(&br->lock);
+ memcpy(nla_data(nla), &p->stp_xstats, sizeof(p->stp_xstats));
+ spin_unlock_bh(&br->lock);
+ }
+
nla_nest_end(skb, nest);
*prividx = 0;
@@ -1651,6 +1685,7 @@ int __init br_netlink_init(void)
int err;
br_mdb_init();
+ br_vlan_rtnl_init();
rtnl_af_register(&br_af_ops);
err = rtnl_link_register(&br_link_ops);
@@ -1668,6 +1703,7 @@ out_af:
void br_netlink_fini(void)
{
br_mdb_uninit();
+ br_vlan_rtnl_uninit();
rtnl_af_unregister(&br_af_ops);
rtnl_link_unregister(&br_link_ops);
}
diff --git a/net/bridge/br_nf_core.c b/net/bridge/br_nf_core.c
index 2cdfc5d6c25d..8c69f0c95a8e 100644
--- a/net/bridge/br_nf_core.c
+++ b/net/bridge/br_nf_core.c
@@ -22,7 +22,8 @@
#endif
static void fake_update_pmtu(struct dst_entry *dst, struct sock *sk,
- struct sk_buff *skb, u32 mtu)
+ struct sk_buff *skb, u32 mtu,
+ bool confirm_neigh)
{
}
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 646504db0220..5153ffe79a01 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -113,6 +113,7 @@ enum {
* @vid: VLAN id
* @flags: bridge vlan flags
* @priv_flags: private (in-kernel) bridge vlan flags
+ * @state: STP state (e.g. blocking, learning, forwarding)
* @stats: per-cpu VLAN statistics
* @br: if MASTER flag set, this points to a bridge struct
* @port: if MASTER flag unset, this points to a port struct
@@ -133,6 +134,7 @@ struct net_bridge_vlan {
u16 vid;
u16 flags;
u16 priv_flags;
+ u8 state;
struct br_vlan_stats __percpu *stats;
union {
struct net_bridge *br;
@@ -157,6 +159,7 @@ struct net_bridge_vlan {
* @vlan_list: sorted VLAN entry list
* @num_vlans: number of total VLAN entries
* @pvid: PVID VLAN id
+ * @pvid_state: PVID's STP state (e.g. forwarding, learning, blocking)
*
* IMPORTANT: Be careful when checking if there're VLAN entries using list
* primitives because the bridge can have entries in its list which
@@ -170,6 +173,17 @@ struct net_bridge_vlan_group {
struct list_head vlan_list;
u16 num_vlans;
u16 pvid;
+ u8 pvid_state;
+};
+
+/* bridge fdb flags */
+enum {
+ BR_FDB_LOCAL,
+ BR_FDB_STATIC,
+ BR_FDB_STICKY,
+ BR_FDB_ADDED_BY_USER,
+ BR_FDB_ADDED_BY_EXT_LEARN,
+ BR_FDB_OFFLOADED,
};
struct net_bridge_fdb_key {
@@ -183,12 +197,7 @@ struct net_bridge_fdb_entry {
struct net_bridge_fdb_key key;
struct hlist_node fdb_node;
- unsigned char is_local:1,
- is_static:1,
- is_sticky:1,
- added_by_user:1,
- added_by_external_learn:1,
- offloaded:1;
+ unsigned long flags;
/* write-heavy members should not affect lookups */
unsigned long updated ____cacheline_aligned_in_smp;
@@ -199,6 +208,7 @@ struct net_bridge_fdb_entry {
#define MDB_PG_FLAGS_PERMANENT BIT(0)
#define MDB_PG_FLAGS_OFFLOAD BIT(1)
+#define MDB_PG_FLAGS_FAST_LEAVE BIT(2)
struct net_bridge_port_group {
struct net_bridge_port *port;
@@ -277,6 +287,8 @@ struct net_bridge_port {
#endif
u16 group_fwd_mask;
u16 backup_redirected_cnt;
+
+ struct bridge_stp_xstats stp_xstats;
};
#define kobj_to_brport(obj) container_of(obj, struct net_bridge_port, kobj)
@@ -494,6 +506,70 @@ static inline bool br_vlan_should_use(const struct net_bridge_vlan *v)
return true;
}
+static inline bool nbp_state_should_learn(const struct net_bridge_port *p)
+{
+ return p->state == BR_STATE_LEARNING || p->state == BR_STATE_FORWARDING;
+}
+
+static inline bool br_vlan_valid_id(u16 vid, struct netlink_ext_ack *extack)
+{
+ bool ret = vid > 0 && vid < VLAN_VID_MASK;
+
+ if (!ret)
+ NL_SET_ERR_MSG_MOD(extack, "Vlan id is invalid");
+
+ return ret;
+}
+
+static inline bool br_vlan_valid_range(const struct bridge_vlan_info *cur,
+ const struct bridge_vlan_info *last,
+ struct netlink_ext_ack *extack)
+{
+ /* pvid flag is not allowed in ranges */
+ if (cur->flags & BRIDGE_VLAN_INFO_PVID) {
+ NL_SET_ERR_MSG_MOD(extack, "Pvid isn't allowed in a range");
+ return false;
+ }
+
+ /* when cur is the range end, check if:
+ * - it has range start flag
+ * - range ids are invalid (end is equal to or before start)
+ */
+ if (last) {
+ if (cur->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) {
+ NL_SET_ERR_MSG_MOD(extack, "Found a new vlan range start while processing one");
+ return false;
+ } else if (!(cur->flags & BRIDGE_VLAN_INFO_RANGE_END)) {
+ NL_SET_ERR_MSG_MOD(extack, "Vlan range end flag is missing");
+ return false;
+ } else if (cur->vid <= last->vid) {
+ NL_SET_ERR_MSG_MOD(extack, "End vlan id is less than or equal to start vlan id");
+ return false;
+ }
+ }
+
+ /* check for required range flags */
+ if (!(cur->flags & (BRIDGE_VLAN_INFO_RANGE_BEGIN |
+ BRIDGE_VLAN_INFO_RANGE_END))) {
+ NL_SET_ERR_MSG_MOD(extack, "Both vlan range flags are missing");
+ return false;
+ }
+
+ return true;
+}
+
+static inline int br_afspec_cmd_to_rtm(int cmd)
+{
+ switch (cmd) {
+ case RTM_SETLINK:
+ return RTM_NEWVLAN;
+ case RTM_DELLINK:
+ return RTM_DELVLAN;
+ }
+
+ return 0;
+}
+
static inline int br_opt_get(const struct net_bridge *br,
enum net_bridge_opts opt)
{
@@ -565,7 +641,7 @@ int br_fdb_fillbuf(struct net_bridge *br, void *buf, unsigned long count,
int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
const unsigned char *addr, u16 vid);
void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
- const unsigned char *addr, u16 vid, bool added_by_user);
+ const unsigned char *addr, u16 vid, unsigned long flags);
int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
struct net_device *dev, const unsigned char *addr, u16 vid);
@@ -701,6 +777,8 @@ void br_multicast_get_stats(const struct net_bridge *br,
struct br_mcast_stats *dest);
void br_mdb_init(void);
void br_mdb_uninit(void);
+void br_multicast_host_join(struct net_bridge_mdb_entry *mp, bool notify);
+void br_multicast_host_leave(struct net_bridge_mdb_entry *mp, bool notify);
#define mlock_dereference(X, br) \
rcu_dereference_protected(X, lockdep_is_held(&br->multicast_lock))
@@ -861,7 +939,7 @@ static inline int br_multicast_igmp_type(const struct sk_buff *skb)
#ifdef CONFIG_BRIDGE_VLAN_FILTERING
bool br_allowed_ingress(const struct net_bridge *br,
struct net_bridge_vlan_group *vg, struct sk_buff *skb,
- u16 *vid);
+ u16 *vid, u8 *state);
bool br_allowed_egress(struct net_bridge_vlan_group *vg,
const struct sk_buff *skb);
bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid);
@@ -896,6 +974,14 @@ void br_vlan_get_stats(const struct net_bridge_vlan *v,
void br_vlan_port_event(struct net_bridge_port *p, unsigned long event);
int br_vlan_bridge_event(struct net_device *dev, unsigned long event,
void *ptr);
+void br_vlan_rtnl_init(void);
+void br_vlan_rtnl_uninit(void);
+void br_vlan_notify(const struct net_bridge *br,
+ const struct net_bridge_port *p,
+ u16 vid, u16 vid_range,
+ int cmd);
+bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr,
+ const struct net_bridge_vlan *range_end);
static inline struct net_bridge_vlan_group *br_vlan_group(
const struct net_bridge *br)
@@ -947,11 +1033,15 @@ static inline u16 br_get_pvid(const struct net_bridge_vlan_group *vg)
return vg->pvid;
}
+static inline u16 br_vlan_flags(const struct net_bridge_vlan *v, u16 pvid)
+{
+ return v->vid == pvid ? v->flags | BRIDGE_VLAN_INFO_PVID : v->flags;
+}
#else
static inline bool br_allowed_ingress(const struct net_bridge *br,
struct net_bridge_vlan_group *vg,
struct sk_buff *skb,
- u16 *vid)
+ u16 *vid, u8 *state)
{
return true;
}
@@ -1090,6 +1180,70 @@ static inline int br_vlan_bridge_event(struct net_device *dev,
{
return 0;
}
+
+static inline void br_vlan_rtnl_init(void)
+{
+}
+
+static inline void br_vlan_rtnl_uninit(void)
+{
+}
+
+static inline void br_vlan_notify(const struct net_bridge *br,
+ const struct net_bridge_port *p,
+ u16 vid, u16 vid_range,
+ int cmd)
+{
+}
+#endif
+
+/* br_vlan_options.c */
+#ifdef CONFIG_BRIDGE_VLAN_FILTERING
+bool br_vlan_opts_eq(const struct net_bridge_vlan *v1,
+ const struct net_bridge_vlan *v2);
+bool br_vlan_opts_fill(struct sk_buff *skb, const struct net_bridge_vlan *v);
+size_t br_vlan_opts_nl_size(void);
+int br_vlan_process_options(const struct net_bridge *br,
+ const struct net_bridge_port *p,
+ struct net_bridge_vlan *range_start,
+ struct net_bridge_vlan *range_end,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack);
+
+/* vlan state manipulation helpers using *_ONCE to annotate lock-free access */
+static inline u8 br_vlan_get_state(const struct net_bridge_vlan *v)
+{
+ return READ_ONCE(v->state);
+}
+
+static inline void br_vlan_set_state(struct net_bridge_vlan *v, u8 state)
+{
+ WRITE_ONCE(v->state, state);
+}
+
+static inline u8 br_vlan_get_pvid_state(const struct net_bridge_vlan_group *vg)
+{
+ return READ_ONCE(vg->pvid_state);
+}
+
+static inline void br_vlan_set_pvid_state(struct net_bridge_vlan_group *vg,
+ u8 state)
+{
+ WRITE_ONCE(vg->pvid_state, state);
+}
+
+/* learn_allow is true at ingress and false at egress */
+static inline bool br_vlan_state_allowed(u8 state, bool learn_allow)
+{
+ switch (state) {
+ case BR_STATE_LEARNING:
+ return learn_allow;
+ case BR_STATE_FORWARDING:
+ return true;
+ default:
+ return false;
+ }
+}
#endif
struct nf_br_ops {
@@ -1161,6 +1315,12 @@ int br_setlink(struct net_device *dev, struct nlmsghdr *nlmsg, u16 flags,
int br_dellink(struct net_device *dev, struct nlmsghdr *nlmsg, u16 flags);
int br_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct net_device *dev,
u32 filter_mask, int nlflags);
+int br_process_vlan_info(struct net_bridge *br,
+ struct net_bridge_port *p, int cmd,
+ struct bridge_vlan_info *vinfo_curr,
+ struct bridge_vlan_info **vinfo_last,
+ bool *changed,
+ struct netlink_ext_ack *extack);
#ifdef CONFIG_SYSFS
/* br_sysfs_if.c */
diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c
index 1f1410f8d312..6856a6d9282b 100644
--- a/net/bridge/br_stp.c
+++ b/net/bridge/br_stp.c
@@ -45,6 +45,17 @@ void br_set_state(struct net_bridge_port *p, unsigned int state)
br_info(p->br, "port %u(%s) entered %s state\n",
(unsigned int) p->port_no, p->dev->name,
br_port_state_names[p->state]);
+
+ if (p->br->stp_enabled == BR_KERNEL_STP) {
+ switch (p->state) {
+ case BR_STATE_BLOCKING:
+ p->stp_xstats.transition_blk++;
+ break;
+ case BR_STATE_FORWARDING:
+ p->stp_xstats.transition_fwd++;
+ break;
+ }
+ }
}
/* called under bridge lock */
@@ -484,6 +495,8 @@ void br_received_config_bpdu(struct net_bridge_port *p,
struct net_bridge *br;
int was_root;
+ p->stp_xstats.rx_bpdu++;
+
br = p->br;
was_root = br_is_root_bridge(br);
@@ -517,6 +530,8 @@ void br_received_config_bpdu(struct net_bridge_port *p,
/* called under bridge lock */
void br_received_tcn_bpdu(struct net_bridge_port *p)
{
+ p->stp_xstats.rx_tcn++;
+
if (br_is_designated_port(p)) {
br_info(p->br, "port %u(%s) received tcn bpdu\n",
(unsigned int) p->port_no, p->dev->name);
diff --git a/net/bridge/br_stp_bpdu.c b/net/bridge/br_stp_bpdu.c
index 7796dd9d42d7..0e4572f31330 100644
--- a/net/bridge/br_stp_bpdu.c
+++ b/net/bridge/br_stp_bpdu.c
@@ -118,6 +118,8 @@ void br_send_config_bpdu(struct net_bridge_port *p, struct br_config_bpdu *bpdu)
br_set_ticks(buf+33, bpdu->forward_delay);
br_send_bpdu(p, buf, 35);
+
+ p->stp_xstats.tx_bpdu++;
}
/* called under bridge lock */
@@ -133,6 +135,8 @@ void br_send_tcn_bpdu(struct net_bridge_port *p)
buf[2] = 0;
buf[3] = BPDU_TYPE_TCN;
br_send_bpdu(p, buf, 4);
+
+ p->stp_xstats.tx_tcn++;
}
/*
diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c
index 921310d3cbae..015209bf44aa 100644
--- a/net/bridge/br_switchdev.c
+++ b/net/bridge/br_switchdev.c
@@ -129,15 +129,19 @@ br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type)
br_switchdev_fdb_call_notifiers(false, fdb->key.addr.addr,
fdb->key.vlan_id,
fdb->dst->dev,
- fdb->added_by_user,
- fdb->offloaded);
+ test_bit(BR_FDB_ADDED_BY_USER,
+ &fdb->flags),
+ test_bit(BR_FDB_OFFLOADED,
+ &fdb->flags));
break;
case RTM_NEWNEIGH:
br_switchdev_fdb_call_notifiers(true, fdb->key.addr.addr,
fdb->key.vlan_id,
fdb->dst->dev,
- fdb->added_by_user,
- fdb->offloaded);
+ test_bit(BR_FDB_ADDED_BY_USER,
+ &fdb->flags),
+ test_bit(BR_FDB_OFFLOADED,
+ &fdb->flags));
break;
}
}
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index f5b2aeebbfe9..6b5deca08b89 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -34,13 +34,15 @@ static struct net_bridge_vlan *br_vlan_lookup(struct rhashtable *tbl, u16 vid)
return rhashtable_lookup_fast(tbl, &vid, br_vlan_rht_params);
}
-static bool __vlan_add_pvid(struct net_bridge_vlan_group *vg, u16 vid)
+static bool __vlan_add_pvid(struct net_bridge_vlan_group *vg,
+ const struct net_bridge_vlan *v)
{
- if (vg->pvid == vid)
+ if (vg->pvid == v->vid)
return false;
smp_wmb();
- vg->pvid = vid;
+ br_vlan_set_pvid_state(vg, v->state);
+ vg->pvid = v->vid;
return true;
}
@@ -69,7 +71,7 @@ static bool __vlan_add_flags(struct net_bridge_vlan *v, u16 flags)
vg = nbp_vlan_group(v->port);
if (flags & BRIDGE_VLAN_INFO_PVID)
- ret = __vlan_add_pvid(vg, v->vid);
+ ret = __vlan_add_pvid(vg, v);
else
ret = __vlan_delete_pvid(vg, v->vid);
@@ -257,6 +259,10 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags,
&changed, extack);
if (err)
goto out_filt;
+
+ if (changed)
+ br_vlan_notify(br, NULL, v->vid, 0,
+ RTM_NEWVLAN);
}
masterv = br_vlan_get_master(br, v->vid, extack);
@@ -289,6 +295,9 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags,
vg->num_vlans++;
}
+ /* set the state before publishing */
+ v->state = BR_STATE_FORWARDING;
+
err = rhashtable_lookup_insert_fast(&vg->vlan_hash, &v->vnode,
br_vlan_rht_params);
if (err)
@@ -380,13 +389,31 @@ static void __vlan_group_free(struct net_bridge_vlan_group *vg)
kfree(vg);
}
-static void __vlan_flush(struct net_bridge_vlan_group *vg)
+static void __vlan_flush(const struct net_bridge *br,
+ const struct net_bridge_port *p,
+ struct net_bridge_vlan_group *vg)
{
struct net_bridge_vlan *vlan, *tmp;
+ u16 v_start = 0, v_end = 0;
__vlan_delete_pvid(vg, vg->pvid);
- list_for_each_entry_safe(vlan, tmp, &vg->vlan_list, vlist)
+ list_for_each_entry_safe(vlan, tmp, &vg->vlan_list, vlist) {
+ /* take care of disjoint ranges */
+ if (!v_start) {
+ v_start = vlan->vid;
+ } else if (vlan->vid - v_end != 1) {
+ /* found range end, notify and start next one */
+ br_vlan_notify(br, p, v_start, v_end, RTM_DELVLAN);
+ v_start = vlan->vid;
+ }
+ v_end = vlan->vid;
+
__vlan_del(vlan);
+ }
+
+ /* notify about the last/whole vlan range */
+ if (v_start)
+ br_vlan_notify(br, p, v_start, v_end, RTM_DELVLAN);
}
struct sk_buff *br_handle_vlan(struct net_bridge *br,
@@ -444,7 +471,8 @@ out:
/* Called under RCU */
static bool __allowed_ingress(const struct net_bridge *br,
struct net_bridge_vlan_group *vg,
- struct sk_buff *skb, u16 *vid)
+ struct sk_buff *skb, u16 *vid,
+ u8 *state)
{
struct br_vlan_stats *stats;
struct net_bridge_vlan *v;
@@ -510,13 +538,25 @@ static bool __allowed_ingress(const struct net_bridge *br,
skb->vlan_tci |= pvid;
/* if stats are disabled we can avoid the lookup */
- if (!br_opt_get(br, BROPT_VLAN_STATS_ENABLED))
- return true;
+ if (!br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) {
+ if (*state == BR_STATE_FORWARDING) {
+ *state = br_vlan_get_pvid_state(vg);
+ return br_vlan_state_allowed(*state, true);
+ } else {
+ return true;
+ }
+ }
}
v = br_vlan_find(vg, *vid);
if (!v || !br_vlan_should_use(v))
goto drop;
+ if (*state == BR_STATE_FORWARDING) {
+ *state = br_vlan_get_state(v);
+ if (!br_vlan_state_allowed(*state, true))
+ goto drop;
+ }
+
if (br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) {
stats = this_cpu_ptr(v->stats);
u64_stats_update_begin(&stats->syncp);
@@ -534,7 +574,7 @@ drop:
bool br_allowed_ingress(const struct net_bridge *br,
struct net_bridge_vlan_group *vg, struct sk_buff *skb,
- u16 *vid)
+ u16 *vid, u8 *state)
{
/* If VLAN filtering is disabled on the bridge, all packets are
* permitted.
@@ -544,7 +584,7 @@ bool br_allowed_ingress(const struct net_bridge *br,
return true;
}
- return __allowed_ingress(br, vg, skb, vid);
+ return __allowed_ingress(br, vg, skb, vid, state);
}
/* Called under RCU. */
@@ -560,7 +600,8 @@ bool br_allowed_egress(struct net_bridge_vlan_group *vg,
br_vlan_get_tag(skb, &vid);
v = br_vlan_find(vg, vid);
- if (v && br_vlan_should_use(v))
+ if (v && br_vlan_should_use(v) &&
+ br_vlan_state_allowed(br_vlan_get_state(v), false))
return true;
return false;
@@ -571,6 +612,7 @@ bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid)
{
struct net_bridge_vlan_group *vg;
struct net_bridge *br = p->br;
+ struct net_bridge_vlan *v;
/* If filtering was disabled at input, let it pass. */
if (!br_opt_get(br, BROPT_VLAN_ENABLED))
@@ -585,13 +627,15 @@ bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid)
if (!*vid) {
*vid = br_get_pvid(vg);
- if (!*vid)
+ if (!*vid ||
+ !br_vlan_state_allowed(br_vlan_get_pvid_state(vg), true))
return false;
return true;
}
- if (br_vlan_find(vg, *vid))
+ v = br_vlan_find(vg, *vid);
+ if (v && br_vlan_state_allowed(br_vlan_get_state(v), true))
return true;
return false;
@@ -716,7 +760,7 @@ void br_vlan_flush(struct net_bridge *br)
ASSERT_RTNL();
vg = br_vlan_group(br);
- __vlan_flush(vg);
+ __vlan_flush(br, NULL, vg);
RCU_INIT_POINTER(br->vlgrp, NULL);
synchronize_rcu();
__vlan_group_free(vg);
@@ -925,12 +969,15 @@ static void br_vlan_disable_default_pvid(struct net_bridge *br)
/* Disable default_pvid on all ports where it is still
* configured.
*/
- if (vlan_default_pvid(br_vlan_group(br), pvid))
- br_vlan_delete(br, pvid);
+ if (vlan_default_pvid(br_vlan_group(br), pvid)) {
+ if (!br_vlan_delete(br, pvid))
+ br_vlan_notify(br, NULL, pvid, 0, RTM_DELVLAN);
+ }
list_for_each_entry(p, &br->port_list, list) {
- if (vlan_default_pvid(nbp_vlan_group(p), pvid))
- nbp_vlan_delete(p, pvid);
+ if (vlan_default_pvid(nbp_vlan_group(p), pvid) &&
+ !nbp_vlan_delete(p, pvid))
+ br_vlan_notify(br, p, pvid, 0, RTM_DELVLAN);
}
br->default_pvid = 0;
@@ -972,7 +1019,10 @@ int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid,
&vlchange, extack);
if (err)
goto out;
- br_vlan_delete(br, old_pvid);
+
+ if (br_vlan_delete(br, old_pvid))
+ br_vlan_notify(br, NULL, old_pvid, 0, RTM_DELVLAN);
+ br_vlan_notify(br, NULL, pvid, 0, RTM_NEWVLAN);
set_bit(0, changed);
}
@@ -992,7 +1042,9 @@ int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid,
&vlchange, extack);
if (err)
goto err_port;
- nbp_vlan_delete(p, old_pvid);
+ if (nbp_vlan_delete(p, old_pvid))
+ br_vlan_notify(br, p, old_pvid, 0, RTM_DELVLAN);
+ br_vlan_notify(p->br, p, pvid, 0, RTM_NEWVLAN);
set_bit(p->port_no, changed);
}
@@ -1007,22 +1059,28 @@ err_port:
if (!test_bit(p->port_no, changed))
continue;
- if (old_pvid)
+ if (old_pvid) {
nbp_vlan_add(p, old_pvid,
BRIDGE_VLAN_INFO_PVID |
BRIDGE_VLAN_INFO_UNTAGGED,
&vlchange, NULL);
+ br_vlan_notify(p->br, p, old_pvid, 0, RTM_NEWVLAN);
+ }
nbp_vlan_delete(p, pvid);
+ br_vlan_notify(br, p, pvid, 0, RTM_DELVLAN);
}
if (test_bit(0, changed)) {
- if (old_pvid)
+ if (old_pvid) {
br_vlan_add(br, old_pvid,
BRIDGE_VLAN_INFO_PVID |
BRIDGE_VLAN_INFO_UNTAGGED |
BRIDGE_VLAN_INFO_BRENTRY,
&vlchange, NULL);
+ br_vlan_notify(br, NULL, old_pvid, 0, RTM_NEWVLAN);
+ }
br_vlan_delete(br, pvid);
+ br_vlan_notify(br, NULL, pvid, 0, RTM_DELVLAN);
}
goto out;
}
@@ -1115,6 +1173,7 @@ int nbp_vlan_init(struct net_bridge_port *p, struct netlink_ext_ack *extack)
&changed, extack);
if (ret)
goto err_vlan_add;
+ br_vlan_notify(p->br, p, p->br->default_pvid, 0, RTM_NEWVLAN);
}
out:
return ret;
@@ -1196,7 +1255,7 @@ void nbp_vlan_flush(struct net_bridge_port *port)
ASSERT_RTNL();
vg = nbp_vlan_group(port);
- __vlan_flush(vg);
+ __vlan_flush(port->br, port, vg);
RCU_INIT_POINTER(port->vlgrp, NULL);
synchronize_rcu();
__vlan_group_free(vg);
@@ -1281,6 +1340,8 @@ int br_vlan_get_info(const struct net_device *dev, u16 vid,
p_vinfo->vid = vid;
p_vinfo->flags = v->flags;
+ if (vid == br_get_pvid(vg))
+ p_vinfo->flags |= BRIDGE_VLAN_INFO_PVID;
return 0;
}
EXPORT_SYMBOL_GPL(br_vlan_get_info);
@@ -1460,8 +1521,8 @@ int br_vlan_bridge_event(struct net_device *dev, unsigned long event, void *ptr)
{
struct netdev_notifier_changeupper_info *info;
struct net_bridge *br = netdev_priv(dev);
- bool changed;
- int ret = 0;
+ int vlcmd = 0, ret = 0;
+ bool changed = false;
switch (event) {
case NETDEV_REGISTER:
@@ -1469,9 +1530,11 @@ int br_vlan_bridge_event(struct net_device *dev, unsigned long event, void *ptr)
BRIDGE_VLAN_INFO_PVID |
BRIDGE_VLAN_INFO_UNTAGGED |
BRIDGE_VLAN_INFO_BRENTRY, &changed, NULL);
+ vlcmd = RTM_NEWVLAN;
break;
case NETDEV_UNREGISTER:
- br_vlan_delete(br, br->default_pvid);
+ changed = !br_vlan_delete(br, br->default_pvid);
+ vlcmd = RTM_DELVLAN;
break;
case NETDEV_CHANGEUPPER:
info = ptr;
@@ -1485,6 +1548,8 @@ int br_vlan_bridge_event(struct net_device *dev, unsigned long event, void *ptr)
br_vlan_link_state_change(dev, br);
break;
}
+ if (changed)
+ br_vlan_notify(br, NULL, br->default_pvid, 0, vlcmd);
return ret;
}
@@ -1503,3 +1568,441 @@ void br_vlan_port_event(struct net_bridge_port *p, unsigned long event)
break;
}
}
+
+/* v_opts is used to dump the options which must be equal in the whole range */
+static bool br_vlan_fill_vids(struct sk_buff *skb, u16 vid, u16 vid_range,
+ const struct net_bridge_vlan *v_opts,
+ u16 flags)
+{
+ struct bridge_vlan_info info;
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, BRIDGE_VLANDB_ENTRY);
+ if (!nest)
+ return false;
+
+ memset(&info, 0, sizeof(info));
+ info.vid = vid;
+ if (flags & BRIDGE_VLAN_INFO_UNTAGGED)
+ info.flags |= BRIDGE_VLAN_INFO_UNTAGGED;
+ if (flags & BRIDGE_VLAN_INFO_PVID)
+ info.flags |= BRIDGE_VLAN_INFO_PVID;
+
+ if (nla_put(skb, BRIDGE_VLANDB_ENTRY_INFO, sizeof(info), &info))
+ goto out_err;
+
+ if (vid_range && vid < vid_range &&
+ !(flags & BRIDGE_VLAN_INFO_PVID) &&
+ nla_put_u16(skb, BRIDGE_VLANDB_ENTRY_RANGE, vid_range))
+ goto out_err;
+
+ if (v_opts && !br_vlan_opts_fill(skb, v_opts))
+ goto out_err;
+
+ nla_nest_end(skb, nest);
+
+ return true;
+
+out_err:
+ nla_nest_cancel(skb, nest);
+ return false;
+}
+
+static size_t rtnl_vlan_nlmsg_size(void)
+{
+ return NLMSG_ALIGN(sizeof(struct br_vlan_msg))
+ + nla_total_size(0) /* BRIDGE_VLANDB_ENTRY */
+ + nla_total_size(sizeof(u16)) /* BRIDGE_VLANDB_ENTRY_RANGE */
+ + nla_total_size(sizeof(struct bridge_vlan_info)) /* BRIDGE_VLANDB_ENTRY_INFO */
+ + br_vlan_opts_nl_size(); /* bridge vlan options */
+}
+
+void br_vlan_notify(const struct net_bridge *br,
+ const struct net_bridge_port *p,
+ u16 vid, u16 vid_range,
+ int cmd)
+{
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_vlan *v = NULL;
+ struct br_vlan_msg *bvm;
+ struct nlmsghdr *nlh;
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
+ struct net *net;
+ u16 flags = 0;
+ int ifindex;
+
+ /* right now notifications are done only with rtnl held */
+ ASSERT_RTNL();
+
+ if (p) {
+ ifindex = p->dev->ifindex;
+ vg = nbp_vlan_group(p);
+ net = dev_net(p->dev);
+ } else {
+ ifindex = br->dev->ifindex;
+ vg = br_vlan_group(br);
+ net = dev_net(br->dev);
+ }
+
+ skb = nlmsg_new(rtnl_vlan_nlmsg_size(), GFP_KERNEL);
+ if (!skb)
+ goto out_err;
+
+ err = -EMSGSIZE;
+ nlh = nlmsg_put(skb, 0, 0, cmd, sizeof(*bvm), 0);
+ if (!nlh)
+ goto out_err;
+ bvm = nlmsg_data(nlh);
+ memset(bvm, 0, sizeof(*bvm));
+ bvm->family = AF_BRIDGE;
+ bvm->ifindex = ifindex;
+
+ switch (cmd) {
+ case RTM_NEWVLAN:
+ /* need to find the vlan due to flags/options */
+ v = br_vlan_find(vg, vid);
+ if (!v || !br_vlan_should_use(v))
+ goto out_kfree;
+
+ flags = v->flags;
+ if (br_get_pvid(vg) == v->vid)
+ flags |= BRIDGE_VLAN_INFO_PVID;
+ break;
+ case RTM_DELVLAN:
+ break;
+ default:
+ goto out_kfree;
+ }
+
+ if (!br_vlan_fill_vids(skb, vid, vid_range, v, flags))
+ goto out_err;
+
+ nlmsg_end(skb, nlh);
+ rtnl_notify(skb, net, 0, RTNLGRP_BRVLAN, NULL, GFP_KERNEL);
+ return;
+
+out_err:
+ rtnl_set_sk_err(net, RTNLGRP_BRVLAN, err);
+out_kfree:
+ kfree_skb(skb);
+}
+
+/* check if v_curr can enter a range ending in range_end */
+bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr,
+ const struct net_bridge_vlan *range_end)
+{
+ return v_curr->vid - range_end->vid == 1 &&
+ range_end->flags == v_curr->flags &&
+ br_vlan_opts_eq(v_curr, range_end);
+}
+
+static int br_vlan_dump_dev(const struct net_device *dev,
+ struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct net_bridge_vlan *v, *range_start = NULL, *range_end = NULL;
+ struct net_bridge_vlan_group *vg;
+ int idx = 0, s_idx = cb->args[1];
+ struct nlmsghdr *nlh = NULL;
+ struct net_bridge_port *p;
+ struct br_vlan_msg *bvm;
+ struct net_bridge *br;
+ int err = 0;
+ u16 pvid;
+
+ if (!netif_is_bridge_master(dev) && !netif_is_bridge_port(dev))
+ return -EINVAL;
+
+ if (netif_is_bridge_master(dev)) {
+ br = netdev_priv(dev);
+ vg = br_vlan_group_rcu(br);
+ p = NULL;
+ } else {
+ p = br_port_get_rcu(dev);
+ if (WARN_ON(!p))
+ return -EINVAL;
+ vg = nbp_vlan_group_rcu(p);
+ br = p->br;
+ }
+
+ if (!vg)
+ return 0;
+
+ nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ RTM_NEWVLAN, sizeof(*bvm), NLM_F_MULTI);
+ if (!nlh)
+ return -EMSGSIZE;
+ bvm = nlmsg_data(nlh);
+ memset(bvm, 0, sizeof(*bvm));
+ bvm->family = PF_BRIDGE;
+ bvm->ifindex = dev->ifindex;
+ pvid = br_get_pvid(vg);
+
+ /* idx must stay at range's beginning until it is filled in */
+ list_for_each_entry_rcu(v, &vg->vlan_list, vlist) {
+ if (!br_vlan_should_use(v))
+ continue;
+ if (idx < s_idx) {
+ idx++;
+ continue;
+ }
+
+ if (!range_start) {
+ range_start = v;
+ range_end = v;
+ continue;
+ }
+
+ if (v->vid == pvid || !br_vlan_can_enter_range(v, range_end)) {
+ u16 flags = br_vlan_flags(range_start, pvid);
+
+ if (!br_vlan_fill_vids(skb, range_start->vid,
+ range_end->vid, range_start,
+ flags)) {
+ err = -EMSGSIZE;
+ break;
+ }
+ /* advance number of filled vlans */
+ idx += range_end->vid - range_start->vid + 1;
+
+ range_start = v;
+ }
+ range_end = v;
+ }
+
+ /* err will be 0 and range_start will be set in 3 cases here:
+ * - first vlan (range_start == range_end)
+ * - last vlan (range_start == range_end, not in range)
+ * - last vlan range (range_start != range_end, in range)
+ */
+ if (!err && range_start &&
+ !br_vlan_fill_vids(skb, range_start->vid, range_end->vid,
+ range_start, br_vlan_flags(range_start, pvid)))
+ err = -EMSGSIZE;
+
+ cb->args[1] = err ? idx : 0;
+
+ nlmsg_end(skb, nlh);
+
+ return err;
+}
+
+static int br_vlan_rtm_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ int idx = 0, err = 0, s_idx = cb->args[0];
+ struct net *net = sock_net(skb->sk);
+ struct br_vlan_msg *bvm;
+ struct net_device *dev;
+
+ err = nlmsg_parse(cb->nlh, sizeof(*bvm), NULL, 0, NULL, cb->extack);
+ if (err < 0)
+ return err;
+
+ bvm = nlmsg_data(cb->nlh);
+
+ rcu_read_lock();
+ if (bvm->ifindex) {
+ dev = dev_get_by_index_rcu(net, bvm->ifindex);
+ if (!dev) {
+ err = -ENODEV;
+ goto out_err;
+ }
+ err = br_vlan_dump_dev(dev, skb, cb);
+ if (err && err != -EMSGSIZE)
+ goto out_err;
+ } else {
+ for_each_netdev_rcu(net, dev) {
+ if (idx < s_idx)
+ goto skip;
+
+ err = br_vlan_dump_dev(dev, skb, cb);
+ if (err == -EMSGSIZE)
+ break;
+skip:
+ idx++;
+ }
+ }
+ cb->args[0] = idx;
+ rcu_read_unlock();
+
+ return skb->len;
+
+out_err:
+ rcu_read_unlock();
+
+ return err;
+}
+
+static const struct nla_policy br_vlan_db_policy[BRIDGE_VLANDB_ENTRY_MAX + 1] = {
+ [BRIDGE_VLANDB_ENTRY_INFO] = { .type = NLA_EXACT_LEN,
+ .len = sizeof(struct bridge_vlan_info) },
+ [BRIDGE_VLANDB_ENTRY_RANGE] = { .type = NLA_U16 },
+ [BRIDGE_VLANDB_ENTRY_STATE] = { .type = NLA_U8 },
+};
+
+static int br_vlan_rtm_process_one(struct net_device *dev,
+ const struct nlattr *attr,
+ int cmd, struct netlink_ext_ack *extack)
+{
+ struct bridge_vlan_info *vinfo, vrange_end, *vinfo_last = NULL;
+ struct nlattr *tb[BRIDGE_VLANDB_ENTRY_MAX + 1];
+ bool changed = false, skip_processing = false;
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_port *p = NULL;
+ int err = 0, cmdmap = 0;
+ struct net_bridge *br;
+
+ if (netif_is_bridge_master(dev)) {
+ br = netdev_priv(dev);
+ vg = br_vlan_group(br);
+ } else {
+ p = br_port_get_rtnl(dev);
+ if (WARN_ON(!p))
+ return -ENODEV;
+ br = p->br;
+ vg = nbp_vlan_group(p);
+ }
+
+ if (WARN_ON(!vg))
+ return -ENODEV;
+
+ err = nla_parse_nested(tb, BRIDGE_VLANDB_ENTRY_MAX, attr,
+ br_vlan_db_policy, extack);
+ if (err)
+ return err;
+
+ if (!tb[BRIDGE_VLANDB_ENTRY_INFO]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing vlan entry info");
+ return -EINVAL;
+ }
+ memset(&vrange_end, 0, sizeof(vrange_end));
+
+ vinfo = nla_data(tb[BRIDGE_VLANDB_ENTRY_INFO]);
+ if (vinfo->flags & (BRIDGE_VLAN_INFO_RANGE_BEGIN |
+ BRIDGE_VLAN_INFO_RANGE_END)) {
+ NL_SET_ERR_MSG_MOD(extack, "Old-style vlan ranges are not allowed when using RTM vlan calls");
+ return -EINVAL;
+ }
+ if (!br_vlan_valid_id(vinfo->vid, extack))
+ return -EINVAL;
+
+ if (tb[BRIDGE_VLANDB_ENTRY_RANGE]) {
+ vrange_end.vid = nla_get_u16(tb[BRIDGE_VLANDB_ENTRY_RANGE]);
+ /* validate user-provided flags without RANGE_BEGIN */
+ vrange_end.flags = BRIDGE_VLAN_INFO_RANGE_END | vinfo->flags;
+ vinfo->flags |= BRIDGE_VLAN_INFO_RANGE_BEGIN;
+
+ /* vinfo_last is the range start, vinfo the range end */
+ vinfo_last = vinfo;
+ vinfo = &vrange_end;
+
+ if (!br_vlan_valid_id(vinfo->vid, extack) ||
+ !br_vlan_valid_range(vinfo, vinfo_last, extack))
+ return -EINVAL;
+ }
+
+ switch (cmd) {
+ case RTM_NEWVLAN:
+ cmdmap = RTM_SETLINK;
+ skip_processing = !!(vinfo->flags & BRIDGE_VLAN_INFO_ONLY_OPTS);
+ break;
+ case RTM_DELVLAN:
+ cmdmap = RTM_DELLINK;
+ break;
+ }
+
+ if (!skip_processing) {
+ struct bridge_vlan_info *tmp_last = vinfo_last;
+
+ /* br_process_vlan_info may overwrite vinfo_last */
+ err = br_process_vlan_info(br, p, cmdmap, vinfo, &tmp_last,
+ &changed, extack);
+
+ /* notify first if anything changed */
+ if (changed)
+ br_ifinfo_notify(cmdmap, br, p);
+
+ if (err)
+ return err;
+ }
+
+ /* deal with options */
+ if (cmd == RTM_NEWVLAN) {
+ struct net_bridge_vlan *range_start, *range_end;
+
+ if (vinfo_last) {
+ range_start = br_vlan_find(vg, vinfo_last->vid);
+ range_end = br_vlan_find(vg, vinfo->vid);
+ } else {
+ range_start = br_vlan_find(vg, vinfo->vid);
+ range_end = range_start;
+ }
+
+ err = br_vlan_process_options(br, p, range_start, range_end,
+ tb, extack);
+ }
+
+ return err;
+}
+
+static int br_vlan_rtm_process(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ struct br_vlan_msg *bvm;
+ struct net_device *dev;
+ struct nlattr *attr;
+ int err, vlans = 0;
+ int rem;
+
+ /* this should validate the header and check for remaining bytes */
+ err = nlmsg_parse(nlh, sizeof(*bvm), NULL, BRIDGE_VLANDB_MAX, NULL,
+ extack);
+ if (err < 0)
+ return err;
+
+ bvm = nlmsg_data(nlh);
+ dev = __dev_get_by_index(net, bvm->ifindex);
+ if (!dev)
+ return -ENODEV;
+
+ if (!netif_is_bridge_master(dev) && !netif_is_bridge_port(dev)) {
+ NL_SET_ERR_MSG_MOD(extack, "The device is not a valid bridge or bridge port");
+ return -EINVAL;
+ }
+
+ nlmsg_for_each_attr(attr, nlh, sizeof(*bvm), rem) {
+ if (nla_type(attr) != BRIDGE_VLANDB_ENTRY)
+ continue;
+
+ vlans++;
+ err = br_vlan_rtm_process_one(dev, attr, nlh->nlmsg_type,
+ extack);
+ if (err)
+ break;
+ }
+ if (!vlans) {
+ NL_SET_ERR_MSG_MOD(extack, "No vlans found to process");
+ err = -EINVAL;
+ }
+
+ return err;
+}
+
+void br_vlan_rtnl_init(void)
+{
+ rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_GETVLAN, NULL,
+ br_vlan_rtm_dump, 0);
+ rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_NEWVLAN,
+ br_vlan_rtm_process, NULL, 0);
+ rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_DELVLAN,
+ br_vlan_rtm_process, NULL, 0);
+}
+
+void br_vlan_rtnl_uninit(void)
+{
+ rtnl_unregister(PF_BRIDGE, RTM_GETVLAN);
+ rtnl_unregister(PF_BRIDGE, RTM_NEWVLAN);
+ rtnl_unregister(PF_BRIDGE, RTM_DELVLAN);
+}
diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c
new file mode 100644
index 000000000000..cd2eb194eb98
--- /dev/null
+++ b/net/bridge/br_vlan_options.c
@@ -0,0 +1,160 @@
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (c) 2020, Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/slab.h>
+
+#include "br_private.h"
+
+/* check if the options between two vlans are equal */
+bool br_vlan_opts_eq(const struct net_bridge_vlan *v1,
+ const struct net_bridge_vlan *v2)
+{
+ return v1->state == v2->state;
+}
+
+bool br_vlan_opts_fill(struct sk_buff *skb, const struct net_bridge_vlan *v)
+{
+ return !nla_put_u8(skb, BRIDGE_VLANDB_ENTRY_STATE,
+ br_vlan_get_state(v));
+}
+
+size_t br_vlan_opts_nl_size(void)
+{
+ return nla_total_size(sizeof(u8)); /* BRIDGE_VLANDB_ENTRY_STATE */
+}
+
+static int br_vlan_modify_state(struct net_bridge_vlan_group *vg,
+ struct net_bridge_vlan *v,
+ u8 state,
+ bool *changed,
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge *br;
+
+ ASSERT_RTNL();
+
+ if (state > BR_STATE_BLOCKING) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid vlan state");
+ return -EINVAL;
+ }
+
+ if (br_vlan_is_brentry(v))
+ br = v->br;
+ else
+ br = v->port->br;
+
+ if (br->stp_enabled == BR_KERNEL_STP) {
+ NL_SET_ERR_MSG_MOD(extack, "Can't modify vlan state when using kernel STP");
+ return -EBUSY;
+ }
+
+ if (v->state == state)
+ return 0;
+
+ if (v->vid == br_get_pvid(vg))
+ br_vlan_set_pvid_state(vg, state);
+
+ br_vlan_set_state(v, state);
+ *changed = true;
+
+ return 0;
+}
+
+static int br_vlan_process_one_opts(const struct net_bridge *br,
+ const struct net_bridge_port *p,
+ struct net_bridge_vlan_group *vg,
+ struct net_bridge_vlan *v,
+ struct nlattr **tb,
+ bool *changed,
+ struct netlink_ext_ack *extack)
+{
+ int err;
+
+ *changed = false;
+ if (tb[BRIDGE_VLANDB_ENTRY_STATE]) {
+ u8 state = nla_get_u8(tb[BRIDGE_VLANDB_ENTRY_STATE]);
+
+ err = br_vlan_modify_state(vg, v, state, changed, extack);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+int br_vlan_process_options(const struct net_bridge *br,
+ const struct net_bridge_port *p,
+ struct net_bridge_vlan *range_start,
+ struct net_bridge_vlan *range_end,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge_vlan *v, *curr_start = NULL, *curr_end = NULL;
+ struct net_bridge_vlan_group *vg;
+ int vid, err = 0;
+ u16 pvid;
+
+ if (p)
+ vg = nbp_vlan_group(p);
+ else
+ vg = br_vlan_group(br);
+
+ if (!range_start || !br_vlan_should_use(range_start)) {
+ NL_SET_ERR_MSG_MOD(extack, "Vlan range start doesn't exist, can't process options");
+ return -ENOENT;
+ }
+ if (!range_end || !br_vlan_should_use(range_end)) {
+ NL_SET_ERR_MSG_MOD(extack, "Vlan range end doesn't exist, can't process options");
+ return -ENOENT;
+ }
+
+ pvid = br_get_pvid(vg);
+ for (vid = range_start->vid; vid <= range_end->vid; vid++) {
+ bool changed = false;
+
+ v = br_vlan_find(vg, vid);
+ if (!v || !br_vlan_should_use(v)) {
+ NL_SET_ERR_MSG_MOD(extack, "Vlan in range doesn't exist, can't process options");
+ err = -ENOENT;
+ break;
+ }
+
+ err = br_vlan_process_one_opts(br, p, vg, v, tb, &changed,
+ extack);
+ if (err)
+ break;
+
+ if (changed) {
+ /* vlan options changed, check for range */
+ if (!curr_start) {
+ curr_start = v;
+ curr_end = v;
+ continue;
+ }
+
+ if (v->vid == pvid ||
+ !br_vlan_can_enter_range(v, curr_end)) {
+ br_vlan_notify(br, p, curr_start->vid,
+ curr_end->vid, RTM_NEWVLAN);
+ curr_start = v;
+ }
+ curr_end = v;
+ } else {
+ /* nothing changed and nothing to notify yet */
+ if (!curr_start)
+ continue;
+
+ br_vlan_notify(br, p, curr_start->vid, curr_end->vid,
+ RTM_NEWVLAN);
+ curr_start = NULL;
+ curr_end = NULL;
+ }
+ }
+ if (curr_start)
+ br_vlan_notify(br, p, curr_start->vid, curr_end->vid,
+ RTM_NEWVLAN);
+
+ return err;
+}
diff --git a/net/bridge/netfilter/ebt_802_3.c b/net/bridge/netfilter/ebt_802_3.c
index 2c8fe24400e5..68c2519bdc52 100644
--- a/net/bridge/netfilter/ebt_802_3.c
+++ b/net/bridge/netfilter/ebt_802_3.c
@@ -11,7 +11,13 @@
#include <linux/module.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_bridge/ebtables.h>
-#include <linux/netfilter_bridge/ebt_802_3.h>
+#include <linux/skbuff.h>
+#include <uapi/linux/netfilter_bridge/ebt_802_3.h>
+
+static struct ebt_802_3_hdr *ebt_802_3_hdr(const struct sk_buff *skb)
+{
+ return (struct ebt_802_3_hdr *)skb_mac_header(skb);
+}
static bool
ebt_802_3_mt(const struct sk_buff *skb, struct xt_action_param *par)
diff --git a/net/bridge/netfilter/ebt_dnat.c b/net/bridge/netfilter/ebt_dnat.c
index ed91ea31978a..12a4f4d93681 100644
--- a/net/bridge/netfilter/ebt_dnat.c
+++ b/net/bridge/netfilter/ebt_dnat.c
@@ -20,7 +20,6 @@ static unsigned int
ebt_dnat_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct ebt_nat_info *info = par->targinfo;
- struct net_device *dev;
if (skb_ensure_writable(skb, ETH_ALEN))
return EBT_DROP;
@@ -33,10 +32,22 @@ ebt_dnat_tg(struct sk_buff *skb, const struct xt_action_param *par)
else
skb->pkt_type = PACKET_MULTICAST;
} else {
- if (xt_hooknum(par) != NF_BR_BROUTING)
- dev = br_port_get_rcu(xt_in(par))->br->dev;
- else
+ const struct net_device *dev;
+
+ switch (xt_hooknum(par)) {
+ case NF_BR_BROUTING:
dev = xt_in(par);
+ break;
+ case NF_BR_PRE_ROUTING:
+ dev = br_port_get_rcu(xt_in(par))->br->dev;
+ break;
+ default:
+ dev = NULL;
+ break;
+ }
+
+ if (!dev) /* NF_BR_LOCAL_OUT */
+ return info->target;
if (ether_addr_equal(info->mac, dev->dev_addr))
skb->pkt_type = PACKET_HOST;
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index c8177a89f52c..e1256e03a9a8 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -221,7 +221,7 @@ unsigned int ebt_do_table(struct sk_buff *skb,
return NF_DROP;
}
- ADD_COUNTER(*(counter_base + i), 1, skb->len);
+ ADD_COUNTER(*(counter_base + i), skb->len, 1);
/* these should only watch: not modify, nor tell us
* what to do with the packet
@@ -959,8 +959,8 @@ static void get_counters(const struct ebt_counter *oldcounters,
continue;
counter_base = COUNTER_BASE(oldcounters, nentries, cpu);
for (i = 0; i < nentries; i++)
- ADD_COUNTER(counters[i], counter_base[i].pcnt,
- counter_base[i].bcnt);
+ ADD_COUNTER(counters[i], counter_base[i].bcnt,
+ counter_base[i].pcnt);
}
}
@@ -1280,7 +1280,7 @@ static int do_update_counters(struct net *net, const char *name,
/* we add to the counters of the first cpu */
for (i = 0; i < num_counters; i++)
- ADD_COUNTER(t->private->counters[i], tmp[i].pcnt, tmp[i].bcnt);
+ ADD_COUNTER(t->private->counters[i], tmp[i].bcnt, tmp[i].pcnt);
write_unlock_bh(&t->lock);
ret = 0;
@@ -1867,7 +1867,7 @@ static int ebt_buf_count(struct ebt_entries_buf_state *state, unsigned int sz)
}
static int ebt_buf_add(struct ebt_entries_buf_state *state,
- void *data, unsigned int sz)
+ const void *data, unsigned int sz)
{
if (state->buf_kern_start == NULL)
goto count_only;
@@ -1901,7 +1901,7 @@ enum compat_mwt {
EBT_COMPAT_TARGET,
};
-static int compat_mtw_from_user(struct compat_ebt_entry_mwt *mwt,
+static int compat_mtw_from_user(const struct compat_ebt_entry_mwt *mwt,
enum compat_mwt compat_mwt,
struct ebt_entries_buf_state *state,
const unsigned char *base)
@@ -1979,22 +1979,23 @@ static int compat_mtw_from_user(struct compat_ebt_entry_mwt *mwt,
/* return size of all matches, watchers or target, including necessary
* alignment and padding.
*/
-static int ebt_size_mwt(struct compat_ebt_entry_mwt *match32,
+static int ebt_size_mwt(const struct compat_ebt_entry_mwt *match32,
unsigned int size_left, enum compat_mwt type,
struct ebt_entries_buf_state *state, const void *base)
{
+ const char *buf = (const char *)match32;
int growth = 0;
- char *buf;
if (size_left == 0)
return 0;
- buf = (char *) match32;
-
- while (size_left >= sizeof(*match32)) {
+ do {
struct ebt_entry_match *match_kern;
int ret;
+ if (size_left < sizeof(*match32))
+ return -EINVAL;
+
match_kern = (struct ebt_entry_match *) state->buf_kern_start;
if (match_kern) {
char *tmp;
@@ -2031,22 +2032,18 @@ static int ebt_size_mwt(struct compat_ebt_entry_mwt *match32,
if (match_kern)
match_kern->match_size = ret;
- /* rule should have no remaining data after target */
- if (type == EBT_COMPAT_TARGET && size_left)
- return -EINVAL;
-
match32 = (struct compat_ebt_entry_mwt *) buf;
- }
+ } while (size_left);
return growth;
}
/* called for all ebt_entry structures. */
-static int size_entry_mwt(struct ebt_entry *entry, const unsigned char *base,
+static int size_entry_mwt(const struct ebt_entry *entry, const unsigned char *base,
unsigned int *total,
struct ebt_entries_buf_state *state)
{
- unsigned int i, j, startoff, new_offset = 0;
+ unsigned int i, j, startoff, next_expected_off, new_offset = 0;
/* stores match/watchers/targets & offset of next struct ebt_entry: */
unsigned int offsets[4];
unsigned int *offsets_update = NULL;
@@ -2132,11 +2129,13 @@ static int size_entry_mwt(struct ebt_entry *entry, const unsigned char *base,
return ret;
}
- startoff = state->buf_user_offset - startoff;
+ next_expected_off = state->buf_user_offset - startoff;
+ if (next_expected_off != entry->next_offset)
+ return -EINVAL;
- if (WARN_ON(*total < startoff))
+ if (*total < entry->next_offset)
return -EINVAL;
- *total -= startoff;
+ *total -= entry->next_offset;
return 0;
}
diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c
index 4f5444d2a526..809673222382 100644
--- a/net/bridge/netfilter/nf_conntrack_bridge.c
+++ b/net/bridge/netfilter/nf_conntrack_bridge.c
@@ -17,7 +17,6 @@
#include <net/netfilter/nf_conntrack_bridge.h>
#include <linux/netfilter/nf_tables.h>
-#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
#include <net/netfilter/nf_tables.h>
#include "../br_private.h"
@@ -27,13 +26,14 @@
*/
static int nf_br_ip_fragment(struct net *net, struct sock *sk,
struct sk_buff *skb,
- struct nf_ct_bridge_frag_data *data,
+ struct nf_bridge_frag_data *data,
int (*output)(struct net *, struct sock *sk,
- const struct nf_ct_bridge_frag_data *data,
+ const struct nf_bridge_frag_data *data,
struct sk_buff *))
{
int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size;
unsigned int hlen, ll_rs, mtu;
+ ktime_t tstamp = skb->tstamp;
struct ip_frag_state state;
struct iphdr *iph;
int err;
@@ -81,6 +81,7 @@ static int nf_br_ip_fragment(struct net *net, struct sock *sk,
if (iter.frag)
ip_fraglist_prepare(skb, &iter);
+ skb->tstamp = tstamp;
err = output(net, sk, data, skb);
if (err || !iter.frag)
break;
@@ -94,7 +95,7 @@ slow_path:
* This may also be a clone skbuff, we could preserve the geometry for
* the copies but probably not worth the effort.
*/
- ip_frag_init(skb, hlen, ll_rs, frag_max_size, &state);
+ ip_frag_init(skb, hlen, ll_rs, frag_max_size, false, &state);
while (state.left > 0) {
struct sk_buff *skb2;
@@ -105,6 +106,7 @@ slow_path:
goto blackhole;
}
+ skb2->tstamp = tstamp;
err = output(net, sk, data, skb2);
if (err)
goto blackhole;
@@ -279,7 +281,7 @@ static unsigned int nf_ct_bridge_pre(void *priv, struct sk_buff *skb,
}
static void nf_ct_bridge_frag_save(struct sk_buff *skb,
- struct nf_ct_bridge_frag_data *data)
+ struct nf_bridge_frag_data *data)
{
if (skb_vlan_tag_present(skb)) {
data->vlan_present = true;
@@ -294,10 +296,10 @@ static void nf_ct_bridge_frag_save(struct sk_buff *skb,
static unsigned int
nf_ct_bridge_refrag(struct sk_buff *skb, const struct nf_hook_state *state,
int (*output)(struct net *, struct sock *sk,
- const struct nf_ct_bridge_frag_data *data,
+ const struct nf_bridge_frag_data *data,
struct sk_buff *))
{
- struct nf_ct_bridge_frag_data data;
+ struct nf_bridge_frag_data data;
if (!BR_INPUT_SKB_CB(skb)->frag_max_size)
return NF_ACCEPT;
@@ -320,7 +322,7 @@ nf_ct_bridge_refrag(struct sk_buff *skb, const struct nf_hook_state *state,
/* Actually only slow path refragmentation needs this. */
static int nf_ct_bridge_frag_restore(struct sk_buff *skb,
- const struct nf_ct_bridge_frag_data *data)
+ const struct nf_bridge_frag_data *data)
{
int err;
@@ -341,7 +343,7 @@ static int nf_ct_bridge_frag_restore(struct sk_buff *skb,
}
static int nf_ct_bridge_refrag_post(struct net *net, struct sock *sk,
- const struct nf_ct_bridge_frag_data *data,
+ const struct nf_bridge_frag_data *data,
struct sk_buff *skb)
{
int err;
diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c
index 1804e867f715..7c9e92b2f806 100644
--- a/net/bridge/netfilter/nft_meta_bridge.c
+++ b/net/bridge/netfilter/nft_meta_bridge.c
@@ -53,7 +53,7 @@ static void nft_meta_bridge_get_eval(const struct nft_expr *expr,
goto err;
br_vlan_get_proto(br_dev, &p_proto);
- nft_reg_store16(dest, p_proto);
+ nft_reg_store16(dest, htons(p_proto));
return;
}
default:
diff --git a/net/caif/Kconfig b/net/caif/Kconfig
index eb83051c8330..b7532a79ca7a 100644
--- a/net/caif/Kconfig
+++ b/net/caif/Kconfig
@@ -13,11 +13,11 @@ menuconfig CAIF
with its modems. It is accessed from user space as sockets (PF_CAIF).
Say Y (or M) here if you build for a phone product (e.g. Android or
- MeeGo ) that uses CAIF as transport, if unsure say N.
+ MeeGo) that uses CAIF as transport. If unsure say N.
If you select to build it as module then CAIF_NETDEV also needs to be
- built as modules. You will also need to say yes to any CAIF physical
- devices that your platform requires.
+ built as a module. You will also need to say Y (or M) to any CAIF
+ physical devices that your platform requires.
See Documentation/networking/caif for a further explanation on how to
use and configure CAIF.
@@ -37,7 +37,7 @@ config CAIF_NETDEV
default CAIF
---help---
Say Y if you will be using a CAIF based GPRS network device.
- This can be either built-in or a loadable module,
+ This can be either built-in or a loadable module.
If you select to build it as a built-in then the main CAIF device must
also be a built-in.
If unsure say Y.
@@ -48,7 +48,7 @@ config CAIF_USB
default n
---help---
Say Y if you are using CAIF over USB CDC NCM.
- This can be either built-in or a loadable module,
+ This can be either built-in or a loadable module.
If you select to build it as a built-in then the main CAIF device must
also be a built-in.
If unsure say N.
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index 13ea920600ae..ef14da50a981 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -953,7 +953,7 @@ static __poll_t caif_poll(struct file *file,
mask |= EPOLLRDHUP;
/* readable? */
- if (!skb_queue_empty(&sk->sk_receive_queue) ||
+ if (!skb_queue_empty_lockless(&sk->sk_receive_queue) ||
(sk->sk_shutdown & RCV_SHUTDOWN))
mask |= EPOLLIN | EPOLLRDNORM;
diff --git a/net/caif/caif_usb.c b/net/caif/caif_usb.c
index 76bd67891fb3..a0116b9503d9 100644
--- a/net/caif/caif_usb.c
+++ b/net/caif/caif_usb.c
@@ -62,7 +62,7 @@ static int cfusbl_transmit(struct cflayer *layr, struct cfpkt *pkt)
hpad = (info->hdr_len + CFUSB_PAD_DESCR_SZ) & (CFUSB_ALIGNMENT - 1);
if (skb_headroom(skb) < ETH_HLEN + CFUSB_PAD_DESCR_SZ + hpad) {
- pr_warn("Headroom to small\n");
+ pr_warn("Headroom too small\n");
kfree_skb(skb);
return -EIO;
}
diff --git a/net/can/Kconfig b/net/can/Kconfig
index 0f9fe846ddef..d77042752457 100644
--- a/net/can/Kconfig
+++ b/net/can/Kconfig
@@ -8,11 +8,12 @@ menuconfig CAN
tristate "CAN bus subsystem support"
---help---
Controller Area Network (CAN) is a slow (up to 1Mbit/s) serial
- communications protocol which was developed by Bosch in
- 1991, mainly for automotive, but now widely used in marine
- (NMEA2000), industrial, and medical applications.
- More information on the CAN network protocol family PF_CAN
- is contained in <Documentation/networking/can.rst>.
+ communications protocol. Development of the CAN bus started in
+ 1983 at Robert Bosch GmbH, and the protocol was officially
+ released in 1986. The CAN bus was originally mainly for automotive,
+ but is now widely used in marine (NMEA2000), industrial, and medical
+ applications. More information on the CAN network protocol family
+ PF_CAN is contained in <Documentation/networking/can.rst>.
If you want CAN support you should say Y here and also to the
specific driver for your controller(s) below.
@@ -52,6 +53,8 @@ config CAN_GW
They can be modified with AND/OR/XOR/SET operations as configured
by the netlink configuration interface known e.g. from iptables.
+source "net/can/j1939/Kconfig"
+
source "drivers/net/can/Kconfig"
endif
diff --git a/net/can/Makefile b/net/can/Makefile
index 1242bbbfe57f..08bd217fc051 100644
--- a/net/can/Makefile
+++ b/net/can/Makefile
@@ -15,3 +15,5 @@ can-bcm-y := bcm.o
obj-$(CONFIG_CAN_GW) += can-gw.o
can-gw-y := gw.o
+
+obj-$(CONFIG_CAN_J1939) += j1939/
diff --git a/net/can/af_can.c b/net/can/af_can.c
index 80281ef2ccbd..128d37a4c2e0 100644
--- a/net/can/af_can.c
+++ b/net/can/af_can.c
@@ -1,5 +1,5 @@
-/*
- * af_can.c - Protocol family CAN core module
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+/* af_can.c - Protocol family CAN core module
* (used by different CAN protocol modules)
*
* Copyright (c) 2002-2017 Volkswagen Group Electronic Research
@@ -58,6 +58,7 @@
#include <linux/can.h>
#include <linux/can/core.h>
#include <linux/can/skb.h>
+#include <linux/can/can-ml.h>
#include <linux/ratelimit.h>
#include <net/net_namespace.h>
#include <net/sock.h>
@@ -83,24 +84,14 @@ static DEFINE_MUTEX(proto_tab_lock);
static atomic_t skbcounter = ATOMIC_INIT(0);
-/*
- * af_can socket functions
- */
-
-int can_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
-{
- switch (cmd) {
- default:
- return -ENOIOCTLCMD;
- }
-}
-EXPORT_SYMBOL(can_ioctl);
+/* af_can socket functions */
-static void can_sock_destruct(struct sock *sk)
+void can_sock_destruct(struct sock *sk)
{
skb_queue_purge(&sk->sk_receive_queue);
skb_queue_purge(&sk->sk_error_queue);
}
+EXPORT_SYMBOL(can_sock_destruct);
static const struct can_proto *can_get_proto(int protocol)
{
@@ -140,14 +131,13 @@ static int can_create(struct net *net, struct socket *sock, int protocol,
err = request_module("can-proto-%d", protocol);
- /*
- * In case of error we only print a message but don't
+ /* In case of error we only print a message but don't
* return the error code immediately. Below we will
* return -EPROTONOSUPPORT
*/
if (err)
- printk_ratelimited(KERN_ERR "can: request_module "
- "(can-proto-%d) failed.\n", protocol);
+ pr_err_ratelimited("can: request_module (can-proto-%d) failed.\n",
+ protocol);
cp = can_get_proto(protocol);
}
@@ -188,9 +178,7 @@ static int can_create(struct net *net, struct socket *sock, int protocol,
return err;
}
-/*
- * af_can tx path
- */
+/* af_can tx path */
/**
* can_send - transmit a CAN frame (optional with local loopback)
@@ -212,7 +200,7 @@ int can_send(struct sk_buff *skb, int loop)
{
struct sk_buff *newskb = NULL;
struct canfd_frame *cfd = (struct canfd_frame *)skb->data;
- struct s_stats *can_stats = dev_net(skb->dev)->can.can_stats;
+ struct can_pkg_stats *pkg_stats = dev_net(skb->dev)->can.pkg_stats;
int err = -EINVAL;
if (skb->len == CAN_MTU) {
@@ -223,11 +211,11 @@ int can_send(struct sk_buff *skb, int loop)
skb->protocol = htons(ETH_P_CANFD);
if (unlikely(cfd->len > CANFD_MAX_DLEN))
goto inval_skb;
- } else
+ } else {
goto inval_skb;
+ }
- /*
- * Make sure the CAN frame can pass the selected CAN netdevice.
+ /* Make sure the CAN frame can pass the selected CAN netdevice.
* As structs can_frame and canfd_frame are similar, we can provide
* CAN FD frames to legacy CAN drivers as long as the length is <= 8
*/
@@ -258,8 +246,7 @@ int can_send(struct sk_buff *skb, int loop)
/* indication for the CAN driver: do loopback */
skb->pkt_type = PACKET_LOOPBACK;
- /*
- * The reference to the originating sock may be required
+ /* The reference to the originating sock may be required
* by the receiving socket to check whether the frame is
* its own. Example: can_raw sockopt CAN_RAW_RECV_OWN_MSGS
* Therefore we have to ensure that skb->sk remains the
@@ -268,8 +255,7 @@ int can_send(struct sk_buff *skb, int loop)
*/
if (!(skb->dev->flags & IFF_ECHO)) {
- /*
- * If the interface is not capable to do loopback
+ /* If the interface is not capable to do loopback
* itself, we do it here.
*/
newskb = skb_clone(skb, GFP_ATOMIC);
@@ -301,8 +287,8 @@ int can_send(struct sk_buff *skb, int loop)
netif_rx_ni(newskb);
/* update statistics */
- can_stats->tx_frames++;
- can_stats->tx_frames_delta++;
+ pkg_stats->tx_frames++;
+ pkg_stats->tx_frames_delta++;
return 0;
@@ -312,17 +298,17 @@ inval_skb:
}
EXPORT_SYMBOL(can_send);
-/*
- * af_can rx path
- */
+/* af_can rx path */
-static struct can_dev_rcv_lists *find_dev_rcv_lists(struct net *net,
- struct net_device *dev)
+static struct can_dev_rcv_lists *can_dev_rcv_lists_find(struct net *net,
+ struct net_device *dev)
{
- if (!dev)
- return net->can.can_rx_alldev_list;
- else
- return (struct can_dev_rcv_lists *)dev->ml_priv;
+ if (dev) {
+ struct can_ml_priv *ml_priv = dev->ml_priv;
+ return &ml_priv->dev_rcv_lists;
+ } else {
+ return net->can.rx_alldev_list;
+ }
}
/**
@@ -349,7 +335,7 @@ static unsigned int effhash(canid_t can_id)
}
/**
- * find_rcv_list - determine optimal filterlist inside device filter struct
+ * can_rcv_list_find - determine optimal filterlist inside device filter struct
* @can_id: pointer to CAN identifier of a given can_filter
* @mask: pointer to CAN mask of a given can_filter
* @d: pointer to the device filter struct
@@ -375,8 +361,8 @@ static unsigned int effhash(canid_t can_id)
* Constistency checked mask.
* Reduced can_id to have a preprocessed filter compare value.
*/
-static struct hlist_head *find_rcv_list(canid_t *can_id, canid_t *mask,
- struct can_dev_rcv_lists *d)
+static struct hlist_head *can_rcv_list_find(canid_t *can_id, canid_t *mask,
+ struct can_dev_rcv_lists *dev_rcv_lists)
{
canid_t inv = *can_id & CAN_INV_FILTER; /* save flag before masking */
@@ -384,7 +370,7 @@ static struct hlist_head *find_rcv_list(canid_t *can_id, canid_t *mask,
if (*mask & CAN_ERR_FLAG) {
/* clear CAN_ERR_FLAG in filter entry */
*mask &= CAN_ERR_MASK;
- return &d->rx[RX_ERR];
+ return &dev_rcv_lists->rx[RX_ERR];
}
/* with cleared CAN_ERR_FLAG we have a simple mask/value filterpair */
@@ -400,27 +386,26 @@ static struct hlist_head *find_rcv_list(canid_t *can_id, canid_t *mask,
/* inverse can_id/can_mask filter */
if (inv)
- return &d->rx[RX_INV];
+ return &dev_rcv_lists->rx[RX_INV];
/* mask == 0 => no condition testing at receive time */
if (!(*mask))
- return &d->rx[RX_ALL];
+ return &dev_rcv_lists->rx[RX_ALL];
/* extra filterlists for the subscription of a single non-RTR can_id */
if (((*mask & CAN_EFF_RTR_FLAGS) == CAN_EFF_RTR_FLAGS) &&
!(*can_id & CAN_RTR_FLAG)) {
-
if (*can_id & CAN_EFF_FLAG) {
if (*mask == (CAN_EFF_MASK | CAN_EFF_RTR_FLAGS))
- return &d->rx_eff[effhash(*can_id)];
+ return &dev_rcv_lists->rx_eff[effhash(*can_id)];
} else {
if (*mask == (CAN_SFF_MASK | CAN_EFF_RTR_FLAGS))
- return &d->rx_sff[*can_id];
+ return &dev_rcv_lists->rx_sff[*can_id];
}
}
/* default: filter via can_id/can_mask */
- return &d->rx[RX_FIL];
+ return &dev_rcv_lists->rx[RX_FIL];
}
/**
@@ -457,10 +442,10 @@ int can_rx_register(struct net *net, struct net_device *dev, canid_t can_id,
canid_t mask, void (*func)(struct sk_buff *, void *),
void *data, char *ident, struct sock *sk)
{
- struct receiver *r;
- struct hlist_head *rl;
- struct can_dev_rcv_lists *d;
- struct s_pstats *can_pstats = net->can.can_pstats;
+ struct receiver *rcv;
+ struct hlist_head *rcv_list;
+ struct can_dev_rcv_lists *dev_rcv_lists;
+ struct can_rcv_lists_stats *rcv_lists_stats = net->can.rcv_lists_stats;
int err = 0;
/* insert new receiver (dev,canid,mask) -> (func,data) */
@@ -471,50 +456,42 @@ int can_rx_register(struct net *net, struct net_device *dev, canid_t can_id,
if (dev && !net_eq(net, dev_net(dev)))
return -ENODEV;
- r = kmem_cache_alloc(rcv_cache, GFP_KERNEL);
- if (!r)
+ rcv = kmem_cache_alloc(rcv_cache, GFP_KERNEL);
+ if (!rcv)
return -ENOMEM;
- spin_lock(&net->can.can_rcvlists_lock);
+ spin_lock_bh(&net->can.rcvlists_lock);
- d = find_dev_rcv_lists(net, dev);
- if (d) {
- rl = find_rcv_list(&can_id, &mask, d);
+ dev_rcv_lists = can_dev_rcv_lists_find(net, dev);
+ rcv_list = can_rcv_list_find(&can_id, &mask, dev_rcv_lists);
- r->can_id = can_id;
- r->mask = mask;
- r->matches = 0;
- r->func = func;
- r->data = data;
- r->ident = ident;
- r->sk = sk;
+ rcv->can_id = can_id;
+ rcv->mask = mask;
+ rcv->matches = 0;
+ rcv->func = func;
+ rcv->data = data;
+ rcv->ident = ident;
+ rcv->sk = sk;
- hlist_add_head_rcu(&r->list, rl);
- d->entries++;
+ hlist_add_head_rcu(&rcv->list, rcv_list);
+ dev_rcv_lists->entries++;
- can_pstats->rcv_entries++;
- if (can_pstats->rcv_entries_max < can_pstats->rcv_entries)
- can_pstats->rcv_entries_max = can_pstats->rcv_entries;
- } else {
- kmem_cache_free(rcv_cache, r);
- err = -ENODEV;
- }
-
- spin_unlock(&net->can.can_rcvlists_lock);
+ rcv_lists_stats->rcv_entries++;
+ rcv_lists_stats->rcv_entries_max = max(rcv_lists_stats->rcv_entries_max,
+ rcv_lists_stats->rcv_entries);
+ spin_unlock_bh(&net->can.rcvlists_lock);
return err;
}
EXPORT_SYMBOL(can_rx_register);
-/*
- * can_rx_delete_receiver - rcu callback for single receiver entry removal
- */
+/* can_rx_delete_receiver - rcu callback for single receiver entry removal */
static void can_rx_delete_receiver(struct rcu_head *rp)
{
- struct receiver *r = container_of(rp, struct receiver, rcu);
- struct sock *sk = r->sk;
+ struct receiver *rcv = container_of(rp, struct receiver, rcu);
+ struct sock *sk = rcv->sk;
- kmem_cache_free(rcv_cache, r);
+ kmem_cache_free(rcv_cache, rcv);
if (sk)
sock_put(sk);
}
@@ -534,10 +511,10 @@ void can_rx_unregister(struct net *net, struct net_device *dev, canid_t can_id,
canid_t mask, void (*func)(struct sk_buff *, void *),
void *data)
{
- struct receiver *r = NULL;
- struct hlist_head *rl;
- struct s_pstats *can_pstats = net->can.can_pstats;
- struct can_dev_rcv_lists *d;
+ struct receiver *rcv = NULL;
+ struct hlist_head *rcv_list;
+ struct can_rcv_lists_stats *rcv_lists_stats = net->can.rcv_lists_stats;
+ struct can_dev_rcv_lists *dev_rcv_lists;
if (dev && dev->type != ARPHRD_CAN)
return;
@@ -545,86 +522,69 @@ void can_rx_unregister(struct net *net, struct net_device *dev, canid_t can_id,
if (dev && !net_eq(net, dev_net(dev)))
return;
- spin_lock(&net->can.can_rcvlists_lock);
+ spin_lock_bh(&net->can.rcvlists_lock);
- d = find_dev_rcv_lists(net, dev);
- if (!d) {
- pr_err("BUG: receive list not found for "
- "dev %s, id %03X, mask %03X\n",
- DNAME(dev), can_id, mask);
- goto out;
- }
+ dev_rcv_lists = can_dev_rcv_lists_find(net, dev);
+ rcv_list = can_rcv_list_find(&can_id, &mask, dev_rcv_lists);
- rl = find_rcv_list(&can_id, &mask, d);
-
- /*
- * Search the receiver list for the item to delete. This should
+ /* Search the receiver list for the item to delete. This should
* exist, since no receiver may be unregistered that hasn't
* been registered before.
*/
-
- hlist_for_each_entry_rcu(r, rl, list) {
- if (r->can_id == can_id && r->mask == mask &&
- r->func == func && r->data == data)
+ hlist_for_each_entry_rcu(rcv, rcv_list, list) {
+ if (rcv->can_id == can_id && rcv->mask == mask &&
+ rcv->func == func && rcv->data == data)
break;
}
- /*
- * Check for bugs in CAN protocol implementations using af_can.c:
- * 'r' will be NULL if no matching list item was found for removal.
+ /* Check for bugs in CAN protocol implementations using af_can.c:
+ * 'rcv' will be NULL if no matching list item was found for removal.
*/
-
- if (!r) {
- WARN(1, "BUG: receive list entry not found for dev %s, "
- "id %03X, mask %03X\n", DNAME(dev), can_id, mask);
+ if (!rcv) {
+ WARN(1, "BUG: receive list entry not found for dev %s, id %03X, mask %03X\n",
+ DNAME(dev), can_id, mask);
goto out;
}
- hlist_del_rcu(&r->list);
- d->entries--;
+ hlist_del_rcu(&rcv->list);
+ dev_rcv_lists->entries--;
- if (can_pstats->rcv_entries > 0)
- can_pstats->rcv_entries--;
-
- /* remove device structure requested by NETDEV_UNREGISTER */
- if (d->remove_on_zero_entries && !d->entries) {
- kfree(d);
- dev->ml_priv = NULL;
- }
+ if (rcv_lists_stats->rcv_entries > 0)
+ rcv_lists_stats->rcv_entries--;
out:
- spin_unlock(&net->can.can_rcvlists_lock);
+ spin_unlock_bh(&net->can.rcvlists_lock);
/* schedule the receiver item for deletion */
- if (r) {
- if (r->sk)
- sock_hold(r->sk);
- call_rcu(&r->rcu, can_rx_delete_receiver);
+ if (rcv) {
+ if (rcv->sk)
+ sock_hold(rcv->sk);
+ call_rcu(&rcv->rcu, can_rx_delete_receiver);
}
}
EXPORT_SYMBOL(can_rx_unregister);
-static inline void deliver(struct sk_buff *skb, struct receiver *r)
+static inline void deliver(struct sk_buff *skb, struct receiver *rcv)
{
- r->func(skb, r->data);
- r->matches++;
+ rcv->func(skb, rcv->data);
+ rcv->matches++;
}
-static int can_rcv_filter(struct can_dev_rcv_lists *d, struct sk_buff *skb)
+static int can_rcv_filter(struct can_dev_rcv_lists *dev_rcv_lists, struct sk_buff *skb)
{
- struct receiver *r;
+ struct receiver *rcv;
int matches = 0;
struct can_frame *cf = (struct can_frame *)skb->data;
canid_t can_id = cf->can_id;
- if (d->entries == 0)
+ if (dev_rcv_lists->entries == 0)
return 0;
if (can_id & CAN_ERR_FLAG) {
/* check for error message frame entries only */
- hlist_for_each_entry_rcu(r, &d->rx[RX_ERR], list) {
- if (can_id & r->mask) {
- deliver(skb, r);
+ hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx[RX_ERR], list) {
+ if (can_id & rcv->mask) {
+ deliver(skb, rcv);
matches++;
}
}
@@ -632,23 +592,23 @@ static int can_rcv_filter(struct can_dev_rcv_lists *d, struct sk_buff *skb)
}
/* check for unfiltered entries */
- hlist_for_each_entry_rcu(r, &d->rx[RX_ALL], list) {
- deliver(skb, r);
+ hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx[RX_ALL], list) {
+ deliver(skb, rcv);
matches++;
}
/* check for can_id/mask entries */
- hlist_for_each_entry_rcu(r, &d->rx[RX_FIL], list) {
- if ((can_id & r->mask) == r->can_id) {
- deliver(skb, r);
+ hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx[RX_FIL], list) {
+ if ((can_id & rcv->mask) == rcv->can_id) {
+ deliver(skb, rcv);
matches++;
}
}
/* check for inverted can_id/mask entries */
- hlist_for_each_entry_rcu(r, &d->rx[RX_INV], list) {
- if ((can_id & r->mask) != r->can_id) {
- deliver(skb, r);
+ hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx[RX_INV], list) {
+ if ((can_id & rcv->mask) != rcv->can_id) {
+ deliver(skb, rcv);
matches++;
}
}
@@ -658,16 +618,16 @@ static int can_rcv_filter(struct can_dev_rcv_lists *d, struct sk_buff *skb)
return matches;
if (can_id & CAN_EFF_FLAG) {
- hlist_for_each_entry_rcu(r, &d->rx_eff[effhash(can_id)], list) {
- if (r->can_id == can_id) {
- deliver(skb, r);
+ hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx_eff[effhash(can_id)], list) {
+ if (rcv->can_id == can_id) {
+ deliver(skb, rcv);
matches++;
}
}
} else {
can_id &= CAN_SFF_MASK;
- hlist_for_each_entry_rcu(r, &d->rx_sff[can_id], list) {
- deliver(skb, r);
+ hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx_sff[can_id], list) {
+ deliver(skb, rcv);
matches++;
}
}
@@ -677,14 +637,14 @@ static int can_rcv_filter(struct can_dev_rcv_lists *d, struct sk_buff *skb)
static void can_receive(struct sk_buff *skb, struct net_device *dev)
{
- struct can_dev_rcv_lists *d;
+ struct can_dev_rcv_lists *dev_rcv_lists;
struct net *net = dev_net(dev);
- struct s_stats *can_stats = net->can.can_stats;
+ struct can_pkg_stats *pkg_stats = net->can.pkg_stats;
int matches;
/* update statistics */
- can_stats->rx_frames++;
- can_stats->rx_frames_delta++;
+ pkg_stats->rx_frames++;
+ pkg_stats->rx_frames_delta++;
/* create non-zero unique skb identifier together with *skb */
while (!(can_skb_prv(skb)->skbcnt))
@@ -693,12 +653,11 @@ static void can_receive(struct sk_buff *skb, struct net_device *dev)
rcu_read_lock();
/* deliver the packet to sockets listening on all devices */
- matches = can_rcv_filter(net->can.can_rx_alldev_list, skb);
+ matches = can_rcv_filter(net->can.rx_alldev_list, skb);
/* find receive list for this device */
- d = find_dev_rcv_lists(net, dev);
- if (d)
- matches += can_rcv_filter(d, skb);
+ dev_rcv_lists = can_dev_rcv_lists_find(net, dev);
+ matches += can_rcv_filter(dev_rcv_lists, skb);
rcu_read_unlock();
@@ -706,8 +665,8 @@ static void can_receive(struct sk_buff *skb, struct net_device *dev)
consume_skb(skb);
if (matches > 0) {
- can_stats->matches++;
- can_stats->matches_delta++;
+ pkg_stats->matches++;
+ pkg_stats->matches_delta++;
}
}
@@ -729,7 +688,7 @@ static int can_rcv(struct sk_buff *skb, struct net_device *dev,
}
static int canfd_rcv(struct sk_buff *skb, struct net_device *dev,
- struct packet_type *pt, struct net_device *orig_dev)
+ struct packet_type *pt, struct net_device *orig_dev)
{
struct canfd_frame *cfd = (struct canfd_frame *)skb->data;
@@ -745,9 +704,7 @@ static int canfd_rcv(struct sk_buff *skb, struct net_device *dev,
return NET_RX_SUCCESS;
}
-/*
- * af_can protocol functions
- */
+/* af_can protocol functions */
/**
* can_proto_register - register CAN transport protocol
@@ -778,8 +735,9 @@ int can_proto_register(const struct can_proto *cp)
if (rcu_access_pointer(proto_tab[proto])) {
pr_err("can: protocol %d already registered\n", proto);
err = -EBUSY;
- } else
+ } else {
RCU_INIT_POINTER(proto_tab[proto], cp);
+ }
mutex_unlock(&proto_tab_lock);
@@ -809,48 +767,19 @@ void can_proto_unregister(const struct can_proto *cp)
}
EXPORT_SYMBOL(can_proto_unregister);
-/*
- * af_can notifier to create/remove CAN netdevice specific structs
- */
+/* af_can notifier to create/remove CAN netdevice specific structs */
static int can_notifier(struct notifier_block *nb, unsigned long msg,
void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
- struct can_dev_rcv_lists *d;
if (dev->type != ARPHRD_CAN)
return NOTIFY_DONE;
switch (msg) {
-
case NETDEV_REGISTER:
-
- /* create new dev_rcv_lists for this device */
- d = kzalloc(sizeof(*d), GFP_KERNEL);
- if (!d)
- return NOTIFY_DONE;
- BUG_ON(dev->ml_priv);
- dev->ml_priv = d;
-
- break;
-
- case NETDEV_UNREGISTER:
- spin_lock(&dev_net(dev)->can.can_rcvlists_lock);
-
- d = dev->ml_priv;
- if (d) {
- if (d->entries)
- d->remove_on_zero_entries = 1;
- else {
- kfree(d);
- dev->ml_priv = NULL;
- }
- } else
- pr_err("can: notifier: receive list not found for dev "
- "%s\n", dev->name);
-
- spin_unlock(&dev_net(dev)->can.can_rcvlists_lock);
-
+ WARN(!dev->ml_priv,
+ "No CAN mid layer private allocated, please fix your driver and use alloc_candev()!\n");
break;
}
@@ -859,71 +788,54 @@ static int can_notifier(struct notifier_block *nb, unsigned long msg,
static int can_pernet_init(struct net *net)
{
- spin_lock_init(&net->can.can_rcvlists_lock);
- net->can.can_rx_alldev_list =
- kzalloc(sizeof(struct can_dev_rcv_lists), GFP_KERNEL);
- if (!net->can.can_rx_alldev_list)
+ spin_lock_init(&net->can.rcvlists_lock);
+ net->can.rx_alldev_list =
+ kzalloc(sizeof(*net->can.rx_alldev_list), GFP_KERNEL);
+ if (!net->can.rx_alldev_list)
goto out;
- net->can.can_stats = kzalloc(sizeof(struct s_stats), GFP_KERNEL);
- if (!net->can.can_stats)
- goto out_free_alldev_list;
- net->can.can_pstats = kzalloc(sizeof(struct s_pstats), GFP_KERNEL);
- if (!net->can.can_pstats)
- goto out_free_can_stats;
+ net->can.pkg_stats = kzalloc(sizeof(*net->can.pkg_stats), GFP_KERNEL);
+ if (!net->can.pkg_stats)
+ goto out_free_rx_alldev_list;
+ net->can.rcv_lists_stats = kzalloc(sizeof(*net->can.rcv_lists_stats), GFP_KERNEL);
+ if (!net->can.rcv_lists_stats)
+ goto out_free_pkg_stats;
if (IS_ENABLED(CONFIG_PROC_FS)) {
/* the statistics are updated every second (timer triggered) */
if (stats_timer) {
- timer_setup(&net->can.can_stattimer, can_stat_update,
+ timer_setup(&net->can.stattimer, can_stat_update,
0);
- mod_timer(&net->can.can_stattimer,
+ mod_timer(&net->can.stattimer,
round_jiffies(jiffies + HZ));
}
- net->can.can_stats->jiffies_init = jiffies;
+ net->can.pkg_stats->jiffies_init = jiffies;
can_init_proc(net);
}
return 0;
- out_free_can_stats:
- kfree(net->can.can_stats);
- out_free_alldev_list:
- kfree(net->can.can_rx_alldev_list);
+ out_free_pkg_stats:
+ kfree(net->can.pkg_stats);
+ out_free_rx_alldev_list:
+ kfree(net->can.rx_alldev_list);
out:
return -ENOMEM;
}
static void can_pernet_exit(struct net *net)
{
- struct net_device *dev;
-
if (IS_ENABLED(CONFIG_PROC_FS)) {
can_remove_proc(net);
if (stats_timer)
- del_timer_sync(&net->can.can_stattimer);
+ del_timer_sync(&net->can.stattimer);
}
- /* remove created dev_rcv_lists from still registered CAN devices */
- rcu_read_lock();
- for_each_netdev_rcu(net, dev) {
- if (dev->type == ARPHRD_CAN && dev->ml_priv) {
- struct can_dev_rcv_lists *d = dev->ml_priv;
-
- BUG_ON(d->entries);
- kfree(d);
- dev->ml_priv = NULL;
- }
- }
- rcu_read_unlock();
-
- kfree(net->can.can_rx_alldev_list);
- kfree(net->can.can_stats);
- kfree(net->can.can_pstats);
+ kfree(net->can.rx_alldev_list);
+ kfree(net->can.pkg_stats);
+ kfree(net->can.rcv_lists_stats);
}
-/*
- * af_can module init/exit functions
- */
+/* af_can module init/exit functions */
static struct packet_type can_packet __read_mostly = {
.type = cpu_to_be16(ETH_P_CAN),
diff --git a/net/can/af_can.h b/net/can/af_can.h
index 9cb3719632bd..7c2d9161e224 100644
--- a/net/can/af_can.h
+++ b/net/can/af_can.h
@@ -1,5 +1,5 @@
-/*
- * Copyright (c) 2002-2007 Volkswagen Group Electronic Research
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
+/* Copyright (c) 2002-2007 Volkswagen Group Electronic Research
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -53,32 +53,17 @@ struct receiver {
canid_t can_id;
canid_t mask;
unsigned long matches;
- void (*func)(struct sk_buff *, void *);
+ void (*func)(struct sk_buff *skb, void *data);
void *data;
char *ident;
struct sock *sk;
struct rcu_head rcu;
};
-#define CAN_SFF_RCV_ARRAY_SZ (1 << CAN_SFF_ID_BITS)
-#define CAN_EFF_RCV_HASH_BITS 10
-#define CAN_EFF_RCV_ARRAY_SZ (1 << CAN_EFF_RCV_HASH_BITS)
-
-enum { RX_ERR, RX_ALL, RX_FIL, RX_INV, RX_MAX };
-
-/* per device receive filters linked at dev->ml_priv */
-struct can_dev_rcv_lists {
- struct hlist_head rx[RX_MAX];
- struct hlist_head rx_sff[CAN_SFF_RCV_ARRAY_SZ];
- struct hlist_head rx_eff[CAN_EFF_RCV_ARRAY_SZ];
- int remove_on_zero_entries;
- int entries;
-};
-
/* statistic structures */
/* can be reset e.g. by can_init_stats() */
-struct s_stats {
+struct can_pkg_stats {
unsigned long jiffies_init;
unsigned long rx_frames;
@@ -103,7 +88,7 @@ struct s_stats {
};
/* persistent statistics */
-struct s_pstats {
+struct can_rcv_lists_stats {
unsigned long stats_reset;
unsigned long user_reset;
unsigned long rcv_entries;
diff --git a/net/can/bcm.c b/net/can/bcm.c
index a34ee52f19ea..c96fa0f33db3 100644
--- a/net/can/bcm.c
+++ b/net/can/bcm.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)
/*
* bcm.c - Broadcast Manager to filter/send (cyclic) CAN content
*
@@ -105,7 +106,6 @@ struct bcm_op {
unsigned long frames_abs, frames_filtered;
struct bcm_timeval ival1, ival2;
struct hrtimer timer, thrtimer;
- struct tasklet_struct tsklet, thrtsklet;
ktime_t rx_stamp, kt_ival1, kt_ival2, kt_lastmsg;
int rx_ifindex;
int cfsiz;
@@ -370,25 +370,34 @@ static void bcm_send_to_user(struct bcm_op *op, struct bcm_msg_head *head,
}
}
-static void bcm_tx_start_timer(struct bcm_op *op)
+static bool bcm_tx_set_expiry(struct bcm_op *op, struct hrtimer *hrt)
{
+ ktime_t ival;
+
if (op->kt_ival1 && op->count)
- hrtimer_start(&op->timer,
- ktime_add(ktime_get(), op->kt_ival1),
- HRTIMER_MODE_ABS);
+ ival = op->kt_ival1;
else if (op->kt_ival2)
- hrtimer_start(&op->timer,
- ktime_add(ktime_get(), op->kt_ival2),
- HRTIMER_MODE_ABS);
+ ival = op->kt_ival2;
+ else
+ return false;
+
+ hrtimer_set_expires(hrt, ktime_add(ktime_get(), ival));
+ return true;
}
-static void bcm_tx_timeout_tsklet(unsigned long data)
+static void bcm_tx_start_timer(struct bcm_op *op)
{
- struct bcm_op *op = (struct bcm_op *)data;
+ if (bcm_tx_set_expiry(op, &op->timer))
+ hrtimer_start_expires(&op->timer, HRTIMER_MODE_ABS_SOFT);
+}
+
+/* bcm_tx_timeout_handler - performs cyclic CAN frame transmissions */
+static enum hrtimer_restart bcm_tx_timeout_handler(struct hrtimer *hrtimer)
+{
+ struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer);
struct bcm_msg_head msg_head;
if (op->kt_ival1 && (op->count > 0)) {
-
op->count--;
if (!op->count && (op->flags & TX_COUNTEVT)) {
@@ -405,22 +414,12 @@ static void bcm_tx_timeout_tsklet(unsigned long data)
}
bcm_can_tx(op);
- } else if (op->kt_ival2)
+ } else if (op->kt_ival2) {
bcm_can_tx(op);
+ }
- bcm_tx_start_timer(op);
-}
-
-/*
- * bcm_tx_timeout_handler - performs cyclic CAN frame transmissions
- */
-static enum hrtimer_restart bcm_tx_timeout_handler(struct hrtimer *hrtimer)
-{
- struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer);
-
- tasklet_schedule(&op->tsklet);
-
- return HRTIMER_NORESTART;
+ return bcm_tx_set_expiry(op, &op->timer) ?
+ HRTIMER_RESTART : HRTIMER_NORESTART;
}
/*
@@ -486,7 +485,7 @@ static void bcm_rx_update_and_send(struct bcm_op *op,
/* do not send the saved data - only start throttle timer */
hrtimer_start(&op->thrtimer,
ktime_add(op->kt_lastmsg, op->kt_ival2),
- HRTIMER_MODE_ABS);
+ HRTIMER_MODE_ABS_SOFT);
return;
}
@@ -545,14 +544,21 @@ static void bcm_rx_starttimer(struct bcm_op *op)
return;
if (op->kt_ival1)
- hrtimer_start(&op->timer, op->kt_ival1, HRTIMER_MODE_REL);
+ hrtimer_start(&op->timer, op->kt_ival1, HRTIMER_MODE_REL_SOFT);
}
-static void bcm_rx_timeout_tsklet(unsigned long data)
+/* bcm_rx_timeout_handler - when the (cyclic) CAN frame reception timed out */
+static enum hrtimer_restart bcm_rx_timeout_handler(struct hrtimer *hrtimer)
{
- struct bcm_op *op = (struct bcm_op *)data;
+ struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer);
struct bcm_msg_head msg_head;
+ /* if user wants to be informed, when cyclic CAN-Messages come back */
+ if ((op->flags & RX_ANNOUNCE_RESUME) && op->last_frames) {
+ /* clear received CAN frames to indicate 'nothing received' */
+ memset(op->last_frames, 0, op->nframes * op->cfsiz);
+ }
+
/* create notification to user */
msg_head.opcode = RX_TIMEOUT;
msg_head.flags = op->flags;
@@ -563,25 +569,6 @@ static void bcm_rx_timeout_tsklet(unsigned long data)
msg_head.nframes = 0;
bcm_send_to_user(op, &msg_head, NULL, 0);
-}
-
-/*
- * bcm_rx_timeout_handler - when the (cyclic) CAN frame reception timed out
- */
-static enum hrtimer_restart bcm_rx_timeout_handler(struct hrtimer *hrtimer)
-{
- struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer);
-
- /* schedule before NET_RX_SOFTIRQ */
- tasklet_hi_schedule(&op->tsklet);
-
- /* no restart of the timer is done here! */
-
- /* if user wants to be informed, when cyclic CAN-Messages come back */
- if ((op->flags & RX_ANNOUNCE_RESUME) && op->last_frames) {
- /* clear received CAN frames to indicate 'nothing received' */
- memset(op->last_frames, 0, op->nframes * op->cfsiz);
- }
return HRTIMER_NORESTART;
}
@@ -589,14 +576,12 @@ static enum hrtimer_restart bcm_rx_timeout_handler(struct hrtimer *hrtimer)
/*
* bcm_rx_do_flush - helper for bcm_rx_thr_flush
*/
-static inline int bcm_rx_do_flush(struct bcm_op *op, int update,
- unsigned int index)
+static inline int bcm_rx_do_flush(struct bcm_op *op, unsigned int index)
{
struct canfd_frame *lcf = op->last_frames + op->cfsiz * index;
if ((op->last_frames) && (lcf->flags & RX_THR)) {
- if (update)
- bcm_rx_changed(op, lcf);
+ bcm_rx_changed(op, lcf);
return 1;
}
return 0;
@@ -604,11 +589,8 @@ static inline int bcm_rx_do_flush(struct bcm_op *op, int update,
/*
* bcm_rx_thr_flush - Check for throttled data and send it to the userspace
- *
- * update == 0 : just check if throttled data is available (any irq context)
- * update == 1 : check and send throttled data to userspace (soft_irq context)
*/
-static int bcm_rx_thr_flush(struct bcm_op *op, int update)
+static int bcm_rx_thr_flush(struct bcm_op *op)
{
int updated = 0;
@@ -617,24 +599,16 @@ static int bcm_rx_thr_flush(struct bcm_op *op, int update)
/* for MUX filter we start at index 1 */
for (i = 1; i < op->nframes; i++)
- updated += bcm_rx_do_flush(op, update, i);
+ updated += bcm_rx_do_flush(op, i);
} else {
/* for RX_FILTER_ID and simple filter */
- updated += bcm_rx_do_flush(op, update, 0);
+ updated += bcm_rx_do_flush(op, 0);
}
return updated;
}
-static void bcm_rx_thr_tsklet(unsigned long data)
-{
- struct bcm_op *op = (struct bcm_op *)data;
-
- /* push the changed data to the userspace */
- bcm_rx_thr_flush(op, 1);
-}
-
/*
* bcm_rx_thr_handler - the time for blocked content updates is over now:
* Check for throttled data and send it to the userspace
@@ -643,9 +617,7 @@ static enum hrtimer_restart bcm_rx_thr_handler(struct hrtimer *hrtimer)
{
struct bcm_op *op = container_of(hrtimer, struct bcm_op, thrtimer);
- tasklet_schedule(&op->thrtsklet);
-
- if (bcm_rx_thr_flush(op, 0)) {
+ if (bcm_rx_thr_flush(op)) {
hrtimer_forward(hrtimer, ktime_get(), op->kt_ival2);
return HRTIMER_RESTART;
} else {
@@ -741,23 +713,8 @@ static struct bcm_op *bcm_find_op(struct list_head *ops,
static void bcm_remove_op(struct bcm_op *op)
{
- if (op->tsklet.func) {
- while (test_bit(TASKLET_STATE_SCHED, &op->tsklet.state) ||
- test_bit(TASKLET_STATE_RUN, &op->tsklet.state) ||
- hrtimer_active(&op->timer)) {
- hrtimer_cancel(&op->timer);
- tasklet_kill(&op->tsklet);
- }
- }
-
- if (op->thrtsklet.func) {
- while (test_bit(TASKLET_STATE_SCHED, &op->thrtsklet.state) ||
- test_bit(TASKLET_STATE_RUN, &op->thrtsklet.state) ||
- hrtimer_active(&op->thrtimer)) {
- hrtimer_cancel(&op->thrtimer);
- tasklet_kill(&op->thrtsklet);
- }
- }
+ hrtimer_cancel(&op->timer);
+ hrtimer_cancel(&op->thrtimer);
if ((op->frames) && (op->frames != &op->sframe))
kfree(op->frames);
@@ -990,15 +947,13 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
op->ifindex = ifindex;
/* initialize uninitialized (kzalloc) structure */
- hrtimer_init(&op->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer_init(&op->timer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL_SOFT);
op->timer.function = bcm_tx_timeout_handler;
- /* initialize tasklet for tx countevent notification */
- tasklet_init(&op->tsklet, bcm_tx_timeout_tsklet,
- (unsigned long) op);
-
/* currently unused in tx_ops */
- hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL_SOFT);
/* add this bcm_op to the list of the tx_ops */
list_add(&op->list, &bo->tx_ops);
@@ -1167,20 +1122,14 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
op->rx_ifindex = ifindex;
/* initialize uninitialized (kzalloc) structure */
- hrtimer_init(&op->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer_init(&op->timer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL_SOFT);
op->timer.function = bcm_rx_timeout_handler;
- /* initialize tasklet for rx timeout notification */
- tasklet_init(&op->tsklet, bcm_rx_timeout_tsklet,
- (unsigned long) op);
-
- hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL_SOFT);
op->thrtimer.function = bcm_rx_thr_handler;
- /* initialize tasklet for rx throttle handling */
- tasklet_init(&op->thrtsklet, bcm_rx_thr_tsklet,
- (unsigned long) op);
-
/* add this bcm_op to the list of the rx_ops */
list_add(&op->list, &bo->rx_ops);
@@ -1226,12 +1175,12 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
*/
op->kt_lastmsg = 0;
hrtimer_cancel(&op->thrtimer);
- bcm_rx_thr_flush(op, 1);
+ bcm_rx_thr_flush(op);
}
if ((op->flags & STARTTIMER) && op->kt_ival1)
hrtimer_start(&op->timer, op->kt_ival1,
- HRTIMER_MODE_REL);
+ HRTIMER_MODE_REL_SOFT);
}
/* now we can register for can_ids, if we added a new bcm_op */
@@ -1345,7 +1294,7 @@ static int bcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
/* no bound device as default => check msg_name */
DECLARE_SOCKADDR(struct sockaddr_can *, addr, msg->msg_name);
- if (msg->msg_namelen < sizeof(*addr))
+ if (msg->msg_namelen < CAN_REQUIRED_SIZE(*addr, can_ifindex))
return -EINVAL;
if (addr->can_family != AF_CAN)
@@ -1587,7 +1536,7 @@ static int bcm_connect(struct socket *sock, struct sockaddr *uaddr, int len,
struct net *net = sock_net(sk);
int ret = 0;
- if (len < sizeof(*addr))
+ if (len < CAN_REQUIRED_SIZE(*addr, can_ifindex))
return -EINVAL;
lock_sock(sk);
@@ -1679,6 +1628,13 @@ static int bcm_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
return size;
}
+static int bcm_sock_no_ioctlcmd(struct socket *sock, unsigned int cmd,
+ unsigned long arg)
+{
+ /* no ioctls for socket layer -> hand it down to NIC layer */
+ return -ENOIOCTLCMD;
+}
+
static const struct proto_ops bcm_ops = {
.family = PF_CAN,
.release = bcm_release,
@@ -1688,7 +1644,7 @@ static const struct proto_ops bcm_ops = {
.accept = sock_no_accept,
.getname = sock_no_getname,
.poll = datagram_poll,
- .ioctl = can_ioctl, /* use can_ioctl() from af_can.c */
+ .ioctl = bcm_sock_no_ioctlcmd,
.gettstamp = sock_gettstamp,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
diff --git a/net/can/gw.c b/net/can/gw.c
index 72711053ebe6..65d60c93af29 100644
--- a/net/can/gw.c
+++ b/net/can/gw.c
@@ -1,7 +1,7 @@
-/*
- * gw.c - CAN frame Gateway/Router/Bridge with netlink interface
+// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)
+/* gw.c - CAN frame Gateway/Router/Bridge with netlink interface
*
- * Copyright (c) 2017 Volkswagen Group Electronic Research
+ * Copyright (c) 2019 Volkswagen Group Electronic Research
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -59,7 +59,7 @@
#include <net/net_namespace.h>
#include <net/sock.h>
-#define CAN_GW_VERSION "20170425"
+#define CAN_GW_VERSION "20190810"
#define CAN_GW_NAME "can-gw"
MODULE_DESCRIPTION("PF_CAN netlink gateway");
@@ -85,10 +85,10 @@ static struct kmem_cache *cgw_cache __read_mostly;
/* structure that contains the (on-the-fly) CAN frame modifications */
struct cf_mod {
struct {
- struct can_frame and;
- struct can_frame or;
- struct can_frame xor;
- struct can_frame set;
+ struct canfd_frame and;
+ struct canfd_frame or;
+ struct canfd_frame xor;
+ struct canfd_frame set;
} modframe;
struct {
u8 and;
@@ -96,7 +96,7 @@ struct cf_mod {
u8 xor;
u8 set;
} modtype;
- void (*modfunc[MAX_MODFUNCTIONS])(struct can_frame *cf,
+ void (*modfunc[MAX_MODFUNCTIONS])(struct canfd_frame *cf,
struct cf_mod *mod);
/* CAN frame checksum calculation after CAN frame modifications */
@@ -105,15 +105,15 @@ struct cf_mod {
struct cgw_csum_crc8 crc8;
} csum;
struct {
- void (*xor)(struct can_frame *cf, struct cgw_csum_xor *xor);
- void (*crc8)(struct can_frame *cf, struct cgw_csum_crc8 *crc8);
+ void (*xor)(struct canfd_frame *cf,
+ struct cgw_csum_xor *xor);
+ void (*crc8)(struct canfd_frame *cf,
+ struct cgw_csum_crc8 *crc8);
} csumfunc;
u32 uid;
};
-
-/*
- * So far we just support CAN -> CAN routing and frame modifications.
+/* So far we just support CAN -> CAN routing and frame modifications.
*
* The internal can_can_gw structure contains data and attributes for
* a CAN -> CAN gateway job.
@@ -151,39 +151,88 @@ struct cgw_job {
/* modification functions that are invoked in the hot path in can_can_gw_rcv */
-#define MODFUNC(func, op) static void func(struct can_frame *cf, \
+#define MODFUNC(func, op) static void func(struct canfd_frame *cf, \
struct cf_mod *mod) { op ; }
MODFUNC(mod_and_id, cf->can_id &= mod->modframe.and.can_id)
-MODFUNC(mod_and_dlc, cf->can_dlc &= mod->modframe.and.can_dlc)
+MODFUNC(mod_and_len, cf->len &= mod->modframe.and.len)
+MODFUNC(mod_and_flags, cf->flags &= mod->modframe.and.flags)
MODFUNC(mod_and_data, *(u64 *)cf->data &= *(u64 *)mod->modframe.and.data)
MODFUNC(mod_or_id, cf->can_id |= mod->modframe.or.can_id)
-MODFUNC(mod_or_dlc, cf->can_dlc |= mod->modframe.or.can_dlc)
+MODFUNC(mod_or_len, cf->len |= mod->modframe.or.len)
+MODFUNC(mod_or_flags, cf->flags |= mod->modframe.or.flags)
MODFUNC(mod_or_data, *(u64 *)cf->data |= *(u64 *)mod->modframe.or.data)
MODFUNC(mod_xor_id, cf->can_id ^= mod->modframe.xor.can_id)
-MODFUNC(mod_xor_dlc, cf->can_dlc ^= mod->modframe.xor.can_dlc)
+MODFUNC(mod_xor_len, cf->len ^= mod->modframe.xor.len)
+MODFUNC(mod_xor_flags, cf->flags ^= mod->modframe.xor.flags)
MODFUNC(mod_xor_data, *(u64 *)cf->data ^= *(u64 *)mod->modframe.xor.data)
MODFUNC(mod_set_id, cf->can_id = mod->modframe.set.can_id)
-MODFUNC(mod_set_dlc, cf->can_dlc = mod->modframe.set.can_dlc)
+MODFUNC(mod_set_len, cf->len = mod->modframe.set.len)
+MODFUNC(mod_set_flags, cf->flags = mod->modframe.set.flags)
MODFUNC(mod_set_data, *(u64 *)cf->data = *(u64 *)mod->modframe.set.data)
-static inline void canframecpy(struct can_frame *dst, struct can_frame *src)
+static void mod_and_fddata(struct canfd_frame *cf, struct cf_mod *mod)
+{
+ int i;
+
+ for (i = 0; i < CANFD_MAX_DLEN; i += 8)
+ *(u64 *)(cf->data + i) &= *(u64 *)(mod->modframe.and.data + i);
+}
+
+static void mod_or_fddata(struct canfd_frame *cf, struct cf_mod *mod)
+{
+ int i;
+
+ for (i = 0; i < CANFD_MAX_DLEN; i += 8)
+ *(u64 *)(cf->data + i) |= *(u64 *)(mod->modframe.or.data + i);
+}
+
+static void mod_xor_fddata(struct canfd_frame *cf, struct cf_mod *mod)
+{
+ int i;
+
+ for (i = 0; i < CANFD_MAX_DLEN; i += 8)
+ *(u64 *)(cf->data + i) ^= *(u64 *)(mod->modframe.xor.data + i);
+}
+
+static void mod_set_fddata(struct canfd_frame *cf, struct cf_mod *mod)
+{
+ memcpy(cf->data, mod->modframe.set.data, CANFD_MAX_DLEN);
+}
+
+static void canframecpy(struct canfd_frame *dst, struct can_frame *src)
{
- /*
- * Copy the struct members separately to ensure that no uninitialized
+ /* Copy the struct members separately to ensure that no uninitialized
* data are copied in the 3 bytes hole of the struct. This is needed
* to make easy compares of the data in the struct cf_mod.
*/
dst->can_id = src->can_id;
- dst->can_dlc = src->can_dlc;
+ dst->len = src->can_dlc;
*(u64 *)dst->data = *(u64 *)src->data;
}
-static int cgw_chk_csum_parms(s8 fr, s8 to, s8 re)
+static void canfdframecpy(struct canfd_frame *dst, struct canfd_frame *src)
{
- /*
- * absolute dlc values 0 .. 7 => 0 .. 7, e.g. data [0]
+ /* Copy the struct members separately to ensure that no uninitialized
+ * data are copied in the 2 bytes hole of the struct. This is needed
+ * to make easy compares of the data in the struct cf_mod.
+ */
+
+ dst->can_id = src->can_id;
+ dst->flags = src->flags;
+ dst->len = src->len;
+ memcpy(dst->data, src->data, CANFD_MAX_DLEN);
+}
+
+static int cgw_chk_csum_parms(s8 fr, s8 to, s8 re, struct rtcanmsg *r)
+{
+ s8 dlen = CAN_MAX_DLEN;
+
+ if (r->flags & CGW_FLAGS_CAN_FD)
+ dlen = CANFD_MAX_DLEN;
+
+ /* absolute dlc values 0 .. 7 => 0 .. 7, e.g. data [0]
* relative to received dlc -1 .. -8 :
* e.g. for received dlc = 8
* -1 => index = 7 (data[7])
@@ -191,27 +240,27 @@ static int cgw_chk_csum_parms(s8 fr, s8 to, s8 re)
* -8 => index = 0 (data[0])
*/
- if (fr > -9 && fr < 8 &&
- to > -9 && to < 8 &&
- re > -9 && re < 8)
+ if (fr >= -dlen && fr < dlen &&
+ to >= -dlen && to < dlen &&
+ re >= -dlen && re < dlen)
return 0;
else
return -EINVAL;
}
-static inline int calc_idx(int idx, int rx_dlc)
+static inline int calc_idx(int idx, int rx_len)
{
if (idx < 0)
- return rx_dlc + idx;
+ return rx_len + idx;
else
return idx;
}
-static void cgw_csum_xor_rel(struct can_frame *cf, struct cgw_csum_xor *xor)
+static void cgw_csum_xor_rel(struct canfd_frame *cf, struct cgw_csum_xor *xor)
{
- int from = calc_idx(xor->from_idx, cf->can_dlc);
- int to = calc_idx(xor->to_idx, cf->can_dlc);
- int res = calc_idx(xor->result_idx, cf->can_dlc);
+ int from = calc_idx(xor->from_idx, cf->len);
+ int to = calc_idx(xor->to_idx, cf->len);
+ int res = calc_idx(xor->result_idx, cf->len);
u8 val = xor->init_xor_val;
int i;
@@ -229,7 +278,7 @@ static void cgw_csum_xor_rel(struct can_frame *cf, struct cgw_csum_xor *xor)
cf->data[res] = val;
}
-static void cgw_csum_xor_pos(struct can_frame *cf, struct cgw_csum_xor *xor)
+static void cgw_csum_xor_pos(struct canfd_frame *cf, struct cgw_csum_xor *xor)
{
u8 val = xor->init_xor_val;
int i;
@@ -240,7 +289,7 @@ static void cgw_csum_xor_pos(struct can_frame *cf, struct cgw_csum_xor *xor)
cf->data[xor->result_idx] = val;
}
-static void cgw_csum_xor_neg(struct can_frame *cf, struct cgw_csum_xor *xor)
+static void cgw_csum_xor_neg(struct canfd_frame *cf, struct cgw_csum_xor *xor)
{
u8 val = xor->init_xor_val;
int i;
@@ -251,11 +300,12 @@ static void cgw_csum_xor_neg(struct can_frame *cf, struct cgw_csum_xor *xor)
cf->data[xor->result_idx] = val;
}
-static void cgw_csum_crc8_rel(struct can_frame *cf, struct cgw_csum_crc8 *crc8)
+static void cgw_csum_crc8_rel(struct canfd_frame *cf,
+ struct cgw_csum_crc8 *crc8)
{
- int from = calc_idx(crc8->from_idx, cf->can_dlc);
- int to = calc_idx(crc8->to_idx, cf->can_dlc);
- int res = calc_idx(crc8->result_idx, cf->can_dlc);
+ int from = calc_idx(crc8->from_idx, cf->len);
+ int to = calc_idx(crc8->to_idx, cf->len);
+ int res = calc_idx(crc8->result_idx, cf->len);
u8 crc = crc8->init_crc_val;
int i;
@@ -264,96 +314,102 @@ static void cgw_csum_crc8_rel(struct can_frame *cf, struct cgw_csum_crc8 *crc8)
if (from <= to) {
for (i = crc8->from_idx; i <= crc8->to_idx; i++)
- crc = crc8->crctab[crc^cf->data[i]];
+ crc = crc8->crctab[crc ^ cf->data[i]];
} else {
for (i = crc8->from_idx; i >= crc8->to_idx; i--)
- crc = crc8->crctab[crc^cf->data[i]];
+ crc = crc8->crctab[crc ^ cf->data[i]];
}
switch (crc8->profile) {
-
case CGW_CRC8PRF_1U8:
- crc = crc8->crctab[crc^crc8->profile_data[0]];
+ crc = crc8->crctab[crc ^ crc8->profile_data[0]];
break;
case CGW_CRC8PRF_16U8:
- crc = crc8->crctab[crc^crc8->profile_data[cf->data[1] & 0xF]];
+ crc = crc8->crctab[crc ^ crc8->profile_data[cf->data[1] & 0xF]];
break;
case CGW_CRC8PRF_SFFID_XOR:
- crc = crc8->crctab[crc^(cf->can_id & 0xFF)^
+ crc = crc8->crctab[crc ^ (cf->can_id & 0xFF) ^
(cf->can_id >> 8 & 0xFF)];
break;
-
}
- cf->data[crc8->result_idx] = crc^crc8->final_xor_val;
+ cf->data[crc8->result_idx] = crc ^ crc8->final_xor_val;
}
-static void cgw_csum_crc8_pos(struct can_frame *cf, struct cgw_csum_crc8 *crc8)
+static void cgw_csum_crc8_pos(struct canfd_frame *cf,
+ struct cgw_csum_crc8 *crc8)
{
u8 crc = crc8->init_crc_val;
int i;
for (i = crc8->from_idx; i <= crc8->to_idx; i++)
- crc = crc8->crctab[crc^cf->data[i]];
+ crc = crc8->crctab[crc ^ cf->data[i]];
switch (crc8->profile) {
-
case CGW_CRC8PRF_1U8:
- crc = crc8->crctab[crc^crc8->profile_data[0]];
+ crc = crc8->crctab[crc ^ crc8->profile_data[0]];
break;
case CGW_CRC8PRF_16U8:
- crc = crc8->crctab[crc^crc8->profile_data[cf->data[1] & 0xF]];
+ crc = crc8->crctab[crc ^ crc8->profile_data[cf->data[1] & 0xF]];
break;
case CGW_CRC8PRF_SFFID_XOR:
- crc = crc8->crctab[crc^(cf->can_id & 0xFF)^
+ crc = crc8->crctab[crc ^ (cf->can_id & 0xFF) ^
(cf->can_id >> 8 & 0xFF)];
break;
}
- cf->data[crc8->result_idx] = crc^crc8->final_xor_val;
+ cf->data[crc8->result_idx] = crc ^ crc8->final_xor_val;
}
-static void cgw_csum_crc8_neg(struct can_frame *cf, struct cgw_csum_crc8 *crc8)
+static void cgw_csum_crc8_neg(struct canfd_frame *cf,
+ struct cgw_csum_crc8 *crc8)
{
u8 crc = crc8->init_crc_val;
int i;
for (i = crc8->from_idx; i >= crc8->to_idx; i--)
- crc = crc8->crctab[crc^cf->data[i]];
+ crc = crc8->crctab[crc ^ cf->data[i]];
switch (crc8->profile) {
-
case CGW_CRC8PRF_1U8:
- crc = crc8->crctab[crc^crc8->profile_data[0]];
+ crc = crc8->crctab[crc ^ crc8->profile_data[0]];
break;
case CGW_CRC8PRF_16U8:
- crc = crc8->crctab[crc^crc8->profile_data[cf->data[1] & 0xF]];
+ crc = crc8->crctab[crc ^ crc8->profile_data[cf->data[1] & 0xF]];
break;
case CGW_CRC8PRF_SFFID_XOR:
- crc = crc8->crctab[crc^(cf->can_id & 0xFF)^
+ crc = crc8->crctab[crc ^ (cf->can_id & 0xFF) ^
(cf->can_id >> 8 & 0xFF)];
break;
}
- cf->data[crc8->result_idx] = crc^crc8->final_xor_val;
+ cf->data[crc8->result_idx] = crc ^ crc8->final_xor_val;
}
/* the receive & process & send function */
static void can_can_gw_rcv(struct sk_buff *skb, void *data)
{
struct cgw_job *gwj = (struct cgw_job *)data;
- struct can_frame *cf;
+ struct canfd_frame *cf;
struct sk_buff *nskb;
int modidx = 0;
- /*
- * Do not handle CAN frames routed more than 'max_hops' times.
+ /* process strictly Classic CAN or CAN FD frames */
+ if (gwj->flags & CGW_FLAGS_CAN_FD) {
+ if (skb->len != CANFD_MTU)
+ return;
+ } else {
+ if (skb->len != CAN_MTU)
+ return;
+ }
+
+ /* Do not handle CAN frames routed more than 'max_hops' times.
* In general we should never catch this delimiter which is intended
* to cover a misconfiguration protection (e.g. circular CAN routes).
*
@@ -384,8 +440,7 @@ static void can_can_gw_rcv(struct sk_buff *skb, void *data)
can_skb_prv(skb)->ifindex == gwj->dst.dev->ifindex)
return;
- /*
- * clone the given skb, which has not been done in can_rcv()
+ /* clone the given skb, which has not been done in can_rcv()
*
* When there is at least one modification function activated,
* we need to copy the skb as we want to modify skb->data.
@@ -410,7 +465,7 @@ static void can_can_gw_rcv(struct sk_buff *skb, void *data)
nskb->dev = gwj->dst.dev;
/* pointer to modifiable CAN frame */
- cf = (struct can_frame *)nskb->data;
+ cf = (struct canfd_frame *)nskb->data;
/* perform preprocessed modification functions if there are any */
while (modidx < MAX_MODFUNCTIONS && gwj->mod.modfunc[modidx])
@@ -419,26 +474,22 @@ static void can_can_gw_rcv(struct sk_buff *skb, void *data)
/* Has the CAN frame been modified? */
if (modidx) {
/* get available space for the processed CAN frame type */
- int max_len = nskb->len - offsetof(struct can_frame, data);
+ int max_len = nskb->len - offsetof(struct canfd_frame, data);
/* dlc may have changed, make sure it fits to the CAN frame */
- if (cf->can_dlc > max_len)
- goto out_delete;
-
- /* check for checksum updates in classic CAN length only */
- if (gwj->mod.csumfunc.crc8) {
- if (cf->can_dlc > 8)
- goto out_delete;
-
- (*gwj->mod.csumfunc.crc8)(cf, &gwj->mod.csum.crc8);
+ if (cf->len > max_len) {
+ /* delete frame due to misconfiguration */
+ gwj->deleted_frames++;
+ kfree_skb(nskb);
+ return;
}
- if (gwj->mod.csumfunc.xor) {
- if (cf->can_dlc > 8)
- goto out_delete;
+ /* check for checksum updates */
+ if (gwj->mod.csumfunc.crc8)
+ (*gwj->mod.csumfunc.crc8)(cf, &gwj->mod.csum.crc8);
+ if (gwj->mod.csumfunc.xor)
(*gwj->mod.csumfunc.xor)(cf, &gwj->mod.csum.xor);
- }
}
/* clear the skb timestamp if not configured the other way */
@@ -450,14 +501,6 @@ static void can_can_gw_rcv(struct sk_buff *skb, void *data)
gwj->dropped_frames++;
else
gwj->handled_frames++;
-
- return;
-
- out_delete:
- /* delete frame due to misconfiguration */
- gwj->deleted_frames++;
- kfree_skb(nskb);
- return;
}
static inline int cgw_register_filter(struct net *net, struct cgw_job *gwj)
@@ -483,14 +526,12 @@ static int cgw_notifier(struct notifier_block *nb,
return NOTIFY_DONE;
if (msg == NETDEV_UNREGISTER) {
-
struct cgw_job *gwj = NULL;
struct hlist_node *nx;
ASSERT_RTNL();
hlist_for_each_entry_safe(gwj, nx, &net->can.cgw_list, list) {
-
if (gwj->src.dev == dev || gwj->dst.dev == dev) {
hlist_del(&gwj->list);
cgw_unregister_filter(net, gwj);
@@ -505,7 +546,6 @@ static int cgw_notifier(struct notifier_block *nb,
static int cgw_put_job(struct sk_buff *skb, struct cgw_job *gwj, int type,
u32 pid, u32 seq, int flags)
{
- struct cgw_frame_mod mb;
struct rtcanmsg *rtcan;
struct nlmsghdr *nlh;
@@ -542,32 +582,66 @@ static int cgw_put_job(struct sk_buff *skb, struct cgw_job *gwj, int type,
goto cancel;
}
- if (gwj->mod.modtype.and) {
- memcpy(&mb.cf, &gwj->mod.modframe.and, sizeof(mb.cf));
- mb.modtype = gwj->mod.modtype.and;
- if (nla_put(skb, CGW_MOD_AND, sizeof(mb), &mb) < 0)
- goto cancel;
- }
+ if (gwj->flags & CGW_FLAGS_CAN_FD) {
+ struct cgw_fdframe_mod mb;
- if (gwj->mod.modtype.or) {
- memcpy(&mb.cf, &gwj->mod.modframe.or, sizeof(mb.cf));
- mb.modtype = gwj->mod.modtype.or;
- if (nla_put(skb, CGW_MOD_OR, sizeof(mb), &mb) < 0)
- goto cancel;
- }
+ if (gwj->mod.modtype.and) {
+ memcpy(&mb.cf, &gwj->mod.modframe.and, sizeof(mb.cf));
+ mb.modtype = gwj->mod.modtype.and;
+ if (nla_put(skb, CGW_FDMOD_AND, sizeof(mb), &mb) < 0)
+ goto cancel;
+ }
- if (gwj->mod.modtype.xor) {
- memcpy(&mb.cf, &gwj->mod.modframe.xor, sizeof(mb.cf));
- mb.modtype = gwj->mod.modtype.xor;
- if (nla_put(skb, CGW_MOD_XOR, sizeof(mb), &mb) < 0)
- goto cancel;
- }
+ if (gwj->mod.modtype.or) {
+ memcpy(&mb.cf, &gwj->mod.modframe.or, sizeof(mb.cf));
+ mb.modtype = gwj->mod.modtype.or;
+ if (nla_put(skb, CGW_FDMOD_OR, sizeof(mb), &mb) < 0)
+ goto cancel;
+ }
- if (gwj->mod.modtype.set) {
- memcpy(&mb.cf, &gwj->mod.modframe.set, sizeof(mb.cf));
- mb.modtype = gwj->mod.modtype.set;
- if (nla_put(skb, CGW_MOD_SET, sizeof(mb), &mb) < 0)
- goto cancel;
+ if (gwj->mod.modtype.xor) {
+ memcpy(&mb.cf, &gwj->mod.modframe.xor, sizeof(mb.cf));
+ mb.modtype = gwj->mod.modtype.xor;
+ if (nla_put(skb, CGW_FDMOD_XOR, sizeof(mb), &mb) < 0)
+ goto cancel;
+ }
+
+ if (gwj->mod.modtype.set) {
+ memcpy(&mb.cf, &gwj->mod.modframe.set, sizeof(mb.cf));
+ mb.modtype = gwj->mod.modtype.set;
+ if (nla_put(skb, CGW_FDMOD_SET, sizeof(mb), &mb) < 0)
+ goto cancel;
+ }
+ } else {
+ struct cgw_frame_mod mb;
+
+ if (gwj->mod.modtype.and) {
+ memcpy(&mb.cf, &gwj->mod.modframe.and, sizeof(mb.cf));
+ mb.modtype = gwj->mod.modtype.and;
+ if (nla_put(skb, CGW_MOD_AND, sizeof(mb), &mb) < 0)
+ goto cancel;
+ }
+
+ if (gwj->mod.modtype.or) {
+ memcpy(&mb.cf, &gwj->mod.modframe.or, sizeof(mb.cf));
+ mb.modtype = gwj->mod.modtype.or;
+ if (nla_put(skb, CGW_MOD_OR, sizeof(mb), &mb) < 0)
+ goto cancel;
+ }
+
+ if (gwj->mod.modtype.xor) {
+ memcpy(&mb.cf, &gwj->mod.modframe.xor, sizeof(mb.cf));
+ mb.modtype = gwj->mod.modtype.xor;
+ if (nla_put(skb, CGW_MOD_XOR, sizeof(mb), &mb) < 0)
+ goto cancel;
+ }
+
+ if (gwj->mod.modtype.set) {
+ memcpy(&mb.cf, &gwj->mod.modframe.set, sizeof(mb.cf));
+ mb.modtype = gwj->mod.modtype.set;
+ if (nla_put(skb, CGW_MOD_SET, sizeof(mb), &mb) < 0)
+ goto cancel;
+ }
}
if (gwj->mod.uid) {
@@ -588,7 +662,6 @@ static int cgw_put_job(struct sk_buff *skb, struct cgw_job *gwj, int type,
}
if (gwj->gwtype == CGW_TYPE_CAN_CAN) {
-
if (gwj->ccgw.filter.can_id || gwj->ccgw.filter.can_mask) {
if (nla_put(skb, CGW_FILTER, sizeof(struct can_filter),
&gwj->ccgw.filter) < 0)
@@ -623,8 +696,9 @@ static int cgw_dump_jobs(struct sk_buff *skb, struct netlink_callback *cb)
if (idx < s_idx)
goto cont;
- if (cgw_put_job(skb, gwj, RTM_NEWROUTE, NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, NLM_F_MULTI) < 0)
+ if (cgw_put_job(skb, gwj, RTM_NEWROUTE,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI) < 0)
break;
cont:
idx++;
@@ -636,7 +710,7 @@ cont:
return skb->len;
}
-static const struct nla_policy cgw_policy[CGW_MAX+1] = {
+static const struct nla_policy cgw_policy[CGW_MAX + 1] = {
[CGW_MOD_AND] = { .len = sizeof(struct cgw_frame_mod) },
[CGW_MOD_OR] = { .len = sizeof(struct cgw_frame_mod) },
[CGW_MOD_XOR] = { .len = sizeof(struct cgw_frame_mod) },
@@ -648,14 +722,18 @@ static const struct nla_policy cgw_policy[CGW_MAX+1] = {
[CGW_FILTER] = { .len = sizeof(struct can_filter) },
[CGW_LIM_HOPS] = { .type = NLA_U8 },
[CGW_MOD_UID] = { .type = NLA_U32 },
+ [CGW_FDMOD_AND] = { .len = sizeof(struct cgw_fdframe_mod) },
+ [CGW_FDMOD_OR] = { .len = sizeof(struct cgw_fdframe_mod) },
+ [CGW_FDMOD_XOR] = { .len = sizeof(struct cgw_fdframe_mod) },
+ [CGW_FDMOD_SET] = { .len = sizeof(struct cgw_fdframe_mod) },
};
/* check for common and gwtype specific attributes */
static int cgw_parse_attr(struct nlmsghdr *nlh, struct cf_mod *mod,
u8 gwtype, void *gwtypeattr, u8 *limhops)
{
- struct nlattr *tb[CGW_MAX+1];
- struct cgw_frame_mod mb;
+ struct nlattr *tb[CGW_MAX + 1];
+ struct rtcanmsg *r = nlmsg_data(nlh);
int modidx = 0;
int err = 0;
@@ -675,87 +753,166 @@ static int cgw_parse_attr(struct nlmsghdr *nlh, struct cf_mod *mod,
}
/* check for AND/OR/XOR/SET modifications */
+ if (r->flags & CGW_FLAGS_CAN_FD) {
+ struct cgw_fdframe_mod mb;
- if (tb[CGW_MOD_AND]) {
- nla_memcpy(&mb, tb[CGW_MOD_AND], CGW_MODATTR_LEN);
+ if (tb[CGW_FDMOD_AND]) {
+ nla_memcpy(&mb, tb[CGW_FDMOD_AND], CGW_FDMODATTR_LEN);
- canframecpy(&mod->modframe.and, &mb.cf);
- mod->modtype.and = mb.modtype;
+ canfdframecpy(&mod->modframe.and, &mb.cf);
+ mod->modtype.and = mb.modtype;
- if (mb.modtype & CGW_MOD_ID)
- mod->modfunc[modidx++] = mod_and_id;
+ if (mb.modtype & CGW_MOD_ID)
+ mod->modfunc[modidx++] = mod_and_id;
- if (mb.modtype & CGW_MOD_DLC)
- mod->modfunc[modidx++] = mod_and_dlc;
+ if (mb.modtype & CGW_MOD_LEN)
+ mod->modfunc[modidx++] = mod_and_len;
- if (mb.modtype & CGW_MOD_DATA)
- mod->modfunc[modidx++] = mod_and_data;
- }
+ if (mb.modtype & CGW_MOD_FLAGS)
+ mod->modfunc[modidx++] = mod_and_flags;
+
+ if (mb.modtype & CGW_MOD_DATA)
+ mod->modfunc[modidx++] = mod_and_fddata;
+ }
- if (tb[CGW_MOD_OR]) {
- nla_memcpy(&mb, tb[CGW_MOD_OR], CGW_MODATTR_LEN);
+ if (tb[CGW_FDMOD_OR]) {
+ nla_memcpy(&mb, tb[CGW_FDMOD_OR], CGW_FDMODATTR_LEN);
- canframecpy(&mod->modframe.or, &mb.cf);
- mod->modtype.or = mb.modtype;
+ canfdframecpy(&mod->modframe.or, &mb.cf);
+ mod->modtype.or = mb.modtype;
- if (mb.modtype & CGW_MOD_ID)
- mod->modfunc[modidx++] = mod_or_id;
+ if (mb.modtype & CGW_MOD_ID)
+ mod->modfunc[modidx++] = mod_or_id;
- if (mb.modtype & CGW_MOD_DLC)
- mod->modfunc[modidx++] = mod_or_dlc;
+ if (mb.modtype & CGW_MOD_LEN)
+ mod->modfunc[modidx++] = mod_or_len;
- if (mb.modtype & CGW_MOD_DATA)
- mod->modfunc[modidx++] = mod_or_data;
- }
+ if (mb.modtype & CGW_MOD_FLAGS)
+ mod->modfunc[modidx++] = mod_or_flags;
- if (tb[CGW_MOD_XOR]) {
- nla_memcpy(&mb, tb[CGW_MOD_XOR], CGW_MODATTR_LEN);
+ if (mb.modtype & CGW_MOD_DATA)
+ mod->modfunc[modidx++] = mod_or_fddata;
+ }
- canframecpy(&mod->modframe.xor, &mb.cf);
- mod->modtype.xor = mb.modtype;
+ if (tb[CGW_FDMOD_XOR]) {
+ nla_memcpy(&mb, tb[CGW_FDMOD_XOR], CGW_FDMODATTR_LEN);
- if (mb.modtype & CGW_MOD_ID)
- mod->modfunc[modidx++] = mod_xor_id;
+ canfdframecpy(&mod->modframe.xor, &mb.cf);
+ mod->modtype.xor = mb.modtype;
- if (mb.modtype & CGW_MOD_DLC)
- mod->modfunc[modidx++] = mod_xor_dlc;
+ if (mb.modtype & CGW_MOD_ID)
+ mod->modfunc[modidx++] = mod_xor_id;
- if (mb.modtype & CGW_MOD_DATA)
- mod->modfunc[modidx++] = mod_xor_data;
- }
+ if (mb.modtype & CGW_MOD_LEN)
+ mod->modfunc[modidx++] = mod_xor_len;
- if (tb[CGW_MOD_SET]) {
- nla_memcpy(&mb, tb[CGW_MOD_SET], CGW_MODATTR_LEN);
+ if (mb.modtype & CGW_MOD_FLAGS)
+ mod->modfunc[modidx++] = mod_xor_flags;
+
+ if (mb.modtype & CGW_MOD_DATA)
+ mod->modfunc[modidx++] = mod_xor_fddata;
+ }
- canframecpy(&mod->modframe.set, &mb.cf);
- mod->modtype.set = mb.modtype;
+ if (tb[CGW_FDMOD_SET]) {
+ nla_memcpy(&mb, tb[CGW_FDMOD_SET], CGW_FDMODATTR_LEN);
+
+ canfdframecpy(&mod->modframe.set, &mb.cf);
+ mod->modtype.set = mb.modtype;
+
+ if (mb.modtype & CGW_MOD_ID)
+ mod->modfunc[modidx++] = mod_set_id;
+
+ if (mb.modtype & CGW_MOD_LEN)
+ mod->modfunc[modidx++] = mod_set_len;
+
+ if (mb.modtype & CGW_MOD_FLAGS)
+ mod->modfunc[modidx++] = mod_set_flags;
+
+ if (mb.modtype & CGW_MOD_DATA)
+ mod->modfunc[modidx++] = mod_set_fddata;
+ }
+ } else {
+ struct cgw_frame_mod mb;
- if (mb.modtype & CGW_MOD_ID)
- mod->modfunc[modidx++] = mod_set_id;
+ if (tb[CGW_MOD_AND]) {
+ nla_memcpy(&mb, tb[CGW_MOD_AND], CGW_MODATTR_LEN);
- if (mb.modtype & CGW_MOD_DLC)
- mod->modfunc[modidx++] = mod_set_dlc;
+ canframecpy(&mod->modframe.and, &mb.cf);
+ mod->modtype.and = mb.modtype;
- if (mb.modtype & CGW_MOD_DATA)
- mod->modfunc[modidx++] = mod_set_data;
+ if (mb.modtype & CGW_MOD_ID)
+ mod->modfunc[modidx++] = mod_and_id;
+
+ if (mb.modtype & CGW_MOD_LEN)
+ mod->modfunc[modidx++] = mod_and_len;
+
+ if (mb.modtype & CGW_MOD_DATA)
+ mod->modfunc[modidx++] = mod_and_data;
+ }
+
+ if (tb[CGW_MOD_OR]) {
+ nla_memcpy(&mb, tb[CGW_MOD_OR], CGW_MODATTR_LEN);
+
+ canframecpy(&mod->modframe.or, &mb.cf);
+ mod->modtype.or = mb.modtype;
+
+ if (mb.modtype & CGW_MOD_ID)
+ mod->modfunc[modidx++] = mod_or_id;
+
+ if (mb.modtype & CGW_MOD_LEN)
+ mod->modfunc[modidx++] = mod_or_len;
+
+ if (mb.modtype & CGW_MOD_DATA)
+ mod->modfunc[modidx++] = mod_or_data;
+ }
+
+ if (tb[CGW_MOD_XOR]) {
+ nla_memcpy(&mb, tb[CGW_MOD_XOR], CGW_MODATTR_LEN);
+
+ canframecpy(&mod->modframe.xor, &mb.cf);
+ mod->modtype.xor = mb.modtype;
+
+ if (mb.modtype & CGW_MOD_ID)
+ mod->modfunc[modidx++] = mod_xor_id;
+
+ if (mb.modtype & CGW_MOD_LEN)
+ mod->modfunc[modidx++] = mod_xor_len;
+
+ if (mb.modtype & CGW_MOD_DATA)
+ mod->modfunc[modidx++] = mod_xor_data;
+ }
+
+ if (tb[CGW_MOD_SET]) {
+ nla_memcpy(&mb, tb[CGW_MOD_SET], CGW_MODATTR_LEN);
+
+ canframecpy(&mod->modframe.set, &mb.cf);
+ mod->modtype.set = mb.modtype;
+
+ if (mb.modtype & CGW_MOD_ID)
+ mod->modfunc[modidx++] = mod_set_id;
+
+ if (mb.modtype & CGW_MOD_LEN)
+ mod->modfunc[modidx++] = mod_set_len;
+
+ if (mb.modtype & CGW_MOD_DATA)
+ mod->modfunc[modidx++] = mod_set_data;
+ }
}
/* check for checksum operations after CAN frame modifications */
if (modidx) {
-
if (tb[CGW_CS_CRC8]) {
struct cgw_csum_crc8 *c = nla_data(tb[CGW_CS_CRC8]);
err = cgw_chk_csum_parms(c->from_idx, c->to_idx,
- c->result_idx);
+ c->result_idx, r);
if (err)
return err;
nla_memcpy(&mod->csum.crc8, tb[CGW_CS_CRC8],
CGW_CS_CRC8_LEN);
- /*
- * select dedicated processing function to reduce
+ /* select dedicated processing function to reduce
* runtime operations in receive hot path.
*/
if (c->from_idx < 0 || c->to_idx < 0 ||
@@ -771,15 +928,14 @@ static int cgw_parse_attr(struct nlmsghdr *nlh, struct cf_mod *mod,
struct cgw_csum_xor *c = nla_data(tb[CGW_CS_XOR]);
err = cgw_chk_csum_parms(c->from_idx, c->to_idx,
- c->result_idx);
+ c->result_idx, r);
if (err)
return err;
nla_memcpy(&mod->csum.xor, tb[CGW_CS_XOR],
CGW_CS_XOR_LEN);
- /*
- * select dedicated processing function to reduce
+ /* select dedicated processing function to reduce
* runtime operations in receive hot path.
*/
if (c->from_idx < 0 || c->to_idx < 0 ||
@@ -791,16 +947,14 @@ static int cgw_parse_attr(struct nlmsghdr *nlh, struct cf_mod *mod,
mod->csumfunc.xor = cgw_csum_xor_neg;
}
- if (tb[CGW_MOD_UID]) {
+ if (tb[CGW_MOD_UID])
nla_memcpy(&mod->uid, tb[CGW_MOD_UID], sizeof(u32));
- }
}
if (gwtype == CGW_TYPE_CAN_CAN) {
-
/* check CGW_TYPE_CAN_CAN specific attributes */
-
struct can_can_gw *ccgw = (struct can_can_gw *)gwtypeattr;
+
memset(ccgw, 0, sizeof(*ccgw));
/* check for can_filter in attributes */
@@ -861,12 +1015,10 @@ static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh,
return err;
if (mod.uid) {
-
ASSERT_RTNL();
/* check for updating an existing job with identical uid */
hlist_for_each_entry(gwj, &net->can.cgw_list, list) {
-
if (gwj->mod.uid != mod.uid)
continue;
@@ -987,7 +1139,6 @@ static int cgw_remove_job(struct sk_buff *skb, struct nlmsghdr *nlh,
/* remove only the first matching entry */
hlist_for_each_entry_safe(gwj, nx, &net->can.cgw_list, list) {
-
if (gwj->flags != r->flags)
continue;
diff --git a/net/can/j1939/Kconfig b/net/can/j1939/Kconfig
new file mode 100644
index 000000000000..2998298b71ec
--- /dev/null
+++ b/net/can/j1939/Kconfig
@@ -0,0 +1,15 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# SAE J1939 network layer core configuration
+#
+
+config CAN_J1939
+ tristate "SAE J1939"
+ depends on CAN
+ help
+ SAE J1939
+ Say Y to have in-kernel support for j1939 socket type. This
+ allows communication according to SAE j1939.
+ The relevant parts in kernel are
+ SAE j1939-21 (datalink & transport protocol)
+ & SAE j1939-81 (network management).
diff --git a/net/can/j1939/Makefile b/net/can/j1939/Makefile
new file mode 100644
index 000000000000..19181bdae173
--- /dev/null
+++ b/net/can/j1939/Makefile
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-$(CONFIG_CAN_J1939) += can-j1939.o
+
+can-j1939-objs := \
+ address-claim.o \
+ bus.o \
+ main.o \
+ socket.o \
+ transport.o
diff --git a/net/can/j1939/address-claim.c b/net/can/j1939/address-claim.c
new file mode 100644
index 000000000000..f33c47327927
--- /dev/null
+++ b/net/can/j1939/address-claim.c
@@ -0,0 +1,230 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2010-2011 EIA Electronics,
+// Kurt Van Dijck <kurt.van.dijck@eia.be>
+// Copyright (c) 2010-2011 EIA Electronics,
+// Pieter Beyens <pieter.beyens@eia.be>
+// Copyright (c) 2017-2019 Pengutronix,
+// Marc Kleine-Budde <kernel@pengutronix.de>
+// Copyright (c) 2017-2019 Pengutronix,
+// Oleksij Rempel <kernel@pengutronix.de>
+
+/* J1939 Address Claiming.
+ * Address Claiming in the kernel
+ * - keeps track of the AC states of ECU's,
+ * - resolves NAME<=>SA taking into account the AC states of ECU's.
+ *
+ * All Address Claim msgs (including host-originated msg) are processed
+ * at the receive path (a sent msg is always received again via CAN echo).
+ * As such, the processing of AC msgs is done in the order on which msgs
+ * are sent on the bus.
+ *
+ * This module doesn't send msgs itself (e.g. replies on Address Claims),
+ * this is the responsibility of a user space application or daemon.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+
+#include "j1939-priv.h"
+
+static inline name_t j1939_skb_to_name(const struct sk_buff *skb)
+{
+ return le64_to_cpup((__le64 *)skb->data);
+}
+
+static inline bool j1939_ac_msg_is_request(struct sk_buff *skb)
+{
+ struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb);
+ int req_pgn;
+
+ if (skb->len < 3 || skcb->addr.pgn != J1939_PGN_REQUEST)
+ return false;
+
+ req_pgn = skb->data[0] | (skb->data[1] << 8) | (skb->data[2] << 16);
+
+ return req_pgn == J1939_PGN_ADDRESS_CLAIMED;
+}
+
+static int j1939_ac_verify_outgoing(struct j1939_priv *priv,
+ struct sk_buff *skb)
+{
+ struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb);
+
+ if (skb->len != 8) {
+ netdev_notice(priv->ndev, "tx address claim with dlc %i\n",
+ skb->len);
+ return -EPROTO;
+ }
+
+ if (skcb->addr.src_name != j1939_skb_to_name(skb)) {
+ netdev_notice(priv->ndev, "tx address claim with different name\n");
+ return -EPROTO;
+ }
+
+ if (skcb->addr.sa == J1939_NO_ADDR) {
+ netdev_notice(priv->ndev, "tx address claim with broadcast sa\n");
+ return -EPROTO;
+ }
+
+ /* ac must always be a broadcast */
+ if (skcb->addr.dst_name || skcb->addr.da != J1939_NO_ADDR) {
+ netdev_notice(priv->ndev, "tx address claim with dest, not broadcast\n");
+ return -EPROTO;
+ }
+ return 0;
+}
+
+int j1939_ac_fixup(struct j1939_priv *priv, struct sk_buff *skb)
+{
+ struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb);
+ int ret;
+ u8 addr;
+
+ /* network mgmt: address claiming msgs */
+ if (skcb->addr.pgn == J1939_PGN_ADDRESS_CLAIMED) {
+ struct j1939_ecu *ecu;
+
+ ret = j1939_ac_verify_outgoing(priv, skb);
+ /* return both when failure & when successful */
+ if (ret < 0)
+ return ret;
+ ecu = j1939_ecu_get_by_name(priv, skcb->addr.src_name);
+ if (!ecu)
+ return -ENODEV;
+
+ if (ecu->addr != skcb->addr.sa)
+ /* hold further traffic for ecu, remove from parent */
+ j1939_ecu_unmap(ecu);
+ j1939_ecu_put(ecu);
+ } else if (skcb->addr.src_name) {
+ /* assign source address */
+ addr = j1939_name_to_addr(priv, skcb->addr.src_name);
+ if (!j1939_address_is_unicast(addr) &&
+ !j1939_ac_msg_is_request(skb)) {
+ netdev_notice(priv->ndev, "tx drop: invalid sa for name 0x%016llx\n",
+ skcb->addr.src_name);
+ return -EADDRNOTAVAIL;
+ }
+ skcb->addr.sa = addr;
+ }
+
+ /* assign destination address */
+ if (skcb->addr.dst_name) {
+ addr = j1939_name_to_addr(priv, skcb->addr.dst_name);
+ if (!j1939_address_is_unicast(addr)) {
+ netdev_notice(priv->ndev, "tx drop: invalid da for name 0x%016llx\n",
+ skcb->addr.dst_name);
+ return -EADDRNOTAVAIL;
+ }
+ skcb->addr.da = addr;
+ }
+ return 0;
+}
+
+static void j1939_ac_process(struct j1939_priv *priv, struct sk_buff *skb)
+{
+ struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb);
+ struct j1939_ecu *ecu, *prev;
+ name_t name;
+
+ if (skb->len != 8) {
+ netdev_notice(priv->ndev, "rx address claim with wrong dlc %i\n",
+ skb->len);
+ return;
+ }
+
+ name = j1939_skb_to_name(skb);
+ skcb->addr.src_name = name;
+ if (!name) {
+ netdev_notice(priv->ndev, "rx address claim without name\n");
+ return;
+ }
+
+ if (!j1939_address_is_valid(skcb->addr.sa)) {
+ netdev_notice(priv->ndev, "rx address claim with broadcast sa\n");
+ return;
+ }
+
+ write_lock_bh(&priv->lock);
+
+ /* Few words on the ECU ref counting:
+ *
+ * First we get an ECU handle, either with
+ * j1939_ecu_get_by_name_locked() (increments the ref counter)
+ * or j1939_ecu_create_locked() (initializes an ECU object
+ * with a ref counter of 1).
+ *
+ * j1939_ecu_unmap_locked() will decrement the ref counter,
+ * but only if the ECU was mapped before. So "ecu" still
+ * belongs to us.
+ *
+ * j1939_ecu_timer_start() will increment the ref counter
+ * before it starts the timer, so we can put the ecu when
+ * leaving this function.
+ */
+ ecu = j1939_ecu_get_by_name_locked(priv, name);
+ if (!ecu && j1939_address_is_unicast(skcb->addr.sa))
+ ecu = j1939_ecu_create_locked(priv, name);
+
+ if (IS_ERR_OR_NULL(ecu))
+ goto out_unlock_bh;
+
+ /* cancel pending (previous) address claim */
+ j1939_ecu_timer_cancel(ecu);
+
+ if (j1939_address_is_idle(skcb->addr.sa)) {
+ j1939_ecu_unmap_locked(ecu);
+ goto out_ecu_put;
+ }
+
+ /* save new addr */
+ if (ecu->addr != skcb->addr.sa)
+ j1939_ecu_unmap_locked(ecu);
+ ecu->addr = skcb->addr.sa;
+
+ prev = j1939_ecu_get_by_addr_locked(priv, skcb->addr.sa);
+ if (prev) {
+ if (ecu->name > prev->name) {
+ j1939_ecu_unmap_locked(ecu);
+ j1939_ecu_put(prev);
+ goto out_ecu_put;
+ } else {
+ /* kick prev if less or equal */
+ j1939_ecu_unmap_locked(prev);
+ j1939_ecu_put(prev);
+ }
+ }
+
+ j1939_ecu_timer_start(ecu);
+ out_ecu_put:
+ j1939_ecu_put(ecu);
+ out_unlock_bh:
+ write_unlock_bh(&priv->lock);
+}
+
+void j1939_ac_recv(struct j1939_priv *priv, struct sk_buff *skb)
+{
+ struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb);
+ struct j1939_ecu *ecu;
+
+ /* network mgmt */
+ if (skcb->addr.pgn == J1939_PGN_ADDRESS_CLAIMED) {
+ j1939_ac_process(priv, skb);
+ } else if (j1939_address_is_unicast(skcb->addr.sa)) {
+ /* assign source name */
+ ecu = j1939_ecu_get_by_addr(priv, skcb->addr.sa);
+ if (ecu) {
+ skcb->addr.src_name = ecu->name;
+ j1939_ecu_put(ecu);
+ }
+ }
+
+ /* assign destination name */
+ ecu = j1939_ecu_get_by_addr(priv, skcb->addr.da);
+ if (ecu) {
+ skcb->addr.dst_name = ecu->name;
+ j1939_ecu_put(ecu);
+ }
+}
diff --git a/net/can/j1939/bus.c b/net/can/j1939/bus.c
new file mode 100644
index 000000000000..486687901602
--- /dev/null
+++ b/net/can/j1939/bus.c
@@ -0,0 +1,333 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2010-2011 EIA Electronics,
+// Kurt Van Dijck <kurt.van.dijck@eia.be>
+// Copyright (c) 2017-2019 Pengutronix,
+// Marc Kleine-Budde <kernel@pengutronix.de>
+// Copyright (c) 2017-2019 Pengutronix,
+// Oleksij Rempel <kernel@pengutronix.de>
+
+/* bus for j1939 remote devices
+ * Since rtnetlink, no real bus is used.
+ */
+
+#include <net/sock.h>
+
+#include "j1939-priv.h"
+
+static void __j1939_ecu_release(struct kref *kref)
+{
+ struct j1939_ecu *ecu = container_of(kref, struct j1939_ecu, kref);
+ struct j1939_priv *priv = ecu->priv;
+
+ list_del(&ecu->list);
+ kfree(ecu);
+ j1939_priv_put(priv);
+}
+
+void j1939_ecu_put(struct j1939_ecu *ecu)
+{
+ kref_put(&ecu->kref, __j1939_ecu_release);
+}
+
+static void j1939_ecu_get(struct j1939_ecu *ecu)
+{
+ kref_get(&ecu->kref);
+}
+
+static bool j1939_ecu_is_mapped_locked(struct j1939_ecu *ecu)
+{
+ struct j1939_priv *priv = ecu->priv;
+
+ lockdep_assert_held(&priv->lock);
+
+ return j1939_ecu_find_by_addr_locked(priv, ecu->addr) == ecu;
+}
+
+/* ECU device interface */
+/* map ECU to a bus address space */
+static void j1939_ecu_map_locked(struct j1939_ecu *ecu)
+{
+ struct j1939_priv *priv = ecu->priv;
+ struct j1939_addr_ent *ent;
+
+ lockdep_assert_held(&priv->lock);
+
+ if (!j1939_address_is_unicast(ecu->addr))
+ return;
+
+ ent = &priv->ents[ecu->addr];
+
+ if (ent->ecu) {
+ netdev_warn(priv->ndev, "Trying to map already mapped ECU, addr: 0x%02x, name: 0x%016llx. Skip it.\n",
+ ecu->addr, ecu->name);
+ return;
+ }
+
+ j1939_ecu_get(ecu);
+ ent->ecu = ecu;
+ ent->nusers += ecu->nusers;
+}
+
+/* unmap ECU from a bus address space */
+void j1939_ecu_unmap_locked(struct j1939_ecu *ecu)
+{
+ struct j1939_priv *priv = ecu->priv;
+ struct j1939_addr_ent *ent;
+
+ lockdep_assert_held(&priv->lock);
+
+ if (!j1939_address_is_unicast(ecu->addr))
+ return;
+
+ if (!j1939_ecu_is_mapped_locked(ecu))
+ return;
+
+ ent = &priv->ents[ecu->addr];
+ ent->ecu = NULL;
+ ent->nusers -= ecu->nusers;
+ j1939_ecu_put(ecu);
+}
+
+void j1939_ecu_unmap(struct j1939_ecu *ecu)
+{
+ write_lock_bh(&ecu->priv->lock);
+ j1939_ecu_unmap_locked(ecu);
+ write_unlock_bh(&ecu->priv->lock);
+}
+
+void j1939_ecu_unmap_all(struct j1939_priv *priv)
+{
+ int i;
+
+ write_lock_bh(&priv->lock);
+ for (i = 0; i < ARRAY_SIZE(priv->ents); i++)
+ if (priv->ents[i].ecu)
+ j1939_ecu_unmap_locked(priv->ents[i].ecu);
+ write_unlock_bh(&priv->lock);
+}
+
+void j1939_ecu_timer_start(struct j1939_ecu *ecu)
+{
+ /* The ECU is held here and released in the
+ * j1939_ecu_timer_handler() or j1939_ecu_timer_cancel().
+ */
+ j1939_ecu_get(ecu);
+
+ /* Schedule timer in 250 msec to commit address change. */
+ hrtimer_start(&ecu->ac_timer, ms_to_ktime(250),
+ HRTIMER_MODE_REL_SOFT);
+}
+
+void j1939_ecu_timer_cancel(struct j1939_ecu *ecu)
+{
+ if (hrtimer_cancel(&ecu->ac_timer))
+ j1939_ecu_put(ecu);
+}
+
+static enum hrtimer_restart j1939_ecu_timer_handler(struct hrtimer *hrtimer)
+{
+ struct j1939_ecu *ecu =
+ container_of(hrtimer, struct j1939_ecu, ac_timer);
+ struct j1939_priv *priv = ecu->priv;
+
+ write_lock_bh(&priv->lock);
+ /* TODO: can we test if ecu->addr is unicast before starting
+ * the timer?
+ */
+ j1939_ecu_map_locked(ecu);
+
+ /* The corresponding j1939_ecu_get() is in
+ * j1939_ecu_timer_start().
+ */
+ j1939_ecu_put(ecu);
+ write_unlock_bh(&priv->lock);
+
+ return HRTIMER_NORESTART;
+}
+
+struct j1939_ecu *j1939_ecu_create_locked(struct j1939_priv *priv, name_t name)
+{
+ struct j1939_ecu *ecu;
+
+ lockdep_assert_held(&priv->lock);
+
+ ecu = kzalloc(sizeof(*ecu), gfp_any());
+ if (!ecu)
+ return ERR_PTR(-ENOMEM);
+ kref_init(&ecu->kref);
+ ecu->addr = J1939_IDLE_ADDR;
+ ecu->name = name;
+
+ hrtimer_init(&ecu->ac_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
+ ecu->ac_timer.function = j1939_ecu_timer_handler;
+ INIT_LIST_HEAD(&ecu->list);
+
+ j1939_priv_get(priv);
+ ecu->priv = priv;
+ list_add_tail(&ecu->list, &priv->ecus);
+
+ return ecu;
+}
+
+struct j1939_ecu *j1939_ecu_find_by_addr_locked(struct j1939_priv *priv,
+ u8 addr)
+{
+ lockdep_assert_held(&priv->lock);
+
+ return priv->ents[addr].ecu;
+}
+
+struct j1939_ecu *j1939_ecu_get_by_addr_locked(struct j1939_priv *priv, u8 addr)
+{
+ struct j1939_ecu *ecu;
+
+ lockdep_assert_held(&priv->lock);
+
+ if (!j1939_address_is_unicast(addr))
+ return NULL;
+
+ ecu = j1939_ecu_find_by_addr_locked(priv, addr);
+ if (ecu)
+ j1939_ecu_get(ecu);
+
+ return ecu;
+}
+
+struct j1939_ecu *j1939_ecu_get_by_addr(struct j1939_priv *priv, u8 addr)
+{
+ struct j1939_ecu *ecu;
+
+ read_lock_bh(&priv->lock);
+ ecu = j1939_ecu_get_by_addr_locked(priv, addr);
+ read_unlock_bh(&priv->lock);
+
+ return ecu;
+}
+
+/* get pointer to ecu without increasing ref counter */
+static struct j1939_ecu *j1939_ecu_find_by_name_locked(struct j1939_priv *priv,
+ name_t name)
+{
+ struct j1939_ecu *ecu;
+
+ lockdep_assert_held(&priv->lock);
+
+ list_for_each_entry(ecu, &priv->ecus, list) {
+ if (ecu->name == name)
+ return ecu;
+ }
+
+ return NULL;
+}
+
+struct j1939_ecu *j1939_ecu_get_by_name_locked(struct j1939_priv *priv,
+ name_t name)
+{
+ struct j1939_ecu *ecu;
+
+ lockdep_assert_held(&priv->lock);
+
+ if (!name)
+ return NULL;
+
+ ecu = j1939_ecu_find_by_name_locked(priv, name);
+ if (ecu)
+ j1939_ecu_get(ecu);
+
+ return ecu;
+}
+
+struct j1939_ecu *j1939_ecu_get_by_name(struct j1939_priv *priv, name_t name)
+{
+ struct j1939_ecu *ecu;
+
+ read_lock_bh(&priv->lock);
+ ecu = j1939_ecu_get_by_name_locked(priv, name);
+ read_unlock_bh(&priv->lock);
+
+ return ecu;
+}
+
+u8 j1939_name_to_addr(struct j1939_priv *priv, name_t name)
+{
+ struct j1939_ecu *ecu;
+ int addr = J1939_IDLE_ADDR;
+
+ if (!name)
+ return J1939_NO_ADDR;
+
+ read_lock_bh(&priv->lock);
+ ecu = j1939_ecu_find_by_name_locked(priv, name);
+ if (ecu && j1939_ecu_is_mapped_locked(ecu))
+ /* ecu's SA is registered */
+ addr = ecu->addr;
+
+ read_unlock_bh(&priv->lock);
+
+ return addr;
+}
+
+/* TX addr/name accounting
+ * Transport protocol needs to know if a SA is local or not
+ * These functions originate from userspace manipulating sockets,
+ * so locking is straigforward
+ */
+
+int j1939_local_ecu_get(struct j1939_priv *priv, name_t name, u8 sa)
+{
+ struct j1939_ecu *ecu;
+ int err = 0;
+
+ write_lock_bh(&priv->lock);
+
+ if (j1939_address_is_unicast(sa))
+ priv->ents[sa].nusers++;
+
+ if (!name)
+ goto done;
+
+ ecu = j1939_ecu_get_by_name_locked(priv, name);
+ if (!ecu)
+ ecu = j1939_ecu_create_locked(priv, name);
+ err = PTR_ERR_OR_ZERO(ecu);
+ if (err)
+ goto done;
+
+ ecu->nusers++;
+ /* TODO: do we care if ecu->addr != sa? */
+ if (j1939_ecu_is_mapped_locked(ecu))
+ /* ecu's sa is active already */
+ priv->ents[ecu->addr].nusers++;
+
+ done:
+ write_unlock_bh(&priv->lock);
+
+ return err;
+}
+
+void j1939_local_ecu_put(struct j1939_priv *priv, name_t name, u8 sa)
+{
+ struct j1939_ecu *ecu;
+
+ write_lock_bh(&priv->lock);
+
+ if (j1939_address_is_unicast(sa))
+ priv->ents[sa].nusers--;
+
+ if (!name)
+ goto done;
+
+ ecu = j1939_ecu_find_by_name_locked(priv, name);
+ if (WARN_ON_ONCE(!ecu))
+ goto done;
+
+ ecu->nusers--;
+ /* TODO: do we care if ecu->addr != sa? */
+ if (j1939_ecu_is_mapped_locked(ecu))
+ /* ecu's sa is active already */
+ priv->ents[ecu->addr].nusers--;
+ j1939_ecu_put(ecu);
+
+ done:
+ write_unlock_bh(&priv->lock);
+}
diff --git a/net/can/j1939/j1939-priv.h b/net/can/j1939/j1939-priv.h
new file mode 100644
index 000000000000..12369b604ce9
--- /dev/null
+++ b/net/can/j1939/j1939-priv.h
@@ -0,0 +1,338 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+// Copyright (c) 2010-2011 EIA Electronics,
+// Kurt Van Dijck <kurt.van.dijck@eia.be>
+// Copyright (c) 2017-2019 Pengutronix,
+// Marc Kleine-Budde <kernel@pengutronix.de>
+// Copyright (c) 2017-2019 Pengutronix,
+// Oleksij Rempel <kernel@pengutronix.de>
+
+#ifndef _J1939_PRIV_H_
+#define _J1939_PRIV_H_
+
+#include <linux/can/j1939.h>
+#include <net/sock.h>
+
+/* Timeout to receive the abort signal over loop back. In case CAN
+ * bus is open, the timeout should be triggered.
+ */
+#define J1939_XTP_ABORT_TIMEOUT_MS 500
+#define J1939_SIMPLE_ECHO_TIMEOUT_MS (10 * 1000)
+
+struct j1939_session;
+enum j1939_sk_errqueue_type {
+ J1939_ERRQUEUE_ACK,
+ J1939_ERRQUEUE_SCHED,
+ J1939_ERRQUEUE_ABORT,
+};
+
+/* j1939 devices */
+struct j1939_ecu {
+ struct list_head list;
+ name_t name;
+ u8 addr;
+
+ /* indicates that this ecu successfully claimed @sa as its address */
+ struct hrtimer ac_timer;
+ struct kref kref;
+ struct j1939_priv *priv;
+
+ /* count users, to help transport protocol decide for interaction */
+ int nusers;
+};
+
+struct j1939_priv {
+ struct list_head ecus;
+ /* local list entry in priv
+ * These allow irq (& softirq) context lookups on j1939 devices
+ * This approach (separate lists) is done as the other 2 alternatives
+ * are not easier or even wrong
+ * 1) using the pure kobject methods involves mutexes, which are not
+ * allowed in irq context.
+ * 2) duplicating data structures would require a lot of synchronization
+ * code
+ * usage:
+ */
+
+ /* segments need a lock to protect the above list */
+ rwlock_t lock;
+
+ struct net_device *ndev;
+
+ /* list of 256 ecu ptrs, that cache the claimed addresses.
+ * also protected by the above lock
+ */
+ struct j1939_addr_ent {
+ struct j1939_ecu *ecu;
+ /* count users, to help transport protocol */
+ int nusers;
+ } ents[256];
+
+ struct kref kref;
+
+ /* List of active sessions to prevent start of conflicting
+ * one.
+ *
+ * Do not start two sessions of same type, addresses and
+ * direction.
+ */
+ struct list_head active_session_list;
+
+ /* protects active_session_list */
+ spinlock_t active_session_list_lock;
+
+ unsigned int tp_max_packet_size;
+
+ /* lock for j1939_socks list */
+ spinlock_t j1939_socks_lock;
+ struct list_head j1939_socks;
+
+ struct kref rx_kref;
+};
+
+void j1939_ecu_put(struct j1939_ecu *ecu);
+
+/* keep the cache of what is local */
+int j1939_local_ecu_get(struct j1939_priv *priv, name_t name, u8 sa);
+void j1939_local_ecu_put(struct j1939_priv *priv, name_t name, u8 sa);
+
+static inline bool j1939_address_is_unicast(u8 addr)
+{
+ return addr <= J1939_MAX_UNICAST_ADDR;
+}
+
+static inline bool j1939_address_is_idle(u8 addr)
+{
+ return addr == J1939_IDLE_ADDR;
+}
+
+static inline bool j1939_address_is_valid(u8 addr)
+{
+ return addr != J1939_NO_ADDR;
+}
+
+static inline bool j1939_pgn_is_pdu1(pgn_t pgn)
+{
+ /* ignore dp & res bits for this */
+ return (pgn & 0xff00) < 0xf000;
+}
+
+/* utility to correctly unmap an ECU */
+void j1939_ecu_unmap_locked(struct j1939_ecu *ecu);
+void j1939_ecu_unmap(struct j1939_ecu *ecu);
+
+u8 j1939_name_to_addr(struct j1939_priv *priv, name_t name);
+struct j1939_ecu *j1939_ecu_find_by_addr_locked(struct j1939_priv *priv,
+ u8 addr);
+struct j1939_ecu *j1939_ecu_get_by_addr(struct j1939_priv *priv, u8 addr);
+struct j1939_ecu *j1939_ecu_get_by_addr_locked(struct j1939_priv *priv,
+ u8 addr);
+struct j1939_ecu *j1939_ecu_get_by_name(struct j1939_priv *priv, name_t name);
+struct j1939_ecu *j1939_ecu_get_by_name_locked(struct j1939_priv *priv,
+ name_t name);
+
+enum j1939_transfer_type {
+ J1939_TP,
+ J1939_ETP,
+ J1939_SIMPLE,
+};
+
+struct j1939_addr {
+ name_t src_name;
+ name_t dst_name;
+ pgn_t pgn;
+
+ u8 sa;
+ u8 da;
+
+ u8 type;
+};
+
+/* control buffer of the sk_buff */
+struct j1939_sk_buff_cb {
+ /* Offset in bytes within one ETP session */
+ u32 offset;
+
+ /* for tx, MSG_SYN will be used to sync on sockets */
+ u32 msg_flags;
+ u32 tskey;
+
+ struct j1939_addr addr;
+
+ /* Flags for quick lookups during skb processing.
+ * These are set in the receive path only.
+ */
+#define J1939_ECU_LOCAL_SRC BIT(0)
+#define J1939_ECU_LOCAL_DST BIT(1)
+ u8 flags;
+
+ priority_t priority;
+};
+
+static inline
+struct j1939_sk_buff_cb *j1939_skb_to_cb(const struct sk_buff *skb)
+{
+ BUILD_BUG_ON(sizeof(struct j1939_sk_buff_cb) > sizeof(skb->cb));
+
+ return (struct j1939_sk_buff_cb *)skb->cb;
+}
+
+int j1939_send_one(struct j1939_priv *priv, struct sk_buff *skb);
+void j1939_sk_recv(struct j1939_priv *priv, struct sk_buff *skb);
+bool j1939_sk_recv_match(struct j1939_priv *priv,
+ struct j1939_sk_buff_cb *skcb);
+void j1939_sk_send_loop_abort(struct sock *sk, int err);
+void j1939_sk_errqueue(struct j1939_session *session,
+ enum j1939_sk_errqueue_type type);
+void j1939_sk_queue_activate_next(struct j1939_session *session);
+
+/* stack entries */
+struct j1939_session *j1939_tp_send(struct j1939_priv *priv,
+ struct sk_buff *skb, size_t size);
+int j1939_tp_recv(struct j1939_priv *priv, struct sk_buff *skb);
+int j1939_ac_fixup(struct j1939_priv *priv, struct sk_buff *skb);
+void j1939_ac_recv(struct j1939_priv *priv, struct sk_buff *skb);
+void j1939_simple_recv(struct j1939_priv *priv, struct sk_buff *skb);
+
+/* network management */
+struct j1939_ecu *j1939_ecu_create_locked(struct j1939_priv *priv, name_t name);
+
+void j1939_ecu_timer_start(struct j1939_ecu *ecu);
+void j1939_ecu_timer_cancel(struct j1939_ecu *ecu);
+void j1939_ecu_unmap_all(struct j1939_priv *priv);
+
+struct j1939_priv *j1939_netdev_start(struct net_device *ndev);
+void j1939_netdev_stop(struct j1939_priv *priv);
+
+void j1939_priv_put(struct j1939_priv *priv);
+void j1939_priv_get(struct j1939_priv *priv);
+
+/* notify/alert all j1939 sockets bound to ifindex */
+void j1939_sk_netdev_event_netdown(struct j1939_priv *priv);
+int j1939_cancel_active_session(struct j1939_priv *priv, struct sock *sk);
+void j1939_tp_init(struct j1939_priv *priv);
+
+/* decrement pending skb for a j1939 socket */
+void j1939_sock_pending_del(struct sock *sk);
+
+enum j1939_session_state {
+ J1939_SESSION_NEW,
+ J1939_SESSION_ACTIVE,
+ /* waiting for abort signal on the bus */
+ J1939_SESSION_WAITING_ABORT,
+ J1939_SESSION_ACTIVE_MAX,
+ J1939_SESSION_DONE,
+};
+
+struct j1939_session {
+ struct j1939_priv *priv;
+ struct list_head active_session_list_entry;
+ struct list_head sk_session_queue_entry;
+ struct kref kref;
+ struct sock *sk;
+
+ /* ifindex, src, dst, pgn define the session block
+ * the are _never_ modified after insertion in the list
+ * this decreases locking problems a _lot_
+ */
+ struct j1939_sk_buff_cb skcb;
+ struct sk_buff_head skb_queue;
+
+ /* all tx related stuff (last_txcmd, pkt.tx)
+ * is protected (modified only) with the txtimer hrtimer
+ * 'total' & 'block' are never changed,
+ * last_cmd, last & block are protected by ->lock
+ * this means that the tx may run after cts is received that should
+ * have stopped tx, but this time discrepancy is never avoided anyhow
+ */
+ u8 last_cmd, last_txcmd;
+ bool transmission;
+ bool extd;
+ /* Total message size, number of bytes */
+ unsigned int total_message_size;
+ /* Total number of bytes queue from socket to the session */
+ unsigned int total_queued_size;
+ unsigned int tx_retry;
+
+ int err;
+ u32 tskey;
+ enum j1939_session_state state;
+
+ /* Packets counters for a (extended) transfer session. The packet is
+ * maximal of 7 bytes.
+ */
+ struct {
+ /* total - total number of packets for this session */
+ unsigned int total;
+ /* last - last packet of a transfer block after which
+ * responder should send ETP.CM_CTS and originator
+ * ETP.CM_DPO
+ */
+ unsigned int last;
+ /* tx - number of packets send by originator node.
+ * this counter can be set back if responder node
+ * didn't received all packets send by originator.
+ */
+ unsigned int tx;
+ unsigned int tx_acked;
+ /* rx - number of packets received */
+ unsigned int rx;
+ /* block - amount of packets expected in one block */
+ unsigned int block;
+ /* dpo - ETP.CM_DPO, Data Packet Offset */
+ unsigned int dpo;
+ } pkt;
+ struct hrtimer txtimer, rxtimer;
+};
+
+struct j1939_sock {
+ struct sock sk; /* must be first to skip with memset */
+ struct j1939_priv *priv;
+ struct list_head list;
+
+#define J1939_SOCK_BOUND BIT(0)
+#define J1939_SOCK_CONNECTED BIT(1)
+#define J1939_SOCK_PROMISC BIT(2)
+#define J1939_SOCK_ERRQUEUE BIT(3)
+ int state;
+
+ int ifindex;
+ struct j1939_addr addr;
+ struct j1939_filter *filters;
+ int nfilters;
+ pgn_t pgn_rx_filter;
+
+ /* j1939 may emit equal PGN (!= equal CAN-id's) out of order
+ * when transport protocol comes in.
+ * To allow emitting in order, keep a 'pending' nr. of packets
+ */
+ atomic_t skb_pending;
+ wait_queue_head_t waitq;
+
+ /* lock for the sk_session_queue list */
+ spinlock_t sk_session_queue_lock;
+ struct list_head sk_session_queue;
+};
+
+static inline struct j1939_sock *j1939_sk(const struct sock *sk)
+{
+ return container_of(sk, struct j1939_sock, sk);
+}
+
+void j1939_session_get(struct j1939_session *session);
+void j1939_session_put(struct j1939_session *session);
+void j1939_session_skb_queue(struct j1939_session *session,
+ struct sk_buff *skb);
+int j1939_session_activate(struct j1939_session *session);
+void j1939_tp_schedule_txtimer(struct j1939_session *session, int msec);
+void j1939_session_timers_cancel(struct j1939_session *session);
+
+#define J1939_MAX_TP_PACKET_SIZE (7 * 0xff)
+#define J1939_MAX_ETP_PACKET_SIZE (7 * 0x00ffffff)
+
+#define J1939_REGULAR 0
+#define J1939_EXTENDED 1
+
+/* CAN protocol */
+extern const struct can_proto j1939_can_proto;
+
+#endif /* _J1939_PRIV_H_ */
diff --git a/net/can/j1939/main.c b/net/can/j1939/main.c
new file mode 100644
index 000000000000..137054bff9ec
--- /dev/null
+++ b/net/can/j1939/main.c
@@ -0,0 +1,412 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2010-2011 EIA Electronics,
+// Pieter Beyens <pieter.beyens@eia.be>
+// Copyright (c) 2010-2011 EIA Electronics,
+// Kurt Van Dijck <kurt.van.dijck@eia.be>
+// Copyright (c) 2018 Protonic,
+// Robin van der Gracht <robin@protonic.nl>
+// Copyright (c) 2017-2019 Pengutronix,
+// Marc Kleine-Budde <kernel@pengutronix.de>
+// Copyright (c) 2017-2019 Pengutronix,
+// Oleksij Rempel <kernel@pengutronix.de>
+
+/* Core of can-j1939 that links j1939 to CAN. */
+
+#include <linux/can/can-ml.h>
+#include <linux/can/core.h>
+#include <linux/can/skb.h>
+#include <linux/if_arp.h>
+#include <linux/module.h>
+
+#include "j1939-priv.h"
+
+MODULE_DESCRIPTION("PF_CAN SAE J1939");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("EIA Electronics (Kurt Van Dijck & Pieter Beyens)");
+MODULE_ALIAS("can-proto-" __stringify(CAN_J1939));
+
+/* LOWLEVEL CAN interface */
+
+/* CAN_HDR: #bytes before can_frame data part */
+#define J1939_CAN_HDR (offsetof(struct can_frame, data))
+
+/* CAN_FTR: #bytes beyond data part */
+#define J1939_CAN_FTR (sizeof(struct can_frame) - J1939_CAN_HDR - \
+ sizeof(((struct can_frame *)0)->data))
+
+/* lowest layer */
+static void j1939_can_recv(struct sk_buff *iskb, void *data)
+{
+ struct j1939_priv *priv = data;
+ struct sk_buff *skb;
+ struct j1939_sk_buff_cb *skcb, *iskcb;
+ struct can_frame *cf;
+
+ /* create a copy of the skb
+ * j1939 only delivers the real data bytes,
+ * the header goes into sockaddr.
+ * j1939 may not touch the incoming skb in such way
+ */
+ skb = skb_clone(iskb, GFP_ATOMIC);
+ if (!skb)
+ return;
+
+ j1939_priv_get(priv);
+ can_skb_set_owner(skb, iskb->sk);
+
+ /* get a pointer to the header of the skb
+ * the skb payload (pointer) is moved, so that the next skb_data
+ * returns the actual payload
+ */
+ cf = (void *)skb->data;
+ skb_pull(skb, J1939_CAN_HDR);
+
+ /* fix length, set to dlc, with 8 maximum */
+ skb_trim(skb, min_t(uint8_t, cf->can_dlc, 8));
+
+ /* set addr */
+ skcb = j1939_skb_to_cb(skb);
+ memset(skcb, 0, sizeof(*skcb));
+
+ iskcb = j1939_skb_to_cb(iskb);
+ skcb->tskey = iskcb->tskey;
+ skcb->priority = (cf->can_id >> 26) & 0x7;
+ skcb->addr.sa = cf->can_id;
+ skcb->addr.pgn = (cf->can_id >> 8) & J1939_PGN_MAX;
+ /* set default message type */
+ skcb->addr.type = J1939_TP;
+ if (j1939_pgn_is_pdu1(skcb->addr.pgn)) {
+ /* Type 1: with destination address */
+ skcb->addr.da = skcb->addr.pgn;
+ /* normalize pgn: strip dst address */
+ skcb->addr.pgn &= 0x3ff00;
+ } else {
+ /* set broadcast address */
+ skcb->addr.da = J1939_NO_ADDR;
+ }
+
+ /* update localflags */
+ read_lock_bh(&priv->lock);
+ if (j1939_address_is_unicast(skcb->addr.sa) &&
+ priv->ents[skcb->addr.sa].nusers)
+ skcb->flags |= J1939_ECU_LOCAL_SRC;
+ if (j1939_address_is_unicast(skcb->addr.da) &&
+ priv->ents[skcb->addr.da].nusers)
+ skcb->flags |= J1939_ECU_LOCAL_DST;
+ read_unlock_bh(&priv->lock);
+
+ /* deliver into the j1939 stack ... */
+ j1939_ac_recv(priv, skb);
+
+ if (j1939_tp_recv(priv, skb))
+ /* this means the transport layer processed the message */
+ goto done;
+
+ j1939_simple_recv(priv, skb);
+ j1939_sk_recv(priv, skb);
+ done:
+ j1939_priv_put(priv);
+ kfree_skb(skb);
+}
+
+/* NETDEV MANAGEMENT */
+
+/* values for can_rx_(un)register */
+#define J1939_CAN_ID CAN_EFF_FLAG
+#define J1939_CAN_MASK (CAN_EFF_FLAG | CAN_RTR_FLAG)
+
+static DEFINE_SPINLOCK(j1939_netdev_lock);
+
+static struct j1939_priv *j1939_priv_create(struct net_device *ndev)
+{
+ struct j1939_priv *priv;
+
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+ if (!priv)
+ return NULL;
+
+ rwlock_init(&priv->lock);
+ INIT_LIST_HEAD(&priv->ecus);
+ priv->ndev = ndev;
+ kref_init(&priv->kref);
+ kref_init(&priv->rx_kref);
+ dev_hold(ndev);
+
+ netdev_dbg(priv->ndev, "%s : 0x%p\n", __func__, priv);
+
+ return priv;
+}
+
+static inline void j1939_priv_set(struct net_device *ndev,
+ struct j1939_priv *priv)
+{
+ struct can_ml_priv *can_ml_priv = ndev->ml_priv;
+
+ can_ml_priv->j1939_priv = priv;
+}
+
+static void __j1939_priv_release(struct kref *kref)
+{
+ struct j1939_priv *priv = container_of(kref, struct j1939_priv, kref);
+ struct net_device *ndev = priv->ndev;
+
+ netdev_dbg(priv->ndev, "%s: 0x%p\n", __func__, priv);
+
+ WARN_ON_ONCE(!list_empty(&priv->active_session_list));
+ WARN_ON_ONCE(!list_empty(&priv->ecus));
+ WARN_ON_ONCE(!list_empty(&priv->j1939_socks));
+
+ dev_put(ndev);
+ kfree(priv);
+}
+
+void j1939_priv_put(struct j1939_priv *priv)
+{
+ kref_put(&priv->kref, __j1939_priv_release);
+}
+
+void j1939_priv_get(struct j1939_priv *priv)
+{
+ kref_get(&priv->kref);
+}
+
+static int j1939_can_rx_register(struct j1939_priv *priv)
+{
+ struct net_device *ndev = priv->ndev;
+ int ret;
+
+ j1939_priv_get(priv);
+ ret = can_rx_register(dev_net(ndev), ndev, J1939_CAN_ID, J1939_CAN_MASK,
+ j1939_can_recv, priv, "j1939", NULL);
+ if (ret < 0) {
+ j1939_priv_put(priv);
+ return ret;
+ }
+
+ return 0;
+}
+
+static void j1939_can_rx_unregister(struct j1939_priv *priv)
+{
+ struct net_device *ndev = priv->ndev;
+
+ can_rx_unregister(dev_net(ndev), ndev, J1939_CAN_ID, J1939_CAN_MASK,
+ j1939_can_recv, priv);
+
+ j1939_priv_put(priv);
+}
+
+static void __j1939_rx_release(struct kref *kref)
+ __releases(&j1939_netdev_lock)
+{
+ struct j1939_priv *priv = container_of(kref, struct j1939_priv,
+ rx_kref);
+
+ j1939_can_rx_unregister(priv);
+ j1939_ecu_unmap_all(priv);
+ j1939_priv_set(priv->ndev, NULL);
+ spin_unlock(&j1939_netdev_lock);
+}
+
+/* get pointer to priv without increasing ref counter */
+static inline struct j1939_priv *j1939_ndev_to_priv(struct net_device *ndev)
+{
+ struct can_ml_priv *can_ml_priv = ndev->ml_priv;
+
+ if (!can_ml_priv)
+ return NULL;
+
+ return can_ml_priv->j1939_priv;
+}
+
+static struct j1939_priv *j1939_priv_get_by_ndev_locked(struct net_device *ndev)
+{
+ struct j1939_priv *priv;
+
+ lockdep_assert_held(&j1939_netdev_lock);
+
+ if (ndev->type != ARPHRD_CAN)
+ return NULL;
+
+ priv = j1939_ndev_to_priv(ndev);
+ if (priv)
+ j1939_priv_get(priv);
+
+ return priv;
+}
+
+static struct j1939_priv *j1939_priv_get_by_ndev(struct net_device *ndev)
+{
+ struct j1939_priv *priv;
+
+ spin_lock(&j1939_netdev_lock);
+ priv = j1939_priv_get_by_ndev_locked(ndev);
+ spin_unlock(&j1939_netdev_lock);
+
+ return priv;
+}
+
+struct j1939_priv *j1939_netdev_start(struct net_device *ndev)
+{
+ struct j1939_priv *priv, *priv_new;
+ int ret;
+
+ priv = j1939_priv_get_by_ndev(ndev);
+ if (priv) {
+ kref_get(&priv->rx_kref);
+ return priv;
+ }
+
+ priv = j1939_priv_create(ndev);
+ if (!priv)
+ return ERR_PTR(-ENOMEM);
+
+ j1939_tp_init(priv);
+ spin_lock_init(&priv->j1939_socks_lock);
+ INIT_LIST_HEAD(&priv->j1939_socks);
+
+ spin_lock(&j1939_netdev_lock);
+ priv_new = j1939_priv_get_by_ndev_locked(ndev);
+ if (priv_new) {
+ /* Someone was faster than us, use their priv and roll
+ * back our's.
+ */
+ spin_unlock(&j1939_netdev_lock);
+ dev_put(ndev);
+ kfree(priv);
+ kref_get(&priv_new->rx_kref);
+ return priv_new;
+ }
+ j1939_priv_set(ndev, priv);
+ spin_unlock(&j1939_netdev_lock);
+
+ ret = j1939_can_rx_register(priv);
+ if (ret < 0)
+ goto out_priv_put;
+
+ return priv;
+
+ out_priv_put:
+ j1939_priv_set(ndev, NULL);
+ dev_put(ndev);
+ kfree(priv);
+
+ return ERR_PTR(ret);
+}
+
+void j1939_netdev_stop(struct j1939_priv *priv)
+{
+ kref_put_lock(&priv->rx_kref, __j1939_rx_release, &j1939_netdev_lock);
+ j1939_priv_put(priv);
+}
+
+int j1939_send_one(struct j1939_priv *priv, struct sk_buff *skb)
+{
+ int ret, dlc;
+ canid_t canid;
+ struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb);
+ struct can_frame *cf;
+
+ /* apply sanity checks */
+ if (j1939_pgn_is_pdu1(skcb->addr.pgn))
+ skcb->addr.pgn &= J1939_PGN_PDU1_MAX;
+ else
+ skcb->addr.pgn &= J1939_PGN_MAX;
+
+ if (skcb->priority > 7)
+ skcb->priority = 6;
+
+ ret = j1939_ac_fixup(priv, skb);
+ if (unlikely(ret))
+ goto failed;
+ dlc = skb->len;
+
+ /* re-claim the CAN_HDR from the SKB */
+ cf = skb_push(skb, J1939_CAN_HDR);
+
+ /* make it a full can frame again */
+ skb_put(skb, J1939_CAN_FTR + (8 - dlc));
+
+ canid = CAN_EFF_FLAG |
+ (skcb->priority << 26) |
+ (skcb->addr.pgn << 8) |
+ skcb->addr.sa;
+ if (j1939_pgn_is_pdu1(skcb->addr.pgn))
+ canid |= skcb->addr.da << 8;
+
+ cf->can_id = canid;
+ cf->can_dlc = dlc;
+
+ return can_send(skb, 1);
+
+ failed:
+ kfree_skb(skb);
+ return ret;
+}
+
+static int j1939_netdev_notify(struct notifier_block *nb,
+ unsigned long msg, void *data)
+{
+ struct net_device *ndev = netdev_notifier_info_to_dev(data);
+ struct j1939_priv *priv;
+
+ priv = j1939_priv_get_by_ndev(ndev);
+ if (!priv)
+ goto notify_done;
+
+ if (ndev->type != ARPHRD_CAN)
+ goto notify_put;
+
+ switch (msg) {
+ case NETDEV_DOWN:
+ j1939_cancel_active_session(priv, NULL);
+ j1939_sk_netdev_event_netdown(priv);
+ j1939_ecu_unmap_all(priv);
+ break;
+ }
+
+notify_put:
+ j1939_priv_put(priv);
+
+notify_done:
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block j1939_netdev_notifier = {
+ .notifier_call = j1939_netdev_notify,
+};
+
+/* MODULE interface */
+static __init int j1939_module_init(void)
+{
+ int ret;
+
+ pr_info("can: SAE J1939\n");
+
+ ret = register_netdevice_notifier(&j1939_netdev_notifier);
+ if (ret)
+ goto fail_notifier;
+
+ ret = can_proto_register(&j1939_can_proto);
+ if (ret < 0) {
+ pr_err("can: registration of j1939 protocol failed\n");
+ goto fail_sk;
+ }
+
+ return 0;
+
+ fail_sk:
+ unregister_netdevice_notifier(&j1939_netdev_notifier);
+ fail_notifier:
+ return ret;
+}
+
+static __exit void j1939_module_exit(void)
+{
+ can_proto_unregister(&j1939_can_proto);
+
+ unregister_netdevice_notifier(&j1939_netdev_notifier);
+}
+
+module_init(j1939_module_init);
+module_exit(j1939_module_exit);
diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c
new file mode 100644
index 000000000000..f7587428febd
--- /dev/null
+++ b/net/can/j1939/socket.c
@@ -0,0 +1,1223 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2010-2011 EIA Electronics,
+// Pieter Beyens <pieter.beyens@eia.be>
+// Copyright (c) 2010-2011 EIA Electronics,
+// Kurt Van Dijck <kurt.van.dijck@eia.be>
+// Copyright (c) 2018 Protonic,
+// Robin van der Gracht <robin@protonic.nl>
+// Copyright (c) 2017-2019 Pengutronix,
+// Marc Kleine-Budde <kernel@pengutronix.de>
+// Copyright (c) 2017-2019 Pengutronix,
+// Oleksij Rempel <kernel@pengutronix.de>
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/can/core.h>
+#include <linux/can/skb.h>
+#include <linux/errqueue.h>
+#include <linux/if_arp.h>
+
+#include "j1939-priv.h"
+
+#define J1939_MIN_NAMELEN CAN_REQUIRED_SIZE(struct sockaddr_can, can_addr.j1939)
+
+/* conversion function between struct sock::sk_priority from linux and
+ * j1939 priority field
+ */
+static inline priority_t j1939_prio(u32 sk_priority)
+{
+ sk_priority = min(sk_priority, 7U);
+
+ return 7 - sk_priority;
+}
+
+static inline u32 j1939_to_sk_priority(priority_t prio)
+{
+ return 7 - prio;
+}
+
+/* function to see if pgn is to be evaluated */
+static inline bool j1939_pgn_is_valid(pgn_t pgn)
+{
+ return pgn <= J1939_PGN_MAX;
+}
+
+/* test function to avoid non-zero DA placeholder for pdu1 pgn's */
+static inline bool j1939_pgn_is_clean_pdu(pgn_t pgn)
+{
+ if (j1939_pgn_is_pdu1(pgn))
+ return !(pgn & 0xff);
+ else
+ return true;
+}
+
+static inline void j1939_sock_pending_add(struct sock *sk)
+{
+ struct j1939_sock *jsk = j1939_sk(sk);
+
+ atomic_inc(&jsk->skb_pending);
+}
+
+static int j1939_sock_pending_get(struct sock *sk)
+{
+ struct j1939_sock *jsk = j1939_sk(sk);
+
+ return atomic_read(&jsk->skb_pending);
+}
+
+void j1939_sock_pending_del(struct sock *sk)
+{
+ struct j1939_sock *jsk = j1939_sk(sk);
+
+ /* atomic_dec_return returns the new value */
+ if (!atomic_dec_return(&jsk->skb_pending))
+ wake_up(&jsk->waitq); /* no pending SKB's */
+}
+
+static void j1939_jsk_add(struct j1939_priv *priv, struct j1939_sock *jsk)
+{
+ jsk->state |= J1939_SOCK_BOUND;
+ j1939_priv_get(priv);
+
+ spin_lock_bh(&priv->j1939_socks_lock);
+ list_add_tail(&jsk->list, &priv->j1939_socks);
+ spin_unlock_bh(&priv->j1939_socks_lock);
+}
+
+static void j1939_jsk_del(struct j1939_priv *priv, struct j1939_sock *jsk)
+{
+ spin_lock_bh(&priv->j1939_socks_lock);
+ list_del_init(&jsk->list);
+ spin_unlock_bh(&priv->j1939_socks_lock);
+
+ j1939_priv_put(priv);
+ jsk->state &= ~J1939_SOCK_BOUND;
+}
+
+static bool j1939_sk_queue_session(struct j1939_session *session)
+{
+ struct j1939_sock *jsk = j1939_sk(session->sk);
+ bool empty;
+
+ spin_lock_bh(&jsk->sk_session_queue_lock);
+ empty = list_empty(&jsk->sk_session_queue);
+ j1939_session_get(session);
+ list_add_tail(&session->sk_session_queue_entry, &jsk->sk_session_queue);
+ spin_unlock_bh(&jsk->sk_session_queue_lock);
+ j1939_sock_pending_add(&jsk->sk);
+
+ return empty;
+}
+
+static struct
+j1939_session *j1939_sk_get_incomplete_session(struct j1939_sock *jsk)
+{
+ struct j1939_session *session = NULL;
+
+ spin_lock_bh(&jsk->sk_session_queue_lock);
+ if (!list_empty(&jsk->sk_session_queue)) {
+ session = list_last_entry(&jsk->sk_session_queue,
+ struct j1939_session,
+ sk_session_queue_entry);
+ if (session->total_queued_size == session->total_message_size)
+ session = NULL;
+ else
+ j1939_session_get(session);
+ }
+ spin_unlock_bh(&jsk->sk_session_queue_lock);
+
+ return session;
+}
+
+static void j1939_sk_queue_drop_all(struct j1939_priv *priv,
+ struct j1939_sock *jsk, int err)
+{
+ struct j1939_session *session, *tmp;
+
+ netdev_dbg(priv->ndev, "%s: err: %i\n", __func__, err);
+ spin_lock_bh(&jsk->sk_session_queue_lock);
+ list_for_each_entry_safe(session, tmp, &jsk->sk_session_queue,
+ sk_session_queue_entry) {
+ list_del_init(&session->sk_session_queue_entry);
+ session->err = err;
+ j1939_session_put(session);
+ }
+ spin_unlock_bh(&jsk->sk_session_queue_lock);
+}
+
+static void j1939_sk_queue_activate_next_locked(struct j1939_session *session)
+{
+ struct j1939_sock *jsk;
+ struct j1939_session *first;
+ int err;
+
+ /* RX-Session don't have a socket (yet) */
+ if (!session->sk)
+ return;
+
+ jsk = j1939_sk(session->sk);
+ lockdep_assert_held(&jsk->sk_session_queue_lock);
+
+ err = session->err;
+
+ first = list_first_entry_or_null(&jsk->sk_session_queue,
+ struct j1939_session,
+ sk_session_queue_entry);
+
+ /* Some else has already activated the next session */
+ if (first != session)
+ return;
+
+activate_next:
+ list_del_init(&first->sk_session_queue_entry);
+ j1939_session_put(first);
+ first = list_first_entry_or_null(&jsk->sk_session_queue,
+ struct j1939_session,
+ sk_session_queue_entry);
+ if (!first)
+ return;
+
+ if (WARN_ON_ONCE(j1939_session_activate(first))) {
+ first->err = -EBUSY;
+ goto activate_next;
+ } else {
+ /* Give receiver some time (arbitrary chosen) to recover */
+ int time_ms = 0;
+
+ if (err)
+ time_ms = 10 + prandom_u32_max(16);
+
+ j1939_tp_schedule_txtimer(first, time_ms);
+ }
+}
+
+void j1939_sk_queue_activate_next(struct j1939_session *session)
+{
+ struct j1939_sock *jsk;
+
+ if (!session->sk)
+ return;
+
+ jsk = j1939_sk(session->sk);
+
+ spin_lock_bh(&jsk->sk_session_queue_lock);
+ j1939_sk_queue_activate_next_locked(session);
+ spin_unlock_bh(&jsk->sk_session_queue_lock);
+}
+
+static bool j1939_sk_match_dst(struct j1939_sock *jsk,
+ const struct j1939_sk_buff_cb *skcb)
+{
+ if ((jsk->state & J1939_SOCK_PROMISC))
+ return true;
+
+ /* Destination address filter */
+ if (jsk->addr.src_name && skcb->addr.dst_name) {
+ if (jsk->addr.src_name != skcb->addr.dst_name)
+ return false;
+ } else {
+ /* receive (all sockets) if
+ * - all packages that match our bind() address
+ * - all broadcast on a socket if SO_BROADCAST
+ * is set
+ */
+ if (j1939_address_is_unicast(skcb->addr.da)) {
+ if (jsk->addr.sa != skcb->addr.da)
+ return false;
+ } else if (!sock_flag(&jsk->sk, SOCK_BROADCAST)) {
+ /* receiving broadcast without SO_BROADCAST
+ * flag is not allowed
+ */
+ return false;
+ }
+ }
+
+ /* Source address filter */
+ if (jsk->state & J1939_SOCK_CONNECTED) {
+ /* receive (all sockets) if
+ * - all packages that match our connect() name or address
+ */
+ if (jsk->addr.dst_name && skcb->addr.src_name) {
+ if (jsk->addr.dst_name != skcb->addr.src_name)
+ return false;
+ } else {
+ if (jsk->addr.da != skcb->addr.sa)
+ return false;
+ }
+ }
+
+ /* PGN filter */
+ if (j1939_pgn_is_valid(jsk->pgn_rx_filter) &&
+ jsk->pgn_rx_filter != skcb->addr.pgn)
+ return false;
+
+ return true;
+}
+
+/* matches skb control buffer (addr) with a j1939 filter */
+static bool j1939_sk_match_filter(struct j1939_sock *jsk,
+ const struct j1939_sk_buff_cb *skcb)
+{
+ const struct j1939_filter *f = jsk->filters;
+ int nfilter = jsk->nfilters;
+
+ if (!nfilter)
+ /* receive all when no filters are assigned */
+ return true;
+
+ for (; nfilter; ++f, --nfilter) {
+ if ((skcb->addr.pgn & f->pgn_mask) != f->pgn)
+ continue;
+ if ((skcb->addr.sa & f->addr_mask) != f->addr)
+ continue;
+ if ((skcb->addr.src_name & f->name_mask) != f->name)
+ continue;
+ return true;
+ }
+ return false;
+}
+
+static bool j1939_sk_recv_match_one(struct j1939_sock *jsk,
+ const struct j1939_sk_buff_cb *skcb)
+{
+ if (!(jsk->state & J1939_SOCK_BOUND))
+ return false;
+
+ if (!j1939_sk_match_dst(jsk, skcb))
+ return false;
+
+ if (!j1939_sk_match_filter(jsk, skcb))
+ return false;
+
+ return true;
+}
+
+static void j1939_sk_recv_one(struct j1939_sock *jsk, struct sk_buff *oskb)
+{
+ const struct j1939_sk_buff_cb *oskcb = j1939_skb_to_cb(oskb);
+ struct j1939_sk_buff_cb *skcb;
+ struct sk_buff *skb;
+
+ if (oskb->sk == &jsk->sk)
+ return;
+
+ if (!j1939_sk_recv_match_one(jsk, oskcb))
+ return;
+
+ skb = skb_clone(oskb, GFP_ATOMIC);
+ if (!skb) {
+ pr_warn("skb clone failed\n");
+ return;
+ }
+ can_skb_set_owner(skb, oskb->sk);
+
+ skcb = j1939_skb_to_cb(skb);
+ skcb->msg_flags &= ~(MSG_DONTROUTE);
+ if (skb->sk)
+ skcb->msg_flags |= MSG_DONTROUTE;
+
+ if (sock_queue_rcv_skb(&jsk->sk, skb) < 0)
+ kfree_skb(skb);
+}
+
+bool j1939_sk_recv_match(struct j1939_priv *priv, struct j1939_sk_buff_cb *skcb)
+{
+ struct j1939_sock *jsk;
+ bool match = false;
+
+ spin_lock_bh(&priv->j1939_socks_lock);
+ list_for_each_entry(jsk, &priv->j1939_socks, list) {
+ match = j1939_sk_recv_match_one(jsk, skcb);
+ if (match)
+ break;
+ }
+ spin_unlock_bh(&priv->j1939_socks_lock);
+
+ return match;
+}
+
+void j1939_sk_recv(struct j1939_priv *priv, struct sk_buff *skb)
+{
+ struct j1939_sock *jsk;
+
+ spin_lock_bh(&priv->j1939_socks_lock);
+ list_for_each_entry(jsk, &priv->j1939_socks, list) {
+ j1939_sk_recv_one(jsk, skb);
+ }
+ spin_unlock_bh(&priv->j1939_socks_lock);
+}
+
+static void j1939_sk_sock_destruct(struct sock *sk)
+{
+ struct j1939_sock *jsk = j1939_sk(sk);
+
+ /* This function will be call by the generic networking code, when then
+ * the socket is ultimately closed (sk->sk_destruct).
+ *
+ * The race between
+ * - processing a received CAN frame
+ * (can_receive -> j1939_can_recv)
+ * and accessing j1939_priv
+ * ... and ...
+ * - closing a socket
+ * (j1939_can_rx_unregister -> can_rx_unregister)
+ * and calling the final j1939_priv_put()
+ *
+ * is avoided by calling the final j1939_priv_put() from this
+ * RCU deferred cleanup call.
+ */
+ if (jsk->priv) {
+ j1939_priv_put(jsk->priv);
+ jsk->priv = NULL;
+ }
+
+ /* call generic CAN sock destruct */
+ can_sock_destruct(sk);
+}
+
+static int j1939_sk_init(struct sock *sk)
+{
+ struct j1939_sock *jsk = j1939_sk(sk);
+
+ /* Ensure that "sk" is first member in "struct j1939_sock", so that we
+ * can skip it during memset().
+ */
+ BUILD_BUG_ON(offsetof(struct j1939_sock, sk) != 0);
+ memset((void *)jsk + sizeof(jsk->sk), 0x0,
+ sizeof(*jsk) - sizeof(jsk->sk));
+
+ INIT_LIST_HEAD(&jsk->list);
+ init_waitqueue_head(&jsk->waitq);
+ jsk->sk.sk_priority = j1939_to_sk_priority(6);
+ jsk->sk.sk_reuse = 1; /* per default */
+ jsk->addr.sa = J1939_NO_ADDR;
+ jsk->addr.da = J1939_NO_ADDR;
+ jsk->addr.pgn = J1939_NO_PGN;
+ jsk->pgn_rx_filter = J1939_NO_PGN;
+ atomic_set(&jsk->skb_pending, 0);
+ spin_lock_init(&jsk->sk_session_queue_lock);
+ INIT_LIST_HEAD(&jsk->sk_session_queue);
+ sk->sk_destruct = j1939_sk_sock_destruct;
+
+ return 0;
+}
+
+static int j1939_sk_sanity_check(struct sockaddr_can *addr, int len)
+{
+ if (!addr)
+ return -EDESTADDRREQ;
+ if (len < J1939_MIN_NAMELEN)
+ return -EINVAL;
+ if (addr->can_family != AF_CAN)
+ return -EINVAL;
+ if (!addr->can_ifindex)
+ return -ENODEV;
+ if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn) &&
+ !j1939_pgn_is_clean_pdu(addr->can_addr.j1939.pgn))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int j1939_sk_bind(struct socket *sock, struct sockaddr *uaddr, int len)
+{
+ struct sockaddr_can *addr = (struct sockaddr_can *)uaddr;
+ struct j1939_sock *jsk = j1939_sk(sock->sk);
+ struct j1939_priv *priv;
+ struct sock *sk;
+ struct net *net;
+ int ret = 0;
+
+ ret = j1939_sk_sanity_check(addr, len);
+ if (ret)
+ return ret;
+
+ lock_sock(sock->sk);
+
+ priv = jsk->priv;
+ sk = sock->sk;
+ net = sock_net(sk);
+
+ /* Already bound to an interface? */
+ if (jsk->state & J1939_SOCK_BOUND) {
+ /* A re-bind() to a different interface is not
+ * supported.
+ */
+ if (jsk->ifindex != addr->can_ifindex) {
+ ret = -EINVAL;
+ goto out_release_sock;
+ }
+
+ /* drop old references */
+ j1939_jsk_del(priv, jsk);
+ j1939_local_ecu_put(priv, jsk->addr.src_name, jsk->addr.sa);
+ } else {
+ struct net_device *ndev;
+
+ ndev = dev_get_by_index(net, addr->can_ifindex);
+ if (!ndev) {
+ ret = -ENODEV;
+ goto out_release_sock;
+ }
+
+ if (ndev->type != ARPHRD_CAN) {
+ dev_put(ndev);
+ ret = -ENODEV;
+ goto out_release_sock;
+ }
+
+ priv = j1939_netdev_start(ndev);
+ dev_put(ndev);
+ if (IS_ERR(priv)) {
+ ret = PTR_ERR(priv);
+ goto out_release_sock;
+ }
+
+ jsk->ifindex = addr->can_ifindex;
+
+ /* the corresponding j1939_priv_put() is called via
+ * sk->sk_destruct, which points to j1939_sk_sock_destruct()
+ */
+ j1939_priv_get(priv);
+ jsk->priv = priv;
+ }
+
+ /* set default transmit pgn */
+ if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn))
+ jsk->pgn_rx_filter = addr->can_addr.j1939.pgn;
+ jsk->addr.src_name = addr->can_addr.j1939.name;
+ jsk->addr.sa = addr->can_addr.j1939.addr;
+
+ /* get new references */
+ ret = j1939_local_ecu_get(priv, jsk->addr.src_name, jsk->addr.sa);
+ if (ret) {
+ j1939_netdev_stop(priv);
+ goto out_release_sock;
+ }
+
+ j1939_jsk_add(priv, jsk);
+
+ out_release_sock: /* fall through */
+ release_sock(sock->sk);
+
+ return ret;
+}
+
+static int j1939_sk_connect(struct socket *sock, struct sockaddr *uaddr,
+ int len, int flags)
+{
+ struct sockaddr_can *addr = (struct sockaddr_can *)uaddr;
+ struct j1939_sock *jsk = j1939_sk(sock->sk);
+ int ret = 0;
+
+ ret = j1939_sk_sanity_check(addr, len);
+ if (ret)
+ return ret;
+
+ lock_sock(sock->sk);
+
+ /* bind() before connect() is mandatory */
+ if (!(jsk->state & J1939_SOCK_BOUND)) {
+ ret = -EINVAL;
+ goto out_release_sock;
+ }
+
+ /* A connect() to a different interface is not supported. */
+ if (jsk->ifindex != addr->can_ifindex) {
+ ret = -EINVAL;
+ goto out_release_sock;
+ }
+
+ if (!addr->can_addr.j1939.name &&
+ addr->can_addr.j1939.addr == J1939_NO_ADDR &&
+ !sock_flag(&jsk->sk, SOCK_BROADCAST)) {
+ /* broadcast, but SO_BROADCAST not set */
+ ret = -EACCES;
+ goto out_release_sock;
+ }
+
+ jsk->addr.dst_name = addr->can_addr.j1939.name;
+ jsk->addr.da = addr->can_addr.j1939.addr;
+
+ if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn))
+ jsk->addr.pgn = addr->can_addr.j1939.pgn;
+
+ jsk->state |= J1939_SOCK_CONNECTED;
+
+ out_release_sock: /* fall through */
+ release_sock(sock->sk);
+
+ return ret;
+}
+
+static void j1939_sk_sock2sockaddr_can(struct sockaddr_can *addr,
+ const struct j1939_sock *jsk, int peer)
+{
+ addr->can_family = AF_CAN;
+ addr->can_ifindex = jsk->ifindex;
+ addr->can_addr.j1939.pgn = jsk->addr.pgn;
+ if (peer) {
+ addr->can_addr.j1939.name = jsk->addr.dst_name;
+ addr->can_addr.j1939.addr = jsk->addr.da;
+ } else {
+ addr->can_addr.j1939.name = jsk->addr.src_name;
+ addr->can_addr.j1939.addr = jsk->addr.sa;
+ }
+}
+
+static int j1939_sk_getname(struct socket *sock, struct sockaddr *uaddr,
+ int peer)
+{
+ struct sockaddr_can *addr = (struct sockaddr_can *)uaddr;
+ struct sock *sk = sock->sk;
+ struct j1939_sock *jsk = j1939_sk(sk);
+ int ret = 0;
+
+ lock_sock(sk);
+
+ if (peer && !(jsk->state & J1939_SOCK_CONNECTED)) {
+ ret = -EADDRNOTAVAIL;
+ goto failure;
+ }
+
+ j1939_sk_sock2sockaddr_can(addr, jsk, peer);
+ ret = J1939_MIN_NAMELEN;
+
+ failure:
+ release_sock(sk);
+
+ return ret;
+}
+
+static int j1939_sk_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ struct j1939_sock *jsk;
+
+ if (!sk)
+ return 0;
+
+ lock_sock(sk);
+ jsk = j1939_sk(sk);
+
+ if (jsk->state & J1939_SOCK_BOUND) {
+ struct j1939_priv *priv = jsk->priv;
+
+ if (wait_event_interruptible(jsk->waitq,
+ !j1939_sock_pending_get(&jsk->sk))) {
+ j1939_cancel_active_session(priv, sk);
+ j1939_sk_queue_drop_all(priv, jsk, ESHUTDOWN);
+ }
+
+ j1939_jsk_del(priv, jsk);
+
+ j1939_local_ecu_put(priv, jsk->addr.src_name,
+ jsk->addr.sa);
+
+ j1939_netdev_stop(priv);
+ }
+
+ kfree(jsk->filters);
+ sock_orphan(sk);
+ sock->sk = NULL;
+
+ release_sock(sk);
+ sock_put(sk);
+
+ return 0;
+}
+
+static int j1939_sk_setsockopt_flag(struct j1939_sock *jsk, char __user *optval,
+ unsigned int optlen, int flag)
+{
+ int tmp;
+
+ if (optlen != sizeof(tmp))
+ return -EINVAL;
+ if (copy_from_user(&tmp, optval, optlen))
+ return -EFAULT;
+ lock_sock(&jsk->sk);
+ if (tmp)
+ jsk->state |= flag;
+ else
+ jsk->state &= ~flag;
+ release_sock(&jsk->sk);
+ return tmp;
+}
+
+static int j1939_sk_setsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ struct sock *sk = sock->sk;
+ struct j1939_sock *jsk = j1939_sk(sk);
+ int tmp, count = 0, ret = 0;
+ struct j1939_filter *filters = NULL, *ofilters;
+
+ if (level != SOL_CAN_J1939)
+ return -EINVAL;
+
+ switch (optname) {
+ case SO_J1939_FILTER:
+ if (optval) {
+ struct j1939_filter *f;
+ int c;
+
+ if (optlen % sizeof(*filters) != 0)
+ return -EINVAL;
+
+ if (optlen > J1939_FILTER_MAX *
+ sizeof(struct j1939_filter))
+ return -EINVAL;
+
+ count = optlen / sizeof(*filters);
+ filters = memdup_user(optval, optlen);
+ if (IS_ERR(filters))
+ return PTR_ERR(filters);
+
+ for (f = filters, c = count; c; f++, c--) {
+ f->name &= f->name_mask;
+ f->pgn &= f->pgn_mask;
+ f->addr &= f->addr_mask;
+ }
+ }
+
+ lock_sock(&jsk->sk);
+ ofilters = jsk->filters;
+ jsk->filters = filters;
+ jsk->nfilters = count;
+ release_sock(&jsk->sk);
+ kfree(ofilters);
+ return 0;
+ case SO_J1939_PROMISC:
+ return j1939_sk_setsockopt_flag(jsk, optval, optlen,
+ J1939_SOCK_PROMISC);
+ case SO_J1939_ERRQUEUE:
+ ret = j1939_sk_setsockopt_flag(jsk, optval, optlen,
+ J1939_SOCK_ERRQUEUE);
+ if (ret < 0)
+ return ret;
+
+ if (!(jsk->state & J1939_SOCK_ERRQUEUE))
+ skb_queue_purge(&sk->sk_error_queue);
+ return ret;
+ case SO_J1939_SEND_PRIO:
+ if (optlen != sizeof(tmp))
+ return -EINVAL;
+ if (copy_from_user(&tmp, optval, optlen))
+ return -EFAULT;
+ if (tmp < 0 || tmp > 7)
+ return -EDOM;
+ if (tmp < 2 && !capable(CAP_NET_ADMIN))
+ return -EPERM;
+ lock_sock(&jsk->sk);
+ jsk->sk.sk_priority = j1939_to_sk_priority(tmp);
+ release_sock(&jsk->sk);
+ return 0;
+ default:
+ return -ENOPROTOOPT;
+ }
+}
+
+static int j1939_sk_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct sock *sk = sock->sk;
+ struct j1939_sock *jsk = j1939_sk(sk);
+ int ret, ulen;
+ /* set defaults for using 'int' properties */
+ int tmp = 0;
+ int len = sizeof(tmp);
+ void *val = &tmp;
+
+ if (level != SOL_CAN_J1939)
+ return -EINVAL;
+ if (get_user(ulen, optlen))
+ return -EFAULT;
+ if (ulen < 0)
+ return -EINVAL;
+
+ lock_sock(&jsk->sk);
+ switch (optname) {
+ case SO_J1939_PROMISC:
+ tmp = (jsk->state & J1939_SOCK_PROMISC) ? 1 : 0;
+ break;
+ case SO_J1939_ERRQUEUE:
+ tmp = (jsk->state & J1939_SOCK_ERRQUEUE) ? 1 : 0;
+ break;
+ case SO_J1939_SEND_PRIO:
+ tmp = j1939_prio(jsk->sk.sk_priority);
+ break;
+ default:
+ ret = -ENOPROTOOPT;
+ goto no_copy;
+ }
+
+ /* copy to user, based on 'len' & 'val'
+ * but most sockopt's are 'int' properties, and have 'len' & 'val'
+ * left unchanged, but instead modified 'tmp'
+ */
+ if (len > ulen)
+ ret = -EFAULT;
+ else if (put_user(len, optlen))
+ ret = -EFAULT;
+ else if (copy_to_user(optval, val, len))
+ ret = -EFAULT;
+ else
+ ret = 0;
+ no_copy:
+ release_sock(&jsk->sk);
+ return ret;
+}
+
+static int j1939_sk_recvmsg(struct socket *sock, struct msghdr *msg,
+ size_t size, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct sk_buff *skb;
+ struct j1939_sk_buff_cb *skcb;
+ int ret = 0;
+
+ if (flags & ~(MSG_DONTWAIT | MSG_ERRQUEUE))
+ return -EINVAL;
+
+ if (flags & MSG_ERRQUEUE)
+ return sock_recv_errqueue(sock->sk, msg, size, SOL_CAN_J1939,
+ SCM_J1939_ERRQUEUE);
+
+ skb = skb_recv_datagram(sk, flags, 0, &ret);
+ if (!skb)
+ return ret;
+
+ if (size < skb->len)
+ msg->msg_flags |= MSG_TRUNC;
+ else
+ size = skb->len;
+
+ ret = memcpy_to_msg(msg, skb->data, size);
+ if (ret < 0) {
+ skb_free_datagram(sk, skb);
+ return ret;
+ }
+
+ skcb = j1939_skb_to_cb(skb);
+ if (j1939_address_is_valid(skcb->addr.da))
+ put_cmsg(msg, SOL_CAN_J1939, SCM_J1939_DEST_ADDR,
+ sizeof(skcb->addr.da), &skcb->addr.da);
+
+ if (skcb->addr.dst_name)
+ put_cmsg(msg, SOL_CAN_J1939, SCM_J1939_DEST_NAME,
+ sizeof(skcb->addr.dst_name), &skcb->addr.dst_name);
+
+ put_cmsg(msg, SOL_CAN_J1939, SCM_J1939_PRIO,
+ sizeof(skcb->priority), &skcb->priority);
+
+ if (msg->msg_name) {
+ struct sockaddr_can *paddr = msg->msg_name;
+
+ msg->msg_namelen = J1939_MIN_NAMELEN;
+ memset(msg->msg_name, 0, msg->msg_namelen);
+ paddr->can_family = AF_CAN;
+ paddr->can_ifindex = skb->skb_iif;
+ paddr->can_addr.j1939.name = skcb->addr.src_name;
+ paddr->can_addr.j1939.addr = skcb->addr.sa;
+ paddr->can_addr.j1939.pgn = skcb->addr.pgn;
+ }
+
+ sock_recv_ts_and_drops(msg, sk, skb);
+ msg->msg_flags |= skcb->msg_flags;
+ skb_free_datagram(sk, skb);
+
+ return size;
+}
+
+static struct sk_buff *j1939_sk_alloc_skb(struct net_device *ndev,
+ struct sock *sk,
+ struct msghdr *msg, size_t size,
+ int *errcode)
+{
+ struct j1939_sock *jsk = j1939_sk(sk);
+ struct j1939_sk_buff_cb *skcb;
+ struct sk_buff *skb;
+ int ret;
+
+ skb = sock_alloc_send_skb(sk,
+ size +
+ sizeof(struct can_frame) -
+ sizeof(((struct can_frame *)NULL)->data) +
+ sizeof(struct can_skb_priv),
+ msg->msg_flags & MSG_DONTWAIT, &ret);
+ if (!skb)
+ goto failure;
+
+ can_skb_reserve(skb);
+ can_skb_prv(skb)->ifindex = ndev->ifindex;
+ can_skb_prv(skb)->skbcnt = 0;
+ skb_reserve(skb, offsetof(struct can_frame, data));
+
+ ret = memcpy_from_msg(skb_put(skb, size), msg, size);
+ if (ret < 0)
+ goto free_skb;
+
+ skb->dev = ndev;
+
+ skcb = j1939_skb_to_cb(skb);
+ memset(skcb, 0, sizeof(*skcb));
+ skcb->addr = jsk->addr;
+ skcb->priority = j1939_prio(sk->sk_priority);
+
+ if (msg->msg_name) {
+ struct sockaddr_can *addr = msg->msg_name;
+
+ if (addr->can_addr.j1939.name ||
+ addr->can_addr.j1939.addr != J1939_NO_ADDR) {
+ skcb->addr.dst_name = addr->can_addr.j1939.name;
+ skcb->addr.da = addr->can_addr.j1939.addr;
+ }
+ if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn))
+ skcb->addr.pgn = addr->can_addr.j1939.pgn;
+ }
+
+ *errcode = ret;
+ return skb;
+
+free_skb:
+ kfree_skb(skb);
+failure:
+ *errcode = ret;
+ return NULL;
+}
+
+static size_t j1939_sk_opt_stats_get_size(void)
+{
+ return
+ nla_total_size(sizeof(u32)) + /* J1939_NLA_BYTES_ACKED */
+ 0;
+}
+
+static struct sk_buff *
+j1939_sk_get_timestamping_opt_stats(struct j1939_session *session)
+{
+ struct sk_buff *stats;
+ u32 size;
+
+ stats = alloc_skb(j1939_sk_opt_stats_get_size(), GFP_ATOMIC);
+ if (!stats)
+ return NULL;
+
+ if (session->skcb.addr.type == J1939_SIMPLE)
+ size = session->total_message_size;
+ else
+ size = min(session->pkt.tx_acked * 7,
+ session->total_message_size);
+
+ nla_put_u32(stats, J1939_NLA_BYTES_ACKED, size);
+
+ return stats;
+}
+
+void j1939_sk_errqueue(struct j1939_session *session,
+ enum j1939_sk_errqueue_type type)
+{
+ struct j1939_priv *priv = session->priv;
+ struct sock *sk = session->sk;
+ struct j1939_sock *jsk;
+ struct sock_exterr_skb *serr;
+ struct sk_buff *skb;
+ char *state = "UNK";
+ int err;
+
+ /* currently we have no sk for the RX session */
+ if (!sk)
+ return;
+
+ jsk = j1939_sk(sk);
+
+ if (!(jsk->state & J1939_SOCK_ERRQUEUE))
+ return;
+
+ skb = j1939_sk_get_timestamping_opt_stats(session);
+ if (!skb)
+ return;
+
+ skb->tstamp = ktime_get_real();
+
+ BUILD_BUG_ON(sizeof(struct sock_exterr_skb) > sizeof(skb->cb));
+
+ serr = SKB_EXT_ERR(skb);
+ memset(serr, 0, sizeof(*serr));
+ switch (type) {
+ case J1939_ERRQUEUE_ACK:
+ if (!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_ACK)) {
+ kfree_skb(skb);
+ return;
+ }
+
+ serr->ee.ee_errno = ENOMSG;
+ serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;
+ serr->ee.ee_info = SCM_TSTAMP_ACK;
+ state = "ACK";
+ break;
+ case J1939_ERRQUEUE_SCHED:
+ if (!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_SCHED)) {
+ kfree_skb(skb);
+ return;
+ }
+
+ serr->ee.ee_errno = ENOMSG;
+ serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;
+ serr->ee.ee_info = SCM_TSTAMP_SCHED;
+ state = "SCH";
+ break;
+ case J1939_ERRQUEUE_ABORT:
+ serr->ee.ee_errno = session->err;
+ serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL;
+ serr->ee.ee_info = J1939_EE_INFO_TX_ABORT;
+ state = "ABT";
+ break;
+ default:
+ netdev_err(priv->ndev, "Unknown errqueue type %i\n", type);
+ }
+
+ serr->opt_stats = true;
+ if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
+ serr->ee.ee_data = session->tskey;
+
+ netdev_dbg(session->priv->ndev, "%s: 0x%p tskey: %i, state: %s\n",
+ __func__, session, session->tskey, state);
+ err = sock_queue_err_skb(sk, skb);
+
+ if (err)
+ kfree_skb(skb);
+};
+
+void j1939_sk_send_loop_abort(struct sock *sk, int err)
+{
+ sk->sk_err = err;
+
+ sk->sk_error_report(sk);
+}
+
+static int j1939_sk_send_loop(struct j1939_priv *priv, struct sock *sk,
+ struct msghdr *msg, size_t size)
+
+{
+ struct j1939_sock *jsk = j1939_sk(sk);
+ struct j1939_session *session = j1939_sk_get_incomplete_session(jsk);
+ struct sk_buff *skb;
+ size_t segment_size, todo_size;
+ int ret = 0;
+
+ if (session &&
+ session->total_message_size != session->total_queued_size + size) {
+ j1939_session_put(session);
+ return -EIO;
+ }
+
+ todo_size = size;
+
+ while (todo_size) {
+ struct j1939_sk_buff_cb *skcb;
+
+ segment_size = min_t(size_t, J1939_MAX_TP_PACKET_SIZE,
+ todo_size);
+
+ /* Allocate skb for one segment */
+ skb = j1939_sk_alloc_skb(priv->ndev, sk, msg, segment_size,
+ &ret);
+ if (ret)
+ break;
+
+ skcb = j1939_skb_to_cb(skb);
+
+ if (!session) {
+ /* at this point the size should be full size
+ * of the session
+ */
+ skcb->offset = 0;
+ session = j1939_tp_send(priv, skb, size);
+ if (IS_ERR(session)) {
+ ret = PTR_ERR(session);
+ goto kfree_skb;
+ }
+ if (j1939_sk_queue_session(session)) {
+ /* try to activate session if we a
+ * fist in the queue
+ */
+ if (!j1939_session_activate(session)) {
+ j1939_tp_schedule_txtimer(session, 0);
+ } else {
+ ret = -EBUSY;
+ session->err = ret;
+ j1939_sk_queue_drop_all(priv, jsk,
+ EBUSY);
+ break;
+ }
+ }
+ } else {
+ skcb->offset = session->total_queued_size;
+ j1939_session_skb_queue(session, skb);
+ }
+
+ todo_size -= segment_size;
+ session->total_queued_size += segment_size;
+ }
+
+ switch (ret) {
+ case 0: /* OK */
+ if (todo_size)
+ netdev_warn(priv->ndev,
+ "no error found and not completely queued?! %zu\n",
+ todo_size);
+ ret = size;
+ break;
+ case -ERESTARTSYS:
+ ret = -EINTR;
+ /* fall through */
+ case -EAGAIN: /* OK */
+ if (todo_size != size)
+ ret = size - todo_size;
+ break;
+ default: /* ERROR */
+ break;
+ }
+
+ if (session)
+ j1939_session_put(session);
+
+ return ret;
+
+ kfree_skb:
+ kfree_skb(skb);
+ return ret;
+}
+
+static int j1939_sk_sendmsg(struct socket *sock, struct msghdr *msg,
+ size_t size)
+{
+ struct sock *sk = sock->sk;
+ struct j1939_sock *jsk = j1939_sk(sk);
+ struct j1939_priv *priv;
+ int ifindex;
+ int ret;
+
+ lock_sock(sock->sk);
+ /* various socket state tests */
+ if (!(jsk->state & J1939_SOCK_BOUND)) {
+ ret = -EBADFD;
+ goto sendmsg_done;
+ }
+
+ priv = jsk->priv;
+ ifindex = jsk->ifindex;
+
+ if (!jsk->addr.src_name && jsk->addr.sa == J1939_NO_ADDR) {
+ /* no source address assigned yet */
+ ret = -EBADFD;
+ goto sendmsg_done;
+ }
+
+ /* deal with provided destination address info */
+ if (msg->msg_name) {
+ struct sockaddr_can *addr = msg->msg_name;
+
+ if (msg->msg_namelen < J1939_MIN_NAMELEN) {
+ ret = -EINVAL;
+ goto sendmsg_done;
+ }
+
+ if (addr->can_family != AF_CAN) {
+ ret = -EINVAL;
+ goto sendmsg_done;
+ }
+
+ if (addr->can_ifindex && addr->can_ifindex != ifindex) {
+ ret = -EBADFD;
+ goto sendmsg_done;
+ }
+
+ if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn) &&
+ !j1939_pgn_is_clean_pdu(addr->can_addr.j1939.pgn)) {
+ ret = -EINVAL;
+ goto sendmsg_done;
+ }
+
+ if (!addr->can_addr.j1939.name &&
+ addr->can_addr.j1939.addr == J1939_NO_ADDR &&
+ !sock_flag(sk, SOCK_BROADCAST)) {
+ /* broadcast, but SO_BROADCAST not set */
+ ret = -EACCES;
+ goto sendmsg_done;
+ }
+ } else {
+ if (!jsk->addr.dst_name && jsk->addr.da == J1939_NO_ADDR &&
+ !sock_flag(sk, SOCK_BROADCAST)) {
+ /* broadcast, but SO_BROADCAST not set */
+ ret = -EACCES;
+ goto sendmsg_done;
+ }
+ }
+
+ ret = j1939_sk_send_loop(priv, sk, msg, size);
+
+sendmsg_done:
+ release_sock(sock->sk);
+
+ return ret;
+}
+
+void j1939_sk_netdev_event_netdown(struct j1939_priv *priv)
+{
+ struct j1939_sock *jsk;
+ int error_code = ENETDOWN;
+
+ spin_lock_bh(&priv->j1939_socks_lock);
+ list_for_each_entry(jsk, &priv->j1939_socks, list) {
+ jsk->sk.sk_err = error_code;
+ if (!sock_flag(&jsk->sk, SOCK_DEAD))
+ jsk->sk.sk_error_report(&jsk->sk);
+
+ j1939_sk_queue_drop_all(priv, jsk, error_code);
+ }
+ spin_unlock_bh(&priv->j1939_socks_lock);
+}
+
+static int j1939_sk_no_ioctlcmd(struct socket *sock, unsigned int cmd,
+ unsigned long arg)
+{
+ /* no ioctls for socket layer -> hand it down to NIC layer */
+ return -ENOIOCTLCMD;
+}
+
+static const struct proto_ops j1939_ops = {
+ .family = PF_CAN,
+ .release = j1939_sk_release,
+ .bind = j1939_sk_bind,
+ .connect = j1939_sk_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = j1939_sk_getname,
+ .poll = datagram_poll,
+ .ioctl = j1939_sk_no_ioctlcmd,
+ .listen = sock_no_listen,
+ .shutdown = sock_no_shutdown,
+ .setsockopt = j1939_sk_setsockopt,
+ .getsockopt = j1939_sk_getsockopt,
+ .sendmsg = j1939_sk_sendmsg,
+ .recvmsg = j1939_sk_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = sock_no_sendpage,
+};
+
+static struct proto j1939_proto __read_mostly = {
+ .name = "CAN_J1939",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct j1939_sock),
+ .init = j1939_sk_init,
+};
+
+const struct can_proto j1939_can_proto = {
+ .type = SOCK_DGRAM,
+ .protocol = CAN_J1939,
+ .ops = &j1939_ops,
+ .prot = &j1939_proto,
+};
diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c
new file mode 100644
index 000000000000..9f99af5b0b11
--- /dev/null
+++ b/net/can/j1939/transport.c
@@ -0,0 +1,2063 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2010-2011 EIA Electronics,
+// Kurt Van Dijck <kurt.van.dijck@eia.be>
+// Copyright (c) 2018 Protonic,
+// Robin van der Gracht <robin@protonic.nl>
+// Copyright (c) 2017-2019 Pengutronix,
+// Marc Kleine-Budde <kernel@pengutronix.de>
+// Copyright (c) 2017-2019 Pengutronix,
+// Oleksij Rempel <kernel@pengutronix.de>
+
+#include <linux/can/skb.h>
+
+#include "j1939-priv.h"
+
+#define J1939_XTP_TX_RETRY_LIMIT 100
+
+#define J1939_ETP_PGN_CTL 0xc800
+#define J1939_ETP_PGN_DAT 0xc700
+#define J1939_TP_PGN_CTL 0xec00
+#define J1939_TP_PGN_DAT 0xeb00
+
+#define J1939_TP_CMD_RTS 0x10
+#define J1939_TP_CMD_CTS 0x11
+#define J1939_TP_CMD_EOMA 0x13
+#define J1939_TP_CMD_BAM 0x20
+#define J1939_TP_CMD_ABORT 0xff
+
+#define J1939_ETP_CMD_RTS 0x14
+#define J1939_ETP_CMD_CTS 0x15
+#define J1939_ETP_CMD_DPO 0x16
+#define J1939_ETP_CMD_EOMA 0x17
+#define J1939_ETP_CMD_ABORT 0xff
+
+enum j1939_xtp_abort {
+ J1939_XTP_NO_ABORT = 0,
+ J1939_XTP_ABORT_BUSY = 1,
+ /* Already in one or more connection managed sessions and
+ * cannot support another.
+ *
+ * EALREADY:
+ * Operation already in progress
+ */
+
+ J1939_XTP_ABORT_RESOURCE = 2,
+ /* System resources were needed for another task so this
+ * connection managed session was terminated.
+ *
+ * EMSGSIZE:
+ * The socket type requires that message be sent atomically,
+ * and the size of the message to be sent made this
+ * impossible.
+ */
+
+ J1939_XTP_ABORT_TIMEOUT = 3,
+ /* A timeout occurred and this is the connection abort to
+ * close the session.
+ *
+ * EHOSTUNREACH:
+ * The destination host cannot be reached (probably because
+ * the host is down or a remote router cannot reach it).
+ */
+
+ J1939_XTP_ABORT_GENERIC = 4,
+ /* CTS messages received when data transfer is in progress
+ *
+ * EBADMSG:
+ * Not a data message
+ */
+
+ J1939_XTP_ABORT_FAULT = 5,
+ /* Maximal retransmit request limit reached
+ *
+ * ENOTRECOVERABLE:
+ * State not recoverable
+ */
+
+ J1939_XTP_ABORT_UNEXPECTED_DATA = 6,
+ /* Unexpected data transfer packet
+ *
+ * ENOTCONN:
+ * Transport endpoint is not connected
+ */
+
+ J1939_XTP_ABORT_BAD_SEQ = 7,
+ /* Bad sequence number (and software is not able to recover)
+ *
+ * EILSEQ:
+ * Illegal byte sequence
+ */
+
+ J1939_XTP_ABORT_DUP_SEQ = 8,
+ /* Duplicate sequence number (and software is not able to
+ * recover)
+ */
+
+ J1939_XTP_ABORT_EDPO_UNEXPECTED = 9,
+ /* Unexpected EDPO packet (ETP) or Message size > 1785 bytes
+ * (TP)
+ */
+
+ J1939_XTP_ABORT_BAD_EDPO_PGN = 10,
+ /* Unexpected EDPO PGN (PGN in EDPO is bad) */
+
+ J1939_XTP_ABORT_EDPO_OUTOF_CTS = 11,
+ /* EDPO number of packets is greater than CTS */
+
+ J1939_XTP_ABORT_BAD_EDPO_OFFSET = 12,
+ /* Bad EDPO offset */
+
+ J1939_XTP_ABORT_OTHER_DEPRECATED = 13,
+ /* Deprecated. Use 250 instead (Any other reason) */
+
+ J1939_XTP_ABORT_ECTS_UNXPECTED_PGN = 14,
+ /* Unexpected ECTS PGN (PGN in ECTS is bad) */
+
+ J1939_XTP_ABORT_ECTS_TOO_BIG = 15,
+ /* ECTS requested packets exceeds message size */
+
+ J1939_XTP_ABORT_OTHER = 250,
+ /* Any other reason (if a Connection Abort reason is
+ * identified that is not listed in the table use code 250)
+ */
+};
+
+static unsigned int j1939_tp_block = 255;
+static unsigned int j1939_tp_packet_delay;
+static unsigned int j1939_tp_padding = 1;
+
+/* helpers */
+static const char *j1939_xtp_abort_to_str(enum j1939_xtp_abort abort)
+{
+ switch (abort) {
+ case J1939_XTP_ABORT_BUSY:
+ return "Already in one or more connection managed sessions and cannot support another.";
+ case J1939_XTP_ABORT_RESOURCE:
+ return "System resources were needed for another task so this connection managed session was terminated.";
+ case J1939_XTP_ABORT_TIMEOUT:
+ return "A timeout occurred and this is the connection abort to close the session.";
+ case J1939_XTP_ABORT_GENERIC:
+ return "CTS messages received when data transfer is in progress";
+ case J1939_XTP_ABORT_FAULT:
+ return "Maximal retransmit request limit reached";
+ case J1939_XTP_ABORT_UNEXPECTED_DATA:
+ return "Unexpected data transfer packet";
+ case J1939_XTP_ABORT_BAD_SEQ:
+ return "Bad sequence number (and software is not able to recover)";
+ case J1939_XTP_ABORT_DUP_SEQ:
+ return "Duplicate sequence number (and software is not able to recover)";
+ case J1939_XTP_ABORT_EDPO_UNEXPECTED:
+ return "Unexpected EDPO packet (ETP) or Message size > 1785 bytes (TP)";
+ case J1939_XTP_ABORT_BAD_EDPO_PGN:
+ return "Unexpected EDPO PGN (PGN in EDPO is bad)";
+ case J1939_XTP_ABORT_EDPO_OUTOF_CTS:
+ return "EDPO number of packets is greater than CTS";
+ case J1939_XTP_ABORT_BAD_EDPO_OFFSET:
+ return "Bad EDPO offset";
+ case J1939_XTP_ABORT_OTHER_DEPRECATED:
+ return "Deprecated. Use 250 instead (Any other reason)";
+ case J1939_XTP_ABORT_ECTS_UNXPECTED_PGN:
+ return "Unexpected ECTS PGN (PGN in ECTS is bad)";
+ case J1939_XTP_ABORT_ECTS_TOO_BIG:
+ return "ECTS requested packets exceeds message size";
+ case J1939_XTP_ABORT_OTHER:
+ return "Any other reason (if a Connection Abort reason is identified that is not listed in the table use code 250)";
+ default:
+ return "<unknown>";
+ }
+}
+
+static int j1939_xtp_abort_to_errno(struct j1939_priv *priv,
+ enum j1939_xtp_abort abort)
+{
+ int err;
+
+ switch (abort) {
+ case J1939_XTP_NO_ABORT:
+ WARN_ON_ONCE(abort == J1939_XTP_NO_ABORT);
+ err = 0;
+ break;
+ case J1939_XTP_ABORT_BUSY:
+ err = EALREADY;
+ break;
+ case J1939_XTP_ABORT_RESOURCE:
+ err = EMSGSIZE;
+ break;
+ case J1939_XTP_ABORT_TIMEOUT:
+ err = EHOSTUNREACH;
+ break;
+ case J1939_XTP_ABORT_GENERIC:
+ err = EBADMSG;
+ break;
+ case J1939_XTP_ABORT_FAULT:
+ err = ENOTRECOVERABLE;
+ break;
+ case J1939_XTP_ABORT_UNEXPECTED_DATA:
+ err = ENOTCONN;
+ break;
+ case J1939_XTP_ABORT_BAD_SEQ:
+ err = EILSEQ;
+ break;
+ case J1939_XTP_ABORT_DUP_SEQ:
+ err = EPROTO;
+ break;
+ case J1939_XTP_ABORT_EDPO_UNEXPECTED:
+ err = EPROTO;
+ break;
+ case J1939_XTP_ABORT_BAD_EDPO_PGN:
+ err = EPROTO;
+ break;
+ case J1939_XTP_ABORT_EDPO_OUTOF_CTS:
+ err = EPROTO;
+ break;
+ case J1939_XTP_ABORT_BAD_EDPO_OFFSET:
+ err = EPROTO;
+ break;
+ case J1939_XTP_ABORT_OTHER_DEPRECATED:
+ err = EPROTO;
+ break;
+ case J1939_XTP_ABORT_ECTS_UNXPECTED_PGN:
+ err = EPROTO;
+ break;
+ case J1939_XTP_ABORT_ECTS_TOO_BIG:
+ err = EPROTO;
+ break;
+ case J1939_XTP_ABORT_OTHER:
+ err = EPROTO;
+ break;
+ default:
+ netdev_warn(priv->ndev, "Unknown abort code %i", abort);
+ err = EPROTO;
+ }
+
+ return err;
+}
+
+static inline void j1939_session_list_lock(struct j1939_priv *priv)
+{
+ spin_lock_bh(&priv->active_session_list_lock);
+}
+
+static inline void j1939_session_list_unlock(struct j1939_priv *priv)
+{
+ spin_unlock_bh(&priv->active_session_list_lock);
+}
+
+void j1939_session_get(struct j1939_session *session)
+{
+ kref_get(&session->kref);
+}
+
+/* session completion functions */
+static void __j1939_session_drop(struct j1939_session *session)
+{
+ if (!session->transmission)
+ return;
+
+ j1939_sock_pending_del(session->sk);
+ sock_put(session->sk);
+}
+
+static void j1939_session_destroy(struct j1939_session *session)
+{
+ if (session->err)
+ j1939_sk_errqueue(session, J1939_ERRQUEUE_ABORT);
+ else
+ j1939_sk_errqueue(session, J1939_ERRQUEUE_ACK);
+
+ netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session);
+
+ WARN_ON_ONCE(!list_empty(&session->sk_session_queue_entry));
+ WARN_ON_ONCE(!list_empty(&session->active_session_list_entry));
+
+ skb_queue_purge(&session->skb_queue);
+ __j1939_session_drop(session);
+ j1939_priv_put(session->priv);
+ kfree(session);
+}
+
+static void __j1939_session_release(struct kref *kref)
+{
+ struct j1939_session *session = container_of(kref, struct j1939_session,
+ kref);
+
+ j1939_session_destroy(session);
+}
+
+void j1939_session_put(struct j1939_session *session)
+{
+ kref_put(&session->kref, __j1939_session_release);
+}
+
+static void j1939_session_txtimer_cancel(struct j1939_session *session)
+{
+ if (hrtimer_cancel(&session->txtimer))
+ j1939_session_put(session);
+}
+
+static void j1939_session_rxtimer_cancel(struct j1939_session *session)
+{
+ if (hrtimer_cancel(&session->rxtimer))
+ j1939_session_put(session);
+}
+
+void j1939_session_timers_cancel(struct j1939_session *session)
+{
+ j1939_session_txtimer_cancel(session);
+ j1939_session_rxtimer_cancel(session);
+}
+
+static inline bool j1939_cb_is_broadcast(const struct j1939_sk_buff_cb *skcb)
+{
+ return (!skcb->addr.dst_name && (skcb->addr.da == 0xff));
+}
+
+static void j1939_session_skb_drop_old(struct j1939_session *session)
+{
+ struct sk_buff *do_skb;
+ struct j1939_sk_buff_cb *do_skcb;
+ unsigned int offset_start;
+ unsigned long flags;
+
+ if (skb_queue_len(&session->skb_queue) < 2)
+ return;
+
+ offset_start = session->pkt.tx_acked * 7;
+
+ spin_lock_irqsave(&session->skb_queue.lock, flags);
+ do_skb = skb_peek(&session->skb_queue);
+ do_skcb = j1939_skb_to_cb(do_skb);
+
+ if ((do_skcb->offset + do_skb->len) < offset_start) {
+ __skb_unlink(do_skb, &session->skb_queue);
+ kfree_skb(do_skb);
+ }
+ spin_unlock_irqrestore(&session->skb_queue.lock, flags);
+}
+
+void j1939_session_skb_queue(struct j1939_session *session,
+ struct sk_buff *skb)
+{
+ struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb);
+ struct j1939_priv *priv = session->priv;
+
+ j1939_ac_fixup(priv, skb);
+
+ if (j1939_address_is_unicast(skcb->addr.da) &&
+ priv->ents[skcb->addr.da].nusers)
+ skcb->flags |= J1939_ECU_LOCAL_DST;
+
+ skcb->flags |= J1939_ECU_LOCAL_SRC;
+
+ skb_queue_tail(&session->skb_queue, skb);
+}
+
+static struct sk_buff *j1939_session_skb_find(struct j1939_session *session)
+{
+ struct j1939_priv *priv = session->priv;
+ struct sk_buff *skb = NULL;
+ struct sk_buff *do_skb;
+ struct j1939_sk_buff_cb *do_skcb;
+ unsigned int offset_start;
+ unsigned long flags;
+
+ offset_start = session->pkt.dpo * 7;
+
+ spin_lock_irqsave(&session->skb_queue.lock, flags);
+ skb_queue_walk(&session->skb_queue, do_skb) {
+ do_skcb = j1939_skb_to_cb(do_skb);
+
+ if (offset_start >= do_skcb->offset &&
+ offset_start < (do_skcb->offset + do_skb->len)) {
+ skb = do_skb;
+ }
+ }
+ spin_unlock_irqrestore(&session->skb_queue.lock, flags);
+
+ if (!skb)
+ netdev_dbg(priv->ndev, "%s: 0x%p: no skb found for start: %i, queue size: %i\n",
+ __func__, session, offset_start,
+ skb_queue_len(&session->skb_queue));
+
+ return skb;
+}
+
+/* see if we are receiver
+ * returns 0 for broadcasts, although we will receive them
+ */
+static inline int j1939_tp_im_receiver(const struct j1939_sk_buff_cb *skcb)
+{
+ return skcb->flags & J1939_ECU_LOCAL_DST;
+}
+
+/* see if we are sender */
+static inline int j1939_tp_im_transmitter(const struct j1939_sk_buff_cb *skcb)
+{
+ return skcb->flags & J1939_ECU_LOCAL_SRC;
+}
+
+/* see if we are involved as either receiver or transmitter */
+static int j1939_tp_im_involved(const struct j1939_sk_buff_cb *skcb, bool swap)
+{
+ if (swap)
+ return j1939_tp_im_receiver(skcb);
+ else
+ return j1939_tp_im_transmitter(skcb);
+}
+
+static int j1939_tp_im_involved_anydir(struct j1939_sk_buff_cb *skcb)
+{
+ return skcb->flags & (J1939_ECU_LOCAL_SRC | J1939_ECU_LOCAL_DST);
+}
+
+/* extract pgn from flow-ctl message */
+static inline pgn_t j1939_xtp_ctl_to_pgn(const u8 *dat)
+{
+ pgn_t pgn;
+
+ pgn = (dat[7] << 16) | (dat[6] << 8) | (dat[5] << 0);
+ if (j1939_pgn_is_pdu1(pgn))
+ pgn &= 0xffff00;
+ return pgn;
+}
+
+static inline unsigned int j1939_tp_ctl_to_size(const u8 *dat)
+{
+ return (dat[2] << 8) + (dat[1] << 0);
+}
+
+static inline unsigned int j1939_etp_ctl_to_packet(const u8 *dat)
+{
+ return (dat[4] << 16) | (dat[3] << 8) | (dat[2] << 0);
+}
+
+static inline unsigned int j1939_etp_ctl_to_size(const u8 *dat)
+{
+ return (dat[4] << 24) | (dat[3] << 16) |
+ (dat[2] << 8) | (dat[1] << 0);
+}
+
+/* find existing session:
+ * reverse: swap cb's src & dst
+ * there is no problem with matching broadcasts, since
+ * broadcasts (no dst, no da) would never call this
+ * with reverse == true
+ */
+static bool j1939_session_match(struct j1939_addr *se_addr,
+ struct j1939_addr *sk_addr, bool reverse)
+{
+ if (se_addr->type != sk_addr->type)
+ return false;
+
+ if (reverse) {
+ if (se_addr->src_name) {
+ if (se_addr->src_name != sk_addr->dst_name)
+ return false;
+ } else if (se_addr->sa != sk_addr->da) {
+ return false;
+ }
+
+ if (se_addr->dst_name) {
+ if (se_addr->dst_name != sk_addr->src_name)
+ return false;
+ } else if (se_addr->da != sk_addr->sa) {
+ return false;
+ }
+ } else {
+ if (se_addr->src_name) {
+ if (se_addr->src_name != sk_addr->src_name)
+ return false;
+ } else if (se_addr->sa != sk_addr->sa) {
+ return false;
+ }
+
+ if (se_addr->dst_name) {
+ if (se_addr->dst_name != sk_addr->dst_name)
+ return false;
+ } else if (se_addr->da != sk_addr->da) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static struct
+j1939_session *j1939_session_get_by_addr_locked(struct j1939_priv *priv,
+ struct list_head *root,
+ struct j1939_addr *addr,
+ bool reverse, bool transmitter)
+{
+ struct j1939_session *session;
+
+ lockdep_assert_held(&priv->active_session_list_lock);
+
+ list_for_each_entry(session, root, active_session_list_entry) {
+ j1939_session_get(session);
+ if (j1939_session_match(&session->skcb.addr, addr, reverse) &&
+ session->transmission == transmitter)
+ return session;
+ j1939_session_put(session);
+ }
+
+ return NULL;
+}
+
+static struct
+j1939_session *j1939_session_get_simple(struct j1939_priv *priv,
+ struct sk_buff *skb)
+{
+ struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb);
+ struct j1939_session *session;
+
+ lockdep_assert_held(&priv->active_session_list_lock);
+
+ list_for_each_entry(session, &priv->active_session_list,
+ active_session_list_entry) {
+ j1939_session_get(session);
+ if (session->skcb.addr.type == J1939_SIMPLE &&
+ session->tskey == skcb->tskey && session->sk == skb->sk)
+ return session;
+ j1939_session_put(session);
+ }
+
+ return NULL;
+}
+
+static struct
+j1939_session *j1939_session_get_by_addr(struct j1939_priv *priv,
+ struct j1939_addr *addr,
+ bool reverse, bool transmitter)
+{
+ struct j1939_session *session;
+
+ j1939_session_list_lock(priv);
+ session = j1939_session_get_by_addr_locked(priv,
+ &priv->active_session_list,
+ addr, reverse, transmitter);
+ j1939_session_list_unlock(priv);
+
+ return session;
+}
+
+static void j1939_skbcb_swap(struct j1939_sk_buff_cb *skcb)
+{
+ u8 tmp = 0;
+
+ swap(skcb->addr.dst_name, skcb->addr.src_name);
+ swap(skcb->addr.da, skcb->addr.sa);
+
+ /* swap SRC and DST flags, leave other untouched */
+ if (skcb->flags & J1939_ECU_LOCAL_SRC)
+ tmp |= J1939_ECU_LOCAL_DST;
+ if (skcb->flags & J1939_ECU_LOCAL_DST)
+ tmp |= J1939_ECU_LOCAL_SRC;
+ skcb->flags &= ~(J1939_ECU_LOCAL_SRC | J1939_ECU_LOCAL_DST);
+ skcb->flags |= tmp;
+}
+
+static struct
+sk_buff *j1939_tp_tx_dat_new(struct j1939_priv *priv,
+ const struct j1939_sk_buff_cb *re_skcb,
+ bool ctl,
+ bool swap_src_dst)
+{
+ struct sk_buff *skb;
+ struct j1939_sk_buff_cb *skcb;
+
+ skb = alloc_skb(sizeof(struct can_frame) + sizeof(struct can_skb_priv),
+ GFP_ATOMIC);
+ if (unlikely(!skb))
+ return ERR_PTR(-ENOMEM);
+
+ skb->dev = priv->ndev;
+ can_skb_reserve(skb);
+ can_skb_prv(skb)->ifindex = priv->ndev->ifindex;
+ /* reserve CAN header */
+ skb_reserve(skb, offsetof(struct can_frame, data));
+
+ memcpy(skb->cb, re_skcb, sizeof(skb->cb));
+ skcb = j1939_skb_to_cb(skb);
+ if (swap_src_dst)
+ j1939_skbcb_swap(skcb);
+
+ if (ctl) {
+ if (skcb->addr.type == J1939_ETP)
+ skcb->addr.pgn = J1939_ETP_PGN_CTL;
+ else
+ skcb->addr.pgn = J1939_TP_PGN_CTL;
+ } else {
+ if (skcb->addr.type == J1939_ETP)
+ skcb->addr.pgn = J1939_ETP_PGN_DAT;
+ else
+ skcb->addr.pgn = J1939_TP_PGN_DAT;
+ }
+
+ return skb;
+}
+
+/* TP transmit packet functions */
+static int j1939_tp_tx_dat(struct j1939_session *session,
+ const u8 *dat, int len)
+{
+ struct j1939_priv *priv = session->priv;
+ struct sk_buff *skb;
+
+ skb = j1939_tp_tx_dat_new(priv, &session->skcb,
+ false, false);
+ if (IS_ERR(skb))
+ return PTR_ERR(skb);
+
+ skb_put_data(skb, dat, len);
+ if (j1939_tp_padding && len < 8)
+ memset(skb_put(skb, 8 - len), 0xff, 8 - len);
+
+ return j1939_send_one(priv, skb);
+}
+
+static int j1939_xtp_do_tx_ctl(struct j1939_priv *priv,
+ const struct j1939_sk_buff_cb *re_skcb,
+ bool swap_src_dst, pgn_t pgn, const u8 *dat)
+{
+ struct sk_buff *skb;
+ u8 *skdat;
+
+ if (!j1939_tp_im_involved(re_skcb, swap_src_dst))
+ return 0;
+
+ skb = j1939_tp_tx_dat_new(priv, re_skcb, true, swap_src_dst);
+ if (IS_ERR(skb))
+ return PTR_ERR(skb);
+
+ skdat = skb_put(skb, 8);
+ memcpy(skdat, dat, 5);
+ skdat[5] = (pgn >> 0);
+ skdat[6] = (pgn >> 8);
+ skdat[7] = (pgn >> 16);
+
+ return j1939_send_one(priv, skb);
+}
+
+static inline int j1939_tp_tx_ctl(struct j1939_session *session,
+ bool swap_src_dst, const u8 *dat)
+{
+ struct j1939_priv *priv = session->priv;
+
+ return j1939_xtp_do_tx_ctl(priv, &session->skcb,
+ swap_src_dst,
+ session->skcb.addr.pgn, dat);
+}
+
+static int j1939_xtp_tx_abort(struct j1939_priv *priv,
+ const struct j1939_sk_buff_cb *re_skcb,
+ bool swap_src_dst,
+ enum j1939_xtp_abort err,
+ pgn_t pgn)
+{
+ u8 dat[5];
+
+ if (!j1939_tp_im_involved(re_skcb, swap_src_dst))
+ return 0;
+
+ memset(dat, 0xff, sizeof(dat));
+ dat[0] = J1939_TP_CMD_ABORT;
+ dat[1] = err;
+ return j1939_xtp_do_tx_ctl(priv, re_skcb, swap_src_dst, pgn, dat);
+}
+
+void j1939_tp_schedule_txtimer(struct j1939_session *session, int msec)
+{
+ j1939_session_get(session);
+ hrtimer_start(&session->txtimer, ms_to_ktime(msec),
+ HRTIMER_MODE_REL_SOFT);
+}
+
+static inline void j1939_tp_set_rxtimeout(struct j1939_session *session,
+ int msec)
+{
+ j1939_session_rxtimer_cancel(session);
+ j1939_session_get(session);
+ hrtimer_start(&session->rxtimer, ms_to_ktime(msec),
+ HRTIMER_MODE_REL_SOFT);
+}
+
+static int j1939_session_tx_rts(struct j1939_session *session)
+{
+ u8 dat[8];
+ int ret;
+
+ memset(dat, 0xff, sizeof(dat));
+
+ dat[1] = (session->total_message_size >> 0);
+ dat[2] = (session->total_message_size >> 8);
+ dat[3] = session->pkt.total;
+
+ if (session->skcb.addr.type == J1939_ETP) {
+ dat[0] = J1939_ETP_CMD_RTS;
+ dat[1] = (session->total_message_size >> 0);
+ dat[2] = (session->total_message_size >> 8);
+ dat[3] = (session->total_message_size >> 16);
+ dat[4] = (session->total_message_size >> 24);
+ } else if (j1939_cb_is_broadcast(&session->skcb)) {
+ dat[0] = J1939_TP_CMD_BAM;
+ /* fake cts for broadcast */
+ session->pkt.tx = 0;
+ } else {
+ dat[0] = J1939_TP_CMD_RTS;
+ dat[4] = dat[3];
+ }
+
+ if (dat[0] == session->last_txcmd)
+ /* done already */
+ return 0;
+
+ ret = j1939_tp_tx_ctl(session, false, dat);
+ if (ret < 0)
+ return ret;
+
+ session->last_txcmd = dat[0];
+ if (dat[0] == J1939_TP_CMD_BAM)
+ j1939_tp_schedule_txtimer(session, 50);
+
+ j1939_tp_set_rxtimeout(session, 1250);
+
+ netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session);
+
+ return 0;
+}
+
+static int j1939_session_tx_dpo(struct j1939_session *session)
+{
+ unsigned int pkt;
+ u8 dat[8];
+ int ret;
+
+ memset(dat, 0xff, sizeof(dat));
+
+ dat[0] = J1939_ETP_CMD_DPO;
+ session->pkt.dpo = session->pkt.tx_acked;
+ pkt = session->pkt.dpo;
+ dat[1] = session->pkt.last - session->pkt.tx_acked;
+ dat[2] = (pkt >> 0);
+ dat[3] = (pkt >> 8);
+ dat[4] = (pkt >> 16);
+
+ ret = j1939_tp_tx_ctl(session, false, dat);
+ if (ret < 0)
+ return ret;
+
+ session->last_txcmd = dat[0];
+ j1939_tp_set_rxtimeout(session, 1250);
+ session->pkt.tx = session->pkt.tx_acked;
+
+ netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session);
+
+ return 0;
+}
+
+static int j1939_session_tx_dat(struct j1939_session *session)
+{
+ struct j1939_priv *priv = session->priv;
+ struct j1939_sk_buff_cb *skcb;
+ int offset, pkt_done, pkt_end;
+ unsigned int len, pdelay;
+ struct sk_buff *se_skb;
+ const u8 *tpdat;
+ int ret = 0;
+ u8 dat[8];
+
+ se_skb = j1939_session_skb_find(session);
+ if (!se_skb)
+ return -ENOBUFS;
+
+ skcb = j1939_skb_to_cb(se_skb);
+ tpdat = se_skb->data;
+ ret = 0;
+ pkt_done = 0;
+ if (session->skcb.addr.type != J1939_ETP &&
+ j1939_cb_is_broadcast(&session->skcb))
+ pkt_end = session->pkt.total;
+ else
+ pkt_end = session->pkt.last;
+
+ while (session->pkt.tx < pkt_end) {
+ dat[0] = session->pkt.tx - session->pkt.dpo + 1;
+ offset = (session->pkt.tx * 7) - skcb->offset;
+ len = se_skb->len - offset;
+ if (len > 7)
+ len = 7;
+
+ memcpy(&dat[1], &tpdat[offset], len);
+ ret = j1939_tp_tx_dat(session, dat, len + 1);
+ if (ret < 0) {
+ /* ENOBUS == CAN interface TX queue is full */
+ if (ret != -ENOBUFS)
+ netdev_alert(priv->ndev,
+ "%s: 0x%p: queue data error: %i\n",
+ __func__, session, ret);
+ break;
+ }
+
+ session->last_txcmd = 0xff;
+ pkt_done++;
+ session->pkt.tx++;
+ pdelay = j1939_cb_is_broadcast(&session->skcb) ? 50 :
+ j1939_tp_packet_delay;
+
+ if (session->pkt.tx < session->pkt.total && pdelay) {
+ j1939_tp_schedule_txtimer(session, pdelay);
+ break;
+ }
+ }
+
+ if (pkt_done)
+ j1939_tp_set_rxtimeout(session, 250);
+
+ return ret;
+}
+
+static int j1939_xtp_txnext_transmiter(struct j1939_session *session)
+{
+ struct j1939_priv *priv = session->priv;
+ int ret = 0;
+
+ if (!j1939_tp_im_transmitter(&session->skcb)) {
+ netdev_alert(priv->ndev, "%s: 0x%p: called by not transmitter!\n",
+ __func__, session);
+ return -EINVAL;
+ }
+
+ switch (session->last_cmd) {
+ case 0:
+ ret = j1939_session_tx_rts(session);
+ break;
+
+ case J1939_ETP_CMD_CTS:
+ if (session->last_txcmd != J1939_ETP_CMD_DPO) {
+ ret = j1939_session_tx_dpo(session);
+ if (ret)
+ return ret;
+ }
+
+ /* fall through */
+ case J1939_TP_CMD_CTS:
+ case 0xff: /* did some data */
+ case J1939_ETP_CMD_DPO:
+ case J1939_TP_CMD_BAM:
+ ret = j1939_session_tx_dat(session);
+
+ break;
+ default:
+ netdev_alert(priv->ndev, "%s: 0x%p: unexpected last_cmd: %x\n",
+ __func__, session, session->last_cmd);
+ }
+
+ return ret;
+}
+
+static int j1939_session_tx_cts(struct j1939_session *session)
+{
+ struct j1939_priv *priv = session->priv;
+ unsigned int pkt, len;
+ int ret;
+ u8 dat[8];
+
+ if (!j1939_sk_recv_match(priv, &session->skcb))
+ return -ENOENT;
+
+ len = session->pkt.total - session->pkt.rx;
+ len = min3(len, session->pkt.block, j1939_tp_block ?: 255);
+ memset(dat, 0xff, sizeof(dat));
+
+ if (session->skcb.addr.type == J1939_ETP) {
+ pkt = session->pkt.rx + 1;
+ dat[0] = J1939_ETP_CMD_CTS;
+ dat[1] = len;
+ dat[2] = (pkt >> 0);
+ dat[3] = (pkt >> 8);
+ dat[4] = (pkt >> 16);
+ } else {
+ dat[0] = J1939_TP_CMD_CTS;
+ dat[1] = len;
+ dat[2] = session->pkt.rx + 1;
+ }
+
+ if (dat[0] == session->last_txcmd)
+ /* done already */
+ return 0;
+
+ ret = j1939_tp_tx_ctl(session, true, dat);
+ if (ret < 0)
+ return ret;
+
+ if (len)
+ /* only mark cts done when len is set */
+ session->last_txcmd = dat[0];
+ j1939_tp_set_rxtimeout(session, 1250);
+
+ netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session);
+
+ return 0;
+}
+
+static int j1939_session_tx_eoma(struct j1939_session *session)
+{
+ struct j1939_priv *priv = session->priv;
+ u8 dat[8];
+ int ret;
+
+ if (!j1939_sk_recv_match(priv, &session->skcb))
+ return -ENOENT;
+
+ memset(dat, 0xff, sizeof(dat));
+
+ if (session->skcb.addr.type == J1939_ETP) {
+ dat[0] = J1939_ETP_CMD_EOMA;
+ dat[1] = session->total_message_size >> 0;
+ dat[2] = session->total_message_size >> 8;
+ dat[3] = session->total_message_size >> 16;
+ dat[4] = session->total_message_size >> 24;
+ } else {
+ dat[0] = J1939_TP_CMD_EOMA;
+ dat[1] = session->total_message_size;
+ dat[2] = session->total_message_size >> 8;
+ dat[3] = session->pkt.total;
+ }
+
+ if (dat[0] == session->last_txcmd)
+ /* done already */
+ return 0;
+
+ ret = j1939_tp_tx_ctl(session, true, dat);
+ if (ret < 0)
+ return ret;
+
+ session->last_txcmd = dat[0];
+
+ /* wait for the EOMA packet to come in */
+ j1939_tp_set_rxtimeout(session, 1250);
+
+ netdev_dbg(session->priv->ndev, "%p: 0x%p\n", __func__, session);
+
+ return 0;
+}
+
+static int j1939_xtp_txnext_receiver(struct j1939_session *session)
+{
+ struct j1939_priv *priv = session->priv;
+ int ret = 0;
+
+ if (!j1939_tp_im_receiver(&session->skcb)) {
+ netdev_alert(priv->ndev, "%s: 0x%p: called by not receiver!\n",
+ __func__, session);
+ return -EINVAL;
+ }
+
+ switch (session->last_cmd) {
+ case J1939_TP_CMD_RTS:
+ case J1939_ETP_CMD_RTS:
+ ret = j1939_session_tx_cts(session);
+ break;
+
+ case J1939_ETP_CMD_CTS:
+ case J1939_TP_CMD_CTS:
+ case 0xff: /* did some data */
+ case J1939_ETP_CMD_DPO:
+ if ((session->skcb.addr.type == J1939_TP &&
+ j1939_cb_is_broadcast(&session->skcb)))
+ break;
+
+ if (session->pkt.rx >= session->pkt.total) {
+ ret = j1939_session_tx_eoma(session);
+ } else if (session->pkt.rx >= session->pkt.last) {
+ session->last_txcmd = 0;
+ ret = j1939_session_tx_cts(session);
+ }
+ break;
+ default:
+ netdev_alert(priv->ndev, "%s: 0x%p: unexpected last_cmd: %x\n",
+ __func__, session, session->last_cmd);
+ }
+
+ return ret;
+}
+
+static int j1939_simple_txnext(struct j1939_session *session)
+{
+ struct j1939_priv *priv = session->priv;
+ struct sk_buff *se_skb = j1939_session_skb_find(session);
+ struct sk_buff *skb;
+ int ret;
+
+ if (!se_skb)
+ return 0;
+
+ skb = skb_clone(se_skb, GFP_ATOMIC);
+ if (!skb)
+ return -ENOMEM;
+
+ can_skb_set_owner(skb, se_skb->sk);
+
+ j1939_tp_set_rxtimeout(session, J1939_SIMPLE_ECHO_TIMEOUT_MS);
+
+ ret = j1939_send_one(priv, skb);
+ if (ret)
+ return ret;
+
+ j1939_sk_errqueue(session, J1939_ERRQUEUE_SCHED);
+ j1939_sk_queue_activate_next(session);
+
+ return 0;
+}
+
+static bool j1939_session_deactivate_locked(struct j1939_session *session)
+{
+ bool active = false;
+
+ lockdep_assert_held(&session->priv->active_session_list_lock);
+
+ if (session->state >= J1939_SESSION_ACTIVE &&
+ session->state < J1939_SESSION_ACTIVE_MAX) {
+ active = true;
+
+ list_del_init(&session->active_session_list_entry);
+ session->state = J1939_SESSION_DONE;
+ j1939_session_put(session);
+ }
+
+ return active;
+}
+
+static bool j1939_session_deactivate(struct j1939_session *session)
+{
+ bool active;
+
+ j1939_session_list_lock(session->priv);
+ active = j1939_session_deactivate_locked(session);
+ j1939_session_list_unlock(session->priv);
+
+ return active;
+}
+
+static void
+j1939_session_deactivate_activate_next(struct j1939_session *session)
+{
+ if (j1939_session_deactivate(session))
+ j1939_sk_queue_activate_next(session);
+}
+
+static void __j1939_session_cancel(struct j1939_session *session,
+ enum j1939_xtp_abort err)
+{
+ struct j1939_priv *priv = session->priv;
+
+ WARN_ON_ONCE(!err);
+ lockdep_assert_held(&session->priv->active_session_list_lock);
+
+ session->err = j1939_xtp_abort_to_errno(priv, err);
+ /* do not send aborts on incoming broadcasts */
+ if (!j1939_cb_is_broadcast(&session->skcb)) {
+ session->state = J1939_SESSION_WAITING_ABORT;
+ j1939_xtp_tx_abort(priv, &session->skcb,
+ !session->transmission,
+ err, session->skcb.addr.pgn);
+ }
+
+ if (session->sk)
+ j1939_sk_send_loop_abort(session->sk, session->err);
+}
+
+static void j1939_session_cancel(struct j1939_session *session,
+ enum j1939_xtp_abort err)
+{
+ j1939_session_list_lock(session->priv);
+
+ if (session->state >= J1939_SESSION_ACTIVE &&
+ session->state < J1939_SESSION_WAITING_ABORT) {
+ j1939_tp_set_rxtimeout(session, J1939_XTP_ABORT_TIMEOUT_MS);
+ __j1939_session_cancel(session, err);
+ }
+
+ j1939_session_list_unlock(session->priv);
+}
+
+static enum hrtimer_restart j1939_tp_txtimer(struct hrtimer *hrtimer)
+{
+ struct j1939_session *session =
+ container_of(hrtimer, struct j1939_session, txtimer);
+ struct j1939_priv *priv = session->priv;
+ int ret = 0;
+
+ if (session->skcb.addr.type == J1939_SIMPLE) {
+ ret = j1939_simple_txnext(session);
+ } else {
+ if (session->transmission)
+ ret = j1939_xtp_txnext_transmiter(session);
+ else
+ ret = j1939_xtp_txnext_receiver(session);
+ }
+
+ switch (ret) {
+ case -ENOBUFS:
+ /* Retry limit is currently arbitrary chosen */
+ if (session->tx_retry < J1939_XTP_TX_RETRY_LIMIT) {
+ session->tx_retry++;
+ j1939_tp_schedule_txtimer(session,
+ 10 + prandom_u32_max(16));
+ } else {
+ netdev_alert(priv->ndev, "%s: 0x%p: tx retry count reached\n",
+ __func__, session);
+ session->err = -ENETUNREACH;
+ j1939_session_rxtimer_cancel(session);
+ j1939_session_deactivate_activate_next(session);
+ }
+ break;
+ case -ENETDOWN:
+ /* In this case we should get a netdev_event(), all active
+ * sessions will be cleared by
+ * j1939_cancel_all_active_sessions(). So handle this as an
+ * error, but let j1939_cancel_all_active_sessions() do the
+ * cleanup including propagation of the error to user space.
+ */
+ break;
+ case 0:
+ session->tx_retry = 0;
+ break;
+ default:
+ netdev_alert(priv->ndev, "%s: 0x%p: tx aborted with unknown reason: %i\n",
+ __func__, session, ret);
+ if (session->skcb.addr.type != J1939_SIMPLE) {
+ j1939_session_cancel(session, J1939_XTP_ABORT_OTHER);
+ } else {
+ session->err = ret;
+ j1939_session_rxtimer_cancel(session);
+ j1939_session_deactivate_activate_next(session);
+ }
+ }
+
+ j1939_session_put(session);
+
+ return HRTIMER_NORESTART;
+}
+
+static void j1939_session_completed(struct j1939_session *session)
+{
+ struct sk_buff *skb;
+
+ if (!session->transmission) {
+ skb = j1939_session_skb_find(session);
+ /* distribute among j1939 receivers */
+ j1939_sk_recv(session->priv, skb);
+ }
+
+ j1939_session_deactivate_activate_next(session);
+}
+
+static enum hrtimer_restart j1939_tp_rxtimer(struct hrtimer *hrtimer)
+{
+ struct j1939_session *session = container_of(hrtimer,
+ struct j1939_session,
+ rxtimer);
+ struct j1939_priv *priv = session->priv;
+
+ if (session->state == J1939_SESSION_WAITING_ABORT) {
+ netdev_alert(priv->ndev, "%s: 0x%p: abort rx timeout. Force session deactivation\n",
+ __func__, session);
+
+ j1939_session_deactivate_activate_next(session);
+
+ } else if (session->skcb.addr.type == J1939_SIMPLE) {
+ netdev_alert(priv->ndev, "%s: 0x%p: Timeout. Failed to send simple message.\n",
+ __func__, session);
+
+ /* The message is probably stuck in the CAN controller and can
+ * be send as soon as CAN bus is in working state again.
+ */
+ session->err = -ETIME;
+ j1939_session_deactivate(session);
+ } else {
+ netdev_alert(priv->ndev, "%s: 0x%p: rx timeout, send abort\n",
+ __func__, session);
+
+ j1939_session_list_lock(session->priv);
+ if (session->state >= J1939_SESSION_ACTIVE &&
+ session->state < J1939_SESSION_ACTIVE_MAX) {
+ j1939_session_get(session);
+ hrtimer_start(&session->rxtimer,
+ ms_to_ktime(J1939_XTP_ABORT_TIMEOUT_MS),
+ HRTIMER_MODE_REL_SOFT);
+ __j1939_session_cancel(session, J1939_XTP_ABORT_TIMEOUT);
+ }
+ j1939_session_list_unlock(session->priv);
+ }
+
+ j1939_session_put(session);
+
+ return HRTIMER_NORESTART;
+}
+
+static bool j1939_xtp_rx_cmd_bad_pgn(struct j1939_session *session,
+ const struct sk_buff *skb)
+{
+ const struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb);
+ pgn_t pgn = j1939_xtp_ctl_to_pgn(skb->data);
+ struct j1939_priv *priv = session->priv;
+ enum j1939_xtp_abort abort = J1939_XTP_NO_ABORT;
+ u8 cmd = skb->data[0];
+
+ if (session->skcb.addr.pgn == pgn)
+ return false;
+
+ switch (cmd) {
+ case J1939_TP_CMD_BAM:
+ abort = J1939_XTP_NO_ABORT;
+ break;
+
+ case J1939_ETP_CMD_RTS:
+ case J1939_TP_CMD_RTS: /* fall through */
+ abort = J1939_XTP_ABORT_BUSY;
+ break;
+
+ case J1939_ETP_CMD_CTS:
+ case J1939_TP_CMD_CTS: /* fall through */
+ abort = J1939_XTP_ABORT_ECTS_UNXPECTED_PGN;
+ break;
+
+ case J1939_ETP_CMD_DPO:
+ abort = J1939_XTP_ABORT_BAD_EDPO_PGN;
+ break;
+
+ case J1939_ETP_CMD_EOMA:
+ case J1939_TP_CMD_EOMA: /* fall through */
+ abort = J1939_XTP_ABORT_OTHER;
+ break;
+
+ case J1939_ETP_CMD_ABORT: /* && J1939_TP_CMD_ABORT */
+ abort = J1939_XTP_NO_ABORT;
+ break;
+
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+
+ netdev_warn(priv->ndev, "%s: 0x%p: CMD 0x%02x with PGN 0x%05x for running session with different PGN 0x%05x.\n",
+ __func__, session, cmd, pgn, session->skcb.addr.pgn);
+ if (abort != J1939_XTP_NO_ABORT)
+ j1939_xtp_tx_abort(priv, skcb, true, abort, pgn);
+
+ return true;
+}
+
+static void j1939_xtp_rx_abort_one(struct j1939_priv *priv, struct sk_buff *skb,
+ bool reverse, bool transmitter)
+{
+ struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb);
+ struct j1939_session *session;
+ u8 abort = skb->data[1];
+
+ session = j1939_session_get_by_addr(priv, &skcb->addr, reverse,
+ transmitter);
+ if (!session)
+ return;
+
+ if (j1939_xtp_rx_cmd_bad_pgn(session, skb))
+ goto abort_put;
+
+ netdev_info(priv->ndev, "%s: 0x%p: 0x%05x: (%u) %s\n", __func__,
+ session, j1939_xtp_ctl_to_pgn(skb->data), abort,
+ j1939_xtp_abort_to_str(abort));
+
+ j1939_session_timers_cancel(session);
+ session->err = j1939_xtp_abort_to_errno(priv, abort);
+ if (session->sk)
+ j1939_sk_send_loop_abort(session->sk, session->err);
+ j1939_session_deactivate_activate_next(session);
+
+abort_put:
+ j1939_session_put(session);
+}
+
+/* abort packets may come in 2 directions */
+static void
+j1939_xtp_rx_abort(struct j1939_priv *priv, struct sk_buff *skb,
+ bool transmitter)
+{
+ j1939_xtp_rx_abort_one(priv, skb, false, transmitter);
+ j1939_xtp_rx_abort_one(priv, skb, true, transmitter);
+}
+
+static void
+j1939_xtp_rx_eoma_one(struct j1939_session *session, struct sk_buff *skb)
+{
+ struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb);
+ const u8 *dat;
+ int len;
+
+ if (j1939_xtp_rx_cmd_bad_pgn(session, skb))
+ return;
+
+ dat = skb->data;
+
+ if (skcb->addr.type == J1939_ETP)
+ len = j1939_etp_ctl_to_size(dat);
+ else
+ len = j1939_tp_ctl_to_size(dat);
+
+ if (session->total_message_size != len) {
+ netdev_warn_once(session->priv->ndev,
+ "%s: 0x%p: Incorrect size. Expected: %i; got: %i.\n",
+ __func__, session, session->total_message_size,
+ len);
+ }
+
+ netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session);
+
+ session->pkt.tx_acked = session->pkt.total;
+ j1939_session_timers_cancel(session);
+ /* transmitted without problems */
+ j1939_session_completed(session);
+}
+
+static void
+j1939_xtp_rx_eoma(struct j1939_priv *priv, struct sk_buff *skb,
+ bool transmitter)
+{
+ struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb);
+ struct j1939_session *session;
+
+ session = j1939_session_get_by_addr(priv, &skcb->addr, true,
+ transmitter);
+ if (!session)
+ return;
+
+ j1939_xtp_rx_eoma_one(session, skb);
+ j1939_session_put(session);
+}
+
+static void
+j1939_xtp_rx_cts_one(struct j1939_session *session, struct sk_buff *skb)
+{
+ enum j1939_xtp_abort err = J1939_XTP_ABORT_FAULT;
+ unsigned int pkt;
+ const u8 *dat;
+
+ dat = skb->data;
+
+ if (j1939_xtp_rx_cmd_bad_pgn(session, skb))
+ return;
+
+ netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session);
+
+ if (session->last_cmd == dat[0]) {
+ err = J1939_XTP_ABORT_DUP_SEQ;
+ goto out_session_cancel;
+ }
+
+ if (session->skcb.addr.type == J1939_ETP)
+ pkt = j1939_etp_ctl_to_packet(dat);
+ else
+ pkt = dat[2];
+
+ if (!pkt)
+ goto out_session_cancel;
+ else if (dat[1] > session->pkt.block /* 0xff for etp */)
+ goto out_session_cancel;
+
+ /* set packet counters only when not CTS(0) */
+ session->pkt.tx_acked = pkt - 1;
+ j1939_session_skb_drop_old(session);
+ session->pkt.last = session->pkt.tx_acked + dat[1];
+ if (session->pkt.last > session->pkt.total)
+ /* safety measure */
+ session->pkt.last = session->pkt.total;
+ /* TODO: do not set tx here, do it in txtimer */
+ session->pkt.tx = session->pkt.tx_acked;
+
+ session->last_cmd = dat[0];
+ if (dat[1]) {
+ j1939_tp_set_rxtimeout(session, 1250);
+ if (session->transmission) {
+ if (session->pkt.tx_acked)
+ j1939_sk_errqueue(session,
+ J1939_ERRQUEUE_SCHED);
+ j1939_session_txtimer_cancel(session);
+ j1939_tp_schedule_txtimer(session, 0);
+ }
+ } else {
+ /* CTS(0) */
+ j1939_tp_set_rxtimeout(session, 550);
+ }
+ return;
+
+ out_session_cancel:
+ j1939_session_timers_cancel(session);
+ j1939_session_cancel(session, err);
+}
+
+static void
+j1939_xtp_rx_cts(struct j1939_priv *priv, struct sk_buff *skb, bool transmitter)
+{
+ struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb);
+ struct j1939_session *session;
+
+ session = j1939_session_get_by_addr(priv, &skcb->addr, true,
+ transmitter);
+ if (!session)
+ return;
+ j1939_xtp_rx_cts_one(session, skb);
+ j1939_session_put(session);
+}
+
+static struct j1939_session *j1939_session_new(struct j1939_priv *priv,
+ struct sk_buff *skb, size_t size)
+{
+ struct j1939_session *session;
+ struct j1939_sk_buff_cb *skcb;
+
+ session = kzalloc(sizeof(*session), gfp_any());
+ if (!session)
+ return NULL;
+
+ INIT_LIST_HEAD(&session->active_session_list_entry);
+ INIT_LIST_HEAD(&session->sk_session_queue_entry);
+ kref_init(&session->kref);
+
+ j1939_priv_get(priv);
+ session->priv = priv;
+ session->total_message_size = size;
+ session->state = J1939_SESSION_NEW;
+
+ skb_queue_head_init(&session->skb_queue);
+ skb_queue_tail(&session->skb_queue, skb);
+
+ skcb = j1939_skb_to_cb(skb);
+ memcpy(&session->skcb, skcb, sizeof(session->skcb));
+
+ hrtimer_init(&session->txtimer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL_SOFT);
+ session->txtimer.function = j1939_tp_txtimer;
+ hrtimer_init(&session->rxtimer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL_SOFT);
+ session->rxtimer.function = j1939_tp_rxtimer;
+
+ netdev_dbg(priv->ndev, "%s: 0x%p: sa: %02x, da: %02x\n",
+ __func__, session, skcb->addr.sa, skcb->addr.da);
+
+ return session;
+}
+
+static struct
+j1939_session *j1939_session_fresh_new(struct j1939_priv *priv,
+ int size,
+ const struct j1939_sk_buff_cb *rel_skcb)
+{
+ struct sk_buff *skb;
+ struct j1939_sk_buff_cb *skcb;
+ struct j1939_session *session;
+
+ skb = alloc_skb(size + sizeof(struct can_skb_priv), GFP_ATOMIC);
+ if (unlikely(!skb))
+ return NULL;
+
+ skb->dev = priv->ndev;
+ can_skb_reserve(skb);
+ can_skb_prv(skb)->ifindex = priv->ndev->ifindex;
+ skcb = j1939_skb_to_cb(skb);
+ memcpy(skcb, rel_skcb, sizeof(*skcb));
+
+ session = j1939_session_new(priv, skb, size);
+ if (!session) {
+ kfree_skb(skb);
+ return NULL;
+ }
+
+ /* alloc data area */
+ skb_put(skb, size);
+ /* skb is recounted in j1939_session_new() */
+ return session;
+}
+
+int j1939_session_activate(struct j1939_session *session)
+{
+ struct j1939_priv *priv = session->priv;
+ struct j1939_session *active = NULL;
+ int ret = 0;
+
+ j1939_session_list_lock(priv);
+ if (session->skcb.addr.type != J1939_SIMPLE)
+ active = j1939_session_get_by_addr_locked(priv,
+ &priv->active_session_list,
+ &session->skcb.addr, false,
+ session->transmission);
+ if (active) {
+ j1939_session_put(active);
+ ret = -EAGAIN;
+ } else {
+ WARN_ON_ONCE(session->state != J1939_SESSION_NEW);
+ list_add_tail(&session->active_session_list_entry,
+ &priv->active_session_list);
+ j1939_session_get(session);
+ session->state = J1939_SESSION_ACTIVE;
+
+ netdev_dbg(session->priv->ndev, "%s: 0x%p\n",
+ __func__, session);
+ }
+ j1939_session_list_unlock(priv);
+
+ return ret;
+}
+
+static struct
+j1939_session *j1939_xtp_rx_rts_session_new(struct j1939_priv *priv,
+ struct sk_buff *skb)
+{
+ enum j1939_xtp_abort abort = J1939_XTP_NO_ABORT;
+ struct j1939_sk_buff_cb skcb = *j1939_skb_to_cb(skb);
+ struct j1939_session *session;
+ const u8 *dat;
+ pgn_t pgn;
+ int len;
+
+ netdev_dbg(priv->ndev, "%s\n", __func__);
+
+ dat = skb->data;
+ pgn = j1939_xtp_ctl_to_pgn(dat);
+ skcb.addr.pgn = pgn;
+
+ if (!j1939_sk_recv_match(priv, &skcb))
+ return NULL;
+
+ if (skcb.addr.type == J1939_ETP) {
+ len = j1939_etp_ctl_to_size(dat);
+ if (len > J1939_MAX_ETP_PACKET_SIZE)
+ abort = J1939_XTP_ABORT_FAULT;
+ else if (len > priv->tp_max_packet_size)
+ abort = J1939_XTP_ABORT_RESOURCE;
+ else if (len <= J1939_MAX_TP_PACKET_SIZE)
+ abort = J1939_XTP_ABORT_FAULT;
+ } else {
+ len = j1939_tp_ctl_to_size(dat);
+ if (len > J1939_MAX_TP_PACKET_SIZE)
+ abort = J1939_XTP_ABORT_FAULT;
+ else if (len > priv->tp_max_packet_size)
+ abort = J1939_XTP_ABORT_RESOURCE;
+ }
+
+ if (abort != J1939_XTP_NO_ABORT) {
+ j1939_xtp_tx_abort(priv, &skcb, true, abort, pgn);
+ return NULL;
+ }
+
+ session = j1939_session_fresh_new(priv, len, &skcb);
+ if (!session) {
+ j1939_xtp_tx_abort(priv, &skcb, true,
+ J1939_XTP_ABORT_RESOURCE, pgn);
+ return NULL;
+ }
+
+ /* initialize the control buffer: plain copy */
+ session->pkt.total = (len + 6) / 7;
+ session->pkt.block = 0xff;
+ if (skcb.addr.type != J1939_ETP) {
+ if (dat[3] != session->pkt.total)
+ netdev_alert(priv->ndev, "%s: 0x%p: strange total, %u != %u\n",
+ __func__, session, session->pkt.total,
+ dat[3]);
+ session->pkt.total = dat[3];
+ session->pkt.block = min(dat[3], dat[4]);
+ }
+
+ session->pkt.rx = 0;
+ session->pkt.tx = 0;
+
+ WARN_ON_ONCE(j1939_session_activate(session));
+
+ return session;
+}
+
+static int j1939_xtp_rx_rts_session_active(struct j1939_session *session,
+ struct sk_buff *skb)
+{
+ struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb);
+ struct j1939_priv *priv = session->priv;
+
+ if (!session->transmission) {
+ if (j1939_xtp_rx_cmd_bad_pgn(session, skb))
+ return -EBUSY;
+
+ /* RTS on active session */
+ j1939_session_timers_cancel(session);
+ j1939_session_cancel(session, J1939_XTP_ABORT_BUSY);
+ }
+
+ if (session->last_cmd != 0) {
+ /* we received a second rts on the same connection */
+ netdev_alert(priv->ndev, "%s: 0x%p: connection exists (%02x %02x). last cmd: %x\n",
+ __func__, session, skcb->addr.sa, skcb->addr.da,
+ session->last_cmd);
+
+ j1939_session_timers_cancel(session);
+ j1939_session_cancel(session, J1939_XTP_ABORT_BUSY);
+
+ return -EBUSY;
+ }
+
+ if (session->skcb.addr.sa != skcb->addr.sa ||
+ session->skcb.addr.da != skcb->addr.da)
+ netdev_warn(priv->ndev, "%s: 0x%p: session->skcb.addr.sa=0x%02x skcb->addr.sa=0x%02x session->skcb.addr.da=0x%02x skcb->addr.da=0x%02x\n",
+ __func__, session,
+ session->skcb.addr.sa, skcb->addr.sa,
+ session->skcb.addr.da, skcb->addr.da);
+ /* make sure 'sa' & 'da' are correct !
+ * They may be 'not filled in yet' for sending
+ * skb's, since they did not pass the Address Claim ever.
+ */
+ session->skcb.addr.sa = skcb->addr.sa;
+ session->skcb.addr.da = skcb->addr.da;
+
+ netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session);
+
+ return 0;
+}
+
+static void j1939_xtp_rx_rts(struct j1939_priv *priv, struct sk_buff *skb,
+ bool transmitter)
+{
+ struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb);
+ struct j1939_session *session;
+ u8 cmd = skb->data[0];
+
+ session = j1939_session_get_by_addr(priv, &skcb->addr, false,
+ transmitter);
+
+ if (!session) {
+ if (transmitter) {
+ /* If we're the transmitter and this function is called,
+ * we received our own RTS. A session has already been
+ * created.
+ *
+ * For some reasons however it might have been destroyed
+ * already. So don't create a new one here (using
+ * "j1939_xtp_rx_rts_session_new()") as this will be a
+ * receiver session.
+ *
+ * The reasons the session is already destroyed might
+ * be:
+ * - user space closed socket was and the session was
+ * aborted
+ * - session was aborted due to external abort message
+ */
+ return;
+ }
+ session = j1939_xtp_rx_rts_session_new(priv, skb);
+ if (!session)
+ return;
+ } else {
+ if (j1939_xtp_rx_rts_session_active(session, skb)) {
+ j1939_session_put(session);
+ return;
+ }
+ }
+ session->last_cmd = cmd;
+
+ j1939_tp_set_rxtimeout(session, 1250);
+
+ if (cmd != J1939_TP_CMD_BAM && !session->transmission) {
+ j1939_session_txtimer_cancel(session);
+ j1939_tp_schedule_txtimer(session, 0);
+ }
+
+ j1939_session_put(session);
+}
+
+static void j1939_xtp_rx_dpo_one(struct j1939_session *session,
+ struct sk_buff *skb)
+{
+ const u8 *dat = skb->data;
+
+ if (j1939_xtp_rx_cmd_bad_pgn(session, skb))
+ return;
+
+ netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session);
+
+ /* transmitted without problems */
+ session->pkt.dpo = j1939_etp_ctl_to_packet(skb->data);
+ session->last_cmd = dat[0];
+ j1939_tp_set_rxtimeout(session, 750);
+}
+
+static void j1939_xtp_rx_dpo(struct j1939_priv *priv, struct sk_buff *skb,
+ bool transmitter)
+{
+ struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb);
+ struct j1939_session *session;
+
+ session = j1939_session_get_by_addr(priv, &skcb->addr, false,
+ transmitter);
+ if (!session) {
+ netdev_info(priv->ndev,
+ "%s: no connection found\n", __func__);
+ return;
+ }
+
+ j1939_xtp_rx_dpo_one(session, skb);
+ j1939_session_put(session);
+}
+
+static void j1939_xtp_rx_dat_one(struct j1939_session *session,
+ struct sk_buff *skb)
+{
+ struct j1939_priv *priv = session->priv;
+ struct j1939_sk_buff_cb *skcb;
+ struct sk_buff *se_skb;
+ const u8 *dat;
+ u8 *tpdat;
+ int offset;
+ int nbytes;
+ bool final = false;
+ bool do_cts_eoma = false;
+ int packet;
+
+ skcb = j1939_skb_to_cb(skb);
+ dat = skb->data;
+ if (skb->len <= 1)
+ /* makes no sense */
+ goto out_session_cancel;
+
+ switch (session->last_cmd) {
+ case 0xff:
+ break;
+ case J1939_ETP_CMD_DPO:
+ if (skcb->addr.type == J1939_ETP)
+ break;
+ /* fall through */
+ case J1939_TP_CMD_BAM: /* fall through */
+ case J1939_TP_CMD_CTS: /* fall through */
+ if (skcb->addr.type != J1939_ETP)
+ break;
+ /* fall through */
+ default:
+ netdev_info(priv->ndev, "%s: 0x%p: last %02x\n", __func__,
+ session, session->last_cmd);
+ goto out_session_cancel;
+ }
+
+ packet = (dat[0] - 1 + session->pkt.dpo);
+ if (packet > session->pkt.total ||
+ (session->pkt.rx + 1) > session->pkt.total) {
+ netdev_info(priv->ndev, "%s: 0x%p: should have been completed\n",
+ __func__, session);
+ goto out_session_cancel;
+ }
+ se_skb = j1939_session_skb_find(session);
+ if (!se_skb) {
+ netdev_warn(priv->ndev, "%s: 0x%p: no skb found\n", __func__,
+ session);
+ goto out_session_cancel;
+ }
+
+ skcb = j1939_skb_to_cb(se_skb);
+ offset = packet * 7 - skcb->offset;
+ nbytes = se_skb->len - offset;
+ if (nbytes > 7)
+ nbytes = 7;
+ if (nbytes <= 0 || (nbytes + 1) > skb->len) {
+ netdev_info(priv->ndev, "%s: 0x%p: nbytes %i, len %i\n",
+ __func__, session, nbytes, skb->len);
+ goto out_session_cancel;
+ }
+
+ tpdat = se_skb->data;
+ memcpy(&tpdat[offset], &dat[1], nbytes);
+ if (packet == session->pkt.rx)
+ session->pkt.rx++;
+
+ if (skcb->addr.type != J1939_ETP &&
+ j1939_cb_is_broadcast(&session->skcb)) {
+ if (session->pkt.rx >= session->pkt.total)
+ final = true;
+ } else {
+ /* never final, an EOMA must follow */
+ if (session->pkt.rx >= session->pkt.last)
+ do_cts_eoma = true;
+ }
+
+ if (final) {
+ j1939_session_completed(session);
+ } else if (do_cts_eoma) {
+ j1939_tp_set_rxtimeout(session, 1250);
+ if (!session->transmission)
+ j1939_tp_schedule_txtimer(session, 0);
+ } else {
+ j1939_tp_set_rxtimeout(session, 250);
+ }
+ session->last_cmd = 0xff;
+ j1939_session_put(session);
+
+ return;
+
+ out_session_cancel:
+ j1939_session_timers_cancel(session);
+ j1939_session_cancel(session, J1939_XTP_ABORT_FAULT);
+ j1939_session_put(session);
+}
+
+static void j1939_xtp_rx_dat(struct j1939_priv *priv, struct sk_buff *skb)
+{
+ struct j1939_sk_buff_cb *skcb;
+ struct j1939_session *session;
+
+ skcb = j1939_skb_to_cb(skb);
+
+ if (j1939_tp_im_transmitter(skcb)) {
+ session = j1939_session_get_by_addr(priv, &skcb->addr, false,
+ true);
+ if (!session)
+ netdev_info(priv->ndev, "%s: no tx connection found\n",
+ __func__);
+ else
+ j1939_xtp_rx_dat_one(session, skb);
+ }
+
+ if (j1939_tp_im_receiver(skcb)) {
+ session = j1939_session_get_by_addr(priv, &skcb->addr, false,
+ false);
+ if (!session)
+ netdev_info(priv->ndev, "%s: no rx connection found\n",
+ __func__);
+ else
+ j1939_xtp_rx_dat_one(session, skb);
+ }
+}
+
+/* j1939 main intf */
+struct j1939_session *j1939_tp_send(struct j1939_priv *priv,
+ struct sk_buff *skb, size_t size)
+{
+ struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb);
+ struct j1939_session *session;
+ int ret;
+
+ if (skcb->addr.pgn == J1939_TP_PGN_DAT ||
+ skcb->addr.pgn == J1939_TP_PGN_CTL ||
+ skcb->addr.pgn == J1939_ETP_PGN_DAT ||
+ skcb->addr.pgn == J1939_ETP_PGN_CTL)
+ /* avoid conflict */
+ return ERR_PTR(-EDOM);
+
+ if (size > priv->tp_max_packet_size)
+ return ERR_PTR(-EMSGSIZE);
+
+ if (size <= 8)
+ skcb->addr.type = J1939_SIMPLE;
+ else if (size > J1939_MAX_TP_PACKET_SIZE)
+ skcb->addr.type = J1939_ETP;
+ else
+ skcb->addr.type = J1939_TP;
+
+ if (skcb->addr.type == J1939_ETP &&
+ j1939_cb_is_broadcast(skcb))
+ return ERR_PTR(-EDESTADDRREQ);
+
+ /* fill in addresses from names */
+ ret = j1939_ac_fixup(priv, skb);
+ if (unlikely(ret))
+ return ERR_PTR(ret);
+
+ /* fix DST flags, it may be used there soon */
+ if (j1939_address_is_unicast(skcb->addr.da) &&
+ priv->ents[skcb->addr.da].nusers)
+ skcb->flags |= J1939_ECU_LOCAL_DST;
+
+ /* src is always local, I'm sending ... */
+ skcb->flags |= J1939_ECU_LOCAL_SRC;
+
+ /* prepare new session */
+ session = j1939_session_new(priv, skb, size);
+ if (!session)
+ return ERR_PTR(-ENOMEM);
+
+ /* skb is recounted in j1939_session_new() */
+ sock_hold(skb->sk);
+ session->sk = skb->sk;
+ session->transmission = true;
+ session->pkt.total = (size + 6) / 7;
+ session->pkt.block = skcb->addr.type == J1939_ETP ? 255 :
+ min(j1939_tp_block ?: 255, session->pkt.total);
+
+ if (j1939_cb_is_broadcast(&session->skcb))
+ /* set the end-packet for broadcast */
+ session->pkt.last = session->pkt.total;
+
+ skcb->tskey = session->sk->sk_tskey++;
+ session->tskey = skcb->tskey;
+
+ return session;
+}
+
+static void j1939_tp_cmd_recv(struct j1939_priv *priv, struct sk_buff *skb)
+{
+ struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb);
+ int extd = J1939_TP;
+ u8 cmd = skb->data[0];
+
+ switch (cmd) {
+ case J1939_ETP_CMD_RTS:
+ extd = J1939_ETP;
+ /* fall through */
+ case J1939_TP_CMD_BAM: /* fall through */
+ case J1939_TP_CMD_RTS: /* fall through */
+ if (skcb->addr.type != extd)
+ return;
+
+ if (cmd == J1939_TP_CMD_RTS && j1939_cb_is_broadcast(skcb)) {
+ netdev_alert(priv->ndev, "%s: rts without destination (%02x)\n",
+ __func__, skcb->addr.sa);
+ return;
+ }
+
+ if (j1939_tp_im_transmitter(skcb))
+ j1939_xtp_rx_rts(priv, skb, true);
+
+ if (j1939_tp_im_receiver(skcb))
+ j1939_xtp_rx_rts(priv, skb, false);
+
+ break;
+
+ case J1939_ETP_CMD_CTS:
+ extd = J1939_ETP;
+ /* fall through */
+ case J1939_TP_CMD_CTS:
+ if (skcb->addr.type != extd)
+ return;
+
+ if (j1939_tp_im_transmitter(skcb))
+ j1939_xtp_rx_cts(priv, skb, false);
+
+ if (j1939_tp_im_receiver(skcb))
+ j1939_xtp_rx_cts(priv, skb, true);
+
+ break;
+
+ case J1939_ETP_CMD_DPO:
+ if (skcb->addr.type != J1939_ETP)
+ return;
+
+ if (j1939_tp_im_transmitter(skcb))
+ j1939_xtp_rx_dpo(priv, skb, true);
+
+ if (j1939_tp_im_receiver(skcb))
+ j1939_xtp_rx_dpo(priv, skb, false);
+
+ break;
+
+ case J1939_ETP_CMD_EOMA:
+ extd = J1939_ETP;
+ /* fall through */
+ case J1939_TP_CMD_EOMA:
+ if (skcb->addr.type != extd)
+ return;
+
+ if (j1939_tp_im_transmitter(skcb))
+ j1939_xtp_rx_eoma(priv, skb, false);
+
+ if (j1939_tp_im_receiver(skcb))
+ j1939_xtp_rx_eoma(priv, skb, true);
+
+ break;
+
+ case J1939_ETP_CMD_ABORT: /* && J1939_TP_CMD_ABORT */
+ if (j1939_tp_im_transmitter(skcb))
+ j1939_xtp_rx_abort(priv, skb, true);
+
+ if (j1939_tp_im_receiver(skcb))
+ j1939_xtp_rx_abort(priv, skb, false);
+
+ break;
+ default:
+ return;
+ }
+}
+
+int j1939_tp_recv(struct j1939_priv *priv, struct sk_buff *skb)
+{
+ struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb);
+
+ if (!j1939_tp_im_involved_anydir(skcb))
+ return 0;
+
+ switch (skcb->addr.pgn) {
+ case J1939_ETP_PGN_DAT:
+ skcb->addr.type = J1939_ETP;
+ /* fall through */
+ case J1939_TP_PGN_DAT:
+ j1939_xtp_rx_dat(priv, skb);
+ break;
+
+ case J1939_ETP_PGN_CTL:
+ skcb->addr.type = J1939_ETP;
+ /* fall through */
+ case J1939_TP_PGN_CTL:
+ if (skb->len < 8)
+ return 0; /* Don't care. Nothing to extract here */
+
+ j1939_tp_cmd_recv(priv, skb);
+ break;
+ default:
+ return 0; /* no problem */
+ }
+ return 1; /* "I processed the message" */
+}
+
+void j1939_simple_recv(struct j1939_priv *priv, struct sk_buff *skb)
+{
+ struct j1939_session *session;
+
+ if (!skb->sk)
+ return;
+
+ j1939_session_list_lock(priv);
+ session = j1939_session_get_simple(priv, skb);
+ j1939_session_list_unlock(priv);
+ if (!session) {
+ netdev_warn(priv->ndev,
+ "%s: Received already invalidated message\n",
+ __func__);
+ return;
+ }
+
+ j1939_session_timers_cancel(session);
+ j1939_session_deactivate(session);
+ j1939_session_put(session);
+}
+
+int j1939_cancel_active_session(struct j1939_priv *priv, struct sock *sk)
+{
+ struct j1939_session *session, *saved;
+
+ netdev_dbg(priv->ndev, "%s, sk: %p\n", __func__, sk);
+ j1939_session_list_lock(priv);
+ list_for_each_entry_safe(session, saved,
+ &priv->active_session_list,
+ active_session_list_entry) {
+ if (!sk || sk == session->sk) {
+ if (hrtimer_try_to_cancel(&session->txtimer) == 1)
+ j1939_session_put(session);
+ if (hrtimer_try_to_cancel(&session->rxtimer) == 1)
+ j1939_session_put(session);
+
+ session->err = ESHUTDOWN;
+ j1939_session_deactivate_locked(session);
+ }
+ }
+ j1939_session_list_unlock(priv);
+ return NOTIFY_DONE;
+}
+
+void j1939_tp_init(struct j1939_priv *priv)
+{
+ spin_lock_init(&priv->active_session_list_lock);
+ INIT_LIST_HEAD(&priv->active_session_list);
+ priv->tp_max_packet_size = J1939_MAX_ETP_PACKET_SIZE;
+}
diff --git a/net/can/proc.c b/net/can/proc.c
index 70fea17bb04c..e6881bfc3ed1 100644
--- a/net/can/proc.c
+++ b/net/can/proc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)
/*
* proc.c - procfs support for Protocol family CAN core module
*
@@ -44,6 +45,7 @@
#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/if_arp.h>
+#include <linux/can/can-ml.h>
#include <linux/can/core.h>
#include "af_can.h"
@@ -77,21 +79,21 @@ static const char rx_list_name[][8] = {
static void can_init_stats(struct net *net)
{
- struct s_stats *can_stats = net->can.can_stats;
- struct s_pstats *can_pstats = net->can.can_pstats;
+ struct can_pkg_stats *pkg_stats = net->can.pkg_stats;
+ struct can_rcv_lists_stats *rcv_lists_stats = net->can.rcv_lists_stats;
/*
* This memset function is called from a timer context (when
* can_stattimer is active which is the default) OR in a process
* context (reading the proc_fs when can_stattimer is disabled).
*/
- memset(can_stats, 0, sizeof(struct s_stats));
- can_stats->jiffies_init = jiffies;
+ memset(pkg_stats, 0, sizeof(struct can_pkg_stats));
+ pkg_stats->jiffies_init = jiffies;
- can_pstats->stats_reset++;
+ rcv_lists_stats->stats_reset++;
if (user_reset) {
user_reset = 0;
- can_pstats->user_reset++;
+ rcv_lists_stats->user_reset++;
}
}
@@ -117,8 +119,8 @@ static unsigned long calc_rate(unsigned long oldjif, unsigned long newjif,
void can_stat_update(struct timer_list *t)
{
- struct net *net = from_timer(net, t, can.can_stattimer);
- struct s_stats *can_stats = net->can.can_stats;
+ struct net *net = from_timer(net, t, can.stattimer);
+ struct can_pkg_stats *pkg_stats = net->can.pkg_stats;
unsigned long j = jiffies; /* snapshot */
/* restart counting in timer context on user request */
@@ -126,57 +128,57 @@ void can_stat_update(struct timer_list *t)
can_init_stats(net);
/* restart counting on jiffies overflow */
- if (j < can_stats->jiffies_init)
+ if (j < pkg_stats->jiffies_init)
can_init_stats(net);
/* prevent overflow in calc_rate() */
- if (can_stats->rx_frames > (ULONG_MAX / HZ))
+ if (pkg_stats->rx_frames > (ULONG_MAX / HZ))
can_init_stats(net);
/* prevent overflow in calc_rate() */
- if (can_stats->tx_frames > (ULONG_MAX / HZ))
+ if (pkg_stats->tx_frames > (ULONG_MAX / HZ))
can_init_stats(net);
/* matches overflow - very improbable */
- if (can_stats->matches > (ULONG_MAX / 100))
+ if (pkg_stats->matches > (ULONG_MAX / 100))
can_init_stats(net);
/* calc total values */
- if (can_stats->rx_frames)
- can_stats->total_rx_match_ratio = (can_stats->matches * 100) /
- can_stats->rx_frames;
+ if (pkg_stats->rx_frames)
+ pkg_stats->total_rx_match_ratio = (pkg_stats->matches * 100) /
+ pkg_stats->rx_frames;
- can_stats->total_tx_rate = calc_rate(can_stats->jiffies_init, j,
- can_stats->tx_frames);
- can_stats->total_rx_rate = calc_rate(can_stats->jiffies_init, j,
- can_stats->rx_frames);
+ pkg_stats->total_tx_rate = calc_rate(pkg_stats->jiffies_init, j,
+ pkg_stats->tx_frames);
+ pkg_stats->total_rx_rate = calc_rate(pkg_stats->jiffies_init, j,
+ pkg_stats->rx_frames);
/* calc current values */
- if (can_stats->rx_frames_delta)
- can_stats->current_rx_match_ratio =
- (can_stats->matches_delta * 100) /
- can_stats->rx_frames_delta;
+ if (pkg_stats->rx_frames_delta)
+ pkg_stats->current_rx_match_ratio =
+ (pkg_stats->matches_delta * 100) /
+ pkg_stats->rx_frames_delta;
- can_stats->current_tx_rate = calc_rate(0, HZ, can_stats->tx_frames_delta);
- can_stats->current_rx_rate = calc_rate(0, HZ, can_stats->rx_frames_delta);
+ pkg_stats->current_tx_rate = calc_rate(0, HZ, pkg_stats->tx_frames_delta);
+ pkg_stats->current_rx_rate = calc_rate(0, HZ, pkg_stats->rx_frames_delta);
/* check / update maximum values */
- if (can_stats->max_tx_rate < can_stats->current_tx_rate)
- can_stats->max_tx_rate = can_stats->current_tx_rate;
+ if (pkg_stats->max_tx_rate < pkg_stats->current_tx_rate)
+ pkg_stats->max_tx_rate = pkg_stats->current_tx_rate;
- if (can_stats->max_rx_rate < can_stats->current_rx_rate)
- can_stats->max_rx_rate = can_stats->current_rx_rate;
+ if (pkg_stats->max_rx_rate < pkg_stats->current_rx_rate)
+ pkg_stats->max_rx_rate = pkg_stats->current_rx_rate;
- if (can_stats->max_rx_match_ratio < can_stats->current_rx_match_ratio)
- can_stats->max_rx_match_ratio = can_stats->current_rx_match_ratio;
+ if (pkg_stats->max_rx_match_ratio < pkg_stats->current_rx_match_ratio)
+ pkg_stats->max_rx_match_ratio = pkg_stats->current_rx_match_ratio;
/* clear values for 'current rate' calculation */
- can_stats->tx_frames_delta = 0;
- can_stats->rx_frames_delta = 0;
- can_stats->matches_delta = 0;
+ pkg_stats->tx_frames_delta = 0;
+ pkg_stats->rx_frames_delta = 0;
+ pkg_stats->matches_delta = 0;
/* restart timer (one second) */
- mod_timer(&net->can.can_stattimer, round_jiffies(jiffies + HZ));
+ mod_timer(&net->can.stattimer, round_jiffies(jiffies + HZ));
}
/*
@@ -211,60 +213,60 @@ static void can_print_recv_banner(struct seq_file *m)
static int can_stats_proc_show(struct seq_file *m, void *v)
{
struct net *net = m->private;
- struct s_stats *can_stats = net->can.can_stats;
- struct s_pstats *can_pstats = net->can.can_pstats;
+ struct can_pkg_stats *pkg_stats = net->can.pkg_stats;
+ struct can_rcv_lists_stats *rcv_lists_stats = net->can.rcv_lists_stats;
seq_putc(m, '\n');
- seq_printf(m, " %8ld transmitted frames (TXF)\n", can_stats->tx_frames);
- seq_printf(m, " %8ld received frames (RXF)\n", can_stats->rx_frames);
- seq_printf(m, " %8ld matched frames (RXMF)\n", can_stats->matches);
+ seq_printf(m, " %8ld transmitted frames (TXF)\n", pkg_stats->tx_frames);
+ seq_printf(m, " %8ld received frames (RXF)\n", pkg_stats->rx_frames);
+ seq_printf(m, " %8ld matched frames (RXMF)\n", pkg_stats->matches);
seq_putc(m, '\n');
- if (net->can.can_stattimer.function == can_stat_update) {
+ if (net->can.stattimer.function == can_stat_update) {
seq_printf(m, " %8ld %% total match ratio (RXMR)\n",
- can_stats->total_rx_match_ratio);
+ pkg_stats->total_rx_match_ratio);
seq_printf(m, " %8ld frames/s total tx rate (TXR)\n",
- can_stats->total_tx_rate);
+ pkg_stats->total_tx_rate);
seq_printf(m, " %8ld frames/s total rx rate (RXR)\n",
- can_stats->total_rx_rate);
+ pkg_stats->total_rx_rate);
seq_putc(m, '\n');
seq_printf(m, " %8ld %% current match ratio (CRXMR)\n",
- can_stats->current_rx_match_ratio);
+ pkg_stats->current_rx_match_ratio);
seq_printf(m, " %8ld frames/s current tx rate (CTXR)\n",
- can_stats->current_tx_rate);
+ pkg_stats->current_tx_rate);
seq_printf(m, " %8ld frames/s current rx rate (CRXR)\n",
- can_stats->current_rx_rate);
+ pkg_stats->current_rx_rate);
seq_putc(m, '\n');
seq_printf(m, " %8ld %% max match ratio (MRXMR)\n",
- can_stats->max_rx_match_ratio);
+ pkg_stats->max_rx_match_ratio);
seq_printf(m, " %8ld frames/s max tx rate (MTXR)\n",
- can_stats->max_tx_rate);
+ pkg_stats->max_tx_rate);
seq_printf(m, " %8ld frames/s max rx rate (MRXR)\n",
- can_stats->max_rx_rate);
+ pkg_stats->max_rx_rate);
seq_putc(m, '\n');
}
seq_printf(m, " %8ld current receive list entries (CRCV)\n",
- can_pstats->rcv_entries);
+ rcv_lists_stats->rcv_entries);
seq_printf(m, " %8ld maximum receive list entries (MRCV)\n",
- can_pstats->rcv_entries_max);
+ rcv_lists_stats->rcv_entries_max);
- if (can_pstats->stats_reset)
+ if (rcv_lists_stats->stats_reset)
seq_printf(m, "\n %8ld statistic resets (STR)\n",
- can_pstats->stats_reset);
+ rcv_lists_stats->stats_reset);
- if (can_pstats->user_reset)
+ if (rcv_lists_stats->user_reset)
seq_printf(m, " %8ld user statistic resets (USTR)\n",
- can_pstats->user_reset);
+ rcv_lists_stats->user_reset);
seq_putc(m, '\n');
return 0;
@@ -273,20 +275,20 @@ static int can_stats_proc_show(struct seq_file *m, void *v)
static int can_reset_stats_proc_show(struct seq_file *m, void *v)
{
struct net *net = m->private;
- struct s_pstats *can_pstats = net->can.can_pstats;
- struct s_stats *can_stats = net->can.can_stats;
+ struct can_rcv_lists_stats *rcv_lists_stats = net->can.rcv_lists_stats;
+ struct can_pkg_stats *pkg_stats = net->can.pkg_stats;
user_reset = 1;
- if (net->can.can_stattimer.function == can_stat_update) {
+ if (net->can.stattimer.function == can_stat_update) {
seq_printf(m, "Scheduled statistic reset #%ld.\n",
- can_pstats->stats_reset + 1);
+ rcv_lists_stats->stats_reset + 1);
} else {
- if (can_stats->jiffies_init != jiffies)
+ if (pkg_stats->jiffies_init != jiffies)
can_init_stats(net);
seq_printf(m, "Performed statistic reset #%ld.\n",
- can_pstats->stats_reset);
+ rcv_lists_stats->stats_reset);
}
return 0;
}
@@ -299,11 +301,11 @@ static int can_version_proc_show(struct seq_file *m, void *v)
static inline void can_rcvlist_proc_show_one(struct seq_file *m, int idx,
struct net_device *dev,
- struct can_dev_rcv_lists *d)
+ struct can_dev_rcv_lists *dev_rcv_lists)
{
- if (!hlist_empty(&d->rx[idx])) {
+ if (!hlist_empty(&dev_rcv_lists->rx[idx])) {
can_print_recv_banner(m);
- can_print_rcvlist(m, &d->rx[idx], dev);
+ can_print_rcvlist(m, &dev_rcv_lists->rx[idx], dev);
} else
seq_printf(m, " (%s: no entry)\n", DNAME(dev));
@@ -314,7 +316,7 @@ static int can_rcvlist_proc_show(struct seq_file *m, void *v)
/* double cast to prevent GCC warning */
int idx = (int)(long)PDE_DATA(m->file->f_inode);
struct net_device *dev;
- struct can_dev_rcv_lists *d;
+ struct can_dev_rcv_lists *dev_rcv_lists;
struct net *net = m->private;
seq_printf(m, "\nreceive list '%s':\n", rx_list_name[idx]);
@@ -322,8 +324,8 @@ static int can_rcvlist_proc_show(struct seq_file *m, void *v)
rcu_read_lock();
/* receive list for 'all' CAN devices (dev == NULL) */
- d = net->can.can_rx_alldev_list;
- can_rcvlist_proc_show_one(m, idx, NULL, d);
+ dev_rcv_lists = net->can.rx_alldev_list;
+ can_rcvlist_proc_show_one(m, idx, NULL, dev_rcv_lists);
/* receive list for registered CAN devices */
for_each_netdev_rcu(net, dev) {
@@ -365,7 +367,7 @@ static inline void can_rcvlist_proc_show_array(struct seq_file *m,
static int can_rcvlist_sff_proc_show(struct seq_file *m, void *v)
{
struct net_device *dev;
- struct can_dev_rcv_lists *d;
+ struct can_dev_rcv_lists *dev_rcv_lists;
struct net *net = m->private;
/* RX_SFF */
@@ -374,15 +376,16 @@ static int can_rcvlist_sff_proc_show(struct seq_file *m, void *v)
rcu_read_lock();
/* sff receive list for 'all' CAN devices (dev == NULL) */
- d = net->can.can_rx_alldev_list;
- can_rcvlist_proc_show_array(m, NULL, d->rx_sff, ARRAY_SIZE(d->rx_sff));
+ dev_rcv_lists = net->can.rx_alldev_list;
+ can_rcvlist_proc_show_array(m, NULL, dev_rcv_lists->rx_sff,
+ ARRAY_SIZE(dev_rcv_lists->rx_sff));
/* sff receive list for registered CAN devices */
for_each_netdev_rcu(net, dev) {
if (dev->type == ARPHRD_CAN && dev->ml_priv) {
- d = dev->ml_priv;
- can_rcvlist_proc_show_array(m, dev, d->rx_sff,
- ARRAY_SIZE(d->rx_sff));
+ dev_rcv_lists = dev->ml_priv;
+ can_rcvlist_proc_show_array(m, dev, dev_rcv_lists->rx_sff,
+ ARRAY_SIZE(dev_rcv_lists->rx_sff));
}
}
@@ -395,7 +398,7 @@ static int can_rcvlist_sff_proc_show(struct seq_file *m, void *v)
static int can_rcvlist_eff_proc_show(struct seq_file *m, void *v)
{
struct net_device *dev;
- struct can_dev_rcv_lists *d;
+ struct can_dev_rcv_lists *dev_rcv_lists;
struct net *net = m->private;
/* RX_EFF */
@@ -404,15 +407,16 @@ static int can_rcvlist_eff_proc_show(struct seq_file *m, void *v)
rcu_read_lock();
/* eff receive list for 'all' CAN devices (dev == NULL) */
- d = net->can.can_rx_alldev_list;
- can_rcvlist_proc_show_array(m, NULL, d->rx_eff, ARRAY_SIZE(d->rx_eff));
+ dev_rcv_lists = net->can.rx_alldev_list;
+ can_rcvlist_proc_show_array(m, NULL, dev_rcv_lists->rx_eff,
+ ARRAY_SIZE(dev_rcv_lists->rx_eff));
/* eff receive list for registered CAN devices */
for_each_netdev_rcu(net, dev) {
if (dev->type == ARPHRD_CAN && dev->ml_priv) {
- d = dev->ml_priv;
- can_rcvlist_proc_show_array(m, dev, d->rx_eff,
- ARRAY_SIZE(d->rx_eff));
+ dev_rcv_lists = dev->ml_priv;
+ can_rcvlist_proc_show_array(m, dev, dev_rcv_lists->rx_eff,
+ ARRAY_SIZE(dev_rcv_lists->rx_eff));
}
}
diff --git a/net/can/raw.c b/net/can/raw.c
index afcbff063a67..59c039d73c6d 100644
--- a/net/can/raw.c
+++ b/net/can/raw.c
@@ -1,5 +1,5 @@
-/*
- * raw.c - Raw sockets for protocol family CAN
+// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)
+/* raw.c - Raw sockets for protocol family CAN
*
* Copyright (c) 2002-2007 Volkswagen Group Electronic Research
* All rights reserved.
@@ -64,8 +64,7 @@ MODULE_ALIAS("can-proto-1");
#define MASK_ALL 0
-/*
- * A raw socket has a list of can_filters attached to it, each receiving
+/* A raw socket has a list of can_filters attached to it, each receiving
* the CAN frames matching that filter. If the filter list is empty,
* no CAN frames will be received by the socket. The default after
* opening the socket, is to have one filter which receives all frames.
@@ -96,8 +95,7 @@ struct raw_sock {
struct uniqframe __percpu *uniq;
};
-/*
- * Return pointer to store the extra msg flags for raw_recvmsg().
+/* Return pointer to store the extra msg flags for raw_recvmsg().
* We use the space of one unsigned int beyond the 'struct sockaddr_can'
* in skb->cb.
*/
@@ -156,8 +154,7 @@ static void raw_rcv(struct sk_buff *oskb, void *data)
if (!skb)
return;
- /*
- * Put the datagram to the queue so that raw_recvmsg() can
+ /* Put the datagram to the queue so that raw_recvmsg() can
* get it from there. We need to pass the interface index to
* raw_recvmsg(). We pass a whole struct sockaddr_can in skb->cb
* containing the interface index.
@@ -283,7 +280,6 @@ static int raw_notifier(struct notifier_block *nb,
return NOTIFY_DONE;
switch (msg) {
-
case NETDEV_UNREGISTER:
lock_sock(sk);
/* remove current filters & unregister */
@@ -369,8 +365,9 @@ static int raw_release(struct socket *sock)
raw_disable_allfilters(dev_net(dev), dev, sk);
dev_put(dev);
}
- } else
+ } else {
raw_disable_allfilters(sock_net(sk), NULL, sk);
+ }
}
if (ro->count > 1)
@@ -399,7 +396,7 @@ static int raw_bind(struct socket *sock, struct sockaddr *uaddr, int len)
int err = 0;
int notify_enetdown = 0;
- if (len < sizeof(*addr))
+ if (len < CAN_REQUIRED_SIZE(*addr, can_ifindex))
return -EINVAL;
if (addr->can_family != AF_CAN)
return -EINVAL;
@@ -450,8 +447,9 @@ static int raw_bind(struct socket *sock, struct sockaddr *uaddr, int len)
dev, sk);
dev_put(dev);
}
- } else
+ } else {
raw_disable_allfilters(sock_net(sk), NULL, sk);
+ }
}
ro->ifindex = ifindex;
ro->bound = 1;
@@ -502,7 +500,6 @@ static int raw_setsockopt(struct socket *sock, int level, int optname,
return -EINVAL;
switch (optname) {
-
case CAN_RAW_FILTER:
if (optlen % sizeof(struct can_filter) != 0)
return -EINVAL;
@@ -665,17 +662,18 @@ static int raw_getsockopt(struct socket *sock, int level, int optname,
return -EINVAL;
switch (optname) {
-
case CAN_RAW_FILTER:
lock_sock(sk);
if (ro->count > 0) {
int fsize = ro->count * sizeof(struct can_filter);
+
if (len > fsize)
len = fsize;
if (copy_to_user(optval, ro->filter, len))
err = -EFAULT;
- } else
+ } else {
len = 0;
+ }
release_sock(sk);
if (!err)
@@ -735,15 +733,16 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
if (msg->msg_name) {
DECLARE_SOCKADDR(struct sockaddr_can *, addr, msg->msg_name);
- if (msg->msg_namelen < sizeof(*addr))
+ if (msg->msg_namelen < CAN_REQUIRED_SIZE(*addr, can_ifindex))
return -EINVAL;
if (addr->can_family != AF_CAN)
return -EINVAL;
ifindex = addr->can_ifindex;
- } else
+ } else {
ifindex = ro->ifindex;
+ }
dev = dev_get_by_index(sock_net(sk), ifindex);
if (!dev)
@@ -836,6 +835,13 @@ static int raw_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
return size;
}
+static int raw_sock_no_ioctlcmd(struct socket *sock, unsigned int cmd,
+ unsigned long arg)
+{
+ /* no ioctls for socket layer -> hand it down to NIC layer */
+ return -ENOIOCTLCMD;
+}
+
static const struct proto_ops raw_ops = {
.family = PF_CAN,
.release = raw_release,
@@ -845,7 +851,7 @@ static const struct proto_ops raw_ops = {
.accept = sock_no_accept,
.getname = raw_getname,
.poll = datagram_poll,
- .ioctl = can_ioctl, /* use can_ioctl() from af_can.c */
+ .ioctl = raw_sock_no_ioctlcmd,
.gettstamp = sock_gettstamp,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
@@ -879,7 +885,7 @@ static __init int raw_module_init(void)
err = can_proto_register(&raw_can_proto);
if (err < 0)
- printk(KERN_ERR "can: registration of raw protocol failed\n");
+ pr_err("can: registration of raw protocol failed\n");
return err;
}
diff --git a/net/ceph/Makefile b/net/ceph/Makefile
index 59d0ba2072de..ce09bb4fb249 100644
--- a/net/ceph/Makefile
+++ b/net/ceph/Makefile
@@ -13,5 +13,5 @@ libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \
auth.o auth_none.o \
crypto.o armor.o \
auth_x.o \
- ceph_fs.o ceph_strings.o ceph_hash.o \
+ ceph_strings.o ceph_hash.o \
pagevec.o snapshot.o string_table.o
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 4eeea4d5c3ef..a0e97f6c1072 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -11,8 +11,9 @@
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/nsproxy.h>
-#include <linux/parser.h>
+#include <linux/fs_parser.h>
#include <linux/sched.h>
+#include <linux/sched/mm.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/statfs.h>
@@ -185,18 +186,34 @@ int ceph_compare_options(struct ceph_options *new_opt,
}
EXPORT_SYMBOL(ceph_compare_options);
+/*
+ * kvmalloc() doesn't fall back to the vmalloc allocator unless flags are
+ * compatible with (a superset of) GFP_KERNEL. This is because while the
+ * actual pages are allocated with the specified flags, the page table pages
+ * are always allocated with GFP_KERNEL. map_vm_area() doesn't even take
+ * flags because GFP_KERNEL is hard-coded in {p4d,pud,pmd,pte}_alloc().
+ *
+ * ceph_kvmalloc() may be called with GFP_KERNEL, GFP_NOFS or GFP_NOIO.
+ */
void *ceph_kvmalloc(size_t size, gfp_t flags)
{
- if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
- void *ptr = kmalloc(size, flags | __GFP_NOWARN);
- if (ptr)
- return ptr;
+ void *p;
+
+ if ((flags & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) {
+ p = kvmalloc(size, flags);
+ } else if ((flags & (__GFP_IO | __GFP_FS)) == __GFP_IO) {
+ unsigned int nofs_flag = memalloc_nofs_save();
+ p = kvmalloc(size, GFP_KERNEL);
+ memalloc_nofs_restore(nofs_flag);
+ } else {
+ unsigned int noio_flag = memalloc_noio_save();
+ p = kvmalloc(size, GFP_KERNEL);
+ memalloc_noio_restore(noio_flag);
}
- return __vmalloc(size, flags, PAGE_KERNEL);
+ return p;
}
-
static int parse_fsid(const char *str, struct ceph_fsid *fsid)
{
int i = 0;
@@ -237,58 +254,72 @@ enum {
Opt_mount_timeout,
Opt_osd_idle_ttl,
Opt_osd_request_timeout,
- Opt_last_int,
/* int args above */
Opt_fsid,
Opt_name,
Opt_secret,
Opt_key,
Opt_ip,
- Opt_last_string,
/* string args above */
Opt_share,
- Opt_noshare,
Opt_crc,
- Opt_nocrc,
Opt_cephx_require_signatures,
- Opt_nocephx_require_signatures,
Opt_cephx_sign_messages,
- Opt_nocephx_sign_messages,
Opt_tcp_nodelay,
- Opt_notcp_nodelay,
Opt_abort_on_full,
};
-static match_table_t opt_tokens = {
- {Opt_osdtimeout, "osdtimeout=%d"},
- {Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
- {Opt_mount_timeout, "mount_timeout=%d"},
- {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
- {Opt_osd_request_timeout, "osd_request_timeout=%d"},
- /* int args above */
- {Opt_fsid, "fsid=%s"},
- {Opt_name, "name=%s"},
- {Opt_secret, "secret=%s"},
- {Opt_key, "key=%s"},
- {Opt_ip, "ip=%s"},
- /* string args above */
- {Opt_share, "share"},
- {Opt_noshare, "noshare"},
- {Opt_crc, "crc"},
- {Opt_nocrc, "nocrc"},
- {Opt_cephx_require_signatures, "cephx_require_signatures"},
- {Opt_nocephx_require_signatures, "nocephx_require_signatures"},
- {Opt_cephx_sign_messages, "cephx_sign_messages"},
- {Opt_nocephx_sign_messages, "nocephx_sign_messages"},
- {Opt_tcp_nodelay, "tcp_nodelay"},
- {Opt_notcp_nodelay, "notcp_nodelay"},
- {Opt_abort_on_full, "abort_on_full"},
- {-1, NULL}
+static const struct fs_parameter_spec ceph_parameters[] = {
+ fsparam_flag ("abort_on_full", Opt_abort_on_full),
+ fsparam_flag_no ("cephx_require_signatures", Opt_cephx_require_signatures),
+ fsparam_flag_no ("cephx_sign_messages", Opt_cephx_sign_messages),
+ fsparam_flag_no ("crc", Opt_crc),
+ fsparam_string ("fsid", Opt_fsid),
+ fsparam_string ("ip", Opt_ip),
+ fsparam_string ("key", Opt_key),
+ fsparam_u32 ("mount_timeout", Opt_mount_timeout),
+ fsparam_string ("name", Opt_name),
+ fsparam_u32 ("osd_idle_ttl", Opt_osd_idle_ttl),
+ fsparam_u32 ("osd_request_timeout", Opt_osd_request_timeout),
+ fsparam_u32 ("osdkeepalive", Opt_osdkeepalivetimeout),
+ __fsparam (fs_param_is_s32, "osdtimeout", Opt_osdtimeout,
+ fs_param_deprecated, NULL),
+ fsparam_string ("secret", Opt_secret),
+ fsparam_flag_no ("share", Opt_share),
+ fsparam_flag_no ("tcp_nodelay", Opt_tcp_nodelay),
+ {}
};
+struct ceph_options *ceph_alloc_options(void)
+{
+ struct ceph_options *opt;
+
+ opt = kzalloc(sizeof(*opt), GFP_KERNEL);
+ if (!opt)
+ return NULL;
+
+ opt->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*opt->mon_addr),
+ GFP_KERNEL);
+ if (!opt->mon_addr) {
+ kfree(opt);
+ return NULL;
+ }
+
+ opt->flags = CEPH_OPT_DEFAULT;
+ opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
+ opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT;
+ opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;
+ opt->osd_request_timeout = CEPH_OSD_REQUEST_TIMEOUT_DEFAULT;
+ return opt;
+}
+EXPORT_SYMBOL(ceph_alloc_options);
+
void ceph_destroy_options(struct ceph_options *opt)
{
dout("destroy_options %p\n", opt);
+ if (!opt)
+ return;
+
kfree(opt->name);
if (opt->key) {
ceph_crypto_key_destroy(opt->key);
@@ -300,7 +331,9 @@ void ceph_destroy_options(struct ceph_options *opt)
EXPORT_SYMBOL(ceph_destroy_options);
/* get secret from key store */
-static int get_secret(struct ceph_crypto_key *dst, const char *name) {
+static int get_secret(struct ceph_crypto_key *dst, const char *name,
+ struct p_log *log)
+{
struct key *ukey;
int key_err;
int err = 0;
@@ -313,20 +346,20 @@ static int get_secret(struct ceph_crypto_key *dst, const char *name) {
key_err = PTR_ERR(ukey);
switch (key_err) {
case -ENOKEY:
- pr_warn("ceph: Mount failed due to key not found: %s\n",
- name);
+ error_plog(log, "Failed due to key not found: %s",
+ name);
break;
case -EKEYEXPIRED:
- pr_warn("ceph: Mount failed due to expired key: %s\n",
- name);
+ error_plog(log, "Failed due to expired key: %s",
+ name);
break;
case -EKEYREVOKED:
- pr_warn("ceph: Mount failed due to revoked key: %s\n",
- name);
+ error_plog(log, "Failed due to revoked key: %s",
+ name);
break;
default:
- pr_warn("ceph: Mount failed due to unknown key error %d: %s\n",
- key_err, name);
+ error_plog(log, "Failed due to key error %d: %s",
+ key_err, name);
}
err = -EPERM;
goto out;
@@ -344,217 +377,159 @@ out:
return err;
}
-struct ceph_options *
-ceph_parse_options(char *options, const char *dev_name,
- const char *dev_name_end,
- int (*parse_extra_token)(char *c, void *private),
- void *private)
+int ceph_parse_mon_ips(const char *buf, size_t len, struct ceph_options *opt,
+ struct fc_log *l)
{
- struct ceph_options *opt;
- const char *c;
- int err = -ENOMEM;
- substring_t argstr[MAX_OPT_ARGS];
-
- opt = kzalloc(sizeof(*opt), GFP_KERNEL);
- if (!opt)
- return ERR_PTR(-ENOMEM);
- opt->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*opt->mon_addr),
- GFP_KERNEL);
- if (!opt->mon_addr)
- goto out;
-
- dout("parse_options %p options '%s' dev_name '%s'\n", opt, options,
- dev_name);
-
- /* start with defaults */
- opt->flags = CEPH_OPT_DEFAULT;
- opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
- opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT;
- opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;
- opt->osd_request_timeout = CEPH_OSD_REQUEST_TIMEOUT_DEFAULT;
+ struct p_log log = {.prefix = "libceph", .log = l};
+ int ret;
- /* get mon ip(s) */
/* ip1[:port1][,ip2[:port2]...] */
- err = ceph_parse_ips(dev_name, dev_name_end, opt->mon_addr,
- CEPH_MAX_MON, &opt->num_mon);
- if (err < 0)
- goto out;
+ ret = ceph_parse_ips(buf, buf + len, opt->mon_addr, CEPH_MAX_MON,
+ &opt->num_mon);
+ if (ret) {
+ error_plog(&log, "Failed to parse monitor IPs: %d", ret);
+ return ret;
+ }
- /* parse mount options */
- while ((c = strsep(&options, ",")) != NULL) {
- int token, intval;
- if (!*c)
- continue;
- err = -EINVAL;
- token = match_token((char *)c, opt_tokens, argstr);
- if (token < 0 && parse_extra_token) {
- /* extra? */
- err = parse_extra_token((char *)c, private);
- if (err < 0) {
- pr_err("bad option at '%s'\n", c);
- goto out;
- }
- continue;
- }
- if (token < Opt_last_int) {
- err = match_int(&argstr[0], &intval);
- if (err < 0) {
- pr_err("bad option arg (not int) at '%s'\n", c);
- goto out;
- }
- dout("got int token %d val %d\n", token, intval);
- } else if (token > Opt_last_int && token < Opt_last_string) {
- dout("got string token %d val %s\n", token,
- argstr[0].from);
- } else {
- dout("got token %d\n", token);
+ return 0;
+}
+EXPORT_SYMBOL(ceph_parse_mon_ips);
+
+int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt,
+ struct fc_log *l)
+{
+ struct fs_parse_result result;
+ int token, err;
+ struct p_log log = {.prefix = "libceph", .log = l};
+
+ token = __fs_parse(&log, ceph_parameters, param, &result);
+ dout("%s fs_parse '%s' token %d\n", __func__, param->key, token);
+ if (token < 0)
+ return token;
+
+ switch (token) {
+ case Opt_ip:
+ err = ceph_parse_ips(param->string,
+ param->string + param->size,
+ &opt->my_addr,
+ 1, NULL);
+ if (err) {
+ error_plog(&log, "Failed to parse ip: %d", err);
+ return err;
}
- switch (token) {
- case Opt_ip:
- err = ceph_parse_ips(argstr[0].from,
- argstr[0].to,
- &opt->my_addr,
- 1, NULL);
- if (err < 0)
- goto out;
- opt->flags |= CEPH_OPT_MYIP;
- break;
+ opt->flags |= CEPH_OPT_MYIP;
+ break;
- case Opt_fsid:
- err = parse_fsid(argstr[0].from, &opt->fsid);
- if (err == 0)
- opt->flags |= CEPH_OPT_FSID;
- break;
- case Opt_name:
- kfree(opt->name);
- opt->name = kstrndup(argstr[0].from,
- argstr[0].to-argstr[0].from,
- GFP_KERNEL);
- if (!opt->name) {
- err = -ENOMEM;
- goto out;
- }
- break;
- case Opt_secret:
- ceph_crypto_key_destroy(opt->key);
- kfree(opt->key);
-
- opt->key = kzalloc(sizeof(*opt->key), GFP_KERNEL);
- if (!opt->key) {
- err = -ENOMEM;
- goto out;
- }
- err = ceph_crypto_key_unarmor(opt->key, argstr[0].from);
- if (err < 0)
- goto out;
- break;
- case Opt_key:
- ceph_crypto_key_destroy(opt->key);
- kfree(opt->key);
-
- opt->key = kzalloc(sizeof(*opt->key), GFP_KERNEL);
- if (!opt->key) {
- err = -ENOMEM;
- goto out;
- }
- err = get_secret(opt->key, argstr[0].from);
- if (err < 0)
- goto out;
- break;
+ case Opt_fsid:
+ err = parse_fsid(param->string, &opt->fsid);
+ if (err) {
+ error_plog(&log, "Failed to parse fsid: %d", err);
+ return err;
+ }
+ opt->flags |= CEPH_OPT_FSID;
+ break;
+ case Opt_name:
+ kfree(opt->name);
+ opt->name = param->string;
+ param->string = NULL;
+ break;
+ case Opt_secret:
+ ceph_crypto_key_destroy(opt->key);
+ kfree(opt->key);
- /* misc */
- case Opt_osdtimeout:
- pr_warn("ignoring deprecated osdtimeout option\n");
- break;
- case Opt_osdkeepalivetimeout:
- /* 0 isn't well defined right now, reject it */
- if (intval < 1 || intval > INT_MAX / 1000) {
- pr_err("osdkeepalive out of range\n");
- err = -EINVAL;
- goto out;
- }
- opt->osd_keepalive_timeout =
- msecs_to_jiffies(intval * 1000);
- break;
- case Opt_osd_idle_ttl:
- /* 0 isn't well defined right now, reject it */
- if (intval < 1 || intval > INT_MAX / 1000) {
- pr_err("osd_idle_ttl out of range\n");
- err = -EINVAL;
- goto out;
- }
- opt->osd_idle_ttl = msecs_to_jiffies(intval * 1000);
- break;
- case Opt_mount_timeout:
- /* 0 is "wait forever" (i.e. infinite timeout) */
- if (intval < 0 || intval > INT_MAX / 1000) {
- pr_err("mount_timeout out of range\n");
- err = -EINVAL;
- goto out;
- }
- opt->mount_timeout = msecs_to_jiffies(intval * 1000);
- break;
- case Opt_osd_request_timeout:
- /* 0 is "wait forever" (i.e. infinite timeout) */
- if (intval < 0 || intval > INT_MAX / 1000) {
- pr_err("osd_request_timeout out of range\n");
- err = -EINVAL;
- goto out;
- }
- opt->osd_request_timeout = msecs_to_jiffies(intval * 1000);
- break;
+ opt->key = kzalloc(sizeof(*opt->key), GFP_KERNEL);
+ if (!opt->key)
+ return -ENOMEM;
+ err = ceph_crypto_key_unarmor(opt->key, param->string);
+ if (err) {
+ error_plog(&log, "Failed to parse secret: %d", err);
+ return err;
+ }
+ break;
+ case Opt_key:
+ ceph_crypto_key_destroy(opt->key);
+ kfree(opt->key);
- case Opt_share:
+ opt->key = kzalloc(sizeof(*opt->key), GFP_KERNEL);
+ if (!opt->key)
+ return -ENOMEM;
+ return get_secret(opt->key, param->string, &log);
+
+ case Opt_osdtimeout:
+ warn_plog(&log, "Ignoring osdtimeout");
+ break;
+ case Opt_osdkeepalivetimeout:
+ /* 0 isn't well defined right now, reject it */
+ if (result.uint_32 < 1 || result.uint_32 > INT_MAX / 1000)
+ goto out_of_range;
+ opt->osd_keepalive_timeout =
+ msecs_to_jiffies(result.uint_32 * 1000);
+ break;
+ case Opt_osd_idle_ttl:
+ /* 0 isn't well defined right now, reject it */
+ if (result.uint_32 < 1 || result.uint_32 > INT_MAX / 1000)
+ goto out_of_range;
+ opt->osd_idle_ttl = msecs_to_jiffies(result.uint_32 * 1000);
+ break;
+ case Opt_mount_timeout:
+ /* 0 is "wait forever" (i.e. infinite timeout) */
+ if (result.uint_32 > INT_MAX / 1000)
+ goto out_of_range;
+ opt->mount_timeout = msecs_to_jiffies(result.uint_32 * 1000);
+ break;
+ case Opt_osd_request_timeout:
+ /* 0 is "wait forever" (i.e. infinite timeout) */
+ if (result.uint_32 > INT_MAX / 1000)
+ goto out_of_range;
+ opt->osd_request_timeout =
+ msecs_to_jiffies(result.uint_32 * 1000);
+ break;
+
+ case Opt_share:
+ if (!result.negated)
opt->flags &= ~CEPH_OPT_NOSHARE;
- break;
- case Opt_noshare:
+ else
opt->flags |= CEPH_OPT_NOSHARE;
- break;
-
- case Opt_crc:
+ break;
+ case Opt_crc:
+ if (!result.negated)
opt->flags &= ~CEPH_OPT_NOCRC;
- break;
- case Opt_nocrc:
+ else
opt->flags |= CEPH_OPT_NOCRC;
- break;
-
- case Opt_cephx_require_signatures:
+ break;
+ case Opt_cephx_require_signatures:
+ if (!result.negated)
opt->flags &= ~CEPH_OPT_NOMSGAUTH;
- break;
- case Opt_nocephx_require_signatures:
+ else
opt->flags |= CEPH_OPT_NOMSGAUTH;
- break;
- case Opt_cephx_sign_messages:
+ break;
+ case Opt_cephx_sign_messages:
+ if (!result.negated)
opt->flags &= ~CEPH_OPT_NOMSGSIGN;
- break;
- case Opt_nocephx_sign_messages:
+ else
opt->flags |= CEPH_OPT_NOMSGSIGN;
- break;
-
- case Opt_tcp_nodelay:
+ break;
+ case Opt_tcp_nodelay:
+ if (!result.negated)
opt->flags |= CEPH_OPT_TCP_NODELAY;
- break;
- case Opt_notcp_nodelay:
+ else
opt->flags &= ~CEPH_OPT_TCP_NODELAY;
- break;
+ break;
- case Opt_abort_on_full:
- opt->flags |= CEPH_OPT_ABORT_ON_FULL;
- break;
+ case Opt_abort_on_full:
+ opt->flags |= CEPH_OPT_ABORT_ON_FULL;
+ break;
- default:
- BUG_ON(token);
- }
+ default:
+ BUG();
}
- /* success */
- return opt;
+ return 0;
-out:
- ceph_destroy_options(opt);
- return ERR_PTR(err);
+out_of_range:
+ return inval_plog(&log, "%s out of range", param->key);
}
-EXPORT_SYMBOL(ceph_parse_options);
+EXPORT_SYMBOL(ceph_parse_param);
int ceph_print_client_options(struct seq_file *m, struct ceph_client *client,
bool show_all)
@@ -694,6 +669,14 @@ void ceph_destroy_client(struct ceph_client *client)
}
EXPORT_SYMBOL(ceph_destroy_client);
+void ceph_reset_client_addr(struct ceph_client *client)
+{
+ ceph_messenger_reset_nonce(&client->msgr);
+ ceph_monc_reopen_session(&client->monc);
+ ceph_osdc_reopen_osds(&client->osdc);
+}
+EXPORT_SYMBOL(ceph_reset_client_addr);
+
/*
* true if we have the mon map (and have thus joined the cluster)
*/
diff --git a/net/ceph/ceph_fs.c b/net/ceph/ceph_fs.c
deleted file mode 100644
index 756a2dc10d27..000000000000
--- a/net/ceph/ceph_fs.c
+++ /dev/null
@@ -1,104 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Some non-inline ceph helpers
- */
-#include <linux/module.h>
-#include <linux/ceph/types.h>
-
-/*
- * return true if @layout appears to be valid
- */
-int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
-{
- __u32 su = layout->stripe_unit;
- __u32 sc = layout->stripe_count;
- __u32 os = layout->object_size;
-
- /* stripe unit, object size must be non-zero, 64k increment */
- if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1)))
- return 0;
- if (!os || (os & (CEPH_MIN_STRIPE_UNIT-1)))
- return 0;
- /* object size must be a multiple of stripe unit */
- if (os < su || os % su)
- return 0;
- /* stripe count must be non-zero */
- if (!sc)
- return 0;
- return 1;
-}
-
-void ceph_file_layout_from_legacy(struct ceph_file_layout *fl,
- struct ceph_file_layout_legacy *legacy)
-{
- fl->stripe_unit = le32_to_cpu(legacy->fl_stripe_unit);
- fl->stripe_count = le32_to_cpu(legacy->fl_stripe_count);
- fl->object_size = le32_to_cpu(legacy->fl_object_size);
- fl->pool_id = le32_to_cpu(legacy->fl_pg_pool);
- if (fl->pool_id == 0 && fl->stripe_unit == 0 &&
- fl->stripe_count == 0 && fl->object_size == 0)
- fl->pool_id = -1;
-}
-EXPORT_SYMBOL(ceph_file_layout_from_legacy);
-
-void ceph_file_layout_to_legacy(struct ceph_file_layout *fl,
- struct ceph_file_layout_legacy *legacy)
-{
- legacy->fl_stripe_unit = cpu_to_le32(fl->stripe_unit);
- legacy->fl_stripe_count = cpu_to_le32(fl->stripe_count);
- legacy->fl_object_size = cpu_to_le32(fl->object_size);
- if (fl->pool_id >= 0)
- legacy->fl_pg_pool = cpu_to_le32(fl->pool_id);
- else
- legacy->fl_pg_pool = 0;
-}
-EXPORT_SYMBOL(ceph_file_layout_to_legacy);
-
-int ceph_flags_to_mode(int flags)
-{
- int mode;
-
-#ifdef O_DIRECTORY /* fixme */
- if ((flags & O_DIRECTORY) == O_DIRECTORY)
- return CEPH_FILE_MODE_PIN;
-#endif
-
- switch (flags & O_ACCMODE) {
- case O_WRONLY:
- mode = CEPH_FILE_MODE_WR;
- break;
- case O_RDONLY:
- mode = CEPH_FILE_MODE_RD;
- break;
- case O_RDWR:
- case O_ACCMODE: /* this is what the VFS does */
- mode = CEPH_FILE_MODE_RDWR;
- break;
- }
-#ifdef O_LAZY
- if (flags & O_LAZY)
- mode |= CEPH_FILE_MODE_LAZY;
-#endif
-
- return mode;
-}
-EXPORT_SYMBOL(ceph_flags_to_mode);
-
-int ceph_caps_for_mode(int mode)
-{
- int caps = CEPH_CAP_PIN;
-
- if (mode & CEPH_FILE_MODE_RD)
- caps |= CEPH_CAP_FILE_SHARED |
- CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE;
- if (mode & CEPH_FILE_MODE_WR)
- caps |= CEPH_CAP_FILE_EXCL |
- CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
- CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
- CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
- if (mode & CEPH_FILE_MODE_LAZY)
- caps |= CEPH_CAP_FILE_LAZYIO;
-
- return caps;
-}
-EXPORT_SYMBOL(ceph_caps_for_mode);
diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c
index 5d6724cee38f..4f75df40fb12 100644
--- a/net/ceph/crypto.c
+++ b/net/ceph/crypto.c
@@ -136,8 +136,10 @@ void ceph_crypto_key_destroy(struct ceph_crypto_key *key)
if (key) {
kfree(key->key);
key->key = NULL;
- crypto_free_sync_skcipher(key->tfm);
- key->tfm = NULL;
+ if (key->tfm) {
+ crypto_free_sync_skcipher(key->tfm);
+ key->tfm = NULL;
+ }
}
}
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 962f521c863e..5b4bd8261002 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -2004,10 +2004,8 @@ int ceph_parse_ips(const char *c, const char *end,
return 0;
bad:
- pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c);
return ret;
}
-EXPORT_SYMBOL(ceph_parse_ips);
static int process_banner(struct ceph_connection *con)
{
@@ -3031,6 +3029,12 @@ static void con_fault(struct ceph_connection *con)
}
+void ceph_messenger_reset_nonce(struct ceph_messenger *msgr)
+{
+ u32 nonce = le32_to_cpu(msgr->inst.addr.nonce) + 1000000;
+ msgr->inst.addr.nonce = cpu_to_le32(nonce);
+ encode_my_addr(msgr);
+}
/*
* initialize a new messenger instance
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 0520bf9825aa..9d9e4e4ea600 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -213,6 +213,13 @@ static void reopen_session(struct ceph_mon_client *monc)
__open_session(monc);
}
+void ceph_monc_reopen_session(struct ceph_mon_client *monc)
+{
+ mutex_lock(&monc->mutex);
+ reopen_session(monc);
+ mutex_unlock(&monc->mutex);
+}
+
static void un_backoff(struct ceph_mon_client *monc)
{
monc->hunt_mult /= 2; /* reduce by 50% */
@@ -1226,9 +1233,6 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
struct ceph_mon_client *monc = con->private;
int type = le16_to_cpu(msg->hdr.type);
- if (!monc)
- return;
-
switch (type) {
case CEPH_MSG_AUTH_REPLY:
handle_auth_reply(monc, msg);
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 0b2df09b2554..b68b376d8c2f 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -402,7 +402,7 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req,
case CEPH_OSD_OP_LIST_WATCHERS:
ceph_osd_data_release(&op->list_watchers.response_data);
break;
- case CEPH_OSD_OP_COPY_FROM:
+ case CEPH_OSD_OP_COPY_FROM2:
ceph_osd_data_release(&op->copy_from.osd_data);
break;
default:
@@ -697,7 +697,7 @@ static void get_num_data_items(struct ceph_osd_request *req,
case CEPH_OSD_OP_SETXATTR:
case CEPH_OSD_OP_CMPXATTR:
case CEPH_OSD_OP_NOTIFY_ACK:
- case CEPH_OSD_OP_COPY_FROM:
+ case CEPH_OSD_OP_COPY_FROM2:
*num_request_data_items += 1;
break;
@@ -841,6 +841,7 @@ int osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which,
struct ceph_pagelist *pagelist;
size_t payload_len = 0;
size_t size;
+ int ret;
op = _osd_req_op_init(osd_req, which, CEPH_OSD_OP_CALL, 0);
@@ -852,20 +853,27 @@ int osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which,
size = strlen(class);
BUG_ON(size > (size_t) U8_MAX);
op->cls.class_len = size;
- ceph_pagelist_append(pagelist, class, size);
+ ret = ceph_pagelist_append(pagelist, class, size);
+ if (ret)
+ goto err_pagelist_free;
payload_len += size;
op->cls.method_name = method;
size = strlen(method);
BUG_ON(size > (size_t) U8_MAX);
op->cls.method_len = size;
- ceph_pagelist_append(pagelist, method, size);
+ ret = ceph_pagelist_append(pagelist, method, size);
+ if (ret)
+ goto err_pagelist_free;
payload_len += size;
osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist);
-
op->indata_len = payload_len;
return 0;
+
+err_pagelist_free:
+ ceph_pagelist_release(pagelist);
+ return ret;
}
EXPORT_SYMBOL(osd_req_op_cls_init);
@@ -877,6 +885,7 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
opcode, 0);
struct ceph_pagelist *pagelist;
size_t payload_len;
+ int ret;
BUG_ON(opcode != CEPH_OSD_OP_SETXATTR && opcode != CEPH_OSD_OP_CMPXATTR);
@@ -886,10 +895,14 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
payload_len = strlen(name);
op->xattr.name_len = payload_len;
- ceph_pagelist_append(pagelist, name, payload_len);
+ ret = ceph_pagelist_append(pagelist, name, payload_len);
+ if (ret)
+ goto err_pagelist_free;
op->xattr.value_len = size;
- ceph_pagelist_append(pagelist, value, size);
+ ret = ceph_pagelist_append(pagelist, value, size);
+ if (ret)
+ goto err_pagelist_free;
payload_len += size;
op->xattr.cmp_op = cmp_op;
@@ -898,6 +911,10 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
ceph_osd_data_pagelist_init(&op->xattr.osd_data, pagelist);
op->indata_len = payload_len;
return 0;
+
+err_pagelist_free:
+ ceph_pagelist_release(pagelist);
+ return ret;
}
EXPORT_SYMBOL(osd_req_op_xattr_init);
@@ -1012,7 +1029,7 @@ static u32 osd_req_encode_op(struct ceph_osd_op *dst,
case CEPH_OSD_OP_CREATE:
case CEPH_OSD_OP_DELETE:
break;
- case CEPH_OSD_OP_COPY_FROM:
+ case CEPH_OSD_OP_COPY_FROM2:
dst->copy_from.snapid = cpu_to_le64(src->copy_from.snapid);
dst->copy_from.src_version =
cpu_to_le64(src->copy_from.src_version);
@@ -1488,7 +1505,6 @@ enum calc_target_result {
static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
struct ceph_osd_request_target *t,
- struct ceph_connection *con,
bool any_change)
{
struct ceph_pg_pool_info *pi;
@@ -1496,7 +1512,7 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
struct ceph_osds up, acting;
bool force_resend = false;
bool unpaused = false;
- bool legacy_change;
+ bool legacy_change = false;
bool split = false;
bool sort_bitwise = ceph_osdmap_flag(osdc, CEPH_OSDMAP_SORTBITWISE);
bool recovery_deletes = ceph_osdmap_flag(osdc,
@@ -1584,15 +1600,14 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
t->osd = acting.primary;
}
- if (unpaused || legacy_change || force_resend ||
- (split && con && CEPH_HAVE_FEATURE(con->peer_features,
- RESEND_ON_SPLIT)))
+ if (unpaused || legacy_change || force_resend || split)
ct_res = CALC_TARGET_NEED_RESEND;
else
ct_res = CALC_TARGET_NO_ACTION;
out:
- dout("%s t %p -> ct_res %d osd %d\n", __func__, t, ct_res, t->osd);
+ dout("%s t %p -> %d%d%d%d ct_res %d osd%d\n", __func__, t, unpaused,
+ legacy_change, force_resend, split, ct_res, t->osd);
return ct_res;
}
@@ -1951,7 +1966,7 @@ static void setup_request_data(struct ceph_osd_request *req)
ceph_osdc_msg_data_add(request_msg,
&op->notify_ack.request_data);
break;
- case CEPH_OSD_OP_COPY_FROM:
+ case CEPH_OSD_OP_COPY_FROM2:
ceph_osdc_msg_data_add(request_msg,
&op->copy_from.osd_data);
break;
@@ -2273,7 +2288,7 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
dout("%s req %p wrlocked %d\n", __func__, req, wrlocked);
again:
- ct_res = calc_target(osdc, &req->r_t, NULL, false);
+ ct_res = calc_target(osdc, &req->r_t, false);
if (ct_res == CALC_TARGET_POOL_DNE && !wrlocked)
goto promote;
@@ -2477,6 +2492,14 @@ void ceph_osdc_abort_requests(struct ceph_osd_client *osdc, int err)
}
EXPORT_SYMBOL(ceph_osdc_abort_requests);
+void ceph_osdc_clear_abort_err(struct ceph_osd_client *osdc)
+{
+ down_write(&osdc->lock);
+ osdc->abort_err = 0;
+ up_write(&osdc->lock);
+}
+EXPORT_SYMBOL(ceph_osdc_clear_abort_err);
+
static void update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb)
{
if (likely(eb > osdc->epoch_barrier)) {
@@ -3088,7 +3111,7 @@ static void linger_submit(struct ceph_osd_linger_request *lreq)
lreq->reg_req->r_ops[0].notify.cookie = lreq->linger_id;
}
- calc_target(osdc, &lreq->t, NULL, false);
+ calc_target(osdc, &lreq->t, false);
osd = lookup_create_osd(osdc, lreq->t.osd, true);
link_linger(osd, lreq);
@@ -3705,7 +3728,7 @@ recalc_linger_target(struct ceph_osd_linger_request *lreq)
struct ceph_osd_client *osdc = lreq->osdc;
enum calc_target_result ct_res;
- ct_res = calc_target(osdc, &lreq->t, NULL, true);
+ ct_res = calc_target(osdc, &lreq->t, true);
if (ct_res == CALC_TARGET_NEED_RESEND) {
struct ceph_osd *osd;
@@ -3777,8 +3800,7 @@ static void scan_requests(struct ceph_osd *osd,
n = rb_next(n); /* unlink_request(), check_pool_dne() */
dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
- ct_res = calc_target(osdc, &req->r_t, &req->r_osd->o_con,
- false);
+ ct_res = calc_target(osdc, &req->r_t, false);
switch (ct_res) {
case CALC_TARGET_NO_ACTION:
force_resend_writes = cleared_full ||
@@ -3887,7 +3909,7 @@ static void kick_requests(struct ceph_osd_client *osdc,
n = rb_next(n);
if (req->r_t.epoch < osdc->osdmap->epoch) {
- ct_res = calc_target(osdc, &req->r_t, NULL, false);
+ ct_res = calc_target(osdc, &req->r_t, false);
if (ct_res == CALC_TARGET_POOL_DNE) {
erase_request(need_resend, req);
check_pool_dne(req);
@@ -5088,6 +5110,24 @@ out_put_req:
EXPORT_SYMBOL(ceph_osdc_call);
/*
+ * reset all osd connections
+ */
+void ceph_osdc_reopen_osds(struct ceph_osd_client *osdc)
+{
+ struct rb_node *n;
+
+ down_write(&osdc->lock);
+ for (n = rb_first(&osdc->osds); n; ) {
+ struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
+
+ n = rb_next(n);
+ if (!reopen_osd(osd))
+ kick_osd_requests(osd);
+ }
+ up_write(&osdc->lock);
+}
+
+/*
* init, shutdown
*/
int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
@@ -5275,6 +5315,7 @@ static int osd_req_op_copy_from_init(struct ceph_osd_request *req,
struct ceph_object_locator *src_oloc,
u32 src_fadvise_flags,
u32 dst_fadvise_flags,
+ u32 truncate_seq, u64 truncate_size,
u8 copy_from_flags)
{
struct ceph_osd_req_op *op;
@@ -5285,7 +5326,8 @@ static int osd_req_op_copy_from_init(struct ceph_osd_request *req,
if (IS_ERR(pages))
return PTR_ERR(pages);
- op = _osd_req_op_init(req, 0, CEPH_OSD_OP_COPY_FROM, dst_fadvise_flags);
+ op = _osd_req_op_init(req, 0, CEPH_OSD_OP_COPY_FROM2,
+ dst_fadvise_flags);
op->copy_from.snapid = src_snapid;
op->copy_from.src_version = src_version;
op->copy_from.flags = copy_from_flags;
@@ -5295,6 +5337,8 @@ static int osd_req_op_copy_from_init(struct ceph_osd_request *req,
end = p + PAGE_SIZE;
ceph_encode_string(&p, end, src_oid->name, src_oid->name_len);
encode_oloc(&p, end, src_oloc);
+ ceph_encode_32(&p, truncate_seq);
+ ceph_encode_64(&p, truncate_size);
op->indata_len = PAGE_SIZE - (end - p);
ceph_osd_data_pages_init(&op->copy_from.osd_data, pages,
@@ -5310,6 +5354,7 @@ int ceph_osdc_copy_from(struct ceph_osd_client *osdc,
struct ceph_object_id *dst_oid,
struct ceph_object_locator *dst_oloc,
u32 dst_fadvise_flags,
+ u32 truncate_seq, u64 truncate_size,
u8 copy_from_flags)
{
struct ceph_osd_request *req;
@@ -5326,7 +5371,8 @@ int ceph_osdc_copy_from(struct ceph_osd_client *osdc,
ret = osd_req_op_copy_from_init(req, src_snapid, src_version, src_oid,
src_oloc, src_fadvise_flags,
- dst_fadvise_flags, copy_from_flags);
+ dst_fadvise_flags, truncate_seq,
+ truncate_size, copy_from_flags);
if (ret)
goto out;
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 90437906b7bc..4e0de14f80bb 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -973,11 +973,11 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map)
struct ceph_pg_pool_info, node);
__remove_pg_pool(&map->pg_pools, pi);
}
- kfree(map->osd_state);
- kfree(map->osd_weight);
- kfree(map->osd_addr);
- kfree(map->osd_primary_affinity);
- kfree(map->crush_workspace);
+ kvfree(map->osd_state);
+ kvfree(map->osd_weight);
+ kvfree(map->osd_addr);
+ kvfree(map->osd_primary_affinity);
+ kvfree(map->crush_workspace);
kfree(map);
}
@@ -986,28 +986,41 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map)
*
* The new elements are properly initialized.
*/
-static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
+static int osdmap_set_max_osd(struct ceph_osdmap *map, u32 max)
{
u32 *state;
u32 *weight;
struct ceph_entity_addr *addr;
+ u32 to_copy;
int i;
- state = krealloc(map->osd_state, max*sizeof(*state), GFP_NOFS);
- if (!state)
- return -ENOMEM;
- map->osd_state = state;
+ dout("%s old %u new %u\n", __func__, map->max_osd, max);
+ if (max == map->max_osd)
+ return 0;
- weight = krealloc(map->osd_weight, max*sizeof(*weight), GFP_NOFS);
- if (!weight)
+ state = ceph_kvmalloc(array_size(max, sizeof(*state)), GFP_NOFS);
+ weight = ceph_kvmalloc(array_size(max, sizeof(*weight)), GFP_NOFS);
+ addr = ceph_kvmalloc(array_size(max, sizeof(*addr)), GFP_NOFS);
+ if (!state || !weight || !addr) {
+ kvfree(state);
+ kvfree(weight);
+ kvfree(addr);
return -ENOMEM;
- map->osd_weight = weight;
+ }
- addr = krealloc(map->osd_addr, max*sizeof(*addr), GFP_NOFS);
- if (!addr)
- return -ENOMEM;
- map->osd_addr = addr;
+ to_copy = min(map->max_osd, max);
+ if (map->osd_state) {
+ memcpy(state, map->osd_state, to_copy * sizeof(*state));
+ memcpy(weight, map->osd_weight, to_copy * sizeof(*weight));
+ memcpy(addr, map->osd_addr, to_copy * sizeof(*addr));
+ kvfree(map->osd_state);
+ kvfree(map->osd_weight);
+ kvfree(map->osd_addr);
+ }
+ map->osd_state = state;
+ map->osd_weight = weight;
+ map->osd_addr = addr;
for (i = map->max_osd; i < max; i++) {
map->osd_state[i] = 0;
map->osd_weight[i] = CEPH_OSD_OUT;
@@ -1017,12 +1030,16 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
if (map->osd_primary_affinity) {
u32 *affinity;
- affinity = krealloc(map->osd_primary_affinity,
- max*sizeof(*affinity), GFP_NOFS);
+ affinity = ceph_kvmalloc(array_size(max, sizeof(*affinity)),
+ GFP_NOFS);
if (!affinity)
return -ENOMEM;
- map->osd_primary_affinity = affinity;
+ memcpy(affinity, map->osd_primary_affinity,
+ to_copy * sizeof(*affinity));
+ kvfree(map->osd_primary_affinity);
+
+ map->osd_primary_affinity = affinity;
for (i = map->max_osd; i < max; i++)
map->osd_primary_affinity[i] =
CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
@@ -1043,7 +1060,7 @@ static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush)
work_size = crush_work_size(crush, CEPH_PG_MAX_SIZE);
dout("%s work_size %zu bytes\n", __func__, work_size);
- workspace = kmalloc(work_size, GFP_NOIO);
+ workspace = ceph_kvmalloc(work_size, GFP_NOIO);
if (!workspace) {
crush_destroy(crush);
return -ENOMEM;
@@ -1052,7 +1069,7 @@ static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush)
if (map->crush)
crush_destroy(map->crush);
- kfree(map->crush_workspace);
+ kvfree(map->crush_workspace);
map->crush = crush;
map->crush_workspace = workspace;
return 0;
@@ -1298,9 +1315,9 @@ static int set_primary_affinity(struct ceph_osdmap *map, int osd, u32 aff)
if (!map->osd_primary_affinity) {
int i;
- map->osd_primary_affinity = kmalloc_array(map->max_osd,
- sizeof(u32),
- GFP_NOFS);
+ map->osd_primary_affinity = ceph_kvmalloc(
+ array_size(map->max_osd, sizeof(*map->osd_primary_affinity)),
+ GFP_NOFS);
if (!map->osd_primary_affinity)
return -ENOMEM;
@@ -1321,7 +1338,7 @@ static int decode_primary_affinity(void **p, void *end,
ceph_decode_32_safe(p, end, len, e_inval);
if (len == 0) {
- kfree(map->osd_primary_affinity);
+ kvfree(map->osd_primary_affinity);
map->osd_primary_affinity = NULL;
return 0;
}
diff --git a/net/compat.c b/net/compat.c
index 0f7ded26059e..47d99c784947 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -232,7 +232,7 @@ int put_cmsg_compat(struct msghdr *kmsg, int level, int type, int len, void *dat
(type == SO_TIMESTAMPNS_OLD || type == SO_TIMESTAMPING_OLD)) {
int count = type == SO_TIMESTAMPNS_OLD ? 1 : 3;
int i;
- struct timespec *ts = (struct timespec *)data;
+ struct __kernel_old_timespec *ts = data;
for (i = 0; i < count; i++) {
cts[i].tv_sec = ts[i].tv_sec;
cts[i].tv_nsec = ts[i].tv_nsec;
diff --git a/net/core/Makefile b/net/core/Makefile
index a104dc8faafc..3e2c378e5f31 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -8,7 +8,7 @@ obj-y := sock.o request_sock.o skbuff.o datagram.o stream.o scm.o \
obj-$(CONFIG_SYSCTL) += sysctl_net_core.o
-obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \
+obj-y += dev.o dev_addr_lists.o dst.o netevent.o \
neighbour.o rtnetlink.o utils.o link_watch.o filter.o \
sock_diag.o dev_ioctl.o tso.o sock_reuseport.o \
fib_notifier.o xdp.o flow_offload.o
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index 94c7f77ecb6b..3ab23f698221 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -12,6 +12,9 @@
static atomic_t cache_idx;
+#define SK_STORAGE_CREATE_FLAG_MASK \
+ (BPF_F_NO_PREALLOC | BPF_F_CLONE)
+
struct bucket {
struct hlist_head list;
raw_spinlock_t lock;
@@ -209,7 +212,6 @@ static void selem_unlink_sk(struct bpf_sk_storage_elem *selem)
kfree_rcu(sk_storage, rcu);
}
-/* sk_storage->lock must be held and sk_storage->list cannot be empty */
static void __selem_link_sk(struct bpf_sk_storage *sk_storage,
struct bpf_sk_storage_elem *selem)
{
@@ -509,7 +511,7 @@ static int sk_storage_delete(struct sock *sk, struct bpf_map *map)
return 0;
}
-/* Called by __sk_destruct() */
+/* Called by __sk_destruct() & bpf_sk_storage_clone() */
void bpf_sk_storage_free(struct sock *sk)
{
struct bpf_sk_storage_elem *selem;
@@ -557,6 +559,11 @@ static void bpf_sk_storage_map_free(struct bpf_map *map)
smap = (struct bpf_sk_storage_map *)map;
+ /* Note that this map might be concurrently cloned from
+ * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone
+ * RCU read section to finish before proceeding. New RCU
+ * read sections should be prevented via bpf_map_inc_not_zero.
+ */
synchronize_rcu();
/* bpf prog and the userspace can no longer access this map
@@ -601,7 +608,9 @@ static void bpf_sk_storage_map_free(struct bpf_map *map)
static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr)
{
- if (attr->map_flags != BPF_F_NO_PREALLOC || attr->max_entries ||
+ if (attr->map_flags & ~SK_STORAGE_CREATE_FLAG_MASK ||
+ !(attr->map_flags & BPF_F_NO_PREALLOC) ||
+ attr->max_entries ||
attr->key_size != sizeof(int) || !attr->value_size ||
/* Enforce BTF for userspace sk dumping */
!attr->btf_key_type_id || !attr->btf_value_type_id)
@@ -634,9 +643,10 @@ static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr)
return ERR_PTR(-ENOMEM);
bpf_map_init_from_attr(&smap->map, attr);
+ nbuckets = roundup_pow_of_two(num_possible_cpus());
/* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */
- smap->bucket_log = max_t(u32, 1, ilog2(roundup_pow_of_two(num_possible_cpus())));
- nbuckets = 1U << smap->bucket_log;
+ nbuckets = max_t(u32, 2, nbuckets);
+ smap->bucket_log = ilog2(nbuckets);
cost = sizeof(*smap->buckets) * nbuckets + sizeof(*smap);
ret = bpf_map_charge_init(&smap->map.memory, cost);
@@ -739,6 +749,95 @@ static int bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key)
return err;
}
+static struct bpf_sk_storage_elem *
+bpf_sk_storage_clone_elem(struct sock *newsk,
+ struct bpf_sk_storage_map *smap,
+ struct bpf_sk_storage_elem *selem)
+{
+ struct bpf_sk_storage_elem *copy_selem;
+
+ copy_selem = selem_alloc(smap, newsk, NULL, true);
+ if (!copy_selem)
+ return NULL;
+
+ if (map_value_has_spin_lock(&smap->map))
+ copy_map_value_locked(&smap->map, SDATA(copy_selem)->data,
+ SDATA(selem)->data, true);
+ else
+ copy_map_value(&smap->map, SDATA(copy_selem)->data,
+ SDATA(selem)->data);
+
+ return copy_selem;
+}
+
+int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
+{
+ struct bpf_sk_storage *new_sk_storage = NULL;
+ struct bpf_sk_storage *sk_storage;
+ struct bpf_sk_storage_elem *selem;
+ int ret = 0;
+
+ RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);
+
+ rcu_read_lock();
+ sk_storage = rcu_dereference(sk->sk_bpf_storage);
+
+ if (!sk_storage || hlist_empty(&sk_storage->list))
+ goto out;
+
+ hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) {
+ struct bpf_sk_storage_elem *copy_selem;
+ struct bpf_sk_storage_map *smap;
+ struct bpf_map *map;
+
+ smap = rcu_dereference(SDATA(selem)->smap);
+ if (!(smap->map.map_flags & BPF_F_CLONE))
+ continue;
+
+ /* Note that for lockless listeners adding new element
+ * here can race with cleanup in bpf_sk_storage_map_free.
+ * Try to grab map refcnt to make sure that it's still
+ * alive and prevent concurrent removal.
+ */
+ map = bpf_map_inc_not_zero(&smap->map);
+ if (IS_ERR(map))
+ continue;
+
+ copy_selem = bpf_sk_storage_clone_elem(newsk, smap, selem);
+ if (!copy_selem) {
+ ret = -ENOMEM;
+ bpf_map_put(map);
+ goto out;
+ }
+
+ if (new_sk_storage) {
+ selem_link_map(smap, copy_selem);
+ __selem_link_sk(new_sk_storage, copy_selem);
+ } else {
+ ret = sk_storage_alloc(newsk, smap, copy_selem);
+ if (ret) {
+ kfree(copy_selem);
+ atomic_sub(smap->elem_size,
+ &newsk->sk_omem_alloc);
+ bpf_map_put(map);
+ goto out;
+ }
+
+ new_sk_storage = rcu_dereference(copy_selem->sk_storage);
+ }
+ bpf_map_put(map);
+ }
+
+out:
+ rcu_read_unlock();
+
+ /* In case of an error, don't free anything explicitly here, the
+ * caller is responsible to call bpf_sk_storage_free.
+ */
+
+ return ret;
+}
+
BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
void *, value, u64, flags)
{
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 45a162ef5e02..a78e7f864c1e 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -84,7 +84,8 @@ static int receiver_wake_function(wait_queue_entry_t *wait, unsigned int mode, i
/*
* Wait for the last received packet to be different from skb
*/
-int __skb_wait_for_more_packets(struct sock *sk, int *err, long *timeo_p,
+int __skb_wait_for_more_packets(struct sock *sk, struct sk_buff_head *queue,
+ int *err, long *timeo_p,
const struct sk_buff *skb)
{
int error;
@@ -97,7 +98,7 @@ int __skb_wait_for_more_packets(struct sock *sk, int *err, long *timeo_p,
if (error)
goto out_err;
- if (sk->sk_receive_queue.prev != skb)
+ if (READ_ONCE(queue->prev) != skb)
goto out;
/* Socket shut down? */
@@ -209,6 +210,7 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk,
/**
* __skb_try_recv_datagram - Receive a datagram skbuff
* @sk: socket
+ * @queue: socket queue from which to receive
* @flags: MSG\_ flags
* @destructor: invoked under the receive lock on successful dequeue
* @off: an offset in bytes to peek skb from. Returns an offset
@@ -241,13 +243,14 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk,
* quite explicitly by POSIX 1003.1g, don't change them without having
* the standard around please.
*/
-struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags,
+struct sk_buff *__skb_try_recv_datagram(struct sock *sk,
+ struct sk_buff_head *queue,
+ unsigned int flags,
void (*destructor)(struct sock *sk,
struct sk_buff *skb),
int *off, int *err,
struct sk_buff **last)
{
- struct sk_buff_head *queue = &sk->sk_receive_queue;
struct sk_buff *skb;
unsigned long cpu_flags;
/*
@@ -278,7 +281,7 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags,
break;
sk_busy_loop(sk, flags & MSG_DONTWAIT);
- } while (sk->sk_receive_queue.prev != *last);
+ } while (READ_ONCE(queue->prev) != *last);
error = -EAGAIN;
@@ -288,7 +291,9 @@ no_packet:
}
EXPORT_SYMBOL(__skb_try_recv_datagram);
-struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
+struct sk_buff *__skb_recv_datagram(struct sock *sk,
+ struct sk_buff_head *sk_queue,
+ unsigned int flags,
void (*destructor)(struct sock *sk,
struct sk_buff *skb),
int *off, int *err)
@@ -299,15 +304,16 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
do {
- skb = __skb_try_recv_datagram(sk, flags, destructor, off, err,
- &last);
+ skb = __skb_try_recv_datagram(sk, sk_queue, flags, destructor,
+ off, err, &last);
if (skb)
return skb;
if (*err != -EAGAIN)
break;
} while (timeo &&
- !__skb_wait_for_more_packets(sk, err, &timeo, last));
+ !__skb_wait_for_more_packets(sk, sk_queue, err,
+ &timeo, last));
return NULL;
}
@@ -318,7 +324,8 @@ struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags,
{
int off = 0;
- return __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
+ return __skb_recv_datagram(sk, &sk->sk_receive_queue,
+ flags | (noblock ? MSG_DONTWAIT : 0),
NULL, &off, err);
}
EXPORT_SYMBOL(skb_recv_datagram);
@@ -442,8 +449,8 @@ static int __skb_datagram_iter(const struct sk_buff *skb, int offset,
if (copy > len)
copy = len;
- n = cb(vaddr + frag->page_offset +
- offset - start, copy, data, to);
+ n = cb(vaddr + skb_frag_off(frag) + offset - start,
+ copy, data, to);
kunmap(page);
offset += n;
if (n != copy)
@@ -573,7 +580,7 @@ int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset,
if (copy > len)
copy = len;
copied = copy_page_from_iter(skb_frag_page(frag),
- frag->page_offset + offset - start,
+ skb_frag_off(frag) + offset - start,
copy, from);
if (copied != copy)
goto fault;
@@ -640,7 +647,7 @@ int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
skb->len += copied;
skb->truesize += truesize;
if (sk && sk->sk_type == SOCK_STREAM) {
- sk->sk_wmem_queued += truesize;
+ sk_wmem_queued_add(sk, truesize);
sk_mem_charge(sk, truesize);
} else {
refcount_add(truesize, &skb->sk->sk_wmem_alloc);
@@ -767,7 +774,7 @@ __poll_t datagram_poll(struct file *file, struct socket *sock,
mask = 0;
/* exceptional events? */
- if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
+ if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
mask |= EPOLLERR |
(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
@@ -777,7 +784,7 @@ __poll_t datagram_poll(struct file *file, struct socket *sock,
mask |= EPOLLHUP;
/* readable? */
- if (!skb_queue_empty(&sk->sk_receive_queue))
+ if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
mask |= EPOLLIN | EPOLLRDNORM;
/* Connection-based need to check for termination and startup */
diff --git a/net/core/dev.c b/net/core/dev.c
index 0891f499c1bb..a6316b336128 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -146,6 +146,7 @@
#include "net-sysfs.h"
#define MAX_GRO_SKBS 8
+#define MAX_NEST_DEV 8
/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)
@@ -228,6 +229,122 @@ static inline void rps_unlock(struct softnet_data *sd)
#endif
}
+static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev,
+ const char *name)
+{
+ struct netdev_name_node *name_node;
+
+ name_node = kmalloc(sizeof(*name_node), GFP_KERNEL);
+ if (!name_node)
+ return NULL;
+ INIT_HLIST_NODE(&name_node->hlist);
+ name_node->dev = dev;
+ name_node->name = name;
+ return name_node;
+}
+
+static struct netdev_name_node *
+netdev_name_node_head_alloc(struct net_device *dev)
+{
+ struct netdev_name_node *name_node;
+
+ name_node = netdev_name_node_alloc(dev, dev->name);
+ if (!name_node)
+ return NULL;
+ INIT_LIST_HEAD(&name_node->list);
+ return name_node;
+}
+
+static void netdev_name_node_free(struct netdev_name_node *name_node)
+{
+ kfree(name_node);
+}
+
+static void netdev_name_node_add(struct net *net,
+ struct netdev_name_node *name_node)
+{
+ hlist_add_head_rcu(&name_node->hlist,
+ dev_name_hash(net, name_node->name));
+}
+
+static void netdev_name_node_del(struct netdev_name_node *name_node)
+{
+ hlist_del_rcu(&name_node->hlist);
+}
+
+static struct netdev_name_node *netdev_name_node_lookup(struct net *net,
+ const char *name)
+{
+ struct hlist_head *head = dev_name_hash(net, name);
+ struct netdev_name_node *name_node;
+
+ hlist_for_each_entry(name_node, head, hlist)
+ if (!strcmp(name_node->name, name))
+ return name_node;
+ return NULL;
+}
+
+static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net,
+ const char *name)
+{
+ struct hlist_head *head = dev_name_hash(net, name);
+ struct netdev_name_node *name_node;
+
+ hlist_for_each_entry_rcu(name_node, head, hlist)
+ if (!strcmp(name_node->name, name))
+ return name_node;
+ return NULL;
+}
+
+int netdev_name_node_alt_create(struct net_device *dev, const char *name)
+{
+ struct netdev_name_node *name_node;
+ struct net *net = dev_net(dev);
+
+ name_node = netdev_name_node_lookup(net, name);
+ if (name_node)
+ return -EEXIST;
+ name_node = netdev_name_node_alloc(dev, name);
+ if (!name_node)
+ return -ENOMEM;
+ netdev_name_node_add(net, name_node);
+ /* The node that holds dev->name acts as a head of per-device list. */
+ list_add_tail(&name_node->list, &dev->name_node->list);
+
+ return 0;
+}
+EXPORT_SYMBOL(netdev_name_node_alt_create);
+
+static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node)
+{
+ list_del(&name_node->list);
+ netdev_name_node_del(name_node);
+ kfree(name_node->name);
+ netdev_name_node_free(name_node);
+}
+
+int netdev_name_node_alt_destroy(struct net_device *dev, const char *name)
+{
+ struct netdev_name_node *name_node;
+ struct net *net = dev_net(dev);
+
+ name_node = netdev_name_node_lookup(net, name);
+ if (!name_node)
+ return -ENOENT;
+ __netdev_name_node_alt_destroy(name_node);
+
+ return 0;
+}
+EXPORT_SYMBOL(netdev_name_node_alt_destroy);
+
+static void netdev_name_node_alt_flush(struct net_device *dev)
+{
+ struct netdev_name_node *name_node, *tmp;
+
+ list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list)
+ __netdev_name_node_alt_destroy(name_node);
+}
+
/* Device list insertion */
static void list_netdevice(struct net_device *dev)
{
@@ -237,7 +354,7 @@ static void list_netdevice(struct net_device *dev)
write_lock_bh(&dev_base_lock);
list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
- hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
+ netdev_name_node_add(net, dev->name_node);
hlist_add_head_rcu(&dev->index_hlist,
dev_index_hash(net, dev->ifindex));
write_unlock_bh(&dev_base_lock);
@@ -255,7 +372,7 @@ static void unlist_netdevice(struct net_device *dev)
/* Unlink dev from the device chain */
write_lock_bh(&dev_base_lock);
list_del_rcu(&dev->dev_list);
- hlist_del_rcu(&dev->name_hlist);
+ netdev_name_node_del(dev->name_node);
hlist_del_rcu(&dev->index_hlist);
write_unlock_bh(&dev_base_lock);
@@ -276,88 +393,6 @@ static RAW_NOTIFIER_HEAD(netdev_chain);
DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);
-#ifdef CONFIG_LOCKDEP
-/*
- * register_netdevice() inits txq->_xmit_lock and sets lockdep class
- * according to dev->type
- */
-static const unsigned short netdev_lock_type[] = {
- ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
- ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
- ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
- ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
- ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
- ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
- ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
- ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
- ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
- ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
- ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
- ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
- ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
- ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
- ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
-
-static const char *const netdev_lock_name[] = {
- "_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
- "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
- "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
- "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
- "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
- "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
- "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
- "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
- "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
- "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
- "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
- "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
- "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
- "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
- "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
-
-static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
-static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
-
-static inline unsigned short netdev_lock_pos(unsigned short dev_type)
-{
- int i;
-
- for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
- if (netdev_lock_type[i] == dev_type)
- return i;
- /* the last key is used by default */
- return ARRAY_SIZE(netdev_lock_type) - 1;
-}
-
-static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
- unsigned short dev_type)
-{
- int i;
-
- i = netdev_lock_pos(dev_type);
- lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
- netdev_lock_name[i]);
-}
-
-static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
-{
- int i;
-
- i = netdev_lock_pos(dev->type);
- lockdep_set_class_and_name(&dev->addr_list_lock,
- &netdev_addr_lock_key[i],
- netdev_lock_name[i]);
-}
-#else
-static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
- unsigned short dev_type)
-{
-}
-static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
-{
-}
-#endif
-
/*******************************************************************************
*
* Protocol management and registration routines
@@ -733,14 +768,10 @@ EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
- struct net_device *dev;
- struct hlist_head *head = dev_name_hash(net, name);
-
- hlist_for_each_entry(dev, head, name_hlist)
- if (!strncmp(dev->name, name, IFNAMSIZ))
- return dev;
+ struct netdev_name_node *node_name;
- return NULL;
+ node_name = netdev_name_node_lookup(net, name);
+ return node_name ? node_name->dev : NULL;
}
EXPORT_SYMBOL(__dev_get_by_name);
@@ -758,14 +789,10 @@ EXPORT_SYMBOL(__dev_get_by_name);
struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
{
- struct net_device *dev;
- struct hlist_head *head = dev_name_hash(net, name);
+ struct netdev_name_node *node_name;
- hlist_for_each_entry_rcu(dev, head, name_hlist)
- if (!strncmp(dev->name, name, IFNAMSIZ))
- return dev;
-
- return NULL;
+ node_name = netdev_name_node_lookup_rcu(net, name);
+ return node_name ? node_name->dev : NULL;
}
EXPORT_SYMBOL(dev_get_by_name_rcu);
@@ -901,7 +928,7 @@ EXPORT_SYMBOL(dev_get_by_napi_id);
*
* The use of raw_seqcount_begin() and cond_resched() before
* retrying is required as we want to give the writers a chance
- * to complete when CONFIG_PREEMPT is not set.
+ * to complete when CONFIG_PREEMPTION is not set.
*/
int netdev_get_name(struct net *net, char *name, int ifindex)
{
@@ -1141,8 +1168,8 @@ int dev_alloc_name(struct net_device *dev, const char *name)
}
EXPORT_SYMBOL(dev_alloc_name);
-int dev_get_valid_name(struct net *net, struct net_device *dev,
- const char *name)
+static int dev_get_valid_name(struct net *net, struct net_device *dev,
+ const char *name)
{
BUG_ON(!net);
@@ -1158,7 +1185,6 @@ int dev_get_valid_name(struct net *net, struct net_device *dev,
return 0;
}
-EXPORT_SYMBOL(dev_get_valid_name);
/**
* dev_change_name - change name of a device
@@ -1232,13 +1258,13 @@ rollback:
netdev_adjacent_rename_links(dev, oldname);
write_lock_bh(&dev_base_lock);
- hlist_del_rcu(&dev->name_hlist);
+ netdev_name_node_del(dev->name_node);
write_unlock_bh(&dev_base_lock);
synchronize_rcu();
write_lock_bh(&dev_base_lock);
- hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
+ netdev_name_node_add(net, dev->name_node);
write_unlock_bh(&dev_base_lock);
ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
@@ -1288,8 +1314,8 @@ int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
}
mutex_lock(&ifalias_mutex);
- rcu_swap_protected(dev->ifalias, new_alias,
- mutex_is_locked(&ifalias_mutex));
+ new_alias = rcu_replace_pointer(dev->ifalias, new_alias,
+ mutex_is_locked(&ifalias_mutex));
mutex_unlock(&ifalias_mutex);
if (new_alias)
@@ -1617,6 +1643,62 @@ static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
return nb->notifier_call(nb, val, &info);
}
+static int call_netdevice_register_notifiers(struct notifier_block *nb,
+ struct net_device *dev)
+{
+ int err;
+
+ err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
+ err = notifier_to_errno(err);
+ if (err)
+ return err;
+
+ if (!(dev->flags & IFF_UP))
+ return 0;
+
+ call_netdevice_notifier(nb, NETDEV_UP, dev);
+ return 0;
+}
+
+static void call_netdevice_unregister_notifiers(struct notifier_block *nb,
+ struct net_device *dev)
+{
+ if (dev->flags & IFF_UP) {
+ call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
+ dev);
+ call_netdevice_notifier(nb, NETDEV_DOWN, dev);
+ }
+ call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
+}
+
+static int call_netdevice_register_net_notifiers(struct notifier_block *nb,
+ struct net *net)
+{
+ struct net_device *dev;
+ int err;
+
+ for_each_netdev(net, dev) {
+ err = call_netdevice_register_notifiers(nb, dev);
+ if (err)
+ goto rollback;
+ }
+ return 0;
+
+rollback:
+ for_each_netdev_continue_reverse(net, dev)
+ call_netdevice_unregister_notifiers(nb, dev);
+ return err;
+}
+
+static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb,
+ struct net *net)
+{
+ struct net_device *dev;
+
+ for_each_netdev(net, dev)
+ call_netdevice_unregister_notifiers(nb, dev);
+}
+
static int dev_boot_phase = 1;
/**
@@ -1635,8 +1717,6 @@ static int dev_boot_phase = 1;
int register_netdevice_notifier(struct notifier_block *nb)
{
- struct net_device *dev;
- struct net_device *last;
struct net *net;
int err;
@@ -1649,17 +1729,9 @@ int register_netdevice_notifier(struct notifier_block *nb)
if (dev_boot_phase)
goto unlock;
for_each_net(net) {
- for_each_netdev(net, dev) {
- err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
- err = notifier_to_errno(err);
- if (err)
- goto rollback;
-
- if (!(dev->flags & IFF_UP))
- continue;
-
- call_netdevice_notifier(nb, NETDEV_UP, dev);
- }
+ err = call_netdevice_register_net_notifiers(nb, net);
+ if (err)
+ goto rollback;
}
unlock:
@@ -1668,22 +1740,9 @@ unlock:
return err;
rollback:
- last = dev;
- for_each_net(net) {
- for_each_netdev(net, dev) {
- if (dev == last)
- goto outroll;
-
- if (dev->flags & IFF_UP) {
- call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
- dev);
- call_netdevice_notifier(nb, NETDEV_DOWN, dev);
- }
- call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
- }
- }
+ for_each_net_continue_reverse(net)
+ call_netdevice_unregister_net_notifiers(nb, net);
-outroll:
raw_notifier_chain_unregister(&netdev_chain, nb);
goto unlock;
}
@@ -1705,7 +1764,6 @@ EXPORT_SYMBOL(register_netdevice_notifier);
int unregister_netdevice_notifier(struct notifier_block *nb)
{
- struct net_device *dev;
struct net *net;
int err;
@@ -1716,16 +1774,9 @@ int unregister_netdevice_notifier(struct notifier_block *nb)
if (err)
goto unlock;
- for_each_net(net) {
- for_each_netdev(net, dev) {
- if (dev->flags & IFF_UP) {
- call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
- dev);
- call_netdevice_notifier(nb, NETDEV_DOWN, dev);
- }
- call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
- }
- }
+ for_each_net(net)
+ call_netdevice_unregister_net_notifiers(nb, net);
+
unlock:
rtnl_unlock();
up_write(&pernet_ops_rwsem);
@@ -1733,6 +1784,138 @@ unlock:
}
EXPORT_SYMBOL(unregister_netdevice_notifier);
+static int __register_netdevice_notifier_net(struct net *net,
+ struct notifier_block *nb,
+ bool ignore_call_fail)
+{
+ int err;
+
+ err = raw_notifier_chain_register(&net->netdev_chain, nb);
+ if (err)
+ return err;
+ if (dev_boot_phase)
+ return 0;
+
+ err = call_netdevice_register_net_notifiers(nb, net);
+ if (err && !ignore_call_fail)
+ goto chain_unregister;
+
+ return 0;
+
+chain_unregister:
+ raw_notifier_chain_unregister(&net->netdev_chain, nb);
+ return err;
+}
+
+static int __unregister_netdevice_notifier_net(struct net *net,
+ struct notifier_block *nb)
+{
+ int err;
+
+ err = raw_notifier_chain_unregister(&net->netdev_chain, nb);
+ if (err)
+ return err;
+
+ call_netdevice_unregister_net_notifiers(nb, net);
+ return 0;
+}
+
+/**
+ * register_netdevice_notifier_net - register a per-netns network notifier block
+ * @net: network namespace
+ * @nb: notifier
+ *
+ * Register a notifier to be called when network device events occur.
+ * The notifier passed is linked into the kernel structures and must
+ * not be reused until it has been unregistered. A negative errno code
+ * is returned on a failure.
+ *
+ * When registered all registration and up events are replayed
+ * to the new notifier to allow device to have a race free
+ * view of the network device list.
+ */
+
+int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb)
+{
+ int err;
+
+ rtnl_lock();
+ err = __register_netdevice_notifier_net(net, nb, false);
+ rtnl_unlock();
+ return err;
+}
+EXPORT_SYMBOL(register_netdevice_notifier_net);
+
+/**
+ * unregister_netdevice_notifier_net - unregister a per-netns
+ * network notifier block
+ * @net: network namespace
+ * @nb: notifier
+ *
+ * Unregister a notifier previously registered by
+ * register_netdevice_notifier(). The notifier is unlinked into the
+ * kernel structures and may then be reused. A negative errno code
+ * is returned on a failure.
+ *
+ * After unregistering unregister and down device events are synthesized
+ * for all devices on the device list to the removed notifier to remove
+ * the need for special case cleanup code.
+ */
+
+int unregister_netdevice_notifier_net(struct net *net,
+ struct notifier_block *nb)
+{
+ int err;
+
+ rtnl_lock();
+ err = __unregister_netdevice_notifier_net(net, nb);
+ rtnl_unlock();
+ return err;
+}
+EXPORT_SYMBOL(unregister_netdevice_notifier_net);
+
+int register_netdevice_notifier_dev_net(struct net_device *dev,
+ struct notifier_block *nb,
+ struct netdev_net_notifier *nn)
+{
+ int err;
+
+ rtnl_lock();
+ err = __register_netdevice_notifier_net(dev_net(dev), nb, false);
+ if (!err) {
+ nn->nb = nb;
+ list_add(&nn->list, &dev->net_notifier_list);
+ }
+ rtnl_unlock();
+ return err;
+}
+EXPORT_SYMBOL(register_netdevice_notifier_dev_net);
+
+int unregister_netdevice_notifier_dev_net(struct net_device *dev,
+ struct notifier_block *nb,
+ struct netdev_net_notifier *nn)
+{
+ int err;
+
+ rtnl_lock();
+ list_del(&nn->list);
+ err = __unregister_netdevice_notifier_net(dev_net(dev), nb);
+ rtnl_unlock();
+ return err;
+}
+EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net);
+
+static void move_netdevice_notifiers_dev_net(struct net_device *dev,
+ struct net *net)
+{
+ struct netdev_net_notifier *nn;
+
+ list_for_each_entry(nn, &dev->net_notifier_list, list) {
+ __unregister_netdevice_notifier_net(dev_net(dev), nn->nb);
+ __register_netdevice_notifier_net(net, nn->nb, true);
+ }
+}
+
/**
* call_netdevice_notifiers_info - call all network notifier blocks
* @val: value passed unmodified to notifier function
@@ -1745,7 +1928,18 @@ EXPORT_SYMBOL(unregister_netdevice_notifier);
static int call_netdevice_notifiers_info(unsigned long val,
struct netdev_notifier_info *info)
{
+ struct net *net = dev_net(info->dev);
+ int ret;
+
ASSERT_RTNL();
+
+ /* Run per-netns notifier block chain first, then run the global one.
+ * Hopefully, one day, the global one is going to be removed after
+ * all notifier block registrators get converted to be per-netns.
+ */
+ ret = raw_notifier_call_chain(&net->netdev_chain, val, info);
+ if (ret & NOTIFY_STOP_MASK)
+ return ret;
return raw_notifier_call_chain(&netdev_chain, val, info);
}
@@ -2771,7 +2965,7 @@ static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
void netif_schedule_queue(struct netdev_queue *txq)
{
rcu_read_lock();
- if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
+ if (!netif_xmit_stopped(txq)) {
struct Qdisc *q = rcu_dereference(txq->qdisc);
__netif_schedule(q);
@@ -2939,12 +3133,9 @@ int skb_checksum_help(struct sk_buff *skb)
offset += skb->csum_offset;
BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
- if (skb_cloned(skb) &&
- !skb_clone_writable(skb, offset + sizeof(__sum16))) {
- ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
- if (ret)
- goto out;
- }
+ ret = skb_ensure_writable(skb, offset + sizeof(__sum16));
+ if (ret)
+ goto out;
*(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
out_set_summed:
@@ -2979,12 +3170,11 @@ int skb_crc32c_csum_help(struct sk_buff *skb)
ret = -EINVAL;
goto out;
}
- if (skb_cloned(skb) &&
- !skb_clone_writable(skb, offset + sizeof(__le32))) {
- ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
- if (ret)
- goto out;
- }
+
+ ret = skb_ensure_writable(skb, offset + sizeof(__le32));
+ if (ret)
+ goto out;
+
crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start,
skb->len - start, ~(__u32)0,
crc32c_csum_stub));
@@ -3109,7 +3299,7 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
segs = skb_mac_gso_segment(skb, features);
- if (unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
+ if (segs != skb && unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
skb_warn_bad_offload(skb);
return segs;
@@ -3467,18 +3657,22 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
qdisc_calculate_pkt_len(skb, q);
if (q->flags & TCQ_F_NOLOCK) {
- if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
- __qdisc_drop(skb, &to_free);
- rc = NET_XMIT_DROP;
- } else if ((q->flags & TCQ_F_CAN_BYPASS) && q->empty &&
- qdisc_run_begin(q)) {
+ if ((q->flags & TCQ_F_CAN_BYPASS) && READ_ONCE(q->empty) &&
+ qdisc_run_begin(q)) {
+ if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED,
+ &q->state))) {
+ __qdisc_drop(skb, &to_free);
+ rc = NET_XMIT_DROP;
+ goto end_run;
+ }
qdisc_bstats_cpu_update(q, skb);
+ rc = NET_XMIT_SUCCESS;
if (sch_direct_xmit(skb, q, dev, txq, NULL, true))
__qdisc_run(q);
+end_run:
qdisc_run_end(q);
- rc = NET_XMIT_SUCCESS;
} else {
rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
qdisc_run(q);
@@ -3963,6 +4157,8 @@ int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */
int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */
int dev_rx_weight __read_mostly = 64;
int dev_tx_weight __read_mostly = 64;
+/* Maximum number of GRO_NORMAL skbs to batch up for list-RX */
+int gro_normal_batch __read_mostly = 8;
/* Called with irq disabled */
static inline void ____napi_schedule(struct softnet_data *sd,
@@ -4331,14 +4527,14 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
/* Reinjected packets coming from act_mirred or similar should
* not get XDP generic processing.
*/
- if (skb_cloned(skb) || skb_is_tc_redirected(skb))
+ if (skb_is_tc_redirected(skb))
return XDP_PASS;
/* XDP packets must be linear and must have sufficient headroom
* of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
* native XDP provides, thus we need to do it here as well.
*/
- if (skb_is_nonlinear(skb) ||
+ if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
skb_headroom(skb) < XDP_PACKET_HEADROOM) {
int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
int troom = skb->tail + skb->data_len - skb->end;
@@ -4786,7 +4982,6 @@ static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
int *ret, struct net_device *orig_dev)
{
-#ifdef CONFIG_NETFILTER_INGRESS
if (nf_hook_ingress_active(skb)) {
int ingress_retval;
@@ -4800,7 +4995,6 @@ static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
rcu_read_unlock();
return ingress_retval;
}
-#endif /* CONFIG_NETFILTER_INGRESS */
return 0;
}
@@ -5345,9 +5539,29 @@ static void flush_all_backlogs(void)
put_online_cpus();
}
+/* Pass the currently batched GRO_NORMAL SKBs up to the stack. */
+static void gro_normal_list(struct napi_struct *napi)
+{
+ if (!napi->rx_count)
+ return;
+ netif_receive_skb_list_internal(&napi->rx_list);
+ INIT_LIST_HEAD(&napi->rx_list);
+ napi->rx_count = 0;
+}
+
+/* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded,
+ * pass the whole batch up to the stack.
+ */
+static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb)
+{
+ list_add_tail(&skb->list, &napi->rx_list);
+ if (++napi->rx_count >= gro_normal_batch)
+ gro_normal_list(napi);
+}
+
INDIRECT_CALLABLE_DECLARE(int inet_gro_complete(struct sk_buff *, int));
INDIRECT_CALLABLE_DECLARE(int ipv6_gro_complete(struct sk_buff *, int));
-static int napi_gro_complete(struct sk_buff *skb)
+static int napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb)
{
struct packet_offload *ptype;
__be16 type = skb->protocol;
@@ -5380,7 +5594,8 @@ static int napi_gro_complete(struct sk_buff *skb)
}
out:
- return netif_receive_skb_internal(skb);
+ gro_normal_one(napi, skb);
+ return NET_RX_SUCCESS;
}
static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index,
@@ -5393,7 +5608,7 @@ static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index,
if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
return;
skb_list_del_init(skb);
- napi_gro_complete(skb);
+ napi_gro_complete(napi, skb);
napi->gro_hash[index].count--;
}
@@ -5440,7 +5655,7 @@ static struct list_head *gro_list_prepare(struct napi_struct *napi,
diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb);
if (skb_vlan_tag_present(p))
- diffs |= p->vlan_tci ^ skb->vlan_tci;
+ diffs |= skb_vlan_tag_get(p) ^ skb_vlan_tag_get(skb);
diffs |= skb_metadata_dst_cmp(p, skb);
diffs |= skb_metadata_differs(p, skb);
if (maclen == ETH_HLEN)
@@ -5465,8 +5680,7 @@ static void skb_gro_reset_offset(struct sk_buff *skb)
NAPI_GRO_CB(skb)->frag0 = NULL;
NAPI_GRO_CB(skb)->frag0_len = 0;
- if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
- pinfo->nr_frags &&
+ if (!skb_headlen(skb) && pinfo->nr_frags &&
!PageHighMem(skb_frag_page(frag0))) {
NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
@@ -5486,7 +5700,7 @@ static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
skb->data_len -= grow;
skb->tail += grow;
- pinfo->frags[0].page_offset += grow;
+ skb_frag_off_add(&pinfo->frags[0], grow);
skb_frag_size_sub(&pinfo->frags[0], grow);
if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
@@ -5496,7 +5710,7 @@ static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
}
}
-static void gro_flush_oldest(struct list_head *head)
+static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head)
{
struct sk_buff *oldest;
@@ -5512,7 +5726,7 @@ static void gro_flush_oldest(struct list_head *head)
* SKB to the chain.
*/
skb_list_del_init(oldest);
- napi_gro_complete(oldest);
+ napi_gro_complete(napi, oldest);
}
INDIRECT_CALLABLE_DECLARE(struct sk_buff *inet_gro_receive(struct list_head *,
@@ -5578,7 +5792,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
if (&ptype->list == head)
goto normal;
- if (IS_ERR(pp) && PTR_ERR(pp) == -EINPROGRESS) {
+ if (PTR_ERR(pp) == -EINPROGRESS) {
ret = GRO_CONSUMED;
goto ok;
}
@@ -5588,7 +5802,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
if (pp) {
skb_list_del_init(pp);
- napi_gro_complete(pp);
+ napi_gro_complete(napi, pp);
napi->gro_hash[hash].count--;
}
@@ -5599,7 +5813,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
goto normal;
if (unlikely(napi->gro_hash[hash].count >= MAX_GRO_SKBS)) {
- gro_flush_oldest(gro_head);
+ gro_flush_oldest(napi, gro_head);
} else {
napi->gro_hash[hash].count++;
}
@@ -5660,16 +5874,17 @@ EXPORT_SYMBOL(gro_find_complete_by_type);
static void napi_skb_free_stolen_head(struct sk_buff *skb)
{
skb_dst_drop(skb);
- secpath_reset(skb);
+ skb_ext_put(skb);
kmem_cache_free(skbuff_head_cache, skb);
}
-static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
+static gro_result_t napi_skb_finish(struct napi_struct *napi,
+ struct sk_buff *skb,
+ gro_result_t ret)
{
switch (ret) {
case GRO_NORMAL:
- if (netif_receive_skb_internal(skb))
- ret = GRO_DROP;
+ gro_normal_one(napi, skb);
break;
case GRO_DROP:
@@ -5701,7 +5916,7 @@ gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
skb_gro_reset_offset(skb);
- ret = napi_skb_finish(dev_gro_receive(napi, skb), skb);
+ ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb));
trace_napi_gro_receive_exit(ret);
return ret;
@@ -5727,7 +5942,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
skb->encapsulation = 0;
skb_shinfo(skb)->gso_type = 0;
skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
- secpath_reset(skb);
+ skb_ext_reset(skb);
napi->skb = skb;
}
@@ -5756,8 +5971,8 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi,
case GRO_HELD:
__skb_push(skb, ETH_HLEN);
skb->protocol = eth_type_trans(skb, skb->dev);
- if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
- ret = GRO_DROP;
+ if (ret == GRO_NORMAL)
+ gro_normal_one(napi, skb);
break;
case GRO_DROP:
@@ -6049,6 +6264,9 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
hrtimer_start(&n->timer, ns_to_ktime(timeout),
HRTIMER_MODE_REL_PINNED);
}
+
+ gro_normal_list(n);
+
if (unlikely(!list_empty(&n->poll_list))) {
/* If n->poll_list is not empty, we need to mask irqs */
local_irq_save(flags);
@@ -6119,10 +6337,19 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
* Ideally, a new ndo_busy_poll_stop() could avoid another round.
*/
rc = napi->poll(napi, BUSY_POLL_BUDGET);
+ /* We can't gro_normal_list() here, because napi->poll() might have
+ * rearmed the napi (napi_complete_done()) in which case it could
+ * already be running on another CPU.
+ */
trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
netpoll_poll_unlock(have_poll_lock);
- if (rc == BUSY_POLL_BUDGET)
+ if (rc == BUSY_POLL_BUDGET) {
+ /* As the whole budget was spent, we still own the napi so can
+ * safely handle the rx_list.
+ */
+ gro_normal_list(napi);
__napi_schedule(napi);
+ }
local_bh_enable();
}
@@ -6167,6 +6394,7 @@ restart:
}
work = napi_poll(napi, BUSY_POLL_BUDGET);
trace_napi_poll(napi, work, BUSY_POLL_BUDGET);
+ gro_normal_list(napi);
count:
if (work > 0)
__NET_ADD_STATS(dev_net(napi->dev),
@@ -6272,6 +6500,8 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
napi->timer.function = napi_watchdog;
init_gro_hash(napi);
napi->skb = NULL;
+ INIT_LIST_HEAD(&napi->rx_list);
+ napi->rx_count = 0;
napi->poll = poll;
if (weight > NAPI_POLL_WEIGHT)
netdev_err_once(dev, "%s() called with weight %d\n", __func__,
@@ -6375,6 +6605,8 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
napi_gro_flush(n, HZ >= 1000);
}
+ gro_normal_list(n);
+
/* Some drivers may have called napi_schedule
* prior to exhausting their budget.
*/
@@ -6447,6 +6679,9 @@ struct netdev_adjacent {
/* upper master flag, there can only be one master device per list */
bool master;
+ /* lookup ignore flag */
+ bool ignore;
+
/* counter for the number of times this device was added to us */
u16 ref_nr;
@@ -6469,7 +6704,7 @@ static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
return NULL;
}
-static int __netdev_has_upper_dev(struct net_device *upper_dev, void *data)
+static int ____netdev_has_upper_dev(struct net_device *upper_dev, void *data)
{
struct net_device *dev = data;
@@ -6490,7 +6725,7 @@ bool netdev_has_upper_dev(struct net_device *dev,
{
ASSERT_RTNL();
- return netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
+ return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
upper_dev);
}
EXPORT_SYMBOL(netdev_has_upper_dev);
@@ -6508,7 +6743,7 @@ EXPORT_SYMBOL(netdev_has_upper_dev);
bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
struct net_device *upper_dev)
{
- return !!netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
+ return !!netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
upper_dev);
}
EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
@@ -6552,6 +6787,22 @@ struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
}
EXPORT_SYMBOL(netdev_master_upper_dev_get);
+static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev)
+{
+ struct netdev_adjacent *upper;
+
+ ASSERT_RTNL();
+
+ if (list_empty(&dev->adj_list.upper))
+ return NULL;
+
+ upper = list_first_entry(&dev->adj_list.upper,
+ struct netdev_adjacent, list);
+ if (likely(upper->master) && !upper->ignore)
+ return upper->dev;
+ return NULL;
+}
+
/**
* netdev_has_any_lower_dev - Check if device is linked to some device
* @dev: device
@@ -6602,6 +6853,23 @@ struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
}
EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
+static struct net_device *__netdev_next_upper_dev(struct net_device *dev,
+ struct list_head **iter,
+ bool *ignore)
+{
+ struct netdev_adjacent *upper;
+
+ upper = list_entry((*iter)->next, struct netdev_adjacent, list);
+
+ if (&upper->list == &dev->adj_list.upper)
+ return NULL;
+
+ *iter = &upper->list;
+ *ignore = upper->ignore;
+
+ return upper->dev;
+}
+
static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
struct list_head **iter)
{
@@ -6619,34 +6887,111 @@ static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
return upper->dev;
}
+static int __netdev_walk_all_upper_dev(struct net_device *dev,
+ int (*fn)(struct net_device *dev,
+ void *data),
+ void *data)
+{
+ struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
+ struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
+ int ret, cur = 0;
+ bool ignore;
+
+ now = dev;
+ iter = &dev->adj_list.upper;
+
+ while (1) {
+ if (now != dev) {
+ ret = fn(now, data);
+ if (ret)
+ return ret;
+ }
+
+ next = NULL;
+ while (1) {
+ udev = __netdev_next_upper_dev(now, &iter, &ignore);
+ if (!udev)
+ break;
+ if (ignore)
+ continue;
+
+ next = udev;
+ niter = &udev->adj_list.upper;
+ dev_stack[cur] = now;
+ iter_stack[cur++] = iter;
+ break;
+ }
+
+ if (!next) {
+ if (!cur)
+ return 0;
+ next = dev_stack[--cur];
+ niter = iter_stack[cur];
+ }
+
+ now = next;
+ iter = niter;
+ }
+
+ return 0;
+}
+
int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
int (*fn)(struct net_device *dev,
void *data),
void *data)
{
- struct net_device *udev;
- struct list_head *iter;
- int ret;
+ struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
+ struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
+ int ret, cur = 0;
- for (iter = &dev->adj_list.upper,
- udev = netdev_next_upper_dev_rcu(dev, &iter);
- udev;
- udev = netdev_next_upper_dev_rcu(dev, &iter)) {
- /* first is the upper device itself */
- ret = fn(udev, data);
- if (ret)
- return ret;
+ now = dev;
+ iter = &dev->adj_list.upper;
- /* then look at all of its upper devices */
- ret = netdev_walk_all_upper_dev_rcu(udev, fn, data);
- if (ret)
- return ret;
+ while (1) {
+ if (now != dev) {
+ ret = fn(now, data);
+ if (ret)
+ return ret;
+ }
+
+ next = NULL;
+ while (1) {
+ udev = netdev_next_upper_dev_rcu(now, &iter);
+ if (!udev)
+ break;
+
+ next = udev;
+ niter = &udev->adj_list.upper;
+ dev_stack[cur] = now;
+ iter_stack[cur++] = iter;
+ break;
+ }
+
+ if (!next) {
+ if (!cur)
+ return 0;
+ next = dev_stack[--cur];
+ niter = iter_stack[cur];
+ }
+
+ now = next;
+ iter = niter;
}
return 0;
}
EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
+static bool __netdev_has_upper_dev(struct net_device *dev,
+ struct net_device *upper_dev)
+{
+ ASSERT_RTNL();
+
+ return __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev,
+ upper_dev);
+}
+
/**
* netdev_lower_get_next_private - Get the next ->private from the
* lower neighbour list
@@ -6743,34 +7088,119 @@ static struct net_device *netdev_next_lower_dev(struct net_device *dev,
return lower->dev;
}
+static struct net_device *__netdev_next_lower_dev(struct net_device *dev,
+ struct list_head **iter,
+ bool *ignore)
+{
+ struct netdev_adjacent *lower;
+
+ lower = list_entry((*iter)->next, struct netdev_adjacent, list);
+
+ if (&lower->list == &dev->adj_list.lower)
+ return NULL;
+
+ *iter = &lower->list;
+ *ignore = lower->ignore;
+
+ return lower->dev;
+}
+
int netdev_walk_all_lower_dev(struct net_device *dev,
int (*fn)(struct net_device *dev,
void *data),
void *data)
{
- struct net_device *ldev;
- struct list_head *iter;
- int ret;
+ struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
+ struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
+ int ret, cur = 0;
- for (iter = &dev->adj_list.lower,
- ldev = netdev_next_lower_dev(dev, &iter);
- ldev;
- ldev = netdev_next_lower_dev(dev, &iter)) {
- /* first is the lower device itself */
- ret = fn(ldev, data);
- if (ret)
- return ret;
+ now = dev;
+ iter = &dev->adj_list.lower;
- /* then look at all of its lower devices */
- ret = netdev_walk_all_lower_dev(ldev, fn, data);
- if (ret)
- return ret;
+ while (1) {
+ if (now != dev) {
+ ret = fn(now, data);
+ if (ret)
+ return ret;
+ }
+
+ next = NULL;
+ while (1) {
+ ldev = netdev_next_lower_dev(now, &iter);
+ if (!ldev)
+ break;
+
+ next = ldev;
+ niter = &ldev->adj_list.lower;
+ dev_stack[cur] = now;
+ iter_stack[cur++] = iter;
+ break;
+ }
+
+ if (!next) {
+ if (!cur)
+ return 0;
+ next = dev_stack[--cur];
+ niter = iter_stack[cur];
+ }
+
+ now = next;
+ iter = niter;
}
return 0;
}
EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
+static int __netdev_walk_all_lower_dev(struct net_device *dev,
+ int (*fn)(struct net_device *dev,
+ void *data),
+ void *data)
+{
+ struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
+ struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
+ int ret, cur = 0;
+ bool ignore;
+
+ now = dev;
+ iter = &dev->adj_list.lower;
+
+ while (1) {
+ if (now != dev) {
+ ret = fn(now, data);
+ if (ret)
+ return ret;
+ }
+
+ next = NULL;
+ while (1) {
+ ldev = __netdev_next_lower_dev(now, &iter, &ignore);
+ if (!ldev)
+ break;
+ if (ignore)
+ continue;
+
+ next = ldev;
+ niter = &ldev->adj_list.lower;
+ dev_stack[cur] = now;
+ iter_stack[cur++] = iter;
+ break;
+ }
+
+ if (!next) {
+ if (!cur)
+ return 0;
+ next = dev_stack[--cur];
+ niter = iter_stack[cur];
+ }
+
+ now = next;
+ iter = niter;
+ }
+
+ return 0;
+}
+
static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
struct list_head **iter)
{
@@ -6785,28 +7215,99 @@ static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
return lower->dev;
}
-int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
- int (*fn)(struct net_device *dev,
- void *data),
- void *data)
+static u8 __netdev_upper_depth(struct net_device *dev)
+{
+ struct net_device *udev;
+ struct list_head *iter;
+ u8 max_depth = 0;
+ bool ignore;
+
+ for (iter = &dev->adj_list.upper,
+ udev = __netdev_next_upper_dev(dev, &iter, &ignore);
+ udev;
+ udev = __netdev_next_upper_dev(dev, &iter, &ignore)) {
+ if (ignore)
+ continue;
+ if (max_depth < udev->upper_level)
+ max_depth = udev->upper_level;
+ }
+
+ return max_depth;
+}
+
+static u8 __netdev_lower_depth(struct net_device *dev)
{
struct net_device *ldev;
struct list_head *iter;
- int ret;
+ u8 max_depth = 0;
+ bool ignore;
for (iter = &dev->adj_list.lower,
- ldev = netdev_next_lower_dev_rcu(dev, &iter);
+ ldev = __netdev_next_lower_dev(dev, &iter, &ignore);
ldev;
- ldev = netdev_next_lower_dev_rcu(dev, &iter)) {
- /* first is the lower device itself */
- ret = fn(ldev, data);
- if (ret)
- return ret;
+ ldev = __netdev_next_lower_dev(dev, &iter, &ignore)) {
+ if (ignore)
+ continue;
+ if (max_depth < ldev->lower_level)
+ max_depth = ldev->lower_level;
+ }
- /* then look at all of its lower devices */
- ret = netdev_walk_all_lower_dev_rcu(ldev, fn, data);
- if (ret)
- return ret;
+ return max_depth;
+}
+
+static int __netdev_update_upper_level(struct net_device *dev, void *data)
+{
+ dev->upper_level = __netdev_upper_depth(dev) + 1;
+ return 0;
+}
+
+static int __netdev_update_lower_level(struct net_device *dev, void *data)
+{
+ dev->lower_level = __netdev_lower_depth(dev) + 1;
+ return 0;
+}
+
+int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
+ int (*fn)(struct net_device *dev,
+ void *data),
+ void *data)
+{
+ struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
+ struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
+ int ret, cur = 0;
+
+ now = dev;
+ iter = &dev->adj_list.lower;
+
+ while (1) {
+ if (now != dev) {
+ ret = fn(now, data);
+ if (ret)
+ return ret;
+ }
+
+ next = NULL;
+ while (1) {
+ ldev = netdev_next_lower_dev_rcu(now, &iter);
+ if (!ldev)
+ break;
+
+ next = ldev;
+ niter = &ldev->adj_list.lower;
+ dev_stack[cur] = now;
+ iter_stack[cur++] = iter;
+ break;
+ }
+
+ if (!next) {
+ if (!cur)
+ return 0;
+ next = dev_stack[--cur];
+ niter = iter_stack[cur];
+ }
+
+ now = next;
+ iter = niter;
}
return 0;
@@ -6910,6 +7411,7 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev,
adj->master = master;
adj->ref_nr = 1;
adj->private = private;
+ adj->ignore = false;
dev_hold(adj_dev);
pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
@@ -7060,14 +7562,17 @@ static int __netdev_upper_dev_link(struct net_device *dev,
return -EBUSY;
/* To prevent loops, check if dev is not upper device to upper_dev. */
- if (netdev_has_upper_dev(upper_dev, dev))
+ if (__netdev_has_upper_dev(upper_dev, dev))
return -EBUSY;
+ if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV)
+ return -EMLINK;
+
if (!master) {
- if (netdev_has_upper_dev(dev, upper_dev))
+ if (__netdev_has_upper_dev(dev, upper_dev))
return -EEXIST;
} else {
- master_dev = netdev_master_upper_dev_get(dev);
+ master_dev = __netdev_master_upper_dev_get(dev);
if (master_dev)
return master_dev == upper_dev ? -EEXIST : -EBUSY;
}
@@ -7089,6 +7594,13 @@ static int __netdev_upper_dev_link(struct net_device *dev,
if (ret)
goto rollback;
+ __netdev_update_upper_level(dev, NULL);
+ __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
+
+ __netdev_update_lower_level(upper_dev, NULL);
+ __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
+ NULL);
+
return 0;
rollback:
@@ -7171,9 +7683,96 @@ void netdev_upper_dev_unlink(struct net_device *dev,
call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
&changeupper_info.info);
+
+ __netdev_update_upper_level(dev, NULL);
+ __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
+
+ __netdev_update_lower_level(upper_dev, NULL);
+ __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
+ NULL);
}
EXPORT_SYMBOL(netdev_upper_dev_unlink);
+static void __netdev_adjacent_dev_set(struct net_device *upper_dev,
+ struct net_device *lower_dev,
+ bool val)
+{
+ struct netdev_adjacent *adj;
+
+ adj = __netdev_find_adj(lower_dev, &upper_dev->adj_list.lower);
+ if (adj)
+ adj->ignore = val;
+
+ adj = __netdev_find_adj(upper_dev, &lower_dev->adj_list.upper);
+ if (adj)
+ adj->ignore = val;
+}
+
+static void netdev_adjacent_dev_disable(struct net_device *upper_dev,
+ struct net_device *lower_dev)
+{
+ __netdev_adjacent_dev_set(upper_dev, lower_dev, true);
+}
+
+static void netdev_adjacent_dev_enable(struct net_device *upper_dev,
+ struct net_device *lower_dev)
+{
+ __netdev_adjacent_dev_set(upper_dev, lower_dev, false);
+}
+
+int netdev_adjacent_change_prepare(struct net_device *old_dev,
+ struct net_device *new_dev,
+ struct net_device *dev,
+ struct netlink_ext_ack *extack)
+{
+ int err;
+
+ if (!new_dev)
+ return 0;
+
+ if (old_dev && new_dev != old_dev)
+ netdev_adjacent_dev_disable(dev, old_dev);
+
+ err = netdev_upper_dev_link(new_dev, dev, extack);
+ if (err) {
+ if (old_dev && new_dev != old_dev)
+ netdev_adjacent_dev_enable(dev, old_dev);
+ return err;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(netdev_adjacent_change_prepare);
+
+void netdev_adjacent_change_commit(struct net_device *old_dev,
+ struct net_device *new_dev,
+ struct net_device *dev)
+{
+ if (!new_dev || !old_dev)
+ return;
+
+ if (new_dev == old_dev)
+ return;
+
+ netdev_adjacent_dev_enable(dev, old_dev);
+ netdev_upper_dev_unlink(old_dev, dev);
+}
+EXPORT_SYMBOL(netdev_adjacent_change_commit);
+
+void netdev_adjacent_change_abort(struct net_device *old_dev,
+ struct net_device *new_dev,
+ struct net_device *dev)
+{
+ if (!new_dev)
+ return;
+
+ if (old_dev && new_dev != old_dev)
+ netdev_adjacent_dev_enable(dev, old_dev);
+
+ netdev_upper_dev_unlink(new_dev, dev);
+}
+EXPORT_SYMBOL(netdev_adjacent_change_abort);
+
/**
* netdev_bonding_info_change - Dispatch event about slave change
* @dev: device
@@ -7287,25 +7886,6 @@ void *netdev_lower_dev_get_private(struct net_device *dev,
EXPORT_SYMBOL(netdev_lower_dev_get_private);
-int dev_get_nest_level(struct net_device *dev)
-{
- struct net_device *lower = NULL;
- struct list_head *iter;
- int max_nest = -1;
- int nest;
-
- ASSERT_RTNL();
-
- netdev_for_each_lower_dev(dev, lower, iter) {
- nest = dev_get_nest_level(lower);
- if (max_nest < nest)
- max_nest = nest;
- }
-
- return max_nest + 1;
-}
-EXPORT_SYMBOL(dev_get_nest_level);
-
/**
* netdev_lower_change - Dispatch event about lower device state change
* @lower_dev: device
@@ -7658,11 +8238,28 @@ int __dev_set_mtu(struct net_device *dev, int new_mtu)
if (ops->ndo_change_mtu)
return ops->ndo_change_mtu(dev, new_mtu);
- dev->mtu = new_mtu;
+ /* Pairs with all the lockless reads of dev->mtu in the stack */
+ WRITE_ONCE(dev->mtu, new_mtu);
return 0;
}
EXPORT_SYMBOL(__dev_set_mtu);
+int dev_validate_mtu(struct net_device *dev, int new_mtu,
+ struct netlink_ext_ack *extack)
+{
+ /* MTU must be positive, and in range */
+ if (new_mtu < 0 || new_mtu < dev->min_mtu) {
+ NL_SET_ERR_MSG(extack, "mtu less than device minimum");
+ return -EINVAL;
+ }
+
+ if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
+ NL_SET_ERR_MSG(extack, "mtu greater than device maximum");
+ return -EINVAL;
+ }
+ return 0;
+}
+
/**
* dev_set_mtu_ext - Change maximum transfer unit
* @dev: device
@@ -7679,16 +8276,9 @@ int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
if (new_mtu == dev->mtu)
return 0;
- /* MTU must be positive, and in range */
- if (new_mtu < 0 || new_mtu < dev->min_mtu) {
- NL_SET_ERR_MSG(extack, "mtu less than device minimum");
- return -EINVAL;
- }
-
- if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
- NL_SET_ERR_MSG(extack, "mtu greater than device maximum");
- return -EINVAL;
- }
+ err = dev_validate_mtu(dev, new_mtu, extack);
+ if (err)
+ return err;
if (!netif_device_present(dev))
return -ENODEV;
@@ -8011,7 +8601,17 @@ static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op,
struct netlink_ext_ack *extack, u32 flags,
struct bpf_prog *prog)
{
+ bool non_hw = !(flags & XDP_FLAGS_HW_MODE);
+ struct bpf_prog *prev_prog = NULL;
struct netdev_bpf xdp;
+ int err;
+
+ if (non_hw) {
+ prev_prog = bpf_prog_by_id(__dev_xdp_query(dev, bpf_op,
+ XDP_QUERY_PROG));
+ if (IS_ERR(prev_prog))
+ prev_prog = NULL;
+ }
memset(&xdp, 0, sizeof(xdp));
if (flags & XDP_FLAGS_HW_MODE)
@@ -8022,7 +8622,14 @@ static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op,
xdp.flags = flags;
xdp.prog = prog;
- return bpf_op(dev, &xdp);
+ err = bpf_op(dev, &xdp);
+ if (!err && non_hw)
+ bpf_prog_change_xdp(prev_prog, prog);
+
+ if (prev_prog)
+ bpf_prog_put(prev_prog);
+
+ return err;
}
static void dev_xdp_uninstall(struct net_device *dev)
@@ -8088,12 +8695,15 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
bpf_chk = generic_xdp_install;
if (fd >= 0) {
+ u32 prog_id;
+
if (!offload && __dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG)) {
NL_SET_ERR_MSG(extack, "native and generic XDP can't be active at the same time");
return -EEXIST;
}
- if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) &&
- __dev_xdp_query(dev, bpf_op, query)) {
+
+ prog_id = __dev_xdp_query(dev, bpf_op, query);
+ if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && prog_id) {
NL_SET_ERR_MSG(extack, "XDP program already attached");
return -EBUSY;
}
@@ -8108,6 +8718,15 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
bpf_prog_put(prog);
return -EINVAL;
}
+
+ /* prog->aux->id may be 0 for orphaned device-bound progs */
+ if (prog->aux->id && prog->aux->id == prog_id) {
+ bpf_prog_put(prog);
+ return 0;
+ }
+ } else {
+ if (!__dev_xdp_query(dev, bpf_op, query))
+ return 0;
}
err = dev_xdp_install(dev, bpf_op, extack, flags, prog);
@@ -8211,6 +8830,9 @@ static void rollback_registered_many(struct list_head *head)
dev_uc_flush(dev);
dev_mc_flush(dev);
+ netdev_name_node_alt_flush(dev);
+ netdev_name_node_free(dev->name_node);
+
if (dev->netdev_ops->ndo_uninit)
dev->netdev_ops->ndo_uninit(dev);
@@ -8566,7 +9188,7 @@ static void netdev_init_one_queue(struct net_device *dev,
{
/* Initialize queue lock */
spin_lock_init(&queue->_xmit_lock);
- netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
+ lockdep_set_class(&queue->_xmit_lock, &dev->qdisc_xmit_lock_key);
queue->xmit_lock_owner = -1;
netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
queue->dev = dev;
@@ -8613,6 +9235,31 @@ void netif_tx_stop_all_queues(struct net_device *dev)
}
EXPORT_SYMBOL(netif_tx_stop_all_queues);
+static void netdev_register_lockdep_key(struct net_device *dev)
+{
+ lockdep_register_key(&dev->qdisc_tx_busylock_key);
+ lockdep_register_key(&dev->qdisc_running_key);
+ lockdep_register_key(&dev->qdisc_xmit_lock_key);
+ lockdep_register_key(&dev->addr_list_lock_key);
+}
+
+static void netdev_unregister_lockdep_key(struct net_device *dev)
+{
+ lockdep_unregister_key(&dev->qdisc_tx_busylock_key);
+ lockdep_unregister_key(&dev->qdisc_running_key);
+ lockdep_unregister_key(&dev->qdisc_xmit_lock_key);
+ lockdep_unregister_key(&dev->addr_list_lock_key);
+}
+
+void netdev_update_lockdep_key(struct net_device *dev)
+{
+ lockdep_unregister_key(&dev->addr_list_lock_key);
+ lockdep_register_key(&dev->addr_list_lock_key);
+
+ lockdep_set_class(&dev->addr_list_lock, &dev->addr_list_lock_key);
+}
+EXPORT_SYMBOL(netdev_update_lockdep_key);
+
/**
* register_netdevice - register a network device
* @dev: device to register
@@ -8647,19 +9294,24 @@ int register_netdevice(struct net_device *dev)
BUG_ON(!net);
spin_lock_init(&dev->addr_list_lock);
- netdev_set_addr_lockdep_class(dev);
+ lockdep_set_class(&dev->addr_list_lock, &dev->addr_list_lock_key);
ret = dev_get_valid_name(net, dev, dev->name);
if (ret < 0)
goto out;
+ ret = -ENOMEM;
+ dev->name_node = netdev_name_node_head_alloc(dev);
+ if (!dev->name_node)
+ goto out;
+
/* Init, if this function is available */
if (dev->netdev_ops->ndo_init) {
ret = dev->netdev_ops->ndo_init(dev);
if (ret) {
if (ret > 0)
ret = -EIO;
- goto out;
+ goto err_free_name;
}
}
@@ -8681,7 +9333,7 @@ int register_netdevice(struct net_device *dev)
/* Transfer changeable features to wanted_features and enable
* software offloads (GSO and GRO).
*/
- dev->hw_features |= NETIF_F_SOFT_FEATURES;
+ dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF);
dev->features |= NETIF_F_SOFT_FEATURES;
if (dev->netdev_ops->ndo_udp_tunnel_add) {
@@ -8726,8 +9378,10 @@ int register_netdevice(struct net_device *dev)
goto err_uninit;
ret = netdev_register_kobject(dev);
- if (ret)
+ if (ret) {
+ dev->reg_state = NETREG_UNREGISTERED;
goto err_uninit;
+ }
dev->reg_state = NETREG_REGISTERED;
__netdev_update_features(dev);
@@ -8758,6 +9412,8 @@ int register_netdevice(struct net_device *dev)
ret = notifier_to_errno(ret);
if (ret) {
rollback_registered(dev);
+ rcu_barrier();
+
dev->reg_state = NETREG_UNREGISTERED;
}
/*
@@ -8776,6 +9432,8 @@ err_uninit:
dev->netdev_ops->ndo_uninit(dev);
if (dev->priv_destructor)
dev->priv_destructor(dev);
+err_free_name:
+ netdev_name_node_free(dev->name_node);
goto out;
}
EXPORT_SYMBOL(register_netdevice);
@@ -9155,8 +9813,12 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
dev_net_set(dev, &init_net);
+ netdev_register_lockdep_key(dev);
+
dev->gso_max_size = GSO_MAX_SIZE;
dev->gso_max_segs = GSO_MAX_SEGS;
+ dev->upper_level = 1;
+ dev->lower_level = 1;
INIT_LIST_HEAD(&dev->napi_list);
INIT_LIST_HEAD(&dev->unreg_list);
@@ -9166,6 +9828,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
INIT_LIST_HEAD(&dev->adj_list.lower);
INIT_LIST_HEAD(&dev->ptype_all);
INIT_LIST_HEAD(&dev->ptype_specific);
+ INIT_LIST_HEAD(&dev->net_notifier_list);
#ifdef CONFIG_NET_SCHED
hash_init(dev->qdisc_hash);
#endif
@@ -9236,6 +9899,10 @@ void free_netdev(struct net_device *dev)
free_percpu(dev->pcpu_refcnt);
dev->pcpu_refcnt = NULL;
+ free_percpu(dev->xdp_bulkq);
+ dev->xdp_bulkq = NULL;
+
+ netdev_unregister_lockdep_key(dev);
/* Compatibility with error handling in drivers */
if (dev->reg_state == NETREG_UNINITIALIZED) {
@@ -9405,7 +10072,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
rcu_barrier();
- new_nsid = peernet2id_alloc(dev_net(dev), net);
+ new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL);
/* If there is an ifindex conflict assign a new one */
if (__dev_get_by_index(net, dev->ifindex))
new_ifindex = dev_new_index(net);
@@ -9425,6 +10092,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
netdev_adjacent_del_links(dev);
+ /* Move per-net netdevice notifiers that are following the netdevice */
+ move_netdevice_notifiers_dev_net(dev, net);
+
/* Actually switch the network namespace */
dev_net_set(dev, net);
dev->ifindex = new_ifindex;
@@ -9567,7 +10237,7 @@ static struct hlist_head * __net_init netdev_create_hash(void)
static int __net_init netdev_init(struct net *net)
{
BUILD_BUG_ON(GRO_HASH_BUCKETS >
- 8 * FIELD_SIZEOF(struct napi_struct, gro_bitmask));
+ 8 * sizeof_field(struct napi_struct, gro_bitmask));
if (net != &init_net)
INIT_LIST_HEAD(&net->dev_base_head);
@@ -9580,6 +10250,8 @@ static int __net_init netdev_init(struct net *net)
if (net->dev_index_head == NULL)
goto err_idx;
+ RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain);
+
return 0;
err_idx:
diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c
index 6393ba930097..2f949b5a1eb9 100644
--- a/net/core/dev_addr_lists.c
+++ b/net/core/dev_addr_lists.c
@@ -637,7 +637,7 @@ int dev_uc_sync(struct net_device *to, struct net_device *from)
if (to->addr_len != from->addr_len)
return -EINVAL;
- netif_addr_lock_nested(to);
+ netif_addr_lock(to);
err = __hw_addr_sync(&to->uc, &from->uc, to->addr_len);
if (!err)
__dev_set_rx_mode(to);
@@ -667,7 +667,7 @@ int dev_uc_sync_multiple(struct net_device *to, struct net_device *from)
if (to->addr_len != from->addr_len)
return -EINVAL;
- netif_addr_lock_nested(to);
+ netif_addr_lock(to);
err = __hw_addr_sync_multiple(&to->uc, &from->uc, to->addr_len);
if (!err)
__dev_set_rx_mode(to);
@@ -691,7 +691,7 @@ void dev_uc_unsync(struct net_device *to, struct net_device *from)
return;
netif_addr_lock_bh(from);
- netif_addr_lock_nested(to);
+ netif_addr_lock(to);
__hw_addr_unsync(&to->uc, &from->uc, to->addr_len);
__dev_set_rx_mode(to);
netif_addr_unlock(to);
@@ -858,7 +858,7 @@ int dev_mc_sync(struct net_device *to, struct net_device *from)
if (to->addr_len != from->addr_len)
return -EINVAL;
- netif_addr_lock_nested(to);
+ netif_addr_lock(to);
err = __hw_addr_sync(&to->mc, &from->mc, to->addr_len);
if (!err)
__dev_set_rx_mode(to);
@@ -888,7 +888,7 @@ int dev_mc_sync_multiple(struct net_device *to, struct net_device *from)
if (to->addr_len != from->addr_len)
return -EINVAL;
- netif_addr_lock_nested(to);
+ netif_addr_lock(to);
err = __hw_addr_sync_multiple(&to->mc, &from->mc, to->addr_len);
if (!err)
__dev_set_rx_mode(to);
@@ -912,7 +912,7 @@ void dev_mc_unsync(struct net_device *to, struct net_device *from)
return;
netif_addr_lock_bh(from);
- netif_addr_lock_nested(to);
+ netif_addr_lock(to);
__hw_addr_unsync(&to->mc, &from->mc, to->addr_len);
__dev_set_rx_mode(to);
netif_addr_unlock(to);
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index 5163d900bb4f..dbaebbe573f0 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -187,6 +187,7 @@ static int net_hwtstamp_validate(struct ifreq *ifr)
case HWTSTAMP_TX_OFF:
case HWTSTAMP_TX_ON:
case HWTSTAMP_TX_ONESTEP_SYNC:
+ case HWTSTAMP_TX_ONESTEP_P2P:
tx_type_valid = 1;
break;
}
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 4f40aeace902..549ee56b7a21 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -18,6 +18,8 @@
#include <linux/spinlock.h>
#include <linux/refcount.h>
#include <linux/workqueue.h>
+#include <linux/u64_stats_sync.h>
+#include <linux/timekeeping.h>
#include <rdma/ib_verbs.h>
#include <net/netlink.h>
#include <net/genetlink.h>
@@ -25,6 +27,7 @@
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/devlink.h>
+#include <net/drop_monitor.h>
#define CREATE_TRACE_POINTS
#include <trace/events/devlink.h>
@@ -92,16 +95,25 @@ static LIST_HEAD(devlink_list);
*/
static DEFINE_MUTEX(devlink_mutex);
-static struct net *devlink_net(const struct devlink *devlink)
+struct net *devlink_net(const struct devlink *devlink)
{
return read_pnet(&devlink->_net);
}
+EXPORT_SYMBOL_GPL(devlink_net);
-static void devlink_net_set(struct devlink *devlink, struct net *net)
+static void __devlink_net_set(struct devlink *devlink, struct net *net)
{
write_pnet(&devlink->_net, net);
}
+void devlink_net_set(struct devlink *devlink, struct net *net)
+{
+ if (WARN_ON(devlink->registered))
+ return;
+ __devlink_net_set(devlink, net);
+}
+EXPORT_SYMBOL_GPL(devlink_net_set);
+
static struct devlink *devlink_get_from_attrs(struct net *net,
struct nlattr **attrs)
{
@@ -133,7 +145,7 @@ static struct devlink *devlink_get_from_info(struct genl_info *info)
}
static struct devlink_port *devlink_port_get_by_index(struct devlink *devlink,
- int port_index)
+ unsigned int port_index)
{
struct devlink_port *devlink_port;
@@ -144,7 +156,8 @@ static struct devlink_port *devlink_port_get_by_index(struct devlink *devlink,
return NULL;
}
-static bool devlink_port_index_exists(struct devlink *devlink, int port_index)
+static bool devlink_port_index_exists(struct devlink *devlink,
+ unsigned int port_index)
{
return devlink_port_get_by_index(devlink, port_index);
}
@@ -342,7 +355,6 @@ struct devlink_snapshot {
struct list_head list;
struct devlink_region *region;
devlink_snapshot_data_dest_t *data_destructor;
- u64 data_len;
u8 *data;
u32 id;
};
@@ -371,14 +383,6 @@ devlink_region_snapshot_get_by_id(struct devlink_region *region, u32 id)
return NULL;
}
-static void devlink_region_snapshot_del(struct devlink_snapshot *snapshot)
-{
- snapshot->region->cur_snapshots--;
- list_del(&snapshot->list);
- (*snapshot->data_destructor)(snapshot->data);
- kfree(snapshot);
-}
-
#define DEVLINK_NL_FLAG_NEED_DEVLINK BIT(0)
#define DEVLINK_NL_FLAG_NEED_PORT BIT(1)
#define DEVLINK_NL_FLAG_NEED_SB BIT(2)
@@ -439,8 +443,16 @@ static void devlink_nl_post_doit(const struct genl_ops *ops,
{
struct devlink *devlink;
- devlink = devlink_get_from_info(info);
- if (~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK)
+ /* When devlink changes netns, it would not be found
+ * by devlink_get_from_info(). So try if it is stored first.
+ */
+ if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_DEVLINK) {
+ devlink = info->user_ptr[0];
+ } else {
+ devlink = devlink_get_from_info(info);
+ WARN_ON(IS_ERR(devlink));
+ }
+ if (!IS_ERR(devlink) && ~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK)
mutex_unlock(&devlink->lock);
mutex_unlock(&devlink_mutex);
}
@@ -476,6 +488,8 @@ static int devlink_nl_fill(struct sk_buff *msg, struct devlink *devlink,
if (devlink_nl_put_handle(msg, devlink))
goto nla_put_failure;
+ if (nla_put_u8(msg, DEVLINK_ATTR_RELOAD_FAILED, devlink->reload_failed))
+ goto nla_put_failure;
genlmsg_end(msg, hdr);
return 0;
@@ -515,32 +529,37 @@ static int devlink_nl_port_attrs_put(struct sk_buff *msg,
return 0;
if (nla_put_u16(msg, DEVLINK_ATTR_PORT_FLAVOUR, attrs->flavour))
return -EMSGSIZE;
- if (devlink_port->attrs.flavour == DEVLINK_PORT_FLAVOUR_PCI_PF) {
+ switch (devlink_port->attrs.flavour) {
+ case DEVLINK_PORT_FLAVOUR_PCI_PF:
if (nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER,
attrs->pci_pf.pf))
return -EMSGSIZE;
- } else if (devlink_port->attrs.flavour == DEVLINK_PORT_FLAVOUR_PCI_VF) {
+ break;
+ case DEVLINK_PORT_FLAVOUR_PCI_VF:
if (nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER,
attrs->pci_vf.pf) ||
nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_VF_NUMBER,
attrs->pci_vf.vf))
return -EMSGSIZE;
+ break;
+ case DEVLINK_PORT_FLAVOUR_PHYSICAL:
+ case DEVLINK_PORT_FLAVOUR_CPU:
+ case DEVLINK_PORT_FLAVOUR_DSA:
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_NUMBER,
+ attrs->phys.port_number))
+ return -EMSGSIZE;
+ if (!attrs->split)
+ return 0;
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP,
+ attrs->phys.port_number))
+ return -EMSGSIZE;
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_SUBPORT_NUMBER,
+ attrs->phys.split_subport_number))
+ return -EMSGSIZE;
+ break;
+ default:
+ break;
}
- if (devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PHYSICAL &&
- devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_CPU &&
- devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_DSA)
- return 0;
- if (nla_put_u32(msg, DEVLINK_ATTR_PORT_NUMBER,
- attrs->phys.port_number))
- return -EMSGSIZE;
- if (!attrs->split)
- return 0;
- if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP,
- attrs->phys.port_number))
- return -EMSGSIZE;
- if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_SUBPORT_NUMBER,
- attrs->phys.split_subport_number))
- return -EMSGSIZE;
return 0;
}
@@ -560,7 +579,7 @@ static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink,
if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index))
goto nla_put_failure;
- spin_lock(&devlink_port->type_lock);
+ spin_lock_bh(&devlink_port->type_lock);
if (nla_put_u16(msg, DEVLINK_ATTR_PORT_TYPE, devlink_port->type))
goto nla_put_failure_type_locked;
if (devlink_port->desired_type != DEVLINK_PORT_TYPE_NOTSET &&
@@ -585,7 +604,7 @@ static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink,
ibdev->name))
goto nla_put_failure_type_locked;
}
- spin_unlock(&devlink_port->type_lock);
+ spin_unlock_bh(&devlink_port->type_lock);
if (devlink_nl_port_attrs_put(msg, devlink_port))
goto nla_put_failure;
@@ -593,7 +612,7 @@ static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink,
return 0;
nla_put_failure_type_locked:
- spin_unlock(&devlink_port->type_lock);
+ spin_unlock_bh(&devlink_port->type_lock);
nla_put_failure:
genlmsg_cancel(msg, hdr);
return -EMSGSIZE;
@@ -1033,7 +1052,7 @@ static int devlink_nl_cmd_sb_pool_get_dumpit(struct sk_buff *msg,
struct devlink_sb *devlink_sb;
int start = cb->args[0];
int idx = 0;
- int err;
+ int err = 0;
mutex_lock(&devlink_mutex);
list_for_each_entry(devlink, &devlink_list, list) {
@@ -1056,6 +1075,9 @@ static int devlink_nl_cmd_sb_pool_get_dumpit(struct sk_buff *msg,
out:
mutex_unlock(&devlink_mutex);
+ if (err != -EMSGSIZE)
+ return err;
+
cb->args[0] = idx;
return msg->len;
}
@@ -1231,7 +1253,7 @@ static int devlink_nl_cmd_sb_port_pool_get_dumpit(struct sk_buff *msg,
struct devlink_sb *devlink_sb;
int start = cb->args[0];
int idx = 0;
- int err;
+ int err = 0;
mutex_lock(&devlink_mutex);
list_for_each_entry(devlink, &devlink_list, list) {
@@ -1254,6 +1276,9 @@ static int devlink_nl_cmd_sb_port_pool_get_dumpit(struct sk_buff *msg,
out:
mutex_unlock(&devlink_mutex);
+ if (err != -EMSGSIZE)
+ return err;
+
cb->args[0] = idx;
return msg->len;
}
@@ -1458,7 +1483,7 @@ devlink_nl_cmd_sb_tc_pool_bind_get_dumpit(struct sk_buff *msg,
struct devlink_sb *devlink_sb;
int start = cb->args[0];
int idx = 0;
- int err;
+ int err = 0;
mutex_lock(&devlink_mutex);
list_for_each_entry(devlink, &devlink_list, list) {
@@ -1483,6 +1508,9 @@ devlink_nl_cmd_sb_tc_pool_bind_get_dumpit(struct sk_buff *msg,
out:
mutex_unlock(&devlink_mutex);
+ if (err != -EMSGSIZE)
+ return err;
+
cb->args[0] = idx;
return msg->len;
}
@@ -2672,12 +2700,119 @@ devlink_resources_validate(struct devlink *devlink,
return err;
}
+static struct net *devlink_netns_get(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct nlattr *netns_pid_attr = info->attrs[DEVLINK_ATTR_NETNS_PID];
+ struct nlattr *netns_fd_attr = info->attrs[DEVLINK_ATTR_NETNS_FD];
+ struct nlattr *netns_id_attr = info->attrs[DEVLINK_ATTR_NETNS_ID];
+ struct net *net;
+
+ if (!!netns_pid_attr + !!netns_fd_attr + !!netns_id_attr > 1) {
+ NL_SET_ERR_MSG(info->extack, "multiple netns identifying attributes specified");
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (netns_pid_attr) {
+ net = get_net_ns_by_pid(nla_get_u32(netns_pid_attr));
+ } else if (netns_fd_attr) {
+ net = get_net_ns_by_fd(nla_get_u32(netns_fd_attr));
+ } else if (netns_id_attr) {
+ net = get_net_ns_by_id(sock_net(skb->sk),
+ nla_get_u32(netns_id_attr));
+ if (!net)
+ net = ERR_PTR(-EINVAL);
+ } else {
+ WARN_ON(1);
+ net = ERR_PTR(-EINVAL);
+ }
+ if (IS_ERR(net)) {
+ NL_SET_ERR_MSG(info->extack, "Unknown network namespace");
+ return ERR_PTR(-EINVAL);
+ }
+ if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) {
+ put_net(net);
+ return ERR_PTR(-EPERM);
+ }
+ return net;
+}
+
+static void devlink_param_notify(struct devlink *devlink,
+ unsigned int port_index,
+ struct devlink_param_item *param_item,
+ enum devlink_command cmd);
+
+static void devlink_reload_netns_change(struct devlink *devlink,
+ struct net *dest_net)
+{
+ struct devlink_param_item *param_item;
+
+ /* Userspace needs to be notified about devlink objects
+ * removed from original and entering new network namespace.
+ * The rest of the devlink objects are re-created during
+ * reload process so the notifications are generated separatelly.
+ */
+
+ list_for_each_entry(param_item, &devlink->param_list, list)
+ devlink_param_notify(devlink, 0, param_item,
+ DEVLINK_CMD_PARAM_DEL);
+ devlink_notify(devlink, DEVLINK_CMD_DEL);
+
+ __devlink_net_set(devlink, dest_net);
+
+ devlink_notify(devlink, DEVLINK_CMD_NEW);
+ list_for_each_entry(param_item, &devlink->param_list, list)
+ devlink_param_notify(devlink, 0, param_item,
+ DEVLINK_CMD_PARAM_NEW);
+}
+
+static bool devlink_reload_supported(struct devlink *devlink)
+{
+ return devlink->ops->reload_down && devlink->ops->reload_up;
+}
+
+static void devlink_reload_failed_set(struct devlink *devlink,
+ bool reload_failed)
+{
+ if (devlink->reload_failed == reload_failed)
+ return;
+ devlink->reload_failed = reload_failed;
+ devlink_notify(devlink, DEVLINK_CMD_NEW);
+}
+
+bool devlink_is_reload_failed(const struct devlink *devlink)
+{
+ return devlink->reload_failed;
+}
+EXPORT_SYMBOL_GPL(devlink_is_reload_failed);
+
+static int devlink_reload(struct devlink *devlink, struct net *dest_net,
+ struct netlink_ext_ack *extack)
+{
+ int err;
+
+ if (!devlink->reload_enabled)
+ return -EOPNOTSUPP;
+
+ err = devlink->ops->reload_down(devlink, !!dest_net, extack);
+ if (err)
+ return err;
+
+ if (dest_net && !net_eq(dest_net, devlink_net(devlink)))
+ devlink_reload_netns_change(devlink, dest_net);
+
+ err = devlink->ops->reload_up(devlink, extack);
+ devlink_reload_failed_set(devlink, !!err);
+ return err;
+}
+
static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info)
{
struct devlink *devlink = info->user_ptr[0];
+ struct net *dest_net = NULL;
int err;
- if (!devlink->ops->reload)
+ if (!devlink_reload_supported(devlink) || !devlink->reload_enabled)
return -EOPNOTSUPP;
err = devlink_resources_validate(devlink, NULL, info);
@@ -2685,7 +2820,21 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info)
NL_SET_ERR_MSG_MOD(info->extack, "resources size validation failed");
return err;
}
- return devlink->ops->reload(devlink, info->extack);
+
+ if (info->attrs[DEVLINK_ATTR_NETNS_PID] ||
+ info->attrs[DEVLINK_ATTR_NETNS_FD] ||
+ info->attrs[DEVLINK_ATTR_NETNS_ID]) {
+ dest_net = devlink_netns_get(skb, info);
+ if (IS_ERR(dest_net))
+ return PTR_ERR(dest_net);
+ }
+
+ err = devlink_reload(devlink, dest_net, info->extack);
+
+ if (dest_net)
+ put_net(dest_net);
+
+ return err;
}
static int devlink_nl_flash_update_fill(struct sk_buff *msg,
@@ -2852,6 +3001,16 @@ static const struct devlink_param devlink_param_generic[] = {
.name = DEVLINK_PARAM_GENERIC_FW_LOAD_POLICY_NAME,
.type = DEVLINK_PARAM_GENERIC_FW_LOAD_POLICY_TYPE,
},
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_RESET_DEV_ON_DRV_PROBE,
+ .name = DEVLINK_PARAM_GENERIC_RESET_DEV_ON_DRV_PROBE_NAME,
+ .type = DEVLINK_PARAM_GENERIC_RESET_DEV_ON_DRV_PROBE_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE,
+ .name = DEVLINK_PARAM_GENERIC_ENABLE_ROCE_NAME,
+ .type = DEVLINK_PARAM_GENERIC_ENABLE_ROCE_TYPE,
+ },
};
static int devlink_param_generic_verify(const struct devlink_param *param)
@@ -3123,7 +3282,7 @@ static int devlink_nl_cmd_param_get_dumpit(struct sk_buff *msg,
struct devlink *devlink;
int start = cb->args[0];
int idx = 0;
- int err;
+ int err = 0;
mutex_lock(&devlink_mutex);
list_for_each_entry(devlink, &devlink_list, list) {
@@ -3140,7 +3299,7 @@ static int devlink_nl_cmd_param_get_dumpit(struct sk_buff *msg,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
NLM_F_MULTI);
- if (err) {
+ if (err && err != -EOPNOTSUPP) {
mutex_unlock(&devlink->lock);
goto out;
}
@@ -3151,6 +3310,9 @@ static int devlink_nl_cmd_param_get_dumpit(struct sk_buff *msg,
out:
mutex_unlock(&devlink_mutex);
+ if (err != -EMSGSIZE)
+ return err;
+
cb->args[0] = idx;
return msg->len;
}
@@ -3379,7 +3541,7 @@ static int devlink_nl_cmd_port_param_get_dumpit(struct sk_buff *msg,
struct devlink *devlink;
int start = cb->args[0];
int idx = 0;
- int err;
+ int err = 0;
mutex_lock(&devlink_mutex);
list_for_each_entry(devlink, &devlink_list, list) {
@@ -3400,7 +3562,7 @@ static int devlink_nl_cmd_port_param_get_dumpit(struct sk_buff *msg,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
NLM_F_MULTI);
- if (err) {
+ if (err && err != -EOPNOTSUPP) {
mutex_unlock(&devlink->lock);
goto out;
}
@@ -3412,6 +3574,9 @@ static int devlink_nl_cmd_port_param_get_dumpit(struct sk_buff *msg,
out:
mutex_unlock(&devlink_mutex);
+ if (err != -EMSGSIZE)
+ return err;
+
cb->args[0] = idx;
return msg->len;
}
@@ -3596,6 +3761,16 @@ out_free_msg:
nlmsg_free(msg);
}
+static void devlink_region_snapshot_del(struct devlink_region *region,
+ struct devlink_snapshot *snapshot)
+{
+ devlink_nl_region_notify(region, snapshot, DEVLINK_CMD_REGION_DEL);
+ region->cur_snapshots--;
+ list_del(&snapshot->list);
+ (*snapshot->data_destructor)(snapshot->data);
+ kfree(snapshot);
+}
+
static int devlink_nl_cmd_region_get_doit(struct sk_buff *skb,
struct genl_info *info)
{
@@ -3691,8 +3866,7 @@ static int devlink_nl_cmd_region_del(struct sk_buff *skb,
if (!snapshot)
return -EINVAL;
- devlink_nl_region_notify(region, snapshot, DEVLINK_CMD_REGION_DEL);
- devlink_region_snapshot_del(snapshot);
+ devlink_region_snapshot_del(region, snapshot);
return 0;
}
@@ -3748,8 +3922,8 @@ static int devlink_nl_region_read_snapshot_fill(struct sk_buff *skb,
if (!snapshot)
return -EINVAL;
- if (end_offset > snapshot->data_len || dump)
- end_offset = snapshot->data_len;
+ if (end_offset > region->size || dump)
+ end_offset = region->size;
while (curr_offset < end_offset) {
u32 data_size;
@@ -3777,29 +3951,19 @@ static int devlink_nl_region_read_snapshot_fill(struct sk_buff *skb,
static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb,
struct netlink_callback *cb)
{
+ const struct genl_dumpit_info *info = genl_dumpit_info(cb);
u64 ret_offset, start_offset, end_offset = 0;
+ struct nlattr **attrs = info->attrs;
struct devlink_region *region;
struct nlattr *chunks_attr;
const char *region_name;
struct devlink *devlink;
- struct nlattr **attrs;
bool dump = true;
void *hdr;
int err;
start_offset = *((u64 *)&cb->args[0]);
- attrs = kmalloc_array(DEVLINK_ATTR_MAX + 1, sizeof(*attrs), GFP_KERNEL);
- if (!attrs)
- return -ENOMEM;
-
- err = nlmsg_parse_deprecated(cb->nlh,
- GENL_HDRLEN + devlink_nl_family.hdrsize,
- attrs, DEVLINK_ATTR_MAX,
- devlink_nl_family.policy, cb->extack);
- if (err)
- goto out_free;
-
mutex_lock(&devlink_mutex);
devlink = devlink_get_from_attrs(sock_net(cb->skb->sk), attrs);
if (IS_ERR(devlink)) {
@@ -3822,6 +3986,12 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb,
goto out_unlock;
}
+ /* return 0 if there is no further data to read */
+ if (start_offset >= region->size) {
+ err = 0;
+ goto out_unlock;
+ }
+
hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
&devlink_nl_family, NLM_F_ACK | NLM_F_MULTI,
DEVLINK_CMD_REGION_READ);
@@ -3876,7 +4046,6 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb,
genlmsg_end(skb, hdr);
mutex_unlock(&devlink->lock);
mutex_unlock(&devlink_mutex);
- kfree(attrs);
return skb->len;
@@ -3886,8 +4055,6 @@ out_unlock:
mutex_unlock(&devlink->lock);
out_dev:
mutex_unlock(&devlink_mutex);
-out_free:
- kfree(attrs);
return err;
}
@@ -4025,7 +4192,7 @@ static int devlink_nl_cmd_info_get_dumpit(struct sk_buff *msg,
struct devlink *devlink;
int start = cb->args[0];
int idx = 0;
- int err;
+ int err = 0;
mutex_lock(&devlink_mutex);
list_for_each_entry(devlink, &devlink_list, list) {
@@ -4047,12 +4214,15 @@ static int devlink_nl_cmd_info_get_dumpit(struct sk_buff *msg,
cb->nlh->nlmsg_seq, NLM_F_MULTI,
cb->extack);
mutex_unlock(&devlink->lock);
- if (err)
+ if (err && err != -EOPNOTSUPP)
break;
idx++;
}
mutex_unlock(&devlink_mutex);
+ if (err != -EMSGSIZE)
+ return err;
+
cb->args[0] = idx;
return msg->len;
}
@@ -4255,12 +4425,11 @@ int devlink_fmsg_string_put(struct devlink_fmsg *fmsg, const char *value)
}
EXPORT_SYMBOL_GPL(devlink_fmsg_string_put);
-int devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value,
- u16 value_len)
+static int devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value,
+ u16 value_len)
{
return devlink_fmsg_put_value(fmsg, value, value_len, NLA_BINARY);
}
-EXPORT_SYMBOL_GPL(devlink_fmsg_binary_put);
int devlink_fmsg_bool_pair_put(struct devlink_fmsg *fmsg, const char *name,
bool value)
@@ -4368,19 +4537,26 @@ int devlink_fmsg_string_pair_put(struct devlink_fmsg *fmsg, const char *name,
EXPORT_SYMBOL_GPL(devlink_fmsg_string_pair_put);
int devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name,
- const void *value, u16 value_len)
+ const void *value, u32 value_len)
{
+ u32 data_size;
+ u32 offset;
int err;
- err = devlink_fmsg_pair_nest_start(fmsg, name);
+ err = devlink_fmsg_arr_pair_nest_start(fmsg, name);
if (err)
return err;
- err = devlink_fmsg_binary_put(fmsg, value, value_len);
- if (err)
- return err;
+ for (offset = 0; offset < value_len; offset += data_size) {
+ data_size = value_len - offset;
+ if (data_size > DEVLINK_FMSG_MAX_SIZE)
+ data_size = DEVLINK_FMSG_MAX_SIZE;
+ err = devlink_fmsg_binary_put(fmsg, value + offset, data_size);
+ if (err)
+ return err;
+ }
- err = devlink_fmsg_pair_nest_end(fmsg);
+ err = devlink_fmsg_arr_pair_nest_end(fmsg);
if (err)
return err;
@@ -4577,6 +4753,7 @@ struct devlink_health_reporter {
bool auto_recover;
u8 health_state;
u64 dump_ts;
+ u64 dump_real_ts;
u64 error_count;
u64 recovery_count;
u64 last_recovery_ts;
@@ -4672,39 +4849,120 @@ devlink_health_reporter_destroy(struct devlink_health_reporter *reporter)
}
EXPORT_SYMBOL_GPL(devlink_health_reporter_destroy);
-void
-devlink_health_reporter_state_update(struct devlink_health_reporter *reporter,
- enum devlink_health_reporter_state state)
+static int
+devlink_nl_health_reporter_fill(struct sk_buff *msg,
+ struct devlink *devlink,
+ struct devlink_health_reporter *reporter,
+ enum devlink_command cmd, u32 portid,
+ u32 seq, int flags)
{
- if (WARN_ON(state != DEVLINK_HEALTH_REPORTER_STATE_HEALTHY &&
- state != DEVLINK_HEALTH_REPORTER_STATE_ERROR))
+ struct nlattr *reporter_attr;
+ void *hdr;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto genlmsg_cancel;
+
+ reporter_attr = nla_nest_start_noflag(msg,
+ DEVLINK_ATTR_HEALTH_REPORTER);
+ if (!reporter_attr)
+ goto genlmsg_cancel;
+ if (nla_put_string(msg, DEVLINK_ATTR_HEALTH_REPORTER_NAME,
+ reporter->ops->name))
+ goto reporter_nest_cancel;
+ if (nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_STATE,
+ reporter->health_state))
+ goto reporter_nest_cancel;
+ if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_ERR_COUNT,
+ reporter->error_count, DEVLINK_ATTR_PAD))
+ goto reporter_nest_cancel;
+ if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_RECOVER_COUNT,
+ reporter->recovery_count, DEVLINK_ATTR_PAD))
+ goto reporter_nest_cancel;
+ if (reporter->ops->recover &&
+ nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD,
+ reporter->graceful_period,
+ DEVLINK_ATTR_PAD))
+ goto reporter_nest_cancel;
+ if (reporter->ops->recover &&
+ nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER,
+ reporter->auto_recover))
+ goto reporter_nest_cancel;
+ if (reporter->dump_fmsg &&
+ nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS,
+ jiffies_to_msecs(reporter->dump_ts),
+ DEVLINK_ATTR_PAD))
+ goto reporter_nest_cancel;
+ if (reporter->dump_fmsg &&
+ nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS_NS,
+ reporter->dump_real_ts, DEVLINK_ATTR_PAD))
+ goto reporter_nest_cancel;
+
+ nla_nest_end(msg, reporter_attr);
+ genlmsg_end(msg, hdr);
+ return 0;
+
+reporter_nest_cancel:
+ nla_nest_end(msg, reporter_attr);
+genlmsg_cancel:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static void devlink_recover_notify(struct devlink_health_reporter *reporter,
+ enum devlink_command cmd)
+{
+ struct sk_buff *msg;
+ int err;
+
+ WARN_ON(cmd != DEVLINK_CMD_HEALTH_REPORTER_RECOVER);
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
return;
- if (reporter->health_state == state)
+ err = devlink_nl_health_reporter_fill(msg, reporter->devlink,
+ reporter, cmd, 0, 0, 0);
+ if (err) {
+ nlmsg_free(msg);
return;
+ }
- reporter->health_state = state;
- trace_devlink_health_reporter_state_update(reporter->devlink,
- reporter->ops->name, state);
+ genlmsg_multicast_netns(&devlink_nl_family,
+ devlink_net(reporter->devlink),
+ msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
}
-EXPORT_SYMBOL_GPL(devlink_health_reporter_state_update);
+
+void
+devlink_health_reporter_recovery_done(struct devlink_health_reporter *reporter)
+{
+ reporter->recovery_count++;
+ reporter->last_recovery_ts = jiffies;
+}
+EXPORT_SYMBOL_GPL(devlink_health_reporter_recovery_done);
static int
devlink_health_reporter_recover(struct devlink_health_reporter *reporter,
- void *priv_ctx)
+ void *priv_ctx, struct netlink_ext_ack *extack)
{
int err;
+ if (reporter->health_state == DEVLINK_HEALTH_REPORTER_STATE_HEALTHY)
+ return 0;
+
if (!reporter->ops->recover)
return -EOPNOTSUPP;
- err = reporter->ops->recover(reporter, priv_ctx);
+ err = reporter->ops->recover(reporter, priv_ctx, extack);
if (err)
return err;
- reporter->recovery_count++;
+ devlink_health_reporter_recovery_done(reporter);
reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_HEALTHY;
- reporter->last_recovery_ts = jiffies;
+ devlink_recover_notify(reporter, DEVLINK_CMD_HEALTH_REPORTER_RECOVER);
return 0;
}
@@ -4719,7 +4977,8 @@ devlink_health_dump_clear(struct devlink_health_reporter *reporter)
}
static int devlink_health_do_dump(struct devlink_health_reporter *reporter,
- void *priv_ctx)
+ void *priv_ctx,
+ struct netlink_ext_ack *extack)
{
int err;
@@ -4740,7 +4999,7 @@ static int devlink_health_do_dump(struct devlink_health_reporter *reporter,
goto dump_err;
err = reporter->ops->dump(reporter, reporter->dump_fmsg,
- priv_ctx);
+ priv_ctx, extack);
if (err)
goto dump_err;
@@ -4749,6 +5008,7 @@ static int devlink_health_do_dump(struct devlink_health_reporter *reporter,
goto dump_err;
reporter->dump_ts = jiffies;
+ reporter->dump_real_ts = ktime_get_real_ns();
return 0;
@@ -4769,6 +5029,7 @@ int devlink_health_report(struct devlink_health_reporter *reporter,
reporter->error_count++;
prev_health_state = reporter->health_state;
reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_ERROR;
+ devlink_recover_notify(reporter, DEVLINK_CMD_HEALTH_REPORTER_RECOVER);
/* abort if the previous error wasn't recovered */
if (reporter->auto_recover &&
@@ -4787,11 +5048,12 @@ int devlink_health_report(struct devlink_health_reporter *reporter,
mutex_lock(&reporter->dump_lock);
/* store current dump of current error, for later analysis */
- devlink_health_do_dump(reporter, priv_ctx);
+ devlink_health_do_dump(reporter, priv_ctx, NULL);
mutex_unlock(&reporter->dump_lock);
if (reporter->auto_recover)
- return devlink_health_reporter_recover(reporter, priv_ctx);
+ return devlink_health_reporter_recover(reporter,
+ priv_ctx, NULL);
return 0;
}
@@ -4826,21 +5088,10 @@ devlink_health_reporter_get_from_info(struct devlink *devlink,
static struct devlink_health_reporter *
devlink_health_reporter_get_from_cb(struct netlink_callback *cb)
{
+ const struct genl_dumpit_info *info = genl_dumpit_info(cb);
struct devlink_health_reporter *reporter;
+ struct nlattr **attrs = info->attrs;
struct devlink *devlink;
- struct nlattr **attrs;
- int err;
-
- attrs = kmalloc_array(DEVLINK_ATTR_MAX + 1, sizeof(*attrs), GFP_KERNEL);
- if (!attrs)
- return NULL;
-
- err = nlmsg_parse_deprecated(cb->nlh,
- GENL_HDRLEN + devlink_nl_family.hdrsize,
- attrs, DEVLINK_ATTR_MAX,
- devlink_nl_family.policy, cb->extack);
- if (err)
- goto free;
mutex_lock(&devlink_mutex);
devlink = devlink_get_from_attrs(sock_net(cb->skb->sk), attrs);
@@ -4849,12 +5100,9 @@ devlink_health_reporter_get_from_cb(struct netlink_callback *cb)
reporter = devlink_health_reporter_get_from_attrs(devlink, attrs);
mutex_unlock(&devlink_mutex);
- kfree(attrs);
return reporter;
unlock:
mutex_unlock(&devlink_mutex);
-free:
- kfree(attrs);
return NULL;
}
@@ -4864,64 +5112,23 @@ devlink_health_reporter_put(struct devlink_health_reporter *reporter)
refcount_dec(&reporter->refcount);
}
-static int
-devlink_nl_health_reporter_fill(struct sk_buff *msg,
- struct devlink *devlink,
- struct devlink_health_reporter *reporter,
- enum devlink_command cmd, u32 portid,
- u32 seq, int flags)
+void
+devlink_health_reporter_state_update(struct devlink_health_reporter *reporter,
+ enum devlink_health_reporter_state state)
{
- struct nlattr *reporter_attr;
- void *hdr;
-
- hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
- if (!hdr)
- return -EMSGSIZE;
-
- if (devlink_nl_put_handle(msg, devlink))
- goto genlmsg_cancel;
-
- reporter_attr = nla_nest_start_noflag(msg,
- DEVLINK_ATTR_HEALTH_REPORTER);
- if (!reporter_attr)
- goto genlmsg_cancel;
- if (nla_put_string(msg, DEVLINK_ATTR_HEALTH_REPORTER_NAME,
- reporter->ops->name))
- goto reporter_nest_cancel;
- if (nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_STATE,
- reporter->health_state))
- goto reporter_nest_cancel;
- if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_ERR_COUNT,
- reporter->error_count, DEVLINK_ATTR_PAD))
- goto reporter_nest_cancel;
- if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_RECOVER_COUNT,
- reporter->recovery_count, DEVLINK_ATTR_PAD))
- goto reporter_nest_cancel;
- if (reporter->ops->recover &&
- nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD,
- reporter->graceful_period,
- DEVLINK_ATTR_PAD))
- goto reporter_nest_cancel;
- if (reporter->ops->recover &&
- nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER,
- reporter->auto_recover))
- goto reporter_nest_cancel;
- if (reporter->dump_fmsg &&
- nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS,
- jiffies_to_msecs(reporter->dump_ts),
- DEVLINK_ATTR_PAD))
- goto reporter_nest_cancel;
+ if (WARN_ON(state != DEVLINK_HEALTH_REPORTER_STATE_HEALTHY &&
+ state != DEVLINK_HEALTH_REPORTER_STATE_ERROR))
+ return;
- nla_nest_end(msg, reporter_attr);
- genlmsg_end(msg, hdr);
- return 0;
+ if (reporter->health_state == state)
+ return;
-reporter_nest_cancel:
- nla_nest_end(msg, reporter_attr);
-genlmsg_cancel:
- genlmsg_cancel(msg, hdr);
- return -EMSGSIZE;
+ reporter->health_state = state;
+ trace_devlink_health_reporter_state_update(reporter->devlink,
+ reporter->ops->name, state);
+ devlink_recover_notify(reporter, DEVLINK_CMD_HEALTH_REPORTER_RECOVER);
}
+EXPORT_SYMBOL_GPL(devlink_health_reporter_state_update);
static int devlink_nl_cmd_health_reporter_get_doit(struct sk_buff *skb,
struct genl_info *info)
@@ -5043,7 +5250,7 @@ static int devlink_nl_cmd_health_reporter_recover_doit(struct sk_buff *skb,
if (!reporter)
return -EINVAL;
- err = devlink_health_reporter_recover(reporter, NULL);
+ err = devlink_health_reporter_recover(reporter, NULL, info->extack);
devlink_health_reporter_put(reporter);
return err;
@@ -5076,7 +5283,7 @@ static int devlink_nl_cmd_health_reporter_diagnose_doit(struct sk_buff *skb,
if (err)
goto out;
- err = reporter->ops->diagnose(reporter, fmsg);
+ err = reporter->ops->diagnose(reporter, fmsg, info->extack);
if (err)
goto out;
@@ -5111,7 +5318,7 @@ devlink_nl_cmd_health_reporter_dump_get_dumpit(struct sk_buff *skb,
}
mutex_lock(&reporter->dump_lock);
if (!start) {
- err = devlink_health_do_dump(reporter, NULL);
+ err = devlink_health_do_dump(reporter, NULL, cb->extack);
if (err)
goto unlock;
cb->args[1] = reporter->dump_ts;
@@ -5154,6 +5361,571 @@ devlink_nl_cmd_health_reporter_dump_clear_doit(struct sk_buff *skb,
return 0;
}
+struct devlink_stats {
+ u64 rx_bytes;
+ u64 rx_packets;
+ struct u64_stats_sync syncp;
+};
+
+/**
+ * struct devlink_trap_group_item - Packet trap group attributes.
+ * @group: Immutable packet trap group attributes.
+ * @refcount: Number of trap items using the group.
+ * @list: trap_group_list member.
+ * @stats: Trap group statistics.
+ *
+ * Describes packet trap group attributes. Created by devlink during trap
+ * registration.
+ */
+struct devlink_trap_group_item {
+ const struct devlink_trap_group *group;
+ refcount_t refcount;
+ struct list_head list;
+ struct devlink_stats __percpu *stats;
+};
+
+/**
+ * struct devlink_trap_item - Packet trap attributes.
+ * @trap: Immutable packet trap attributes.
+ * @group_item: Associated group item.
+ * @list: trap_list member.
+ * @action: Trap action.
+ * @stats: Trap statistics.
+ * @priv: Driver private information.
+ *
+ * Describes both mutable and immutable packet trap attributes. Created by
+ * devlink during trap registration and used for all trap related operations.
+ */
+struct devlink_trap_item {
+ const struct devlink_trap *trap;
+ struct devlink_trap_group_item *group_item;
+ struct list_head list;
+ enum devlink_trap_action action;
+ struct devlink_stats __percpu *stats;
+ void *priv;
+};
+
+static struct devlink_trap_item *
+devlink_trap_item_lookup(struct devlink *devlink, const char *name)
+{
+ struct devlink_trap_item *trap_item;
+
+ list_for_each_entry(trap_item, &devlink->trap_list, list) {
+ if (!strcmp(trap_item->trap->name, name))
+ return trap_item;
+ }
+
+ return NULL;
+}
+
+static struct devlink_trap_item *
+devlink_trap_item_get_from_info(struct devlink *devlink,
+ struct genl_info *info)
+{
+ struct nlattr *attr;
+
+ if (!info->attrs[DEVLINK_ATTR_TRAP_NAME])
+ return NULL;
+ attr = info->attrs[DEVLINK_ATTR_TRAP_NAME];
+
+ return devlink_trap_item_lookup(devlink, nla_data(attr));
+}
+
+static int
+devlink_trap_action_get_from_info(struct genl_info *info,
+ enum devlink_trap_action *p_trap_action)
+{
+ u8 val;
+
+ val = nla_get_u8(info->attrs[DEVLINK_ATTR_TRAP_ACTION]);
+ switch (val) {
+ case DEVLINK_TRAP_ACTION_DROP: /* fall-through */
+ case DEVLINK_TRAP_ACTION_TRAP:
+ *p_trap_action = val;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int devlink_trap_metadata_put(struct sk_buff *msg,
+ const struct devlink_trap *trap)
+{
+ struct nlattr *attr;
+
+ attr = nla_nest_start(msg, DEVLINK_ATTR_TRAP_METADATA);
+ if (!attr)
+ return -EMSGSIZE;
+
+ if ((trap->metadata_cap & DEVLINK_TRAP_METADATA_TYPE_F_IN_PORT) &&
+ nla_put_flag(msg, DEVLINK_ATTR_TRAP_METADATA_TYPE_IN_PORT))
+ goto nla_put_failure;
+
+ nla_nest_end(msg, attr);
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(msg, attr);
+ return -EMSGSIZE;
+}
+
+static void devlink_trap_stats_read(struct devlink_stats __percpu *trap_stats,
+ struct devlink_stats *stats)
+{
+ int i;
+
+ memset(stats, 0, sizeof(*stats));
+ for_each_possible_cpu(i) {
+ struct devlink_stats *cpu_stats;
+ u64 rx_packets, rx_bytes;
+ unsigned int start;
+
+ cpu_stats = per_cpu_ptr(trap_stats, i);
+ do {
+ start = u64_stats_fetch_begin_irq(&cpu_stats->syncp);
+ rx_packets = cpu_stats->rx_packets;
+ rx_bytes = cpu_stats->rx_bytes;
+ } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start));
+
+ stats->rx_packets += rx_packets;
+ stats->rx_bytes += rx_bytes;
+ }
+}
+
+static int devlink_trap_stats_put(struct sk_buff *msg,
+ struct devlink_stats __percpu *trap_stats)
+{
+ struct devlink_stats stats;
+ struct nlattr *attr;
+
+ devlink_trap_stats_read(trap_stats, &stats);
+
+ attr = nla_nest_start(msg, DEVLINK_ATTR_STATS);
+ if (!attr)
+ return -EMSGSIZE;
+
+ if (nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_PACKETS,
+ stats.rx_packets, DEVLINK_ATTR_PAD))
+ goto nla_put_failure;
+
+ if (nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_BYTES,
+ stats.rx_bytes, DEVLINK_ATTR_PAD))
+ goto nla_put_failure;
+
+ nla_nest_end(msg, attr);
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(msg, attr);
+ return -EMSGSIZE;
+}
+
+static int devlink_nl_trap_fill(struct sk_buff *msg, struct devlink *devlink,
+ const struct devlink_trap_item *trap_item,
+ enum devlink_command cmd, u32 portid, u32 seq,
+ int flags)
+{
+ struct devlink_trap_group_item *group_item = trap_item->group_item;
+ void *hdr;
+ int err;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+
+ if (nla_put_string(msg, DEVLINK_ATTR_TRAP_GROUP_NAME,
+ group_item->group->name))
+ goto nla_put_failure;
+
+ if (nla_put_string(msg, DEVLINK_ATTR_TRAP_NAME, trap_item->trap->name))
+ goto nla_put_failure;
+
+ if (nla_put_u8(msg, DEVLINK_ATTR_TRAP_TYPE, trap_item->trap->type))
+ goto nla_put_failure;
+
+ if (trap_item->trap->generic &&
+ nla_put_flag(msg, DEVLINK_ATTR_TRAP_GENERIC))
+ goto nla_put_failure;
+
+ if (nla_put_u8(msg, DEVLINK_ATTR_TRAP_ACTION, trap_item->action))
+ goto nla_put_failure;
+
+ err = devlink_trap_metadata_put(msg, trap_item->trap);
+ if (err)
+ goto nla_put_failure;
+
+ err = devlink_trap_stats_put(msg, trap_item->stats);
+ if (err)
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static int devlink_nl_cmd_trap_get_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct netlink_ext_ack *extack = info->extack;
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_trap_item *trap_item;
+ struct sk_buff *msg;
+ int err;
+
+ if (list_empty(&devlink->trap_list))
+ return -EOPNOTSUPP;
+
+ trap_item = devlink_trap_item_get_from_info(devlink, info);
+ if (!trap_item) {
+ NL_SET_ERR_MSG_MOD(extack, "Device did not register this trap");
+ return -ENOENT;
+ }
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_trap_fill(msg, devlink, trap_item,
+ DEVLINK_CMD_TRAP_NEW, info->snd_portid,
+ info->snd_seq, 0);
+ if (err)
+ goto err_trap_fill;
+
+ return genlmsg_reply(msg, info);
+
+err_trap_fill:
+ nlmsg_free(msg);
+ return err;
+}
+
+static int devlink_nl_cmd_trap_get_dumpit(struct sk_buff *msg,
+ struct netlink_callback *cb)
+{
+ struct devlink_trap_item *trap_item;
+ struct devlink *devlink;
+ int start = cb->args[0];
+ int idx = 0;
+ int err;
+
+ mutex_lock(&devlink_mutex);
+ list_for_each_entry(devlink, &devlink_list, list) {
+ if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+ continue;
+ mutex_lock(&devlink->lock);
+ list_for_each_entry(trap_item, &devlink->trap_list, list) {
+ if (idx < start) {
+ idx++;
+ continue;
+ }
+ err = devlink_nl_trap_fill(msg, devlink, trap_item,
+ DEVLINK_CMD_TRAP_NEW,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NLM_F_MULTI);
+ if (err) {
+ mutex_unlock(&devlink->lock);
+ goto out;
+ }
+ idx++;
+ }
+ mutex_unlock(&devlink->lock);
+ }
+out:
+ mutex_unlock(&devlink_mutex);
+
+ cb->args[0] = idx;
+ return msg->len;
+}
+
+static int __devlink_trap_action_set(struct devlink *devlink,
+ struct devlink_trap_item *trap_item,
+ enum devlink_trap_action trap_action,
+ struct netlink_ext_ack *extack)
+{
+ int err;
+
+ if (trap_item->action != trap_action &&
+ trap_item->trap->type != DEVLINK_TRAP_TYPE_DROP) {
+ NL_SET_ERR_MSG_MOD(extack, "Cannot change action of non-drop traps. Skipping");
+ return 0;
+ }
+
+ err = devlink->ops->trap_action_set(devlink, trap_item->trap,
+ trap_action);
+ if (err)
+ return err;
+
+ trap_item->action = trap_action;
+
+ return 0;
+}
+
+static int devlink_trap_action_set(struct devlink *devlink,
+ struct devlink_trap_item *trap_item,
+ struct genl_info *info)
+{
+ enum devlink_trap_action trap_action;
+ int err;
+
+ if (!info->attrs[DEVLINK_ATTR_TRAP_ACTION])
+ return 0;
+
+ err = devlink_trap_action_get_from_info(info, &trap_action);
+ if (err) {
+ NL_SET_ERR_MSG_MOD(info->extack, "Invalid trap action");
+ return -EINVAL;
+ }
+
+ return __devlink_trap_action_set(devlink, trap_item, trap_action,
+ info->extack);
+}
+
+static int devlink_nl_cmd_trap_set_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct netlink_ext_ack *extack = info->extack;
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_trap_item *trap_item;
+ int err;
+
+ if (list_empty(&devlink->trap_list))
+ return -EOPNOTSUPP;
+
+ trap_item = devlink_trap_item_get_from_info(devlink, info);
+ if (!trap_item) {
+ NL_SET_ERR_MSG_MOD(extack, "Device did not register this trap");
+ return -ENOENT;
+ }
+
+ err = devlink_trap_action_set(devlink, trap_item, info);
+ if (err)
+ return err;
+
+ return 0;
+}
+
+static struct devlink_trap_group_item *
+devlink_trap_group_item_lookup(struct devlink *devlink, const char *name)
+{
+ struct devlink_trap_group_item *group_item;
+
+ list_for_each_entry(group_item, &devlink->trap_group_list, list) {
+ if (!strcmp(group_item->group->name, name))
+ return group_item;
+ }
+
+ return NULL;
+}
+
+static struct devlink_trap_group_item *
+devlink_trap_group_item_get_from_info(struct devlink *devlink,
+ struct genl_info *info)
+{
+ char *name;
+
+ if (!info->attrs[DEVLINK_ATTR_TRAP_GROUP_NAME])
+ return NULL;
+ name = nla_data(info->attrs[DEVLINK_ATTR_TRAP_GROUP_NAME]);
+
+ return devlink_trap_group_item_lookup(devlink, name);
+}
+
+static int
+devlink_nl_trap_group_fill(struct sk_buff *msg, struct devlink *devlink,
+ const struct devlink_trap_group_item *group_item,
+ enum devlink_command cmd, u32 portid, u32 seq,
+ int flags)
+{
+ void *hdr;
+ int err;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+
+ if (nla_put_string(msg, DEVLINK_ATTR_TRAP_GROUP_NAME,
+ group_item->group->name))
+ goto nla_put_failure;
+
+ if (group_item->group->generic &&
+ nla_put_flag(msg, DEVLINK_ATTR_TRAP_GENERIC))
+ goto nla_put_failure;
+
+ err = devlink_trap_stats_put(msg, group_item->stats);
+ if (err)
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static int devlink_nl_cmd_trap_group_get_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct netlink_ext_ack *extack = info->extack;
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_trap_group_item *group_item;
+ struct sk_buff *msg;
+ int err;
+
+ if (list_empty(&devlink->trap_group_list))
+ return -EOPNOTSUPP;
+
+ group_item = devlink_trap_group_item_get_from_info(devlink, info);
+ if (!group_item) {
+ NL_SET_ERR_MSG_MOD(extack, "Device did not register this trap group");
+ return -ENOENT;
+ }
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_trap_group_fill(msg, devlink, group_item,
+ DEVLINK_CMD_TRAP_GROUP_NEW,
+ info->snd_portid, info->snd_seq, 0);
+ if (err)
+ goto err_trap_group_fill;
+
+ return genlmsg_reply(msg, info);
+
+err_trap_group_fill:
+ nlmsg_free(msg);
+ return err;
+}
+
+static int devlink_nl_cmd_trap_group_get_dumpit(struct sk_buff *msg,
+ struct netlink_callback *cb)
+{
+ enum devlink_command cmd = DEVLINK_CMD_TRAP_GROUP_NEW;
+ struct devlink_trap_group_item *group_item;
+ u32 portid = NETLINK_CB(cb->skb).portid;
+ struct devlink *devlink;
+ int start = cb->args[0];
+ int idx = 0;
+ int err;
+
+ mutex_lock(&devlink_mutex);
+ list_for_each_entry(devlink, &devlink_list, list) {
+ if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+ continue;
+ mutex_lock(&devlink->lock);
+ list_for_each_entry(group_item, &devlink->trap_group_list,
+ list) {
+ if (idx < start) {
+ idx++;
+ continue;
+ }
+ err = devlink_nl_trap_group_fill(msg, devlink,
+ group_item, cmd,
+ portid,
+ cb->nlh->nlmsg_seq,
+ NLM_F_MULTI);
+ if (err) {
+ mutex_unlock(&devlink->lock);
+ goto out;
+ }
+ idx++;
+ }
+ mutex_unlock(&devlink->lock);
+ }
+out:
+ mutex_unlock(&devlink_mutex);
+
+ cb->args[0] = idx;
+ return msg->len;
+}
+
+static int
+__devlink_trap_group_action_set(struct devlink *devlink,
+ struct devlink_trap_group_item *group_item,
+ enum devlink_trap_action trap_action,
+ struct netlink_ext_ack *extack)
+{
+ const char *group_name = group_item->group->name;
+ struct devlink_trap_item *trap_item;
+ int err;
+
+ list_for_each_entry(trap_item, &devlink->trap_list, list) {
+ if (strcmp(trap_item->trap->group.name, group_name))
+ continue;
+ err = __devlink_trap_action_set(devlink, trap_item,
+ trap_action, extack);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static int
+devlink_trap_group_action_set(struct devlink *devlink,
+ struct devlink_trap_group_item *group_item,
+ struct genl_info *info)
+{
+ enum devlink_trap_action trap_action;
+ int err;
+
+ if (!info->attrs[DEVLINK_ATTR_TRAP_ACTION])
+ return 0;
+
+ err = devlink_trap_action_get_from_info(info, &trap_action);
+ if (err) {
+ NL_SET_ERR_MSG_MOD(info->extack, "Invalid trap action");
+ return -EINVAL;
+ }
+
+ err = __devlink_trap_group_action_set(devlink, group_item, trap_action,
+ info->extack);
+ if (err)
+ return err;
+
+ return 0;
+}
+
+static int devlink_nl_cmd_trap_group_set_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct netlink_ext_ack *extack = info->extack;
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_trap_group_item *group_item;
+ int err;
+
+ if (list_empty(&devlink->trap_group_list))
+ return -EOPNOTSUPP;
+
+ group_item = devlink_trap_group_item_get_from_info(devlink, info);
+ if (!group_item) {
+ NL_SET_ERR_MSG_MOD(extack, "Device did not register this trap group");
+ return -ENOENT;
+ }
+
+ err = devlink_trap_group_action_set(devlink, group_item, info);
+ if (err)
+ return err;
+
+ return 0;
+}
+
static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING },
[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },
@@ -5184,6 +5956,12 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] = { .type = NLA_U8 },
[DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME] = { .type = NLA_NUL_STRING },
[DEVLINK_ATTR_FLASH_UPDATE_COMPONENT] = { .type = NLA_NUL_STRING },
+ [DEVLINK_ATTR_TRAP_NAME] = { .type = NLA_NUL_STRING },
+ [DEVLINK_ATTR_TRAP_ACTION] = { .type = NLA_U8 },
+ [DEVLINK_ATTR_TRAP_GROUP_NAME] = { .type = NLA_NUL_STRING },
+ [DEVLINK_ATTR_NETNS_PID] = { .type = NLA_U32 },
+ [DEVLINK_ATTR_NETNS_FD] = { .type = NLA_U32 },
+ [DEVLINK_ATTR_NETNS_ID] = { .type = NLA_U32 },
};
static const struct genl_ops devlink_nl_ops[] = {
@@ -5414,7 +6192,8 @@ static const struct genl_ops devlink_nl_ops[] = {
},
{
.cmd = DEVLINK_CMD_REGION_READ,
- .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .validate = GENL_DONT_VALIDATE_STRICT |
+ GENL_DONT_VALIDATE_DUMP_STRICT,
.dumpit = devlink_nl_cmd_region_read_dumpit,
.flags = GENL_ADMIN_PERM,
.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
@@ -5462,7 +6241,8 @@ static const struct genl_ops devlink_nl_ops[] = {
},
{
.cmd = DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET,
- .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .validate = GENL_DONT_VALIDATE_STRICT |
+ GENL_DONT_VALIDATE_DUMP_STRICT,
.dumpit = devlink_nl_cmd_health_reporter_dump_get_dumpit,
.flags = GENL_ADMIN_PERM,
.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
@@ -5483,6 +6263,32 @@ static const struct genl_ops devlink_nl_ops[] = {
.flags = GENL_ADMIN_PERM,
.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
},
+ {
+ .cmd = DEVLINK_CMD_TRAP_GET,
+ .doit = devlink_nl_cmd_trap_get_doit,
+ .dumpit = devlink_nl_cmd_trap_get_dumpit,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ /* can be retrieved by unprivileged users */
+ },
+ {
+ .cmd = DEVLINK_CMD_TRAP_SET,
+ .doit = devlink_nl_cmd_trap_set_doit,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ },
+ {
+ .cmd = DEVLINK_CMD_TRAP_GROUP_GET,
+ .doit = devlink_nl_cmd_trap_group_get_doit,
+ .dumpit = devlink_nl_cmd_trap_group_get_dumpit,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ /* can be retrieved by unprivileged users */
+ },
+ {
+ .cmd = DEVLINK_CMD_TRAP_GROUP_SET,
+ .doit = devlink_nl_cmd_trap_group_set_doit,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ },
};
static struct genl_family devlink_nl_family __ro_after_init = {
@@ -5520,7 +6326,7 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size)
if (!devlink)
return NULL;
devlink->ops = ops;
- devlink_net_set(devlink, &init_net);
+ __devlink_net_set(devlink, &init_net);
INIT_LIST_HEAD(&devlink->port_list);
INIT_LIST_HEAD(&devlink->sb_list);
INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list);
@@ -5528,6 +6334,8 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size)
INIT_LIST_HEAD(&devlink->param_list);
INIT_LIST_HEAD(&devlink->region_list);
INIT_LIST_HEAD(&devlink->reporter_list);
+ INIT_LIST_HEAD(&devlink->trap_list);
+ INIT_LIST_HEAD(&devlink->trap_group_list);
mutex_init(&devlink->lock);
mutex_init(&devlink->reporters_lock);
return devlink;
@@ -5544,6 +6352,7 @@ int devlink_register(struct devlink *devlink, struct device *dev)
{
mutex_lock(&devlink_mutex);
devlink->dev = dev;
+ devlink->registered = true;
list_add_tail(&devlink->list, &devlink_list);
devlink_notify(devlink, DEVLINK_CMD_NEW);
mutex_unlock(&devlink_mutex);
@@ -5559,6 +6368,8 @@ EXPORT_SYMBOL_GPL(devlink_register);
void devlink_unregister(struct devlink *devlink)
{
mutex_lock(&devlink_mutex);
+ WARN_ON(devlink_reload_supported(devlink) &&
+ devlink->reload_enabled);
devlink_notify(devlink, DEVLINK_CMD_DEL);
list_del(&devlink->list);
mutex_unlock(&devlink_mutex);
@@ -5566,6 +6377,41 @@ void devlink_unregister(struct devlink *devlink)
EXPORT_SYMBOL_GPL(devlink_unregister);
/**
+ * devlink_reload_enable - Enable reload of devlink instance
+ *
+ * @devlink: devlink
+ *
+ * Should be called at end of device initialization
+ * process when reload operation is supported.
+ */
+void devlink_reload_enable(struct devlink *devlink)
+{
+ mutex_lock(&devlink_mutex);
+ devlink->reload_enabled = true;
+ mutex_unlock(&devlink_mutex);
+}
+EXPORT_SYMBOL_GPL(devlink_reload_enable);
+
+/**
+ * devlink_reload_disable - Disable reload of devlink instance
+ *
+ * @devlink: devlink
+ *
+ * Should be called at the beginning of device cleanup
+ * process when reload operation is supported.
+ */
+void devlink_reload_disable(struct devlink *devlink)
+{
+ mutex_lock(&devlink_mutex);
+ /* Mutex is taken which ensures that no reload operation is in
+ * progress while setting up forbidded flag.
+ */
+ devlink->reload_enabled = false;
+ mutex_unlock(&devlink_mutex);
+}
+EXPORT_SYMBOL_GPL(devlink_reload_disable);
+
+/**
* devlink_free - Free devlink instance resources
*
* @devlink: devlink
@@ -5574,6 +6420,8 @@ void devlink_free(struct devlink *devlink)
{
mutex_destroy(&devlink->reporters_lock);
mutex_destroy(&devlink->lock);
+ WARN_ON(!list_empty(&devlink->trap_group_list));
+ WARN_ON(!list_empty(&devlink->trap_list));
WARN_ON(!list_empty(&devlink->reporter_list));
WARN_ON(!list_empty(&devlink->region_list));
WARN_ON(!list_empty(&devlink->param_list));
@@ -5598,7 +6446,7 @@ static bool devlink_port_type_should_warn(struct devlink_port *devlink_port)
devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_DSA;
}
-#define DEVLINK_PORT_TYPE_WARN_TIMEOUT (HZ * 30)
+#define DEVLINK_PORT_TYPE_WARN_TIMEOUT (HZ * 3600)
static void devlink_port_type_warn_schedule(struct devlink_port *devlink_port)
{
@@ -5678,10 +6526,10 @@ static void __devlink_port_type_set(struct devlink_port *devlink_port,
if (WARN_ON(!devlink_port->registered))
return;
devlink_port_type_warn_cancel(devlink_port);
- spin_lock(&devlink_port->type_lock);
+ spin_lock_bh(&devlink_port->type_lock);
devlink_port->type = type;
devlink_port->type_dev = type_dev;
- spin_unlock(&devlink_port->type_lock);
+ spin_unlock_bh(&devlink_port->type_lock);
devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
}
@@ -6538,7 +7386,7 @@ __devlink_param_driverinit_value_set(struct devlink *devlink,
int devlink_param_driverinit_value_get(struct devlink *devlink, u32 param_id,
union devlink_param_value *init_val)
{
- if (!devlink->ops->reload)
+ if (!devlink_reload_supported(devlink))
return -EOPNOTSUPP;
return __devlink_param_driverinit_value_get(&devlink->param_list,
@@ -6585,7 +7433,7 @@ int devlink_port_param_driverinit_value_get(struct devlink_port *devlink_port,
{
struct devlink *devlink = devlink_port->devlink;
- if (!devlink->ops->reload)
+ if (!devlink_reload_supported(devlink))
return -EOPNOTSUPP;
return __devlink_param_driverinit_value_get(&devlink_port->param_list,
@@ -6744,7 +7592,7 @@ void devlink_region_destroy(struct devlink_region *region)
/* Free all snapshots of region */
list_for_each_entry_safe(snapshot, ts, &region->snapshot_list, list)
- devlink_region_snapshot_del(snapshot);
+ devlink_region_snapshot_del(region, snapshot);
list_del(&region->list);
@@ -6755,7 +7603,7 @@ void devlink_region_destroy(struct devlink_region *region)
EXPORT_SYMBOL_GPL(devlink_region_destroy);
/**
- * devlink_region_shapshot_id_get - get snapshot ID
+ * devlink_region_snapshot_id_get - get snapshot ID
*
* This callback should be called when adding a new snapshot,
* Driver should use the same id for multiple snapshots taken
@@ -6763,7 +7611,7 @@ EXPORT_SYMBOL_GPL(devlink_region_destroy);
*
* @devlink: devlink
*/
-u32 devlink_region_shapshot_id_get(struct devlink *devlink)
+u32 devlink_region_snapshot_id_get(struct devlink *devlink)
{
u32 id;
@@ -6773,7 +7621,7 @@ u32 devlink_region_shapshot_id_get(struct devlink *devlink)
return id;
}
-EXPORT_SYMBOL_GPL(devlink_region_shapshot_id_get);
+EXPORT_SYMBOL_GPL(devlink_region_snapshot_id_get);
/**
* devlink_region_snapshot_create - create a new snapshot
@@ -6784,12 +7632,11 @@ EXPORT_SYMBOL_GPL(devlink_region_shapshot_id_get);
* The @snapshot_id should be obtained using the getter function.
*
* @region: devlink region of the snapshot
- * @data_len: size of snapshot data
* @data: snapshot data
* @snapshot_id: snapshot id to be created
* @data_destructor: pointer to destructor function to free data
*/
-int devlink_region_snapshot_create(struct devlink_region *region, u64 data_len,
+int devlink_region_snapshot_create(struct devlink_region *region,
u8 *data, u32 snapshot_id,
devlink_snapshot_data_dest_t *data_destructor)
{
@@ -6819,7 +7666,6 @@ int devlink_region_snapshot_create(struct devlink_region *region, u64 data_len,
snapshot->id = snapshot_id;
snapshot->region = region;
snapshot->data = data;
- snapshot->data_len = data_len;
snapshot->data_destructor = data_destructor;
list_add_tail(&snapshot->list, &region->snapshot_list);
@@ -6836,6 +7682,494 @@ unlock:
}
EXPORT_SYMBOL_GPL(devlink_region_snapshot_create);
+#define DEVLINK_TRAP(_id, _type) \
+ { \
+ .type = DEVLINK_TRAP_TYPE_##_type, \
+ .id = DEVLINK_TRAP_GENERIC_ID_##_id, \
+ .name = DEVLINK_TRAP_GENERIC_NAME_##_id, \
+ }
+
+static const struct devlink_trap devlink_trap_generic[] = {
+ DEVLINK_TRAP(SMAC_MC, DROP),
+ DEVLINK_TRAP(VLAN_TAG_MISMATCH, DROP),
+ DEVLINK_TRAP(INGRESS_VLAN_FILTER, DROP),
+ DEVLINK_TRAP(INGRESS_STP_FILTER, DROP),
+ DEVLINK_TRAP(EMPTY_TX_LIST, DROP),
+ DEVLINK_TRAP(PORT_LOOPBACK_FILTER, DROP),
+ DEVLINK_TRAP(BLACKHOLE_ROUTE, DROP),
+ DEVLINK_TRAP(TTL_ERROR, EXCEPTION),
+ DEVLINK_TRAP(TAIL_DROP, DROP),
+ DEVLINK_TRAP(NON_IP_PACKET, DROP),
+ DEVLINK_TRAP(UC_DIP_MC_DMAC, DROP),
+ DEVLINK_TRAP(DIP_LB, DROP),
+ DEVLINK_TRAP(SIP_MC, DROP),
+ DEVLINK_TRAP(SIP_LB, DROP),
+ DEVLINK_TRAP(CORRUPTED_IP_HDR, DROP),
+ DEVLINK_TRAP(IPV4_SIP_BC, DROP),
+ DEVLINK_TRAP(IPV6_MC_DIP_RESERVED_SCOPE, DROP),
+ DEVLINK_TRAP(IPV6_MC_DIP_INTERFACE_LOCAL_SCOPE, DROP),
+ DEVLINK_TRAP(MTU_ERROR, EXCEPTION),
+ DEVLINK_TRAP(UNRESOLVED_NEIGH, EXCEPTION),
+ DEVLINK_TRAP(RPF, EXCEPTION),
+ DEVLINK_TRAP(REJECT_ROUTE, EXCEPTION),
+ DEVLINK_TRAP(IPV4_LPM_UNICAST_MISS, EXCEPTION),
+ DEVLINK_TRAP(IPV6_LPM_UNICAST_MISS, EXCEPTION),
+ DEVLINK_TRAP(NON_ROUTABLE, DROP),
+ DEVLINK_TRAP(DECAP_ERROR, EXCEPTION),
+ DEVLINK_TRAP(OVERLAY_SMAC_MC, DROP),
+};
+
+#define DEVLINK_TRAP_GROUP(_id) \
+ { \
+ .id = DEVLINK_TRAP_GROUP_GENERIC_ID_##_id, \
+ .name = DEVLINK_TRAP_GROUP_GENERIC_NAME_##_id, \
+ }
+
+static const struct devlink_trap_group devlink_trap_group_generic[] = {
+ DEVLINK_TRAP_GROUP(L2_DROPS),
+ DEVLINK_TRAP_GROUP(L3_DROPS),
+ DEVLINK_TRAP_GROUP(BUFFER_DROPS),
+ DEVLINK_TRAP_GROUP(TUNNEL_DROPS),
+};
+
+static int devlink_trap_generic_verify(const struct devlink_trap *trap)
+{
+ if (trap->id > DEVLINK_TRAP_GENERIC_ID_MAX)
+ return -EINVAL;
+
+ if (strcmp(trap->name, devlink_trap_generic[trap->id].name))
+ return -EINVAL;
+
+ if (trap->type != devlink_trap_generic[trap->id].type)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int devlink_trap_driver_verify(const struct devlink_trap *trap)
+{
+ int i;
+
+ if (trap->id <= DEVLINK_TRAP_GENERIC_ID_MAX)
+ return -EINVAL;
+
+ for (i = 0; i < ARRAY_SIZE(devlink_trap_generic); i++) {
+ if (!strcmp(trap->name, devlink_trap_generic[i].name))
+ return -EEXIST;
+ }
+
+ return 0;
+}
+
+static int devlink_trap_verify(const struct devlink_trap *trap)
+{
+ if (!trap || !trap->name || !trap->group.name)
+ return -EINVAL;
+
+ if (trap->generic)
+ return devlink_trap_generic_verify(trap);
+ else
+ return devlink_trap_driver_verify(trap);
+}
+
+static int
+devlink_trap_group_generic_verify(const struct devlink_trap_group *group)
+{
+ if (group->id > DEVLINK_TRAP_GROUP_GENERIC_ID_MAX)
+ return -EINVAL;
+
+ if (strcmp(group->name, devlink_trap_group_generic[group->id].name))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int
+devlink_trap_group_driver_verify(const struct devlink_trap_group *group)
+{
+ int i;
+
+ if (group->id <= DEVLINK_TRAP_GROUP_GENERIC_ID_MAX)
+ return -EINVAL;
+
+ for (i = 0; i < ARRAY_SIZE(devlink_trap_group_generic); i++) {
+ if (!strcmp(group->name, devlink_trap_group_generic[i].name))
+ return -EEXIST;
+ }
+
+ return 0;
+}
+
+static int devlink_trap_group_verify(const struct devlink_trap_group *group)
+{
+ if (group->generic)
+ return devlink_trap_group_generic_verify(group);
+ else
+ return devlink_trap_group_driver_verify(group);
+}
+
+static void
+devlink_trap_group_notify(struct devlink *devlink,
+ const struct devlink_trap_group_item *group_item,
+ enum devlink_command cmd)
+{
+ struct sk_buff *msg;
+ int err;
+
+ WARN_ON_ONCE(cmd != DEVLINK_CMD_TRAP_GROUP_NEW &&
+ cmd != DEVLINK_CMD_TRAP_GROUP_DEL);
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return;
+
+ err = devlink_nl_trap_group_fill(msg, devlink, group_item, cmd, 0, 0,
+ 0);
+ if (err) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
+ msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
+}
+
+static struct devlink_trap_group_item *
+devlink_trap_group_item_create(struct devlink *devlink,
+ const struct devlink_trap_group *group)
+{
+ struct devlink_trap_group_item *group_item;
+ int err;
+
+ err = devlink_trap_group_verify(group);
+ if (err)
+ return ERR_PTR(err);
+
+ group_item = kzalloc(sizeof(*group_item), GFP_KERNEL);
+ if (!group_item)
+ return ERR_PTR(-ENOMEM);
+
+ group_item->stats = netdev_alloc_pcpu_stats(struct devlink_stats);
+ if (!group_item->stats) {
+ err = -ENOMEM;
+ goto err_stats_alloc;
+ }
+
+ group_item->group = group;
+ refcount_set(&group_item->refcount, 1);
+
+ if (devlink->ops->trap_group_init) {
+ err = devlink->ops->trap_group_init(devlink, group);
+ if (err)
+ goto err_group_init;
+ }
+
+ list_add_tail(&group_item->list, &devlink->trap_group_list);
+ devlink_trap_group_notify(devlink, group_item,
+ DEVLINK_CMD_TRAP_GROUP_NEW);
+
+ return group_item;
+
+err_group_init:
+ free_percpu(group_item->stats);
+err_stats_alloc:
+ kfree(group_item);
+ return ERR_PTR(err);
+}
+
+static void
+devlink_trap_group_item_destroy(struct devlink *devlink,
+ struct devlink_trap_group_item *group_item)
+{
+ devlink_trap_group_notify(devlink, group_item,
+ DEVLINK_CMD_TRAP_GROUP_DEL);
+ list_del(&group_item->list);
+ free_percpu(group_item->stats);
+ kfree(group_item);
+}
+
+static struct devlink_trap_group_item *
+devlink_trap_group_item_get(struct devlink *devlink,
+ const struct devlink_trap_group *group)
+{
+ struct devlink_trap_group_item *group_item;
+
+ group_item = devlink_trap_group_item_lookup(devlink, group->name);
+ if (group_item) {
+ refcount_inc(&group_item->refcount);
+ return group_item;
+ }
+
+ return devlink_trap_group_item_create(devlink, group);
+}
+
+static void
+devlink_trap_group_item_put(struct devlink *devlink,
+ struct devlink_trap_group_item *group_item)
+{
+ if (!refcount_dec_and_test(&group_item->refcount))
+ return;
+
+ devlink_trap_group_item_destroy(devlink, group_item);
+}
+
+static int
+devlink_trap_item_group_link(struct devlink *devlink,
+ struct devlink_trap_item *trap_item)
+{
+ struct devlink_trap_group_item *group_item;
+
+ group_item = devlink_trap_group_item_get(devlink,
+ &trap_item->trap->group);
+ if (IS_ERR(group_item))
+ return PTR_ERR(group_item);
+
+ trap_item->group_item = group_item;
+
+ return 0;
+}
+
+static void
+devlink_trap_item_group_unlink(struct devlink *devlink,
+ struct devlink_trap_item *trap_item)
+{
+ devlink_trap_group_item_put(devlink, trap_item->group_item);
+}
+
+static void devlink_trap_notify(struct devlink *devlink,
+ const struct devlink_trap_item *trap_item,
+ enum devlink_command cmd)
+{
+ struct sk_buff *msg;
+ int err;
+
+ WARN_ON_ONCE(cmd != DEVLINK_CMD_TRAP_NEW &&
+ cmd != DEVLINK_CMD_TRAP_DEL);
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return;
+
+ err = devlink_nl_trap_fill(msg, devlink, trap_item, cmd, 0, 0, 0);
+ if (err) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
+ msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
+}
+
+static int
+devlink_trap_register(struct devlink *devlink,
+ const struct devlink_trap *trap, void *priv)
+{
+ struct devlink_trap_item *trap_item;
+ int err;
+
+ if (devlink_trap_item_lookup(devlink, trap->name))
+ return -EEXIST;
+
+ trap_item = kzalloc(sizeof(*trap_item), GFP_KERNEL);
+ if (!trap_item)
+ return -ENOMEM;
+
+ trap_item->stats = netdev_alloc_pcpu_stats(struct devlink_stats);
+ if (!trap_item->stats) {
+ err = -ENOMEM;
+ goto err_stats_alloc;
+ }
+
+ trap_item->trap = trap;
+ trap_item->action = trap->init_action;
+ trap_item->priv = priv;
+
+ err = devlink_trap_item_group_link(devlink, trap_item);
+ if (err)
+ goto err_group_link;
+
+ err = devlink->ops->trap_init(devlink, trap, trap_item);
+ if (err)
+ goto err_trap_init;
+
+ list_add_tail(&trap_item->list, &devlink->trap_list);
+ devlink_trap_notify(devlink, trap_item, DEVLINK_CMD_TRAP_NEW);
+
+ return 0;
+
+err_trap_init:
+ devlink_trap_item_group_unlink(devlink, trap_item);
+err_group_link:
+ free_percpu(trap_item->stats);
+err_stats_alloc:
+ kfree(trap_item);
+ return err;
+}
+
+static void devlink_trap_unregister(struct devlink *devlink,
+ const struct devlink_trap *trap)
+{
+ struct devlink_trap_item *trap_item;
+
+ trap_item = devlink_trap_item_lookup(devlink, trap->name);
+ if (WARN_ON_ONCE(!trap_item))
+ return;
+
+ devlink_trap_notify(devlink, trap_item, DEVLINK_CMD_TRAP_DEL);
+ list_del(&trap_item->list);
+ if (devlink->ops->trap_fini)
+ devlink->ops->trap_fini(devlink, trap, trap_item);
+ devlink_trap_item_group_unlink(devlink, trap_item);
+ free_percpu(trap_item->stats);
+ kfree(trap_item);
+}
+
+static void devlink_trap_disable(struct devlink *devlink,
+ const struct devlink_trap *trap)
+{
+ struct devlink_trap_item *trap_item;
+
+ trap_item = devlink_trap_item_lookup(devlink, trap->name);
+ if (WARN_ON_ONCE(!trap_item))
+ return;
+
+ devlink->ops->trap_action_set(devlink, trap, DEVLINK_TRAP_ACTION_DROP);
+ trap_item->action = DEVLINK_TRAP_ACTION_DROP;
+}
+
+/**
+ * devlink_traps_register - Register packet traps with devlink.
+ * @devlink: devlink.
+ * @traps: Packet traps.
+ * @traps_count: Count of provided packet traps.
+ * @priv: Driver private information.
+ *
+ * Return: Non-zero value on failure.
+ */
+int devlink_traps_register(struct devlink *devlink,
+ const struct devlink_trap *traps,
+ size_t traps_count, void *priv)
+{
+ int i, err;
+
+ if (!devlink->ops->trap_init || !devlink->ops->trap_action_set)
+ return -EINVAL;
+
+ mutex_lock(&devlink->lock);
+ for (i = 0; i < traps_count; i++) {
+ const struct devlink_trap *trap = &traps[i];
+
+ err = devlink_trap_verify(trap);
+ if (err)
+ goto err_trap_verify;
+
+ err = devlink_trap_register(devlink, trap, priv);
+ if (err)
+ goto err_trap_register;
+ }
+ mutex_unlock(&devlink->lock);
+
+ return 0;
+
+err_trap_register:
+err_trap_verify:
+ for (i--; i >= 0; i--)
+ devlink_trap_unregister(devlink, &traps[i]);
+ mutex_unlock(&devlink->lock);
+ return err;
+}
+EXPORT_SYMBOL_GPL(devlink_traps_register);
+
+/**
+ * devlink_traps_unregister - Unregister packet traps from devlink.
+ * @devlink: devlink.
+ * @traps: Packet traps.
+ * @traps_count: Count of provided packet traps.
+ */
+void devlink_traps_unregister(struct devlink *devlink,
+ const struct devlink_trap *traps,
+ size_t traps_count)
+{
+ int i;
+
+ mutex_lock(&devlink->lock);
+ /* Make sure we do not have any packets in-flight while unregistering
+ * traps by disabling all of them and waiting for a grace period.
+ */
+ for (i = traps_count - 1; i >= 0; i--)
+ devlink_trap_disable(devlink, &traps[i]);
+ synchronize_rcu();
+ for (i = traps_count - 1; i >= 0; i--)
+ devlink_trap_unregister(devlink, &traps[i]);
+ mutex_unlock(&devlink->lock);
+}
+EXPORT_SYMBOL_GPL(devlink_traps_unregister);
+
+static void
+devlink_trap_stats_update(struct devlink_stats __percpu *trap_stats,
+ size_t skb_len)
+{
+ struct devlink_stats *stats;
+
+ stats = this_cpu_ptr(trap_stats);
+ u64_stats_update_begin(&stats->syncp);
+ stats->rx_bytes += skb_len;
+ stats->rx_packets++;
+ u64_stats_update_end(&stats->syncp);
+}
+
+static void
+devlink_trap_report_metadata_fill(struct net_dm_hw_metadata *hw_metadata,
+ const struct devlink_trap_item *trap_item,
+ struct devlink_port *in_devlink_port)
+{
+ struct devlink_trap_group_item *group_item = trap_item->group_item;
+
+ hw_metadata->trap_group_name = group_item->group->name;
+ hw_metadata->trap_name = trap_item->trap->name;
+
+ spin_lock(&in_devlink_port->type_lock);
+ if (in_devlink_port->type == DEVLINK_PORT_TYPE_ETH)
+ hw_metadata->input_dev = in_devlink_port->type_dev;
+ spin_unlock(&in_devlink_port->type_lock);
+}
+
+/**
+ * devlink_trap_report - Report trapped packet to drop monitor.
+ * @devlink: devlink.
+ * @skb: Trapped packet.
+ * @trap_ctx: Trap context.
+ * @in_devlink_port: Input devlink port.
+ */
+void devlink_trap_report(struct devlink *devlink, struct sk_buff *skb,
+ void *trap_ctx, struct devlink_port *in_devlink_port)
+{
+ struct devlink_trap_item *trap_item = trap_ctx;
+ struct net_dm_hw_metadata hw_metadata = {};
+
+ devlink_trap_stats_update(trap_item->stats, skb->len);
+ devlink_trap_stats_update(trap_item->group_item->stats, skb->len);
+
+ devlink_trap_report_metadata_fill(&hw_metadata, trap_item,
+ in_devlink_port);
+ net_dm_hw_report(skb, &hw_metadata);
+}
+EXPORT_SYMBOL_GPL(devlink_trap_report);
+
+/**
+ * devlink_trap_ctx_priv - Trap context to driver private information.
+ * @trap_ctx: Trap context.
+ *
+ * Return: Driver private information passed during registration.
+ */
+void *devlink_trap_ctx_priv(void *trap_ctx)
+{
+ struct devlink_trap_item *trap_item = trap_ctx;
+
+ return trap_item->priv;
+}
+EXPORT_SYMBOL_GPL(devlink_trap_ctx_priv);
+
static void __devlink_compat_running_version(struct devlink *devlink,
char *buf, size_t len)
{
@@ -6941,11 +8275,10 @@ int devlink_compat_switch_id_get(struct net_device *dev,
{
struct devlink_port *devlink_port;
- /* RTNL mutex is held here which ensures that devlink_port
- * instance cannot disappear in the middle. No need to take
+ /* Caller must hold RTNL mutex or reference to dev, which ensures that
+ * devlink_port instance cannot disappear in the middle. No need to take
* any devlink lock as only permanent values are accessed.
*/
- ASSERT_RTNL();
devlink_port = netdev_to_devlink_port(dev);
if (!devlink_port || !devlink_port->attrs.switch_port)
return -EOPNOTSUPP;
@@ -6955,9 +8288,43 @@ int devlink_compat_switch_id_get(struct net_device *dev,
return 0;
}
+static void __net_exit devlink_pernet_pre_exit(struct net *net)
+{
+ struct devlink *devlink;
+ int err;
+
+ /* In case network namespace is getting destroyed, reload
+ * all devlink instances from this namespace into init_net.
+ */
+ mutex_lock(&devlink_mutex);
+ list_for_each_entry(devlink, &devlink_list, list) {
+ if (net_eq(devlink_net(devlink), net)) {
+ if (WARN_ON(!devlink_reload_supported(devlink)))
+ continue;
+ err = devlink_reload(devlink, &init_net, NULL);
+ if (err && err != -EOPNOTSUPP)
+ pr_warn("Failed to reload devlink instance into init_net\n");
+ }
+ }
+ mutex_unlock(&devlink_mutex);
+}
+
+static struct pernet_operations devlink_pernet_ops __net_initdata = {
+ .pre_exit = devlink_pernet_pre_exit,
+};
+
static int __init devlink_init(void)
{
- return genl_register_family(&devlink_nl_family);
+ int err;
+
+ err = genl_register_family(&devlink_nl_family);
+ if (err)
+ goto out;
+ err = register_pernet_subsys(&devlink_pernet_ops);
+
+out:
+ WARN_ON(err);
+ return err;
}
subsys_initcall(devlink_init);
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index 4ea4347f5062..31700e0c3928 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -26,6 +26,7 @@
#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/module.h>
+#include <net/drop_monitor.h>
#include <net/genetlink.h>
#include <net/netevent.h>
@@ -43,13 +44,44 @@
* netlink alerts
*/
static int trace_state = TRACE_OFF;
-static DEFINE_MUTEX(trace_state_mutex);
+static bool monitor_hw;
+
+/* net_dm_mutex
+ *
+ * An overall lock guarding every operation coming from userspace.
+ * It also guards the global 'hw_stats_list' list.
+ */
+static DEFINE_MUTEX(net_dm_mutex);
+
+struct net_dm_stats {
+ u64 dropped;
+ struct u64_stats_sync syncp;
+};
+
+#define NET_DM_MAX_HW_TRAP_NAME_LEN 40
+
+struct net_dm_hw_entry {
+ char trap_name[NET_DM_MAX_HW_TRAP_NAME_LEN];
+ u32 count;
+};
+
+struct net_dm_hw_entries {
+ u32 num_entries;
+ struct net_dm_hw_entry entries[0];
+};
struct per_cpu_dm_data {
- spinlock_t lock;
- struct sk_buff *skb;
+ spinlock_t lock; /* Protects 'skb', 'hw_entries' and
+ * 'send_timer'
+ */
+ union {
+ struct sk_buff *skb;
+ struct net_dm_hw_entries *hw_entries;
+ };
+ struct sk_buff_head drop_queue;
struct work_struct dm_alert_work;
struct timer_list send_timer;
+ struct net_dm_stats stats;
};
struct dm_hw_stat_delta {
@@ -63,12 +95,37 @@ struct dm_hw_stat_delta {
static struct genl_family net_drop_monitor_family;
static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data);
+static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_hw_cpu_data);
static int dm_hit_limit = 64;
static int dm_delay = 1;
static unsigned long dm_hw_check_delta = 2*HZ;
static LIST_HEAD(hw_stats_list);
+static enum net_dm_alert_mode net_dm_alert_mode = NET_DM_ALERT_MODE_SUMMARY;
+static u32 net_dm_trunc_len;
+static u32 net_dm_queue_len = 1000;
+
+struct net_dm_alert_ops {
+ void (*kfree_skb_probe)(void *ignore, struct sk_buff *skb,
+ void *location);
+ void (*napi_poll_probe)(void *ignore, struct napi_struct *napi,
+ int work, int budget);
+ void (*work_item_func)(struct work_struct *work);
+ void (*hw_work_item_func)(struct work_struct *work);
+ void (*hw_probe)(struct sk_buff *skb,
+ const struct net_dm_hw_metadata *hw_metadata);
+};
+
+struct net_dm_skb_cb {
+ union {
+ struct net_dm_hw_metadata *hw_metadata;
+ void *pc;
+ };
+};
+
+#define NET_DM_SKB_CB(__skb) ((struct net_dm_skb_cb *)&((__skb)->cb[0]))
+
static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data)
{
size_t al;
@@ -235,48 +292,842 @@ static void trace_napi_poll_hit(void *ignore, struct napi_struct *napi,
rcu_read_unlock();
}
-static int set_all_monitor_traces(int state)
+static struct net_dm_hw_entries *
+net_dm_hw_reset_per_cpu_data(struct per_cpu_dm_data *hw_data)
{
- int rc = 0;
- struct dm_hw_stat_delta *new_stat = NULL;
- struct dm_hw_stat_delta *temp;
+ struct net_dm_hw_entries *hw_entries;
+ unsigned long flags;
+
+ hw_entries = kzalloc(struct_size(hw_entries, entries, dm_hit_limit),
+ GFP_KERNEL);
+ if (!hw_entries) {
+ /* If the memory allocation failed, we try to perform another
+ * allocation in 1/10 second. Otherwise, the probe function
+ * will constantly bail out.
+ */
+ mod_timer(&hw_data->send_timer, jiffies + HZ / 10);
+ }
- mutex_lock(&trace_state_mutex);
+ spin_lock_irqsave(&hw_data->lock, flags);
+ swap(hw_data->hw_entries, hw_entries);
+ spin_unlock_irqrestore(&hw_data->lock, flags);
- if (state == trace_state) {
- rc = -EAGAIN;
- goto out_unlock;
+ return hw_entries;
+}
+
+static int net_dm_hw_entry_put(struct sk_buff *msg,
+ const struct net_dm_hw_entry *hw_entry)
+{
+ struct nlattr *attr;
+
+ attr = nla_nest_start(msg, NET_DM_ATTR_HW_ENTRY);
+ if (!attr)
+ return -EMSGSIZE;
+
+ if (nla_put_string(msg, NET_DM_ATTR_HW_TRAP_NAME, hw_entry->trap_name))
+ goto nla_put_failure;
+
+ if (nla_put_u32(msg, NET_DM_ATTR_HW_TRAP_COUNT, hw_entry->count))
+ goto nla_put_failure;
+
+ nla_nest_end(msg, attr);
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(msg, attr);
+ return -EMSGSIZE;
+}
+
+static int net_dm_hw_entries_put(struct sk_buff *msg,
+ const struct net_dm_hw_entries *hw_entries)
+{
+ struct nlattr *attr;
+ int i;
+
+ attr = nla_nest_start(msg, NET_DM_ATTR_HW_ENTRIES);
+ if (!attr)
+ return -EMSGSIZE;
+
+ for (i = 0; i < hw_entries->num_entries; i++) {
+ int rc;
+
+ rc = net_dm_hw_entry_put(msg, &hw_entries->entries[i]);
+ if (rc)
+ goto nla_put_failure;
}
- switch (state) {
- case TRACE_ON:
- if (!try_module_get(THIS_MODULE)) {
- rc = -ENODEV;
- break;
+ nla_nest_end(msg, attr);
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(msg, attr);
+ return -EMSGSIZE;
+}
+
+static int
+net_dm_hw_summary_report_fill(struct sk_buff *msg,
+ const struct net_dm_hw_entries *hw_entries)
+{
+ struct net_dm_alert_msg anc_hdr = { 0 };
+ void *hdr;
+ int rc;
+
+ hdr = genlmsg_put(msg, 0, 0, &net_drop_monitor_family, 0,
+ NET_DM_CMD_ALERT);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ /* We need to put the ancillary header in order not to break user
+ * space.
+ */
+ if (nla_put(msg, NLA_UNSPEC, sizeof(anc_hdr), &anc_hdr))
+ goto nla_put_failure;
+
+ rc = net_dm_hw_entries_put(msg, hw_entries);
+ if (rc)
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static void net_dm_hw_summary_work(struct work_struct *work)
+{
+ struct net_dm_hw_entries *hw_entries;
+ struct per_cpu_dm_data *hw_data;
+ struct sk_buff *msg;
+ int rc;
+
+ hw_data = container_of(work, struct per_cpu_dm_data, dm_alert_work);
+
+ hw_entries = net_dm_hw_reset_per_cpu_data(hw_data);
+ if (!hw_entries)
+ return;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ goto out;
+
+ rc = net_dm_hw_summary_report_fill(msg, hw_entries);
+ if (rc) {
+ nlmsg_free(msg);
+ goto out;
+ }
+
+ genlmsg_multicast(&net_drop_monitor_family, msg, 0, 0, GFP_KERNEL);
+
+out:
+ kfree(hw_entries);
+}
+
+static void
+net_dm_hw_summary_probe(struct sk_buff *skb,
+ const struct net_dm_hw_metadata *hw_metadata)
+{
+ struct net_dm_hw_entries *hw_entries;
+ struct net_dm_hw_entry *hw_entry;
+ struct per_cpu_dm_data *hw_data;
+ unsigned long flags;
+ int i;
+
+ hw_data = this_cpu_ptr(&dm_hw_cpu_data);
+ spin_lock_irqsave(&hw_data->lock, flags);
+ hw_entries = hw_data->hw_entries;
+
+ if (!hw_entries)
+ goto out;
+
+ for (i = 0; i < hw_entries->num_entries; i++) {
+ hw_entry = &hw_entries->entries[i];
+ if (!strncmp(hw_entry->trap_name, hw_metadata->trap_name,
+ NET_DM_MAX_HW_TRAP_NAME_LEN - 1)) {
+ hw_entry->count++;
+ goto out;
}
+ }
+ if (WARN_ON_ONCE(hw_entries->num_entries == dm_hit_limit))
+ goto out;
- rc |= register_trace_kfree_skb(trace_kfree_skb_hit, NULL);
- rc |= register_trace_napi_poll(trace_napi_poll_hit, NULL);
- break;
+ hw_entry = &hw_entries->entries[hw_entries->num_entries];
+ strlcpy(hw_entry->trap_name, hw_metadata->trap_name,
+ NET_DM_MAX_HW_TRAP_NAME_LEN - 1);
+ hw_entry->count = 1;
+ hw_entries->num_entries++;
- case TRACE_OFF:
- rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit, NULL);
- rc |= unregister_trace_napi_poll(trace_napi_poll_hit, NULL);
+ if (!timer_pending(&hw_data->send_timer)) {
+ hw_data->send_timer.expires = jiffies + dm_delay * HZ;
+ add_timer(&hw_data->send_timer);
+ }
- tracepoint_synchronize_unregister();
+out:
+ spin_unlock_irqrestore(&hw_data->lock, flags);
+}
- /*
- * Clean the device list
+static const struct net_dm_alert_ops net_dm_alert_summary_ops = {
+ .kfree_skb_probe = trace_kfree_skb_hit,
+ .napi_poll_probe = trace_napi_poll_hit,
+ .work_item_func = send_dm_alert,
+ .hw_work_item_func = net_dm_hw_summary_work,
+ .hw_probe = net_dm_hw_summary_probe,
+};
+
+static void net_dm_packet_trace_kfree_skb_hit(void *ignore,
+ struct sk_buff *skb,
+ void *location)
+{
+ ktime_t tstamp = ktime_get_real();
+ struct per_cpu_dm_data *data;
+ struct sk_buff *nskb;
+ unsigned long flags;
+
+ if (!skb_mac_header_was_set(skb))
+ return;
+
+ nskb = skb_clone(skb, GFP_ATOMIC);
+ if (!nskb)
+ return;
+
+ NET_DM_SKB_CB(nskb)->pc = location;
+ /* Override the timestamp because we care about the time when the
+ * packet was dropped.
+ */
+ nskb->tstamp = tstamp;
+
+ data = this_cpu_ptr(&dm_cpu_data);
+
+ spin_lock_irqsave(&data->drop_queue.lock, flags);
+ if (skb_queue_len(&data->drop_queue) < net_dm_queue_len)
+ __skb_queue_tail(&data->drop_queue, nskb);
+ else
+ goto unlock_free;
+ spin_unlock_irqrestore(&data->drop_queue.lock, flags);
+
+ schedule_work(&data->dm_alert_work);
+
+ return;
+
+unlock_free:
+ spin_unlock_irqrestore(&data->drop_queue.lock, flags);
+ u64_stats_update_begin(&data->stats.syncp);
+ data->stats.dropped++;
+ u64_stats_update_end(&data->stats.syncp);
+ consume_skb(nskb);
+}
+
+static void net_dm_packet_trace_napi_poll_hit(void *ignore,
+ struct napi_struct *napi,
+ int work, int budget)
+{
+}
+
+static size_t net_dm_in_port_size(void)
+{
+ /* NET_DM_ATTR_IN_PORT nest */
+ return nla_total_size(0) +
+ /* NET_DM_ATTR_PORT_NETDEV_IFINDEX */
+ nla_total_size(sizeof(u32)) +
+ /* NET_DM_ATTR_PORT_NETDEV_NAME */
+ nla_total_size(IFNAMSIZ + 1);
+}
+
+#define NET_DM_MAX_SYMBOL_LEN 40
+
+static size_t net_dm_packet_report_size(size_t payload_len)
+{
+ size_t size;
+
+ size = nlmsg_msg_size(GENL_HDRLEN + net_drop_monitor_family.hdrsize);
+
+ return NLMSG_ALIGN(size) +
+ /* NET_DM_ATTR_ORIGIN */
+ nla_total_size(sizeof(u16)) +
+ /* NET_DM_ATTR_PC */
+ nla_total_size(sizeof(u64)) +
+ /* NET_DM_ATTR_SYMBOL */
+ nla_total_size(NET_DM_MAX_SYMBOL_LEN + 1) +
+ /* NET_DM_ATTR_IN_PORT */
+ net_dm_in_port_size() +
+ /* NET_DM_ATTR_TIMESTAMP */
+ nla_total_size(sizeof(u64)) +
+ /* NET_DM_ATTR_ORIG_LEN */
+ nla_total_size(sizeof(u32)) +
+ /* NET_DM_ATTR_PROTO */
+ nla_total_size(sizeof(u16)) +
+ /* NET_DM_ATTR_PAYLOAD */
+ nla_total_size(payload_len);
+}
+
+static int net_dm_packet_report_in_port_put(struct sk_buff *msg, int ifindex,
+ const char *name)
+{
+ struct nlattr *attr;
+
+ attr = nla_nest_start(msg, NET_DM_ATTR_IN_PORT);
+ if (!attr)
+ return -EMSGSIZE;
+
+ if (ifindex &&
+ nla_put_u32(msg, NET_DM_ATTR_PORT_NETDEV_IFINDEX, ifindex))
+ goto nla_put_failure;
+
+ if (name && nla_put_string(msg, NET_DM_ATTR_PORT_NETDEV_NAME, name))
+ goto nla_put_failure;
+
+ nla_nest_end(msg, attr);
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(msg, attr);
+ return -EMSGSIZE;
+}
+
+static int net_dm_packet_report_fill(struct sk_buff *msg, struct sk_buff *skb,
+ size_t payload_len)
+{
+ u64 pc = (u64)(uintptr_t) NET_DM_SKB_CB(skb)->pc;
+ char buf[NET_DM_MAX_SYMBOL_LEN];
+ struct nlattr *attr;
+ void *hdr;
+ int rc;
+
+ hdr = genlmsg_put(msg, 0, 0, &net_drop_monitor_family, 0,
+ NET_DM_CMD_PACKET_ALERT);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (nla_put_u16(msg, NET_DM_ATTR_ORIGIN, NET_DM_ORIGIN_SW))
+ goto nla_put_failure;
+
+ if (nla_put_u64_64bit(msg, NET_DM_ATTR_PC, pc, NET_DM_ATTR_PAD))
+ goto nla_put_failure;
+
+ snprintf(buf, sizeof(buf), "%pS", NET_DM_SKB_CB(skb)->pc);
+ if (nla_put_string(msg, NET_DM_ATTR_SYMBOL, buf))
+ goto nla_put_failure;
+
+ rc = net_dm_packet_report_in_port_put(msg, skb->skb_iif, NULL);
+ if (rc)
+ goto nla_put_failure;
+
+ if (nla_put_u64_64bit(msg, NET_DM_ATTR_TIMESTAMP,
+ ktime_to_ns(skb->tstamp), NET_DM_ATTR_PAD))
+ goto nla_put_failure;
+
+ if (nla_put_u32(msg, NET_DM_ATTR_ORIG_LEN, skb->len))
+ goto nla_put_failure;
+
+ if (!payload_len)
+ goto out;
+
+ if (nla_put_u16(msg, NET_DM_ATTR_PROTO, be16_to_cpu(skb->protocol)))
+ goto nla_put_failure;
+
+ attr = skb_put(msg, nla_total_size(payload_len));
+ attr->nla_type = NET_DM_ATTR_PAYLOAD;
+ attr->nla_len = nla_attr_size(payload_len);
+ if (skb_copy_bits(skb, 0, nla_data(attr), payload_len))
+ goto nla_put_failure;
+
+out:
+ genlmsg_end(msg, hdr);
+
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+#define NET_DM_MAX_PACKET_SIZE (0xffff - NLA_HDRLEN - NLA_ALIGNTO)
+
+static void net_dm_packet_report(struct sk_buff *skb)
+{
+ struct sk_buff *msg;
+ size_t payload_len;
+ int rc;
+
+ /* Make sure we start copying the packet from the MAC header */
+ if (skb->data > skb_mac_header(skb))
+ skb_push(skb, skb->data - skb_mac_header(skb));
+ else
+ skb_pull(skb, skb_mac_header(skb) - skb->data);
+
+ /* Ensure packet fits inside a single netlink attribute */
+ payload_len = min_t(size_t, skb->len, NET_DM_MAX_PACKET_SIZE);
+ if (net_dm_trunc_len)
+ payload_len = min_t(size_t, net_dm_trunc_len, payload_len);
+
+ msg = nlmsg_new(net_dm_packet_report_size(payload_len), GFP_KERNEL);
+ if (!msg)
+ goto out;
+
+ rc = net_dm_packet_report_fill(msg, skb, payload_len);
+ if (rc) {
+ nlmsg_free(msg);
+ goto out;
+ }
+
+ genlmsg_multicast(&net_drop_monitor_family, msg, 0, 0, GFP_KERNEL);
+
+out:
+ consume_skb(skb);
+}
+
+static void net_dm_packet_work(struct work_struct *work)
+{
+ struct per_cpu_dm_data *data;
+ struct sk_buff_head list;
+ struct sk_buff *skb;
+ unsigned long flags;
+
+ data = container_of(work, struct per_cpu_dm_data, dm_alert_work);
+
+ __skb_queue_head_init(&list);
+
+ spin_lock_irqsave(&data->drop_queue.lock, flags);
+ skb_queue_splice_tail_init(&data->drop_queue, &list);
+ spin_unlock_irqrestore(&data->drop_queue.lock, flags);
+
+ while ((skb = __skb_dequeue(&list)))
+ net_dm_packet_report(skb);
+}
+
+static size_t
+net_dm_hw_packet_report_size(size_t payload_len,
+ const struct net_dm_hw_metadata *hw_metadata)
+{
+ size_t size;
+
+ size = nlmsg_msg_size(GENL_HDRLEN + net_drop_monitor_family.hdrsize);
+
+ return NLMSG_ALIGN(size) +
+ /* NET_DM_ATTR_ORIGIN */
+ nla_total_size(sizeof(u16)) +
+ /* NET_DM_ATTR_HW_TRAP_GROUP_NAME */
+ nla_total_size(strlen(hw_metadata->trap_group_name) + 1) +
+ /* NET_DM_ATTR_HW_TRAP_NAME */
+ nla_total_size(strlen(hw_metadata->trap_name) + 1) +
+ /* NET_DM_ATTR_IN_PORT */
+ net_dm_in_port_size() +
+ /* NET_DM_ATTR_TIMESTAMP */
+ nla_total_size(sizeof(u64)) +
+ /* NET_DM_ATTR_ORIG_LEN */
+ nla_total_size(sizeof(u32)) +
+ /* NET_DM_ATTR_PROTO */
+ nla_total_size(sizeof(u16)) +
+ /* NET_DM_ATTR_PAYLOAD */
+ nla_total_size(payload_len);
+}
+
+static int net_dm_hw_packet_report_fill(struct sk_buff *msg,
+ struct sk_buff *skb, size_t payload_len)
+{
+ struct net_dm_hw_metadata *hw_metadata;
+ struct nlattr *attr;
+ void *hdr;
+
+ hw_metadata = NET_DM_SKB_CB(skb)->hw_metadata;
+
+ hdr = genlmsg_put(msg, 0, 0, &net_drop_monitor_family, 0,
+ NET_DM_CMD_PACKET_ALERT);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (nla_put_u16(msg, NET_DM_ATTR_ORIGIN, NET_DM_ORIGIN_HW))
+ goto nla_put_failure;
+
+ if (nla_put_string(msg, NET_DM_ATTR_HW_TRAP_GROUP_NAME,
+ hw_metadata->trap_group_name))
+ goto nla_put_failure;
+
+ if (nla_put_string(msg, NET_DM_ATTR_HW_TRAP_NAME,
+ hw_metadata->trap_name))
+ goto nla_put_failure;
+
+ if (hw_metadata->input_dev) {
+ struct net_device *dev = hw_metadata->input_dev;
+ int rc;
+
+ rc = net_dm_packet_report_in_port_put(msg, dev->ifindex,
+ dev->name);
+ if (rc)
+ goto nla_put_failure;
+ }
+
+ if (nla_put_u64_64bit(msg, NET_DM_ATTR_TIMESTAMP,
+ ktime_to_ns(skb->tstamp), NET_DM_ATTR_PAD))
+ goto nla_put_failure;
+
+ if (nla_put_u32(msg, NET_DM_ATTR_ORIG_LEN, skb->len))
+ goto nla_put_failure;
+
+ if (!payload_len)
+ goto out;
+
+ if (nla_put_u16(msg, NET_DM_ATTR_PROTO, be16_to_cpu(skb->protocol)))
+ goto nla_put_failure;
+
+ attr = skb_put(msg, nla_total_size(payload_len));
+ attr->nla_type = NET_DM_ATTR_PAYLOAD;
+ attr->nla_len = nla_attr_size(payload_len);
+ if (skb_copy_bits(skb, 0, nla_data(attr), payload_len))
+ goto nla_put_failure;
+
+out:
+ genlmsg_end(msg, hdr);
+
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static struct net_dm_hw_metadata *
+net_dm_hw_metadata_clone(const struct net_dm_hw_metadata *hw_metadata)
+{
+ struct net_dm_hw_metadata *n_hw_metadata;
+ const char *trap_group_name;
+ const char *trap_name;
+
+ n_hw_metadata = kmalloc(sizeof(*hw_metadata), GFP_ATOMIC);
+ if (!n_hw_metadata)
+ return NULL;
+
+ trap_group_name = kstrdup(hw_metadata->trap_group_name, GFP_ATOMIC);
+ if (!trap_group_name)
+ goto free_hw_metadata;
+ n_hw_metadata->trap_group_name = trap_group_name;
+
+ trap_name = kstrdup(hw_metadata->trap_name, GFP_ATOMIC);
+ if (!trap_name)
+ goto free_trap_group;
+ n_hw_metadata->trap_name = trap_name;
+
+ n_hw_metadata->input_dev = hw_metadata->input_dev;
+ if (n_hw_metadata->input_dev)
+ dev_hold(n_hw_metadata->input_dev);
+
+ return n_hw_metadata;
+
+free_trap_group:
+ kfree(trap_group_name);
+free_hw_metadata:
+ kfree(n_hw_metadata);
+ return NULL;
+}
+
+static void
+net_dm_hw_metadata_free(const struct net_dm_hw_metadata *hw_metadata)
+{
+ if (hw_metadata->input_dev)
+ dev_put(hw_metadata->input_dev);
+ kfree(hw_metadata->trap_name);
+ kfree(hw_metadata->trap_group_name);
+ kfree(hw_metadata);
+}
+
+static void net_dm_hw_packet_report(struct sk_buff *skb)
+{
+ struct net_dm_hw_metadata *hw_metadata;
+ struct sk_buff *msg;
+ size_t payload_len;
+ int rc;
+
+ if (skb->data > skb_mac_header(skb))
+ skb_push(skb, skb->data - skb_mac_header(skb));
+ else
+ skb_pull(skb, skb_mac_header(skb) - skb->data);
+
+ payload_len = min_t(size_t, skb->len, NET_DM_MAX_PACKET_SIZE);
+ if (net_dm_trunc_len)
+ payload_len = min_t(size_t, net_dm_trunc_len, payload_len);
+
+ hw_metadata = NET_DM_SKB_CB(skb)->hw_metadata;
+ msg = nlmsg_new(net_dm_hw_packet_report_size(payload_len, hw_metadata),
+ GFP_KERNEL);
+ if (!msg)
+ goto out;
+
+ rc = net_dm_hw_packet_report_fill(msg, skb, payload_len);
+ if (rc) {
+ nlmsg_free(msg);
+ goto out;
+ }
+
+ genlmsg_multicast(&net_drop_monitor_family, msg, 0, 0, GFP_KERNEL);
+
+out:
+ net_dm_hw_metadata_free(NET_DM_SKB_CB(skb)->hw_metadata);
+ consume_skb(skb);
+}
+
+static void net_dm_hw_packet_work(struct work_struct *work)
+{
+ struct per_cpu_dm_data *hw_data;
+ struct sk_buff_head list;
+ struct sk_buff *skb;
+ unsigned long flags;
+
+ hw_data = container_of(work, struct per_cpu_dm_data, dm_alert_work);
+
+ __skb_queue_head_init(&list);
+
+ spin_lock_irqsave(&hw_data->drop_queue.lock, flags);
+ skb_queue_splice_tail_init(&hw_data->drop_queue, &list);
+ spin_unlock_irqrestore(&hw_data->drop_queue.lock, flags);
+
+ while ((skb = __skb_dequeue(&list)))
+ net_dm_hw_packet_report(skb);
+}
+
+static void
+net_dm_hw_packet_probe(struct sk_buff *skb,
+ const struct net_dm_hw_metadata *hw_metadata)
+{
+ struct net_dm_hw_metadata *n_hw_metadata;
+ ktime_t tstamp = ktime_get_real();
+ struct per_cpu_dm_data *hw_data;
+ struct sk_buff *nskb;
+ unsigned long flags;
+
+ if (!skb_mac_header_was_set(skb))
+ return;
+
+ nskb = skb_clone(skb, GFP_ATOMIC);
+ if (!nskb)
+ return;
+
+ n_hw_metadata = net_dm_hw_metadata_clone(hw_metadata);
+ if (!n_hw_metadata)
+ goto free;
+
+ NET_DM_SKB_CB(nskb)->hw_metadata = n_hw_metadata;
+ nskb->tstamp = tstamp;
+
+ hw_data = this_cpu_ptr(&dm_hw_cpu_data);
+
+ spin_lock_irqsave(&hw_data->drop_queue.lock, flags);
+ if (skb_queue_len(&hw_data->drop_queue) < net_dm_queue_len)
+ __skb_queue_tail(&hw_data->drop_queue, nskb);
+ else
+ goto unlock_free;
+ spin_unlock_irqrestore(&hw_data->drop_queue.lock, flags);
+
+ schedule_work(&hw_data->dm_alert_work);
+
+ return;
+
+unlock_free:
+ spin_unlock_irqrestore(&hw_data->drop_queue.lock, flags);
+ u64_stats_update_begin(&hw_data->stats.syncp);
+ hw_data->stats.dropped++;
+ u64_stats_update_end(&hw_data->stats.syncp);
+ net_dm_hw_metadata_free(n_hw_metadata);
+free:
+ consume_skb(nskb);
+}
+
+static const struct net_dm_alert_ops net_dm_alert_packet_ops = {
+ .kfree_skb_probe = net_dm_packet_trace_kfree_skb_hit,
+ .napi_poll_probe = net_dm_packet_trace_napi_poll_hit,
+ .work_item_func = net_dm_packet_work,
+ .hw_work_item_func = net_dm_hw_packet_work,
+ .hw_probe = net_dm_hw_packet_probe,
+};
+
+static const struct net_dm_alert_ops *net_dm_alert_ops_arr[] = {
+ [NET_DM_ALERT_MODE_SUMMARY] = &net_dm_alert_summary_ops,
+ [NET_DM_ALERT_MODE_PACKET] = &net_dm_alert_packet_ops,
+};
+
+void net_dm_hw_report(struct sk_buff *skb,
+ const struct net_dm_hw_metadata *hw_metadata)
+{
+ rcu_read_lock();
+
+ if (!monitor_hw)
+ goto out;
+
+ net_dm_alert_ops_arr[net_dm_alert_mode]->hw_probe(skb, hw_metadata);
+
+out:
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(net_dm_hw_report);
+
+static int net_dm_hw_monitor_start(struct netlink_ext_ack *extack)
+{
+ const struct net_dm_alert_ops *ops;
+ int cpu;
+
+ if (monitor_hw) {
+ NL_SET_ERR_MSG_MOD(extack, "Hardware monitoring already enabled");
+ return -EAGAIN;
+ }
+
+ ops = net_dm_alert_ops_arr[net_dm_alert_mode];
+
+ if (!try_module_get(THIS_MODULE)) {
+ NL_SET_ERR_MSG_MOD(extack, "Failed to take reference on module");
+ return -ENODEV;
+ }
+
+ for_each_possible_cpu(cpu) {
+ struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu);
+ struct net_dm_hw_entries *hw_entries;
+
+ INIT_WORK(&hw_data->dm_alert_work, ops->hw_work_item_func);
+ timer_setup(&hw_data->send_timer, sched_send_work, 0);
+ hw_entries = net_dm_hw_reset_per_cpu_data(hw_data);
+ kfree(hw_entries);
+ }
+
+ monitor_hw = true;
+
+ return 0;
+}
+
+static void net_dm_hw_monitor_stop(struct netlink_ext_ack *extack)
+{
+ int cpu;
+
+ if (!monitor_hw) {
+ NL_SET_ERR_MSG_MOD(extack, "Hardware monitoring already disabled");
+ return;
+ }
+
+ monitor_hw = false;
+
+ /* After this call returns we are guaranteed that no CPU is processing
+ * any hardware drops.
+ */
+ synchronize_rcu();
+
+ for_each_possible_cpu(cpu) {
+ struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu);
+ struct sk_buff *skb;
+
+ del_timer_sync(&hw_data->send_timer);
+ cancel_work_sync(&hw_data->dm_alert_work);
+ while ((skb = __skb_dequeue(&hw_data->drop_queue))) {
+ struct net_dm_hw_metadata *hw_metadata;
+
+ hw_metadata = NET_DM_SKB_CB(skb)->hw_metadata;
+ net_dm_hw_metadata_free(hw_metadata);
+ consume_skb(skb);
+ }
+ }
+
+ module_put(THIS_MODULE);
+}
+
+static int net_dm_trace_on_set(struct netlink_ext_ack *extack)
+{
+ const struct net_dm_alert_ops *ops;
+ int cpu, rc;
+
+ ops = net_dm_alert_ops_arr[net_dm_alert_mode];
+
+ if (!try_module_get(THIS_MODULE)) {
+ NL_SET_ERR_MSG_MOD(extack, "Failed to take reference on module");
+ return -ENODEV;
+ }
+
+ for_each_possible_cpu(cpu) {
+ struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu);
+ struct sk_buff *skb;
+
+ INIT_WORK(&data->dm_alert_work, ops->work_item_func);
+ timer_setup(&data->send_timer, sched_send_work, 0);
+ /* Allocate a new per-CPU skb for the summary alert message and
+ * free the old one which might contain stale data from
+ * previous tracing.
*/
- list_for_each_entry_safe(new_stat, temp, &hw_stats_list, list) {
- if (new_stat->dev == NULL) {
- list_del_rcu(&new_stat->list);
- kfree_rcu(new_stat, rcu);
- }
+ skb = reset_per_cpu_data(data);
+ consume_skb(skb);
+ }
+
+ rc = register_trace_kfree_skb(ops->kfree_skb_probe, NULL);
+ if (rc) {
+ NL_SET_ERR_MSG_MOD(extack, "Failed to connect probe to kfree_skb() tracepoint");
+ goto err_module_put;
+ }
+
+ rc = register_trace_napi_poll(ops->napi_poll_probe, NULL);
+ if (rc) {
+ NL_SET_ERR_MSG_MOD(extack, "Failed to connect probe to napi_poll() tracepoint");
+ goto err_unregister_trace;
+ }
+
+ return 0;
+
+err_unregister_trace:
+ unregister_trace_kfree_skb(ops->kfree_skb_probe, NULL);
+err_module_put:
+ module_put(THIS_MODULE);
+ return rc;
+}
+
+static void net_dm_trace_off_set(void)
+{
+ struct dm_hw_stat_delta *new_stat, *temp;
+ const struct net_dm_alert_ops *ops;
+ int cpu;
+
+ ops = net_dm_alert_ops_arr[net_dm_alert_mode];
+
+ unregister_trace_napi_poll(ops->napi_poll_probe, NULL);
+ unregister_trace_kfree_skb(ops->kfree_skb_probe, NULL);
+
+ tracepoint_synchronize_unregister();
+
+ /* Make sure we do not send notifications to user space after request
+ * to stop tracing returns.
+ */
+ for_each_possible_cpu(cpu) {
+ struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu);
+ struct sk_buff *skb;
+
+ del_timer_sync(&data->send_timer);
+ cancel_work_sync(&data->dm_alert_work);
+ while ((skb = __skb_dequeue(&data->drop_queue)))
+ consume_skb(skb);
+ }
+
+ list_for_each_entry_safe(new_stat, temp, &hw_stats_list, list) {
+ if (new_stat->dev == NULL) {
+ list_del_rcu(&new_stat->list);
+ kfree_rcu(new_stat, rcu);
}
+ }
- module_put(THIS_MODULE);
+ module_put(THIS_MODULE);
+}
+
+static int set_all_monitor_traces(int state, struct netlink_ext_ack *extack)
+{
+ int rc = 0;
+ if (state == trace_state) {
+ NL_SET_ERR_MSG_MOD(extack, "Trace state already set to requested state");
+ return -EAGAIN;
+ }
+
+ switch (state) {
+ case TRACE_ON:
+ rc = net_dm_trace_on_set(extack);
+ break;
+ case TRACE_OFF:
+ net_dm_trace_off_set();
break;
default:
rc = 1;
@@ -288,30 +1139,331 @@ static int set_all_monitor_traces(int state)
else
rc = -EINPROGRESS;
-out_unlock:
- mutex_unlock(&trace_state_mutex);
-
return rc;
}
+static bool net_dm_is_monitoring(void)
+{
+ return trace_state == TRACE_ON || monitor_hw;
+}
+
+static int net_dm_alert_mode_get_from_info(struct genl_info *info,
+ enum net_dm_alert_mode *p_alert_mode)
+{
+ u8 val;
+
+ val = nla_get_u8(info->attrs[NET_DM_ATTR_ALERT_MODE]);
+
+ switch (val) {
+ case NET_DM_ALERT_MODE_SUMMARY: /* fall-through */
+ case NET_DM_ALERT_MODE_PACKET:
+ *p_alert_mode = val;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int net_dm_alert_mode_set(struct genl_info *info)
+{
+ struct netlink_ext_ack *extack = info->extack;
+ enum net_dm_alert_mode alert_mode;
+ int rc;
+
+ if (!info->attrs[NET_DM_ATTR_ALERT_MODE])
+ return 0;
+
+ rc = net_dm_alert_mode_get_from_info(info, &alert_mode);
+ if (rc) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid alert mode");
+ return -EINVAL;
+ }
+
+ net_dm_alert_mode = alert_mode;
+
+ return 0;
+}
+
+static void net_dm_trunc_len_set(struct genl_info *info)
+{
+ if (!info->attrs[NET_DM_ATTR_TRUNC_LEN])
+ return;
+
+ net_dm_trunc_len = nla_get_u32(info->attrs[NET_DM_ATTR_TRUNC_LEN]);
+}
+
+static void net_dm_queue_len_set(struct genl_info *info)
+{
+ if (!info->attrs[NET_DM_ATTR_QUEUE_LEN])
+ return;
+
+ net_dm_queue_len = nla_get_u32(info->attrs[NET_DM_ATTR_QUEUE_LEN]);
+}
static int net_dm_cmd_config(struct sk_buff *skb,
struct genl_info *info)
{
- return -ENOTSUPP;
+ struct netlink_ext_ack *extack = info->extack;
+ int rc;
+
+ if (net_dm_is_monitoring()) {
+ NL_SET_ERR_MSG_MOD(extack, "Cannot configure drop monitor during monitoring");
+ return -EBUSY;
+ }
+
+ rc = net_dm_alert_mode_set(info);
+ if (rc)
+ return rc;
+
+ net_dm_trunc_len_set(info);
+
+ net_dm_queue_len_set(info);
+
+ return 0;
+}
+
+static int net_dm_monitor_start(bool set_sw, bool set_hw,
+ struct netlink_ext_ack *extack)
+{
+ bool sw_set = false;
+ int rc;
+
+ if (set_sw) {
+ rc = set_all_monitor_traces(TRACE_ON, extack);
+ if (rc)
+ return rc;
+ sw_set = true;
+ }
+
+ if (set_hw) {
+ rc = net_dm_hw_monitor_start(extack);
+ if (rc)
+ goto err_monitor_hw;
+ }
+
+ return 0;
+
+err_monitor_hw:
+ if (sw_set)
+ set_all_monitor_traces(TRACE_OFF, extack);
+ return rc;
+}
+
+static void net_dm_monitor_stop(bool set_sw, bool set_hw,
+ struct netlink_ext_ack *extack)
+{
+ if (set_hw)
+ net_dm_hw_monitor_stop(extack);
+ if (set_sw)
+ set_all_monitor_traces(TRACE_OFF, extack);
}
static int net_dm_cmd_trace(struct sk_buff *skb,
struct genl_info *info)
{
+ bool set_sw = !!info->attrs[NET_DM_ATTR_SW_DROPS];
+ bool set_hw = !!info->attrs[NET_DM_ATTR_HW_DROPS];
+ struct netlink_ext_ack *extack = info->extack;
+
+ /* To maintain backward compatibility, we start / stop monitoring of
+ * software drops if no flag is specified.
+ */
+ if (!set_sw && !set_hw)
+ set_sw = true;
+
switch (info->genlhdr->cmd) {
case NET_DM_CMD_START:
- return set_all_monitor_traces(TRACE_ON);
+ return net_dm_monitor_start(set_sw, set_hw, extack);
case NET_DM_CMD_STOP:
- return set_all_monitor_traces(TRACE_OFF);
+ net_dm_monitor_stop(set_sw, set_hw, extack);
+ return 0;
}
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
+}
+
+static int net_dm_config_fill(struct sk_buff *msg, struct genl_info *info)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq,
+ &net_drop_monitor_family, 0, NET_DM_CMD_CONFIG_NEW);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (nla_put_u8(msg, NET_DM_ATTR_ALERT_MODE, net_dm_alert_mode))
+ goto nla_put_failure;
+
+ if (nla_put_u32(msg, NET_DM_ATTR_TRUNC_LEN, net_dm_trunc_len))
+ goto nla_put_failure;
+
+ if (nla_put_u32(msg, NET_DM_ATTR_QUEUE_LEN, net_dm_queue_len))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static int net_dm_cmd_config_get(struct sk_buff *skb, struct genl_info *info)
+{
+ struct sk_buff *msg;
+ int rc;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ rc = net_dm_config_fill(msg, info);
+ if (rc)
+ goto free_msg;
+
+ return genlmsg_reply(msg, info);
+
+free_msg:
+ nlmsg_free(msg);
+ return rc;
+}
+
+static void net_dm_stats_read(struct net_dm_stats *stats)
+{
+ int cpu;
+
+ memset(stats, 0, sizeof(*stats));
+ for_each_possible_cpu(cpu) {
+ struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu);
+ struct net_dm_stats *cpu_stats = &data->stats;
+ unsigned int start;
+ u64 dropped;
+
+ do {
+ start = u64_stats_fetch_begin_irq(&cpu_stats->syncp);
+ dropped = cpu_stats->dropped;
+ } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start));
+
+ stats->dropped += dropped;
+ }
+}
+
+static int net_dm_stats_put(struct sk_buff *msg)
+{
+ struct net_dm_stats stats;
+ struct nlattr *attr;
+
+ net_dm_stats_read(&stats);
+
+ attr = nla_nest_start(msg, NET_DM_ATTR_STATS);
+ if (!attr)
+ return -EMSGSIZE;
+
+ if (nla_put_u64_64bit(msg, NET_DM_ATTR_STATS_DROPPED,
+ stats.dropped, NET_DM_ATTR_PAD))
+ goto nla_put_failure;
+
+ nla_nest_end(msg, attr);
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(msg, attr);
+ return -EMSGSIZE;
+}
+
+static void net_dm_hw_stats_read(struct net_dm_stats *stats)
+{
+ int cpu;
+
+ memset(stats, 0, sizeof(*stats));
+ for_each_possible_cpu(cpu) {
+ struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu);
+ struct net_dm_stats *cpu_stats = &hw_data->stats;
+ unsigned int start;
+ u64 dropped;
+
+ do {
+ start = u64_stats_fetch_begin_irq(&cpu_stats->syncp);
+ dropped = cpu_stats->dropped;
+ } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start));
+
+ stats->dropped += dropped;
+ }
+}
+
+static int net_dm_hw_stats_put(struct sk_buff *msg)
+{
+ struct net_dm_stats stats;
+ struct nlattr *attr;
+
+ net_dm_hw_stats_read(&stats);
+
+ attr = nla_nest_start(msg, NET_DM_ATTR_HW_STATS);
+ if (!attr)
+ return -EMSGSIZE;
+
+ if (nla_put_u64_64bit(msg, NET_DM_ATTR_STATS_DROPPED,
+ stats.dropped, NET_DM_ATTR_PAD))
+ goto nla_put_failure;
+
+ nla_nest_end(msg, attr);
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(msg, attr);
+ return -EMSGSIZE;
+}
+
+static int net_dm_stats_fill(struct sk_buff *msg, struct genl_info *info)
+{
+ void *hdr;
+ int rc;
+
+ hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq,
+ &net_drop_monitor_family, 0, NET_DM_CMD_STATS_NEW);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ rc = net_dm_stats_put(msg);
+ if (rc)
+ goto nla_put_failure;
+
+ rc = net_dm_hw_stats_put(msg);
+ if (rc)
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static int net_dm_cmd_stats_get(struct sk_buff *skb, struct genl_info *info)
+{
+ struct sk_buff *msg;
+ int rc;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ rc = net_dm_stats_fill(msg, info);
+ if (rc)
+ goto free_msg;
+
+ return genlmsg_reply(msg, info);
+
+free_msg:
+ nlmsg_free(msg);
+ return rc;
}
static int dropmon_net_event(struct notifier_block *ev_block,
@@ -330,12 +1482,12 @@ static int dropmon_net_event(struct notifier_block *ev_block,
new_stat->dev = dev;
new_stat->last_rx = jiffies;
- mutex_lock(&trace_state_mutex);
+ mutex_lock(&net_dm_mutex);
list_add_rcu(&new_stat->list, &hw_stats_list);
- mutex_unlock(&trace_state_mutex);
+ mutex_unlock(&net_dm_mutex);
break;
case NETDEV_UNREGISTER:
- mutex_lock(&trace_state_mutex);
+ mutex_lock(&net_dm_mutex);
list_for_each_entry_safe(new_stat, tmp, &hw_stats_list, list) {
if (new_stat->dev == dev) {
new_stat->dev = NULL;
@@ -346,18 +1498,28 @@ static int dropmon_net_event(struct notifier_block *ev_block,
}
}
}
- mutex_unlock(&trace_state_mutex);
+ mutex_unlock(&net_dm_mutex);
break;
}
out:
return NOTIFY_DONE;
}
+static const struct nla_policy net_dm_nl_policy[NET_DM_ATTR_MAX + 1] = {
+ [NET_DM_ATTR_UNSPEC] = { .strict_start_type = NET_DM_ATTR_UNSPEC + 1 },
+ [NET_DM_ATTR_ALERT_MODE] = { .type = NLA_U8 },
+ [NET_DM_ATTR_TRUNC_LEN] = { .type = NLA_U32 },
+ [NET_DM_ATTR_QUEUE_LEN] = { .type = NLA_U32 },
+ [NET_DM_ATTR_SW_DROPS] = {. type = NLA_FLAG },
+ [NET_DM_ATTR_HW_DROPS] = {. type = NLA_FLAG },
+};
+
static const struct genl_ops dropmon_ops[] = {
{
.cmd = NET_DM_CMD_CONFIG,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = net_dm_cmd_config,
+ .flags = GENL_ADMIN_PERM,
},
{
.cmd = NET_DM_CMD_START,
@@ -369,12 +1531,38 @@ static const struct genl_ops dropmon_ops[] = {
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = net_dm_cmd_trace,
},
+ {
+ .cmd = NET_DM_CMD_CONFIG_GET,
+ .doit = net_dm_cmd_config_get,
+ },
+ {
+ .cmd = NET_DM_CMD_STATS_GET,
+ .doit = net_dm_cmd_stats_get,
+ },
};
+static int net_dm_nl_pre_doit(const struct genl_ops *ops,
+ struct sk_buff *skb, struct genl_info *info)
+{
+ mutex_lock(&net_dm_mutex);
+
+ return 0;
+}
+
+static void net_dm_nl_post_doit(const struct genl_ops *ops,
+ struct sk_buff *skb, struct genl_info *info)
+{
+ mutex_unlock(&net_dm_mutex);
+}
+
static struct genl_family net_drop_monitor_family __ro_after_init = {
.hdrsize = 0,
.name = "NET_DM",
.version = 2,
+ .maxattr = NET_DM_ATTR_MAX,
+ .policy = net_dm_nl_policy,
+ .pre_doit = net_dm_nl_pre_doit,
+ .post_doit = net_dm_nl_post_doit,
.module = THIS_MODULE,
.ops = dropmon_ops,
.n_ops = ARRAY_SIZE(dropmon_ops),
@@ -386,9 +1574,57 @@ static struct notifier_block dropmon_net_notifier = {
.notifier_call = dropmon_net_event
};
-static int __init init_net_drop_monitor(void)
+static void __net_dm_cpu_data_init(struct per_cpu_dm_data *data)
+{
+ spin_lock_init(&data->lock);
+ skb_queue_head_init(&data->drop_queue);
+ u64_stats_init(&data->stats.syncp);
+}
+
+static void __net_dm_cpu_data_fini(struct per_cpu_dm_data *data)
+{
+ WARN_ON(!skb_queue_empty(&data->drop_queue));
+}
+
+static void net_dm_cpu_data_init(int cpu)
+{
+ struct per_cpu_dm_data *data;
+
+ data = &per_cpu(dm_cpu_data, cpu);
+ __net_dm_cpu_data_init(data);
+}
+
+static void net_dm_cpu_data_fini(int cpu)
{
struct per_cpu_dm_data *data;
+
+ data = &per_cpu(dm_cpu_data, cpu);
+ /* At this point, we should have exclusive access
+ * to this struct and can free the skb inside it.
+ */
+ consume_skb(data->skb);
+ __net_dm_cpu_data_fini(data);
+}
+
+static void net_dm_hw_cpu_data_init(int cpu)
+{
+ struct per_cpu_dm_data *hw_data;
+
+ hw_data = &per_cpu(dm_hw_cpu_data, cpu);
+ __net_dm_cpu_data_init(hw_data);
+}
+
+static void net_dm_hw_cpu_data_fini(int cpu)
+{
+ struct per_cpu_dm_data *hw_data;
+
+ hw_data = &per_cpu(dm_hw_cpu_data, cpu);
+ kfree(hw_data->hw_entries);
+ __net_dm_cpu_data_fini(hw_data);
+}
+
+static int __init init_net_drop_monitor(void)
+{
int cpu, rc;
pr_info("Initializing network drop monitor service\n");
@@ -414,14 +1650,10 @@ static int __init init_net_drop_monitor(void)
rc = 0;
for_each_possible_cpu(cpu) {
- data = &per_cpu(dm_cpu_data, cpu);
- INIT_WORK(&data->dm_alert_work, send_dm_alert);
- timer_setup(&data->send_timer, sched_send_work, 0);
- spin_lock_init(&data->lock);
- reset_per_cpu_data(data);
+ net_dm_cpu_data_init(cpu);
+ net_dm_hw_cpu_data_init(cpu);
}
-
goto out;
out_unreg:
@@ -432,7 +1664,6 @@ out:
static void exit_net_drop_monitor(void)
{
- struct per_cpu_dm_data *data;
int cpu;
BUG_ON(unregister_netdevice_notifier(&dropmon_net_notifier));
@@ -440,19 +1671,11 @@ static void exit_net_drop_monitor(void)
/*
* Because of the module_get/put we do in the trace state change path
* we are guarnateed not to have any current users when we get here
- * all we need to do is make sure that we don't have any running timers
- * or pending schedule calls
*/
for_each_possible_cpu(cpu) {
- data = &per_cpu(dm_cpu_data, cpu);
- del_timer_sync(&data->send_timer);
- cancel_work_sync(&data->dm_alert_work);
- /*
- * At this point, we should have exclusive access
- * to this struct and can free the skb inside it
- */
- kfree_skb(data->skb);
+ net_dm_hw_cpu_data_fini(cpu);
+ net_dm_cpu_data_fini(cpu);
}
BUG_ON(genl_unregister_family(&net_drop_monitor_family));
diff --git a/net/core/dst.c b/net/core/dst.c
index 1325316d9eab..193af526e908 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -172,7 +172,7 @@ void dst_release(struct dst_entry *dst)
int newrefcnt;
newrefcnt = atomic_dec_return(&dst->__refcnt);
- if (unlikely(newrefcnt < 0))
+ if (WARN_ONCE(newrefcnt < 0, "dst_release underflow"))
net_warn_ratelimited("%s: dst:%p refcnt:%d\n",
__func__, dst, newrefcnt);
if (!newrefcnt)
@@ -187,7 +187,7 @@ void dst_release_immediate(struct dst_entry *dst)
int newrefcnt;
newrefcnt = atomic_dec_return(&dst->__refcnt);
- if (unlikely(newrefcnt < 0))
+ if (WARN_ONCE(newrefcnt < 0, "dst_release_immediate underflow"))
net_warn_ratelimited("%s: dst:%p refcnt:%d\n",
__func__, dst, newrefcnt);
if (!newrefcnt)
diff --git a/net/core/fib_notifier.c b/net/core/fib_notifier.c
index 13a40b831d6d..fc96259807b6 100644
--- a/net/core/fib_notifier.c
+++ b/net/core/fib_notifier.c
@@ -5,17 +5,22 @@
#include <linux/module.h>
#include <linux/init.h>
#include <net/net_namespace.h>
+#include <net/netns/generic.h>
#include <net/fib_notifier.h>
-static ATOMIC_NOTIFIER_HEAD(fib_chain);
+static unsigned int fib_notifier_net_id;
-int call_fib_notifier(struct notifier_block *nb, struct net *net,
+struct fib_notifier_net {
+ struct list_head fib_notifier_ops;
+ struct atomic_notifier_head fib_chain;
+};
+
+int call_fib_notifier(struct notifier_block *nb,
enum fib_event_type event_type,
struct fib_notifier_info *info)
{
int err;
- info->net = net;
err = nb->notifier_call(nb, event_type, info);
return notifier_to_errno(err);
}
@@ -24,115 +29,113 @@ EXPORT_SYMBOL(call_fib_notifier);
int call_fib_notifiers(struct net *net, enum fib_event_type event_type,
struct fib_notifier_info *info)
{
+ struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id);
int err;
- info->net = net;
- err = atomic_notifier_call_chain(&fib_chain, event_type, info);
+ err = atomic_notifier_call_chain(&fn_net->fib_chain, event_type, info);
return notifier_to_errno(err);
}
EXPORT_SYMBOL(call_fib_notifiers);
-static unsigned int fib_seq_sum(void)
+static unsigned int fib_seq_sum(struct net *net)
{
+ struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id);
struct fib_notifier_ops *ops;
unsigned int fib_seq = 0;
- struct net *net;
rtnl_lock();
- down_read(&net_rwsem);
- for_each_net(net) {
- rcu_read_lock();
- list_for_each_entry_rcu(ops, &net->fib_notifier_ops, list) {
- if (!try_module_get(ops->owner))
- continue;
- fib_seq += ops->fib_seq_read(net);
- module_put(ops->owner);
- }
- rcu_read_unlock();
+ rcu_read_lock();
+ list_for_each_entry_rcu(ops, &fn_net->fib_notifier_ops, list) {
+ if (!try_module_get(ops->owner))
+ continue;
+ fib_seq += ops->fib_seq_read(net);
+ module_put(ops->owner);
}
- up_read(&net_rwsem);
+ rcu_read_unlock();
rtnl_unlock();
return fib_seq;
}
-static int fib_net_dump(struct net *net, struct notifier_block *nb)
+static int fib_net_dump(struct net *net, struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
{
+ struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id);
struct fib_notifier_ops *ops;
+ int err = 0;
- list_for_each_entry_rcu(ops, &net->fib_notifier_ops, list) {
- int err;
-
+ rcu_read_lock();
+ list_for_each_entry_rcu(ops, &fn_net->fib_notifier_ops, list) {
if (!try_module_get(ops->owner))
continue;
- err = ops->fib_dump(net, nb);
+ err = ops->fib_dump(net, nb, extack);
module_put(ops->owner);
if (err)
- return err;
+ goto unlock;
}
- return 0;
+unlock:
+ rcu_read_unlock();
+
+ return err;
}
-static bool fib_dump_is_consistent(struct notifier_block *nb,
+static bool fib_dump_is_consistent(struct net *net, struct notifier_block *nb,
void (*cb)(struct notifier_block *nb),
unsigned int fib_seq)
{
- atomic_notifier_chain_register(&fib_chain, nb);
- if (fib_seq == fib_seq_sum())
+ struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id);
+
+ atomic_notifier_chain_register(&fn_net->fib_chain, nb);
+ if (fib_seq == fib_seq_sum(net))
return true;
- atomic_notifier_chain_unregister(&fib_chain, nb);
+ atomic_notifier_chain_unregister(&fn_net->fib_chain, nb);
if (cb)
cb(nb);
return false;
}
#define FIB_DUMP_MAX_RETRIES 5
-int register_fib_notifier(struct notifier_block *nb,
- void (*cb)(struct notifier_block *nb))
+int register_fib_notifier(struct net *net, struct notifier_block *nb,
+ void (*cb)(struct notifier_block *nb),
+ struct netlink_ext_ack *extack)
{
int retries = 0;
int err;
do {
- unsigned int fib_seq = fib_seq_sum();
- struct net *net;
-
- rcu_read_lock();
- for_each_net_rcu(net) {
- err = fib_net_dump(net, nb);
- if (err)
- goto err_fib_net_dump;
- }
- rcu_read_unlock();
-
- if (fib_dump_is_consistent(nb, cb, fib_seq))
+ unsigned int fib_seq = fib_seq_sum(net);
+
+ err = fib_net_dump(net, nb, extack);
+ if (err)
+ return err;
+
+ if (fib_dump_is_consistent(net, nb, cb, fib_seq))
return 0;
} while (++retries < FIB_DUMP_MAX_RETRIES);
return -EBUSY;
-
-err_fib_net_dump:
- rcu_read_unlock();
- return err;
}
EXPORT_SYMBOL(register_fib_notifier);
-int unregister_fib_notifier(struct notifier_block *nb)
+int unregister_fib_notifier(struct net *net, struct notifier_block *nb)
{
- return atomic_notifier_chain_unregister(&fib_chain, nb);
+ struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id);
+
+ return atomic_notifier_chain_unregister(&fn_net->fib_chain, nb);
}
EXPORT_SYMBOL(unregister_fib_notifier);
static int __fib_notifier_ops_register(struct fib_notifier_ops *ops,
struct net *net)
{
+ struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id);
struct fib_notifier_ops *o;
- list_for_each_entry(o, &net->fib_notifier_ops, list)
+ list_for_each_entry(o, &fn_net->fib_notifier_ops, list)
if (ops->family == o->family)
return -EEXIST;
- list_add_tail_rcu(&ops->list, &net->fib_notifier_ops);
+ list_add_tail_rcu(&ops->list, &fn_net->fib_notifier_ops);
return 0;
}
@@ -167,18 +170,25 @@ EXPORT_SYMBOL(fib_notifier_ops_unregister);
static int __net_init fib_notifier_net_init(struct net *net)
{
- INIT_LIST_HEAD(&net->fib_notifier_ops);
+ struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id);
+
+ INIT_LIST_HEAD(&fn_net->fib_notifier_ops);
+ ATOMIC_INIT_NOTIFIER_HEAD(&fn_net->fib_chain);
return 0;
}
static void __net_exit fib_notifier_net_exit(struct net *net)
{
- WARN_ON_ONCE(!list_empty(&net->fib_notifier_ops));
+ struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id);
+
+ WARN_ON_ONCE(!list_empty(&fn_net->fib_notifier_ops));
}
static struct pernet_operations fib_notifier_net_ops = {
.init = fib_notifier_net_init,
.exit = fib_notifier_net_exit,
+ .id = &fib_notifier_net_id,
+ .size = sizeof(struct fib_notifier_net),
};
static int __init fib_notifier_init(void)
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index dd220ce7ca7a..3e7e15278c46 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -321,16 +321,18 @@ out:
}
EXPORT_SYMBOL_GPL(fib_rules_lookup);
-static int call_fib_rule_notifier(struct notifier_block *nb, struct net *net,
+static int call_fib_rule_notifier(struct notifier_block *nb,
enum fib_event_type event_type,
- struct fib_rule *rule, int family)
+ struct fib_rule *rule, int family,
+ struct netlink_ext_ack *extack)
{
struct fib_rule_notifier_info info = {
.info.family = family,
+ .info.extack = extack,
.rule = rule,
};
- return call_fib_notifier(nb, net, event_type, &info.info);
+ return call_fib_notifier(nb, event_type, &info.info);
}
static int call_fib_rule_notifiers(struct net *net,
@@ -350,20 +352,25 @@ static int call_fib_rule_notifiers(struct net *net,
}
/* Called with rcu_read_lock() */
-int fib_rules_dump(struct net *net, struct notifier_block *nb, int family)
+int fib_rules_dump(struct net *net, struct notifier_block *nb, int family,
+ struct netlink_ext_ack *extack)
{
struct fib_rules_ops *ops;
struct fib_rule *rule;
+ int err = 0;
ops = lookup_rules_ops(net, family);
if (!ops)
return -EAFNOSUPPORT;
- list_for_each_entry_rcu(rule, &ops->rules_list, list)
- call_fib_rule_notifier(nb, net, FIB_EVENT_RULE_ADD, rule,
- family);
+ list_for_each_entry_rcu(rule, &ops->rules_list, list) {
+ err = call_fib_rule_notifier(nb, FIB_EVENT_RULE_ADD,
+ rule, family, extack);
+ if (err)
+ break;
+ }
rules_ops_put(ops);
- return 0;
+ return err;
}
EXPORT_SYMBOL_GPL(fib_rules_dump);
diff --git a/net/core/filter.c b/net/core/filter.c
index 7878f918b8c0..c180871e606d 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -274,7 +274,7 @@ static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg,
switch (skb_field) {
case SKF_AD_MARK:
- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);
+ BUILD_BUG_ON(sizeof_field(struct sk_buff, mark) != 4);
*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
offsetof(struct sk_buff, mark));
@@ -289,14 +289,14 @@ static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg,
break;
case SKF_AD_QUEUE:
- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2);
+ BUILD_BUG_ON(sizeof_field(struct sk_buff, queue_mapping) != 2);
*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
offsetof(struct sk_buff, queue_mapping));
break;
case SKF_AD_VLAN_TAG:
- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2);
+ BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_tci) != 2);
/* dst_reg = *(u16 *) (src_reg + offsetof(vlan_tci)) */
*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
@@ -322,7 +322,7 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
switch (fp->k) {
case SKF_AD_OFF + SKF_AD_PROTOCOL:
- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2);
+ BUILD_BUG_ON(sizeof_field(struct sk_buff, protocol) != 2);
/* A = *(u16 *) (CTX + offsetof(protocol)) */
*insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
@@ -338,8 +338,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
case SKF_AD_OFF + SKF_AD_IFINDEX:
case SKF_AD_OFF + SKF_AD_HATYPE:
- BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
- BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, type) != 2);
+ BUILD_BUG_ON(sizeof_field(struct net_device, ifindex) != 4);
+ BUILD_BUG_ON(sizeof_field(struct net_device, type) != 2);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
BPF_REG_TMP, BPF_REG_CTX,
@@ -361,7 +361,7 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
break;
case SKF_AD_OFF + SKF_AD_RXHASH:
- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4);
+ BUILD_BUG_ON(sizeof_field(struct sk_buff, hash) != 4);
*insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX,
offsetof(struct sk_buff, hash));
@@ -385,7 +385,7 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
break;
case SKF_AD_OFF + SKF_AD_VLAN_TPID:
- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2);
+ BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_proto) != 2);
/* A = *(u16 *) (CTX + offsetof(vlan_proto)) */
*insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
@@ -1573,7 +1573,7 @@ int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk)
return -EPERM;
prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER);
- if (IS_ERR(prog) && PTR_ERR(prog) == -EINVAL)
+ if (PTR_ERR(prog) == -EINVAL)
prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SK_REUSEPORT);
if (IS_ERR(prog))
return PTR_ERR(prog);
@@ -2055,6 +2055,7 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
}
skb->dev = dev;
+ skb->tstamp = 0;
dev_xmit_recursion_inc();
ret = dev_queue_xmit(skb);
@@ -2230,10 +2231,10 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
/* First find the starting scatterlist element */
i = msg->sg.start;
do {
+ offset += len;
len = sk_msg_elem(msg, i)->length;
if (start < offset + len)
break;
- offset += len;
sk_msg_iter_var_next(i);
} while (i != msg->sg.end);
@@ -2245,7 +2246,7 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
* account for the headroom.
*/
bytes_sg_total = start - offset + bytes;
- if (!msg->sg.copy[i] && bytes_sg_total <= len)
+ if (!test_bit(i, &msg->sg.copy) && bytes_sg_total <= len)
goto out;
/* At this point we need to linearize multiple scatterlist
@@ -2299,7 +2300,7 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
WARN_ON_ONCE(last_sge == first_sge);
shift = last_sge > first_sge ?
last_sge - first_sge - 1 :
- MAX_SKB_FRAGS - first_sge + last_sge - 1;
+ NR_MSG_FRAG_IDS - first_sge + last_sge - 1;
if (!shift)
goto out;
@@ -2308,8 +2309,8 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
do {
u32 move_from;
- if (i + shift >= MAX_MSG_FRAGS)
- move_from = i + shift - MAX_MSG_FRAGS;
+ if (i + shift >= NR_MSG_FRAG_IDS)
+ move_from = i + shift - NR_MSG_FRAG_IDS;
else
move_from = i + shift;
if (move_from == msg->sg.end)
@@ -2323,7 +2324,7 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
} while (1);
msg->sg.end = msg->sg.end - shift > msg->sg.end ?
- msg->sg.end - shift + MAX_MSG_FRAGS :
+ msg->sg.end - shift + NR_MSG_FRAG_IDS :
msg->sg.end - shift;
out:
msg->data = sg_virt(&msg->sg.data[first_sge]) + start - offset;
@@ -2345,7 +2346,7 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
u32, len, u64, flags)
{
struct scatterlist sge, nsge, nnsge, rsge = {0}, *psge;
- u32 new, i = 0, l, space, copy = 0, offset = 0;
+ u32 new, i = 0, l = 0, space, copy = 0, offset = 0;
u8 *raw, *to, *from;
struct page *page;
@@ -2355,11 +2356,11 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
/* First find the starting scatterlist element */
i = msg->sg.start;
do {
+ offset += l;
l = sk_msg_elem(msg, i)->length;
if (start < offset + l)
break;
- offset += l;
sk_msg_iter_var_next(i);
} while (i != msg->sg.end);
@@ -2414,6 +2415,7 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
sk_msg_iter_var_next(i);
sg_unmark_end(psge);
+ sg_unmark_end(&rsge);
sk_msg_iter_next(msg, end);
}
@@ -2450,7 +2452,7 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
/* Place newly allocated data buffer */
sk_mem_charge(msg->sk, len);
msg->sg.size += len;
- msg->sg.copy[new] = false;
+ __clear_bit(new, &msg->sg.copy);
sg_set_page(&msg->sg.data[new], page, len + copy, 0);
if (rsge.length) {
get_page(sg_page(&rsge));
@@ -2505,7 +2507,7 @@ static void sk_msg_shift_right(struct sk_msg *msg, int i)
BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
u32, len, u64, flags)
{
- u32 i = 0, l, space, offset = 0;
+ u32 i = 0, l = 0, space, offset = 0;
u64 last = start + len;
int pop;
@@ -2515,11 +2517,11 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
/* First find the starting scatterlist element */
i = msg->sg.start;
do {
+ offset += l;
l = sk_msg_elem(msg, i)->length;
if (start < offset + l)
break;
- offset += l;
sk_msg_iter_var_next(i);
} while (i != msg->sg.end);
@@ -3457,123 +3459,38 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = {
.arg2_type = ARG_ANYTHING,
};
-static int __bpf_tx_xdp(struct net_device *dev,
- struct bpf_map *map,
- struct xdp_buff *xdp,
- u32 index)
-{
- struct xdp_frame *xdpf;
- int err, sent;
-
- if (!dev->netdev_ops->ndo_xdp_xmit) {
- return -EOPNOTSUPP;
- }
-
- err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data);
- if (unlikely(err))
- return err;
-
- xdpf = convert_to_xdp_frame(xdp);
- if (unlikely(!xdpf))
- return -EOVERFLOW;
-
- sent = dev->netdev_ops->ndo_xdp_xmit(dev, 1, &xdpf, XDP_XMIT_FLUSH);
- if (sent <= 0)
- return sent;
- return 0;
-}
-
-static noinline int
-xdp_do_redirect_slow(struct net_device *dev, struct xdp_buff *xdp,
- struct bpf_prog *xdp_prog, struct bpf_redirect_info *ri)
-{
- struct net_device *fwd;
- u32 index = ri->tgt_index;
- int err;
-
- fwd = dev_get_by_index_rcu(dev_net(dev), index);
- ri->tgt_index = 0;
- if (unlikely(!fwd)) {
- err = -EINVAL;
- goto err;
- }
-
- err = __bpf_tx_xdp(fwd, NULL, xdp, 0);
- if (unlikely(err))
- goto err;
-
- _trace_xdp_redirect(dev, xdp_prog, index);
- return 0;
-err:
- _trace_xdp_redirect_err(dev, xdp_prog, index, err);
- return err;
-}
-
static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
- struct bpf_map *map,
- struct xdp_buff *xdp,
- u32 index)
+ struct bpf_map *map, struct xdp_buff *xdp)
{
- int err;
-
switch (map->map_type) {
- case BPF_MAP_TYPE_DEVMAP: {
- struct bpf_dtab_netdev *dst = fwd;
-
- err = dev_map_enqueue(dst, xdp, dev_rx);
- if (unlikely(err))
- return err;
- break;
- }
- case BPF_MAP_TYPE_CPUMAP: {
- struct bpf_cpu_map_entry *rcpu = fwd;
-
- err = cpu_map_enqueue(rcpu, xdp, dev_rx);
- if (unlikely(err))
- return err;
- break;
- }
- case BPF_MAP_TYPE_XSKMAP: {
- struct xdp_sock *xs = fwd;
-
- err = __xsk_map_redirect(map, xdp, xs);
- return err;
- }
+ case BPF_MAP_TYPE_DEVMAP:
+ case BPF_MAP_TYPE_DEVMAP_HASH:
+ return dev_map_enqueue(fwd, xdp, dev_rx);
+ case BPF_MAP_TYPE_CPUMAP:
+ return cpu_map_enqueue(fwd, xdp, dev_rx);
+ case BPF_MAP_TYPE_XSKMAP:
+ return __xsk_map_redirect(fwd, xdp);
default:
- break;
+ return -EBADRQC;
}
return 0;
}
-void xdp_do_flush_map(void)
+void xdp_do_flush(void)
{
- struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
- struct bpf_map *map = ri->map_to_flush;
-
- ri->map_to_flush = NULL;
- if (map) {
- switch (map->map_type) {
- case BPF_MAP_TYPE_DEVMAP:
- __dev_map_flush(map);
- break;
- case BPF_MAP_TYPE_CPUMAP:
- __cpu_map_flush(map);
- break;
- case BPF_MAP_TYPE_XSKMAP:
- __xsk_map_flush(map);
- break;
- default:
- break;
- }
- }
+ __dev_flush();
+ __cpu_map_flush();
+ __xsk_map_flush();
}
-EXPORT_SYMBOL_GPL(xdp_do_flush_map);
+EXPORT_SYMBOL_GPL(xdp_do_flush);
static inline void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index)
{
switch (map->map_type) {
case BPF_MAP_TYPE_DEVMAP:
return __dev_map_lookup_elem(map, index);
+ case BPF_MAP_TYPE_DEVMAP_HASH:
+ return __dev_map_hash_lookup_elem(map, index);
case BPF_MAP_TYPE_CPUMAP:
return __cpu_map_lookup_elem(map, index);
case BPF_MAP_TYPE_XSKMAP:
@@ -3600,10 +3517,11 @@ void bpf_clear_redirect_map(struct bpf_map *map)
}
}
-static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
- struct bpf_prog *xdp_prog, struct bpf_map *map,
- struct bpf_redirect_info *ri)
+int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
+ struct bpf_prog *xdp_prog)
{
+ struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+ struct bpf_map *map = READ_ONCE(ri->map);
u32 index = ri->tgt_index;
void *fwd = ri->tgt_value;
int err;
@@ -3612,32 +3530,27 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
ri->tgt_value = NULL;
WRITE_ONCE(ri->map, NULL);
- if (ri->map_to_flush && unlikely(ri->map_to_flush != map))
- xdp_do_flush_map();
+ if (unlikely(!map)) {
+ fwd = dev_get_by_index_rcu(dev_net(dev), index);
+ if (unlikely(!fwd)) {
+ err = -EINVAL;
+ goto err;
+ }
+
+ err = dev_xdp_enqueue(fwd, xdp, dev);
+ } else {
+ err = __bpf_tx_xdp_map(dev, fwd, map, xdp);
+ }
- err = __bpf_tx_xdp_map(dev, fwd, map, xdp, index);
if (unlikely(err))
goto err;
- ri->map_to_flush = map;
_trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index);
return 0;
err:
_trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err);
return err;
}
-
-int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
- struct bpf_prog *xdp_prog)
-{
- struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
- struct bpf_map *map = READ_ONCE(ri->map);
-
- if (likely(map))
- return xdp_do_redirect_map(dev, xdp, xdp_prog, map, ri);
-
- return xdp_do_redirect_slow(dev, xdp, xdp_prog, ri);
-}
EXPORT_SYMBOL_GPL(xdp_do_redirect);
static int xdp_do_generic_redirect_map(struct net_device *dev,
@@ -3655,7 +3568,8 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,
ri->tgt_value = NULL;
WRITE_ONCE(ri->map, NULL);
- if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
+ if (map->map_type == BPF_MAP_TYPE_DEVMAP ||
+ map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
struct bpf_dtab_netdev *dst = fwd;
err = dev_map_generic_redirect(dst, skb, xdp_prog);
@@ -3793,7 +3707,7 @@ BPF_CALL_5(bpf_skb_event_output, struct sk_buff *, skb, struct bpf_map *, map,
if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
return -EINVAL;
- if (unlikely(skb_size > skb->len))
+ if (unlikely(!skb || skb_size > skb->len))
return -EFAULT;
return bpf_event_output(map, flags, meta, meta_size, skb, skb_size,
@@ -3811,6 +3725,19 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = {
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
+static int bpf_skb_output_btf_ids[5];
+const struct bpf_func_proto bpf_skb_output_proto = {
+ .func = bpf_skb_event_output,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_MEM,
+ .arg5_type = ARG_CONST_SIZE_OR_ZERO,
+ .btf_id = bpf_skb_output_btf_ids,
+};
+
static unsigned short bpf_tunnel_key_af(u64 flags)
{
return flags & BPF_F_TUNINFO_IPV6 ? AF_INET6 : AF_INET;
@@ -4084,7 +4011,7 @@ BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb)
return 0;
cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
- return cgrp->kn->id.id;
+ return cgroup_id(cgrp);
}
static const struct bpf_func_proto bpf_skb_cgroup_id_proto = {
@@ -4109,7 +4036,7 @@ BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int,
if (!ancestor)
return 0;
- return ancestor->kn->id.id;
+ return cgroup_id(ancestor);
}
static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = {
@@ -4247,12 +4174,14 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
case SO_RCVBUF:
val = min_t(u32, val, sysctl_rmem_max);
sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
- sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
+ WRITE_ONCE(sk->sk_rcvbuf,
+ max_t(int, val * 2, SOCK_MIN_RCVBUF));
break;
case SO_SNDBUF:
val = min_t(u32, val, sysctl_wmem_max);
sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
- sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
+ WRITE_ONCE(sk->sk_sndbuf,
+ max_t(int, val * 2, SOCK_MIN_SNDBUF));
break;
case SO_MAX_PACING_RATE: /* 32bit version */
if (val != ~0U)
@@ -4269,7 +4198,7 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
case SO_RCVLOWAT:
if (val < 0)
val = INT_MAX;
- sk->sk_rcvlowat = val ? : 1;
+ WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
break;
case SO_MARK:
if (sk->sk_mark != val) {
@@ -5297,8 +5226,7 @@ __bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
if (sk) {
sk = sk_to_full_sk(sk);
if (!sk_fullsock(sk)) {
- if (!sock_flag(sk, SOCK_RCU_FREE))
- sock_gen_put(sk);
+ sock_gen_put(sk);
return NULL;
}
}
@@ -5335,8 +5263,7 @@ bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
if (sk) {
sk = sk_to_full_sk(sk);
if (!sk_fullsock(sk)) {
- if (!sock_flag(sk, SOCK_RCU_FREE))
- sock_gen_put(sk);
+ sock_gen_put(sk);
return NULL;
}
}
@@ -5403,7 +5330,8 @@ static const struct bpf_func_proto bpf_sk_lookup_udp_proto = {
BPF_CALL_1(bpf_sk_release, struct sock *, sk)
{
- if (!sock_flag(sk, SOCK_RCU_FREE))
+ /* Only full sockets have sk->sk_flags. */
+ if (!sk_fullsock(sk) || !sock_flag(sk, SOCK_RCU_FREE))
sock_gen_put(sk);
return 0;
}
@@ -5569,8 +5497,8 @@ u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type,
#define BPF_TCP_SOCK_GET_COMMON(FIELD) \
do { \
- BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD) > \
- FIELD_SIZEOF(struct bpf_tcp_sock, FIELD)); \
+ BUILD_BUG_ON(sizeof_field(struct tcp_sock, FIELD) > \
+ sizeof_field(struct bpf_tcp_sock, FIELD)); \
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct tcp_sock, FIELD),\
si->dst_reg, si->src_reg, \
offsetof(struct tcp_sock, FIELD)); \
@@ -5578,9 +5506,9 @@ u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type,
#define BPF_INET_SOCK_GET_COMMON(FIELD) \
do { \
- BUILD_BUG_ON(FIELD_SIZEOF(struct inet_connection_sock, \
+ BUILD_BUG_ON(sizeof_field(struct inet_connection_sock, \
FIELD) > \
- FIELD_SIZEOF(struct bpf_tcp_sock, FIELD)); \
+ sizeof_field(struct bpf_tcp_sock, FIELD)); \
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
struct inet_connection_sock, \
FIELD), \
@@ -5595,7 +5523,7 @@ u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type,
switch (si->off) {
case offsetof(struct bpf_tcp_sock, rtt_min):
- BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, rtt_min) !=
+ BUILD_BUG_ON(sizeof_field(struct tcp_sock, rtt_min) !=
sizeof(struct minmax));
BUILD_BUG_ON(sizeof(struct minmax) <
sizeof(struct minmax_sample));
@@ -5760,8 +5688,8 @@ u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type,
#define BPF_XDP_SOCK_GET(FIELD) \
do { \
- BUILD_BUG_ON(FIELD_SIZEOF(struct xdp_sock, FIELD) > \
- FIELD_SIZEOF(struct bpf_xdp_sock, FIELD)); \
+ BUILD_BUG_ON(sizeof_field(struct xdp_sock, FIELD) > \
+ sizeof_field(struct bpf_xdp_sock, FIELD)); \
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_sock, FIELD),\
si->dst_reg, si->src_reg, \
offsetof(struct xdp_sock, FIELD)); \
@@ -5850,6 +5778,75 @@ static const struct bpf_func_proto bpf_tcp_check_syncookie_proto = {
.arg5_type = ARG_CONST_SIZE,
};
+BPF_CALL_5(bpf_tcp_gen_syncookie, struct sock *, sk, void *, iph, u32, iph_len,
+ struct tcphdr *, th, u32, th_len)
+{
+#ifdef CONFIG_SYN_COOKIES
+ u32 cookie;
+ u16 mss;
+
+ if (unlikely(th_len < sizeof(*th) || th_len != th->doff * 4))
+ return -EINVAL;
+
+ if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN)
+ return -EINVAL;
+
+ if (!sock_net(sk)->ipv4.sysctl_tcp_syncookies)
+ return -ENOENT;
+
+ if (!th->syn || th->ack || th->fin || th->rst)
+ return -EINVAL;
+
+ if (unlikely(iph_len < sizeof(struct iphdr)))
+ return -EINVAL;
+
+ /* Both struct iphdr and struct ipv6hdr have the version field at the
+ * same offset so we can cast to the shorter header (struct iphdr).
+ */
+ switch (((struct iphdr *)iph)->version) {
+ case 4:
+ if (sk->sk_family == AF_INET6 && sk->sk_ipv6only)
+ return -EINVAL;
+
+ mss = tcp_v4_get_syncookie(sk, iph, th, &cookie);
+ break;
+
+#if IS_BUILTIN(CONFIG_IPV6)
+ case 6:
+ if (unlikely(iph_len < sizeof(struct ipv6hdr)))
+ return -EINVAL;
+
+ if (sk->sk_family != AF_INET6)
+ return -EINVAL;
+
+ mss = tcp_v6_get_syncookie(sk, iph, th, &cookie);
+ break;
+#endif /* CONFIG_IPV6 */
+
+ default:
+ return -EPROTONOSUPPORT;
+ }
+ if (mss == 0)
+ return -ENOENT;
+
+ return cookie | ((u64)mss << 32);
+#else
+ return -EOPNOTSUPP;
+#endif /* CONFIG_SYN_COOKIES */
+}
+
+static const struct bpf_func_proto bpf_tcp_gen_syncookie_proto = {
+ .func = bpf_tcp_gen_syncookie,
+ .gpl_only = true, /* __cookie_v*_init_sequence() is GPL */
+ .pkt_access = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_SOCK_COMMON,
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+ .arg4_type = ARG_PTR_TO_MEM,
+ .arg5_type = ARG_CONST_SIZE,
+};
+
#endif /* CONFIG_INET */
bool bpf_helper_changes_pkt_data(void *func)
@@ -5886,7 +5883,7 @@ bool bpf_helper_changes_pkt_data(void *func)
return false;
}
-static const struct bpf_func_proto *
+const struct bpf_func_proto *
bpf_base_func_proto(enum bpf_func_id func_id)
{
switch (func_id) {
@@ -5926,6 +5923,8 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_spin_unlock_proto;
case BPF_FUNC_trace_printk:
return bpf_get_trace_printk_proto();
+ case BPF_FUNC_jiffies64:
+ return &bpf_jiffies64_proto;
default:
return NULL;
}
@@ -5999,6 +5998,8 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_socket_cookie_proto;
case BPF_FUNC_get_socket_uid:
return &bpf_get_socket_uid_proto;
+ case BPF_FUNC_perf_event_output:
+ return &bpf_skb_event_output_proto;
default:
return bpf_base_func_proto(func_id);
}
@@ -6019,6 +6020,8 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_sk_storage_get_proto;
case BPF_FUNC_sk_storage_delete:
return &bpf_sk_storage_delete_proto;
+ case BPF_FUNC_perf_event_output:
+ return &bpf_skb_event_output_proto;
#ifdef CONFIG_SOCK_CGROUP_DATA
case BPF_FUNC_skb_cgroup_id:
return &bpf_skb_cgroup_id_proto;
@@ -6135,6 +6138,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_tcp_check_syncookie_proto;
case BPF_FUNC_skb_ecn_set_ce:
return &bpf_skb_ecn_set_ce_proto;
+ case BPF_FUNC_tcp_gen_syncookie:
+ return &bpf_tcp_gen_syncookie_proto;
#endif
default:
return bpf_base_func_proto(func_id);
@@ -6174,6 +6179,8 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_xdp_skc_lookup_tcp_proto;
case BPF_FUNC_tcp_check_syncookie:
return &bpf_tcp_check_syncookie_proto;
+ case BPF_FUNC_tcp_gen_syncookie:
+ return &bpf_tcp_gen_syncookie_proto;
#endif
default:
return bpf_base_func_proto(func_id);
@@ -6267,6 +6274,8 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_sk_redirect_map_proto;
case BPF_FUNC_sk_redirect_hash:
return &bpf_sk_redirect_hash_proto;
+ case BPF_FUNC_perf_event_output:
+ return &bpf_skb_event_output_proto;
#ifdef CONFIG_INET
case BPF_FUNC_sk_lookup_tcp:
return &bpf_sk_lookup_tcp_proto;
@@ -7245,7 +7254,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
case offsetof(struct __sk_buff, cb[0]) ...
offsetofend(struct __sk_buff, cb[4]) - 1:
- BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20);
+ BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, data) < 20);
BUILD_BUG_ON((offsetof(struct sk_buff, cb) +
offsetof(struct qdisc_skb_cb, data)) %
sizeof(__u64));
@@ -7264,7 +7273,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
break;
case offsetof(struct __sk_buff, tc_classid):
- BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, tc_classid) != 2);
+ BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, tc_classid) != 2);
off = si->off;
off -= offsetof(struct __sk_buff, tc_classid);
@@ -7335,7 +7344,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
#endif
break;
case offsetof(struct __sk_buff, family):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2);
+ BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
si->dst_reg, si->src_reg,
@@ -7346,7 +7355,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
2, target_size));
break;
case offsetof(struct __sk_buff, remote_ip4):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4);
+ BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
si->dst_reg, si->src_reg,
@@ -7357,7 +7366,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
4, target_size));
break;
case offsetof(struct __sk_buff, local_ip4):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+ BUILD_BUG_ON(sizeof_field(struct sock_common,
skc_rcv_saddr) != 4);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
@@ -7371,7 +7380,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
case offsetof(struct __sk_buff, remote_ip6[0]) ...
offsetof(struct __sk_buff, remote_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
- BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+ BUILD_BUG_ON(sizeof_field(struct sock_common,
skc_v6_daddr.s6_addr32[0]) != 4);
off = si->off;
@@ -7391,7 +7400,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
case offsetof(struct __sk_buff, local_ip6[0]) ...
offsetof(struct __sk_buff, local_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
- BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+ BUILD_BUG_ON(sizeof_field(struct sock_common,
skc_v6_rcv_saddr.s6_addr32[0]) != 4);
off = si->off;
@@ -7410,7 +7419,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
break;
case offsetof(struct __sk_buff, remote_port):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2);
+ BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
si->dst_reg, si->src_reg,
@@ -7425,7 +7434,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
break;
case offsetof(struct __sk_buff, local_port):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2);
+ BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
si->dst_reg, si->src_reg,
@@ -7436,7 +7445,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
break;
case offsetof(struct __sk_buff, tstamp):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tstamp) != 8);
+ BUILD_BUG_ON(sizeof_field(struct sk_buff, tstamp) != 8);
if (type == BPF_WRITE)
*insn++ = BPF_STX_MEM(BPF_DW,
@@ -7474,7 +7483,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
target_size));
break;
case offsetof(struct __sk_buff, wire_len):
- BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, pkt_len) != 4);
+ BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, pkt_len) != 4);
off = si->off;
off -= offsetof(struct __sk_buff, wire_len);
@@ -7504,7 +7513,7 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
switch (si->off) {
case offsetof(struct bpf_sock, bound_dev_if):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_bound_dev_if) != 4);
+ BUILD_BUG_ON(sizeof_field(struct sock, sk_bound_dev_if) != 4);
if (type == BPF_WRITE)
*insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
@@ -7515,7 +7524,7 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
break;
case offsetof(struct bpf_sock, mark):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_mark) != 4);
+ BUILD_BUG_ON(sizeof_field(struct sock, sk_mark) != 4);
if (type == BPF_WRITE)
*insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
@@ -7526,7 +7535,7 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
break;
case offsetof(struct bpf_sock, priority):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_priority) != 4);
+ BUILD_BUG_ON(sizeof_field(struct sock, sk_priority) != 4);
if (type == BPF_WRITE)
*insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
@@ -7542,34 +7551,34 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
si->dst_reg, si->src_reg,
bpf_target_off(struct sock_common,
skc_family,
- FIELD_SIZEOF(struct sock_common,
+ sizeof_field(struct sock_common,
skc_family),
target_size));
break;
case offsetof(struct bpf_sock, type):
- BUILD_BUG_ON(HWEIGHT32(SK_FL_TYPE_MASK) != BITS_PER_BYTE * 2);
- *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
- offsetof(struct sock, __sk_flags_offset));
- *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_TYPE_MASK);
- *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_TYPE_SHIFT);
- *target_size = 2;
+ *insn++ = BPF_LDX_MEM(
+ BPF_FIELD_SIZEOF(struct sock, sk_type),
+ si->dst_reg, si->src_reg,
+ bpf_target_off(struct sock, sk_type,
+ sizeof_field(struct sock, sk_type),
+ target_size));
break;
case offsetof(struct bpf_sock, protocol):
- BUILD_BUG_ON(HWEIGHT32(SK_FL_PROTO_MASK) != BITS_PER_BYTE);
- *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
- offsetof(struct sock, __sk_flags_offset));
- *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK);
- *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_PROTO_SHIFT);
- *target_size = 1;
+ *insn++ = BPF_LDX_MEM(
+ BPF_FIELD_SIZEOF(struct sock, sk_protocol),
+ si->dst_reg, si->src_reg,
+ bpf_target_off(struct sock, sk_protocol,
+ sizeof_field(struct sock, sk_protocol),
+ target_size));
break;
case offsetof(struct bpf_sock, src_ip4):
*insn++ = BPF_LDX_MEM(
BPF_SIZE(si->code), si->dst_reg, si->src_reg,
bpf_target_off(struct sock_common, skc_rcv_saddr,
- FIELD_SIZEOF(struct sock_common,
+ sizeof_field(struct sock_common,
skc_rcv_saddr),
target_size));
break;
@@ -7578,7 +7587,7 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
*insn++ = BPF_LDX_MEM(
BPF_SIZE(si->code), si->dst_reg, si->src_reg,
bpf_target_off(struct sock_common, skc_daddr,
- FIELD_SIZEOF(struct sock_common,
+ sizeof_field(struct sock_common,
skc_daddr),
target_size));
break;
@@ -7592,7 +7601,7 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
bpf_target_off(
struct sock_common,
skc_v6_rcv_saddr.s6_addr32[0],
- FIELD_SIZEOF(struct sock_common,
+ sizeof_field(struct sock_common,
skc_v6_rcv_saddr.s6_addr32[0]),
target_size) + off);
#else
@@ -7609,7 +7618,7 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
BPF_SIZE(si->code), si->dst_reg, si->src_reg,
bpf_target_off(struct sock_common,
skc_v6_daddr.s6_addr32[0],
- FIELD_SIZEOF(struct sock_common,
+ sizeof_field(struct sock_common,
skc_v6_daddr.s6_addr32[0]),
target_size) + off);
#else
@@ -7623,7 +7632,7 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
BPF_FIELD_SIZEOF(struct sock_common, skc_num),
si->dst_reg, si->src_reg,
bpf_target_off(struct sock_common, skc_num,
- FIELD_SIZEOF(struct sock_common,
+ sizeof_field(struct sock_common,
skc_num),
target_size));
break;
@@ -7633,7 +7642,7 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
BPF_FIELD_SIZEOF(struct sock_common, skc_dport),
si->dst_reg, si->src_reg,
bpf_target_off(struct sock_common, skc_dport,
- FIELD_SIZEOF(struct sock_common,
+ sizeof_field(struct sock_common,
skc_dport),
target_size));
break;
@@ -7643,7 +7652,7 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
BPF_FIELD_SIZEOF(struct sock_common, skc_state),
si->dst_reg, si->src_reg,
bpf_target_off(struct sock_common, skc_state,
- FIELD_SIZEOF(struct sock_common,
+ sizeof_field(struct sock_common,
skc_state),
target_size));
break;
@@ -7738,7 +7747,7 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
si->src_reg, offsetof(S, F)); \
*insn++ = BPF_LDX_MEM( \
SIZE, si->dst_reg, si->dst_reg, \
- bpf_target_off(NS, NF, FIELD_SIZEOF(NS, NF), \
+ bpf_target_off(NS, NF, sizeof_field(NS, NF), \
target_size) \
+ OFF); \
} while (0)
@@ -7769,7 +7778,7 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), tmp_reg, \
si->dst_reg, offsetof(S, F)); \
*insn++ = BPF_STX_MEM(SIZE, tmp_reg, si->src_reg, \
- bpf_target_off(NS, NF, FIELD_SIZEOF(NS, NF), \
+ bpf_target_off(NS, NF, sizeof_field(NS, NF), \
target_size) \
+ OFF); \
*insn++ = BPF_LDX_MEM(BPF_DW, tmp_reg, si->dst_reg, \
@@ -7831,8 +7840,8 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,
*/
BUILD_BUG_ON(offsetof(struct sockaddr_in, sin_port) !=
offsetof(struct sockaddr_in6, sin6_port));
- BUILD_BUG_ON(FIELD_SIZEOF(struct sockaddr_in, sin_port) !=
- FIELD_SIZEOF(struct sockaddr_in6, sin6_port));
+ BUILD_BUG_ON(sizeof_field(struct sockaddr_in, sin_port) !=
+ sizeof_field(struct sockaddr_in6, sin6_port));
SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD(struct bpf_sock_addr_kern,
struct sockaddr_in6, uaddr,
sin6_port, tmp_reg);
@@ -7844,20 +7853,13 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,
break;
case offsetof(struct bpf_sock_addr, type):
- SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(
- struct bpf_sock_addr_kern, struct sock, sk,
- __sk_flags_offset, BPF_W, 0);
- *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_TYPE_MASK);
- *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_TYPE_SHIFT);
+ SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern,
+ struct sock, sk, sk_type);
break;
case offsetof(struct bpf_sock_addr, protocol):
- SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(
- struct bpf_sock_addr_kern, struct sock, sk,
- __sk_flags_offset, BPF_W, 0);
- *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK);
- *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg,
- SK_FL_PROTO_SHIFT);
+ SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern,
+ struct sock, sk, sk_protocol);
break;
case offsetof(struct bpf_sock_addr, msg_src_ip4):
@@ -7898,8 +7900,8 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
/* Helper macro for adding read access to tcp_sock or sock fields. */
#define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \
do { \
- BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) > \
- FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD)); \
+ BUILD_BUG_ON(sizeof_field(OBJ, OBJ_FIELD) > \
+ sizeof_field(struct bpf_sock_ops, BPF_FIELD)); \
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
struct bpf_sock_ops_kern, \
is_fullsock), \
@@ -7932,8 +7934,8 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
#define SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \
do { \
int reg = BPF_REG_9; \
- BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) > \
- FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD)); \
+ BUILD_BUG_ON(sizeof_field(OBJ, OBJ_FIELD) > \
+ sizeof_field(struct bpf_sock_ops, BPF_FIELD)); \
if (si->dst_reg == reg || si->src_reg == reg) \
reg--; \
if (si->dst_reg == reg || si->src_reg == reg) \
@@ -7974,12 +7976,12 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
switch (si->off) {
case offsetof(struct bpf_sock_ops, op) ...
offsetof(struct bpf_sock_ops, replylong[3]):
- BUILD_BUG_ON(FIELD_SIZEOF(struct bpf_sock_ops, op) !=
- FIELD_SIZEOF(struct bpf_sock_ops_kern, op));
- BUILD_BUG_ON(FIELD_SIZEOF(struct bpf_sock_ops, reply) !=
- FIELD_SIZEOF(struct bpf_sock_ops_kern, reply));
- BUILD_BUG_ON(FIELD_SIZEOF(struct bpf_sock_ops, replylong) !=
- FIELD_SIZEOF(struct bpf_sock_ops_kern, replylong));
+ BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, op) !=
+ sizeof_field(struct bpf_sock_ops_kern, op));
+ BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, reply) !=
+ sizeof_field(struct bpf_sock_ops_kern, reply));
+ BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, replylong) !=
+ sizeof_field(struct bpf_sock_ops_kern, replylong));
off = si->off;
off -= offsetof(struct bpf_sock_ops, op);
off += offsetof(struct bpf_sock_ops_kern, op);
@@ -7992,7 +7994,7 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
break;
case offsetof(struct bpf_sock_ops, family):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2);
+ BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
struct bpf_sock_ops_kern, sk),
@@ -8003,7 +8005,7 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
break;
case offsetof(struct bpf_sock_ops, remote_ip4):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4);
+ BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
struct bpf_sock_ops_kern, sk),
@@ -8014,7 +8016,7 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
break;
case offsetof(struct bpf_sock_ops, local_ip4):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+ BUILD_BUG_ON(sizeof_field(struct sock_common,
skc_rcv_saddr) != 4);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
@@ -8029,7 +8031,7 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
case offsetof(struct bpf_sock_ops, remote_ip6[0]) ...
offsetof(struct bpf_sock_ops, remote_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
- BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+ BUILD_BUG_ON(sizeof_field(struct sock_common,
skc_v6_daddr.s6_addr32[0]) != 4);
off = si->off;
@@ -8050,7 +8052,7 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
case offsetof(struct bpf_sock_ops, local_ip6[0]) ...
offsetof(struct bpf_sock_ops, local_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
- BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+ BUILD_BUG_ON(sizeof_field(struct sock_common,
skc_v6_rcv_saddr.s6_addr32[0]) != 4);
off = si->off;
@@ -8069,7 +8071,7 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
break;
case offsetof(struct bpf_sock_ops, remote_port):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2);
+ BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
struct bpf_sock_ops_kern, sk),
@@ -8083,7 +8085,7 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
break;
case offsetof(struct bpf_sock_ops, local_port):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2);
+ BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
struct bpf_sock_ops_kern, sk),
@@ -8103,7 +8105,7 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
break;
case offsetof(struct bpf_sock_ops, state):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_state) != 1);
+ BUILD_BUG_ON(sizeof_field(struct sock_common, skc_state) != 1);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
struct bpf_sock_ops_kern, sk),
@@ -8114,7 +8116,7 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
break;
case offsetof(struct bpf_sock_ops, rtt_min):
- BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, rtt_min) !=
+ BUILD_BUG_ON(sizeof_field(struct tcp_sock, rtt_min) !=
sizeof(struct minmax));
BUILD_BUG_ON(sizeof(struct minmax) <
sizeof(struct minmax_sample));
@@ -8125,7 +8127,7 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
offsetof(struct bpf_sock_ops_kern, sk));
*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
offsetof(struct tcp_sock, rtt_min) +
- FIELD_SIZEOF(struct minmax_sample, t));
+ sizeof_field(struct minmax_sample, t));
break;
case offsetof(struct bpf_sock_ops, bpf_sock_ops_cb_flags):
@@ -8267,7 +8269,7 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
offsetof(struct sk_msg, data_end));
break;
case offsetof(struct sk_msg_md, family):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2);
+ BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
struct sk_msg, sk),
@@ -8278,7 +8280,7 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
break;
case offsetof(struct sk_msg_md, remote_ip4):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4);
+ BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
struct sk_msg, sk),
@@ -8289,7 +8291,7 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
break;
case offsetof(struct sk_msg_md, local_ip4):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+ BUILD_BUG_ON(sizeof_field(struct sock_common,
skc_rcv_saddr) != 4);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
@@ -8304,7 +8306,7 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
case offsetof(struct sk_msg_md, remote_ip6[0]) ...
offsetof(struct sk_msg_md, remote_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
- BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+ BUILD_BUG_ON(sizeof_field(struct sock_common,
skc_v6_daddr.s6_addr32[0]) != 4);
off = si->off;
@@ -8325,7 +8327,7 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
case offsetof(struct sk_msg_md, local_ip6[0]) ...
offsetof(struct sk_msg_md, local_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
- BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+ BUILD_BUG_ON(sizeof_field(struct sock_common,
skc_v6_rcv_saddr.s6_addr32[0]) != 4);
off = si->off;
@@ -8344,7 +8346,7 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
break;
case offsetof(struct sk_msg_md, remote_port):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2);
+ BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
struct sk_msg, sk),
@@ -8358,7 +8360,7 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
break;
case offsetof(struct sk_msg_md, local_port):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2);
+ BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
struct sk_msg, sk),
@@ -8585,16 +8587,6 @@ out:
}
#ifdef CONFIG_INET
-struct sk_reuseport_kern {
- struct sk_buff *skb;
- struct sock *sk;
- struct sock *selected_sk;
- void *data_end;
- u32 hash;
- u32 reuseport_id;
- bool bind_inany;
-};
-
static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern,
struct sock_reuseport *reuse,
struct sock *sk, struct sk_buff *skb,
@@ -8757,13 +8749,13 @@ sk_reuseport_is_valid_access(int off, int size,
return size == size_default;
/* Fields that allow narrowing */
- case offsetof(struct sk_reuseport_md, eth_protocol):
- if (size < FIELD_SIZEOF(struct sk_buff, protocol))
+ case bpf_ctx_range(struct sk_reuseport_md, eth_protocol):
+ if (size < sizeof_field(struct sk_buff, protocol))
return false;
/* fall through */
- case offsetof(struct sk_reuseport_md, ip_protocol):
- case offsetof(struct sk_reuseport_md, bind_inany):
- case offsetof(struct sk_reuseport_md, len):
+ case bpf_ctx_range(struct sk_reuseport_md, ip_protocol):
+ case bpf_ctx_range(struct sk_reuseport_md, bind_inany):
+ case bpf_ctx_range(struct sk_reuseport_md, len):
bpf_ctx_record_field_size(info, size_default);
return bpf_ctx_narrow_access_ok(off, size, size_default);
@@ -8776,7 +8768,7 @@ sk_reuseport_is_valid_access(int off, int size,
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_reuseport_kern, F), \
si->dst_reg, si->src_reg, \
bpf_target_off(struct sk_reuseport_kern, F, \
- FIELD_SIZEOF(struct sk_reuseport_kern, F), \
+ sizeof_field(struct sk_reuseport_kern, F), \
target_size)); \
})
@@ -8786,11 +8778,11 @@ sk_reuseport_is_valid_access(int off, int size,
skb, \
SKB_FIELD)
-#define SK_REUSEPORT_LOAD_SK_FIELD_SIZE_OFF(SK_FIELD, BPF_SIZE, EXTRA_OFF) \
- SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(struct sk_reuseport_kern, \
- struct sock, \
- sk, \
- SK_FIELD, BPF_SIZE, EXTRA_OFF)
+#define SK_REUSEPORT_LOAD_SK_FIELD(SK_FIELD) \
+ SOCK_ADDR_LOAD_NESTED_FIELD(struct sk_reuseport_kern, \
+ struct sock, \
+ sk, \
+ SK_FIELD)
static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type,
const struct bpf_insn *si,
@@ -8814,16 +8806,7 @@ static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type,
break;
case offsetof(struct sk_reuseport_md, ip_protocol):
- BUILD_BUG_ON(HWEIGHT32(SK_FL_PROTO_MASK) != BITS_PER_BYTE);
- SK_REUSEPORT_LOAD_SK_FIELD_SIZE_OFF(__sk_flags_offset,
- BPF_W, 0);
- *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK);
- *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg,
- SK_FL_PROTO_SHIFT);
- /* SK_FL_PROTO_MASK and SK_FL_PROTO_SHIFT are endian
- * aware. No further narrowing or masking is needed.
- */
- *target_size = 1;
+ SK_REUSEPORT_LOAD_SK_FIELD(sk_protocol);
break;
case offsetof(struct sk_reuseport_md, data_end):
@@ -8851,3 +8834,11 @@ const struct bpf_verifier_ops sk_reuseport_verifier_ops = {
const struct bpf_prog_ops sk_reuseport_prog_ops = {
};
#endif /* CONFIG_INET */
+
+DEFINE_BPF_DISPATCHER(bpf_dispatcher_xdp)
+
+void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog)
+{
+ bpf_dispatcher_change_prog(BPF_DISPATCHER_PTR(bpf_dispatcher_xdp),
+ prev_prog, prog);
+}
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 3e6fedb57bc1..a1670dff0629 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -114,19 +114,50 @@ int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr,
{
struct bpf_prog *attached;
struct net *net;
+ int ret = 0;
net = current->nsproxy->net_ns;
mutex_lock(&flow_dissector_mutex);
+
+ if (net == &init_net) {
+ /* BPF flow dissector in the root namespace overrides
+ * any per-net-namespace one. When attaching to root,
+ * make sure we don't have any BPF program attached
+ * to the non-root namespaces.
+ */
+ struct net *ns;
+
+ for_each_net(ns) {
+ if (ns == &init_net)
+ continue;
+ if (rcu_access_pointer(ns->flow_dissector_prog)) {
+ ret = -EEXIST;
+ goto out;
+ }
+ }
+ } else {
+ /* Make sure root flow dissector is not attached
+ * when attaching to the non-root namespace.
+ */
+ if (rcu_access_pointer(init_net.flow_dissector_prog)) {
+ ret = -EEXIST;
+ goto out;
+ }
+ }
+
attached = rcu_dereference_protected(net->flow_dissector_prog,
lockdep_is_held(&flow_dissector_mutex));
- if (attached) {
- /* Only one BPF program can be attached at a time */
- mutex_unlock(&flow_dissector_mutex);
- return -EEXIST;
+ if (attached == prog) {
+ /* The same program cannot be attached twice */
+ ret = -EINVAL;
+ goto out;
}
rcu_assign_pointer(net->flow_dissector_prog, prog);
+ if (attached)
+ bpf_prog_put(attached);
+out:
mutex_unlock(&flow_dissector_mutex);
- return 0;
+ return ret;
}
int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr)
@@ -142,32 +173,11 @@ int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr)
mutex_unlock(&flow_dissector_mutex);
return -ENOENT;
}
- bpf_prog_put(attached);
RCU_INIT_POINTER(net->flow_dissector_prog, NULL);
+ bpf_prog_put(attached);
mutex_unlock(&flow_dissector_mutex);
return 0;
}
-/**
- * skb_flow_get_be16 - extract be16 entity
- * @skb: sk_buff to extract from
- * @poff: offset to extract at
- * @data: raw buffer pointer to the packet
- * @hlen: packet header length
- *
- * The function will try to retrieve a be32 entity at
- * offset poff
- */
-static __be16 skb_flow_get_be16(const struct sk_buff *skb, int poff,
- void *data, int hlen)
-{
- __be16 *u, _u;
-
- u = __skb_header_pointer(skb, poff, sizeof(_u), data, hlen, &_u);
- if (u)
- return *u;
-
- return 0;
-}
/**
* __skb_flow_get_ports - extract the upper layer ports and return them
@@ -203,6 +213,72 @@ __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
}
EXPORT_SYMBOL(__skb_flow_get_ports);
+static bool icmp_has_id(u8 type)
+{
+ switch (type) {
+ case ICMP_ECHO:
+ case ICMP_ECHOREPLY:
+ case ICMP_TIMESTAMP:
+ case ICMP_TIMESTAMPREPLY:
+ case ICMPV6_ECHO_REQUEST:
+ case ICMPV6_ECHO_REPLY:
+ return true;
+ }
+
+ return false;
+}
+
+/**
+ * skb_flow_get_icmp_tci - extract ICMP(6) Type, Code and Identifier fields
+ * @skb: sk_buff to extract from
+ * @key_icmp: struct flow_dissector_key_icmp to fill
+ * @data: raw buffer pointer to the packet
+ * @thoff: offset to extract at
+ * @hlen: packet header length
+ */
+void skb_flow_get_icmp_tci(const struct sk_buff *skb,
+ struct flow_dissector_key_icmp *key_icmp,
+ void *data, int thoff, int hlen)
+{
+ struct icmphdr *ih, _ih;
+
+ ih = __skb_header_pointer(skb, thoff, sizeof(_ih), data, hlen, &_ih);
+ if (!ih)
+ return;
+
+ key_icmp->type = ih->type;
+ key_icmp->code = ih->code;
+
+ /* As we use 0 to signal that the Id field is not present,
+ * avoid confusion with packets without such field
+ */
+ if (icmp_has_id(ih->type))
+ key_icmp->id = ih->un.echo.id ? : 1;
+ else
+ key_icmp->id = 0;
+}
+EXPORT_SYMBOL(skb_flow_get_icmp_tci);
+
+/* If FLOW_DISSECTOR_KEY_ICMP is set, dissect an ICMP packet
+ * using skb_flow_get_icmp_tci().
+ */
+static void __skb_flow_dissect_icmp(const struct sk_buff *skb,
+ struct flow_dissector *flow_dissector,
+ void *target_container,
+ void *data, int thoff, int hlen)
+{
+ struct flow_dissector_key_icmp *key_icmp;
+
+ if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ICMP))
+ return;
+
+ key_icmp = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_ICMP,
+ target_container);
+
+ skb_flow_get_icmp_tci(skb, key_icmp, data, thoff, hlen);
+}
+
void skb_flow_dissect_meta(const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
void *target_container)
@@ -523,8 +599,8 @@ __skb_flow_dissect_gre(const struct sk_buff *skb,
offset += sizeof(struct gre_base_hdr);
if (hdr->flags & GRE_CSUM)
- offset += FIELD_SIZEOF(struct gre_full_hdr, csum) +
- FIELD_SIZEOF(struct gre_full_hdr, reserved1);
+ offset += sizeof_field(struct gre_full_hdr, csum) +
+ sizeof_field(struct gre_full_hdr, reserved1);
if (hdr->flags & GRE_KEY) {
const __be32 *keyid;
@@ -546,11 +622,11 @@ __skb_flow_dissect_gre(const struct sk_buff *skb,
else
key_keyid->keyid = *keyid & GRE_PPTP_KEY_MASK;
}
- offset += FIELD_SIZEOF(struct gre_full_hdr, key);
+ offset += sizeof_field(struct gre_full_hdr, key);
}
if (hdr->flags & GRE_SEQ)
- offset += FIELD_SIZEOF(struct pptp_gre_header, seq);
+ offset += sizeof_field(struct pptp_gre_header, seq);
if (gre_ver == 0) {
if (*p_proto == htons(ETH_P_TEB)) {
@@ -577,7 +653,7 @@ __skb_flow_dissect_gre(const struct sk_buff *skb,
u8 *ppp_hdr;
if (hdr->flags & GRE_ACK)
- offset += FIELD_SIZEOF(struct pptp_gre_header, ack);
+ offset += sizeof_field(struct pptp_gre_header, ack);
ppp_hdr = __skb_header_pointer(skb, *p_nhoff + offset,
sizeof(_ppp_hdr),
@@ -684,6 +760,31 @@ __skb_flow_dissect_tcp(const struct sk_buff *skb,
}
static void
+__skb_flow_dissect_ports(const struct sk_buff *skb,
+ struct flow_dissector *flow_dissector,
+ void *target_container, void *data, int nhoff,
+ u8 ip_proto, int hlen)
+{
+ enum flow_dissector_key_id dissector_ports = FLOW_DISSECTOR_KEY_MAX;
+ struct flow_dissector_key_ports *key_ports;
+
+ if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS))
+ dissector_ports = FLOW_DISSECTOR_KEY_PORTS;
+ else if (dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_PORTS_RANGE))
+ dissector_ports = FLOW_DISSECTOR_KEY_PORTS_RANGE;
+
+ if (dissector_ports == FLOW_DISSECTOR_KEY_MAX)
+ return;
+
+ key_ports = skb_flow_dissector_target(flow_dissector,
+ dissector_ports,
+ target_container);
+ key_ports->ports = __skb_flow_get_ports(skb, nhoff, ip_proto,
+ data, hlen);
+}
+
+static void
__skb_flow_dissect_ipv4(const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
void *target_container, void *data, const struct iphdr *iph)
@@ -733,10 +834,11 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys,
struct flow_dissector *flow_dissector,
void *target_container)
{
+ struct flow_dissector_key_ports *key_ports = NULL;
struct flow_dissector_key_control *key_control;
struct flow_dissector_key_basic *key_basic;
struct flow_dissector_key_addrs *key_addrs;
- struct flow_dissector_key_ports *key_ports;
+ struct flow_dissector_key_tags *key_tags;
key_control = skb_flow_dissector_target(flow_dissector,
FLOW_DISSECTOR_KEY_CONTROL,
@@ -774,17 +876,32 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys,
key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
}
- if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) {
+ if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS))
key_ports = skb_flow_dissector_target(flow_dissector,
FLOW_DISSECTOR_KEY_PORTS,
target_container);
+ else if (dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_PORTS_RANGE))
+ key_ports = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_PORTS_RANGE,
+ target_container);
+
+ if (key_ports) {
key_ports->src = flow_keys->sport;
key_ports->dst = flow_keys->dport;
}
+
+ if (dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_FLOW_LABEL)) {
+ key_tags = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_FLOW_LABEL,
+ target_container);
+ key_tags->flow_label = ntohl(flow_keys->flow_label);
+ }
}
bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
- __be16 proto, int nhoff, int hlen)
+ __be16 proto, int nhoff, int hlen, unsigned int flags)
{
struct bpf_flow_keys *flow_keys = ctx->flow_keys;
u32 result;
@@ -795,6 +912,14 @@ bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
flow_keys->nhoff = nhoff;
flow_keys->thoff = flow_keys->nhoff;
+ BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG !=
+ (int)FLOW_DISSECTOR_F_PARSE_1ST_FRAG);
+ BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL !=
+ (int)FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
+ BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP !=
+ (int)FLOW_DISSECTOR_F_STOP_AT_ENCAP);
+ flow_keys->flags = flags;
+
preempt_disable();
result = BPF_PROG_RUN(prog, ctx);
preempt_enable();
@@ -835,8 +960,6 @@ bool __skb_flow_dissect(const struct net *net,
struct flow_dissector_key_control *key_control;
struct flow_dissector_key_basic *key_basic;
struct flow_dissector_key_addrs *key_addrs;
- struct flow_dissector_key_ports *key_ports;
- struct flow_dissector_key_icmp *key_icmp;
struct flow_dissector_key_tags *key_tags;
struct flow_dissector_key_vlan *key_vlan;
struct bpf_prog *attached = NULL;
@@ -853,9 +976,10 @@ bool __skb_flow_dissect(const struct net *net,
nhoff = skb_network_offset(skb);
hlen = skb_headlen(skb);
#if IS_ENABLED(CONFIG_NET_DSA)
- if (unlikely(skb->dev && netdev_uses_dsa(skb->dev))) {
+ if (unlikely(skb->dev && netdev_uses_dsa(skb->dev) &&
+ proto == htons(ETH_P_XDSA))) {
const struct dsa_device_ops *ops;
- int offset;
+ int offset = 0;
ops = skb->dev->dsa_ptr->tag_ops;
if (ops->flow_dissect &&
@@ -893,7 +1017,10 @@ bool __skb_flow_dissect(const struct net *net,
WARN_ON_ONCE(!net);
if (net) {
rcu_read_lock();
- attached = rcu_dereference(net->flow_dissector_prog);
+ attached = rcu_dereference(init_net.flow_dissector_prog);
+
+ if (!attached)
+ attached = rcu_dereference(net->flow_dissector_prog);
if (attached) {
struct bpf_flow_keys flow_keys;
@@ -914,7 +1041,7 @@ bool __skb_flow_dissect(const struct net *net,
}
ret = bpf_flow_dissect(attached, &ctx, n_proto, nhoff,
- hlen);
+ hlen, flags);
__skb_flow_bpf_to_target(&flow_keys, flow_dissector,
target_container);
rcu_read_unlock();
@@ -1278,26 +1405,19 @@ ip_proto_again:
data, nhoff, hlen);
break;
- default:
+ case IPPROTO_ICMP:
+ case IPPROTO_ICMPV6:
+ __skb_flow_dissect_icmp(skb, flow_dissector, target_container,
+ data, nhoff, hlen);
break;
- }
- if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS) &&
- !(key_control->flags & FLOW_DIS_IS_FRAGMENT)) {
- key_ports = skb_flow_dissector_target(flow_dissector,
- FLOW_DISSECTOR_KEY_PORTS,
- target_container);
- key_ports->ports = __skb_flow_get_ports(skb, nhoff, ip_proto,
- data, hlen);
+ default:
+ break;
}
- if (dissector_uses_key(flow_dissector,
- FLOW_DISSECTOR_KEY_ICMP)) {
- key_icmp = skb_flow_dissector_target(flow_dissector,
- FLOW_DISSECTOR_KEY_ICMP,
- target_container);
- key_icmp->icmp = skb_flow_get_be16(skb, nhoff, data, hlen);
- }
+ if (!(key_control->flags & FLOW_DIS_IS_FRAGMENT))
+ __skb_flow_dissect_ports(skb, flow_dissector, target_container,
+ data, nhoff, ip_proto, hlen);
/* Process result of IP proto processing */
switch (fdret) {
@@ -1333,32 +1453,23 @@ out_bad:
}
EXPORT_SYMBOL(__skb_flow_dissect);
-static u32 hashrnd __read_mostly;
+static siphash_key_t hashrnd __read_mostly;
static __always_inline void __flow_hash_secret_init(void)
{
net_get_random_once(&hashrnd, sizeof(hashrnd));
}
-static __always_inline u32 __flow_hash_words(const u32 *words, u32 length,
- u32 keyval)
+static const void *flow_keys_hash_start(const struct flow_keys *flow)
{
- return jhash2(words, length, keyval);
-}
-
-static inline const u32 *flow_keys_hash_start(const struct flow_keys *flow)
-{
- const void *p = flow;
-
- BUILD_BUG_ON(FLOW_KEYS_HASH_OFFSET % sizeof(u32));
- return (const u32 *)(p + FLOW_KEYS_HASH_OFFSET);
+ BUILD_BUG_ON(FLOW_KEYS_HASH_OFFSET % SIPHASH_ALIGNMENT);
+ return &flow->FLOW_KEYS_HASH_START_FIELD;
}
static inline size_t flow_keys_hash_length(const struct flow_keys *flow)
{
size_t diff = FLOW_KEYS_HASH_OFFSET + sizeof(flow->addrs);
+
BUILD_BUG_ON((sizeof(*flow) - FLOW_KEYS_HASH_OFFSET) % sizeof(u32));
- BUILD_BUG_ON(offsetof(typeof(*flow), addrs) !=
- sizeof(*flow) - sizeof(flow->addrs));
switch (flow->control.addr_type) {
case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
@@ -1371,7 +1482,7 @@ static inline size_t flow_keys_hash_length(const struct flow_keys *flow)
diff -= sizeof(flow->addrs.tipckey);
break;
}
- return (sizeof(*flow) - diff) / sizeof(u32);
+ return sizeof(*flow) - diff;
}
__be32 flow_get_u32_src(const struct flow_keys *flow)
@@ -1404,6 +1515,9 @@ __be32 flow_get_u32_dst(const struct flow_keys *flow)
}
EXPORT_SYMBOL(flow_get_u32_dst);
+/* Sort the source and destination IP (and the ports if the IP are the same),
+ * to have consistent hash within the two directions
+ */
static inline void __flow_hash_consistentify(struct flow_keys *keys)
{
int addr_diff, i;
@@ -1437,14 +1551,15 @@ static inline void __flow_hash_consistentify(struct flow_keys *keys)
}
}
-static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval)
+static inline u32 __flow_hash_from_keys(struct flow_keys *keys,
+ const siphash_key_t *keyval)
{
u32 hash;
__flow_hash_consistentify(keys);
- hash = __flow_hash_words(flow_keys_hash_start(keys),
- flow_keys_hash_length(keys), keyval);
+ hash = siphash(flow_keys_hash_start(keys),
+ flow_keys_hash_length(keys), keyval);
if (!hash)
hash = 1;
@@ -1454,12 +1569,13 @@ static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval)
u32 flow_hash_from_keys(struct flow_keys *keys)
{
__flow_hash_secret_init();
- return __flow_hash_from_keys(keys, hashrnd);
+ return __flow_hash_from_keys(keys, &hashrnd);
}
EXPORT_SYMBOL(flow_hash_from_keys);
static inline u32 ___skb_get_hash(const struct sk_buff *skb,
- struct flow_keys *keys, u32 keyval)
+ struct flow_keys *keys,
+ const siphash_key_t *keyval)
{
skb_flow_dissect_flow_keys(skb, keys,
FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
@@ -1507,7 +1623,7 @@ u32 __skb_get_hash_symmetric(const struct sk_buff *skb)
&keys, NULL, 0, 0, 0,
FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
- return __flow_hash_from_keys(&keys, hashrnd);
+ return __flow_hash_from_keys(&keys, &hashrnd);
}
EXPORT_SYMBOL_GPL(__skb_get_hash_symmetric);
@@ -1527,13 +1643,14 @@ void __skb_get_hash(struct sk_buff *skb)
__flow_hash_secret_init();
- hash = ___skb_get_hash(skb, &keys, hashrnd);
+ hash = ___skb_get_hash(skb, &keys, &hashrnd);
__skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys));
}
EXPORT_SYMBOL(__skb_get_hash);
-__u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb)
+__u32 skb_get_hash_perturb(const struct sk_buff *skb,
+ const siphash_key_t *perturb)
{
struct flow_keys keys;
diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c
index d63b970784dc..45b6a59ac124 100644
--- a/net/core/flow_offload.c
+++ b/net/core/flow_offload.c
@@ -2,6 +2,8 @@
#include <linux/kernel.h>
#include <linux/slab.h>
#include <net/flow_offload.h>
+#include <linux/rtnetlink.h>
+#include <linux/mutex.h>
struct flow_rule *flow_rule_alloc(unsigned int num_actions)
{
@@ -280,3 +282,241 @@ int flow_block_cb_setup_simple(struct flow_block_offload *f,
}
}
EXPORT_SYMBOL(flow_block_cb_setup_simple);
+
+static LIST_HEAD(block_cb_list);
+
+static struct rhashtable indr_setup_block_ht;
+
+struct flow_indr_block_cb {
+ struct list_head list;
+ void *cb_priv;
+ flow_indr_block_bind_cb_t *cb;
+ void *cb_ident;
+};
+
+struct flow_indr_block_dev {
+ struct rhash_head ht_node;
+ struct net_device *dev;
+ unsigned int refcnt;
+ struct list_head cb_list;
+};
+
+static const struct rhashtable_params flow_indr_setup_block_ht_params = {
+ .key_offset = offsetof(struct flow_indr_block_dev, dev),
+ .head_offset = offsetof(struct flow_indr_block_dev, ht_node),
+ .key_len = sizeof(struct net_device *),
+};
+
+static struct flow_indr_block_dev *
+flow_indr_block_dev_lookup(struct net_device *dev)
+{
+ return rhashtable_lookup_fast(&indr_setup_block_ht, &dev,
+ flow_indr_setup_block_ht_params);
+}
+
+static struct flow_indr_block_dev *
+flow_indr_block_dev_get(struct net_device *dev)
+{
+ struct flow_indr_block_dev *indr_dev;
+
+ indr_dev = flow_indr_block_dev_lookup(dev);
+ if (indr_dev)
+ goto inc_ref;
+
+ indr_dev = kzalloc(sizeof(*indr_dev), GFP_KERNEL);
+ if (!indr_dev)
+ return NULL;
+
+ INIT_LIST_HEAD(&indr_dev->cb_list);
+ indr_dev->dev = dev;
+ if (rhashtable_insert_fast(&indr_setup_block_ht, &indr_dev->ht_node,
+ flow_indr_setup_block_ht_params)) {
+ kfree(indr_dev);
+ return NULL;
+ }
+
+inc_ref:
+ indr_dev->refcnt++;
+ return indr_dev;
+}
+
+static void flow_indr_block_dev_put(struct flow_indr_block_dev *indr_dev)
+{
+ if (--indr_dev->refcnt)
+ return;
+
+ rhashtable_remove_fast(&indr_setup_block_ht, &indr_dev->ht_node,
+ flow_indr_setup_block_ht_params);
+ kfree(indr_dev);
+}
+
+static struct flow_indr_block_cb *
+flow_indr_block_cb_lookup(struct flow_indr_block_dev *indr_dev,
+ flow_indr_block_bind_cb_t *cb, void *cb_ident)
+{
+ struct flow_indr_block_cb *indr_block_cb;
+
+ list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list)
+ if (indr_block_cb->cb == cb &&
+ indr_block_cb->cb_ident == cb_ident)
+ return indr_block_cb;
+ return NULL;
+}
+
+static struct flow_indr_block_cb *
+flow_indr_block_cb_add(struct flow_indr_block_dev *indr_dev, void *cb_priv,
+ flow_indr_block_bind_cb_t *cb, void *cb_ident)
+{
+ struct flow_indr_block_cb *indr_block_cb;
+
+ indr_block_cb = flow_indr_block_cb_lookup(indr_dev, cb, cb_ident);
+ if (indr_block_cb)
+ return ERR_PTR(-EEXIST);
+
+ indr_block_cb = kzalloc(sizeof(*indr_block_cb), GFP_KERNEL);
+ if (!indr_block_cb)
+ return ERR_PTR(-ENOMEM);
+
+ indr_block_cb->cb_priv = cb_priv;
+ indr_block_cb->cb = cb;
+ indr_block_cb->cb_ident = cb_ident;
+ list_add(&indr_block_cb->list, &indr_dev->cb_list);
+
+ return indr_block_cb;
+}
+
+static void flow_indr_block_cb_del(struct flow_indr_block_cb *indr_block_cb)
+{
+ list_del(&indr_block_cb->list);
+ kfree(indr_block_cb);
+}
+
+static DEFINE_MUTEX(flow_indr_block_cb_lock);
+
+static void flow_block_cmd(struct net_device *dev,
+ flow_indr_block_bind_cb_t *cb, void *cb_priv,
+ enum flow_block_command command)
+{
+ struct flow_indr_block_entry *entry;
+
+ mutex_lock(&flow_indr_block_cb_lock);
+ list_for_each_entry(entry, &block_cb_list, list) {
+ entry->cb(dev, cb, cb_priv, command);
+ }
+ mutex_unlock(&flow_indr_block_cb_lock);
+}
+
+int __flow_indr_block_cb_register(struct net_device *dev, void *cb_priv,
+ flow_indr_block_bind_cb_t *cb,
+ void *cb_ident)
+{
+ struct flow_indr_block_cb *indr_block_cb;
+ struct flow_indr_block_dev *indr_dev;
+ int err;
+
+ indr_dev = flow_indr_block_dev_get(dev);
+ if (!indr_dev)
+ return -ENOMEM;
+
+ indr_block_cb = flow_indr_block_cb_add(indr_dev, cb_priv, cb, cb_ident);
+ err = PTR_ERR_OR_ZERO(indr_block_cb);
+ if (err)
+ goto err_dev_put;
+
+ flow_block_cmd(dev, indr_block_cb->cb, indr_block_cb->cb_priv,
+ FLOW_BLOCK_BIND);
+
+ return 0;
+
+err_dev_put:
+ flow_indr_block_dev_put(indr_dev);
+ return err;
+}
+EXPORT_SYMBOL_GPL(__flow_indr_block_cb_register);
+
+int flow_indr_block_cb_register(struct net_device *dev, void *cb_priv,
+ flow_indr_block_bind_cb_t *cb,
+ void *cb_ident)
+{
+ int err;
+
+ rtnl_lock();
+ err = __flow_indr_block_cb_register(dev, cb_priv, cb, cb_ident);
+ rtnl_unlock();
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(flow_indr_block_cb_register);
+
+void __flow_indr_block_cb_unregister(struct net_device *dev,
+ flow_indr_block_bind_cb_t *cb,
+ void *cb_ident)
+{
+ struct flow_indr_block_cb *indr_block_cb;
+ struct flow_indr_block_dev *indr_dev;
+
+ indr_dev = flow_indr_block_dev_lookup(dev);
+ if (!indr_dev)
+ return;
+
+ indr_block_cb = flow_indr_block_cb_lookup(indr_dev, cb, cb_ident);
+ if (!indr_block_cb)
+ return;
+
+ flow_block_cmd(dev, indr_block_cb->cb, indr_block_cb->cb_priv,
+ FLOW_BLOCK_UNBIND);
+
+ flow_indr_block_cb_del(indr_block_cb);
+ flow_indr_block_dev_put(indr_dev);
+}
+EXPORT_SYMBOL_GPL(__flow_indr_block_cb_unregister);
+
+void flow_indr_block_cb_unregister(struct net_device *dev,
+ flow_indr_block_bind_cb_t *cb,
+ void *cb_ident)
+{
+ rtnl_lock();
+ __flow_indr_block_cb_unregister(dev, cb, cb_ident);
+ rtnl_unlock();
+}
+EXPORT_SYMBOL_GPL(flow_indr_block_cb_unregister);
+
+void flow_indr_block_call(struct net_device *dev,
+ struct flow_block_offload *bo,
+ enum flow_block_command command)
+{
+ struct flow_indr_block_cb *indr_block_cb;
+ struct flow_indr_block_dev *indr_dev;
+
+ indr_dev = flow_indr_block_dev_lookup(dev);
+ if (!indr_dev)
+ return;
+
+ list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list)
+ indr_block_cb->cb(dev, indr_block_cb->cb_priv, TC_SETUP_BLOCK,
+ bo);
+}
+EXPORT_SYMBOL_GPL(flow_indr_block_call);
+
+void flow_indr_add_block_cb(struct flow_indr_block_entry *entry)
+{
+ mutex_lock(&flow_indr_block_cb_lock);
+ list_add_tail(&entry->list, &block_cb_list);
+ mutex_unlock(&flow_indr_block_cb_lock);
+}
+EXPORT_SYMBOL_GPL(flow_indr_add_block_cb);
+
+void flow_indr_del_block_cb(struct flow_indr_block_entry *entry)
+{
+ mutex_lock(&flow_indr_block_cb_lock);
+ list_del(&entry->list);
+ mutex_unlock(&flow_indr_block_cb_lock);
+}
+EXPORT_SYMBOL_GPL(flow_indr_del_block_cb);
+
+static int __init init_flow_indr_rhashtable(void)
+{
+ return rhashtable_init(&indr_setup_block_ht,
+ &flow_indr_setup_block_ht_params);
+}
+subsys_initcall(init_flow_indr_rhashtable);
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index bfe7bdd4c340..80dbf2f4016e 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -48,7 +48,7 @@ struct net_rate_estimator {
u8 intvl_log; /* period : (250ms << intvl_log) */
seqcount_t seq;
- u32 last_packets;
+ u64 last_packets;
u64 last_bytes;
u64 avpps;
@@ -83,7 +83,7 @@ static void est_timer(struct timer_list *t)
brate = (b.bytes - est->last_bytes) << (10 - est->ewma_log - est->intvl_log);
brate -= (est->avbps >> est->ewma_log);
- rate = (u64)(b.packets - est->last_packets) << (10 - est->ewma_log - est->intvl_log);
+ rate = (b.packets - est->last_packets) << (10 - est->ewma_log - est->intvl_log);
rate -= (est->avpps >> est->ewma_log);
write_seqcount_begin(&est->seq);
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index 36888f5e09eb..1d653fbfcf52 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -123,8 +123,7 @@ __gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats,
for_each_possible_cpu(i) {
struct gnet_stats_basic_cpu *bcpu = per_cpu_ptr(cpu, i);
unsigned int start;
- u64 bytes;
- u32 packets;
+ u64 bytes, packets;
do {
start = u64_stats_fetch_begin_irq(&bcpu->syncp);
@@ -176,12 +175,17 @@ ___gnet_stats_copy_basic(const seqcount_t *running,
if (d->tail) {
struct gnet_stats_basic sb;
+ int res;
memset(&sb, 0, sizeof(sb));
sb.bytes = bstats.bytes;
sb.packets = bstats.packets;
- return gnet_stats_copy(d, type, &sb, sizeof(sb),
- TCA_STATS_PAD);
+ res = gnet_stats_copy(d, type, &sb, sizeof(sb), TCA_STATS_PAD);
+ if (res < 0 || sb.packets == bstats.packets)
+ return res;
+ /* emit 64bit stats only if needed */
+ return gnet_stats_copy(d, TCA_STATS_PKT64, &bstats.packets,
+ sizeof(bstats.packets), TCA_STATS_PAD);
}
return 0;
}
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
index f93785e5833c..99a6de52b21d 100644
--- a/net/core/lwt_bpf.c
+++ b/net/core/lwt_bpf.c
@@ -88,11 +88,16 @@ static int bpf_lwt_input_reroute(struct sk_buff *skb)
int err = -EINVAL;
if (skb->protocol == htons(ETH_P_IP)) {
+ struct net_device *dev = skb_dst(skb)->dev;
struct iphdr *iph = ip_hdr(skb);
+ dev_hold(dev);
+ skb_dst_drop(skb);
err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
- iph->tos, skb_dst(skb)->dev);
+ iph->tos, dev);
+ dev_put(dev);
} else if (skb->protocol == htons(ETH_P_IPV6)) {
+ skb_dst_drop(skb);
err = ipv6_stub->ipv6_route_input(skb);
} else {
err = -EAFNOSUPPORT;
@@ -225,9 +230,7 @@ static int bpf_lwt_xmit_reroute(struct sk_buff *skb)
fl6.daddr = iph6->daddr;
fl6.saddr = iph6->saddr;
- err = ipv6_stub->ipv6_dst_lookup(net, skb->sk, &dst, &fl6);
- if (unlikely(err))
- goto err;
+ dst = ipv6_stub->ipv6_dst_lookup_flow(net, skb->sk, &fl6, NULL);
if (IS_ERR(dst)) {
err = PTR_ERR(dst);
goto err;
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index f79e61c570ea..789a73aa7bd8 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -98,9 +98,6 @@ static int neigh_blackhole(struct neighbour *neigh, struct sk_buff *skb)
static void neigh_cleanup_and_release(struct neighbour *neigh)
{
- if (neigh->parms->neigh_cleanup)
- neigh->parms->neigh_cleanup(neigh);
-
trace_neigh_cleanup_and_release(neigh, 0);
__neigh_notify(neigh, RTM_DELNEIGH, 0, 0);
call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh);
@@ -1197,7 +1194,7 @@ static void neigh_update_hhs(struct neighbour *neigh)
if (update) {
hh = &neigh->hh;
- if (hh->hh_len) {
+ if (READ_ONCE(hh->hh_len)) {
write_seqlock_bh(&hh->hh_lock);
update(hh, neigh->dev, neigh->ha);
write_sequnlock_bh(&hh->hh_lock);
@@ -1476,7 +1473,7 @@ int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb)
struct net_device *dev = neigh->dev;
unsigned int seq;
- if (dev->header_ops->cache && !neigh->hh.hh_len)
+ if (dev->header_ops->cache && !READ_ONCE(neigh->hh.hh_len))
neigh_hh_init(neigh);
do {
@@ -2052,8 +2049,8 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl,
goto nla_put_failure;
{
unsigned long now = jiffies;
- unsigned int flush_delta = now - tbl->last_flush;
- unsigned int rand_delta = now - tbl->last_rand;
+ long flush_delta = now - tbl->last_flush;
+ long rand_delta = now - tbl->last_rand;
struct neigh_hash_table *nht;
struct ndt_config ndc = {
.ndtc_key_len = tbl->key_len,
@@ -3033,7 +3030,7 @@ static struct neighbour *neigh_get_first(struct seq_file *seq)
struct net *net = seq_file_net(seq);
struct neigh_hash_table *nht = state->nht;
struct neighbour *n = NULL;
- int bucket = state->bucket;
+ int bucket;
state->flags &= ~NEIGH_SEQ_IS_PNEIGH;
for (bucket = 0; bucket < (1 << nht->hash_shift); bucket++) {
@@ -3293,6 +3290,7 @@ static void *neigh_stat_seq_next(struct seq_file *seq, void *v, loff_t *pos)
*pos = cpu+1;
return per_cpu_ptr(tbl->stats, cpu);
}
+ (*pos)++;
return NULL;
}
diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
index 36347933ec3a..6bbd06f7dc7d 100644
--- a/net/core/net-procfs.c
+++ b/net/core/net-procfs.c
@@ -20,8 +20,8 @@ static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff
struct hlist_head *h;
unsigned int count = 0, offset = get_offset(*pos);
- h = &net->dev_name_head[get_bucket(*pos)];
- hlist_for_each_entry_rcu(dev, h, name_hlist) {
+ h = &net->dev_index_head[get_bucket(*pos)];
+ hlist_for_each_entry_rcu(dev, h, index_hlist) {
if (++count == offset)
return dev;
}
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 865ba6ca16eb..4c826b8bf9b1 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -919,25 +919,30 @@ static int rx_queue_add_kobject(struct net_device *dev, int index)
struct kobject *kobj = &queue->kobj;
int error = 0;
+ /* Kobject_put later will trigger rx_queue_release call which
+ * decreases dev refcount: Take that reference here
+ */
+ dev_hold(queue->dev);
+
kobj->kset = dev->queues_kset;
error = kobject_init_and_add(kobj, &rx_queue_ktype, NULL,
"rx-%u", index);
if (error)
- return error;
-
- dev_hold(queue->dev);
+ goto err;
if (dev->sysfs_rx_queue_group) {
error = sysfs_create_group(kobj, dev->sysfs_rx_queue_group);
- if (error) {
- kobject_put(kobj);
- return error;
- }
+ if (error)
+ goto err;
}
kobject_uevent(kobj, KOBJ_ADD);
return error;
+
+err:
+ kobject_put(kobj);
+ return error;
}
#endif /* CONFIG_SYSFS */
@@ -1457,25 +1462,29 @@ static int netdev_queue_add_kobject(struct net_device *dev, int index)
struct kobject *kobj = &queue->kobj;
int error = 0;
+ /* Kobject_put later will trigger netdev_queue_release call
+ * which decreases dev refcount: Take that reference here
+ */
+ dev_hold(queue->dev);
+
kobj->kset = dev->queues_kset;
error = kobject_init_and_add(kobj, &netdev_queue_ktype, NULL,
"tx-%u", index);
if (error)
- return error;
-
- dev_hold(queue->dev);
+ goto err;
#ifdef CONFIG_BQL
error = sysfs_create_group(kobj, &dql_group);
- if (error) {
- kobject_put(kobj);
- return error;
- }
+ if (error)
+ goto err;
#endif
kobject_uevent(kobj, KOBJ_ADD);
-
return 0;
+
+err:
+ kobject_put(kobj);
+ return error;
}
#endif /* CONFIG_SYSFS */
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index a0e0d298c991..757cc1d084e7 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -211,16 +211,10 @@ static int net_eq_idr(int id, void *net, void *peer)
return 0;
}
-/* Should be called with nsid_lock held. If a new id is assigned, the bool alloc
- * is set to true, thus the caller knows that the new id must be notified via
- * rtnl.
- */
-static int __peernet2id_alloc(struct net *net, struct net *peer, bool *alloc)
+/* Must be called from RCU-critical section or with nsid_lock held */
+static int __peernet2id(const struct net *net, struct net *peer)
{
int id = idr_for_each(&net->netns_ids, net_eq_idr, peer);
- bool alloc_it = *alloc;
-
- *alloc = false;
/* Magic value for id 0. */
if (id == NET_ID_ZERO)
@@ -228,61 +222,60 @@ static int __peernet2id_alloc(struct net *net, struct net *peer, bool *alloc)
if (id > 0)
return id;
- if (alloc_it) {
- id = alloc_netid(net, peer, -1);
- *alloc = true;
- return id >= 0 ? id : NETNSA_NSID_NOT_ASSIGNED;
- }
-
return NETNSA_NSID_NOT_ASSIGNED;
}
-/* should be called with nsid_lock held */
-static int __peernet2id(struct net *net, struct net *peer)
-{
- bool no = false;
-
- return __peernet2id_alloc(net, peer, &no);
-}
-
-static void rtnl_net_notifyid(struct net *net, int cmd, int id);
+static void rtnl_net_notifyid(struct net *net, int cmd, int id, u32 portid,
+ struct nlmsghdr *nlh, gfp_t gfp);
/* This function returns the id of a peer netns. If no id is assigned, one will
* be allocated and returned.
*/
-int peernet2id_alloc(struct net *net, struct net *peer)
+int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp)
{
- bool alloc = false, alive = false;
int id;
if (refcount_read(&net->count) == 0)
return NETNSA_NSID_NOT_ASSIGNED;
- spin_lock_bh(&net->nsid_lock);
- /*
- * When peer is obtained from RCU lists, we may race with
+
+ spin_lock(&net->nsid_lock);
+ id = __peernet2id(net, peer);
+ if (id >= 0) {
+ spin_unlock(&net->nsid_lock);
+ return id;
+ }
+
+ /* When peer is obtained from RCU lists, we may race with
* its cleanup. Check whether it's alive, and this guarantees
* we never hash a peer back to net->netns_ids, after it has
* just been idr_remove()'d from there in cleanup_net().
*/
- if (maybe_get_net(peer))
- alive = alloc = true;
- id = __peernet2id_alloc(net, peer, &alloc);
- spin_unlock_bh(&net->nsid_lock);
- if (alloc && id >= 0)
- rtnl_net_notifyid(net, RTM_NEWNSID, id);
- if (alive)
- put_net(peer);
+ if (!maybe_get_net(peer)) {
+ spin_unlock(&net->nsid_lock);
+ return NETNSA_NSID_NOT_ASSIGNED;
+ }
+
+ id = alloc_netid(net, peer, -1);
+ spin_unlock(&net->nsid_lock);
+
+ put_net(peer);
+ if (id < 0)
+ return NETNSA_NSID_NOT_ASSIGNED;
+
+ rtnl_net_notifyid(net, RTM_NEWNSID, id, 0, NULL, gfp);
+
return id;
}
EXPORT_SYMBOL_GPL(peernet2id_alloc);
/* This function returns, if assigned, the id of a peer netns. */
-int peernet2id(struct net *net, struct net *peer)
+int peernet2id(const struct net *net, struct net *peer)
{
int id;
- spin_lock_bh(&net->nsid_lock);
+ rcu_read_lock();
id = __peernet2id(net, peer);
- spin_unlock_bh(&net->nsid_lock);
+ rcu_read_unlock();
+
return id;
}
EXPORT_SYMBOL(peernet2id);
@@ -290,12 +283,12 @@ EXPORT_SYMBOL(peernet2id);
/* This function returns true is the peer netns has an id assigned into the
* current netns.
*/
-bool peernet_has_id(struct net *net, struct net *peer)
+bool peernet_has_id(const struct net *net, struct net *peer)
{
return peernet2id(net, peer) >= 0;
}
-struct net *get_net_ns_by_id(struct net *net, int id)
+struct net *get_net_ns_by_id(const struct net *net, int id)
{
struct net *peer;
@@ -478,6 +471,7 @@ struct net *copy_net_ns(unsigned long flags,
if (rv < 0) {
put_userns:
+ key_remove_domain(net->key_domain);
put_user_ns(user_ns);
net_drop_ns(net);
dec_ucounts:
@@ -526,19 +520,20 @@ static void unhash_nsid(struct net *net, struct net *last)
for_each_net(tmp) {
int id;
- spin_lock_bh(&tmp->nsid_lock);
+ spin_lock(&tmp->nsid_lock);
id = __peernet2id(tmp, net);
if (id >= 0)
idr_remove(&tmp->netns_ids, id);
- spin_unlock_bh(&tmp->nsid_lock);
+ spin_unlock(&tmp->nsid_lock);
if (id >= 0)
- rtnl_net_notifyid(tmp, RTM_DELNSID, id);
+ rtnl_net_notifyid(tmp, RTM_DELNSID, id, 0, NULL,
+ GFP_KERNEL);
if (tmp == last)
break;
}
- spin_lock_bh(&net->nsid_lock);
+ spin_lock(&net->nsid_lock);
idr_destroy(&net->netns_ids);
- spin_unlock_bh(&net->nsid_lock);
+ spin_unlock(&net->nsid_lock);
}
static LLIST_HEAD(cleanup_list);
@@ -751,9 +746,9 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh,
return PTR_ERR(peer);
}
- spin_lock_bh(&net->nsid_lock);
+ spin_lock(&net->nsid_lock);
if (__peernet2id(net, peer) >= 0) {
- spin_unlock_bh(&net->nsid_lock);
+ spin_unlock(&net->nsid_lock);
err = -EEXIST;
NL_SET_BAD_ATTR(extack, nla);
NL_SET_ERR_MSG(extack,
@@ -762,9 +757,10 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh,
}
err = alloc_netid(net, peer, nsid);
- spin_unlock_bh(&net->nsid_lock);
+ spin_unlock(&net->nsid_lock);
if (err >= 0) {
- rtnl_net_notifyid(net, RTM_NEWNSID, err);
+ rtnl_net_notifyid(net, RTM_NEWNSID, err, NETLINK_CB(skb).portid,
+ nlh, GFP_KERNEL);
err = 0;
} else if (err == -ENOSPC && nsid >= 0) {
err = -EEXIST;
@@ -946,6 +942,7 @@ struct rtnl_net_dump_cb {
int s_idx;
};
+/* Runs in RCU-critical section. */
static int rtnl_net_dumpid_one(int id, void *peer, void *data)
{
struct rtnl_net_dump_cb *net_cb = (struct rtnl_net_dump_cb *)data;
@@ -1030,19 +1027,9 @@ static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb)
goto end;
}
- spin_lock_bh(&net_cb.tgt_net->nsid_lock);
- if (net_cb.fillargs.add_ref &&
- !net_eq(net_cb.ref_net, net_cb.tgt_net) &&
- !spin_trylock_bh(&net_cb.ref_net->nsid_lock)) {
- spin_unlock_bh(&net_cb.tgt_net->nsid_lock);
- err = -EAGAIN;
- goto end;
- }
+ rcu_read_lock();
idr_for_each(&net_cb.tgt_net->netns_ids, rtnl_net_dumpid_one, &net_cb);
- if (net_cb.fillargs.add_ref &&
- !net_eq(net_cb.ref_net, net_cb.tgt_net))
- spin_unlock_bh(&net_cb.ref_net->nsid_lock);
- spin_unlock_bh(&net_cb.tgt_net->nsid_lock);
+ rcu_read_unlock();
cb->args[0] = net_cb.idx;
end:
@@ -1051,16 +1038,19 @@ end:
return err < 0 ? err : skb->len;
}
-static void rtnl_net_notifyid(struct net *net, int cmd, int id)
+static void rtnl_net_notifyid(struct net *net, int cmd, int id, u32 portid,
+ struct nlmsghdr *nlh, gfp_t gfp)
{
struct net_fill_args fillargs = {
+ .portid = portid,
+ .seq = nlh ? nlh->nlmsg_seq : 0,
.cmd = cmd,
.nsid = id,
};
struct sk_buff *msg;
int err = -ENOMEM;
- msg = nlmsg_new(rtnl_net_get_size(), GFP_KERNEL);
+ msg = nlmsg_new(rtnl_net_get_size(), gfp);
if (!msg)
goto out;
@@ -1068,7 +1058,7 @@ static void rtnl_net_notifyid(struct net *net, int cmd, int id)
if (err < 0)
goto err_out;
- rtnl_notify(msg, net, 0, RTNLGRP_NSID, NULL, 0);
+ rtnl_notify(msg, net, portid, RTNLGRP_NSID, nlh, gfp);
return;
err_out:
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 2cf27da1baeb..849380a622ef 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -122,7 +122,7 @@ static void queue_process(struct work_struct *work)
txq = netdev_get_tx_queue(dev, q_index);
HARD_TX_LOCK(dev, txq, smp_processor_id());
if (netif_xmit_frozen_or_stopped(txq) ||
- netpoll_start_xmit(skb, dev, txq) != NETDEV_TX_OK) {
+ !dev_xmit_complete(netpoll_start_xmit(skb, dev, txq))) {
skb_queue_head(&npinfo->txq, skb);
HARD_TX_UNLOCK(dev, txq);
local_irq_restore(flags);
@@ -335,7 +335,7 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,
HARD_TX_UNLOCK(dev, txq);
- if (status == NETDEV_TX_OK)
+ if (dev_xmit_complete(status))
break;
}
@@ -352,7 +352,7 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,
}
- if (status != NETDEV_TX_OK) {
+ if (!dev_xmit_complete(status)) {
skb_queue_tail(&npinfo->txq, skb);
schedule_delayed_work(&npinfo->tx_work,0);
}
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index 256b7954b720..8881dd943dd0 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -93,7 +93,7 @@ static int extend_netdev_table(struct net_device *dev, u32 target_idx)
static u32 netprio_prio(struct cgroup_subsys_state *css, struct net_device *dev)
{
struct netprio_map *map = rcu_dereference_rtnl(dev->priomap);
- int id = css->cgroup->id;
+ int id = css->id;
if (map && id < map->priomap_len)
return map->priomap[id];
@@ -113,7 +113,7 @@ static int netprio_set_prio(struct cgroup_subsys_state *css,
struct net_device *dev, u32 prio)
{
struct netprio_map *map;
- int id = css->cgroup->id;
+ int id = css->id;
int ret;
/* avoid extending priomap for zero writes */
@@ -177,7 +177,7 @@ static void cgrp_css_free(struct cgroup_subsys_state *css)
static u64 read_prioidx(struct cgroup_subsys_state *css, struct cftype *cft)
{
- return css->cgroup->id;
+ return css->id;
}
static int read_priomap(struct seq_file *sf, void *v)
@@ -237,7 +237,7 @@ static void net_prio_attach(struct cgroup_taskset *tset)
struct cgroup_subsys_state *css;
cgroup_taskset_for_each(p, css, tset) {
- void *v = (void *)(unsigned long)css->cgroup->id;
+ void *v = (void *)(unsigned long)css->id;
task_lock(p);
iterate_fd(p->files, 0, update_netprio, v);
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 3272dc7a8c81..10d2b255df5e 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -18,6 +18,9 @@
#include <trace/events/page_pool.h>
+#define DEFER_TIME (msecs_to_jiffies(1000))
+#define DEFER_WARN_INTERVAL (60 * HZ)
+
static int page_pool_init(struct page_pool *pool,
const struct page_pool_params *params)
{
@@ -44,6 +47,21 @@ static int page_pool_init(struct page_pool *pool,
(pool->p.dma_dir != DMA_BIDIRECTIONAL))
return -EINVAL;
+ if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) {
+ /* In order to request DMA-sync-for-device the page
+ * needs to be mapped
+ */
+ if (!(pool->p.flags & PP_FLAG_DMA_MAP))
+ return -EINVAL;
+
+ if (!pool->p.max_len)
+ return -EINVAL;
+
+ /* pool->p.offset has to be set according to the address
+ * offset used by the DMA engine to start copying rx data
+ */
+ }
+
if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
return -ENOMEM;
@@ -61,7 +79,7 @@ static int page_pool_init(struct page_pool *pool,
struct page_pool *page_pool_create(const struct page_pool_params *params)
{
struct page_pool *pool;
- int err = 0;
+ int err;
pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
if (!pool)
@@ -78,47 +96,86 @@ struct page_pool *page_pool_create(const struct page_pool_params *params)
}
EXPORT_SYMBOL(page_pool_create);
-/* fast path */
-static struct page *__page_pool_get_cached(struct page_pool *pool)
+static void __page_pool_return_page(struct page_pool *pool, struct page *page);
+
+noinline
+static struct page *page_pool_refill_alloc_cache(struct page_pool *pool)
{
struct ptr_ring *r = &pool->ring;
struct page *page;
+ int pref_nid; /* preferred NUMA node */
/* Quicker fallback, avoid locks when ring is empty */
if (__ptr_ring_empty(r))
return NULL;
- /* Test for safe-context, caller should provide this guarantee */
- if (likely(in_serving_softirq())) {
- if (likely(pool->alloc.count)) {
- /* Fast-path */
- page = pool->alloc.cache[--pool->alloc.count];
- return page;
- }
- /* Slower-path: Alloc array empty, time to refill
- *
- * Open-coded bulk ptr_ring consumer.
- *
- * Discussion: the ring consumer lock is not really
- * needed due to the softirq/NAPI protection, but
- * later need the ability to reclaim pages on the
- * ring. Thus, keeping the locks.
- */
- spin_lock(&r->consumer_lock);
- while ((page = __ptr_ring_consume(r))) {
- if (pool->alloc.count == PP_ALLOC_CACHE_REFILL)
- break;
+ /* Softirq guarantee CPU and thus NUMA node is stable. This,
+ * assumes CPU refilling driver RX-ring will also run RX-NAPI.
+ */
+#ifdef CONFIG_NUMA
+ pref_nid = (pool->p.nid == NUMA_NO_NODE) ? numa_mem_id() : pool->p.nid;
+#else
+ /* Ignore pool->p.nid setting if !CONFIG_NUMA, helps compiler */
+ pref_nid = numa_mem_id(); /* will be zero like page_to_nid() */
+#endif
+
+ /* Slower-path: Get pages from locked ring queue */
+ spin_lock(&r->consumer_lock);
+
+ /* Refill alloc array, but only if NUMA match */
+ do {
+ page = __ptr_ring_consume(r);
+ if (unlikely(!page))
+ break;
+
+ if (likely(page_to_nid(page) == pref_nid)) {
pool->alloc.cache[pool->alloc.count++] = page;
+ } else {
+ /* NUMA mismatch;
+ * (1) release 1 page to page-allocator and
+ * (2) break out to fallthrough to alloc_pages_node.
+ * This limit stress on page buddy alloactor.
+ */
+ __page_pool_return_page(pool, page);
+ page = NULL;
+ break;
}
- spin_unlock(&r->consumer_lock);
- return page;
+ } while (pool->alloc.count < PP_ALLOC_CACHE_REFILL);
+
+ /* Return last page */
+ if (likely(pool->alloc.count > 0))
+ page = pool->alloc.cache[--pool->alloc.count];
+
+ spin_unlock(&r->consumer_lock);
+ return page;
+}
+
+/* fast path */
+static struct page *__page_pool_get_cached(struct page_pool *pool)
+{
+ struct page *page;
+
+ /* Caller MUST guarantee safe non-concurrent access, e.g. softirq */
+ if (likely(pool->alloc.count)) {
+ /* Fast-path */
+ page = pool->alloc.cache[--pool->alloc.count];
+ } else {
+ page = page_pool_refill_alloc_cache(pool);
}
- /* Slow-path: Get page from locked ring queue */
- page = ptr_ring_consume(&pool->ring);
return page;
}
+static void page_pool_dma_sync_for_device(struct page_pool *pool,
+ struct page *page,
+ unsigned int dma_sync_size)
+{
+ dma_sync_size = min(dma_sync_size, pool->p.max_len);
+ dma_sync_single_range_for_device(pool->p.dev, page->dma_addr,
+ pool->p.offset, dma_sync_size,
+ pool->p.dma_dir);
+}
+
/* slow path */
noinline
static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
@@ -142,7 +199,11 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
*/
/* Cache was empty, do real allocation */
+#ifdef CONFIG_NUMA
page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
+#else
+ page = alloc_pages(gfp, pool->p.order);
+#endif
if (!page)
return NULL;
@@ -163,6 +224,9 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
}
page->dma_addr = dma;
+ if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
+ page_pool_dma_sync_for_device(pool, page, pool->p.max_len);
+
skip_dma_map:
/* Track how many pages are held 'in-flight' */
pool->pages_state_hold_cnt++;
@@ -200,22 +264,14 @@ static s32 page_pool_inflight(struct page_pool *pool)
{
u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
- s32 distance;
-
- distance = _distance(hold_cnt, release_cnt);
-
- trace_page_pool_inflight(pool, distance, hold_cnt, release_cnt);
- return distance;
-}
+ s32 inflight;
-static bool __page_pool_safe_to_destroy(struct page_pool *pool)
-{
- s32 inflight = page_pool_inflight(pool);
+ inflight = _distance(hold_cnt, release_cnt);
- /* The distance should not be able to become negative */
+ trace_page_pool_release(pool, inflight, hold_cnt, release_cnt);
WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight);
- return (inflight == 0);
+ return inflight;
}
/* Cleanup page_pool state from page */
@@ -223,6 +279,7 @@ static void __page_pool_clean_page(struct page_pool *pool,
struct page *page)
{
dma_addr_t dma;
+ int count;
if (!(pool->p.flags & PP_FLAG_DMA_MAP))
goto skip_dma_unmap;
@@ -234,9 +291,11 @@ static void __page_pool_clean_page(struct page_pool *pool,
DMA_ATTR_SKIP_CPU_SYNC);
page->dma_addr = 0;
skip_dma_unmap:
- atomic_inc(&pool->pages_state_release_cnt);
- trace_page_pool_state_release(pool, page,
- atomic_read(&pool->pages_state_release_cnt));
+ /* This may be the last page returned, releasing the pool, so
+ * it is not safe to reference pool afterwards.
+ */
+ count = atomic_inc_return(&pool->pages_state_release_cnt);
+ trace_page_pool_state_release(pool, page, count);
}
/* unmap the page and clean our state */
@@ -290,8 +349,16 @@ static bool __page_pool_recycle_direct(struct page *page,
return true;
}
-void __page_pool_put_page(struct page_pool *pool,
- struct page *page, bool allow_direct)
+/* page is NOT reusable when:
+ * 1) allocated when system is under some pressure. (page_is_pfmemalloc)
+ */
+static bool pool_page_reusable(struct page_pool *pool, struct page *page)
+{
+ return !page_is_pfmemalloc(page);
+}
+
+void __page_pool_put_page(struct page_pool *pool, struct page *page,
+ unsigned int dma_sync_size, bool allow_direct)
{
/* This allocator is optimized for the XDP mode that uses
* one-frame-per-page, but have fallbacks that act like the
@@ -299,9 +366,14 @@ void __page_pool_put_page(struct page_pool *pool,
*
* refcnt == 1 means page_pool owns page, and can recycle it.
*/
- if (likely(page_ref_count(page) == 1)) {
+ if (likely(page_ref_count(page) == 1 &&
+ pool_page_reusable(pool, page))) {
/* Read barrier done in page_ref_count / READ_ONCE */
+ if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
+ page_pool_dma_sync_for_device(pool, page,
+ dma_sync_size);
+
if (allow_direct && in_serving_softirq())
if (__page_pool_recycle_direct(page, pool))
return;
@@ -345,31 +417,10 @@ static void __page_pool_empty_ring(struct page_pool *pool)
}
}
-static void __warn_in_flight(struct page_pool *pool)
+static void page_pool_free(struct page_pool *pool)
{
- u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
- u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
- s32 distance;
-
- distance = _distance(hold_cnt, release_cnt);
-
- /* Drivers should fix this, but only problematic when DMA is used */
- WARN(1, "Still in-flight pages:%d hold:%u released:%u",
- distance, hold_cnt, release_cnt);
-}
-
-void __page_pool_free(struct page_pool *pool)
-{
- /* Only last user actually free/release resources */
- if (!page_pool_put(pool))
- return;
-
- WARN(pool->alloc.count, "API usage violation");
- WARN(!ptr_ring_empty(&pool->ring), "ptr_ring is not empty");
-
- /* Can happen due to forced shutdown */
- if (!__page_pool_safe_to_destroy(pool))
- __warn_in_flight(pool);
+ if (pool->disconnect)
+ pool->disconnect(pool);
ptr_ring_cleanup(&pool->ring, NULL);
@@ -378,15 +429,14 @@ void __page_pool_free(struct page_pool *pool)
kfree(pool);
}
-EXPORT_SYMBOL(__page_pool_free);
-/* Request to shutdown: release pages cached by page_pool, and check
- * for in-flight pages
- */
-bool __page_pool_request_shutdown(struct page_pool *pool)
+static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
{
struct page *page;
+ if (pool->destroy_cnt)
+ return;
+
/* Empty alloc cache, assume caller made sure this is
* no-longer in use, and page_pool_alloc_pages() cannot be
* call concurrently.
@@ -395,12 +445,91 @@ bool __page_pool_request_shutdown(struct page_pool *pool)
page = pool->alloc.cache[--pool->alloc.count];
__page_pool_return_page(pool, page);
}
+}
+
+static void page_pool_scrub(struct page_pool *pool)
+{
+ page_pool_empty_alloc_cache_once(pool);
+ pool->destroy_cnt++;
/* No more consumers should exist, but producers could still
* be in-flight.
*/
__page_pool_empty_ring(pool);
+}
+
+static int page_pool_release(struct page_pool *pool)
+{
+ int inflight;
+
+ page_pool_scrub(pool);
+ inflight = page_pool_inflight(pool);
+ if (!inflight)
+ page_pool_free(pool);
+
+ return inflight;
+}
+
+static void page_pool_release_retry(struct work_struct *wq)
+{
+ struct delayed_work *dwq = to_delayed_work(wq);
+ struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw);
+ int inflight;
+
+ inflight = page_pool_release(pool);
+ if (!inflight)
+ return;
+
+ /* Periodic warning */
+ if (time_after_eq(jiffies, pool->defer_warn)) {
+ int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ;
+
+ pr_warn("%s() stalled pool shutdown %d inflight %d sec\n",
+ __func__, inflight, sec);
+ pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
+ }
+
+ /* Still not ready to be disconnected, retry later */
+ schedule_delayed_work(&pool->release_dw, DEFER_TIME);
+}
+
+void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *))
+{
+ refcount_inc(&pool->user_cnt);
+ pool->disconnect = disconnect;
+}
+
+void page_pool_destroy(struct page_pool *pool)
+{
+ if (!pool)
+ return;
+
+ if (!page_pool_put(pool))
+ return;
+
+ if (!page_pool_release(pool))
+ return;
+
+ pool->defer_start = jiffies;
+ pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
+
+ INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry);
+ schedule_delayed_work(&pool->release_dw, DEFER_TIME);
+}
+EXPORT_SYMBOL(page_pool_destroy);
+
+/* Caller must provide appropriate safe context, e.g. NAPI. */
+void page_pool_update_nid(struct page_pool *pool, int new_nid)
+{
+ struct page *page;
+
+ trace_page_pool_update_nid(pool, new_nid);
+ pool->p.nid = new_nid;
- return __page_pool_safe_to_destroy(pool);
+ /* Flush pool alloc cache, as refill will check NUMA node */
+ while (pool->alloc.count) {
+ page = pool->alloc.cache[--pool->alloc.count];
+ __page_pool_return_page(pool, page);
+ }
}
-EXPORT_SYMBOL(__page_pool_request_shutdown);
+EXPORT_SYMBOL(page_pool_update_nid);
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index bb9915291644..acc849df60b5 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -535,12 +535,12 @@ static int pgctrl_open(struct inode *inode, struct file *file)
return single_open(file, pgctrl_show, PDE_DATA(inode));
}
-static const struct file_operations pktgen_fops = {
- .open = pgctrl_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .write = pgctrl_write,
- .release = single_release,
+static const struct proc_ops pktgen_proc_ops = {
+ .proc_open = pgctrl_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_write = pgctrl_write,
+ .proc_release = single_release,
};
static int pktgen_if_show(struct seq_file *seq, void *v)
@@ -1707,12 +1707,12 @@ static int pktgen_if_open(struct inode *inode, struct file *file)
return single_open(file, pktgen_if_show, PDE_DATA(inode));
}
-static const struct file_operations pktgen_if_fops = {
- .open = pktgen_if_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .write = pktgen_if_write,
- .release = single_release,
+static const struct proc_ops pktgen_if_proc_ops = {
+ .proc_open = pktgen_if_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_write = pktgen_if_write,
+ .proc_release = single_release,
};
static int pktgen_thread_show(struct seq_file *seq, void *v)
@@ -1844,12 +1844,12 @@ static int pktgen_thread_open(struct inode *inode, struct file *file)
return single_open(file, pktgen_thread_show, PDE_DATA(inode));
}
-static const struct file_operations pktgen_thread_fops = {
- .open = pktgen_thread_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .write = pktgen_thread_write,
- .release = single_release,
+static const struct proc_ops pktgen_thread_proc_ops = {
+ .proc_open = pktgen_thread_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_write = pktgen_thread_write,
+ .proc_release = single_release,
};
/* Think find or remove for NN */
@@ -1926,7 +1926,7 @@ static void pktgen_change_name(const struct pktgen_net *pn, struct net_device *d
pkt_dev->entry = proc_create_data(dev->name, 0600,
pn->proc_dir,
- &pktgen_if_fops,
+ &pktgen_if_proc_ops,
pkt_dev);
if (!pkt_dev->entry)
pr_err("can't move proc entry for '%s'\n",
@@ -2156,7 +2156,7 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)
s64 remaining;
struct hrtimer_sleeper t;
- hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
hrtimer_set_expires(&t.timer, spin_until);
remaining = ktime_to_ns(hrtimer_expires_remaining(&t.timer));
@@ -2170,11 +2170,9 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)
end_time = ktime_get();
} while (ktime_compare(end_time, spin_until) < 0);
} else {
- /* see do_nanosleep */
- hrtimer_init_sleeper(&t, current);
do {
set_current_state(TASK_INTERRUPTIBLE);
- hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS);
+ hrtimer_sleeper_start_expires(&t, HRTIMER_MODE_ABS);
if (likely(t.task))
schedule();
@@ -2652,7 +2650,7 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb,
}
get_page(pkt_dev->page);
skb_frag_set_page(skb, i, pkt_dev->page);
- skb_shinfo(skb)->frags[i].page_offset = 0;
+ skb_frag_off_set(&skb_shinfo(skb)->frags[i], 0);
/*last fragment, fill rest of data*/
if (i == (frags - 1))
skb_frag_size_set(&skb_shinfo(skb)->frags[i],
@@ -3406,7 +3404,6 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
HARD_TX_LOCK(odev, txq, smp_processor_id());
if (unlikely(netif_xmit_frozen_or_drv_stopped(txq))) {
- ret = NETDEV_TX_BUSY;
pkt_dev->last_ok = 0;
goto unlock;
}
@@ -3641,7 +3638,7 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)
pkt_dev->clone_skb = pg_clone_skb_d;
pkt_dev->entry = proc_create_data(ifname, 0600, t->net->proc_dir,
- &pktgen_if_fops, pkt_dev);
+ &pktgen_if_proc_ops, pkt_dev);
if (!pkt_dev->entry) {
pr_err("cannot create %s/%s procfs entry\n",
PG_PROC_DIR, ifname);
@@ -3711,7 +3708,7 @@ static int __net_init pktgen_create_thread(int cpu, struct pktgen_net *pn)
t->tsk = p;
pe = proc_create_data(t->tsk->comm, 0600, pn->proc_dir,
- &pktgen_thread_fops, t);
+ &pktgen_thread_proc_ops, t);
if (!pe) {
pr_err("cannot create %s/%s procfs entry\n",
PG_PROC_DIR, t->tsk->comm);
@@ -3796,7 +3793,7 @@ static int __net_init pg_net_init(struct net *net)
pr_warn("cannot create /proc/net/%s\n", PG_PROC_DIR);
return -ENODEV;
}
- pe = proc_create(PGCTRL, 0600, pn->proc_dir, &pktgen_fops);
+ pe = proc_create(PGCTRL, 0600, pn->proc_dir, &pktgen_proc_ops);
if (pe == NULL) {
pr_err("cannot create %s procfs entry\n", PGCTRL);
ret = -EINVAL;
diff --git a/net/core/request_sock.c b/net/core/request_sock.c
index c9bb00008528..f35c2e998406 100644
--- a/net/core/request_sock.c
+++ b/net/core/request_sock.c
@@ -96,7 +96,7 @@ void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,
fastopenq = &inet_csk(lsk)->icsk_accept_queue.fastopenq;
- tcp_sk(sk)->fastopen_rsk = NULL;
+ RCU_INIT_POINTER(tcp_sk(sk)->fastopen_rsk, NULL);
spin_lock_bh(&fastopenq->lock);
fastopenq->qlen--;
tcp_rsk(req)->tfo_listener = false;
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 1ee6460f8275..09c44bf2e1d2 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -980,6 +980,19 @@ static size_t rtnl_xdp_size(void)
return xdp_size;
}
+static size_t rtnl_prop_list_size(const struct net_device *dev)
+{
+ struct netdev_name_node *name_node;
+ size_t size;
+
+ if (list_empty(&dev->name_node->list))
+ return 0;
+ size = nla_total_size(0);
+ list_for_each_entry(name_node, &dev->name_node->list, list)
+ size += nla_total_size(ALTIFNAMSIZ);
+ return size;
+}
+
static noinline size_t if_nlmsg_size(const struct net_device *dev,
u32 ext_filter_mask)
{
@@ -1027,6 +1040,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
+ nla_total_size(4) /* IFLA_CARRIER_DOWN_COUNT */
+ nla_total_size(4) /* IFLA_MIN_MTU */
+ nla_total_size(4) /* IFLA_MAX_MTU */
+ + rtnl_prop_list_size(dev)
+ + nla_total_size(MAX_ADDR_LEN) /* IFLA_PERM_ADDRESS */
+ 0;
}
@@ -1204,6 +1219,8 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
struct ifla_vf_mac vf_mac;
struct ifla_vf_broadcast vf_broadcast;
struct ifla_vf_info ivi;
+ struct ifla_vf_guid node_guid;
+ struct ifla_vf_guid port_guid;
memset(&ivi, 0, sizeof(ivi));
@@ -1225,6 +1242,8 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
return 0;
memset(&vf_vlan_info, 0, sizeof(vf_vlan_info));
+ memset(&node_guid, 0, sizeof(node_guid));
+ memset(&port_guid, 0, sizeof(port_guid));
vf_mac.vf =
vf_vlan.vf =
@@ -1234,7 +1253,9 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
vf_spoofchk.vf =
vf_linkstate.vf =
vf_rss_query_en.vf =
- vf_trust.vf = ivi.vf;
+ vf_trust.vf =
+ node_guid.vf =
+ port_guid.vf = ivi.vf;
memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac));
memcpy(vf_broadcast.broadcast, dev->broadcast, dev->addr_len);
@@ -1270,6 +1291,16 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
nla_put(skb, IFLA_VF_TRUST,
sizeof(vf_trust), &vf_trust))
goto nla_put_vf_failure;
+
+ if (dev->netdev_ops->ndo_get_vf_guid &&
+ !dev->netdev_ops->ndo_get_vf_guid(dev, vfs_num, &node_guid,
+ &port_guid)) {
+ if (nla_put(skb, IFLA_VF_IB_NODE_GUID, sizeof(node_guid),
+ &node_guid) ||
+ nla_put(skb, IFLA_VF_IB_PORT_GUID, sizeof(port_guid),
+ &port_guid))
+ goto nla_put_vf_failure;
+ }
vfvlanlist = nla_nest_start_noflag(skb, IFLA_VF_VLAN_LIST);
if (!vfvlanlist)
goto nla_put_vf_failure;
@@ -1523,7 +1554,7 @@ static noinline_for_stack int nla_put_ifalias(struct sk_buff *skb,
static int rtnl_fill_link_netnsid(struct sk_buff *skb,
const struct net_device *dev,
- struct net *src_net)
+ struct net *src_net, gfp_t gfp)
{
bool put_iflink = false;
@@ -1531,7 +1562,7 @@ static int rtnl_fill_link_netnsid(struct sk_buff *skb,
struct net *link_net = dev->rtnl_link_ops->get_link_net(dev);
if (!net_eq(dev_net(dev), link_net)) {
- int id = peernet2id_alloc(src_net, link_net);
+ int id = peernet2id_alloc(src_net, link_net, gfp);
if (nla_put_s32(skb, IFLA_LINK_NETNSID, id))
return -EMSGSIZE;
@@ -1584,12 +1615,48 @@ static int rtnl_fill_link_af(struct sk_buff *skb,
return 0;
}
+static int rtnl_fill_alt_ifnames(struct sk_buff *skb,
+ const struct net_device *dev)
+{
+ struct netdev_name_node *name_node;
+ int count = 0;
+
+ list_for_each_entry(name_node, &dev->name_node->list, list) {
+ if (nla_put_string(skb, IFLA_ALT_IFNAME, name_node->name))
+ return -EMSGSIZE;
+ count++;
+ }
+ return count;
+}
+
+static int rtnl_fill_prop_list(struct sk_buff *skb,
+ const struct net_device *dev)
+{
+ struct nlattr *prop_list;
+ int ret;
+
+ prop_list = nla_nest_start(skb, IFLA_PROP_LIST);
+ if (!prop_list)
+ return -EMSGSIZE;
+
+ ret = rtnl_fill_alt_ifnames(skb, dev);
+ if (ret <= 0)
+ goto nest_cancel;
+
+ nla_nest_end(skb, prop_list);
+ return 0;
+
+nest_cancel:
+ nla_nest_cancel(skb, prop_list);
+ return ret;
+}
+
static int rtnl_fill_ifinfo(struct sk_buff *skb,
struct net_device *dev, struct net *src_net,
int type, u32 pid, u32 seq, u32 change,
unsigned int flags, u32 ext_filter_mask,
u32 event, int *new_nsid, int new_ifindex,
- int tgt_netnsid)
+ int tgt_netnsid, gfp_t gfp)
{
struct ifinfomsg *ifm;
struct nlmsghdr *nlh;
@@ -1681,7 +1748,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
goto nla_put_failure;
}
- if (rtnl_fill_link_netnsid(skb, dev, src_net))
+ if (rtnl_fill_link_netnsid(skb, dev, src_net, gfp))
goto nla_put_failure;
if (new_nsid &&
@@ -1691,12 +1758,18 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
nla_put_s32(skb, IFLA_NEW_IFINDEX, new_ifindex) < 0)
goto nla_put_failure;
+ if (memchr_inv(dev->perm_addr, '\0', dev->addr_len) &&
+ nla_put(skb, IFLA_PERM_ADDRESS, dev->addr_len, dev->perm_addr))
+ goto nla_put_failure;
rcu_read_lock();
if (rtnl_fill_link_af(skb, dev, ext_filter_mask))
goto nla_put_failure_rcu;
rcu_read_unlock();
+ if (rtnl_fill_prop_list(skb, dev))
+ goto nla_put_failure;
+
nlmsg_end(skb, nlh);
return 0;
@@ -1750,6 +1823,10 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
[IFLA_CARRIER_DOWN_COUNT] = { .type = NLA_U32 },
[IFLA_MIN_MTU] = { .type = NLA_U32 },
[IFLA_MAX_MTU] = { .type = NLA_U32 },
+ [IFLA_PROP_LIST] = { .type = NLA_NESTED },
+ [IFLA_ALT_IFNAME] = { .type = NLA_STRING,
+ .len = ALTIFNAMSIZ - 1 },
+ [IFLA_PERM_ADDRESS] = { .type = NLA_REJECT },
};
static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
@@ -2001,7 +2078,7 @@ walk_entries:
NETLINK_CB(cb->skb).portid,
nlh->nlmsg_seq, 0, flags,
ext_filter_mask, 0, NULL, 0,
- netnsid);
+ netnsid, GFP_KERNEL);
if (err < 0) {
if (likely(skb->len))
@@ -2195,6 +2272,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
if (tb[IFLA_VF_MAC]) {
struct ifla_vf_mac *ivm = nla_data(tb[IFLA_VF_MAC]);
+ if (ivm->vf >= INT_MAX)
+ return -EINVAL;
err = -EOPNOTSUPP;
if (ops->ndo_set_vf_mac)
err = ops->ndo_set_vf_mac(dev, ivm->vf,
@@ -2206,6 +2285,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
if (tb[IFLA_VF_VLAN]) {
struct ifla_vf_vlan *ivv = nla_data(tb[IFLA_VF_VLAN]);
+ if (ivv->vf >= INT_MAX)
+ return -EINVAL;
err = -EOPNOTSUPP;
if (ops->ndo_set_vf_vlan)
err = ops->ndo_set_vf_vlan(dev, ivv->vf, ivv->vlan,
@@ -2238,6 +2319,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
if (len == 0)
return -EINVAL;
+ if (ivvl[0]->vf >= INT_MAX)
+ return -EINVAL;
err = ops->ndo_set_vf_vlan(dev, ivvl[0]->vf, ivvl[0]->vlan,
ivvl[0]->qos, ivvl[0]->vlan_proto);
if (err < 0)
@@ -2248,6 +2331,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
struct ifla_vf_tx_rate *ivt = nla_data(tb[IFLA_VF_TX_RATE]);
struct ifla_vf_info ivf;
+ if (ivt->vf >= INT_MAX)
+ return -EINVAL;
err = -EOPNOTSUPP;
if (ops->ndo_get_vf_config)
err = ops->ndo_get_vf_config(dev, ivt->vf, &ivf);
@@ -2266,6 +2351,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
if (tb[IFLA_VF_RATE]) {
struct ifla_vf_rate *ivt = nla_data(tb[IFLA_VF_RATE]);
+ if (ivt->vf >= INT_MAX)
+ return -EINVAL;
err = -EOPNOTSUPP;
if (ops->ndo_set_vf_rate)
err = ops->ndo_set_vf_rate(dev, ivt->vf,
@@ -2278,6 +2365,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
if (tb[IFLA_VF_SPOOFCHK]) {
struct ifla_vf_spoofchk *ivs = nla_data(tb[IFLA_VF_SPOOFCHK]);
+ if (ivs->vf >= INT_MAX)
+ return -EINVAL;
err = -EOPNOTSUPP;
if (ops->ndo_set_vf_spoofchk)
err = ops->ndo_set_vf_spoofchk(dev, ivs->vf,
@@ -2289,6 +2378,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
if (tb[IFLA_VF_LINK_STATE]) {
struct ifla_vf_link_state *ivl = nla_data(tb[IFLA_VF_LINK_STATE]);
+ if (ivl->vf >= INT_MAX)
+ return -EINVAL;
err = -EOPNOTSUPP;
if (ops->ndo_set_vf_link_state)
err = ops->ndo_set_vf_link_state(dev, ivl->vf,
@@ -2302,6 +2393,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
err = -EOPNOTSUPP;
ivrssq_en = nla_data(tb[IFLA_VF_RSS_QUERY_EN]);
+ if (ivrssq_en->vf >= INT_MAX)
+ return -EINVAL;
if (ops->ndo_set_vf_rss_query_en)
err = ops->ndo_set_vf_rss_query_en(dev, ivrssq_en->vf,
ivrssq_en->setting);
@@ -2312,6 +2405,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
if (tb[IFLA_VF_TRUST]) {
struct ifla_vf_trust *ivt = nla_data(tb[IFLA_VF_TRUST]);
+ if (ivt->vf >= INT_MAX)
+ return -EINVAL;
err = -EOPNOTSUPP;
if (ops->ndo_set_vf_trust)
err = ops->ndo_set_vf_trust(dev, ivt->vf, ivt->setting);
@@ -2322,15 +2417,18 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
if (tb[IFLA_VF_IB_NODE_GUID]) {
struct ifla_vf_guid *ivt = nla_data(tb[IFLA_VF_IB_NODE_GUID]);
+ if (ivt->vf >= INT_MAX)
+ return -EINVAL;
if (!ops->ndo_set_vf_guid)
return -EOPNOTSUPP;
-
return handle_vf_guid(dev, ivt, IFLA_VF_IB_NODE_GUID);
}
if (tb[IFLA_VF_IB_PORT_GUID]) {
struct ifla_vf_guid *ivt = nla_data(tb[IFLA_VF_IB_PORT_GUID]);
+ if (ivt->vf >= INT_MAX)
+ return -EINVAL;
if (!ops->ndo_set_vf_guid)
return -EOPNOTSUPP;
@@ -2355,6 +2453,7 @@ static int do_set_master(struct net_device *dev, int ifindex,
err = ops->ndo_del_slave(upper_dev, dev);
if (err)
return err;
+ netdev_update_lockdep_key(dev);
} else {
return -EOPNOTSUPP;
}
@@ -2722,6 +2821,26 @@ errout:
return err;
}
+static struct net_device *rtnl_dev_get(struct net *net,
+ struct nlattr *ifname_attr,
+ struct nlattr *altifname_attr,
+ char *ifname)
+{
+ char buffer[ALTIFNAMSIZ];
+
+ if (!ifname) {
+ ifname = buffer;
+ if (ifname_attr)
+ nla_strlcpy(ifname, ifname_attr, IFNAMSIZ);
+ else if (altifname_attr)
+ nla_strlcpy(ifname, altifname_attr, ALTIFNAMSIZ);
+ else
+ return NULL;
+ }
+
+ return __dev_get_by_name(net, ifname);
+}
+
static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
@@ -2750,8 +2869,8 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
ifm = nlmsg_data(nlh);
if (ifm->ifi_index > 0)
dev = __dev_get_by_index(net, ifm->ifi_index);
- else if (tb[IFLA_IFNAME])
- dev = __dev_get_by_name(net, ifname);
+ else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME])
+ dev = rtnl_dev_get(net, NULL, tb[IFLA_ALT_IFNAME], ifname);
else
goto errout;
@@ -2824,7 +2943,6 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
struct net *tgt_net = net;
struct net_device *dev = NULL;
struct ifinfomsg *ifm;
- char ifname[IFNAMSIZ];
struct nlattr *tb[IFLA_MAX+1];
int err;
int netnsid = -1;
@@ -2838,9 +2956,6 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
if (err < 0)
return err;
- if (tb[IFLA_IFNAME])
- nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
-
if (tb[IFLA_TARGET_NETNSID]) {
netnsid = nla_get_s32(tb[IFLA_TARGET_NETNSID]);
tgt_net = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, netnsid);
@@ -2852,8 +2967,9 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
ifm = nlmsg_data(nlh);
if (ifm->ifi_index > 0)
dev = __dev_get_by_index(tgt_net, ifm->ifi_index);
- else if (tb[IFLA_IFNAME])
- dev = __dev_get_by_name(tgt_net, ifname);
+ else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME])
+ dev = rtnl_dev_get(net, tb[IFLA_IFNAME],
+ tb[IFLA_ALT_IFNAME], NULL);
else if (tb[IFLA_GROUP])
err = rtnl_group_dellink(tgt_net, nla_get_u32(tb[IFLA_GROUP]));
else
@@ -2937,8 +3053,17 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname,
dev->rtnl_link_ops = ops;
dev->rtnl_link_state = RTNL_LINK_INITIALIZING;
- if (tb[IFLA_MTU])
- dev->mtu = nla_get_u32(tb[IFLA_MTU]);
+ if (tb[IFLA_MTU]) {
+ u32 mtu = nla_get_u32(tb[IFLA_MTU]);
+ int err;
+
+ err = dev_validate_mtu(dev, mtu, extack);
+ if (err) {
+ free_netdev(dev);
+ return ERR_PTR(err);
+ }
+ dev->mtu = mtu;
+ }
if (tb[IFLA_ADDRESS]) {
memcpy(dev->dev_addr, nla_data(tb[IFLA_ADDRESS]),
nla_len(tb[IFLA_ADDRESS]));
@@ -3024,12 +3149,10 @@ replay:
ifm = nlmsg_data(nlh);
if (ifm->ifi_index > 0)
dev = __dev_get_by_index(net, ifm->ifi_index);
- else {
- if (ifname[0])
- dev = __dev_get_by_name(net, ifname);
- else
- dev = NULL;
- }
+ else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME])
+ dev = rtnl_dev_get(net, NULL, tb[IFLA_ALT_IFNAME], ifname);
+ else
+ dev = NULL;
if (dev) {
master_dev = netdev_master_upper_dev_get(dev);
@@ -3291,6 +3414,7 @@ static int rtnl_valid_getlink_req(struct sk_buff *skb,
switch (i) {
case IFLA_IFNAME:
+ case IFLA_ALT_IFNAME:
case IFLA_EXT_MASK:
case IFLA_TARGET_NETNSID:
break;
@@ -3309,7 +3433,6 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh,
struct net *net = sock_net(skb->sk);
struct net *tgt_net = net;
struct ifinfomsg *ifm;
- char ifname[IFNAMSIZ];
struct nlattr *tb[IFLA_MAX+1];
struct net_device *dev = NULL;
struct sk_buff *nskb;
@@ -3332,9 +3455,6 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh,
return PTR_ERR(tgt_net);
}
- if (tb[IFLA_IFNAME])
- nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
-
if (tb[IFLA_EXT_MASK])
ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]);
@@ -3342,8 +3462,9 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh,
ifm = nlmsg_data(nlh);
if (ifm->ifi_index > 0)
dev = __dev_get_by_index(tgt_net, ifm->ifi_index);
- else if (tb[IFLA_IFNAME])
- dev = __dev_get_by_name(tgt_net, ifname);
+ else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME])
+ dev = rtnl_dev_get(tgt_net, tb[IFLA_IFNAME],
+ tb[IFLA_ALT_IFNAME], NULL);
else
goto out;
@@ -3359,7 +3480,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh,
err = rtnl_fill_ifinfo(nskb, dev, net,
RTM_NEWLINK, NETLINK_CB(skb).portid,
nlh->nlmsg_seq, 0, 0, ext_filter_mask,
- 0, NULL, 0, netnsid);
+ 0, NULL, 0, netnsid, GFP_KERNEL);
if (err < 0) {
/* -EMSGSIZE implies BUG in if_nlmsg_size */
WARN_ON(err == -EMSGSIZE);
@@ -3373,6 +3494,100 @@ out:
return err;
}
+static int rtnl_alt_ifname(int cmd, struct net_device *dev, struct nlattr *attr,
+ bool *changed, struct netlink_ext_ack *extack)
+{
+ char *alt_ifname;
+ int err;
+
+ err = nla_validate(attr, attr->nla_len, IFLA_MAX, ifla_policy, extack);
+ if (err)
+ return err;
+
+ alt_ifname = nla_data(attr);
+ if (cmd == RTM_NEWLINKPROP) {
+ alt_ifname = kstrdup(alt_ifname, GFP_KERNEL);
+ if (!alt_ifname)
+ return -ENOMEM;
+ err = netdev_name_node_alt_create(dev, alt_ifname);
+ if (err) {
+ kfree(alt_ifname);
+ return err;
+ }
+ } else if (cmd == RTM_DELLINKPROP) {
+ err = netdev_name_node_alt_destroy(dev, alt_ifname);
+ if (err)
+ return err;
+ } else {
+ WARN_ON(1);
+ return 0;
+ }
+
+ *changed = true;
+ return 0;
+}
+
+static int rtnl_linkprop(int cmd, struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ struct nlattr *tb[IFLA_MAX + 1];
+ struct net_device *dev;
+ struct ifinfomsg *ifm;
+ bool changed = false;
+ struct nlattr *attr;
+ int err, rem;
+
+ err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack);
+ if (err)
+ return err;
+
+ err = rtnl_ensure_unique_netns(tb, extack, true);
+ if (err)
+ return err;
+
+ ifm = nlmsg_data(nlh);
+ if (ifm->ifi_index > 0)
+ dev = __dev_get_by_index(net, ifm->ifi_index);
+ else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME])
+ dev = rtnl_dev_get(net, tb[IFLA_IFNAME],
+ tb[IFLA_ALT_IFNAME], NULL);
+ else
+ return -EINVAL;
+
+ if (!dev)
+ return -ENODEV;
+
+ if (!tb[IFLA_PROP_LIST])
+ return 0;
+
+ nla_for_each_nested(attr, tb[IFLA_PROP_LIST], rem) {
+ switch (nla_type(attr)) {
+ case IFLA_ALT_IFNAME:
+ err = rtnl_alt_ifname(cmd, dev, attr, &changed, extack);
+ if (err)
+ return err;
+ break;
+ }
+ }
+
+ if (changed)
+ netdev_state_change(dev);
+ return 0;
+}
+
+static int rtnl_newlinkprop(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ return rtnl_linkprop(RTM_NEWLINKPROP, skb, nlh, extack);
+}
+
+static int rtnl_dellinkprop(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ return rtnl_linkprop(RTM_DELLINKPROP, skb, nlh, extack);
+}
+
static u16 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
@@ -3471,7 +3686,7 @@ struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev,
err = rtnl_fill_ifinfo(skb, dev, dev_net(dev),
type, 0, 0, change, 0, 0, event,
- new_nsid, new_ifindex, -1);
+ new_nsid, new_ifindex, -1, flags);
if (err < 0) {
/* -EMSGSIZE implies BUG in if_nlmsg_size() */
WARN_ON(err == -EMSGSIZE);
@@ -3916,7 +4131,7 @@ static int valid_fdb_dump_strict(const struct nlmsghdr *nlh,
ndm = nlmsg_data(nlh);
if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state ||
ndm->ndm_flags || ndm->ndm_type) {
- NL_SET_ERR_MSG(extack, "Invalid values in header for fbd dump request");
+ NL_SET_ERR_MSG(extack, "Invalid values in header for fdb dump request");
return -EINVAL;
}
@@ -5331,6 +5546,9 @@ void __init rtnetlink_init(void)
rtnl_register(PF_UNSPEC, RTM_GETROUTE, NULL, rtnl_dump_all, 0);
rtnl_register(PF_UNSPEC, RTM_GETNETCONF, NULL, rtnl_dump_all, 0);
+ rtnl_register(PF_UNSPEC, RTM_NEWLINKPROP, rtnl_newlinkprop, NULL, 0);
+ rtnl_register(PF_UNSPEC, RTM_DELLINKPROP, rtnl_dellinkprop, NULL, 0);
+
rtnl_register(PF_BRIDGE, RTM_NEWNEIGH, rtnl_fdb_add, NULL, 0);
rtnl_register(PF_BRIDGE, RTM_DELNEIGH, rtnl_fdb_del, NULL, 0);
rtnl_register(PF_BRIDGE, RTM_GETNEIGH, rtnl_fdb_get, rtnl_fdb_dump, 0);
diff --git a/net/core/scm.c b/net/core/scm.c
index 31a38239c92f..dc6fed1f221c 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -268,8 +268,10 @@ void put_cmsg_scm_timestamping(struct msghdr *msg, struct scm_timestamping_inter
struct scm_timestamping tss;
int i;
- for (i = 0; i < ARRAY_SIZE(tss.ts); i++)
- tss.ts[i] = timespec64_to_timespec(tss_internal->ts[i]);
+ for (i = 0; i < ARRAY_SIZE(tss.ts); i++) {
+ tss.ts[i].tv_sec = tss_internal->ts[i].tv_sec;
+ tss.ts[i].tv_nsec = tss_internal->ts[i].tv_nsec;
+ }
put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPING_OLD, sizeof(tss), &tss);
}
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 0338820ee0ec..864cb9e9622f 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -68,6 +68,7 @@
#include <net/ip6_checksum.h>
#include <net/xfrm.h>
#include <net/mpls.h>
+#include <net/mptcp.h>
#include <linux/uaccess.h>
#include <trace/events/skb.h>
@@ -785,7 +786,7 @@ void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt)
struct page *p;
u8 *vaddr;
- skb_frag_foreach_page(frag, frag->page_offset,
+ skb_frag_foreach_page(frag, skb_frag_off(frag),
skb_frag_size(frag), p, p_off, p_len,
copied) {
seg_len = min_t(int, p_len, len);
@@ -1375,7 +1376,7 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
struct page *p;
u8 *vaddr;
- skb_frag_foreach_page(f, f->page_offset, skb_frag_size(f),
+ skb_frag_foreach_page(f, skb_frag_off(f), skb_frag_size(f),
p, p_off, p_len, copied) {
u32 copy, done = 0;
vaddr = kmap_atomic(p);
@@ -2144,10 +2145,12 @@ pull_pages:
skb_frag_unref(skb, i);
eat -= size;
} else {
- skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[k];
+
+ *frag = skb_shinfo(skb)->frags[i];
if (eat) {
- skb_shinfo(skb)->frags[k].page_offset += eat;
- skb_frag_size_sub(&skb_shinfo(skb)->frags[k], eat);
+ skb_frag_off_add(frag, eat);
+ skb_frag_size_sub(frag, eat);
if (!i)
goto end;
eat = 0;
@@ -2219,7 +2222,7 @@ int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
copy = len;
skb_frag_foreach_page(f,
- f->page_offset + offset - start,
+ skb_frag_off(f) + offset - start,
copy, p, p_off, p_len, copied) {
vaddr = kmap_atomic(p);
memcpy(to + copied, vaddr + p_off, p_len);
@@ -2395,7 +2398,7 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
const skb_frag_t *f = &skb_shinfo(skb)->frags[seg];
if (__splice_segment(skb_frag_page(f),
- f->page_offset, skb_frag_size(f),
+ skb_frag_off(f), skb_frag_size(f),
offset, len, spd, false, sk, pipe))
return true;
}
@@ -2485,20 +2488,20 @@ do_frag_list:
for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags; fragidx++) {
skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx];
- if (offset < frag->size)
+ if (offset < skb_frag_size(frag))
break;
- offset -= frag->size;
+ offset -= skb_frag_size(frag);
}
for (; len && fragidx < skb_shinfo(skb)->nr_frags; fragidx++) {
skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx];
- slen = min_t(size_t, len, frag->size - offset);
+ slen = min_t(size_t, len, skb_frag_size(frag) - offset);
while (slen) {
- ret = kernel_sendpage_locked(sk, frag->page.p,
- frag->page_offset + offset,
+ ret = kernel_sendpage_locked(sk, skb_frag_page(frag),
+ skb_frag_off(frag) + offset,
slen, MSG_DONTWAIT);
if (ret <= 0)
goto error;
@@ -2580,7 +2583,7 @@ int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len)
copy = len;
skb_frag_foreach_page(frag,
- frag->page_offset + offset - start,
+ skb_frag_off(frag) + offset - start,
copy, p, p_off, p_len, copied) {
vaddr = kmap_atomic(p);
memcpy(vaddr + p_off, from + copied, p_len);
@@ -2660,7 +2663,7 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
copy = len;
skb_frag_foreach_page(frag,
- frag->page_offset + offset - start,
+ skb_frag_off(frag) + offset - start,
copy, p, p_off, p_len, copied) {
vaddr = kmap_atomic(p);
csum2 = INDIRECT_CALL_1(ops->update,
@@ -2759,7 +2762,7 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
copy = len;
skb_frag_foreach_page(frag,
- frag->page_offset + offset - start,
+ skb_frag_off(frag) + offset - start,
copy, p, p_off, p_len, copied) {
vaddr = kmap_atomic(p);
csum2 = csum_partial_copy_nocheck(vaddr + p_off,
@@ -2975,11 +2978,15 @@ skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen)
skb_zerocopy_clone(to, from, GFP_ATOMIC);
for (i = 0; i < skb_shinfo(from)->nr_frags; i++) {
+ int size;
+
if (!len)
break;
skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i];
- skb_shinfo(to)->frags[j].size = min_t(int, skb_shinfo(to)->frags[j].size, len);
- len -= skb_shinfo(to)->frags[j].size;
+ size = min_t(int, skb_frag_size(&skb_shinfo(to)->frags[j]),
+ len);
+ skb_frag_size_set(&skb_shinfo(to)->frags[j], size);
+ len -= size;
skb_frag_ref(to, j);
j++;
}
@@ -3230,7 +3237,7 @@ static inline void skb_split_no_header(struct sk_buff *skb,
* 2. Split is accurately. We make this.
*/
skb_frag_ref(skb, i);
- skb_shinfo(skb1)->frags[0].page_offset += len - pos;
+ skb_frag_off_add(&skb_shinfo(skb1)->frags[0], len - pos);
skb_frag_size_sub(&skb_shinfo(skb1)->frags[0], len - pos);
skb_frag_size_set(&skb_shinfo(skb)->frags[i], len - pos);
skb_shinfo(skb)->nr_frags++;
@@ -3293,7 +3300,7 @@ static int skb_prepare_for_shift(struct sk_buff *skb)
int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
{
int from, to, merge, todo;
- struct skb_frag_struct *fragfrom, *fragto;
+ skb_frag_t *fragfrom, *fragto;
BUG_ON(shiftlen > skb->len);
@@ -3312,7 +3319,7 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
*/
if (!to ||
!skb_can_coalesce(tgt, to, skb_frag_page(fragfrom),
- fragfrom->page_offset)) {
+ skb_frag_off(fragfrom))) {
merge = -1;
} else {
merge = to - 1;
@@ -3329,7 +3336,7 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
skb_frag_size_add(fragto, shiftlen);
skb_frag_size_sub(fragfrom, shiftlen);
- fragfrom->page_offset += shiftlen;
+ skb_frag_off_add(fragfrom, shiftlen);
goto onlymerged;
}
@@ -3360,11 +3367,11 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
} else {
__skb_frag_ref(fragfrom);
- fragto->page = fragfrom->page;
- fragto->page_offset = fragfrom->page_offset;
+ skb_frag_page_copy(fragto, fragfrom);
+ skb_frag_off_copy(fragto, fragfrom);
skb_frag_size_set(fragto, todo);
- fragfrom->page_offset += todo;
+ skb_frag_off_add(fragfrom, todo);
skb_frag_size_sub(fragfrom, todo);
todo = 0;
@@ -3489,7 +3496,7 @@ next_skb:
if (!st->frag_data)
st->frag_data = kmap_atomic(skb_frag_page(frag));
- *data = (u8 *) st->frag_data + frag->page_offset +
+ *data = (u8 *) st->frag_data + skb_frag_off(frag) +
(abs_offset - st->stepped_offset);
return block_limit - abs_offset;
@@ -3625,13 +3632,104 @@ static inline skb_frag_t skb_head_frag_to_page_desc(struct sk_buff *frag_skb)
struct page *page;
page = virt_to_head_page(frag_skb->head);
- head_frag.page.p = page;
- head_frag.page_offset = frag_skb->data -
- (unsigned char *)page_address(page);
- head_frag.size = skb_headlen(frag_skb);
+ __skb_frag_set_page(&head_frag, page);
+ skb_frag_off_set(&head_frag, frag_skb->data -
+ (unsigned char *)page_address(page));
+ skb_frag_size_set(&head_frag, skb_headlen(frag_skb));
return head_frag;
}
+struct sk_buff *skb_segment_list(struct sk_buff *skb,
+ netdev_features_t features,
+ unsigned int offset)
+{
+ struct sk_buff *list_skb = skb_shinfo(skb)->frag_list;
+ unsigned int tnl_hlen = skb_tnl_header_len(skb);
+ unsigned int delta_truesize = 0;
+ unsigned int delta_len = 0;
+ struct sk_buff *tail = NULL;
+ struct sk_buff *nskb;
+
+ skb_push(skb, -skb_network_offset(skb) + offset);
+
+ skb_shinfo(skb)->frag_list = NULL;
+
+ do {
+ nskb = list_skb;
+ list_skb = list_skb->next;
+
+ if (!tail)
+ skb->next = nskb;
+ else
+ tail->next = nskb;
+
+ tail = nskb;
+
+ delta_len += nskb->len;
+ delta_truesize += nskb->truesize;
+
+ skb_push(nskb, -skb_network_offset(nskb) + offset);
+
+ __copy_skb_header(nskb, skb);
+
+ skb_headers_offset_update(nskb, skb_headroom(nskb) - skb_headroom(skb));
+ skb_copy_from_linear_data_offset(skb, -tnl_hlen,
+ nskb->data - tnl_hlen,
+ offset + tnl_hlen);
+
+ if (skb_needs_linearize(nskb, features) &&
+ __skb_linearize(nskb))
+ goto err_linearize;
+
+ } while (list_skb);
+
+ skb->truesize = skb->truesize - delta_truesize;
+ skb->data_len = skb->data_len - delta_len;
+ skb->len = skb->len - delta_len;
+
+ skb_gso_reset(skb);
+
+ skb->prev = tail;
+
+ if (skb_needs_linearize(skb, features) &&
+ __skb_linearize(skb))
+ goto err_linearize;
+
+ skb_get(skb);
+
+ return skb;
+
+err_linearize:
+ kfree_skb_list(skb->next);
+ skb->next = NULL;
+ return ERR_PTR(-ENOMEM);
+}
+EXPORT_SYMBOL_GPL(skb_segment_list);
+
+int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb)
+{
+ if (unlikely(p->len + skb->len >= 65536))
+ return -E2BIG;
+
+ if (NAPI_GRO_CB(p)->last == p)
+ skb_shinfo(p)->frag_list = skb;
+ else
+ NAPI_GRO_CB(p)->last->next = skb;
+
+ skb_pull(skb, skb_gro_offset(skb));
+
+ NAPI_GRO_CB(p)->last = skb;
+ NAPI_GRO_CB(p)->count++;
+ p->data_len += skb->len;
+ p->truesize += skb->truesize;
+ p->len += skb->len;
+
+ NAPI_GRO_CB(skb)->same_flow = 1;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(skb_gro_receive_list);
+
/**
* skb_segment - Perform protocol segmentation on skb.
* @head_skb: buffer to segment
@@ -3664,6 +3762,25 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
int pos;
int dummy;
+ if (list_skb && !list_skb->head_frag && skb_headlen(list_skb) &&
+ (skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY)) {
+ /* gso_size is untrusted, and we have a frag_list with a linear
+ * non head_frag head.
+ *
+ * (we assume checking the first list_skb member suffices;
+ * i.e if either of the list_skb members have non head_frag
+ * head, then the first one has too).
+ *
+ * If head_skb's headlen does not fit requested gso_size, it
+ * means that the frag_list members do NOT terminate on exact
+ * gso_size boundaries. Hence we cannot perform skb_frag_t page
+ * sharing. Therefore we must fallback to copying the frag_list
+ * skbs; we do so by disabling SG.
+ */
+ if (mss != GSO_BY_FRAGS && mss != skb_headlen(head_skb))
+ features &= ~NETIF_F_SG;
+ }
+
__skb_push(head_skb, doffset);
proto = skb_network_protocol(head_skb, &dummy);
if (unlikely(!proto))
@@ -3871,7 +3988,7 @@ normal:
size = skb_frag_size(nskb_frag);
if (pos < offset) {
- nskb_frag->page_offset += offset - pos;
+ skb_frag_off_add(nskb_frag, offset - pos);
skb_frag_size_sub(nskb_frag, offset - pos);
}
@@ -3992,7 +4109,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
*--frag = *--frag2;
} while (--i);
- frag->page_offset += offset;
+ skb_frag_off_add(frag, offset);
skb_frag_size_sub(frag, offset);
/* all fragments truesize : remove (head size + sk_buff) */
@@ -4021,8 +4138,8 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags;
- frag->page.p = page;
- frag->page_offset = first_offset;
+ __skb_frag_set_page(frag, page);
+ skb_frag_off_set(frag, first_offset);
skb_frag_size_set(frag, first_size);
memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags);
@@ -4038,7 +4155,7 @@ merge:
if (offset > headlen) {
unsigned int eat = offset - headlen;
- skbinfo->frags[0].page_offset += eat;
+ skb_frag_off_add(&skbinfo->frags[0], eat);
skb_frag_size_sub(&skbinfo->frags[0], eat);
skb->data_len -= eat;
skb->len -= eat;
@@ -4081,6 +4198,12 @@ static const u8 skb_ext_type_len[] = {
#ifdef CONFIG_XFRM
[SKB_EXT_SEC_PATH] = SKB_EXT_CHUNKSIZEOF(struct sec_path),
#endif
+#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
+ [TC_SKB_EXT] = SKB_EXT_CHUNKSIZEOF(struct tc_skb_ext),
+#endif
+#if IS_ENABLED(CONFIG_MPTCP)
+ [SKB_EXT_MPTCP] = SKB_EXT_CHUNKSIZEOF(struct mptcp_ext),
+#endif
};
static __always_inline unsigned int skb_ext_total_length(void)
@@ -4092,6 +4215,12 @@ static __always_inline unsigned int skb_ext_total_length(void)
#ifdef CONFIG_XFRM
skb_ext_type_len[SKB_EXT_SEC_PATH] +
#endif
+#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
+ skb_ext_type_len[TC_SKB_EXT] +
+#endif
+#if IS_ENABLED(CONFIG_MPTCP)
+ skb_ext_type_len[SKB_EXT_MPTCP] +
+#endif
0;
}
@@ -4163,7 +4292,7 @@ __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len,
if (copy > len)
copy = len;
sg_set_page(&sg[elt], skb_frag_page(frag), copy,
- frag->page_offset+offset-start);
+ skb_frag_off(frag) + offset - start);
elt++;
if (!(len -= copy))
return elt;
@@ -4384,7 +4513,7 @@ static void skb_set_err_queue(struct sk_buff *skb)
int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
{
if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
- (unsigned int)sk->sk_rcvbuf)
+ (unsigned int)READ_ONCE(sk->sk_rcvbuf))
return -ENOMEM;
skb_orphan(skb);
@@ -5088,8 +5217,8 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet)
skb->skb_iif = 0;
skb->ignore_df = 0;
skb_dst_drop(skb);
- secpath_reset(skb);
- nf_reset(skb);
+ skb_ext_reset(skb);
+ nf_reset_ct(skb);
nf_reset_trace(skb);
#ifdef CONFIG_NET_SWITCHDEV
@@ -5441,17 +5570,22 @@ static void skb_mod_eth_type(struct sk_buff *skb, struct ethhdr *hdr,
}
/**
- * skb_mpls_push() - push a new MPLS header after the mac header
+ * skb_mpls_push() - push a new MPLS header after mac_len bytes from start of
+ * the packet
*
* @skb: buffer
* @mpls_lse: MPLS label stack entry to push
* @mpls_proto: ethertype of the new MPLS header (expects 0x8847 or 0x8848)
+ * @mac_len: length of the MAC header
+ * @ethernet: flag to indicate if the resulting packet after skb_mpls_push is
+ * ethernet
*
* Expects skb->data at mac header.
*
* Returns 0 on success, -errno otherwise.
*/
-int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto)
+int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto,
+ int mac_len, bool ethernet)
{
struct mpls_shim_hdr *lse;
int err;
@@ -5468,21 +5602,22 @@ int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto)
return err;
if (!skb->inner_protocol) {
- skb_set_inner_network_header(skb, skb->mac_len);
+ skb_set_inner_network_header(skb, skb_network_offset(skb));
skb_set_inner_protocol(skb, skb->protocol);
}
skb_push(skb, MPLS_HLEN);
memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb),
- skb->mac_len);
+ mac_len);
skb_reset_mac_header(skb);
- skb_set_network_header(skb, skb->mac_len);
+ skb_set_network_header(skb, mac_len);
+ skb_reset_mac_len(skb);
lse = mpls_hdr(skb);
lse->label_stack_entry = mpls_lse;
skb_postpush_rcsum(skb, lse, MPLS_HLEN);
- if (skb->dev && skb->dev->type == ARPHRD_ETHER)
+ if (ethernet)
skb_mod_eth_type(skb, eth_hdr(skb), mpls_proto);
skb->protocol = mpls_proto;
@@ -5495,31 +5630,34 @@ EXPORT_SYMBOL_GPL(skb_mpls_push);
*
* @skb: buffer
* @next_proto: ethertype of header after popped MPLS header
+ * @mac_len: length of the MAC header
+ * @ethernet: flag to indicate if the packet is ethernet
*
* Expects skb->data at mac header.
*
* Returns 0 on success, -errno otherwise.
*/
-int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto)
+int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len,
+ bool ethernet)
{
int err;
if (unlikely(!eth_p_mpls(skb->protocol)))
- return -EINVAL;
+ return 0;
- err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN);
+ err = skb_ensure_writable(skb, mac_len + MPLS_HLEN);
if (unlikely(err))
return err;
skb_postpull_rcsum(skb, mpls_hdr(skb), MPLS_HLEN);
memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb),
- skb->mac_len);
+ mac_len);
__skb_pull(skb, MPLS_HLEN);
skb_reset_mac_header(skb);
- skb_set_network_header(skb, skb->mac_len);
+ skb_set_network_header(skb, mac_len);
- if (skb->dev && skb->dev->type == ARPHRD_ETHER) {
+ if (ethernet) {
struct ethhdr *hdr;
/* use mpls_hdr() to get ethertype to account for VLANs. */
@@ -5834,7 +5972,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
* where splitting is expensive.
* 2. Split is accurately. We make this.
*/
- shinfo->frags[0].page_offset += off - pos;
+ skb_frag_off_add(&shinfo->frags[0], off - pos);
skb_frag_size_sub(&shinfo->frags[0], off - pos);
}
skb_frag_ref(skb, i);
@@ -5940,7 +6078,14 @@ static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id)
return (void *)ext + (ext->offset[id] * SKB_EXT_ALIGN_VALUE);
}
-static struct skb_ext *skb_ext_alloc(void)
+/**
+ * __skb_ext_alloc - allocate a new skb extensions storage
+ *
+ * Returns the newly allocated pointer. The pointer can later attached to a
+ * skb via __skb_ext_set().
+ * Note: caller must handle the skb_ext as an opaque data.
+ */
+struct skb_ext *__skb_ext_alloc(void)
{
struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC);
@@ -5981,6 +6126,30 @@ static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old,
}
/**
+ * __skb_ext_set - attach the specified extension storage to this skb
+ * @skb: buffer
+ * @id: extension id
+ * @ext: extension storage previously allocated via __skb_ext_alloc()
+ *
+ * Existing extensions, if any, are cleared.
+ *
+ * Returns the pointer to the extension.
+ */
+void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id,
+ struct skb_ext *ext)
+{
+ unsigned int newlen, newoff = SKB_EXT_CHUNKSIZEOF(*ext);
+
+ skb_ext_put(skb);
+ newlen = newoff + skb_ext_type_len[id];
+ ext->chunks = newlen;
+ ext->offset[id] = newoff;
+ skb->extensions = ext;
+ skb->active_extensions = 1 << id;
+ return skb_ext_get_ptr(ext, id);
+}
+
+/**
* skb_ext_add - allocate space for given extension, COW if needed
* @skb: buffer
* @id: extension to allocate space for
@@ -6013,7 +6182,7 @@ void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id)
} else {
newoff = SKB_EXT_CHUNKSIZEOF(*new);
- new = skb_ext_alloc();
+ new = __skb_ext_alloc();
if (!new)
return NULL;
}
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 6832eeb4b785..ded2d5227678 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -190,8 +190,7 @@ static int __sk_msg_free(struct sock *sk, struct sk_msg *msg, u32 i,
sk_msg_check_to_free(msg, i, msg->sg.size);
sge = sk_msg_elem(msg, i);
}
- if (msg->skb)
- consume_skb(msg->skb);
+ consume_skb(msg->skb);
sk_msg_init(msg);
return freed;
}
@@ -271,18 +270,28 @@ void sk_msg_trim(struct sock *sk, struct sk_msg *msg, int len)
msg->sg.data[i].length -= trim;
sk_mem_uncharge(sk, trim);
+ /* Adjust copybreak if it falls into the trimmed part of last buf */
+ if (msg->sg.curr == i && msg->sg.copybreak > msg->sg.data[i].length)
+ msg->sg.copybreak = msg->sg.data[i].length;
out:
- /* If we trim data before curr pointer update copybreak and current
- * so that any future copy operations start at new copy location.
+ sk_msg_iter_var_next(i);
+ msg->sg.end = i;
+
+ /* If we trim data a full sg elem before curr pointer update
+ * copybreak and current so that any future copy operations
+ * start at new copy location.
* However trimed data that has not yet been used in a copy op
* does not require an update.
*/
- if (msg->sg.curr >= i) {
+ if (!msg->sg.size) {
+ msg->sg.curr = msg->sg.start;
+ msg->sg.copybreak = 0;
+ } else if (sk_msg_iter_dist(msg->sg.start, msg->sg.curr) >=
+ sk_msg_iter_dist(msg->sg.start, msg->sg.end)) {
+ sk_msg_iter_var_prev(i);
msg->sg.curr = i;
msg->sg.copybreak = msg->sg.data[i].length;
}
- sk_msg_iter_var_next(i);
- msg->sg.end = i;
}
EXPORT_SYMBOL_GPL(sk_msg_trim);
@@ -412,7 +421,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb)
copied = skb->len;
msg->sg.start = 0;
msg->sg.size = copied;
- msg->sg.end = num_sge == MAX_MSG_FRAGS ? 0 : num_sge;
+ msg->sg.end = num_sge;
msg->skb = skb;
sk_psock_queue_msg(psock, msg);
@@ -784,15 +793,18 @@ static void sk_psock_strp_data_ready(struct sock *sk)
static void sk_psock_write_space(struct sock *sk)
{
struct sk_psock *psock;
- void (*write_space)(struct sock *sk);
+ void (*write_space)(struct sock *sk) = NULL;
rcu_read_lock();
psock = sk_psock(sk);
- if (likely(psock && sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)))
- schedule_work(&psock->work);
- write_space = psock->saved_write_space;
+ if (likely(psock)) {
+ if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
+ schedule_work(&psock->work);
+ write_space = psock->saved_write_space;
+ }
rcu_read_unlock();
- write_space(sk);
+ if (write_space)
+ write_space(sk);
}
int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock)
diff --git a/net/core/sock.c b/net/core/sock.c
index 6d08553f885c..a4c8fac781ff 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -333,7 +333,6 @@ EXPORT_SYMBOL(__sk_backlog_rcv);
static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
{
struct __kernel_sock_timeval tv;
- int size;
if (timeo == MAX_SCHEDULE_TIMEOUT) {
tv.tv_sec = 0;
@@ -354,13 +353,11 @@ static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
old_tv.tv_sec = tv.tv_sec;
old_tv.tv_usec = tv.tv_usec;
*(struct __kernel_old_timeval *)optval = old_tv;
- size = sizeof(old_tv);
- } else {
- *(struct __kernel_sock_timeval *)optval = tv;
- size = sizeof(tv);
+ return sizeof(old_tv);
}
- return size;
+ *(struct __kernel_sock_timeval *)optval = tv;
+ return sizeof(tv);
}
static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen, bool old_timeval)
@@ -521,8 +518,8 @@ int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
rc = sk_backlog_rcv(sk, skb);
- mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
- } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
+ mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
+ } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
bh_unlock_sock(sk);
atomic_inc(&sk->sk_drops);
goto discard_and_relse;
@@ -687,7 +684,8 @@ out:
return ret;
}
-static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
+static inline void sock_valbool_flag(struct sock *sk, enum sock_flags bit,
+ int valbool)
{
if (valbool)
sock_set_flag(sk, bit);
@@ -785,7 +783,8 @@ set_sndbuf:
*/
val = min_t(int, val, INT_MAX / 2);
sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
- sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
+ WRITE_ONCE(sk->sk_sndbuf,
+ max_t(int, val * 2, SOCK_MIN_SNDBUF));
/* Wake up sending tasks if we upped the value. */
sk->sk_write_space(sk);
break;
@@ -831,7 +830,8 @@ set_rcvbuf:
* returning the value we actually used in getsockopt
* is the most desirable behavior.
*/
- sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
+ WRITE_ONCE(sk->sk_rcvbuf,
+ max_t(int, val * 2, SOCK_MIN_RCVBUF));
break;
case SO_RCVBUFFORCE:
@@ -974,7 +974,7 @@ set_rcvbuf:
if (sock->ops->set_rcvlowat)
ret = sock->ops->set_rcvlowat(sk, val);
else
- sk->sk_rcvlowat = val ? : 1;
+ WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
break;
case SO_RCVTIMEO_OLD:
@@ -1125,7 +1125,7 @@ set_rcvbuf:
break;
}
case SO_INCOMING_CPU:
- sk->sk_incoming_cpu = val;
+ WRITE_ONCE(sk->sk_incoming_cpu, val);
break;
case SO_CNX_ADVICE:
@@ -1474,7 +1474,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
break;
case SO_INCOMING_CPU:
- v.val = sk->sk_incoming_cpu;
+ v.val = READ_ONCE(sk->sk_incoming_cpu);
break;
case SO_MEMINFO:
@@ -1700,8 +1700,6 @@ static void __sk_destruct(struct rcu_head *head)
sk_filter_uncharge(sk, filter);
RCU_INIT_POINTER(sk->sk_filter, NULL);
}
- if (rcu_access_pointer(sk->sk_reuseport_cb))
- reuseport_detach_sock(sk);
sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
@@ -1728,7 +1726,14 @@ static void __sk_destruct(struct rcu_head *head)
void sk_destruct(struct sock *sk)
{
- if (sock_flag(sk, SOCK_RCU_FREE))
+ bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
+
+ if (rcu_access_pointer(sk->sk_reuseport_cb)) {
+ reuseport_detach_sock(sk);
+ use_call_rcu = true;
+ }
+
+ if (use_call_rcu)
call_rcu(&sk->sk_rcu, __sk_destruct);
else
__sk_destruct(&sk->sk_rcu);
@@ -1851,9 +1856,12 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
goto out;
}
RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
-#ifdef CONFIG_BPF_SYSCALL
- RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);
-#endif
+
+ if (bpf_sk_storage_clone(sk, newsk)) {
+ sk_free_unlock_clone(newsk);
+ newsk = NULL;
+ goto out;
+ }
newsk->sk_err = 0;
newsk->sk_err_soft = 0;
@@ -2080,8 +2088,10 @@ EXPORT_SYMBOL(sock_i_ino);
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
gfp_t priority)
{
- if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
+ if (force ||
+ refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
struct sk_buff *skb = alloc_skb(size, priority);
+
if (skb) {
skb_set_owner_w(skb, sk);
return skb;
@@ -2182,7 +2192,7 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo)
break;
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
- if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
+ if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
break;
if (sk->sk_shutdown & SEND_SHUTDOWN)
break;
@@ -2217,7 +2227,7 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
if (sk->sk_shutdown & SEND_SHUTDOWN)
goto failure;
- if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
+ if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
break;
sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
@@ -2326,8 +2336,8 @@ static void sk_leave_memory_pressure(struct sock *sk)
} else {
unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
- if (memory_pressure && *memory_pressure)
- *memory_pressure = 0;
+ if (memory_pressure && READ_ONCE(*memory_pressure))
+ WRITE_ONCE(*memory_pressure, 0);
}
}
@@ -2776,7 +2786,7 @@ static void sock_def_error_report(struct sock *sk)
rcu_read_unlock();
}
-static void sock_def_readable(struct sock *sk)
+void sock_def_readable(struct sock *sk)
{
struct socket_wq *wq;
@@ -2798,7 +2808,7 @@ static void sock_def_write_space(struct sock *sk)
/* Do not wake up a writer until he can make "significant"
* progress. --DaveM
*/
- if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
+ if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) {
wq = rcu_dereference(sk->sk_wq);
if (skwq_has_sleeper(wq))
wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
@@ -2906,7 +2916,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
sk->sk_max_pacing_rate = ~0UL;
sk->sk_pacing_rate = ~0UL;
- sk->sk_pacing_shift = 10;
+ WRITE_ONCE(sk->sk_pacing_shift, 10);
sk->sk_incoming_cpu = -1;
sk_rx_queue_clear(sk);
@@ -3003,7 +3013,7 @@ int sock_gettstamp(struct socket *sock, void __user *userstamp,
return -ENOENT;
if (ts.tv_sec == 0) {
ktime_t kt = ktime_get_real();
- sock_write_timestamp(sk, kt);;
+ sock_write_timestamp(sk, kt);
ts = ktime_to_timespec64(kt);
}
@@ -3030,7 +3040,7 @@ int sock_gettstamp(struct socket *sock, void __user *userstamp,
}
EXPORT_SYMBOL(sock_gettstamp);
-void sock_enable_timestamp(struct sock *sk, int flag)
+void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
{
if (!sock_flag(sk, flag)) {
unsigned long previous_flags = sk->sk_flags;
@@ -3196,13 +3206,13 @@ void sk_get_meminfo(const struct sock *sk, u32 *mem)
memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
- mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
+ mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
- mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
+ mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
- mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
+ mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
- mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
+ mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
}
@@ -3287,16 +3297,17 @@ static __init int net_inuse_init(void)
core_initcall(net_inuse_init);
-static void assign_proto_idx(struct proto *prot)
+static int assign_proto_idx(struct proto *prot)
{
prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
pr_err("PROTO_INUSE_NR exhausted\n");
- return;
+ return -ENOSPC;
}
set_bit(prot->inuse_idx, proto_inuse_idx);
+ return 0;
}
static void release_proto_idx(struct proto *prot)
@@ -3305,8 +3316,9 @@ static void release_proto_idx(struct proto *prot)
clear_bit(prot->inuse_idx, proto_inuse_idx);
}
#else
-static inline void assign_proto_idx(struct proto *prot)
+static inline int assign_proto_idx(struct proto *prot)
{
+ return 0;
}
static inline void release_proto_idx(struct proto *prot)
@@ -3355,6 +3367,8 @@ static int req_prot_init(const struct proto *prot)
int proto_register(struct proto *prot, int alloc_slab)
{
+ int ret = -ENOBUFS;
+
if (alloc_slab) {
prot->slab = kmem_cache_create_usercopy(prot->name,
prot->obj_size, 0,
@@ -3391,20 +3405,27 @@ int proto_register(struct proto *prot, int alloc_slab)
}
mutex_lock(&proto_list_mutex);
+ ret = assign_proto_idx(prot);
+ if (ret) {
+ mutex_unlock(&proto_list_mutex);
+ goto out_free_timewait_sock_slab_name;
+ }
list_add(&prot->node, &proto_list);
- assign_proto_idx(prot);
mutex_unlock(&proto_list_mutex);
- return 0;
+ return ret;
out_free_timewait_sock_slab_name:
- kfree(prot->twsk_prot->twsk_slab_name);
+ if (alloc_slab && prot->twsk_prot)
+ kfree(prot->twsk_prot->twsk_slab_name);
out_free_request_sock_slab:
- req_prot_cleanup(prot->rsk_prot);
+ if (alloc_slab) {
+ req_prot_cleanup(prot->rsk_prot);
- kmem_cache_destroy(prot->slab);
- prot->slab = NULL;
+ kmem_cache_destroy(prot->slab);
+ prot->slab = NULL;
+ }
out:
- return -ENOBUFS;
+ return ret;
}
EXPORT_SYMBOL(proto_register);
@@ -3478,7 +3499,7 @@ static long sock_prot_memory_allocated(struct proto *proto)
return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
}
-static char *sock_prot_memory_pressure(struct proto *proto)
+static const char *sock_prot_memory_pressure(struct proto *proto)
{
return proto->memory_pressure != NULL ?
proto_memory_pressure(proto) ? "yes" : "no" : "NI";
@@ -3577,7 +3598,7 @@ bool sk_busy_loop_end(void *p, unsigned long start_time)
{
struct sock *sk = p;
- return !skb_queue_empty(&sk->sk_receive_queue) ||
+ return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
sk_busy_loop_timeout(sk, start_time);
}
EXPORT_SYMBOL(sk_busy_loop_end);
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 1330a7442e5b..085cef5857bb 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -234,19 +234,23 @@ static void sock_map_free(struct bpf_map *map)
int i;
synchronize_rcu();
- rcu_read_lock();
raw_spin_lock_bh(&stab->lock);
for (i = 0; i < stab->map.max_entries; i++) {
struct sock **psk = &stab->sks[i];
struct sock *sk;
sk = xchg(psk, NULL);
- if (sk)
+ if (sk) {
+ lock_sock(sk);
+ rcu_read_lock();
sock_map_unref(sk, psk);
+ rcu_read_unlock();
+ release_sock(sk);
+ }
}
raw_spin_unlock_bh(&stab->lock);
- rcu_read_unlock();
+ /* wait for psock readers accessing its map link */
synchronize_rcu();
bpf_map_area_free(stab->sks);
@@ -345,7 +349,7 @@ static int sock_map_update_common(struct bpf_map *map, u32 idx,
return -EINVAL;
if (unlikely(idx >= map->max_entries))
return -E2BIG;
- if (unlikely(icsk->icsk_ulp_data))
+ if (unlikely(rcu_access_pointer(icsk->icsk_ulp_data)))
return -EINVAL;
link = sk_psock_init_link();
@@ -413,14 +417,16 @@ static int sock_map_update_elem(struct bpf_map *map, void *key,
ret = -EINVAL;
goto out;
}
- if (!sock_map_sk_is_suitable(sk) ||
- sk->sk_state != TCP_ESTABLISHED) {
+ if (!sock_map_sk_is_suitable(sk)) {
ret = -EOPNOTSUPP;
goto out;
}
sock_map_sk_acquire(sk);
- ret = sock_map_update_common(map, idx, sk, flags);
+ if (sk->sk_state != TCP_ESTABLISHED)
+ ret = -EOPNOTSUPP;
+ else
+ ret = sock_map_update_common(map, idx, sk, flags);
sock_map_sk_release(sk);
out:
fput(sock->file);
@@ -656,6 +662,7 @@ static int sock_hash_update_common(struct bpf_map *map, void *key,
struct sock *sk, u64 flags)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct inet_connection_sock *icsk = inet_csk(sk);
u32 key_size = map->key_size, hash;
struct bpf_htab_elem *elem, *elem_new;
struct bpf_htab_bucket *bucket;
@@ -666,6 +673,8 @@ static int sock_hash_update_common(struct bpf_map *map, void *key,
WARN_ON_ONCE(!rcu_read_lock_held());
if (unlikely(flags > BPF_EXIST))
return -EINVAL;
+ if (unlikely(icsk->icsk_ulp_data))
+ return -EINVAL;
link = sk_psock_init_link();
if (!link)
@@ -733,14 +742,16 @@ static int sock_hash_update_elem(struct bpf_map *map, void *key,
ret = -EINVAL;
goto out;
}
- if (!sock_map_sk_is_suitable(sk) ||
- sk->sk_state != TCP_ESTABLISHED) {
+ if (!sock_map_sk_is_suitable(sk)) {
ret = -EOPNOTSUPP;
goto out;
}
sock_map_sk_acquire(sk);
- ret = sock_hash_update_common(map, key, sk, flags);
+ if (sk->sk_state != TCP_ESTABLISHED)
+ ret = -EOPNOTSUPP;
+ else
+ ret = sock_hash_update_common(map, key, sk, flags);
sock_map_sk_release(sk);
out:
fput(sock->file);
@@ -853,17 +864,22 @@ static void sock_hash_free(struct bpf_map *map)
int i;
synchronize_rcu();
- rcu_read_lock();
for (i = 0; i < htab->buckets_num; i++) {
bucket = sock_hash_select_bucket(htab, i);
raw_spin_lock_bh(&bucket->lock);
hlist_for_each_entry_safe(elem, node, &bucket->head, node) {
hlist_del_rcu(&elem->node);
+ lock_sock(elem->sk);
+ rcu_read_lock();
sock_map_unref(elem->sk, elem);
+ rcu_read_unlock();
+ release_sock(elem->sk);
}
raw_spin_unlock_bh(&bucket->lock);
}
- rcu_read_unlock();
+
+ /* wait for psock readers accessing its map link */
+ synchronize_rcu();
bpf_map_area_free(htab->buckets);
kfree(htab);
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index 9408f9264d05..91e9f2223c39 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -107,7 +107,6 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse)
if (!more_reuse)
return NULL;
- more_reuse->max_socks = more_socks_size;
more_reuse->num_socks = reuse->num_socks;
more_reuse->prog = reuse->prog;
more_reuse->reuseport_id = reuse->reuseport_id;
@@ -295,8 +294,19 @@ struct sock *reuseport_select_sock(struct sock *sk,
select_by_hash:
/* no bpf or invalid bpf result: fall back to hash usage */
- if (!sk2)
- sk2 = reuse->socks[reciprocal_scale(hash, socks)];
+ if (!sk2) {
+ int i, j;
+
+ i = j = reciprocal_scale(hash, socks);
+ while (reuse->socks[i]->sk_state == TCP_ESTABLISHED) {
+ i++;
+ if (i >= reuse->num_socks)
+ i = 0;
+ if (i == j)
+ goto out;
+ }
+ sk2 = reuse->socks[i];
+ }
}
out:
@@ -345,8 +355,8 @@ int reuseport_detach_prog(struct sock *sk)
spin_lock_bh(&reuseport_lock);
reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
lockdep_is_held(&reuseport_lock));
- rcu_swap_protected(reuse->prog, old_prog,
- lockdep_is_held(&reuseport_lock));
+ old_prog = rcu_replace_pointer(reuse->prog, old_prog,
+ lockdep_is_held(&reuseport_lock));
spin_unlock_bh(&reuseport_lock);
if (!old_prog)
diff --git a/net/core/stream.c b/net/core/stream.c
index e94bb02a5629..4f1d4aa5fb38 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -120,7 +120,6 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
int err = 0;
long vm_wait = 0;
long current_timeo = *timeo_p;
- bool noblock = (*timeo_p ? false : true);
DEFINE_WAIT_FUNC(wait, woken_wake_function);
if (sk_stream_memory_free(sk))
@@ -133,11 +132,8 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
goto do_error;
- if (!*timeo_p) {
- if (noblock)
- set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
- goto do_nonblock;
- }
+ if (!*timeo_p)
+ goto do_eagain;
if (signal_pending(current))
goto do_interrupted;
sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
@@ -169,7 +165,13 @@ out:
do_error:
err = -EPIPE;
goto out;
-do_nonblock:
+do_eagain:
+ /* Make sure that whenever EAGAIN is returned, EPOLLOUT event can
+ * be generated later.
+ * When TCP receives ACK packets that make room, tcp_check_space()
+ * only calls tcp_new_space() if SOCK_NOSPACE is set.
+ */
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
err = -EAGAIN;
goto out;
do_interrupted:
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 8da5b3a54dac..9f9e00ba3ad7 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -288,6 +288,7 @@ static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write,
return ret;
}
+# ifdef CONFIG_HAVE_EBPF_JIT
static int
proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
@@ -298,6 +299,7 @@ proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write,
return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
}
+# endif /* CONFIG_HAVE_EBPF_JIT */
static int
proc_dolongvec_minmax_bpf_restricted(struct ctl_table *table, int write,
@@ -567,6 +569,14 @@ static struct ctl_table net_core_table[] = {
.mode = 0644,
.proc_handler = proc_do_static_key,
},
+ {
+ .procname = "gro_normal_batch",
+ .data = &gro_normal_batch,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE,
+ },
{ }
};
diff --git a/net/core/timestamping.c b/net/core/timestamping.c
index 7911235706a9..04840697fe79 100644
--- a/net/core/timestamping.c
+++ b/net/core/timestamping.c
@@ -13,7 +13,7 @@
static unsigned int classify(const struct sk_buff *skb)
{
if (likely(skb->dev && skb->dev->phydev &&
- skb->dev->phydev->drv))
+ skb->dev->phydev->mii_ts))
return ptp_classify_raw(skb);
else
return PTP_CLASS_NONE;
@@ -21,7 +21,7 @@ static unsigned int classify(const struct sk_buff *skb)
void skb_clone_tx_timestamp(struct sk_buff *skb)
{
- struct phy_device *phydev;
+ struct mii_timestamper *mii_ts;
struct sk_buff *clone;
unsigned int type;
@@ -32,22 +32,22 @@ void skb_clone_tx_timestamp(struct sk_buff *skb)
if (type == PTP_CLASS_NONE)
return;
- phydev = skb->dev->phydev;
- if (likely(phydev->drv->txtstamp)) {
+ mii_ts = skb->dev->phydev->mii_ts;
+ if (likely(mii_ts->txtstamp)) {
clone = skb_clone_sk(skb);
if (!clone)
return;
- phydev->drv->txtstamp(phydev, clone, type);
+ mii_ts->txtstamp(mii_ts, clone, type);
}
}
EXPORT_SYMBOL_GPL(skb_clone_tx_timestamp);
bool skb_defer_rx_timestamp(struct sk_buff *skb)
{
- struct phy_device *phydev;
+ struct mii_timestamper *mii_ts;
unsigned int type;
- if (!skb->dev || !skb->dev->phydev || !skb->dev->phydev->drv)
+ if (!skb->dev || !skb->dev->phydev || !skb->dev->phydev->mii_ts)
return false;
if (skb_headroom(skb) < ETH_HLEN)
@@ -62,9 +62,9 @@ bool skb_defer_rx_timestamp(struct sk_buff *skb)
if (type == PTP_CLASS_NONE)
return false;
- phydev = skb->dev->phydev;
- if (likely(phydev->drv->rxtstamp))
- return phydev->drv->rxtstamp(phydev, skb, type);
+ mii_ts = skb->dev->phydev->mii_ts;
+ if (likely(mii_ts->rxtstamp))
+ return mii_ts->rxtstamp(mii_ts, skb, type);
return false;
}
diff --git a/net/core/tso.c b/net/core/tso.c
index 43f4eba61933..d4d5c077ad72 100644
--- a/net/core/tso.c
+++ b/net/core/tso.c
@@ -55,8 +55,8 @@ void tso_build_data(struct sk_buff *skb, struct tso_t *tso, int size)
skb_frag_t *frag = &skb_shinfo(skb)->frags[tso->next_frag_idx];
/* Move to next segment */
- tso->size = frag->size;
- tso->data = page_address(frag->page.p) + frag->page_offset;
+ tso->size = skb_frag_size(frag);
+ tso->data = skb_frag_address(frag);
tso->next_frag_idx++;
}
}
@@ -79,8 +79,8 @@ void tso_start(struct sk_buff *skb, struct tso_t *tso)
skb_frag_t *frag = &skb_shinfo(skb)->frags[tso->next_frag_idx];
/* Move to next segment */
- tso->size = frag->size;
- tso->data = page_address(frag->page.p) + frag->page_offset;
+ tso->size = skb_frag_size(frag);
+ tso->data = skb_frag_address(frag);
tso->next_frag_idx++;
}
}
diff --git a/net/core/utils.c b/net/core/utils.c
index 6b6e51db9f3b..1f31a39236d5 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -438,6 +438,23 @@ void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb,
}
EXPORT_SYMBOL(inet_proto_csum_replace4);
+/**
+ * inet_proto_csum_replace16 - update layer 4 header checksum field
+ * @sum: Layer 4 header checksum field
+ * @skb: sk_buff for the packet
+ * @from: old IPv6 address
+ * @to: new IPv6 address
+ * @pseudohdr: True if layer 4 header checksum includes pseudoheader
+ *
+ * Update layer 4 header as per the update in IPv6 src/dst address.
+ *
+ * There is no need to update skb->csum in this function, because update in two
+ * fields a.) IPv6 src/dst address and b.) L4 header checksum cancels each other
+ * for skb->csum calculation. Whereas inet_proto_csum_replace4 function needs to
+ * update skb->csum, because update in 3 fields a.) IPv4 src/dst address,
+ * b.) IPv4 Header checksum and c.) L4 header checksum results in same diff as
+ * L4 Header checksum for skb->csum calculation.
+ */
void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb,
const __be32 *from, const __be32 *to,
bool pseudohdr)
@@ -449,9 +466,6 @@ void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb,
if (skb->ip_summed != CHECKSUM_PARTIAL) {
*sum = csum_fold(csum_partial(diff, sizeof(diff),
~csum_unfold(*sum)));
- if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr)
- skb->csum = ~csum_partial(diff, sizeof(diff),
- ~skb->csum);
} else if (pseudohdr)
*sum = ~csum_fold(csum_partial(diff, sizeof(diff),
csum_unfold(*sum)));
diff --git a/net/core/xdp.c b/net/core/xdp.c
index d7bf62ffbb5e..8310714c47fd 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -36,7 +36,7 @@ static u32 xdp_mem_id_hashfn(const void *data, u32 len, u32 seed)
const u32 *k = data;
const u32 key = *k;
- BUILD_BUG_ON(FIELD_SIZEOF(struct xdp_mem_allocator, mem.id)
+ BUILD_BUG_ON(sizeof_field(struct xdp_mem_allocator, mem.id)
!= sizeof(u32));
/* Use cyclic increasing ID as direct hash key */
@@ -56,7 +56,7 @@ static const struct rhashtable_params mem_id_rht_params = {
.nelem_hint = 64,
.head_offset = offsetof(struct xdp_mem_allocator, node),
.key_offset = offsetof(struct xdp_mem_allocator, mem.id),
- .key_len = FIELD_SIZEOF(struct xdp_mem_allocator, mem.id),
+ .key_len = sizeof_field(struct xdp_mem_allocator, mem.id),
.max_size = MEM_ID_MAX,
.min_size = 8,
.automatic_shrinking = true,
@@ -70,77 +70,63 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu)
xa = container_of(rcu, struct xdp_mem_allocator, rcu);
- /* Allocator have indicated safe to remove before this is called */
- if (xa->mem.type == MEM_TYPE_PAGE_POOL)
- page_pool_free(xa->page_pool);
-
/* Allow this ID to be reused */
ida_simple_remove(&mem_id_pool, xa->mem.id);
- /* Poison memory */
- xa->mem.id = 0xFFFF;
- xa->mem.type = 0xF0F0;
- xa->allocator = (void *)0xDEAD9001;
-
kfree(xa);
}
-static bool __mem_id_disconnect(int id, bool force)
+static void mem_xa_remove(struct xdp_mem_allocator *xa)
+{
+ trace_mem_disconnect(xa);
+
+ if (!rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params))
+ call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free);
+}
+
+static void mem_allocator_disconnect(void *allocator)
{
struct xdp_mem_allocator *xa;
- bool safe_to_remove = true;
+ struct rhashtable_iter iter;
mutex_lock(&mem_id_lock);
- xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params);
- if (!xa) {
- mutex_unlock(&mem_id_lock);
- WARN(1, "Request remove non-existing id(%d), driver bug?", id);
- return true;
- }
- xa->disconnect_cnt++;
+ rhashtable_walk_enter(mem_id_ht, &iter);
+ do {
+ rhashtable_walk_start(&iter);
- /* Detects in-flight packet-pages for page_pool */
- if (xa->mem.type == MEM_TYPE_PAGE_POOL)
- safe_to_remove = page_pool_request_shutdown(xa->page_pool);
+ while ((xa = rhashtable_walk_next(&iter)) && !IS_ERR(xa)) {
+ if (xa->allocator == allocator)
+ mem_xa_remove(xa);
+ }
- trace_mem_disconnect(xa, safe_to_remove, force);
+ rhashtable_walk_stop(&iter);
- if ((safe_to_remove || force) &&
- !rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params))
- call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free);
+ } while (xa == ERR_PTR(-EAGAIN));
+ rhashtable_walk_exit(&iter);
mutex_unlock(&mem_id_lock);
- return (safe_to_remove|force);
}
-#define DEFER_TIME (msecs_to_jiffies(1000))
-#define DEFER_WARN_INTERVAL (30 * HZ)
-#define DEFER_MAX_RETRIES 120
-
-static void mem_id_disconnect_defer_retry(struct work_struct *wq)
+static void mem_id_disconnect(int id)
{
- struct delayed_work *dwq = to_delayed_work(wq);
- struct xdp_mem_allocator *xa = container_of(dwq, typeof(*xa), defer_wq);
- bool force = false;
+ struct xdp_mem_allocator *xa;
- if (xa->disconnect_cnt > DEFER_MAX_RETRIES)
- force = true;
+ mutex_lock(&mem_id_lock);
- if (__mem_id_disconnect(xa->mem.id, force))
+ xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params);
+ if (!xa) {
+ mutex_unlock(&mem_id_lock);
+ WARN(1, "Request remove non-existing id(%d), driver bug?", id);
return;
+ }
- /* Periodic warning */
- if (time_after_eq(jiffies, xa->defer_warn)) {
- int sec = (s32)((u32)jiffies - (u32)xa->defer_start) / HZ;
+ trace_mem_disconnect(xa);
- pr_warn("%s() stalled mem.id=%u shutdown %d attempts %d sec\n",
- __func__, xa->mem.id, xa->disconnect_cnt, sec);
- xa->defer_warn = jiffies + DEFER_WARN_INTERVAL;
- }
+ if (!rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params))
+ call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free);
- /* Still not ready to be disconnected, retry later */
- schedule_delayed_work(&xa->defer_wq, DEFER_TIME);
+ mutex_unlock(&mem_id_lock);
}
void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
@@ -153,38 +139,21 @@ void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
return;
}
- if (xdp_rxq->mem.type != MEM_TYPE_PAGE_POOL &&
- xdp_rxq->mem.type != MEM_TYPE_ZERO_COPY) {
- return;
- }
-
if (id == 0)
return;
- if (__mem_id_disconnect(id, false))
- return;
-
- /* Could not disconnect, defer new disconnect attempt to later */
- mutex_lock(&mem_id_lock);
+ if (xdp_rxq->mem.type == MEM_TYPE_ZERO_COPY)
+ return mem_id_disconnect(id);
- xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params);
- if (!xa) {
- mutex_unlock(&mem_id_lock);
- return;
+ if (xdp_rxq->mem.type == MEM_TYPE_PAGE_POOL) {
+ rcu_read_lock();
+ xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params);
+ page_pool_destroy(xa->page_pool);
+ rcu_read_unlock();
}
- xa->defer_start = jiffies;
- xa->defer_warn = jiffies + DEFER_WARN_INTERVAL;
-
- INIT_DELAYED_WORK(&xa->defer_wq, mem_id_disconnect_defer_retry);
- mutex_unlock(&mem_id_lock);
- schedule_delayed_work(&xa->defer_wq, DEFER_TIME);
}
EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg_mem_model);
-/* This unregister operation will also cleanup and destroy the
- * allocator. The page_pool_free() operation is first called when it's
- * safe to remove, possibly deferred to a workqueue.
- */
void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq)
{
/* Simplify driver cleanup code paths, allow unreg "unused" */
@@ -371,7 +340,7 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
}
if (type == MEM_TYPE_PAGE_POOL)
- page_pool_get(xdp_alloc->page_pool);
+ page_pool_use_xdp_mem(allocator, mem_allocator_disconnect);
mutex_unlock(&mem_id_lock);
@@ -386,7 +355,7 @@ EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model);
/* XDP RX runs under NAPI protection, and in different delivery error
* scenarios (e.g. queue full), it is possible to return the xdp_frame
- * while still leveraging this protection. The @napi_direct boolian
+ * while still leveraging this protection. The @napi_direct boolean
* is used for those calls sites. Thus, allowing for faster recycling
* of xdp_frames/pages in those cases.
*/
@@ -402,15 +371,8 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
/* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */
xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
page = virt_to_head_page(data);
- if (likely(xa)) {
- napi_direct &= !xdp_return_frame_no_direct();
- page_pool_put_page(xa->page_pool, page, napi_direct);
- } else {
- /* Hopefully stack show who to blame for late return */
- WARN_ONCE(1, "page_pool gone mem.id=%d", mem->id);
- trace_mem_return_failed(mem, page);
- put_page(page);
- }
+ napi_direct &= !xdp_return_frame_no_direct();
+ page_pool_put_page(xa->page_pool, page, napi_direct);
rcu_read_unlock();
break;
case MEM_TYPE_PAGE_SHARED:
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index b685bc82f8d0..d19557c6d04b 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -117,7 +117,7 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
inet->inet_daddr,
inet->inet_sport,
inet->inet_dport);
- inet->inet_id = dp->dccps_iss ^ jiffies;
+ inet->inet_id = prandom_u32();
err = dccp_connect(sk);
rt = NULL;
@@ -416,7 +416,7 @@ struct sock *dccp_v4_request_recv_sock(const struct sock *sk,
RCU_INIT_POINTER(newinet->inet_opt, rcu_dereference(ireq->ireq_opt));
newinet->mc_index = inet_iif(skb);
newinet->mc_ttl = ip_hdr(skb)->ttl;
- newinet->inet_id = jiffies;
+ newinet->inet_id = prandom_u32();
if (dst == NULL && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL)
goto put_and_exit;
@@ -871,7 +871,7 @@ lookup:
if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
goto discard_and_relse;
- nf_reset(skb);
+ nf_reset_ct(skb);
return __sk_receive_skb(sk, skb, 1, dh->dccph_doff * 4, refcounted);
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 1b7381ff787b..1e5e08cc0bfc 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -210,7 +210,7 @@ static int dccp_v6_send_response(const struct sock *sk, struct request_sock *req
final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), &final);
rcu_read_unlock();
- dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
+ dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p);
if (IS_ERR(dst)) {
err = PTR_ERR(dst);
dst = NULL;
@@ -230,7 +230,8 @@ static int dccp_v6_send_response(const struct sock *sk, struct request_sock *req
opt = ireq->ipv6_opt;
if (!opt)
opt = rcu_dereference(np->opt);
- err = ip6_xmit(sk, skb, &fl6, sk->sk_mark, opt, np->tclass);
+ err = ip6_xmit(sk, skb, &fl6, sk->sk_mark, opt, np->tclass,
+ sk->sk_priority);
rcu_read_unlock();
err = net_xmit_eval(err);
}
@@ -281,10 +282,10 @@ static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb)
security_skb_classify_flow(rxskb, flowi6_to_flowi(&fl6));
/* sk = NULL, but it is safe for now. RST socket required. */
- dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL);
+ dst = ip6_dst_lookup_flow(sock_net(ctl_sk), ctl_sk, &fl6, NULL);
if (!IS_ERR(dst)) {
skb_dst_set(skb, dst);
- ip6_xmit(ctl_sk, skb, &fl6, 0, NULL, 0);
+ ip6_xmit(ctl_sk, skb, &fl6, 0, NULL, 0, 0);
DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
DCCP_INC_STATS(DCCP_MIB_OUTRSTS);
return;
@@ -911,7 +912,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk));
final_p = fl6_update_dst(&fl6, opt, &final);
- dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
+ dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p);
if (IS_ERR(dst)) {
err = PTR_ERR(dst);
goto failure;
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 5bad08dc4316..4af8a98fe784 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -944,7 +944,7 @@ int inet_dccp_listen(struct socket *sock, int backlog)
if (!((1 << old_state) & (DCCPF_CLOSED | DCCPF_LISTEN)))
goto out;
- sk->sk_max_ack_backlog = backlog;
+ WRITE_ONCE(sk->sk_max_ack_backlog, backlog);
/* Really, if the socket is already in listen state
* we can only allow the backlog to be adjusted.
*/
@@ -1132,7 +1132,7 @@ static int __init dccp_init(void)
int rc;
BUILD_BUG_ON(sizeof(struct dccp_skb_cb) >
- FIELD_SIZEOF(struct sk_buff, cb));
+ sizeof_field(struct sk_buff, cb));
rc = percpu_counter_init(&dccp_orphan_count, 0, GFP_KERNEL);
if (rc)
goto out_fail;
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index 0ea75286abf4..0a46ea3bddd5 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -670,7 +670,7 @@ static int dn_create(struct net *net, struct socket *sock, int protocol,
{
struct sock *sk;
- if (protocol < 0 || protocol > SK_PROTOCOL_MAX)
+ if (protocol < 0 || protocol > U8_MAX)
return -EINVAL;
if (!net_eq(net, &init_net))
@@ -1091,7 +1091,7 @@ static int dn_accept(struct socket *sock, struct socket *newsock, int flags,
}
cb = DN_SKB_CB(skb);
- sk->sk_ack_backlog--;
+ sk_acceptq_removed(sk);
newsk = dn_alloc_sock(sock_net(sk), newsock, sk->sk_allocation, kern);
if (newsk == NULL) {
release_sock(sk);
@@ -1205,7 +1205,7 @@ static __poll_t dn_poll(struct file *file, struct socket *sock, poll_table *wai
struct dn_scp *scp = DN_SK(sk);
__poll_t mask = datagram_poll(file, sock, wait);
- if (!skb_queue_empty(&scp->other_receive_queue))
+ if (!skb_queue_empty_lockless(&scp->other_receive_queue))
mask |= EPOLLRDBAND;
return mask;
diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c
index e4161e0c86aa..c68503a18025 100644
--- a/net/decnet/dn_nsp_in.c
+++ b/net/decnet/dn_nsp_in.c
@@ -328,7 +328,7 @@ static void dn_nsp_conn_init(struct sock *sk, struct sk_buff *skb)
return;
}
- sk->sk_ack_backlog++;
+ sk_acceptq_added(sk);
skb_queue_tail(&sk->sk_receive_queue, skb);
sk->sk_state_change(sk);
}
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index aea918135ec3..08c3dc45f1a4 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -110,7 +110,8 @@ static void dn_dst_ifdown(struct dst_entry *, struct net_device *dev, int how);
static struct dst_entry *dn_dst_negative_advice(struct dst_entry *);
static void dn_dst_link_failure(struct sk_buff *);
static void dn_dst_update_pmtu(struct dst_entry *dst, struct sock *sk,
- struct sk_buff *skb , u32 mtu);
+ struct sk_buff *skb , u32 mtu,
+ bool confirm_neigh);
static void dn_dst_redirect(struct dst_entry *dst, struct sock *sk,
struct sk_buff *skb);
static struct neighbour *dn_dst_neigh_lookup(const struct dst_entry *dst,
@@ -251,7 +252,8 @@ static int dn_dst_gc(struct dst_ops *ops)
* advertise to the other end).
*/
static void dn_dst_update_pmtu(struct dst_entry *dst, struct sock *sk,
- struct sk_buff *skb, u32 mtu)
+ struct sk_buff *skb, u32 mtu,
+ bool confirm_neigh)
{
struct dn_route *rt = (struct dn_route *) dst;
struct neighbour *n = rt->n;
diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index 6e942dda1bcd..92663dcb3aa2 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -20,7 +20,7 @@ if NET_DSA
# tagging formats
config NET_DSA_TAG_8021Q
- tristate "Tag driver for switches using custom 802.1Q VLAN headers"
+ tristate
select VLAN_8021Q
help
Unlike the other tagging protocols, the 802.1Q config option simply
@@ -29,6 +29,12 @@ config NET_DSA_TAG_8021Q
Drivers which use these helpers should select this as dependency.
+config NET_DSA_TAG_AR9331
+ tristate "Tag driver for Atheros AR9331 SoC with built-in switch"
+ help
+ Say Y or M if you want to enable support for tagging frames for
+ the Atheros AR9331 SoC with built-in switch.
+
config NET_DSA_TAG_BRCM_COMMON
tristate
default n
@@ -73,23 +79,18 @@ config NET_DSA_TAG_MTK
Say Y or M if you want to enable support for tagging frames for
Mediatek switches.
-config NET_DSA_TAG_KSZ_COMMON
- tristate
- default n
-
config NET_DSA_TAG_KSZ
- tristate "Tag driver for Microchip 9893 family of switches"
- select NET_DSA_TAG_KSZ_COMMON
+ tristate "Tag driver for Microchip 8795/9477/9893 families of switches"
help
Say Y if you want to enable support for tagging frames for the
- Microchip 9893 family of switches.
+ Microchip 8795/9477/9893 families of switches.
-config NET_DSA_TAG_KSZ9477
- tristate "Tag driver for Microchip 9477 family of switches"
- select NET_DSA_TAG_KSZ_COMMON
+config NET_DSA_TAG_OCELOT
+ tristate "Tag driver for Ocelot family of switches"
+ select PACKING
help
- Say Y if you want to enable support for tagging frames for the
- Microchip 9477 family of switches.
+ Say Y or M if you want to enable support for tagging frames for the
+ Ocelot switches (VSC7511, VSC7512, VSC7513, VSC7514, VSC9959).
config NET_DSA_TAG_QCA
tristate "Tag driver for Qualcomm Atheros QCA8K switches"
diff --git a/net/dsa/Makefile b/net/dsa/Makefile
index c342f54715ba..108486cfdeef 100644
--- a/net/dsa/Makefile
+++ b/net/dsa/Makefile
@@ -5,13 +5,15 @@ dsa_core-y += dsa.o dsa2.o master.o port.o slave.o switch.o
# tagging formats
obj-$(CONFIG_NET_DSA_TAG_8021Q) += tag_8021q.o
+obj-$(CONFIG_NET_DSA_TAG_AR9331) += tag_ar9331.o
obj-$(CONFIG_NET_DSA_TAG_BRCM_COMMON) += tag_brcm.o
obj-$(CONFIG_NET_DSA_TAG_DSA) += tag_dsa.o
obj-$(CONFIG_NET_DSA_TAG_EDSA) += tag_edsa.o
obj-$(CONFIG_NET_DSA_TAG_GSWIP) += tag_gswip.o
-obj-$(CONFIG_NET_DSA_TAG_KSZ_COMMON) += tag_ksz.o
+obj-$(CONFIG_NET_DSA_TAG_KSZ) += tag_ksz.o
obj-$(CONFIG_NET_DSA_TAG_LAN9303) += tag_lan9303.o
obj-$(CONFIG_NET_DSA_TAG_MTK) += tag_mtk.o
+obj-$(CONFIG_NET_DSA_TAG_OCELOT) += tag_ocelot.o
obj-$(CONFIG_NET_DSA_TAG_QCA) += tag_qca.o
obj-$(CONFIG_NET_DSA_TAG_SJA1105) += tag_sja1105.o
obj-$(CONFIG_NET_DSA_TAG_TRAILER) += tag_trailer.o
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 43120a3fb06f..17281fec710c 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -246,7 +246,9 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev,
#ifdef CONFIG_PM_SLEEP
static bool dsa_is_port_initialized(struct dsa_switch *ds, int p)
{
- return dsa_is_user_port(ds, p) && ds->ports[p].slave;
+ const struct dsa_port *dp = dsa_to_port(ds, p);
+
+ return dp->type == DSA_PORT_TYPE_USER && dp->slave;
}
int dsa_switch_suspend(struct dsa_switch *ds)
@@ -258,7 +260,7 @@ int dsa_switch_suspend(struct dsa_switch *ds)
if (!dsa_is_port_initialized(ds, i))
continue;
- ret = dsa_slave_suspend(ds->ports[i].slave);
+ ret = dsa_slave_suspend(dsa_to_port(ds, i)->slave);
if (ret)
return ret;
}
@@ -285,7 +287,7 @@ int dsa_switch_resume(struct dsa_switch *ds)
if (!dsa_is_port_initialized(ds, i))
continue;
- ret = dsa_slave_resume(ds->ports[i].slave);
+ ret = dsa_slave_resume(dsa_to_port(ds, i)->slave);
if (ret)
return ret;
}
@@ -329,6 +331,91 @@ int call_dsa_notifiers(unsigned long val, struct net_device *dev,
}
EXPORT_SYMBOL_GPL(call_dsa_notifiers);
+int dsa_devlink_param_get(struct devlink *dl, u32 id,
+ struct devlink_param_gset_ctx *ctx)
+{
+ struct dsa_devlink_priv *dl_priv;
+ struct dsa_switch *ds;
+
+ dl_priv = devlink_priv(dl);
+ ds = dl_priv->ds;
+
+ if (!ds->ops->devlink_param_get)
+ return -EOPNOTSUPP;
+
+ return ds->ops->devlink_param_get(ds, id, ctx);
+}
+EXPORT_SYMBOL_GPL(dsa_devlink_param_get);
+
+int dsa_devlink_param_set(struct devlink *dl, u32 id,
+ struct devlink_param_gset_ctx *ctx)
+{
+ struct dsa_devlink_priv *dl_priv;
+ struct dsa_switch *ds;
+
+ dl_priv = devlink_priv(dl);
+ ds = dl_priv->ds;
+
+ if (!ds->ops->devlink_param_set)
+ return -EOPNOTSUPP;
+
+ return ds->ops->devlink_param_set(ds, id, ctx);
+}
+EXPORT_SYMBOL_GPL(dsa_devlink_param_set);
+
+int dsa_devlink_params_register(struct dsa_switch *ds,
+ const struct devlink_param *params,
+ size_t params_count)
+{
+ return devlink_params_register(ds->devlink, params, params_count);
+}
+EXPORT_SYMBOL_GPL(dsa_devlink_params_register);
+
+void dsa_devlink_params_unregister(struct dsa_switch *ds,
+ const struct devlink_param *params,
+ size_t params_count)
+{
+ devlink_params_unregister(ds->devlink, params, params_count);
+}
+EXPORT_SYMBOL_GPL(dsa_devlink_params_unregister);
+
+int dsa_devlink_resource_register(struct dsa_switch *ds,
+ const char *resource_name,
+ u64 resource_size,
+ u64 resource_id,
+ u64 parent_resource_id,
+ const struct devlink_resource_size_params *size_params)
+{
+ return devlink_resource_register(ds->devlink, resource_name,
+ resource_size, resource_id,
+ parent_resource_id,
+ size_params);
+}
+EXPORT_SYMBOL_GPL(dsa_devlink_resource_register);
+
+void dsa_devlink_resources_unregister(struct dsa_switch *ds)
+{
+ devlink_resources_unregister(ds->devlink, NULL);
+}
+EXPORT_SYMBOL_GPL(dsa_devlink_resources_unregister);
+
+void dsa_devlink_resource_occ_get_register(struct dsa_switch *ds,
+ u64 resource_id,
+ devlink_resource_occ_get_t *occ_get,
+ void *occ_get_priv)
+{
+ return devlink_resource_occ_get_register(ds->devlink, resource_id,
+ occ_get, occ_get_priv);
+}
+EXPORT_SYMBOL_GPL(dsa_devlink_resource_occ_get_register);
+
+void dsa_devlink_resource_occ_get_unregister(struct dsa_switch *ds,
+ u64 resource_id)
+{
+ devlink_resource_occ_get_unregister(ds->devlink, resource_id);
+}
+EXPORT_SYMBOL_GPL(dsa_devlink_resource_occ_get_unregister);
+
static int __init dsa_init_module(void)
{
int rc;
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index 3abd173ebacb..e7c30b472034 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -45,8 +45,12 @@ static struct dsa_switch_tree *dsa_tree_alloc(int index)
dst->index = index;
+ INIT_LIST_HEAD(&dst->rtable);
+
+ INIT_LIST_HEAD(&dst->ports);
+
INIT_LIST_HEAD(&dst->list);
- list_add_tail(&dsa_tree_list, &dst->list);
+ list_add_tail(&dst->list, &dsa_tree_list);
kref_init(&dst->refcount);
@@ -111,24 +115,39 @@ static bool dsa_port_is_user(struct dsa_port *dp)
static struct dsa_port *dsa_tree_find_port_by_node(struct dsa_switch_tree *dst,
struct device_node *dn)
{
- struct dsa_switch *ds;
struct dsa_port *dp;
- int device, port;
- for (device = 0; device < DSA_MAX_SWITCHES; device++) {
- ds = dst->ds[device];
- if (!ds)
- continue;
+ list_for_each_entry(dp, &dst->ports, list)
+ if (dp->dn == dn)
+ return dp;
- for (port = 0; port < ds->num_ports; port++) {
- dp = &ds->ports[port];
+ return NULL;
+}
- if (dp->dn == dn)
- return dp;
- }
- }
+static struct dsa_link *dsa_link_touch(struct dsa_port *dp,
+ struct dsa_port *link_dp)
+{
+ struct dsa_switch *ds = dp->ds;
+ struct dsa_switch_tree *dst;
+ struct dsa_link *dl;
- return NULL;
+ dst = ds->dst;
+
+ list_for_each_entry(dl, &dst->rtable, list)
+ if (dl->dp == dp && dl->link_dp == link_dp)
+ return dl;
+
+ dl = kzalloc(sizeof(*dl), GFP_KERNEL);
+ if (!dl)
+ return NULL;
+
+ dl->dp = dp;
+ dl->link_dp = link_dp;
+
+ INIT_LIST_HEAD(&dl->list);
+ list_add_tail(&dl->list, &dst->rtable);
+
+ return dl;
}
static bool dsa_port_setup_routing_table(struct dsa_port *dp)
@@ -138,6 +157,7 @@ static bool dsa_port_setup_routing_table(struct dsa_port *dp)
struct device_node *dn = dp->dn;
struct of_phandle_iterator it;
struct dsa_port *link_dp;
+ struct dsa_link *dl;
int err;
of_for_each_phandle(&it, err, dn, "link", NULL, 0) {
@@ -147,24 +167,22 @@ static bool dsa_port_setup_routing_table(struct dsa_port *dp)
return false;
}
- ds->rtable[link_dp->ds->index] = dp->index;
+ dl = dsa_link_touch(dp, link_dp);
+ if (!dl) {
+ of_node_put(it.node);
+ return false;
+ }
}
return true;
}
-static bool dsa_switch_setup_routing_table(struct dsa_switch *ds)
+static bool dsa_tree_setup_routing_table(struct dsa_switch_tree *dst)
{
bool complete = true;
struct dsa_port *dp;
- int i;
-
- for (i = 0; i < DSA_MAX_SWITCHES; i++)
- ds->rtable[i] = DSA_RTABLE_NONE;
-
- for (i = 0; i < ds->num_ports; i++) {
- dp = &ds->ports[i];
+ list_for_each_entry(dp, &dst->ports, list) {
if (dsa_port_is_dsa(dp)) {
complete = dsa_port_setup_routing_table(dp);
if (!complete)
@@ -175,178 +193,176 @@ static bool dsa_switch_setup_routing_table(struct dsa_switch *ds)
return complete;
}
-static bool dsa_tree_setup_routing_table(struct dsa_switch_tree *dst)
-{
- struct dsa_switch *ds;
- bool complete = true;
- int device;
-
- for (device = 0; device < DSA_MAX_SWITCHES; device++) {
- ds = dst->ds[device];
- if (!ds)
- continue;
-
- complete = dsa_switch_setup_routing_table(ds);
- if (!complete)
- break;
- }
-
- return complete;
-}
-
static struct dsa_port *dsa_tree_find_first_cpu(struct dsa_switch_tree *dst)
{
- struct dsa_switch *ds;
struct dsa_port *dp;
- int device, port;
-
- for (device = 0; device < DSA_MAX_SWITCHES; device++) {
- ds = dst->ds[device];
- if (!ds)
- continue;
-
- for (port = 0; port < ds->num_ports; port++) {
- dp = &ds->ports[port];
- if (dsa_port_is_cpu(dp))
- return dp;
- }
- }
+ list_for_each_entry(dp, &dst->ports, list)
+ if (dsa_port_is_cpu(dp))
+ return dp;
return NULL;
}
static int dsa_tree_setup_default_cpu(struct dsa_switch_tree *dst)
{
- struct dsa_switch *ds;
- struct dsa_port *dp;
- int device, port;
+ struct dsa_port *cpu_dp, *dp;
- /* DSA currently only supports a single CPU port */
- dst->cpu_dp = dsa_tree_find_first_cpu(dst);
- if (!dst->cpu_dp) {
- pr_warn("Tree has no master device\n");
+ cpu_dp = dsa_tree_find_first_cpu(dst);
+ if (!cpu_dp) {
+ pr_err("DSA: tree %d has no CPU port\n", dst->index);
return -EINVAL;
}
/* Assign the default CPU port to all ports of the fabric */
- for (device = 0; device < DSA_MAX_SWITCHES; device++) {
- ds = dst->ds[device];
- if (!ds)
- continue;
-
- for (port = 0; port < ds->num_ports; port++) {
- dp = &ds->ports[port];
-
- if (dsa_port_is_user(dp) || dsa_port_is_dsa(dp))
- dp->cpu_dp = dst->cpu_dp;
- }
- }
+ list_for_each_entry(dp, &dst->ports, list)
+ if (dsa_port_is_user(dp) || dsa_port_is_dsa(dp))
+ dp->cpu_dp = cpu_dp;
return 0;
}
static void dsa_tree_teardown_default_cpu(struct dsa_switch_tree *dst)
{
- /* DSA currently only supports a single CPU port */
- dst->cpu_dp = NULL;
+ struct dsa_port *dp;
+
+ list_for_each_entry(dp, &dst->ports, list)
+ if (dsa_port_is_user(dp) || dsa_port_is_dsa(dp))
+ dp->cpu_dp = NULL;
}
static int dsa_port_setup(struct dsa_port *dp)
{
- enum devlink_port_flavour flavour;
struct dsa_switch *ds = dp->ds;
struct dsa_switch_tree *dst = ds->dst;
+ const unsigned char *id = (const unsigned char *)&dst->index;
+ const unsigned char len = sizeof(dst->index);
+ struct devlink_port *dlp = &dp->devlink_port;
+ bool dsa_port_link_registered = false;
+ bool devlink_port_registered = false;
+ struct devlink *dl = ds->devlink;
+ bool dsa_port_enabled = false;
int err = 0;
- if (dp->type == DSA_PORT_TYPE_UNUSED)
+ if (dp->setup)
return 0;
- memset(&dp->devlink_port, 0, sizeof(dp->devlink_port));
- dp->mac = of_get_mac_address(dp->dn);
-
- switch (dp->type) {
- case DSA_PORT_TYPE_CPU:
- flavour = DEVLINK_PORT_FLAVOUR_CPU;
- break;
- case DSA_PORT_TYPE_DSA:
- flavour = DEVLINK_PORT_FLAVOUR_DSA;
- break;
- case DSA_PORT_TYPE_USER: /* fall-through */
- default:
- flavour = DEVLINK_PORT_FLAVOUR_PHYSICAL;
- break;
- }
-
- /* dp->index is used now as port_number. However
- * CPU and DSA ports should have separate numbering
- * independent from front panel port numbers.
- */
- devlink_port_attrs_set(&dp->devlink_port, flavour,
- dp->index, false, 0,
- (const char *) &dst->index, sizeof(dst->index));
- err = devlink_port_register(ds->devlink, &dp->devlink_port,
- dp->index);
- if (err)
- return err;
-
switch (dp->type) {
case DSA_PORT_TYPE_UNUSED:
+ dsa_port_disable(dp);
break;
case DSA_PORT_TYPE_CPU:
+ memset(dlp, 0, sizeof(*dlp));
+ devlink_port_attrs_set(dlp, DEVLINK_PORT_FLAVOUR_CPU,
+ dp->index, false, 0, id, len);
+ err = devlink_port_register(dl, dlp, dp->index);
+ if (err)
+ break;
+ devlink_port_registered = true;
+
err = dsa_port_link_register_of(dp);
if (err)
- dev_err(ds->dev, "failed to setup link for port %d.%d\n",
- ds->index, dp->index);
+ break;
+ dsa_port_link_registered = true;
+
+ err = dsa_port_enable(dp, NULL);
+ if (err)
+ break;
+ dsa_port_enabled = true;
+
break;
case DSA_PORT_TYPE_DSA:
+ memset(dlp, 0, sizeof(*dlp));
+ devlink_port_attrs_set(dlp, DEVLINK_PORT_FLAVOUR_DSA,
+ dp->index, false, 0, id, len);
+ err = devlink_port_register(dl, dlp, dp->index);
+ if (err)
+ break;
+ devlink_port_registered = true;
+
err = dsa_port_link_register_of(dp);
if (err)
- dev_err(ds->dev, "failed to setup link for port %d.%d\n",
- ds->index, dp->index);
+ break;
+ dsa_port_link_registered = true;
+
+ err = dsa_port_enable(dp, NULL);
+ if (err)
+ break;
+ dsa_port_enabled = true;
+
break;
case DSA_PORT_TYPE_USER:
+ memset(dlp, 0, sizeof(*dlp));
+ devlink_port_attrs_set(dlp, DEVLINK_PORT_FLAVOUR_PHYSICAL,
+ dp->index, false, 0, id, len);
+ err = devlink_port_register(dl, dlp, dp->index);
+ if (err)
+ break;
+ devlink_port_registered = true;
+
+ dp->mac = of_get_mac_address(dp->dn);
err = dsa_slave_create(dp);
if (err)
- dev_err(ds->dev, "failed to create slave for port %d.%d\n",
- ds->index, dp->index);
- else
- devlink_port_type_eth_set(&dp->devlink_port, dp->slave);
+ break;
+
+ devlink_port_type_eth_set(dlp, dp->slave);
break;
}
+ if (err && dsa_port_enabled)
+ dsa_port_disable(dp);
+ if (err && dsa_port_link_registered)
+ dsa_port_link_unregister_of(dp);
+ if (err && devlink_port_registered)
+ devlink_port_unregister(dlp);
if (err)
- devlink_port_unregister(&dp->devlink_port);
+ return err;
- return err;
+ dp->setup = true;
+
+ return 0;
}
static void dsa_port_teardown(struct dsa_port *dp)
{
- if (dp->type != DSA_PORT_TYPE_UNUSED)
- devlink_port_unregister(&dp->devlink_port);
+ struct devlink_port *dlp = &dp->devlink_port;
+
+ if (!dp->setup)
+ return;
switch (dp->type) {
case DSA_PORT_TYPE_UNUSED:
break;
case DSA_PORT_TYPE_CPU:
+ dsa_port_disable(dp);
dsa_tag_driver_put(dp->tag_ops);
- /* fall-through */
+ devlink_port_unregister(dlp);
+ dsa_port_link_unregister_of(dp);
+ break;
case DSA_PORT_TYPE_DSA:
+ dsa_port_disable(dp);
+ devlink_port_unregister(dlp);
dsa_port_link_unregister_of(dp);
break;
case DSA_PORT_TYPE_USER:
+ devlink_port_unregister(dlp);
if (dp->slave) {
dsa_slave_destroy(dp->slave);
dp->slave = NULL;
}
break;
}
+
+ dp->setup = false;
}
static int dsa_switch_setup(struct dsa_switch *ds)
{
- int err = 0;
+ struct dsa_devlink_priv *dl_priv;
+ int err;
+
+ if (ds->setup)
+ return 0;
/* Initialize ds->phys_mii_mask before registering the slave MDIO bus
* driver and before ops->setup() has run, since the switch drivers and
@@ -358,9 +374,11 @@ static int dsa_switch_setup(struct dsa_switch *ds)
/* Add the switch to devlink before calling setup, so that setup can
* add dpipe tables
*/
- ds->devlink = devlink_alloc(&dsa_devlink_ops, 0);
+ ds->devlink = devlink_alloc(&dsa_devlink_ops, sizeof(*dl_priv));
if (!ds->devlink)
return -ENOMEM;
+ dl_priv = devlink_priv(ds->devlink);
+ dl_priv->ds = ds;
err = devlink_register(ds->devlink, ds->dev);
if (err)
@@ -374,6 +392,8 @@ static int dsa_switch_setup(struct dsa_switch *ds)
if (err < 0)
goto unregister_notifier;
+ devlink_params_publish(ds->devlink);
+
if (!ds->slave_mii_bus && ds->ops->phy_read) {
ds->slave_mii_bus = devm_mdiobus_alloc(ds->dev);
if (!ds->slave_mii_bus) {
@@ -388,6 +408,8 @@ static int dsa_switch_setup(struct dsa_switch *ds)
goto unregister_notifier;
}
+ ds->setup = true;
+
return 0;
unregister_notifier:
@@ -403,6 +425,9 @@ free_devlink:
static void dsa_switch_teardown(struct dsa_switch *ds)
{
+ if (!ds->setup)
+ return;
+
if (ds->slave_mii_bus && ds->ops->phy_read)
mdiobus_unregister(ds->slave_mii_bus);
@@ -417,95 +442,72 @@ static void dsa_switch_teardown(struct dsa_switch *ds)
ds->devlink = NULL;
}
+ ds->setup = false;
}
static int dsa_tree_setup_switches(struct dsa_switch_tree *dst)
{
- struct dsa_switch *ds;
struct dsa_port *dp;
- int device, port, i;
- int err = 0;
-
- for (device = 0; device < DSA_MAX_SWITCHES; device++) {
- ds = dst->ds[device];
- if (!ds)
- continue;
+ int err;
- err = dsa_switch_setup(ds);
+ list_for_each_entry(dp, &dst->ports, list) {
+ err = dsa_switch_setup(dp->ds);
if (err)
- goto switch_teardown;
-
- for (port = 0; port < ds->num_ports; port++) {
- dp = &ds->ports[port];
+ goto teardown;
+ }
- err = dsa_port_setup(dp);
- if (err)
- goto ports_teardown;
- }
+ list_for_each_entry(dp, &dst->ports, list) {
+ err = dsa_port_setup(dp);
+ if (err)
+ goto teardown;
}
return 0;
-ports_teardown:
- for (i = 0; i < port; i++)
- dsa_port_teardown(&ds->ports[i]);
-
- dsa_switch_teardown(ds);
+teardown:
+ list_for_each_entry(dp, &dst->ports, list)
+ dsa_port_teardown(dp);
-switch_teardown:
- for (i = 0; i < device; i++) {
- ds = dst->ds[i];
- if (!ds)
- continue;
-
- for (port = 0; port < ds->num_ports; port++) {
- dp = &ds->ports[port];
-
- dsa_port_teardown(dp);
- }
-
- dsa_switch_teardown(ds);
- }
+ list_for_each_entry(dp, &dst->ports, list)
+ dsa_switch_teardown(dp->ds);
return err;
}
static void dsa_tree_teardown_switches(struct dsa_switch_tree *dst)
{
- struct dsa_switch *ds;
struct dsa_port *dp;
- int device, port;
- for (device = 0; device < DSA_MAX_SWITCHES; device++) {
- ds = dst->ds[device];
- if (!ds)
- continue;
-
- for (port = 0; port < ds->num_ports; port++) {
- dp = &ds->ports[port];
-
- dsa_port_teardown(dp);
- }
+ list_for_each_entry(dp, &dst->ports, list)
+ dsa_port_teardown(dp);
- dsa_switch_teardown(ds);
- }
+ list_for_each_entry(dp, &dst->ports, list)
+ dsa_switch_teardown(dp->ds);
}
static int dsa_tree_setup_master(struct dsa_switch_tree *dst)
{
- struct dsa_port *cpu_dp = dst->cpu_dp;
- struct net_device *master = cpu_dp->master;
+ struct dsa_port *dp;
+ int err;
- /* DSA currently supports a single pair of CPU port and master device */
- return dsa_master_setup(master, cpu_dp);
+ list_for_each_entry(dp, &dst->ports, list) {
+ if (dsa_port_is_cpu(dp)) {
+ err = dsa_master_setup(dp->master, dp);
+ if (err)
+ return err;
+ }
+ }
+
+ return 0;
}
static void dsa_tree_teardown_master(struct dsa_switch_tree *dst)
{
- struct dsa_port *cpu_dp = dst->cpu_dp;
- struct net_device *master = cpu_dp->master;
+ struct dsa_port *dp;
- return dsa_master_teardown(master);
+ list_for_each_entry(dp, &dst->ports, list)
+ if (dsa_port_is_cpu(dp))
+ dsa_master_teardown(dp->master);
}
static int dsa_tree_setup(struct dsa_switch_tree *dst)
@@ -551,6 +553,8 @@ teardown_default_cpu:
static void dsa_tree_teardown(struct dsa_switch_tree *dst)
{
+ struct dsa_link *dl, *next;
+
if (!dst->setup)
return;
@@ -560,39 +564,36 @@ static void dsa_tree_teardown(struct dsa_switch_tree *dst)
dsa_tree_teardown_default_cpu(dst);
+ list_for_each_entry_safe(dl, next, &dst->rtable, list) {
+ list_del(&dl->list);
+ kfree(dl);
+ }
+
pr_info("DSA: tree %d torn down\n", dst->index);
dst->setup = false;
}
-static void dsa_tree_remove_switch(struct dsa_switch_tree *dst,
- unsigned int index)
+static struct dsa_port *dsa_port_touch(struct dsa_switch *ds, int index)
{
- dsa_tree_teardown(dst);
+ struct dsa_switch_tree *dst = ds->dst;
+ struct dsa_port *dp;
- dst->ds[index] = NULL;
- dsa_tree_put(dst);
-}
+ list_for_each_entry(dp, &dst->ports, list)
+ if (dp->ds == ds && dp->index == index)
+ return dp;
-static int dsa_tree_add_switch(struct dsa_switch_tree *dst,
- struct dsa_switch *ds)
-{
- unsigned int index = ds->index;
- int err;
+ dp = kzalloc(sizeof(*dp), GFP_KERNEL);
+ if (!dp)
+ return NULL;
- if (dst->ds[index])
- return -EBUSY;
+ dp->ds = ds;
+ dp->index = index;
- dsa_tree_get(dst);
- dst->ds[index] = ds;
+ INIT_LIST_HEAD(&dp->list);
+ list_add_tail(&dp->list, &dst->ports);
- err = dsa_tree_setup(dst);
- if (err) {
- dst->ds[index] = NULL;
- dsa_tree_put(dst);
- }
-
- return err;
+ return dp;
}
static int dsa_port_parse_user(struct dsa_port *dp, const char *name)
@@ -613,6 +614,32 @@ static int dsa_port_parse_dsa(struct dsa_port *dp)
return 0;
}
+static enum dsa_tag_protocol dsa_get_tag_protocol(struct dsa_port *dp,
+ struct net_device *master)
+{
+ enum dsa_tag_protocol tag_protocol = DSA_TAG_PROTO_NONE;
+ struct dsa_switch *mds, *ds = dp->ds;
+ unsigned int mdp_upstream;
+ struct dsa_port *mdp;
+
+ /* It is possible to stack DSA switches onto one another when that
+ * happens the switch driver may want to know if its tagging protocol
+ * is going to work in such a configuration.
+ */
+ if (dsa_slave_dev_check(master)) {
+ mdp = dsa_slave_to_port(master);
+ mds = mdp->ds;
+ mdp_upstream = dsa_upstream_port(mds, mdp->index);
+ tag_protocol = mds->ops->get_tag_protocol(mds, mdp_upstream,
+ DSA_TAG_PROTO_NONE);
+ }
+
+ /* If the master device is not itself a DSA slave in a disjoint DSA
+ * tree, then return immediately.
+ */
+ return ds->ops->get_tag_protocol(ds, dp->index, tag_protocol);
+}
+
static int dsa_port_parse_cpu(struct dsa_port *dp, struct net_device *master)
{
struct dsa_switch *ds = dp->ds;
@@ -620,18 +647,21 @@ static int dsa_port_parse_cpu(struct dsa_port *dp, struct net_device *master)
const struct dsa_device_ops *tag_ops;
enum dsa_tag_protocol tag_protocol;
- tag_protocol = ds->ops->get_tag_protocol(ds, dp->index);
+ tag_protocol = dsa_get_tag_protocol(dp, master);
tag_ops = dsa_tag_driver_get(tag_protocol);
if (IS_ERR(tag_ops)) {
+ if (PTR_ERR(tag_ops) == -ENOPROTOOPT)
+ return -EPROBE_DEFER;
dev_warn(ds->dev, "No tagger for this switch\n");
+ dp->master = NULL;
return PTR_ERR(tag_ops);
}
+ dp->master = master;
dp->type = DSA_PORT_TYPE_CPU;
dp->filter = tag_ops->filter;
dp->rcv = tag_ops->rcv;
dp->tag_ops = tag_ops;
- dp->master = master;
dp->dst = dst;
return 0;
@@ -685,7 +715,7 @@ static int dsa_switch_parse_ports_of(struct dsa_switch *ds,
goto out_put_node;
}
- dp = &ds->ports[reg];
+ dp = dsa_to_port(ds, reg);
err = dsa_port_parse_of(dp, port);
if (err)
@@ -709,8 +739,6 @@ static int dsa_switch_parse_member_of(struct dsa_switch *ds,
return sz;
ds->index = m[1];
- if (ds->index >= DSA_MAX_SWITCHES)
- return -EINVAL;
ds->dst = dsa_tree_touch(m[0]);
if (!ds->dst)
@@ -719,6 +747,20 @@ static int dsa_switch_parse_member_of(struct dsa_switch *ds,
return 0;
}
+static int dsa_switch_touch_ports(struct dsa_switch *ds)
+{
+ struct dsa_port *dp;
+ int port;
+
+ for (port = 0; port < ds->num_ports; port++) {
+ dp = dsa_port_touch(ds, port);
+ if (!dp)
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
static int dsa_switch_parse_of(struct dsa_switch *ds, struct device_node *dn)
{
int err;
@@ -727,6 +769,10 @@ static int dsa_switch_parse_of(struct dsa_switch *ds, struct device_node *dn)
if (err)
return err;
+ err = dsa_switch_touch_ports(ds);
+ if (err)
+ return err;
+
return dsa_switch_parse_ports_of(ds, dn);
}
@@ -764,7 +810,7 @@ static int dsa_switch_parse_ports(struct dsa_switch *ds,
for (i = 0; i < DSA_MAX_PORTS; i++) {
name = cd->port_names[i];
dev = cd->netdev[i];
- dp = &ds->ports[i];
+ dp = dsa_to_port(ds, i);
if (!name)
continue;
@@ -784,6 +830,8 @@ static int dsa_switch_parse_ports(struct dsa_switch *ds,
static int dsa_switch_parse(struct dsa_switch *ds, struct dsa_chip_data *cd)
{
+ int err;
+
ds->cd = cd;
/* We don't support interconnected switches nor multiple trees via
@@ -794,69 +842,67 @@ static int dsa_switch_parse(struct dsa_switch *ds, struct dsa_chip_data *cd)
if (!ds->dst)
return -ENOMEM;
+ err = dsa_switch_touch_ports(ds);
+ if (err)
+ return err;
+
return dsa_switch_parse_ports(ds, cd);
}
-static int dsa_switch_add(struct dsa_switch *ds)
+static void dsa_switch_release_ports(struct dsa_switch *ds)
{
struct dsa_switch_tree *dst = ds->dst;
+ struct dsa_port *dp, *next;
- return dsa_tree_add_switch(dst, ds);
+ list_for_each_entry_safe(dp, next, &dst->ports, list) {
+ if (dp->ds != ds)
+ continue;
+ list_del(&dp->list);
+ kfree(dp);
+ }
}
static int dsa_switch_probe(struct dsa_switch *ds)
{
- struct dsa_chip_data *pdata = ds->dev->platform_data;
- struct device_node *np = ds->dev->of_node;
+ struct dsa_switch_tree *dst;
+ struct dsa_chip_data *pdata;
+ struct device_node *np;
int err;
- if (np)
- err = dsa_switch_parse_of(ds, np);
- else if (pdata)
- err = dsa_switch_parse(ds, pdata);
- else
- err = -ENODEV;
+ if (!ds->dev)
+ return -ENODEV;
- if (err)
- return err;
+ pdata = ds->dev->platform_data;
+ np = ds->dev->of_node;
- return dsa_switch_add(ds);
-}
-
-struct dsa_switch *dsa_switch_alloc(struct device *dev, size_t n)
-{
- struct dsa_switch *ds;
- int i;
-
- ds = devm_kzalloc(dev, struct_size(ds, ports, n), GFP_KERNEL);
- if (!ds)
- return NULL;
+ if (!ds->num_ports)
+ return -EINVAL;
- /* We avoid allocating memory outside dsa_switch
- * if it is not needed.
- */
- if (n <= sizeof(ds->_bitmap) * 8) {
- ds->bitmap = &ds->_bitmap;
+ if (np) {
+ err = dsa_switch_parse_of(ds, np);
+ if (err)
+ dsa_switch_release_ports(ds);
+ } else if (pdata) {
+ err = dsa_switch_parse(ds, pdata);
+ if (err)
+ dsa_switch_release_ports(ds);
} else {
- ds->bitmap = devm_kcalloc(dev,
- BITS_TO_LONGS(n),
- sizeof(unsigned long),
- GFP_KERNEL);
- if (unlikely(!ds->bitmap))
- return NULL;
+ err = -ENODEV;
}
- ds->dev = dev;
- ds->num_ports = n;
+ if (err)
+ return err;
- for (i = 0; i < ds->num_ports; ++i) {
- ds->ports[i].index = i;
- ds->ports[i].ds = ds;
+ dst = ds->dst;
+ dsa_tree_get(dst);
+ err = dsa_tree_setup(dst);
+ if (err) {
+ dsa_switch_release_ports(ds);
+ dsa_tree_put(dst);
}
- return ds;
+ return err;
}
-EXPORT_SYMBOL_GPL(dsa_switch_alloc);
int dsa_register_switch(struct dsa_switch *ds)
{
@@ -874,9 +920,10 @@ EXPORT_SYMBOL_GPL(dsa_register_switch);
static void dsa_switch_remove(struct dsa_switch *ds)
{
struct dsa_switch_tree *dst = ds->dst;
- unsigned int index = ds->index;
- dsa_tree_remove_switch(dst, index);
+ dsa_tree_teardown(dst);
+ dsa_switch_release_ports(ds);
+ dsa_tree_put(dst);
}
void dsa_unregister_switch(struct dsa_switch *ds)
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 12f8c7ee4dd8..a7662e7a691d 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -104,25 +104,14 @@ static inline struct net_device *dsa_master_find_slave(struct net_device *dev,
{
struct dsa_port *cpu_dp = dev->dsa_ptr;
struct dsa_switch_tree *dst = cpu_dp->dst;
- struct dsa_switch *ds;
- struct dsa_port *slave_port;
+ struct dsa_port *dp;
- if (device < 0 || device >= DSA_MAX_SWITCHES)
- return NULL;
+ list_for_each_entry(dp, &dst->ports, list)
+ if (dp->ds->index == device && dp->index == port &&
+ dp->type == DSA_PORT_TYPE_USER)
+ return dp->slave;
- ds = dst->ds[device];
- if (!ds)
- return NULL;
-
- if (port < 0 || port >= ds->num_ports)
- return NULL;
-
- slave_port = &ds->ports[port];
-
- if (unlikely(slave_port->type != DSA_PORT_TYPE_USER))
- return NULL;
-
- return slave_port->slave;
+ return NULL;
}
/* port.c */
@@ -161,22 +150,6 @@ int dsa_port_vid_add(struct dsa_port *dp, u16 vid, u16 flags);
int dsa_port_vid_del(struct dsa_port *dp, u16 vid);
int dsa_port_link_register_of(struct dsa_port *dp);
void dsa_port_link_unregister_of(struct dsa_port *dp);
-void dsa_port_phylink_validate(struct phylink_config *config,
- unsigned long *supported,
- struct phylink_link_state *state);
-int dsa_port_phylink_mac_link_state(struct phylink_config *config,
- struct phylink_link_state *state);
-void dsa_port_phylink_mac_config(struct phylink_config *config,
- unsigned int mode,
- const struct phylink_link_state *state);
-void dsa_port_phylink_mac_an_restart(struct phylink_config *config);
-void dsa_port_phylink_mac_link_down(struct phylink_config *config,
- unsigned int mode,
- phy_interface_t interface);
-void dsa_port_phylink_mac_link_up(struct phylink_config *config,
- unsigned int mode,
- phy_interface_t interface,
- struct phy_device *phydev);
extern const struct phylink_mac_ops dsa_port_phylink_mac_ops;
/* slave.c */
@@ -184,13 +157,12 @@ extern const struct dsa_device_ops notag_netdev_ops;
void dsa_slave_mii_bus_init(struct dsa_switch *ds);
int dsa_slave_create(struct dsa_port *dp);
void dsa_slave_destroy(struct net_device *slave_dev);
+bool dsa_slave_dev_check(const struct net_device *dev);
int dsa_slave_suspend(struct net_device *slave_dev);
int dsa_slave_resume(struct net_device *slave_dev);
int dsa_slave_register_notifier(void);
void dsa_slave_unregister_notifier(void);
-void *dsa_defer_xmit(struct sk_buff *skb, struct net_device *dev);
-
static inline struct dsa_port *dsa_slave_to_port(const struct net_device *dev)
{
struct dsa_slave_priv *p = netdev_priv(dev);
diff --git a/net/dsa/master.c b/net/dsa/master.c
index 4b52f8bac5e1..bd44bde272f4 100644
--- a/net/dsa/master.c
+++ b/net/dsa/master.c
@@ -8,6 +8,70 @@
#include "dsa_priv.h"
+static int dsa_master_get_regs_len(struct net_device *dev)
+{
+ struct dsa_port *cpu_dp = dev->dsa_ptr;
+ const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops;
+ struct dsa_switch *ds = cpu_dp->ds;
+ int port = cpu_dp->index;
+ int ret = 0;
+ int len;
+
+ if (ops->get_regs_len) {
+ len = ops->get_regs_len(dev);
+ if (len < 0)
+ return len;
+ ret += len;
+ }
+
+ ret += sizeof(struct ethtool_drvinfo);
+ ret += sizeof(struct ethtool_regs);
+
+ if (ds->ops->get_regs_len) {
+ len = ds->ops->get_regs_len(ds, port);
+ if (len < 0)
+ return len;
+ ret += len;
+ }
+
+ return ret;
+}
+
+static void dsa_master_get_regs(struct net_device *dev,
+ struct ethtool_regs *regs, void *data)
+{
+ struct dsa_port *cpu_dp = dev->dsa_ptr;
+ const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops;
+ struct dsa_switch *ds = cpu_dp->ds;
+ struct ethtool_drvinfo *cpu_info;
+ struct ethtool_regs *cpu_regs;
+ int port = cpu_dp->index;
+ int len;
+
+ if (ops->get_regs_len && ops->get_regs) {
+ len = ops->get_regs_len(dev);
+ if (len < 0)
+ return;
+ regs->len = len;
+ ops->get_regs(dev, regs, data);
+ data += regs->len;
+ }
+
+ cpu_info = (struct ethtool_drvinfo *)data;
+ strlcpy(cpu_info->driver, "dsa", sizeof(cpu_info->driver));
+ data += sizeof(*cpu_info);
+ cpu_regs = (struct ethtool_regs *)data;
+ data += sizeof(*cpu_regs);
+
+ if (ds->ops->get_regs_len && ds->ops->get_regs) {
+ len = ds->ops->get_regs_len(ds, port);
+ if (len < 0)
+ return;
+ cpu_regs->len = len;
+ ds->ops->get_regs(ds, port, cpu_regs, data);
+ }
+}
+
static void dsa_master_get_ethtool_stats(struct net_device *dev,
struct ethtool_stats *stats,
uint64_t *data)
@@ -133,6 +197,35 @@ static int dsa_master_get_phys_port_name(struct net_device *dev,
return 0;
}
+static int dsa_master_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+{
+ struct dsa_port *cpu_dp = dev->dsa_ptr;
+ struct dsa_switch *ds = cpu_dp->ds;
+ struct dsa_switch_tree *dst;
+ int err = -EOPNOTSUPP;
+ struct dsa_port *dp;
+
+ dst = ds->dst;
+
+ switch (cmd) {
+ case SIOCGHWTSTAMP:
+ case SIOCSHWTSTAMP:
+ /* Deny PTP operations on master if there is at least one
+ * switch in the tree that is PTP capable.
+ */
+ list_for_each_entry(dp, &dst->ports, list)
+ if (dp->ds->ops->port_hwtstamp_get ||
+ dp->ds->ops->port_hwtstamp_set)
+ return -EBUSY;
+ break;
+ }
+
+ if (cpu_dp->orig_ndo_ops && cpu_dp->orig_ndo_ops->ndo_do_ioctl)
+ err = cpu_dp->orig_ndo_ops->ndo_do_ioctl(dev, ifr, cmd);
+
+ return err;
+}
+
static int dsa_master_ethtool_setup(struct net_device *dev)
{
struct dsa_port *cpu_dp = dev->dsa_ptr;
@@ -147,6 +240,8 @@ static int dsa_master_ethtool_setup(struct net_device *dev)
if (cpu_dp->orig_ethtool_ops)
memcpy(ops, cpu_dp->orig_ethtool_ops, sizeof(*ops));
+ ops->get_regs_len = dsa_master_get_regs_len;
+ ops->get_regs = dsa_master_get_regs;
ops->get_sset_count = dsa_master_get_sset_count;
ops->get_ethtool_stats = dsa_master_get_ethtool_stats;
ops->get_strings = dsa_master_get_strings;
@@ -183,6 +278,7 @@ static int dsa_master_ndo_setup(struct net_device *dev)
memcpy(ops, cpu_dp->orig_ndo_ops, sizeof(*ops));
ops->ndo_get_phys_port_name = dsa_master_get_phys_port_name;
+ ops->ndo_do_ioctl = dsa_master_ioctl;
dev->netdev_ops = ops;
@@ -244,8 +340,6 @@ static void dsa_master_reset_mtu(struct net_device *dev)
rtnl_unlock();
}
-static struct lock_class_key dsa_master_addr_list_lock_key;
-
int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp)
{
int ret;
@@ -259,9 +353,6 @@ int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp)
wmb();
dev->dsa_ptr = cpu_dp;
- lockdep_set_class(&dev->addr_list_lock,
- &dsa_master_addr_list_lock_key);
-
ret = dsa_master_ethtool_setup(dev);
if (ret)
return ret;
diff --git a/net/dsa/port.c b/net/dsa/port.c
index f071acf2842b..774facb8d547 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -348,10 +348,7 @@ int dsa_port_vlan_add(struct dsa_port *dp,
.vlan = vlan,
};
- if (!dp->bridge_dev || br_vlan_enabled(dp->bridge_dev))
- return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_ADD, &info);
-
- return 0;
+ return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_ADD, &info);
}
int dsa_port_vlan_del(struct dsa_port *dp,
@@ -363,10 +360,7 @@ int dsa_port_vlan_del(struct dsa_port *dp,
.vlan = vlan,
};
- if (!dp->bridge_dev || br_vlan_enabled(dp->bridge_dev))
- return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_DEL, &info);
-
- return 0;
+ return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_DEL, &info);
}
int dsa_port_vid_add(struct dsa_port *dp, u16 vid, u16 flags)
@@ -382,8 +376,8 @@ int dsa_port_vid_add(struct dsa_port *dp, u16 vid, u16 flags)
trans.ph_prepare = true;
err = dsa_port_vlan_add(dp, &vlan, &trans);
- if (err == -EOPNOTSUPP)
- return 0;
+ if (err)
+ return err;
trans.ph_prepare = false;
return dsa_port_vlan_add(dp, &vlan, &trans);
@@ -421,9 +415,9 @@ static struct phy_device *dsa_port_get_phy_device(struct dsa_port *dp)
return phydev;
}
-void dsa_port_phylink_validate(struct phylink_config *config,
- unsigned long *supported,
- struct phylink_link_state *state)
+static void dsa_port_phylink_validate(struct phylink_config *config,
+ unsigned long *supported,
+ struct phylink_link_state *state)
{
struct dsa_port *dp = container_of(config, struct dsa_port, pl_config);
struct dsa_switch *ds = dp->ds;
@@ -433,25 +427,26 @@ void dsa_port_phylink_validate(struct phylink_config *config,
ds->ops->phylink_validate(ds, dp->index, supported, state);
}
-EXPORT_SYMBOL_GPL(dsa_port_phylink_validate);
-int dsa_port_phylink_mac_link_state(struct phylink_config *config,
- struct phylink_link_state *state)
+static void dsa_port_phylink_mac_pcs_get_state(struct phylink_config *config,
+ struct phylink_link_state *state)
{
struct dsa_port *dp = container_of(config, struct dsa_port, pl_config);
struct dsa_switch *ds = dp->ds;
- /* Only called for SGMII and 802.3z */
- if (!ds->ops->phylink_mac_link_state)
- return -EOPNOTSUPP;
+ /* Only called for inband modes */
+ if (!ds->ops->phylink_mac_link_state) {
+ state->link = 0;
+ return;
+ }
- return ds->ops->phylink_mac_link_state(ds, dp->index, state);
+ if (ds->ops->phylink_mac_link_state(ds, dp->index, state) < 0)
+ state->link = 0;
}
-EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_link_state);
-void dsa_port_phylink_mac_config(struct phylink_config *config,
- unsigned int mode,
- const struct phylink_link_state *state)
+static void dsa_port_phylink_mac_config(struct phylink_config *config,
+ unsigned int mode,
+ const struct phylink_link_state *state)
{
struct dsa_port *dp = container_of(config, struct dsa_port, pl_config);
struct dsa_switch *ds = dp->ds;
@@ -461,9 +456,8 @@ void dsa_port_phylink_mac_config(struct phylink_config *config,
ds->ops->phylink_mac_config(ds, dp->index, mode, state);
}
-EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_config);
-void dsa_port_phylink_mac_an_restart(struct phylink_config *config)
+static void dsa_port_phylink_mac_an_restart(struct phylink_config *config)
{
struct dsa_port *dp = container_of(config, struct dsa_port, pl_config);
struct dsa_switch *ds = dp->ds;
@@ -473,11 +467,10 @@ void dsa_port_phylink_mac_an_restart(struct phylink_config *config)
ds->ops->phylink_mac_an_restart(ds, dp->index);
}
-EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_an_restart);
-void dsa_port_phylink_mac_link_down(struct phylink_config *config,
- unsigned int mode,
- phy_interface_t interface)
+static void dsa_port_phylink_mac_link_down(struct phylink_config *config,
+ unsigned int mode,
+ phy_interface_t interface)
{
struct dsa_port *dp = container_of(config, struct dsa_port, pl_config);
struct phy_device *phydev = NULL;
@@ -494,12 +487,11 @@ void dsa_port_phylink_mac_link_down(struct phylink_config *config,
ds->ops->phylink_mac_link_down(ds, dp->index, mode, interface);
}
-EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_link_down);
-void dsa_port_phylink_mac_link_up(struct phylink_config *config,
- unsigned int mode,
- phy_interface_t interface,
- struct phy_device *phydev)
+static void dsa_port_phylink_mac_link_up(struct phylink_config *config,
+ unsigned int mode,
+ phy_interface_t interface,
+ struct phy_device *phydev)
{
struct dsa_port *dp = container_of(config, struct dsa_port, pl_config);
struct dsa_switch *ds = dp->ds;
@@ -512,11 +504,10 @@ void dsa_port_phylink_mac_link_up(struct phylink_config *config,
ds->ops->phylink_mac_link_up(ds, dp->index, mode, interface, phydev);
}
-EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_link_up);
const struct phylink_mac_ops dsa_port_phylink_mac_ops = {
.validate = dsa_port_phylink_validate,
- .mac_link_state = dsa_port_phylink_mac_link_state,
+ .mac_pcs_get_state = dsa_port_phylink_mac_pcs_get_state,
.mac_config = dsa_port_phylink_mac_config,
.mac_an_restart = dsa_port_phylink_mac_an_restart,
.mac_link_down = dsa_port_phylink_mac_link_down,
@@ -538,10 +529,6 @@ static int dsa_port_setup_phy_of(struct dsa_port *dp, bool enable)
return PTR_ERR(phydev);
if (enable) {
- err = genphy_config_init(phydev);
- if (err < 0)
- goto err_put_dev;
-
err = genphy_resume(phydev);
if (err < 0)
goto err_put_dev;
@@ -571,7 +558,7 @@ static int dsa_port_fixed_link_register_of(struct dsa_port *dp)
struct dsa_switch *ds = dp->ds;
struct phy_device *phydev;
int port = dp->index;
- int mode;
+ phy_interface_t mode;
int err;
err = of_phy_register_fixed_link(dn);
@@ -584,12 +571,11 @@ static int dsa_port_fixed_link_register_of(struct dsa_port *dp)
phydev = of_phy_find_device(dn);
- mode = of_get_phy_mode(dn);
- if (mode < 0)
+ err = of_get_phy_mode(dn, &mode);
+ if (err)
mode = PHY_INTERFACE_MODE_NA;
phydev->interface = mode;
- genphy_config_init(phydev);
genphy_read_status(phydev);
if (ds->ops->adjust_link)
@@ -604,14 +590,16 @@ static int dsa_port_phylink_register(struct dsa_port *dp)
{
struct dsa_switch *ds = dp->ds;
struct device_node *port_dn = dp->dn;
- int mode, err;
+ phy_interface_t mode;
+ int err;
- mode = of_get_phy_mode(port_dn);
- if (mode < 0)
+ err = of_get_phy_mode(port_dn, &mode);
+ if (err)
mode = PHY_INTERFACE_MODE_NA;
dp->pl_config.dev = ds->dev;
dp->pl_config.type = PHYLINK_DEV;
+ dp->pl_config.pcs_poll = ds->pcs_poll;
dp->pl = phylink_create(&dp->pl_config, of_fwnode_handle(port_dn),
mode, &dsa_port_phylink_mac_ops);
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 33f41178afcc..088c886e609e 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -22,8 +22,6 @@
#include "dsa_priv.h"
-static bool dsa_slave_dev_check(const struct net_device *dev);
-
/* slave mii_bus handling ***************************************************/
static int dsa_slave_phy_read(struct mii_bus *bus, int addr, int reg)
{
@@ -116,9 +114,6 @@ static int dsa_slave_close(struct net_device *dev)
struct net_device *master = dsa_slave_to_master(dev);
struct dsa_port *dp = dsa_slave_to_port(dev);
- cancel_work_sync(&dp->xmit_work);
- skb_queue_purge(&dp->xmit_queue);
-
phylink_stop(dp->pl);
dsa_port_disable(dp);
@@ -312,6 +307,39 @@ static int dsa_slave_port_attr_set(struct net_device *dev,
return ret;
}
+static int dsa_slave_vlan_add(struct net_device *dev,
+ const struct switchdev_obj *obj,
+ struct switchdev_trans *trans)
+{
+ struct dsa_port *dp = dsa_slave_to_port(dev);
+ struct switchdev_obj_port_vlan vlan;
+ int err;
+
+ if (obj->orig_dev != dev)
+ return -EOPNOTSUPP;
+
+ if (dp->bridge_dev && !br_vlan_enabled(dp->bridge_dev))
+ return 0;
+
+ vlan = *SWITCHDEV_OBJ_PORT_VLAN(obj);
+
+ err = dsa_port_vlan_add(dp, &vlan, trans);
+ if (err)
+ return err;
+
+ /* We need the dedicated CPU port to be a member of the VLAN as well.
+ * Even though drivers often handle CPU membership in special ways,
+ * it doesn't make sense to program a PVID, so clear this flag.
+ */
+ vlan.flags &= ~BRIDGE_VLAN_INFO_PVID;
+
+ err = dsa_port_vlan_add(dp->cpu_dp, &vlan, trans);
+ if (err)
+ return err;
+
+ return 0;
+}
+
static int dsa_slave_port_obj_add(struct net_device *dev,
const struct switchdev_obj *obj,
struct switchdev_trans *trans,
@@ -339,10 +367,7 @@ static int dsa_slave_port_obj_add(struct net_device *dev,
trans);
break;
case SWITCHDEV_OBJ_ID_PORT_VLAN:
- if (obj->orig_dev != dev)
- return -EOPNOTSUPP;
- err = dsa_port_vlan_add(dp, SWITCHDEV_OBJ_PORT_VLAN(obj),
- trans);
+ err = dsa_slave_vlan_add(dev, obj, trans);
break;
default:
err = -EOPNOTSUPP;
@@ -352,6 +377,23 @@ static int dsa_slave_port_obj_add(struct net_device *dev,
return err;
}
+static int dsa_slave_vlan_del(struct net_device *dev,
+ const struct switchdev_obj *obj)
+{
+ struct dsa_port *dp = dsa_slave_to_port(dev);
+
+ if (obj->orig_dev != dev)
+ return -EOPNOTSUPP;
+
+ if (dp->bridge_dev && !br_vlan_enabled(dp->bridge_dev))
+ return 0;
+
+ /* Do not deprogram the CPU port as it may be shared with other user
+ * ports which can be members of this VLAN as well.
+ */
+ return dsa_port_vlan_del(dp, SWITCHDEV_OBJ_PORT_VLAN(obj));
+}
+
static int dsa_slave_port_obj_del(struct net_device *dev,
const struct switchdev_obj *obj)
{
@@ -371,9 +413,7 @@ static int dsa_slave_port_obj_del(struct net_device *dev,
err = dsa_port_mdb_del(dp->cpu_dp, SWITCHDEV_OBJ_PORT_MDB(obj));
break;
case SWITCHDEV_OBJ_ID_PORT_VLAN:
- if (obj->orig_dev != dev)
- return -EOPNOTSUPP;
- err = dsa_port_vlan_del(dp, SWITCHDEV_OBJ_PORT_VLAN(obj));
+ err = dsa_slave_vlan_del(dev, obj);
break;
default:
err = -EOPNOTSUPP;
@@ -473,7 +513,6 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev)
s->tx_bytes += skb->len;
u64_stats_update_end(&s->syncp);
- DSA_SKB_CB(skb)->deferred_xmit = false;
DSA_SKB_CB(skb)->clone = NULL;
/* Identify PTP protocol packets, clone them, and pass them to the
@@ -486,39 +525,13 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev)
*/
nskb = p->xmit(skb, dev);
if (!nskb) {
- if (!DSA_SKB_CB(skb)->deferred_xmit)
- kfree_skb(skb);
+ kfree_skb(skb);
return NETDEV_TX_OK;
}
return dsa_enqueue_skb(nskb, dev);
}
-void *dsa_defer_xmit(struct sk_buff *skb, struct net_device *dev)
-{
- struct dsa_port *dp = dsa_slave_to_port(dev);
-
- DSA_SKB_CB(skb)->deferred_xmit = true;
-
- skb_queue_tail(&dp->xmit_queue, skb);
- schedule_work(&dp->xmit_work);
- return NULL;
-}
-EXPORT_SYMBOL_GPL(dsa_defer_xmit);
-
-static void dsa_port_xmit_work(struct work_struct *work)
-{
- struct dsa_port *dp = container_of(work, struct dsa_port, xmit_work);
- struct dsa_switch *ds = dp->ds;
- struct sk_buff *skb;
-
- if (unlikely(!ds->ops->port_deferred_xmit))
- return;
-
- while ((skb = skb_dequeue(&dp->xmit_queue)) != NULL)
- ds->ops->port_deferred_xmit(ds, dp->index, skb);
-}
-
/* ethtool operations *******************************************************/
static void dsa_slave_get_drvinfo(struct net_device *dev,
@@ -744,6 +757,22 @@ static int dsa_slave_set_link_ksettings(struct net_device *dev,
return phylink_ethtool_ksettings_set(dp->pl, cmd);
}
+static void dsa_slave_get_pauseparam(struct net_device *dev,
+ struct ethtool_pauseparam *pause)
+{
+ struct dsa_port *dp = dsa_slave_to_port(dev);
+
+ phylink_ethtool_get_pauseparam(dp->pl, pause);
+}
+
+static int dsa_slave_set_pauseparam(struct net_device *dev,
+ struct ethtool_pauseparam *pause)
+{
+ struct dsa_port *dp = dsa_slave_to_port(dev);
+
+ return phylink_ethtool_set_pauseparam(dp->pl, pause);
+}
+
#ifdef CONFIG_NET_POLL_CONTROLLER
static int dsa_slave_netpoll_setup(struct net_device *dev,
struct netpoll_info *ni)
@@ -990,12 +1019,16 @@ static int dsa_slave_setup_tc_block(struct net_device *dev,
static int dsa_slave_setup_tc(struct net_device *dev, enum tc_setup_type type,
void *type_data)
{
- switch (type) {
- case TC_SETUP_BLOCK:
+ struct dsa_port *dp = dsa_slave_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+
+ if (type == TC_SETUP_BLOCK)
return dsa_slave_setup_tc_block(dev, type_data);
- default:
+
+ if (!ds->ops->port_setup_tc)
return -EOPNOTSUPP;
- }
+
+ return ds->ops->port_setup_tc(ds, dp->index, type, type_data);
}
static void dsa_slave_get_stats64(struct net_device *dev,
@@ -1073,6 +1106,9 @@ static int dsa_slave_vlan_rx_add_vid(struct net_device *dev, __be16 proto,
* need to emulate the switchdev prepare + commit phase.
*/
if (dp->bridge_dev) {
+ if (!br_vlan_enabled(dp->bridge_dev))
+ return 0;
+
/* br_vlan_get_info() returns -EINVAL or -ENOENT if the
* device, respectively the VID is not found, returning
* 0 means success, which is a failure for us here.
@@ -1082,8 +1118,15 @@ static int dsa_slave_vlan_rx_add_vid(struct net_device *dev, __be16 proto,
return -EBUSY;
}
- /* This API only allows programming tagged, non-PVID VIDs */
- return dsa_port_vid_add(dp, vid, 0);
+ ret = dsa_port_vid_add(dp, vid, 0);
+ if (ret)
+ return ret;
+
+ ret = dsa_port_vid_add(dp->cpu_dp, vid, 0);
+ if (ret)
+ return ret;
+
+ return 0;
}
static int dsa_slave_vlan_rx_kill_vid(struct net_device *dev, __be16 proto,
@@ -1097,6 +1140,9 @@ static int dsa_slave_vlan_rx_kill_vid(struct net_device *dev, __be16 proto,
* need to emulate the switchdev prepare + commit phase.
*/
if (dp->bridge_dev) {
+ if (!br_vlan_enabled(dp->bridge_dev))
+ return 0;
+
/* br_vlan_get_info() returns -EINVAL or -ENOENT if the
* device, respectively the VID is not found, returning
* 0 means success, which is a failure for us here.
@@ -1106,11 +1152,10 @@ static int dsa_slave_vlan_rx_kill_vid(struct net_device *dev, __be16 proto,
return -EBUSY;
}
- ret = dsa_port_vid_del(dp, vid);
- if (ret == -EOPNOTSUPP)
- ret = 0;
-
- return ret;
+ /* Do not deprogram the CPU port as it may be shared with other user
+ * ports which can be members of this VLAN as well.
+ */
+ return dsa_port_vid_del(dp, vid);
}
static const struct ethtool_ops dsa_slave_ethtool_ops = {
@@ -1131,6 +1176,8 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = {
.get_eee = dsa_slave_get_eee,
.get_link_ksettings = dsa_slave_get_link_ksettings,
.set_link_ksettings = dsa_slave_set_link_ksettings,
+ .get_pauseparam = dsa_slave_get_pauseparam,
+ .set_pauseparam = dsa_slave_set_pauseparam,
.get_rxnfc = dsa_slave_get_rxnfc,
.set_rxnfc = dsa_slave_set_rxnfc,
.get_ts_info = dsa_slave_get_ts_info,
@@ -1234,11 +1281,12 @@ static int dsa_slave_phy_setup(struct net_device *slave_dev)
struct dsa_port *dp = dsa_slave_to_port(slave_dev);
struct device_node *port_dn = dp->dn;
struct dsa_switch *ds = dp->ds;
+ phy_interface_t mode;
u32 phy_flags = 0;
- int mode, ret;
+ int ret;
- mode = of_get_phy_mode(port_dn);
- if (mode < 0)
+ ret = of_get_phy_mode(port_dn, &mode);
+ if (ret)
mode = PHY_INTERFACE_MODE_NA;
dp->pl_config.dev = &slave_dev->dev;
@@ -1280,15 +1328,6 @@ static int dsa_slave_phy_setup(struct net_device *slave_dev)
return ret;
}
-static struct lock_class_key dsa_slave_netdev_xmit_lock_key;
-static void dsa_slave_set_lockdep_class_one(struct net_device *dev,
- struct netdev_queue *txq,
- void *_unused)
-{
- lockdep_set_class(&txq->_xmit_lock,
- &dsa_slave_netdev_xmit_lock_key);
-}
-
int dsa_slave_suspend(struct net_device *slave_dev)
{
struct dsa_port *dp = dsa_slave_to_port(slave_dev);
@@ -1296,9 +1335,6 @@ int dsa_slave_suspend(struct net_device *slave_dev)
if (!netif_running(slave_dev))
return 0;
- cancel_work_sync(&dp->xmit_work);
- skb_queue_purge(&dp->xmit_queue);
-
netif_device_detach(slave_dev);
rtnl_lock();
@@ -1357,8 +1393,9 @@ int dsa_slave_create(struct dsa_port *port)
if (slave_dev == NULL)
return -ENOMEM;
- slave_dev->features = master->vlan_features | NETIF_F_HW_TC |
- NETIF_F_HW_VLAN_CTAG_FILTER;
+ slave_dev->features = master->vlan_features | NETIF_F_HW_TC;
+ if (ds->ops->port_vlan_add && ds->ops->port_vlan_del)
+ slave_dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
slave_dev->hw_features |= NETIF_F_HW_TC;
slave_dev->ethtool_ops = &dsa_slave_ethtool_ops;
if (!IS_ERR_OR_NULL(port->mac))
@@ -1371,9 +1408,6 @@ int dsa_slave_create(struct dsa_port *port)
slave_dev->max_mtu = ETH_MAX_MTU;
SET_NETDEV_DEVTYPE(slave_dev, &dsa_type);
- netdev_for_each_tx_queue(slave_dev, dsa_slave_set_lockdep_class_one,
- NULL);
-
SET_NETDEV_DEV(slave_dev, port->ds->dev);
slave_dev->dev.of_node = port->dn;
slave_dev->vlan_features = master->vlan_features;
@@ -1386,8 +1420,6 @@ int dsa_slave_create(struct dsa_port *port)
}
p->dp = port;
INIT_LIST_HEAD(&p->mall_tc_list);
- INIT_WORK(&port->xmit_work, dsa_port_xmit_work);
- skb_queue_head_init(&port->xmit_queue);
p->xmit = cpu_dp->tag_ops->xmit;
port->slave = slave_dev;
@@ -1439,7 +1471,7 @@ void dsa_slave_destroy(struct net_device *slave_dev)
free_netdev(slave_dev);
}
-static bool dsa_slave_dev_check(const struct net_device *dev)
+bool dsa_slave_dev_check(const struct net_device *dev)
{
return dev->netdev_ops == &dsa_slave_netdev_ops;
}
diff --git a/net/dsa/switch.c b/net/dsa/switch.c
index 09d9286b27cc..df4abe897ed6 100644
--- a/net/dsa/switch.c
+++ b/net/dsa/switch.c
@@ -20,7 +20,7 @@ static unsigned int dsa_switch_fastest_ageing_time(struct dsa_switch *ds,
int i;
for (i = 0; i < ds->num_ports; ++i) {
- struct dsa_port *dp = &ds->ports[i];
+ struct dsa_port *dp = dsa_to_port(ds, i);
if (dp->ageing_time && dp->ageing_time < ageing_time)
ageing_time = dp->ageing_time;
@@ -98,7 +98,7 @@ static int dsa_switch_bridge_leave(struct dsa_switch *ds,
if (unset_vlan_filtering) {
struct switchdev_trans trans = {0};
- err = dsa_port_vlan_filtering(&ds->ports[info->port],
+ err = dsa_port_vlan_filtering(dsa_to_port(ds, info->port),
false, &trans);
if (err && err != EOPNOTSUPP)
return err;
@@ -128,57 +128,51 @@ static int dsa_switch_fdb_del(struct dsa_switch *ds,
return ds->ops->port_fdb_del(ds, port, info->addr, info->vid);
}
-static int
-dsa_switch_mdb_prepare_bitmap(struct dsa_switch *ds,
- const struct switchdev_obj_port_mdb *mdb,
- const unsigned long *bitmap)
+static bool dsa_switch_mdb_match(struct dsa_switch *ds, int port,
+ struct dsa_notifier_mdb_info *info)
+{
+ if (ds->index == info->sw_index && port == info->port)
+ return true;
+
+ if (dsa_is_dsa_port(ds, port))
+ return true;
+
+ return false;
+}
+
+static int dsa_switch_mdb_prepare(struct dsa_switch *ds,
+ struct dsa_notifier_mdb_info *info)
{
int port, err;
if (!ds->ops->port_mdb_prepare || !ds->ops->port_mdb_add)
return -EOPNOTSUPP;
- for_each_set_bit(port, bitmap, ds->num_ports) {
- err = ds->ops->port_mdb_prepare(ds, port, mdb);
- if (err)
- return err;
+ for (port = 0; port < ds->num_ports; port++) {
+ if (dsa_switch_mdb_match(ds, port, info)) {
+ err = ds->ops->port_mdb_prepare(ds, port, info->mdb);
+ if (err)
+ return err;
+ }
}
return 0;
}
-static void dsa_switch_mdb_add_bitmap(struct dsa_switch *ds,
- const struct switchdev_obj_port_mdb *mdb,
- const unsigned long *bitmap)
-{
- int port;
-
- if (!ds->ops->port_mdb_add)
- return;
-
- for_each_set_bit(port, bitmap, ds->num_ports)
- ds->ops->port_mdb_add(ds, port, mdb);
-}
-
static int dsa_switch_mdb_add(struct dsa_switch *ds,
struct dsa_notifier_mdb_info *info)
{
- const struct switchdev_obj_port_mdb *mdb = info->mdb;
- struct switchdev_trans *trans = info->trans;
int port;
- /* Build a mask of Multicast group members */
- bitmap_zero(ds->bitmap, ds->num_ports);
- if (ds->index == info->sw_index)
- set_bit(info->port, ds->bitmap);
- for (port = 0; port < ds->num_ports; port++)
- if (dsa_is_dsa_port(ds, port))
- set_bit(port, ds->bitmap);
+ if (switchdev_trans_ph_prepare(info->trans))
+ return dsa_switch_mdb_prepare(ds, info);
- if (switchdev_trans_ph_prepare(trans))
- return dsa_switch_mdb_prepare_bitmap(ds, mdb, ds->bitmap);
+ if (!ds->ops->port_mdb_add)
+ return 0;
- dsa_switch_mdb_add_bitmap(ds, mdb, ds->bitmap);
+ for (port = 0; port < ds->num_ports; port++)
+ if (dsa_switch_mdb_match(ds, port, info))
+ ds->ops->port_mdb_add(ds, port, info->mdb);
return 0;
}
@@ -186,13 +180,11 @@ static int dsa_switch_mdb_add(struct dsa_switch *ds,
static int dsa_switch_mdb_del(struct dsa_switch *ds,
struct dsa_notifier_mdb_info *info)
{
- const struct switchdev_obj_port_mdb *mdb = info->mdb;
-
if (!ds->ops->port_mdb_del)
return -EOPNOTSUPP;
if (ds->index == info->sw_index)
- return ds->ops->port_mdb_del(ds, info->port, mdb);
+ return ds->ops->port_mdb_del(ds, info->port, info->mdb);
return 0;
}
@@ -234,59 +226,55 @@ static int dsa_port_vlan_check(struct dsa_switch *ds, int port,
(void *)vlan);
}
-static int
-dsa_switch_vlan_prepare_bitmap(struct dsa_switch *ds,
- const struct switchdev_obj_port_vlan *vlan,
- const unsigned long *bitmap)
+static bool dsa_switch_vlan_match(struct dsa_switch *ds, int port,
+ struct dsa_notifier_vlan_info *info)
+{
+ if (ds->index == info->sw_index && port == info->port)
+ return true;
+
+ if (dsa_is_dsa_port(ds, port))
+ return true;
+
+ return false;
+}
+
+static int dsa_switch_vlan_prepare(struct dsa_switch *ds,
+ struct dsa_notifier_vlan_info *info)
{
int port, err;
if (!ds->ops->port_vlan_prepare || !ds->ops->port_vlan_add)
return -EOPNOTSUPP;
- for_each_set_bit(port, bitmap, ds->num_ports) {
- err = dsa_port_vlan_check(ds, port, vlan);
- if (err)
- return err;
+ for (port = 0; port < ds->num_ports; port++) {
+ if (dsa_switch_vlan_match(ds, port, info)) {
+ err = dsa_port_vlan_check(ds, port, info->vlan);
+ if (err)
+ return err;
- err = ds->ops->port_vlan_prepare(ds, port, vlan);
- if (err)
- return err;
+ err = ds->ops->port_vlan_prepare(ds, port, info->vlan);
+ if (err)
+ return err;
+ }
}
return 0;
}
-static void
-dsa_switch_vlan_add_bitmap(struct dsa_switch *ds,
- const struct switchdev_obj_port_vlan *vlan,
- const unsigned long *bitmap)
-{
- int port;
-
- for_each_set_bit(port, bitmap, ds->num_ports)
- ds->ops->port_vlan_add(ds, port, vlan);
-}
-
static int dsa_switch_vlan_add(struct dsa_switch *ds,
struct dsa_notifier_vlan_info *info)
{
- const struct switchdev_obj_port_vlan *vlan = info->vlan;
- struct switchdev_trans *trans = info->trans;
int port;
- /* Build a mask of VLAN members */
- bitmap_zero(ds->bitmap, ds->num_ports);
- if (ds->index == info->sw_index)
- set_bit(info->port, ds->bitmap);
- for (port = 0; port < ds->num_ports; port++)
- if (dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port))
- set_bit(port, ds->bitmap);
+ if (switchdev_trans_ph_prepare(info->trans))
+ return dsa_switch_vlan_prepare(ds, info);
- if (switchdev_trans_ph_prepare(trans))
- return dsa_switch_vlan_prepare_bitmap(ds, vlan, ds->bitmap);
+ if (!ds->ops->port_vlan_add)
+ return 0;
- dsa_switch_vlan_add_bitmap(ds, vlan, ds->bitmap);
+ for (port = 0; port < ds->num_ports; port++)
+ if (dsa_switch_vlan_match(ds, port, info))
+ ds->ops->port_vlan_add(ds, port, info->vlan);
return 0;
}
@@ -294,14 +282,15 @@ static int dsa_switch_vlan_add(struct dsa_switch *ds,
static int dsa_switch_vlan_del(struct dsa_switch *ds,
struct dsa_notifier_vlan_info *info)
{
- const struct switchdev_obj_port_vlan *vlan = info->vlan;
-
if (!ds->ops->port_vlan_del)
return -EOPNOTSUPP;
if (ds->index == info->sw_index)
- return ds->ops->port_vlan_del(ds, info->port, vlan);
+ return ds->ops->port_vlan_del(ds, info->port, info->vlan);
+ /* Do not deprogram the DSA links as they may be used as conduit
+ * for other VLAN members in the fabric.
+ */
return 0;
}
diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c
index 6ebbd799c4eb..2fb6c26294b5 100644
--- a/net/dsa/tag_8021q.c
+++ b/net/dsa/tag_8021q.c
@@ -28,16 +28,17 @@
*
* RSV - VID[9]:
* To be used for further expansion of SWITCH_ID or for other purposes.
+ * Must be transmitted as zero and ignored on receive.
*
* SWITCH_ID - VID[8:6]:
- * Index of switch within DSA tree. Must be between 0 and
- * DSA_MAX_SWITCHES - 1.
+ * Index of switch within DSA tree. Must be between 0 and 7.
*
* RSV - VID[5:4]:
* To be used for further expansion of PORT or for other purposes.
+ * Must be transmitted as zero and ignored on receive.
*
* PORT - VID[3:0]:
- * Index of switch port. Must be between 0 and DSA_MAX_PORTS - 1.
+ * Index of switch port. Must be between 0 and 15.
*/
#define DSA_8021Q_DIR_SHIFT 10
@@ -91,6 +92,79 @@ int dsa_8021q_rx_source_port(u16 vid)
}
EXPORT_SYMBOL_GPL(dsa_8021q_rx_source_port);
+static int dsa_8021q_restore_pvid(struct dsa_switch *ds, int port)
+{
+ struct bridge_vlan_info vinfo;
+ struct net_device *slave;
+ u16 pvid;
+ int err;
+
+ if (!dsa_is_user_port(ds, port))
+ return 0;
+
+ slave = dsa_to_port(ds, port)->slave;
+
+ err = br_vlan_get_pvid(slave, &pvid);
+ if (!pvid || err < 0)
+ /* There is no pvid on the bridge for this port, which is
+ * perfectly valid. Nothing to restore, bye-bye!
+ */
+ return 0;
+
+ err = br_vlan_get_info(slave, pvid, &vinfo);
+ if (err < 0) {
+ dev_err(ds->dev, "Couldn't determine PVID attributes\n");
+ return err;
+ }
+
+ return dsa_port_vid_add(dsa_to_port(ds, port), pvid, vinfo.flags);
+}
+
+/* If @enabled is true, installs @vid with @flags into the switch port's HW
+ * filter.
+ * If @enabled is false, deletes @vid (ignores @flags) from the port. Had the
+ * user explicitly configured this @vid through the bridge core, then the @vid
+ * is installed again, but this time with the flags from the bridge layer.
+ */
+static int dsa_8021q_vid_apply(struct dsa_switch *ds, int port, u16 vid,
+ u16 flags, bool enabled)
+{
+ struct dsa_port *dp = dsa_to_port(ds, port);
+ struct bridge_vlan_info vinfo;
+ int err;
+
+ if (enabled)
+ return dsa_port_vid_add(dp, vid, flags);
+
+ err = dsa_port_vid_del(dp, vid);
+ if (err < 0)
+ return err;
+
+ /* Nothing to restore from the bridge for a non-user port.
+ * The CPU port VLANs are restored implicitly with the user ports,
+ * similar to how the bridge does in dsa_slave_vlan_add and
+ * dsa_slave_vlan_del.
+ */
+ if (!dsa_is_user_port(ds, port))
+ return 0;
+
+ err = br_vlan_get_info(dp->slave, vid, &vinfo);
+ /* Couldn't determine bridge attributes for this vid,
+ * it means the bridge had not configured it.
+ */
+ if (err < 0)
+ return 0;
+
+ /* Restore the VID from the bridge */
+ err = dsa_port_vid_add(dp, vid, vinfo.flags);
+ if (err < 0)
+ return err;
+
+ vinfo.flags &= ~BRIDGE_VLAN_INFO_PVID;
+
+ return dsa_port_vid_add(dp->cpu_dp, vid, vinfo.flags);
+}
+
/* RX VLAN tagging (left) and TX VLAN tagging (right) setup shown for a single
* front-panel switch port (here swp0).
*
@@ -146,8 +220,6 @@ EXPORT_SYMBOL_GPL(dsa_8021q_rx_source_port);
int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int port, bool enabled)
{
int upstream = dsa_upstream_port(ds, port);
- struct dsa_port *dp = &ds->ports[port];
- struct dsa_port *upstream_dp = &ds->ports[upstream];
u16 rx_vid = dsa_8021q_rx_vid(ds, port);
u16 tx_vid = dsa_8021q_tx_vid(ds, port);
int i, err;
@@ -164,7 +236,6 @@ int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int port, bool enabled)
* restrictions, so there are no concerns about leaking traffic.
*/
for (i = 0; i < ds->num_ports; i++) {
- struct dsa_port *other_dp = &ds->ports[i];
u16 flags;
if (i == upstream)
@@ -177,10 +248,7 @@ int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int port, bool enabled)
/* The RX VID is a regular VLAN on all others */
flags = BRIDGE_VLAN_INFO_UNTAGGED;
- if (enabled)
- err = dsa_port_vid_add(other_dp, rx_vid, flags);
- else
- err = dsa_port_vid_del(other_dp, rx_vid);
+ err = dsa_8021q_vid_apply(ds, i, rx_vid, flags, enabled);
if (err) {
dev_err(ds->dev, "Failed to apply RX VID %d to port %d: %d\n",
rx_vid, port, err);
@@ -191,10 +259,7 @@ int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int port, bool enabled)
/* CPU port needs to see this port's RX VID
* as tagged egress.
*/
- if (enabled)
- err = dsa_port_vid_add(upstream_dp, rx_vid, 0);
- else
- err = dsa_port_vid_del(upstream_dp, rx_vid);
+ err = dsa_8021q_vid_apply(ds, upstream, rx_vid, 0, enabled);
if (err) {
dev_err(ds->dev, "Failed to apply RX VID %d to port %d: %d\n",
rx_vid, port, err);
@@ -202,26 +267,24 @@ int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int port, bool enabled)
}
/* Finally apply the TX VID on this port and on the CPU port */
- if (enabled)
- err = dsa_port_vid_add(dp, tx_vid, BRIDGE_VLAN_INFO_UNTAGGED);
- else
- err = dsa_port_vid_del(dp, tx_vid);
+ err = dsa_8021q_vid_apply(ds, port, tx_vid, BRIDGE_VLAN_INFO_UNTAGGED,
+ enabled);
if (err) {
dev_err(ds->dev, "Failed to apply TX VID %d on port %d: %d\n",
tx_vid, port, err);
return err;
}
- if (enabled)
- err = dsa_port_vid_add(upstream_dp, tx_vid, 0);
- else
- err = dsa_port_vid_del(upstream_dp, tx_vid);
+ err = dsa_8021q_vid_apply(ds, upstream, tx_vid, 0, enabled);
if (err) {
dev_err(ds->dev, "Failed to apply TX VID %d on port %d: %d\n",
tx_vid, upstream, err);
return err;
}
- return 0;
+ if (!enabled)
+ err = dsa_8021q_restore_pvid(ds, port);
+
+ return err;
}
EXPORT_SYMBOL_GPL(dsa_port_setup_8021q_tagging);
@@ -278,13 +341,4 @@ struct sk_buff *dsa_8021q_remove_header(struct sk_buff *skb)
}
EXPORT_SYMBOL_GPL(dsa_8021q_remove_header);
-static const struct dsa_device_ops dsa_8021q_netdev_ops = {
- .name = "8021q",
- .proto = DSA_TAG_PROTO_8021Q,
- .overhead = VLAN_HLEN,
-};
-
MODULE_LICENSE("GPL v2");
-MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_8021Q);
-
-module_dsa_tag_driver(dsa_8021q_netdev_ops);
diff --git a/net/dsa/tag_ar9331.c b/net/dsa/tag_ar9331.c
new file mode 100644
index 000000000000..55b00694cdba
--- /dev/null
+++ b/net/dsa/tag_ar9331.c
@@ -0,0 +1,96 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2019 Pengutronix, Oleksij Rempel <kernel@pengutronix.de>
+ */
+
+
+#include <linux/bitfield.h>
+#include <linux/etherdevice.h>
+
+#include "dsa_priv.h"
+
+#define AR9331_HDR_LEN 2
+#define AR9331_HDR_VERSION 1
+
+#define AR9331_HDR_VERSION_MASK GENMASK(15, 14)
+#define AR9331_HDR_PRIORITY_MASK GENMASK(13, 12)
+#define AR9331_HDR_TYPE_MASK GENMASK(10, 8)
+#define AR9331_HDR_BROADCAST BIT(7)
+#define AR9331_HDR_FROM_CPU BIT(6)
+/* AR9331_HDR_RESERVED - not used or may be version field.
+ * According to the AR8216 doc it should 0b10. On AR9331 it is 0b11 on RX path
+ * and should be set to 0b11 to make it work.
+ */
+#define AR9331_HDR_RESERVED_MASK GENMASK(5, 4)
+#define AR9331_HDR_PORT_NUM_MASK GENMASK(3, 0)
+
+static struct sk_buff *ar9331_tag_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct dsa_port *dp = dsa_slave_to_port(dev);
+ __le16 *phdr;
+ u16 hdr;
+
+ if (skb_cow_head(skb, AR9331_HDR_LEN) < 0)
+ return NULL;
+
+ phdr = skb_push(skb, AR9331_HDR_LEN);
+
+ hdr = FIELD_PREP(AR9331_HDR_VERSION_MASK, AR9331_HDR_VERSION);
+ hdr |= AR9331_HDR_FROM_CPU | dp->index;
+ /* 0b10 for AR8216 and 0b11 for AR9331 */
+ hdr |= AR9331_HDR_RESERVED_MASK;
+
+ phdr[0] = cpu_to_le16(hdr);
+
+ return skb;
+}
+
+static struct sk_buff *ar9331_tag_rcv(struct sk_buff *skb,
+ struct net_device *ndev,
+ struct packet_type *pt)
+{
+ u8 ver, port;
+ u16 hdr;
+
+ if (unlikely(!pskb_may_pull(skb, AR9331_HDR_LEN)))
+ return NULL;
+
+ hdr = le16_to_cpu(*(__le16 *)skb_mac_header(skb));
+
+ ver = FIELD_GET(AR9331_HDR_VERSION_MASK, hdr);
+ if (unlikely(ver != AR9331_HDR_VERSION)) {
+ netdev_warn_once(ndev, "%s:%i wrong header version 0x%2x\n",
+ __func__, __LINE__, hdr);
+ return NULL;
+ }
+
+ if (unlikely(hdr & AR9331_HDR_FROM_CPU)) {
+ netdev_warn_once(ndev, "%s:%i packet should not be from cpu 0x%2x\n",
+ __func__, __LINE__, hdr);
+ return NULL;
+ }
+
+ skb_pull_rcsum(skb, AR9331_HDR_LEN);
+
+ /* Get source port information */
+ port = FIELD_GET(AR9331_HDR_PORT_NUM_MASK, hdr);
+
+ skb->dev = dsa_master_find_slave(ndev, 0, port);
+ if (!skb->dev)
+ return NULL;
+
+ return skb;
+}
+
+static const struct dsa_device_ops ar9331_netdev_ops = {
+ .name = "ar9331",
+ .proto = DSA_TAG_PROTO_AR9331,
+ .xmit = ar9331_tag_xmit,
+ .rcv = ar9331_tag_rcv,
+ .overhead = AR9331_HDR_LEN,
+};
+
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_AR9331);
+module_dsa_tag_driver(ar9331_netdev_ops);
diff --git a/net/dsa/tag_gswip.c b/net/dsa/tag_gswip.c
index b678160bbd66..408d4af390a0 100644
--- a/net/dsa/tag_gswip.c
+++ b/net/dsa/tag_gswip.c
@@ -104,7 +104,7 @@ static struct sk_buff *gswip_tag_rcv(struct sk_buff *skb,
}
static const struct dsa_device_ops gswip_netdev_ops = {
- .name = "gwsip",
+ .name = "gswip",
.proto = DSA_TAG_PROTO_GSWIP,
.xmit = gswip_tag_xmit,
.rcv = gswip_tag_rcv,
diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c
index b4872b87d4a6..90d055c4df9e 100644
--- a/net/dsa/tag_ksz.c
+++ b/net/dsa/tag_ksz.c
@@ -70,6 +70,65 @@ static struct sk_buff *ksz_common_rcv(struct sk_buff *skb,
}
/*
+ * For Ingress (Host -> KSZ8795), 1 byte is added before FCS.
+ * ---------------------------------------------------------------------------
+ * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|tag(1byte)|FCS(4bytes)
+ * ---------------------------------------------------------------------------
+ * tag : each bit represents port (eg, 0x01=port1, 0x02=port2, 0x10=port5)
+ *
+ * For Egress (KSZ8795 -> Host), 1 byte is added before FCS.
+ * ---------------------------------------------------------------------------
+ * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|tag0(1byte)|FCS(4bytes)
+ * ---------------------------------------------------------------------------
+ * tag0 : zero-based value represents port
+ * (eg, 0x00=port1, 0x02=port3, 0x06=port7)
+ */
+
+#define KSZ8795_TAIL_TAG_OVERRIDE BIT(6)
+#define KSZ8795_TAIL_TAG_LOOKUP BIT(7)
+
+static struct sk_buff *ksz8795_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct dsa_port *dp = dsa_slave_to_port(dev);
+ struct sk_buff *nskb;
+ u8 *tag;
+ u8 *addr;
+
+ nskb = ksz_common_xmit(skb, dev, KSZ_INGRESS_TAG_LEN);
+ if (!nskb)
+ return NULL;
+
+ /* Tag encoding */
+ tag = skb_put(nskb, KSZ_INGRESS_TAG_LEN);
+ addr = skb_mac_header(nskb);
+
+ *tag = 1 << dp->index;
+ if (is_link_local_ether_addr(addr))
+ *tag |= KSZ8795_TAIL_TAG_OVERRIDE;
+
+ return nskb;
+}
+
+static struct sk_buff *ksz8795_rcv(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt)
+{
+ u8 *tag = skb_tail_pointer(skb) - KSZ_EGRESS_TAG_LEN;
+
+ return ksz_common_rcv(skb, dev, tag[0] & 7, KSZ_EGRESS_TAG_LEN);
+}
+
+static const struct dsa_device_ops ksz8795_netdev_ops = {
+ .name = "ksz8795",
+ .proto = DSA_TAG_PROTO_KSZ8795,
+ .xmit = ksz8795_xmit,
+ .rcv = ksz8795_rcv,
+ .overhead = KSZ_INGRESS_TAG_LEN,
+};
+
+DSA_TAG_DRIVER(ksz8795_netdev_ops);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_KSZ8795);
+
+/*
* For Ingress (Host -> KSZ9477), 2 bytes are added before FCS.
* ---------------------------------------------------------------------------
* DA(6bytes)|SA(6bytes)|....|Data(nbytes)|tag0(1byte)|tag1(1byte)|FCS(4bytes)
@@ -183,6 +242,7 @@ DSA_TAG_DRIVER(ksz9893_netdev_ops);
MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_KSZ9893);
static struct dsa_tag_driver *dsa_tag_driver_array[] = {
+ &DSA_TAG_DRIVER_NAME(ksz8795_netdev_ops),
&DSA_TAG_DRIVER_NAME(ksz9477_netdev_ops),
&DSA_TAG_DRIVER_NAME(ksz9893_netdev_ops),
};
diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c
new file mode 100644
index 000000000000..8e3e7283d430
--- /dev/null
+++ b/net/dsa/tag_ocelot.c
@@ -0,0 +1,241 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright 2019 NXP Semiconductors
+ */
+#include <soc/mscc/ocelot.h>
+#include <linux/packing.h>
+#include "dsa_priv.h"
+
+/* The CPU injection header and the CPU extraction header can have 3 types of
+ * prefixes: long, short and no prefix. The format of the header itself is the
+ * same in all 3 cases.
+ *
+ * Extraction with long prefix:
+ *
+ * +-------------------+-------------------+------+------+------------+-------+
+ * | ff:ff:ff:ff:ff:ff | ff:ff:ff:ff:ff:ff | 8880 | 000a | extraction | frame |
+ * | | | | | header | |
+ * +-------------------+-------------------+------+------+------------+-------+
+ * 48 bits 48 bits 16 bits 16 bits 128 bits
+ *
+ * Extraction with short prefix:
+ *
+ * +------+------+------------+-------+
+ * | 8880 | 000a | extraction | frame |
+ * | | | header | |
+ * +------+------+------------+-------+
+ * 16 bits 16 bits 128 bits
+ *
+ * Extraction with no prefix:
+ *
+ * +------------+-------+
+ * | extraction | frame |
+ * | header | |
+ * +------------+-------+
+ * 128 bits
+ *
+ *
+ * Injection with long prefix:
+ *
+ * +-------------------+-------------------+------+------+------------+-------+
+ * | any dmac | any smac | 8880 | 000a | injection | frame |
+ * | | | | | header | |
+ * +-------------------+-------------------+------+------+------------+-------+
+ * 48 bits 48 bits 16 bits 16 bits 128 bits
+ *
+ * Injection with short prefix:
+ *
+ * +------+------+------------+-------+
+ * | 8880 | 000a | injection | frame |
+ * | | | header | |
+ * +------+------+------------+-------+
+ * 16 bits 16 bits 128 bits
+ *
+ * Injection with no prefix:
+ *
+ * +------------+-------+
+ * | injection | frame |
+ * | header | |
+ * +------------+-------+
+ * 128 bits
+ *
+ * The injection header looks like this (network byte order, bit 127
+ * is part of lowest address byte in memory, bit 0 is part of highest
+ * address byte):
+ *
+ * +------+------+------+------+------+------+------+------+
+ * 127:120 |BYPASS| MASQ | MASQ_PORT |REW_OP|REW_OP|
+ * +------+------+------+------+------+------+------+------+
+ * 119:112 | REW_OP |
+ * +------+------+------+------+------+------+------+------+
+ * 111:104 | REW_VAL |
+ * +------+------+------+------+------+------+------+------+
+ * 103: 96 | REW_VAL |
+ * +------+------+------+------+------+------+------+------+
+ * 95: 88 | REW_VAL |
+ * +------+------+------+------+------+------+------+------+
+ * 87: 80 | REW_VAL |
+ * +------+------+------+------+------+------+------+------+
+ * 79: 72 | RSV |
+ * +------+------+------+------+------+------+------+------+
+ * 71: 64 | RSV | DEST |
+ * +------+------+------+------+------+------+------+------+
+ * 63: 56 | DEST |
+ * +------+------+------+------+------+------+------+------+
+ * 55: 48 | RSV |
+ * +------+------+------+------+------+------+------+------+
+ * 47: 40 | RSV | SRC_PORT | RSV |TFRM_TIMER|
+ * +------+------+------+------+------+------+------+------+
+ * 39: 32 | TFRM_TIMER | RSV |
+ * +------+------+------+------+------+------+------+------+
+ * 31: 24 | RSV | DP | POP_CNT | CPUQ |
+ * +------+------+------+------+------+------+------+------+
+ * 23: 16 | CPUQ | QOS_CLASS |TAG_TYPE|
+ * +------+------+------+------+------+------+------+------+
+ * 15: 8 | PCP | DEI | VID |
+ * +------+------+------+------+------+------+------+------+
+ * 7: 0 | VID |
+ * +------+------+------+------+------+------+------+------+
+ *
+ * And the extraction header looks like this:
+ *
+ * +------+------+------+------+------+------+------+------+
+ * 127:120 | RSV | REW_OP |
+ * +------+------+------+------+------+------+------+------+
+ * 119:112 | REW_OP | REW_VAL |
+ * +------+------+------+------+------+------+------+------+
+ * 111:104 | REW_VAL |
+ * +------+------+------+------+------+------+------+------+
+ * 103: 96 | REW_VAL |
+ * +------+------+------+------+------+------+------+------+
+ * 95: 88 | REW_VAL |
+ * +------+------+------+------+------+------+------+------+
+ * 87: 80 | REW_VAL | LLEN |
+ * +------+------+------+------+------+------+------+------+
+ * 79: 72 | LLEN | WLEN |
+ * +------+------+------+------+------+------+------+------+
+ * 71: 64 | WLEN | RSV |
+ * +------+------+------+------+------+------+------+------+
+ * 63: 56 | RSV |
+ * +------+------+------+------+------+------+------+------+
+ * 55: 48 | RSV |
+ * +------+------+------+------+------+------+------+------+
+ * 47: 40 | RSV | SRC_PORT | ACL_ID |
+ * +------+------+------+------+------+------+------+------+
+ * 39: 32 | ACL_ID | RSV | SFLOW_ID |
+ * +------+------+------+------+------+------+------+------+
+ * 31: 24 |ACL_HIT| DP | LRN_FLAGS | CPUQ |
+ * +------+------+------+------+------+------+------+------+
+ * 23: 16 | CPUQ | QOS_CLASS |TAG_TYPE|
+ * +------+------+------+------+------+------+------+------+
+ * 15: 8 | PCP | DEI | VID |
+ * +------+------+------+------+------+------+------+------+
+ * 7: 0 | VID |
+ * +------+------+------+------+------+------+------+------+
+ */
+
+static struct sk_buff *ocelot_xmit(struct sk_buff *skb,
+ struct net_device *netdev)
+{
+ struct dsa_port *dp = dsa_slave_to_port(netdev);
+ u64 bypass, dest, src, qos_class, rew_op;
+ struct dsa_switch *ds = dp->ds;
+ int port = dp->index;
+ struct ocelot *ocelot = ds->priv;
+ struct ocelot_port *ocelot_port = ocelot->ports[port];
+ u8 *injection;
+
+ if (unlikely(skb_cow_head(skb, OCELOT_TAG_LEN) < 0)) {
+ netdev_err(netdev, "Cannot make room for tag.\n");
+ return NULL;
+ }
+
+ injection = skb_push(skb, OCELOT_TAG_LEN);
+
+ memset(injection, 0, OCELOT_TAG_LEN);
+
+ src = dsa_upstream_port(ds, port);
+ dest = BIT(port);
+ bypass = true;
+ qos_class = skb->priority;
+
+ packing(injection, &bypass, 127, 127, OCELOT_TAG_LEN, PACK, 0);
+ packing(injection, &dest, 68, 56, OCELOT_TAG_LEN, PACK, 0);
+ packing(injection, &src, 46, 43, OCELOT_TAG_LEN, PACK, 0);
+ packing(injection, &qos_class, 19, 17, OCELOT_TAG_LEN, PACK, 0);
+
+ if (ocelot->ptp && (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) {
+ rew_op = ocelot_port->ptp_cmd;
+ if (ocelot_port->ptp_cmd == IFH_REW_OP_TWO_STEP_PTP) {
+ rew_op |= (ocelot_port->ts_id % 4) << 3;
+ ocelot_port->ts_id++;
+ }
+
+ packing(injection, &rew_op, 125, 117, OCELOT_TAG_LEN, PACK, 0);
+ }
+
+ return skb;
+}
+
+static struct sk_buff *ocelot_rcv(struct sk_buff *skb,
+ struct net_device *netdev,
+ struct packet_type *pt)
+{
+ u64 src_port, qos_class;
+ u8 *start = skb->data;
+ u8 *extraction;
+
+ /* Revert skb->data by the amount consumed by the DSA master,
+ * so it points to the beginning of the frame.
+ */
+ skb_push(skb, ETH_HLEN);
+ /* We don't care about the long prefix, it is just for easy entrance
+ * into the DSA master's RX filter. Discard it now by moving it into
+ * the headroom.
+ */
+ skb_pull(skb, OCELOT_LONG_PREFIX_LEN);
+ /* And skb->data now points to the extraction frame header.
+ * Keep a pointer to it.
+ */
+ extraction = skb->data;
+ /* Now the EFH is part of the headroom as well */
+ skb_pull(skb, OCELOT_TAG_LEN);
+ /* Reset the pointer to the real MAC header */
+ skb_reset_mac_header(skb);
+ skb_reset_mac_len(skb);
+ /* And move skb->data to the correct location again */
+ skb_pull(skb, ETH_HLEN);
+
+ /* Remove from inet csum the extraction header */
+ skb_postpull_rcsum(skb, start, OCELOT_LONG_PREFIX_LEN + OCELOT_TAG_LEN);
+
+ packing(extraction, &src_port, 46, 43, OCELOT_TAG_LEN, UNPACK, 0);
+ packing(extraction, &qos_class, 19, 17, OCELOT_TAG_LEN, UNPACK, 0);
+
+ skb->dev = dsa_master_find_slave(netdev, 0, src_port);
+ if (!skb->dev)
+ /* The switch will reflect back some frames sent through
+ * sockets opened on the bare DSA master. These will come back
+ * with src_port equal to the index of the CPU port, for which
+ * there is no slave registered. So don't print any error
+ * message here (ignore and drop those frames).
+ */
+ return NULL;
+
+ skb->offload_fwd_mark = 1;
+ skb->priority = qos_class;
+
+ return skb;
+}
+
+static struct dsa_device_ops ocelot_netdev_ops = {
+ .name = "ocelot",
+ .proto = DSA_TAG_PROTO_OCELOT,
+ .xmit = ocelot_xmit,
+ .rcv = ocelot_rcv,
+ .overhead = OCELOT_TAG_LEN + OCELOT_LONG_PREFIX_LEN,
+};
+
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_OCELOT);
+
+module_dsa_tag_driver(ocelot_netdev_ops);
diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c
index c95885215525..70db7c909f74 100644
--- a/net/dsa/tag_qca.c
+++ b/net/dsa/tag_qca.c
@@ -33,10 +33,7 @@ static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev)
struct dsa_port *dp = dsa_slave_to_port(dev);
u16 *phdr, hdr;
- dev->stats.tx_packets++;
- dev->stats.tx_bytes += skb->len;
-
- if (skb_cow_head(skb, 0) < 0)
+ if (skb_cow_head(skb, QCA_HDR_LEN) < 0)
return NULL;
skb_push(skb, QCA_HDR_LEN);
diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c
index 47ee88163a9d..5366ea430349 100644
--- a/net/dsa/tag_sja1105.c
+++ b/net/dsa/tag_sja1105.c
@@ -83,20 +83,33 @@ static bool sja1105_filter(const struct sk_buff *skb, struct net_device *dev)
return false;
}
+/* Calls sja1105_port_deferred_xmit in sja1105_main.c */
+static struct sk_buff *sja1105_defer_xmit(struct sja1105_port *sp,
+ struct sk_buff *skb)
+{
+ /* Increase refcount so the kfree_skb in dsa_slave_xmit
+ * won't really free the packet.
+ */
+ skb_queue_tail(&sp->xmit_queue, skb_get(skb));
+ kthread_queue_work(sp->xmit_worker, &sp->xmit_work);
+
+ return NULL;
+}
+
static struct sk_buff *sja1105_xmit(struct sk_buff *skb,
struct net_device *netdev)
{
struct dsa_port *dp = dsa_slave_to_port(netdev);
- struct dsa_switch *ds = dp->ds;
- u16 tx_vid = dsa_8021q_tx_vid(ds, dp->index);
- u8 pcp = skb->priority;
+ u16 tx_vid = dsa_8021q_tx_vid(dp->ds, dp->index);
+ u16 queue_mapping = skb_get_queue_mapping(skb);
+ u8 pcp = netdev_txq_to_tc(netdev, queue_mapping);
/* Transmitting management traffic does not rely upon switch tagging,
* but instead SPI-installed management routes. Part 2 of this
* is the .port_deferred_xmit driver callback.
*/
if (unlikely(sja1105_is_link_local(skb)))
- return dsa_defer_xmit(skb, netdev);
+ return sja1105_defer_xmit(dp->priv, skb);
/* If we are under a vlan_filtering bridge, IP termination on
* switch ports based on 802.1Q tags is simply too brittle to
@@ -155,7 +168,11 @@ static struct sk_buff
/* Step 1: A timestampable frame was received.
* Buffer it until we get its meta frame.
*/
- if (is_link_local && sp->data->hwts_rx_en) {
+ if (is_link_local) {
+ if (!test_bit(SJA1105_HWTS_RX_EN, &sp->data->state))
+ /* Do normal processing. */
+ return skb;
+
spin_lock(&sp->data->meta_lock);
/* Was this a link-local frame instead of the meta
* that we were expecting?
@@ -186,6 +203,12 @@ static struct sk_buff
} else if (is_meta) {
struct sk_buff *stampable_skb;
+ /* Drop the meta frame if we're not in the right state
+ * to process it.
+ */
+ if (!test_bit(SJA1105_HWTS_RX_EN, &sp->data->state))
+ return NULL;
+
spin_lock(&sp->data->meta_lock);
stampable_skb = sp->data->stampable_skb;
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 17374afee28f..c8b903302ff2 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -244,7 +244,12 @@ int eth_header_cache(const struct neighbour *neigh, struct hh_cache *hh, __be16
eth->h_proto = type;
memcpy(eth->h_source, dev->dev_addr, ETH_ALEN);
memcpy(eth->h_dest, neigh->ha, ETH_ALEN);
- hh->hh_len = ETH_HLEN;
+
+ /* Pairs with READ_ONCE() in neigh_resolve_output(),
+ * neigh_hh_output() and neigh_update_hhs().
+ */
+ smp_store_release(&hh->hh_len, ETH_HLEN);
+
return 0;
}
EXPORT_SYMBOL(eth_header_cache);
@@ -330,22 +335,6 @@ int eth_mac_addr(struct net_device *dev, void *p)
}
EXPORT_SYMBOL(eth_mac_addr);
-/**
- * eth_change_mtu - set new MTU size
- * @dev: network device
- * @new_mtu: new Maximum Transfer Unit
- *
- * Allow changing MTU size. Needs to be overridden for devices
- * supporting jumbo frames.
- */
-int eth_change_mtu(struct net_device *dev, int new_mtu)
-{
- netdev_warn(dev, "%s is deprecated\n", __func__);
- dev->mtu = new_mtu;
- return 0;
-}
-EXPORT_SYMBOL(eth_change_mtu);
-
int eth_validate_addr(struct net_device *dev)
{
if (!is_valid_ether_addr(dev->dev_addr))
diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile
new file mode 100644
index 000000000000..424545a4aaec
--- /dev/null
+++ b/net/ethtool/Makefile
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+obj-y += ioctl.o common.o
+
+obj-$(CONFIG_ETHTOOL_NETLINK) += ethtool_nl.o
+
+ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o \
+ linkstate.o debug.o wol.o
diff --git a/net/ethtool/bitset.c b/net/ethtool/bitset.c
new file mode 100644
index 000000000000..fce45dac4205
--- /dev/null
+++ b/net/ethtool/bitset.c
@@ -0,0 +1,735 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/ethtool_netlink.h>
+#include <linux/bitmap.h>
+#include "netlink.h"
+#include "bitset.h"
+
+/* Some bitmaps are internally represented as an array of unsigned long, some
+ * as an array of u32 (some even as single u32 for now). To avoid the need of
+ * wrappers on caller side, we provide two set of functions: those with "32"
+ * suffix in their names expect u32 based bitmaps, those without it expect
+ * unsigned long bitmaps.
+ */
+
+static u32 ethnl_lower_bits(unsigned int n)
+{
+ return ~(u32)0 >> (32 - n % 32);
+}
+
+static u32 ethnl_upper_bits(unsigned int n)
+{
+ return ~(u32)0 << (n % 32);
+}
+
+/**
+ * ethnl_bitmap32_clear() - Clear u32 based bitmap
+ * @dst: bitmap to clear
+ * @start: beginning of the interval
+ * @end: end of the interval
+ * @mod: set if bitmap was modified
+ *
+ * Clear @nbits bits of a bitmap with indices @start <= i < @end
+ */
+static void ethnl_bitmap32_clear(u32 *dst, unsigned int start, unsigned int end,
+ bool *mod)
+{
+ unsigned int start_word = start / 32;
+ unsigned int end_word = end / 32;
+ unsigned int i;
+ u32 mask;
+
+ if (end <= start)
+ return;
+
+ if (start % 32) {
+ mask = ethnl_upper_bits(start);
+ if (end_word == start_word) {
+ mask &= ethnl_lower_bits(end);
+ if (dst[start_word] & mask) {
+ dst[start_word] &= ~mask;
+ *mod = true;
+ }
+ return;
+ }
+ if (dst[start_word] & mask) {
+ dst[start_word] &= ~mask;
+ *mod = true;
+ }
+ start_word++;
+ }
+
+ for (i = start_word; i < end_word; i++) {
+ if (dst[i]) {
+ dst[i] = 0;
+ *mod = true;
+ }
+ }
+ if (end % 32) {
+ mask = ethnl_lower_bits(end);
+ if (dst[end_word] & mask) {
+ dst[end_word] &= ~mask;
+ *mod = true;
+ }
+ }
+}
+
+/**
+ * ethnl_bitmap32_not_zero() - Check if any bit is set in an interval
+ * @map: bitmap to test
+ * @start: beginning of the interval
+ * @end: end of the interval
+ *
+ * Return: true if there is non-zero bit with index @start <= i < @end,
+ * false if the whole interval is zero
+ */
+static bool ethnl_bitmap32_not_zero(const u32 *map, unsigned int start,
+ unsigned int end)
+{
+ unsigned int start_word = start / 32;
+ unsigned int end_word = end / 32;
+ u32 mask;
+
+ if (end <= start)
+ return true;
+
+ if (start % 32) {
+ mask = ethnl_upper_bits(start);
+ if (end_word == start_word) {
+ mask &= ethnl_lower_bits(end);
+ return map[start_word] & mask;
+ }
+ if (map[start_word] & mask)
+ return true;
+ start_word++;
+ }
+
+ if (!memchr_inv(map + start_word, '\0',
+ (end_word - start_word) * sizeof(u32)))
+ return true;
+ if (end % 32 == 0)
+ return true;
+ return map[end_word] & ethnl_lower_bits(end);
+}
+
+/**
+ * ethnl_bitmap32_update() - Modify u32 based bitmap according to value/mask
+ * pair
+ * @dst: bitmap to update
+ * @nbits: bit size of the bitmap
+ * @value: values to set
+ * @mask: mask of bits to set
+ * @mod: set to true if bitmap is modified, preserve if not
+ *
+ * Set bits in @dst bitmap which are set in @mask to values from @value, leave
+ * the rest untouched. If destination bitmap was modified, set @mod to true,
+ * leave as it is if not.
+ */
+static void ethnl_bitmap32_update(u32 *dst, unsigned int nbits,
+ const u32 *value, const u32 *mask, bool *mod)
+{
+ while (nbits > 0) {
+ u32 real_mask = mask ? *mask : ~(u32)0;
+ u32 new_value;
+
+ if (nbits < 32)
+ real_mask &= ethnl_lower_bits(nbits);
+ new_value = (*dst & ~real_mask) | (*value & real_mask);
+ if (new_value != *dst) {
+ *dst = new_value;
+ *mod = true;
+ }
+
+ if (nbits <= 32)
+ break;
+ dst++;
+ nbits -= 32;
+ value++;
+ if (mask)
+ mask++;
+ }
+}
+
+static bool ethnl_bitmap32_test_bit(const u32 *map, unsigned int index)
+{
+ return map[index / 32] & (1U << (index % 32));
+}
+
+/**
+ * ethnl_bitset32_size() - Calculate size of bitset nested attribute
+ * @val: value bitmap (u32 based)
+ * @mask: mask bitmap (u32 based, optional)
+ * @nbits: bit length of the bitset
+ * @names: array of bit names (optional)
+ * @compact: assume compact format for output
+ *
+ * Estimate length of netlink attribute composed by a later call to
+ * ethnl_put_bitset32() call with the same arguments.
+ *
+ * Return: negative error code or attribute length estimate
+ */
+int ethnl_bitset32_size(const u32 *val, const u32 *mask, unsigned int nbits,
+ ethnl_string_array_t names, bool compact)
+{
+ unsigned int len = 0;
+
+ /* list flag */
+ if (!mask)
+ len += nla_total_size(sizeof(u32));
+ /* size */
+ len += nla_total_size(sizeof(u32));
+
+ if (compact) {
+ unsigned int nwords = DIV_ROUND_UP(nbits, 32);
+
+ /* value, mask */
+ len += (mask ? 2 : 1) * nla_total_size(nwords * sizeof(u32));
+ } else {
+ unsigned int bits_len = 0;
+ unsigned int bit_len, i;
+
+ for (i = 0; i < nbits; i++) {
+ const char *name = names ? names[i] : NULL;
+
+ if (!ethnl_bitmap32_test_bit(mask ?: val, i))
+ continue;
+ /* index */
+ bit_len = nla_total_size(sizeof(u32));
+ /* name */
+ if (name)
+ bit_len += ethnl_strz_size(name);
+ /* value */
+ if (mask && ethnl_bitmap32_test_bit(val, i))
+ bit_len += nla_total_size(0);
+
+ /* bit nest */
+ bits_len += nla_total_size(bit_len);
+ }
+ /* bits nest */
+ len += nla_total_size(bits_len);
+ }
+
+ /* outermost nest */
+ return nla_total_size(len);
+}
+
+/**
+ * ethnl_put_bitset32() - Put a bitset nest into a message
+ * @skb: skb with the message
+ * @attrtype: attribute type for the bitset nest
+ * @val: value bitmap (u32 based)
+ * @mask: mask bitmap (u32 based, optional)
+ * @nbits: bit length of the bitset
+ * @names: array of bit names (optional)
+ * @compact: use compact format for the output
+ *
+ * Compose a nested attribute representing a bitset. If @mask is null, simple
+ * bitmap (bit list) is created, if @mask is provided, represent a value/mask
+ * pair. Bit names are only used in verbose mode and when provided by calller.
+ *
+ * Return: 0 on success, negative error value on error
+ */
+int ethnl_put_bitset32(struct sk_buff *skb, int attrtype, const u32 *val,
+ const u32 *mask, unsigned int nbits,
+ ethnl_string_array_t names, bool compact)
+{
+ struct nlattr *nest;
+ struct nlattr *attr;
+
+ nest = nla_nest_start(skb, attrtype);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (!mask && nla_put_flag(skb, ETHTOOL_A_BITSET_NOMASK))
+ goto nla_put_failure;
+ if (nla_put_u32(skb, ETHTOOL_A_BITSET_SIZE, nbits))
+ goto nla_put_failure;
+ if (compact) {
+ unsigned int nwords = DIV_ROUND_UP(nbits, 32);
+ unsigned int nbytes = nwords * sizeof(u32);
+ u32 *dst;
+
+ attr = nla_reserve(skb, ETHTOOL_A_BITSET_VALUE, nbytes);
+ if (!attr)
+ goto nla_put_failure;
+ dst = nla_data(attr);
+ memcpy(dst, val, nbytes);
+ if (nbits % 32)
+ dst[nwords - 1] &= ethnl_lower_bits(nbits);
+
+ if (mask) {
+ attr = nla_reserve(skb, ETHTOOL_A_BITSET_MASK, nbytes);
+ if (!attr)
+ goto nla_put_failure;
+ dst = nla_data(attr);
+ memcpy(dst, mask, nbytes);
+ if (nbits % 32)
+ dst[nwords - 1] &= ethnl_lower_bits(nbits);
+ }
+ } else {
+ struct nlattr *bits;
+ unsigned int i;
+
+ bits = nla_nest_start(skb, ETHTOOL_A_BITSET_BITS);
+ if (!bits)
+ goto nla_put_failure;
+ for (i = 0; i < nbits; i++) {
+ const char *name = names ? names[i] : NULL;
+
+ if (!ethnl_bitmap32_test_bit(mask ?: val, i))
+ continue;
+ attr = nla_nest_start(skb, ETHTOOL_A_BITSET_BITS_BIT);
+ if (!attr)
+ goto nla_put_failure;
+ if (nla_put_u32(skb, ETHTOOL_A_BITSET_BIT_INDEX, i))
+ goto nla_put_failure;
+ if (name &&
+ ethnl_put_strz(skb, ETHTOOL_A_BITSET_BIT_NAME, name))
+ goto nla_put_failure;
+ if (mask && ethnl_bitmap32_test_bit(val, i) &&
+ nla_put_flag(skb, ETHTOOL_A_BITSET_BIT_VALUE))
+ goto nla_put_failure;
+ nla_nest_end(skb, attr);
+ }
+ nla_nest_end(skb, bits);
+ }
+
+ nla_nest_end(skb, nest);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
+static const struct nla_policy bitset_policy[ETHTOOL_A_BITSET_MAX + 1] = {
+ [ETHTOOL_A_BITSET_UNSPEC] = { .type = NLA_REJECT },
+ [ETHTOOL_A_BITSET_NOMASK] = { .type = NLA_FLAG },
+ [ETHTOOL_A_BITSET_SIZE] = { .type = NLA_U32 },
+ [ETHTOOL_A_BITSET_BITS] = { .type = NLA_NESTED },
+ [ETHTOOL_A_BITSET_VALUE] = { .type = NLA_BINARY },
+ [ETHTOOL_A_BITSET_MASK] = { .type = NLA_BINARY },
+};
+
+static const struct nla_policy bit_policy[ETHTOOL_A_BITSET_BIT_MAX + 1] = {
+ [ETHTOOL_A_BITSET_BIT_UNSPEC] = { .type = NLA_REJECT },
+ [ETHTOOL_A_BITSET_BIT_INDEX] = { .type = NLA_U32 },
+ [ETHTOOL_A_BITSET_BIT_NAME] = { .type = NLA_NUL_STRING },
+ [ETHTOOL_A_BITSET_BIT_VALUE] = { .type = NLA_FLAG },
+};
+
+/**
+ * ethnl_bitset_is_compact() - check if bitset attribute represents a compact
+ * bitset
+ * @bitset: nested attribute representing a bitset
+ * @compact: pointer for return value
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int ethnl_bitset_is_compact(const struct nlattr *bitset, bool *compact)
+{
+ struct nlattr *tb[ETHTOOL_A_BITSET_MAX + 1];
+ int ret;
+
+ ret = nla_parse_nested(tb, ETHTOOL_A_BITSET_MAX, bitset,
+ bitset_policy, NULL);
+ if (ret < 0)
+ return ret;
+
+ if (tb[ETHTOOL_A_BITSET_BITS]) {
+ if (tb[ETHTOOL_A_BITSET_VALUE] || tb[ETHTOOL_A_BITSET_MASK])
+ return -EINVAL;
+ *compact = false;
+ return 0;
+ }
+ if (!tb[ETHTOOL_A_BITSET_SIZE] || !tb[ETHTOOL_A_BITSET_VALUE])
+ return -EINVAL;
+
+ *compact = true;
+ return 0;
+}
+
+/**
+ * ethnl_name_to_idx() - look up string index for a name
+ * @names: array of ETH_GSTRING_LEN sized strings
+ * @n_names: number of strings in the array
+ * @name: name to look up
+ *
+ * Return: index of the string if found, -ENOENT if not found
+ */
+static int ethnl_name_to_idx(ethnl_string_array_t names, unsigned int n_names,
+ const char *name)
+{
+ unsigned int i;
+
+ if (!names)
+ return -ENOENT;
+
+ for (i = 0; i < n_names; i++) {
+ /* names[i] may not be null terminated */
+ if (!strncmp(names[i], name, ETH_GSTRING_LEN) &&
+ strlen(name) <= ETH_GSTRING_LEN)
+ return i;
+ }
+
+ return -ENOENT;
+}
+
+static int ethnl_parse_bit(unsigned int *index, bool *val, unsigned int nbits,
+ const struct nlattr *bit_attr, bool no_mask,
+ ethnl_string_array_t names,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[ETHTOOL_A_BITSET_BIT_MAX + 1];
+ int ret, idx;
+
+ ret = nla_parse_nested(tb, ETHTOOL_A_BITSET_BIT_MAX, bit_attr,
+ bit_policy, extack);
+ if (ret < 0)
+ return ret;
+
+ if (tb[ETHTOOL_A_BITSET_BIT_INDEX]) {
+ const char *name;
+
+ idx = nla_get_u32(tb[ETHTOOL_A_BITSET_BIT_INDEX]);
+ if (idx >= nbits) {
+ NL_SET_ERR_MSG_ATTR(extack,
+ tb[ETHTOOL_A_BITSET_BIT_INDEX],
+ "bit index too high");
+ return -EOPNOTSUPP;
+ }
+ name = names ? names[idx] : NULL;
+ if (tb[ETHTOOL_A_BITSET_BIT_NAME] && name &&
+ strncmp(nla_data(tb[ETHTOOL_A_BITSET_BIT_NAME]), name,
+ nla_len(tb[ETHTOOL_A_BITSET_BIT_NAME]))) {
+ NL_SET_ERR_MSG_ATTR(extack, bit_attr,
+ "bit index and name mismatch");
+ return -EINVAL;
+ }
+ } else if (tb[ETHTOOL_A_BITSET_BIT_NAME]) {
+ idx = ethnl_name_to_idx(names, nbits,
+ nla_data(tb[ETHTOOL_A_BITSET_BIT_NAME]));
+ if (idx < 0) {
+ NL_SET_ERR_MSG_ATTR(extack,
+ tb[ETHTOOL_A_BITSET_BIT_NAME],
+ "bit name not found");
+ return -EOPNOTSUPP;
+ }
+ } else {
+ NL_SET_ERR_MSG_ATTR(extack, bit_attr,
+ "neither bit index nor name specified");
+ return -EINVAL;
+ }
+
+ *index = idx;
+ *val = no_mask || tb[ETHTOOL_A_BITSET_BIT_VALUE];
+ return 0;
+}
+
+static int
+ethnl_update_bitset32_verbose(u32 *bitmap, unsigned int nbits,
+ const struct nlattr *attr, struct nlattr **tb,
+ ethnl_string_array_t names,
+ struct netlink_ext_ack *extack, bool *mod)
+{
+ struct nlattr *bit_attr;
+ bool no_mask;
+ int rem;
+ int ret;
+
+ if (tb[ETHTOOL_A_BITSET_VALUE]) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_VALUE],
+ "value only allowed in compact bitset");
+ return -EINVAL;
+ }
+ if (tb[ETHTOOL_A_BITSET_MASK]) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_MASK],
+ "mask only allowed in compact bitset");
+ return -EINVAL;
+ }
+ no_mask = tb[ETHTOOL_A_BITSET_NOMASK];
+
+ nla_for_each_nested(bit_attr, tb[ETHTOOL_A_BITSET_BITS], rem) {
+ bool old_val, new_val;
+ unsigned int idx;
+
+ if (nla_type(bit_attr) != ETHTOOL_A_BITSET_BITS_BIT) {
+ NL_SET_ERR_MSG_ATTR(extack, bit_attr,
+ "only ETHTOOL_A_BITSET_BITS_BIT allowed in ETHTOOL_A_BITSET_BITS");
+ return -EINVAL;
+ }
+ ret = ethnl_parse_bit(&idx, &new_val, nbits, bit_attr, no_mask,
+ names, extack);
+ if (ret < 0)
+ return ret;
+ old_val = bitmap[idx / 32] & ((u32)1 << (idx % 32));
+ if (new_val != old_val) {
+ if (new_val)
+ bitmap[idx / 32] |= ((u32)1 << (idx % 32));
+ else
+ bitmap[idx / 32] &= ~((u32)1 << (idx % 32));
+ *mod = true;
+ }
+ }
+
+ return 0;
+}
+
+static int ethnl_compact_sanity_checks(unsigned int nbits,
+ const struct nlattr *nest,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{
+ bool no_mask = tb[ETHTOOL_A_BITSET_NOMASK];
+ unsigned int attr_nbits, attr_nwords;
+ const struct nlattr *test_attr;
+
+ if (no_mask && tb[ETHTOOL_A_BITSET_MASK]) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_MASK],
+ "mask not allowed in list bitset");
+ return -EINVAL;
+ }
+ if (!tb[ETHTOOL_A_BITSET_SIZE]) {
+ NL_SET_ERR_MSG_ATTR(extack, nest,
+ "missing size in compact bitset");
+ return -EINVAL;
+ }
+ if (!tb[ETHTOOL_A_BITSET_VALUE]) {
+ NL_SET_ERR_MSG_ATTR(extack, nest,
+ "missing value in compact bitset");
+ return -EINVAL;
+ }
+ if (!no_mask && !tb[ETHTOOL_A_BITSET_MASK]) {
+ NL_SET_ERR_MSG_ATTR(extack, nest,
+ "missing mask in compact nonlist bitset");
+ return -EINVAL;
+ }
+
+ attr_nbits = nla_get_u32(tb[ETHTOOL_A_BITSET_SIZE]);
+ attr_nwords = DIV_ROUND_UP(attr_nbits, 32);
+ if (nla_len(tb[ETHTOOL_A_BITSET_VALUE]) != attr_nwords * sizeof(u32)) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_VALUE],
+ "bitset value length does not match size");
+ return -EINVAL;
+ }
+ if (tb[ETHTOOL_A_BITSET_MASK] &&
+ nla_len(tb[ETHTOOL_A_BITSET_MASK]) != attr_nwords * sizeof(u32)) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_MASK],
+ "bitset mask length does not match size");
+ return -EINVAL;
+ }
+ if (attr_nbits <= nbits)
+ return 0;
+
+ test_attr = no_mask ? tb[ETHTOOL_A_BITSET_VALUE] :
+ tb[ETHTOOL_A_BITSET_MASK];
+ if (ethnl_bitmap32_not_zero(nla_data(test_attr), nbits, attr_nbits)) {
+ NL_SET_ERR_MSG_ATTR(extack, test_attr,
+ "cannot modify bits past kernel bitset size");
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/**
+ * ethnl_update_bitset32() - Apply a bitset nest to a u32 based bitmap
+ * @bitmap: bitmap to update
+ * @nbits: size of the updated bitmap in bits
+ * @attr: nest attribute to parse and apply
+ * @names: array of bit names; may be null for compact format
+ * @extack: extack for error reporting
+ * @mod: set this to true if bitmap is modified, leave as it is if not
+ *
+ * Apply bitset netsted attribute to a bitmap. If the attribute represents
+ * a bit list, @bitmap is set to its contents; otherwise, bits in mask are
+ * set to values from value. Bitmaps in the attribute may be longer than
+ * @nbits but the message must not request modifying any bits past @nbits.
+ *
+ * Return: negative error code on failure, 0 on success
+ */
+int ethnl_update_bitset32(u32 *bitmap, unsigned int nbits,
+ const struct nlattr *attr, ethnl_string_array_t names,
+ struct netlink_ext_ack *extack, bool *mod)
+{
+ struct nlattr *tb[ETHTOOL_A_BITSET_MAX + 1];
+ unsigned int change_bits;
+ bool no_mask;
+ int ret;
+
+ if (!attr)
+ return 0;
+ ret = nla_parse_nested(tb, ETHTOOL_A_BITSET_MAX, attr, bitset_policy,
+ extack);
+ if (ret < 0)
+ return ret;
+
+ if (tb[ETHTOOL_A_BITSET_BITS])
+ return ethnl_update_bitset32_verbose(bitmap, nbits, attr, tb,
+ names, extack, mod);
+ ret = ethnl_compact_sanity_checks(nbits, attr, tb, extack);
+ if (ret < 0)
+ return ret;
+
+ no_mask = tb[ETHTOOL_A_BITSET_NOMASK];
+ change_bits = min_t(unsigned int,
+ nla_get_u32(tb[ETHTOOL_A_BITSET_SIZE]), nbits);
+ ethnl_bitmap32_update(bitmap, change_bits,
+ nla_data(tb[ETHTOOL_A_BITSET_VALUE]),
+ no_mask ? NULL :
+ nla_data(tb[ETHTOOL_A_BITSET_MASK]),
+ mod);
+ if (no_mask && change_bits < nbits)
+ ethnl_bitmap32_clear(bitmap, change_bits, nbits, mod);
+
+ return 0;
+}
+
+#if BITS_PER_LONG == 64 && defined(__BIG_ENDIAN)
+
+/* 64-bit big endian architectures are the only case when u32 based bitmaps
+ * and unsigned long based bitmaps have different memory layout so that we
+ * cannot simply cast the latter to the former and need actual wrappers
+ * converting the latter to the former.
+ *
+ * To reduce the number of slab allocations, the wrappers use fixed size local
+ * variables for bitmaps up to ETHNL_SMALL_BITMAP_BITS bits which is the
+ * majority of bitmaps used by ethtool.
+ */
+#define ETHNL_SMALL_BITMAP_BITS 128
+#define ETHNL_SMALL_BITMAP_WORDS DIV_ROUND_UP(ETHNL_SMALL_BITMAP_BITS, 32)
+
+int ethnl_bitset_size(const unsigned long *val, const unsigned long *mask,
+ unsigned int nbits, ethnl_string_array_t names,
+ bool compact)
+{
+ u32 small_mask32[ETHNL_SMALL_BITMAP_WORDS];
+ u32 small_val32[ETHNL_SMALL_BITMAP_WORDS];
+ u32 *mask32;
+ u32 *val32;
+ int ret;
+
+ if (nbits > ETHNL_SMALL_BITMAP_BITS) {
+ unsigned int nwords = DIV_ROUND_UP(nbits, 32);
+
+ val32 = kmalloc_array(2 * nwords, sizeof(u32), GFP_KERNEL);
+ if (!val32)
+ return -ENOMEM;
+ mask32 = val32 + nwords;
+ } else {
+ val32 = small_val32;
+ mask32 = small_mask32;
+ }
+
+ bitmap_to_arr32(val32, val, nbits);
+ if (mask)
+ bitmap_to_arr32(mask32, mask, nbits);
+ else
+ mask32 = NULL;
+ ret = ethnl_bitset32_size(val32, mask32, nbits, names, compact);
+
+ if (nbits > ETHNL_SMALL_BITMAP_BITS)
+ kfree(val32);
+
+ return ret;
+}
+
+int ethnl_put_bitset(struct sk_buff *skb, int attrtype,
+ const unsigned long *val, const unsigned long *mask,
+ unsigned int nbits, ethnl_string_array_t names,
+ bool compact)
+{
+ u32 small_mask32[ETHNL_SMALL_BITMAP_WORDS];
+ u32 small_val32[ETHNL_SMALL_BITMAP_WORDS];
+ u32 *mask32;
+ u32 *val32;
+ int ret;
+
+ if (nbits > ETHNL_SMALL_BITMAP_BITS) {
+ unsigned int nwords = DIV_ROUND_UP(nbits, 32);
+
+ val32 = kmalloc_array(2 * nwords, sizeof(u32), GFP_KERNEL);
+ if (!val32)
+ return -ENOMEM;
+ mask32 = val32 + nwords;
+ } else {
+ val32 = small_val32;
+ mask32 = small_mask32;
+ }
+
+ bitmap_to_arr32(val32, val, nbits);
+ if (mask)
+ bitmap_to_arr32(mask32, mask, nbits);
+ else
+ mask32 = NULL;
+ ret = ethnl_put_bitset32(skb, attrtype, val32, mask32, nbits, names,
+ compact);
+
+ if (nbits > ETHNL_SMALL_BITMAP_BITS)
+ kfree(val32);
+
+ return ret;
+}
+
+int ethnl_update_bitset(unsigned long *bitmap, unsigned int nbits,
+ const struct nlattr *attr, ethnl_string_array_t names,
+ struct netlink_ext_ack *extack, bool *mod)
+{
+ u32 small_bitmap32[ETHNL_SMALL_BITMAP_WORDS];
+ u32 *bitmap32 = small_bitmap32;
+ bool u32_mod = false;
+ int ret;
+
+ if (nbits > ETHNL_SMALL_BITMAP_BITS) {
+ unsigned int dst_words = DIV_ROUND_UP(nbits, 32);
+
+ bitmap32 = kmalloc_array(dst_words, sizeof(u32), GFP_KERNEL);
+ if (!bitmap32)
+ return -ENOMEM;
+ }
+
+ bitmap_to_arr32(bitmap32, bitmap, nbits);
+ ret = ethnl_update_bitset32(bitmap32, nbits, attr, names, extack,
+ &u32_mod);
+ if (u32_mod) {
+ bitmap_from_arr32(bitmap, bitmap32, nbits);
+ *mod = true;
+ }
+
+ if (nbits > ETHNL_SMALL_BITMAP_BITS)
+ kfree(bitmap32);
+
+ return ret;
+}
+
+#else
+
+/* On little endian 64-bit and all 32-bit architectures, an unsigned long
+ * based bitmap can be interpreted as u32 based one using a simple cast.
+ */
+
+int ethnl_bitset_size(const unsigned long *val, const unsigned long *mask,
+ unsigned int nbits, ethnl_string_array_t names,
+ bool compact)
+{
+ return ethnl_bitset32_size((const u32 *)val, (const u32 *)mask, nbits,
+ names, compact);
+}
+
+int ethnl_put_bitset(struct sk_buff *skb, int attrtype,
+ const unsigned long *val, const unsigned long *mask,
+ unsigned int nbits, ethnl_string_array_t names,
+ bool compact)
+{
+ return ethnl_put_bitset32(skb, attrtype, (const u32 *)val,
+ (const u32 *)mask, nbits, names, compact);
+}
+
+int ethnl_update_bitset(unsigned long *bitmap, unsigned int nbits,
+ const struct nlattr *attr, ethnl_string_array_t names,
+ struct netlink_ext_ack *extack, bool *mod)
+{
+ return ethnl_update_bitset32((u32 *)bitmap, nbits, attr, names, extack,
+ mod);
+}
+
+#endif /* BITS_PER_LONG == 64 && defined(__BIG_ENDIAN) */
diff --git a/net/ethtool/bitset.h b/net/ethtool/bitset.h
new file mode 100644
index 000000000000..b8247e34109d
--- /dev/null
+++ b/net/ethtool/bitset.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef _NET_ETHTOOL_BITSET_H
+#define _NET_ETHTOOL_BITSET_H
+
+typedef const char (*const ethnl_string_array_t)[ETH_GSTRING_LEN];
+
+int ethnl_bitset_is_compact(const struct nlattr *bitset, bool *compact);
+int ethnl_bitset_size(const unsigned long *val, const unsigned long *mask,
+ unsigned int nbits, ethnl_string_array_t names,
+ bool compact);
+int ethnl_bitset32_size(const u32 *val, const u32 *mask, unsigned int nbits,
+ ethnl_string_array_t names, bool compact);
+int ethnl_put_bitset(struct sk_buff *skb, int attrtype,
+ const unsigned long *val, const unsigned long *mask,
+ unsigned int nbits, ethnl_string_array_t names,
+ bool compact);
+int ethnl_put_bitset32(struct sk_buff *skb, int attrtype, const u32 *val,
+ const u32 *mask, unsigned int nbits,
+ ethnl_string_array_t names, bool compact);
+int ethnl_update_bitset(unsigned long *bitmap, unsigned int nbits,
+ const struct nlattr *attr, ethnl_string_array_t names,
+ struct netlink_ext_ack *extack, bool *mod);
+int ethnl_update_bitset32(u32 *bitmap, unsigned int nbits,
+ const struct nlattr *attr, ethnl_string_array_t names,
+ struct netlink_ext_ack *extack, bool *mod);
+
+#endif /* _NET_ETHTOOL_BITSET_H */
diff --git a/net/ethtool/common.c b/net/ethtool/common.c
new file mode 100644
index 000000000000..636ec6d5110e
--- /dev/null
+++ b/net/ethtool/common.c
@@ -0,0 +1,259 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include "common.h"
+
+const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = {
+ [NETIF_F_SG_BIT] = "tx-scatter-gather",
+ [NETIF_F_IP_CSUM_BIT] = "tx-checksum-ipv4",
+ [NETIF_F_HW_CSUM_BIT] = "tx-checksum-ip-generic",
+ [NETIF_F_IPV6_CSUM_BIT] = "tx-checksum-ipv6",
+ [NETIF_F_HIGHDMA_BIT] = "highdma",
+ [NETIF_F_FRAGLIST_BIT] = "tx-scatter-gather-fraglist",
+ [NETIF_F_HW_VLAN_CTAG_TX_BIT] = "tx-vlan-hw-insert",
+
+ [NETIF_F_HW_VLAN_CTAG_RX_BIT] = "rx-vlan-hw-parse",
+ [NETIF_F_HW_VLAN_CTAG_FILTER_BIT] = "rx-vlan-filter",
+ [NETIF_F_HW_VLAN_STAG_TX_BIT] = "tx-vlan-stag-hw-insert",
+ [NETIF_F_HW_VLAN_STAG_RX_BIT] = "rx-vlan-stag-hw-parse",
+ [NETIF_F_HW_VLAN_STAG_FILTER_BIT] = "rx-vlan-stag-filter",
+ [NETIF_F_VLAN_CHALLENGED_BIT] = "vlan-challenged",
+ [NETIF_F_GSO_BIT] = "tx-generic-segmentation",
+ [NETIF_F_LLTX_BIT] = "tx-lockless",
+ [NETIF_F_NETNS_LOCAL_BIT] = "netns-local",
+ [NETIF_F_GRO_BIT] = "rx-gro",
+ [NETIF_F_GRO_HW_BIT] = "rx-gro-hw",
+ [NETIF_F_LRO_BIT] = "rx-lro",
+
+ [NETIF_F_TSO_BIT] = "tx-tcp-segmentation",
+ [NETIF_F_GSO_ROBUST_BIT] = "tx-gso-robust",
+ [NETIF_F_TSO_ECN_BIT] = "tx-tcp-ecn-segmentation",
+ [NETIF_F_TSO_MANGLEID_BIT] = "tx-tcp-mangleid-segmentation",
+ [NETIF_F_TSO6_BIT] = "tx-tcp6-segmentation",
+ [NETIF_F_FSO_BIT] = "tx-fcoe-segmentation",
+ [NETIF_F_GSO_GRE_BIT] = "tx-gre-segmentation",
+ [NETIF_F_GSO_GRE_CSUM_BIT] = "tx-gre-csum-segmentation",
+ [NETIF_F_GSO_IPXIP4_BIT] = "tx-ipxip4-segmentation",
+ [NETIF_F_GSO_IPXIP6_BIT] = "tx-ipxip6-segmentation",
+ [NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation",
+ [NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT] = "tx-udp_tnl-csum-segmentation",
+ [NETIF_F_GSO_PARTIAL_BIT] = "tx-gso-partial",
+ [NETIF_F_GSO_SCTP_BIT] = "tx-sctp-segmentation",
+ [NETIF_F_GSO_ESP_BIT] = "tx-esp-segmentation",
+ [NETIF_F_GSO_UDP_L4_BIT] = "tx-udp-segmentation",
+
+ [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc",
+ [NETIF_F_SCTP_CRC_BIT] = "tx-checksum-sctp",
+ [NETIF_F_FCOE_MTU_BIT] = "fcoe-mtu",
+ [NETIF_F_NTUPLE_BIT] = "rx-ntuple-filter",
+ [NETIF_F_RXHASH_BIT] = "rx-hashing",
+ [NETIF_F_RXCSUM_BIT] = "rx-checksum",
+ [NETIF_F_NOCACHE_COPY_BIT] = "tx-nocache-copy",
+ [NETIF_F_LOOPBACK_BIT] = "loopback",
+ [NETIF_F_RXFCS_BIT] = "rx-fcs",
+ [NETIF_F_RXALL_BIT] = "rx-all",
+ [NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload",
+ [NETIF_F_HW_TC_BIT] = "hw-tc-offload",
+ [NETIF_F_HW_ESP_BIT] = "esp-hw-offload",
+ [NETIF_F_HW_ESP_TX_CSUM_BIT] = "esp-tx-csum-hw-offload",
+ [NETIF_F_RX_UDP_TUNNEL_PORT_BIT] = "rx-udp_tunnel-port-offload",
+ [NETIF_F_HW_TLS_RECORD_BIT] = "tls-hw-record",
+ [NETIF_F_HW_TLS_TX_BIT] = "tls-hw-tx-offload",
+ [NETIF_F_HW_TLS_RX_BIT] = "tls-hw-rx-offload",
+ [NETIF_F_GRO_FRAGLIST_BIT] = "rx-gro-list",
+};
+
+const char
+rss_hash_func_strings[ETH_RSS_HASH_FUNCS_COUNT][ETH_GSTRING_LEN] = {
+ [ETH_RSS_HASH_TOP_BIT] = "toeplitz",
+ [ETH_RSS_HASH_XOR_BIT] = "xor",
+ [ETH_RSS_HASH_CRC32_BIT] = "crc32",
+};
+
+const char
+tunable_strings[__ETHTOOL_TUNABLE_COUNT][ETH_GSTRING_LEN] = {
+ [ETHTOOL_ID_UNSPEC] = "Unspec",
+ [ETHTOOL_RX_COPYBREAK] = "rx-copybreak",
+ [ETHTOOL_TX_COPYBREAK] = "tx-copybreak",
+ [ETHTOOL_PFC_PREVENTION_TOUT] = "pfc-prevention-tout",
+};
+
+const char
+phy_tunable_strings[__ETHTOOL_PHY_TUNABLE_COUNT][ETH_GSTRING_LEN] = {
+ [ETHTOOL_ID_UNSPEC] = "Unspec",
+ [ETHTOOL_PHY_DOWNSHIFT] = "phy-downshift",
+ [ETHTOOL_PHY_FAST_LINK_DOWN] = "phy-fast-link-down",
+ [ETHTOOL_PHY_EDPD] = "phy-energy-detect-power-down",
+};
+
+#define __LINK_MODE_NAME(speed, type, duplex) \
+ #speed "base" #type "/" #duplex
+#define __DEFINE_LINK_MODE_NAME(speed, type, duplex) \
+ [ETHTOOL_LINK_MODE(speed, type, duplex)] = \
+ __LINK_MODE_NAME(speed, type, duplex)
+#define __DEFINE_SPECIAL_MODE_NAME(_mode, _name) \
+ [ETHTOOL_LINK_MODE_ ## _mode ## _BIT] = _name
+
+const char link_mode_names[][ETH_GSTRING_LEN] = {
+ __DEFINE_LINK_MODE_NAME(10, T, Half),
+ __DEFINE_LINK_MODE_NAME(10, T, Full),
+ __DEFINE_LINK_MODE_NAME(100, T, Half),
+ __DEFINE_LINK_MODE_NAME(100, T, Full),
+ __DEFINE_LINK_MODE_NAME(1000, T, Half),
+ __DEFINE_LINK_MODE_NAME(1000, T, Full),
+ __DEFINE_SPECIAL_MODE_NAME(Autoneg, "Autoneg"),
+ __DEFINE_SPECIAL_MODE_NAME(TP, "TP"),
+ __DEFINE_SPECIAL_MODE_NAME(AUI, "AUI"),
+ __DEFINE_SPECIAL_MODE_NAME(MII, "MII"),
+ __DEFINE_SPECIAL_MODE_NAME(FIBRE, "FIBRE"),
+ __DEFINE_SPECIAL_MODE_NAME(BNC, "BNC"),
+ __DEFINE_LINK_MODE_NAME(10000, T, Full),
+ __DEFINE_SPECIAL_MODE_NAME(Pause, "Pause"),
+ __DEFINE_SPECIAL_MODE_NAME(Asym_Pause, "Asym_Pause"),
+ __DEFINE_LINK_MODE_NAME(2500, X, Full),
+ __DEFINE_SPECIAL_MODE_NAME(Backplane, "Backplane"),
+ __DEFINE_LINK_MODE_NAME(1000, KX, Full),
+ __DEFINE_LINK_MODE_NAME(10000, KX4, Full),
+ __DEFINE_LINK_MODE_NAME(10000, KR, Full),
+ __DEFINE_SPECIAL_MODE_NAME(10000baseR_FEC, "10000baseR_FEC"),
+ __DEFINE_LINK_MODE_NAME(20000, MLD2, Full),
+ __DEFINE_LINK_MODE_NAME(20000, KR2, Full),
+ __DEFINE_LINK_MODE_NAME(40000, KR4, Full),
+ __DEFINE_LINK_MODE_NAME(40000, CR4, Full),
+ __DEFINE_LINK_MODE_NAME(40000, SR4, Full),
+ __DEFINE_LINK_MODE_NAME(40000, LR4, Full),
+ __DEFINE_LINK_MODE_NAME(56000, KR4, Full),
+ __DEFINE_LINK_MODE_NAME(56000, CR4, Full),
+ __DEFINE_LINK_MODE_NAME(56000, SR4, Full),
+ __DEFINE_LINK_MODE_NAME(56000, LR4, Full),
+ __DEFINE_LINK_MODE_NAME(25000, CR, Full),
+ __DEFINE_LINK_MODE_NAME(25000, KR, Full),
+ __DEFINE_LINK_MODE_NAME(25000, SR, Full),
+ __DEFINE_LINK_MODE_NAME(50000, CR2, Full),
+ __DEFINE_LINK_MODE_NAME(50000, KR2, Full),
+ __DEFINE_LINK_MODE_NAME(100000, KR4, Full),
+ __DEFINE_LINK_MODE_NAME(100000, SR4, Full),
+ __DEFINE_LINK_MODE_NAME(100000, CR4, Full),
+ __DEFINE_LINK_MODE_NAME(100000, LR4_ER4, Full),
+ __DEFINE_LINK_MODE_NAME(50000, SR2, Full),
+ __DEFINE_LINK_MODE_NAME(1000, X, Full),
+ __DEFINE_LINK_MODE_NAME(10000, CR, Full),
+ __DEFINE_LINK_MODE_NAME(10000, SR, Full),
+ __DEFINE_LINK_MODE_NAME(10000, LR, Full),
+ __DEFINE_LINK_MODE_NAME(10000, LRM, Full),
+ __DEFINE_LINK_MODE_NAME(10000, ER, Full),
+ __DEFINE_LINK_MODE_NAME(2500, T, Full),
+ __DEFINE_LINK_MODE_NAME(5000, T, Full),
+ __DEFINE_SPECIAL_MODE_NAME(FEC_NONE, "None"),
+ __DEFINE_SPECIAL_MODE_NAME(FEC_RS, "RS"),
+ __DEFINE_SPECIAL_MODE_NAME(FEC_BASER, "BASER"),
+ __DEFINE_LINK_MODE_NAME(50000, KR, Full),
+ __DEFINE_LINK_MODE_NAME(50000, SR, Full),
+ __DEFINE_LINK_MODE_NAME(50000, CR, Full),
+ __DEFINE_LINK_MODE_NAME(50000, LR_ER_FR, Full),
+ __DEFINE_LINK_MODE_NAME(50000, DR, Full),
+ __DEFINE_LINK_MODE_NAME(100000, KR2, Full),
+ __DEFINE_LINK_MODE_NAME(100000, SR2, Full),
+ __DEFINE_LINK_MODE_NAME(100000, CR2, Full),
+ __DEFINE_LINK_MODE_NAME(100000, LR2_ER2_FR2, Full),
+ __DEFINE_LINK_MODE_NAME(100000, DR2, Full),
+ __DEFINE_LINK_MODE_NAME(200000, KR4, Full),
+ __DEFINE_LINK_MODE_NAME(200000, SR4, Full),
+ __DEFINE_LINK_MODE_NAME(200000, LR4_ER4_FR4, Full),
+ __DEFINE_LINK_MODE_NAME(200000, DR4, Full),
+ __DEFINE_LINK_MODE_NAME(200000, CR4, Full),
+ __DEFINE_LINK_MODE_NAME(100, T1, Full),
+ __DEFINE_LINK_MODE_NAME(1000, T1, Full),
+ __DEFINE_LINK_MODE_NAME(400000, KR8, Full),
+ __DEFINE_LINK_MODE_NAME(400000, SR8, Full),
+ __DEFINE_LINK_MODE_NAME(400000, LR8_ER8_FR8, Full),
+ __DEFINE_LINK_MODE_NAME(400000, DR8, Full),
+ __DEFINE_LINK_MODE_NAME(400000, CR8, Full),
+};
+static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS);
+
+const char netif_msg_class_names[][ETH_GSTRING_LEN] = {
+ [NETIF_MSG_DRV_BIT] = "drv",
+ [NETIF_MSG_PROBE_BIT] = "probe",
+ [NETIF_MSG_LINK_BIT] = "link",
+ [NETIF_MSG_TIMER_BIT] = "timer",
+ [NETIF_MSG_IFDOWN_BIT] = "ifdown",
+ [NETIF_MSG_IFUP_BIT] = "ifup",
+ [NETIF_MSG_RX_ERR_BIT] = "rx_err",
+ [NETIF_MSG_TX_ERR_BIT] = "tx_err",
+ [NETIF_MSG_TX_QUEUED_BIT] = "tx_queued",
+ [NETIF_MSG_INTR_BIT] = "intr",
+ [NETIF_MSG_TX_DONE_BIT] = "tx_done",
+ [NETIF_MSG_RX_STATUS_BIT] = "rx_status",
+ [NETIF_MSG_PKTDATA_BIT] = "pktdata",
+ [NETIF_MSG_HW_BIT] = "hw",
+ [NETIF_MSG_WOL_BIT] = "wol",
+};
+static_assert(ARRAY_SIZE(netif_msg_class_names) == NETIF_MSG_CLASS_COUNT);
+
+const char wol_mode_names[][ETH_GSTRING_LEN] = {
+ [const_ilog2(WAKE_PHY)] = "phy",
+ [const_ilog2(WAKE_UCAST)] = "ucast",
+ [const_ilog2(WAKE_MCAST)] = "mcast",
+ [const_ilog2(WAKE_BCAST)] = "bcast",
+ [const_ilog2(WAKE_ARP)] = "arp",
+ [const_ilog2(WAKE_MAGIC)] = "magic",
+ [const_ilog2(WAKE_MAGICSECURE)] = "magicsecure",
+ [const_ilog2(WAKE_FILTER)] = "filter",
+};
+static_assert(ARRAY_SIZE(wol_mode_names) == WOL_MODE_COUNT);
+
+/* return false if legacy contained non-0 deprecated fields
+ * maxtxpkt/maxrxpkt. rest of ksettings always updated
+ */
+bool
+convert_legacy_settings_to_link_ksettings(
+ struct ethtool_link_ksettings *link_ksettings,
+ const struct ethtool_cmd *legacy_settings)
+{
+ bool retval = true;
+
+ memset(link_ksettings, 0, sizeof(*link_ksettings));
+
+ /* This is used to tell users that driver is still using these
+ * deprecated legacy fields, and they should not use
+ * %ETHTOOL_GLINKSETTINGS/%ETHTOOL_SLINKSETTINGS
+ */
+ if (legacy_settings->maxtxpkt ||
+ legacy_settings->maxrxpkt)
+ retval = false;
+
+ ethtool_convert_legacy_u32_to_link_mode(
+ link_ksettings->link_modes.supported,
+ legacy_settings->supported);
+ ethtool_convert_legacy_u32_to_link_mode(
+ link_ksettings->link_modes.advertising,
+ legacy_settings->advertising);
+ ethtool_convert_legacy_u32_to_link_mode(
+ link_ksettings->link_modes.lp_advertising,
+ legacy_settings->lp_advertising);
+ link_ksettings->base.speed
+ = ethtool_cmd_speed(legacy_settings);
+ link_ksettings->base.duplex
+ = legacy_settings->duplex;
+ link_ksettings->base.port
+ = legacy_settings->port;
+ link_ksettings->base.phy_address
+ = legacy_settings->phy_address;
+ link_ksettings->base.autoneg
+ = legacy_settings->autoneg;
+ link_ksettings->base.mdio_support
+ = legacy_settings->mdio_support;
+ link_ksettings->base.eth_tp_mdix
+ = legacy_settings->eth_tp_mdix;
+ link_ksettings->base.eth_tp_mdix_ctrl
+ = legacy_settings->eth_tp_mdix_ctrl;
+ return retval;
+}
+
+int __ethtool_get_link(struct net_device *dev)
+{
+ if (!dev->ethtool_ops->get_link)
+ return -EOPNOTSUPP;
+
+ return netif_running(dev) && dev->ethtool_ops->get_link(dev);
+}
diff --git a/net/ethtool/common.h b/net/ethtool/common.h
new file mode 100644
index 000000000000..40ba74e0b9bb
--- /dev/null
+++ b/net/ethtool/common.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef _ETHTOOL_COMMON_H
+#define _ETHTOOL_COMMON_H
+
+#include <linux/netdevice.h>
+#include <linux/ethtool.h>
+
+/* compose link mode index from speed, type and duplex */
+#define ETHTOOL_LINK_MODE(speed, type, duplex) \
+ ETHTOOL_LINK_MODE_ ## speed ## base ## type ## _ ## duplex ## _BIT
+
+extern const char
+netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN];
+extern const char
+rss_hash_func_strings[ETH_RSS_HASH_FUNCS_COUNT][ETH_GSTRING_LEN];
+extern const char
+tunable_strings[__ETHTOOL_TUNABLE_COUNT][ETH_GSTRING_LEN];
+extern const char
+phy_tunable_strings[__ETHTOOL_PHY_TUNABLE_COUNT][ETH_GSTRING_LEN];
+extern const char link_mode_names[][ETH_GSTRING_LEN];
+extern const char netif_msg_class_names[][ETH_GSTRING_LEN];
+extern const char wol_mode_names[][ETH_GSTRING_LEN];
+
+int __ethtool_get_link(struct net_device *dev);
+
+bool convert_legacy_settings_to_link_ksettings(
+ struct ethtool_link_ksettings *link_ksettings,
+ const struct ethtool_cmd *legacy_settings);
+
+#endif /* _ETHTOOL_COMMON_H */
diff --git a/net/ethtool/debug.c b/net/ethtool/debug.c
new file mode 100644
index 000000000000..aaef4843e6ba
--- /dev/null
+++ b/net/ethtool/debug.c
@@ -0,0 +1,134 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include "netlink.h"
+#include "common.h"
+#include "bitset.h"
+
+struct debug_req_info {
+ struct ethnl_req_info base;
+};
+
+struct debug_reply_data {
+ struct ethnl_reply_data base;
+ u32 msg_mask;
+};
+
+#define DEBUG_REPDATA(__reply_base) \
+ container_of(__reply_base, struct debug_reply_data, base)
+
+static const struct nla_policy
+debug_get_policy[ETHTOOL_A_DEBUG_MAX + 1] = {
+ [ETHTOOL_A_DEBUG_UNSPEC] = { .type = NLA_REJECT },
+ [ETHTOOL_A_DEBUG_HEADER] = { .type = NLA_NESTED },
+ [ETHTOOL_A_DEBUG_MSGMASK] = { .type = NLA_REJECT },
+};
+
+static int debug_prepare_data(const struct ethnl_req_info *req_base,
+ struct ethnl_reply_data *reply_base,
+ struct genl_info *info)
+{
+ struct debug_reply_data *data = DEBUG_REPDATA(reply_base);
+ struct net_device *dev = reply_base->dev;
+ int ret;
+
+ if (!dev->ethtool_ops->get_msglevel)
+ return -EOPNOTSUPP;
+
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ return ret;
+ data->msg_mask = dev->ethtool_ops->get_msglevel(dev);
+ ethnl_ops_complete(dev);
+
+ return 0;
+}
+
+static int debug_reply_size(const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct debug_reply_data *data = DEBUG_REPDATA(reply_base);
+ bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+
+ return ethnl_bitset32_size(&data->msg_mask, NULL, NETIF_MSG_CLASS_COUNT,
+ netif_msg_class_names, compact);
+}
+
+static int debug_fill_reply(struct sk_buff *skb,
+ const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct debug_reply_data *data = DEBUG_REPDATA(reply_base);
+ bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+
+ return ethnl_put_bitset32(skb, ETHTOOL_A_DEBUG_MSGMASK, &data->msg_mask,
+ NULL, NETIF_MSG_CLASS_COUNT,
+ netif_msg_class_names, compact);
+}
+
+const struct ethnl_request_ops ethnl_debug_request_ops = {
+ .request_cmd = ETHTOOL_MSG_DEBUG_GET,
+ .reply_cmd = ETHTOOL_MSG_DEBUG_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_DEBUG_HEADER,
+ .max_attr = ETHTOOL_A_DEBUG_MAX,
+ .req_info_size = sizeof(struct debug_req_info),
+ .reply_data_size = sizeof(struct debug_reply_data),
+ .request_policy = debug_get_policy,
+
+ .prepare_data = debug_prepare_data,
+ .reply_size = debug_reply_size,
+ .fill_reply = debug_fill_reply,
+};
+
+/* DEBUG_SET */
+
+static const struct nla_policy
+debug_set_policy[ETHTOOL_A_DEBUG_MAX + 1] = {
+ [ETHTOOL_A_DEBUG_UNSPEC] = { .type = NLA_REJECT },
+ [ETHTOOL_A_DEBUG_HEADER] = { .type = NLA_NESTED },
+ [ETHTOOL_A_DEBUG_MSGMASK] = { .type = NLA_NESTED },
+};
+
+int ethnl_set_debug(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr *tb[ETHTOOL_A_DEBUG_MAX + 1];
+ struct ethnl_req_info req_info = {};
+ struct net_device *dev;
+ bool mod = false;
+ u32 msg_mask;
+ int ret;
+
+ ret = nlmsg_parse(info->nlhdr, GENL_HDRLEN, tb,
+ ETHTOOL_A_DEBUG_MAX, debug_set_policy,
+ info->extack);
+ if (ret < 0)
+ return ret;
+ ret = ethnl_parse_header(&req_info, tb[ETHTOOL_A_DEBUG_HEADER],
+ genl_info_net(info), info->extack, true);
+ if (ret < 0)
+ return ret;
+ dev = req_info.dev;
+ if (!dev->ethtool_ops->get_msglevel || !dev->ethtool_ops->set_msglevel)
+ return -EOPNOTSUPP;
+
+ rtnl_lock();
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ goto out_rtnl;
+
+ msg_mask = dev->ethtool_ops->get_msglevel(dev);
+ ret = ethnl_update_bitset32(&msg_mask, NETIF_MSG_CLASS_COUNT,
+ tb[ETHTOOL_A_DEBUG_MSGMASK],
+ netif_msg_class_names, info->extack, &mod);
+ if (ret < 0 || !mod)
+ goto out_ops;
+
+ dev->ethtool_ops->set_msglevel(dev, msg_mask);
+ ethtool_notify(dev, ETHTOOL_MSG_DEBUG_NTF, NULL);
+
+out_ops:
+ ethnl_ops_complete(dev);
+out_rtnl:
+ rtnl_unlock();
+ dev_put(dev);
+ return ret;
+}
diff --git a/net/core/ethtool.c b/net/ethtool/ioctl.c
index 6288e69e94fc..b987052d91ef 100644
--- a/net/core/ethtool.c
+++ b/net/ethtool/ioctl.c
@@ -17,6 +17,7 @@
#include <linux/phy.h>
#include <linux/bitops.h>
#include <linux/uaccess.h>
+#include <linux/vermagic.h>
#include <linux/vmalloc.h>
#include <linux/sfp.h>
#include <linux/slab.h>
@@ -26,6 +27,9 @@
#include <net/devlink.h>
#include <net/xdp_sock.h>
#include <net/flow_offload.h>
+#include <linux/ethtool_netlink.h>
+
+#include "common.h"
/*
* Some useful ethtool_ops methods that're device independent.
@@ -54,87 +58,6 @@ EXPORT_SYMBOL(ethtool_op_get_ts_info);
#define ETHTOOL_DEV_FEATURE_WORDS ((NETDEV_FEATURE_COUNT + 31) / 32)
-static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = {
- [NETIF_F_SG_BIT] = "tx-scatter-gather",
- [NETIF_F_IP_CSUM_BIT] = "tx-checksum-ipv4",
- [NETIF_F_HW_CSUM_BIT] = "tx-checksum-ip-generic",
- [NETIF_F_IPV6_CSUM_BIT] = "tx-checksum-ipv6",
- [NETIF_F_HIGHDMA_BIT] = "highdma",
- [NETIF_F_FRAGLIST_BIT] = "tx-scatter-gather-fraglist",
- [NETIF_F_HW_VLAN_CTAG_TX_BIT] = "tx-vlan-hw-insert",
-
- [NETIF_F_HW_VLAN_CTAG_RX_BIT] = "rx-vlan-hw-parse",
- [NETIF_F_HW_VLAN_CTAG_FILTER_BIT] = "rx-vlan-filter",
- [NETIF_F_HW_VLAN_STAG_TX_BIT] = "tx-vlan-stag-hw-insert",
- [NETIF_F_HW_VLAN_STAG_RX_BIT] = "rx-vlan-stag-hw-parse",
- [NETIF_F_HW_VLAN_STAG_FILTER_BIT] = "rx-vlan-stag-filter",
- [NETIF_F_VLAN_CHALLENGED_BIT] = "vlan-challenged",
- [NETIF_F_GSO_BIT] = "tx-generic-segmentation",
- [NETIF_F_LLTX_BIT] = "tx-lockless",
- [NETIF_F_NETNS_LOCAL_BIT] = "netns-local",
- [NETIF_F_GRO_BIT] = "rx-gro",
- [NETIF_F_GRO_HW_BIT] = "rx-gro-hw",
- [NETIF_F_LRO_BIT] = "rx-lro",
-
- [NETIF_F_TSO_BIT] = "tx-tcp-segmentation",
- [NETIF_F_GSO_ROBUST_BIT] = "tx-gso-robust",
- [NETIF_F_TSO_ECN_BIT] = "tx-tcp-ecn-segmentation",
- [NETIF_F_TSO_MANGLEID_BIT] = "tx-tcp-mangleid-segmentation",
- [NETIF_F_TSO6_BIT] = "tx-tcp6-segmentation",
- [NETIF_F_FSO_BIT] = "tx-fcoe-segmentation",
- [NETIF_F_GSO_GRE_BIT] = "tx-gre-segmentation",
- [NETIF_F_GSO_GRE_CSUM_BIT] = "tx-gre-csum-segmentation",
- [NETIF_F_GSO_IPXIP4_BIT] = "tx-ipxip4-segmentation",
- [NETIF_F_GSO_IPXIP6_BIT] = "tx-ipxip6-segmentation",
- [NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation",
- [NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT] = "tx-udp_tnl-csum-segmentation",
- [NETIF_F_GSO_PARTIAL_BIT] = "tx-gso-partial",
- [NETIF_F_GSO_SCTP_BIT] = "tx-sctp-segmentation",
- [NETIF_F_GSO_ESP_BIT] = "tx-esp-segmentation",
- [NETIF_F_GSO_UDP_L4_BIT] = "tx-udp-segmentation",
-
- [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc",
- [NETIF_F_SCTP_CRC_BIT] = "tx-checksum-sctp",
- [NETIF_F_FCOE_MTU_BIT] = "fcoe-mtu",
- [NETIF_F_NTUPLE_BIT] = "rx-ntuple-filter",
- [NETIF_F_RXHASH_BIT] = "rx-hashing",
- [NETIF_F_RXCSUM_BIT] = "rx-checksum",
- [NETIF_F_NOCACHE_COPY_BIT] = "tx-nocache-copy",
- [NETIF_F_LOOPBACK_BIT] = "loopback",
- [NETIF_F_RXFCS_BIT] = "rx-fcs",
- [NETIF_F_RXALL_BIT] = "rx-all",
- [NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload",
- [NETIF_F_HW_TC_BIT] = "hw-tc-offload",
- [NETIF_F_HW_ESP_BIT] = "esp-hw-offload",
- [NETIF_F_HW_ESP_TX_CSUM_BIT] = "esp-tx-csum-hw-offload",
- [NETIF_F_RX_UDP_TUNNEL_PORT_BIT] = "rx-udp_tunnel-port-offload",
- [NETIF_F_HW_TLS_RECORD_BIT] = "tls-hw-record",
- [NETIF_F_HW_TLS_TX_BIT] = "tls-hw-tx-offload",
- [NETIF_F_HW_TLS_RX_BIT] = "tls-hw-rx-offload",
-};
-
-static const char
-rss_hash_func_strings[ETH_RSS_HASH_FUNCS_COUNT][ETH_GSTRING_LEN] = {
- [ETH_RSS_HASH_TOP_BIT] = "toeplitz",
- [ETH_RSS_HASH_XOR_BIT] = "xor",
- [ETH_RSS_HASH_CRC32_BIT] = "crc32",
-};
-
-static const char
-tunable_strings[__ETHTOOL_TUNABLE_COUNT][ETH_GSTRING_LEN] = {
- [ETHTOOL_ID_UNSPEC] = "Unspec",
- [ETHTOOL_RX_COPYBREAK] = "rx-copybreak",
- [ETHTOOL_TX_COPYBREAK] = "tx-copybreak",
- [ETHTOOL_PFC_PREVENTION_TOUT] = "pfc-prevention-tout",
-};
-
-static const char
-phy_tunable_strings[__ETHTOOL_PHY_TUNABLE_COUNT][ETH_GSTRING_LEN] = {
- [ETHTOOL_ID_UNSPEC] = "Unspec",
- [ETHTOOL_PHY_DOWNSHIFT] = "phy-downshift",
- [ETHTOOL_PHY_FAST_LINK_DOWN] = "phy-fast-link-down",
-};
-
static int ethtool_get_features(struct net_device *dev, void __user *useraddr)
{
struct ethtool_gfeatures cmd = {
@@ -233,6 +156,9 @@ static int __ethtool_get_sset_count(struct net_device *dev, int sset)
!ops->get_ethtool_phy_stats)
return phy_ethtool_get_sset_count(dev->phydev);
+ if (sset == ETH_SS_LINK_MODES)
+ return __ETHTOOL_LINK_MODE_MASK_NBITS;
+
if (ops->get_sset_count && ops->get_strings)
return ops->get_sset_count(dev, sset);
else
@@ -257,6 +183,9 @@ static void __ethtool_get_strings(struct net_device *dev,
else if (stringset == ETH_SS_PHY_STATS && dev->phydev &&
!ops->get_ethtool_phy_stats)
phy_ethtool_get_strings(dev->phydev, data);
+ else if (stringset == ETH_SS_LINK_MODES)
+ memcpy(data, link_mode_names,
+ __ETHTOOL_LINK_MODE_MASK_NBITS * ETH_GSTRING_LEN);
else
/* ops->get_strings is valid because checked earlier */
ops->get_strings(dev, stringset, data);
@@ -431,54 +360,6 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32,
}
EXPORT_SYMBOL(ethtool_convert_link_mode_to_legacy_u32);
-/* return false if legacy contained non-0 deprecated fields
- * maxtxpkt/maxrxpkt. rest of ksettings always updated
- */
-static bool
-convert_legacy_settings_to_link_ksettings(
- struct ethtool_link_ksettings *link_ksettings,
- const struct ethtool_cmd *legacy_settings)
-{
- bool retval = true;
-
- memset(link_ksettings, 0, sizeof(*link_ksettings));
-
- /* This is used to tell users that driver is still using these
- * deprecated legacy fields, and they should not use
- * %ETHTOOL_GLINKSETTINGS/%ETHTOOL_SLINKSETTINGS
- */
- if (legacy_settings->maxtxpkt ||
- legacy_settings->maxrxpkt)
- retval = false;
-
- ethtool_convert_legacy_u32_to_link_mode(
- link_ksettings->link_modes.supported,
- legacy_settings->supported);
- ethtool_convert_legacy_u32_to_link_mode(
- link_ksettings->link_modes.advertising,
- legacy_settings->advertising);
- ethtool_convert_legacy_u32_to_link_mode(
- link_ksettings->link_modes.lp_advertising,
- legacy_settings->lp_advertising);
- link_ksettings->base.speed
- = ethtool_cmd_speed(legacy_settings);
- link_ksettings->base.duplex
- = legacy_settings->duplex;
- link_ksettings->base.port
- = legacy_settings->port;
- link_ksettings->base.phy_address
- = legacy_settings->phy_address;
- link_ksettings->base.autoneg
- = legacy_settings->autoneg;
- link_ksettings->base.mdio_support
- = legacy_settings->mdio_support;
- link_ksettings->base.eth_tp_mdix
- = legacy_settings->eth_tp_mdix;
- link_ksettings->base.eth_tp_mdix_ctrl
- = legacy_settings->eth_tp_mdix_ctrl;
- return retval;
-}
-
/* return false if ksettings link modes had higher bits
* set. legacy_settings always updated (best effort)
*/
@@ -692,7 +573,12 @@ static int ethtool_set_link_ksettings(struct net_device *dev,
!= link_ksettings.base.link_mode_masks_nwords)
return -EINVAL;
- return dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings);
+ err = dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings);
+ if (err >= 0) {
+ ethtool_notify(dev, ETHTOOL_MSG_LINKINFO_NTF, NULL);
+ ethtool_notify(dev, ETHTOOL_MSG_LINKMODES_NTF, NULL);
+ }
+ return err;
}
/* Query device for its ethtool_cmd settings.
@@ -741,6 +627,7 @@ static int ethtool_set_settings(struct net_device *dev, void __user *useraddr)
{
struct ethtool_link_ksettings link_ksettings;
struct ethtool_cmd cmd;
+ int ret;
ASSERT_RTNL();
@@ -753,7 +640,12 @@ static int ethtool_set_settings(struct net_device *dev, void __user *useraddr)
return -EINVAL;
link_ksettings.base.link_mode_masks_nwords =
__ETHTOOL_LINK_MODE_MASK_NU32;
- return dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings);
+ ret = dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings);
+ if (ret >= 0) {
+ ethtool_notify(dev, ETHTOOL_MSG_LINKINFO_NTF, NULL);
+ ethtool_notify(dev, ETHTOOL_MSG_LINKMODES_NTF, NULL);
+ }
+ return ret;
}
static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev,
@@ -764,6 +656,7 @@ static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev,
memset(&info, 0, sizeof(info));
info.cmd = ETHTOOL_GDRVINFO;
+ strlcpy(info.version, UTS_RELEASE, sizeof(info.version));
if (ops->get_drvinfo) {
ops->get_drvinfo(dev, &info);
} else if (dev->dev.parent && dev->dev.parent->driver) {
@@ -1395,11 +1288,13 @@ static int ethtool_reset(struct net_device *dev, char __user *useraddr)
static int ethtool_get_wol(struct net_device *dev, char __user *useraddr)
{
- struct ethtool_wolinfo wol = { .cmd = ETHTOOL_GWOL };
+ struct ethtool_wolinfo wol;
if (!dev->ethtool_ops->get_wol)
return -EOPNOTSUPP;
+ memset(&wol, 0, sizeof(struct ethtool_wolinfo));
+ wol.cmd = ETHTOOL_GWOL;
dev->ethtool_ops->get_wol(dev, &wol);
if (copy_to_user(useraddr, &wol, sizeof(wol)))
@@ -1423,6 +1318,7 @@ static int ethtool_set_wol(struct net_device *dev, char __user *useraddr)
return ret;
dev->wol_enabled = !!wol.wolopts;
+ ethtool_notify(dev, ETHTOOL_MSG_WOL_NTF, NULL);
return 0;
}
@@ -1472,12 +1368,12 @@ static int ethtool_nway_reset(struct net_device *dev)
static int ethtool_get_link(struct net_device *dev, char __user *useraddr)
{
struct ethtool_value edata = { .cmd = ETHTOOL_GLINK };
+ int link = __ethtool_get_link(dev);
- if (!dev->ethtool_ops->get_link)
- return -EOPNOTSUPP;
-
- edata.data = netif_running(dev) && dev->ethtool_ops->get_link(dev);
+ if (link < 0)
+ return link;
+ edata.data = link;
if (copy_to_user(useraddr, &edata, sizeof(edata)))
return -EFAULT;
return 0;
@@ -2167,8 +2063,8 @@ static int ethtool_get_ts_info(struct net_device *dev, void __user *useraddr)
memset(&info, 0, sizeof(info));
info.cmd = ETHTOOL_GET_TS_INFO;
- if (phydev && phydev->drv && phydev->drv->ts_info) {
- err = phydev->drv->ts_info(phydev, &info);
+ if (phy_has_tsinfo(phydev)) {
+ err = phy_ts_info(phydev, &info);
} else if (ops->get_ts_info) {
err = ops->get_ts_info(dev, &info);
} else {
@@ -2451,6 +2347,11 @@ static int ethtool_phy_tunable_valid(const struct ethtool_tunable *tuna)
tuna->type_id != ETHTOOL_TUNABLE_U8)
return -EINVAL;
break;
+ case ETHTOOL_PHY_EDPD:
+ if (tuna->len != sizeof(u16) ||
+ tuna->type_id != ETHTOOL_TUNABLE_U16)
+ return -EINVAL;
+ break;
default:
return -EINVAL;
}
@@ -2647,6 +2548,8 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
case ETHTOOL_SMSGLVL:
rc = ethtool_set_value_void(dev, useraddr,
dev->ethtool_ops->set_msglevel);
+ if (!rc)
+ ethtool_notify(dev, ETHTOOL_MSG_DEBUG_NTF, NULL);
break;
case ETHTOOL_GEEE:
rc = ethtool_get_eee(dev, useraddr);
diff --git a/net/ethtool/linkinfo.c b/net/ethtool/linkinfo.c
new file mode 100644
index 000000000000..5d16cb4e8693
--- /dev/null
+++ b/net/ethtool/linkinfo.c
@@ -0,0 +1,167 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include "netlink.h"
+#include "common.h"
+
+struct linkinfo_req_info {
+ struct ethnl_req_info base;
+};
+
+struct linkinfo_reply_data {
+ struct ethnl_reply_data base;
+ struct ethtool_link_ksettings ksettings;
+ struct ethtool_link_settings *lsettings;
+};
+
+#define LINKINFO_REPDATA(__reply_base) \
+ container_of(__reply_base, struct linkinfo_reply_data, base)
+
+static const struct nla_policy
+linkinfo_get_policy[ETHTOOL_A_LINKINFO_MAX + 1] = {
+ [ETHTOOL_A_LINKINFO_UNSPEC] = { .type = NLA_REJECT },
+ [ETHTOOL_A_LINKINFO_HEADER] = { .type = NLA_NESTED },
+ [ETHTOOL_A_LINKINFO_PORT] = { .type = NLA_REJECT },
+ [ETHTOOL_A_LINKINFO_PHYADDR] = { .type = NLA_REJECT },
+ [ETHTOOL_A_LINKINFO_TP_MDIX] = { .type = NLA_REJECT },
+ [ETHTOOL_A_LINKINFO_TP_MDIX_CTRL] = { .type = NLA_REJECT },
+ [ETHTOOL_A_LINKINFO_TRANSCEIVER] = { .type = NLA_REJECT },
+};
+
+static int linkinfo_prepare_data(const struct ethnl_req_info *req_base,
+ struct ethnl_reply_data *reply_base,
+ struct genl_info *info)
+{
+ struct linkinfo_reply_data *data = LINKINFO_REPDATA(reply_base);
+ struct net_device *dev = reply_base->dev;
+ int ret;
+
+ data->lsettings = &data->ksettings.base;
+
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ return ret;
+ ret = __ethtool_get_link_ksettings(dev, &data->ksettings);
+ if (ret < 0 && info)
+ GENL_SET_ERR_MSG(info, "failed to retrieve link settings");
+ ethnl_ops_complete(dev);
+
+ return ret;
+}
+
+static int linkinfo_reply_size(const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ return nla_total_size(sizeof(u8)) /* LINKINFO_PORT */
+ + nla_total_size(sizeof(u8)) /* LINKINFO_PHYADDR */
+ + nla_total_size(sizeof(u8)) /* LINKINFO_TP_MDIX */
+ + nla_total_size(sizeof(u8)) /* LINKINFO_TP_MDIX_CTRL */
+ + nla_total_size(sizeof(u8)) /* LINKINFO_TRANSCEIVER */
+ + 0;
+}
+
+static int linkinfo_fill_reply(struct sk_buff *skb,
+ const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct linkinfo_reply_data *data = LINKINFO_REPDATA(reply_base);
+
+ if (nla_put_u8(skb, ETHTOOL_A_LINKINFO_PORT, data->lsettings->port) ||
+ nla_put_u8(skb, ETHTOOL_A_LINKINFO_PHYADDR,
+ data->lsettings->phy_address) ||
+ nla_put_u8(skb, ETHTOOL_A_LINKINFO_TP_MDIX,
+ data->lsettings->eth_tp_mdix) ||
+ nla_put_u8(skb, ETHTOOL_A_LINKINFO_TP_MDIX_CTRL,
+ data->lsettings->eth_tp_mdix_ctrl) ||
+ nla_put_u8(skb, ETHTOOL_A_LINKINFO_TRANSCEIVER,
+ data->lsettings->transceiver))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+const struct ethnl_request_ops ethnl_linkinfo_request_ops = {
+ .request_cmd = ETHTOOL_MSG_LINKINFO_GET,
+ .reply_cmd = ETHTOOL_MSG_LINKINFO_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_LINKINFO_HEADER,
+ .max_attr = ETHTOOL_A_LINKINFO_MAX,
+ .req_info_size = sizeof(struct linkinfo_req_info),
+ .reply_data_size = sizeof(struct linkinfo_reply_data),
+ .request_policy = linkinfo_get_policy,
+
+ .prepare_data = linkinfo_prepare_data,
+ .reply_size = linkinfo_reply_size,
+ .fill_reply = linkinfo_fill_reply,
+};
+
+/* LINKINFO_SET */
+
+static const struct nla_policy
+linkinfo_set_policy[ETHTOOL_A_LINKINFO_MAX + 1] = {
+ [ETHTOOL_A_LINKINFO_UNSPEC] = { .type = NLA_REJECT },
+ [ETHTOOL_A_LINKINFO_HEADER] = { .type = NLA_NESTED },
+ [ETHTOOL_A_LINKINFO_PORT] = { .type = NLA_U8 },
+ [ETHTOOL_A_LINKINFO_PHYADDR] = { .type = NLA_U8 },
+ [ETHTOOL_A_LINKINFO_TP_MDIX] = { .type = NLA_REJECT },
+ [ETHTOOL_A_LINKINFO_TP_MDIX_CTRL] = { .type = NLA_U8 },
+ [ETHTOOL_A_LINKINFO_TRANSCEIVER] = { .type = NLA_REJECT },
+};
+
+int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr *tb[ETHTOOL_A_LINKINFO_MAX + 1];
+ struct ethtool_link_ksettings ksettings = {};
+ struct ethtool_link_settings *lsettings;
+ struct ethnl_req_info req_info = {};
+ struct net_device *dev;
+ bool mod = false;
+ int ret;
+
+ ret = nlmsg_parse(info->nlhdr, GENL_HDRLEN, tb,
+ ETHTOOL_A_LINKINFO_MAX, linkinfo_set_policy,
+ info->extack);
+ if (ret < 0)
+ return ret;
+ ret = ethnl_parse_header(&req_info, tb[ETHTOOL_A_LINKINFO_HEADER],
+ genl_info_net(info), info->extack, true);
+ if (ret < 0)
+ return ret;
+ dev = req_info.dev;
+ if (!dev->ethtool_ops->get_link_ksettings ||
+ !dev->ethtool_ops->set_link_ksettings)
+ return -EOPNOTSUPP;
+
+ rtnl_lock();
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ goto out_rtnl;
+
+ ret = __ethtool_get_link_ksettings(dev, &ksettings);
+ if (ret < 0) {
+ if (info)
+ GENL_SET_ERR_MSG(info, "failed to retrieve link settings");
+ goto out_ops;
+ }
+ lsettings = &ksettings.base;
+
+ ethnl_update_u8(&lsettings->port, tb[ETHTOOL_A_LINKINFO_PORT], &mod);
+ ethnl_update_u8(&lsettings->phy_address, tb[ETHTOOL_A_LINKINFO_PHYADDR],
+ &mod);
+ ethnl_update_u8(&lsettings->eth_tp_mdix_ctrl,
+ tb[ETHTOOL_A_LINKINFO_TP_MDIX_CTRL], &mod);
+ ret = 0;
+ if (!mod)
+ goto out_ops;
+
+ ret = dev->ethtool_ops->set_link_ksettings(dev, &ksettings);
+ if (ret < 0)
+ GENL_SET_ERR_MSG(info, "link settings update failed");
+ else
+ ethtool_notify(dev, ETHTOOL_MSG_LINKINFO_NTF, NULL);
+
+out_ops:
+ ethnl_ops_complete(dev);
+out_rtnl:
+ rtnl_unlock();
+ dev_put(dev);
+ return ret;
+}
diff --git a/net/ethtool/linkmodes.c b/net/ethtool/linkmodes.c
new file mode 100644
index 000000000000..96f20be64553
--- /dev/null
+++ b/net/ethtool/linkmodes.c
@@ -0,0 +1,375 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include "netlink.h"
+#include "common.h"
+#include "bitset.h"
+
+struct linkmodes_req_info {
+ struct ethnl_req_info base;
+};
+
+struct linkmodes_reply_data {
+ struct ethnl_reply_data base;
+ struct ethtool_link_ksettings ksettings;
+ struct ethtool_link_settings *lsettings;
+ bool peer_empty;
+};
+
+#define LINKMODES_REPDATA(__reply_base) \
+ container_of(__reply_base, struct linkmodes_reply_data, base)
+
+static const struct nla_policy
+linkmodes_get_policy[ETHTOOL_A_LINKMODES_MAX + 1] = {
+ [ETHTOOL_A_LINKMODES_UNSPEC] = { .type = NLA_REJECT },
+ [ETHTOOL_A_LINKMODES_HEADER] = { .type = NLA_NESTED },
+ [ETHTOOL_A_LINKMODES_AUTONEG] = { .type = NLA_REJECT },
+ [ETHTOOL_A_LINKMODES_OURS] = { .type = NLA_REJECT },
+ [ETHTOOL_A_LINKMODES_PEER] = { .type = NLA_REJECT },
+ [ETHTOOL_A_LINKMODES_SPEED] = { .type = NLA_REJECT },
+ [ETHTOOL_A_LINKMODES_DUPLEX] = { .type = NLA_REJECT },
+};
+
+static int linkmodes_prepare_data(const struct ethnl_req_info *req_base,
+ struct ethnl_reply_data *reply_base,
+ struct genl_info *info)
+{
+ struct linkmodes_reply_data *data = LINKMODES_REPDATA(reply_base);
+ struct net_device *dev = reply_base->dev;
+ int ret;
+
+ data->lsettings = &data->ksettings.base;
+
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ return ret;
+
+ ret = __ethtool_get_link_ksettings(dev, &data->ksettings);
+ if (ret < 0 && info) {
+ GENL_SET_ERR_MSG(info, "failed to retrieve link settings");
+ goto out;
+ }
+
+ data->peer_empty =
+ bitmap_empty(data->ksettings.link_modes.lp_advertising,
+ __ETHTOOL_LINK_MODE_MASK_NBITS);
+
+out:
+ ethnl_ops_complete(dev);
+ return ret;
+}
+
+static int linkmodes_reply_size(const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct linkmodes_reply_data *data = LINKMODES_REPDATA(reply_base);
+ const struct ethtool_link_ksettings *ksettings = &data->ksettings;
+ bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+ int len, ret;
+
+ len = nla_total_size(sizeof(u8)) /* LINKMODES_AUTONEG */
+ + nla_total_size(sizeof(u32)) /* LINKMODES_SPEED */
+ + nla_total_size(sizeof(u8)) /* LINKMODES_DUPLEX */
+ + 0;
+ ret = ethnl_bitset_size(ksettings->link_modes.advertising,
+ ksettings->link_modes.supported,
+ __ETHTOOL_LINK_MODE_MASK_NBITS,
+ link_mode_names, compact);
+ if (ret < 0)
+ return ret;
+ len += ret;
+ if (!data->peer_empty) {
+ ret = ethnl_bitset_size(ksettings->link_modes.lp_advertising,
+ NULL, __ETHTOOL_LINK_MODE_MASK_NBITS,
+ link_mode_names, compact);
+ if (ret < 0)
+ return ret;
+ len += ret;
+ }
+
+ return len;
+}
+
+static int linkmodes_fill_reply(struct sk_buff *skb,
+ const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct linkmodes_reply_data *data = LINKMODES_REPDATA(reply_base);
+ const struct ethtool_link_ksettings *ksettings = &data->ksettings;
+ const struct ethtool_link_settings *lsettings = &ksettings->base;
+ bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+ int ret;
+
+ if (nla_put_u8(skb, ETHTOOL_A_LINKMODES_AUTONEG, lsettings->autoneg))
+ return -EMSGSIZE;
+
+ ret = ethnl_put_bitset(skb, ETHTOOL_A_LINKMODES_OURS,
+ ksettings->link_modes.advertising,
+ ksettings->link_modes.supported,
+ __ETHTOOL_LINK_MODE_MASK_NBITS, link_mode_names,
+ compact);
+ if (ret < 0)
+ return -EMSGSIZE;
+ if (!data->peer_empty) {
+ ret = ethnl_put_bitset(skb, ETHTOOL_A_LINKMODES_PEER,
+ ksettings->link_modes.lp_advertising,
+ NULL, __ETHTOOL_LINK_MODE_MASK_NBITS,
+ link_mode_names, compact);
+ if (ret < 0)
+ return -EMSGSIZE;
+ }
+
+ if (nla_put_u32(skb, ETHTOOL_A_LINKMODES_SPEED, lsettings->speed) ||
+ nla_put_u8(skb, ETHTOOL_A_LINKMODES_DUPLEX, lsettings->duplex))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+const struct ethnl_request_ops ethnl_linkmodes_request_ops = {
+ .request_cmd = ETHTOOL_MSG_LINKMODES_GET,
+ .reply_cmd = ETHTOOL_MSG_LINKMODES_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_LINKMODES_HEADER,
+ .max_attr = ETHTOOL_A_LINKMODES_MAX,
+ .req_info_size = sizeof(struct linkmodes_req_info),
+ .reply_data_size = sizeof(struct linkmodes_reply_data),
+ .request_policy = linkmodes_get_policy,
+
+ .prepare_data = linkmodes_prepare_data,
+ .reply_size = linkmodes_reply_size,
+ .fill_reply = linkmodes_fill_reply,
+};
+
+/* LINKMODES_SET */
+
+struct link_mode_info {
+ int speed;
+ u8 duplex;
+};
+
+#define __DEFINE_LINK_MODE_PARAMS(_speed, _type, _duplex) \
+ [ETHTOOL_LINK_MODE(_speed, _type, _duplex)] = { \
+ .speed = SPEED_ ## _speed, \
+ .duplex = __DUPLEX_ ## _duplex \
+ }
+#define __DUPLEX_Half DUPLEX_HALF
+#define __DUPLEX_Full DUPLEX_FULL
+#define __DEFINE_SPECIAL_MODE_PARAMS(_mode) \
+ [ETHTOOL_LINK_MODE_ ## _mode ## _BIT] = { \
+ .speed = SPEED_UNKNOWN, \
+ .duplex = DUPLEX_UNKNOWN, \
+ }
+
+static const struct link_mode_info link_mode_params[] = {
+ __DEFINE_LINK_MODE_PARAMS(10, T, Half),
+ __DEFINE_LINK_MODE_PARAMS(10, T, Full),
+ __DEFINE_LINK_MODE_PARAMS(100, T, Half),
+ __DEFINE_LINK_MODE_PARAMS(100, T, Full),
+ __DEFINE_LINK_MODE_PARAMS(1000, T, Half),
+ __DEFINE_LINK_MODE_PARAMS(1000, T, Full),
+ __DEFINE_SPECIAL_MODE_PARAMS(Autoneg),
+ __DEFINE_SPECIAL_MODE_PARAMS(TP),
+ __DEFINE_SPECIAL_MODE_PARAMS(AUI),
+ __DEFINE_SPECIAL_MODE_PARAMS(MII),
+ __DEFINE_SPECIAL_MODE_PARAMS(FIBRE),
+ __DEFINE_SPECIAL_MODE_PARAMS(BNC),
+ __DEFINE_LINK_MODE_PARAMS(10000, T, Full),
+ __DEFINE_SPECIAL_MODE_PARAMS(Pause),
+ __DEFINE_SPECIAL_MODE_PARAMS(Asym_Pause),
+ __DEFINE_LINK_MODE_PARAMS(2500, X, Full),
+ __DEFINE_SPECIAL_MODE_PARAMS(Backplane),
+ __DEFINE_LINK_MODE_PARAMS(1000, KX, Full),
+ __DEFINE_LINK_MODE_PARAMS(10000, KX4, Full),
+ __DEFINE_LINK_MODE_PARAMS(10000, KR, Full),
+ [ETHTOOL_LINK_MODE_10000baseR_FEC_BIT] = {
+ .speed = SPEED_10000,
+ .duplex = DUPLEX_FULL,
+ },
+ __DEFINE_LINK_MODE_PARAMS(20000, MLD2, Full),
+ __DEFINE_LINK_MODE_PARAMS(20000, KR2, Full),
+ __DEFINE_LINK_MODE_PARAMS(40000, KR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(40000, CR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(40000, SR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(40000, LR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(56000, KR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(56000, CR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(56000, SR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(56000, LR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(25000, CR, Full),
+ __DEFINE_LINK_MODE_PARAMS(25000, KR, Full),
+ __DEFINE_LINK_MODE_PARAMS(25000, SR, Full),
+ __DEFINE_LINK_MODE_PARAMS(50000, CR2, Full),
+ __DEFINE_LINK_MODE_PARAMS(50000, KR2, Full),
+ __DEFINE_LINK_MODE_PARAMS(100000, KR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(100000, SR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(100000, CR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(100000, LR4_ER4, Full),
+ __DEFINE_LINK_MODE_PARAMS(50000, SR2, Full),
+ __DEFINE_LINK_MODE_PARAMS(1000, X, Full),
+ __DEFINE_LINK_MODE_PARAMS(10000, CR, Full),
+ __DEFINE_LINK_MODE_PARAMS(10000, SR, Full),
+ __DEFINE_LINK_MODE_PARAMS(10000, LR, Full),
+ __DEFINE_LINK_MODE_PARAMS(10000, LRM, Full),
+ __DEFINE_LINK_MODE_PARAMS(10000, ER, Full),
+ __DEFINE_LINK_MODE_PARAMS(2500, T, Full),
+ __DEFINE_LINK_MODE_PARAMS(5000, T, Full),
+ __DEFINE_SPECIAL_MODE_PARAMS(FEC_NONE),
+ __DEFINE_SPECIAL_MODE_PARAMS(FEC_RS),
+ __DEFINE_SPECIAL_MODE_PARAMS(FEC_BASER),
+ __DEFINE_LINK_MODE_PARAMS(50000, KR, Full),
+ __DEFINE_LINK_MODE_PARAMS(50000, SR, Full),
+ __DEFINE_LINK_MODE_PARAMS(50000, CR, Full),
+ __DEFINE_LINK_MODE_PARAMS(50000, LR_ER_FR, Full),
+ __DEFINE_LINK_MODE_PARAMS(50000, DR, Full),
+ __DEFINE_LINK_MODE_PARAMS(100000, KR2, Full),
+ __DEFINE_LINK_MODE_PARAMS(100000, SR2, Full),
+ __DEFINE_LINK_MODE_PARAMS(100000, CR2, Full),
+ __DEFINE_LINK_MODE_PARAMS(100000, LR2_ER2_FR2, Full),
+ __DEFINE_LINK_MODE_PARAMS(100000, DR2, Full),
+ __DEFINE_LINK_MODE_PARAMS(200000, KR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(200000, SR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(200000, LR4_ER4_FR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(200000, DR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(200000, CR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(100, T1, Full),
+ __DEFINE_LINK_MODE_PARAMS(1000, T1, Full),
+ __DEFINE_LINK_MODE_PARAMS(400000, KR8, Full),
+ __DEFINE_LINK_MODE_PARAMS(400000, SR8, Full),
+ __DEFINE_LINK_MODE_PARAMS(400000, LR8_ER8_FR8, Full),
+ __DEFINE_LINK_MODE_PARAMS(400000, DR8, Full),
+ __DEFINE_LINK_MODE_PARAMS(400000, CR8, Full),
+};
+
+static const struct nla_policy
+linkmodes_set_policy[ETHTOOL_A_LINKMODES_MAX + 1] = {
+ [ETHTOOL_A_LINKMODES_UNSPEC] = { .type = NLA_REJECT },
+ [ETHTOOL_A_LINKMODES_HEADER] = { .type = NLA_NESTED },
+ [ETHTOOL_A_LINKMODES_AUTONEG] = { .type = NLA_U8 },
+ [ETHTOOL_A_LINKMODES_OURS] = { .type = NLA_NESTED },
+ [ETHTOOL_A_LINKMODES_PEER] = { .type = NLA_REJECT },
+ [ETHTOOL_A_LINKMODES_SPEED] = { .type = NLA_U32 },
+ [ETHTOOL_A_LINKMODES_DUPLEX] = { .type = NLA_U8 },
+};
+
+/* Set advertised link modes to all supported modes matching requested speed
+ * and duplex values. Called when autonegotiation is on, speed or duplex is
+ * requested but no link mode change. This is done in userspace with ioctl()
+ * interface, move it into kernel for netlink.
+ * Returns true if advertised modes bitmap was modified.
+ */
+static bool ethnl_auto_linkmodes(struct ethtool_link_ksettings *ksettings,
+ bool req_speed, bool req_duplex)
+{
+ unsigned long *advertising = ksettings->link_modes.advertising;
+ unsigned long *supported = ksettings->link_modes.supported;
+ DECLARE_BITMAP(old_adv, __ETHTOOL_LINK_MODE_MASK_NBITS);
+ unsigned int i;
+
+ BUILD_BUG_ON(ARRAY_SIZE(link_mode_params) !=
+ __ETHTOOL_LINK_MODE_MASK_NBITS);
+
+ bitmap_copy(old_adv, advertising, __ETHTOOL_LINK_MODE_MASK_NBITS);
+
+ for (i = 0; i < __ETHTOOL_LINK_MODE_MASK_NBITS; i++) {
+ const struct link_mode_info *info = &link_mode_params[i];
+
+ if (info->speed == SPEED_UNKNOWN)
+ continue;
+ if (test_bit(i, supported) &&
+ (!req_speed || info->speed == ksettings->base.speed) &&
+ (!req_duplex || info->duplex == ksettings->base.duplex))
+ set_bit(i, advertising);
+ else
+ clear_bit(i, advertising);
+ }
+
+ return !bitmap_equal(old_adv, advertising,
+ __ETHTOOL_LINK_MODE_MASK_NBITS);
+}
+
+static int ethnl_update_linkmodes(struct genl_info *info, struct nlattr **tb,
+ struct ethtool_link_ksettings *ksettings,
+ bool *mod)
+{
+ struct ethtool_link_settings *lsettings = &ksettings->base;
+ bool req_speed, req_duplex;
+ int ret;
+
+ *mod = false;
+ req_speed = tb[ETHTOOL_A_LINKMODES_SPEED];
+ req_duplex = tb[ETHTOOL_A_LINKMODES_DUPLEX];
+
+ ethnl_update_u8(&lsettings->autoneg, tb[ETHTOOL_A_LINKMODES_AUTONEG],
+ mod);
+ ret = ethnl_update_bitset(ksettings->link_modes.advertising,
+ __ETHTOOL_LINK_MODE_MASK_NBITS,
+ tb[ETHTOOL_A_LINKMODES_OURS], link_mode_names,
+ info->extack, mod);
+ if (ret < 0)
+ return ret;
+ ethnl_update_u32(&lsettings->speed, tb[ETHTOOL_A_LINKMODES_SPEED],
+ mod);
+ ethnl_update_u8(&lsettings->duplex, tb[ETHTOOL_A_LINKMODES_DUPLEX],
+ mod);
+
+ if (!tb[ETHTOOL_A_LINKMODES_OURS] && lsettings->autoneg &&
+ (req_speed || req_duplex) &&
+ ethnl_auto_linkmodes(ksettings, req_speed, req_duplex))
+ *mod = true;
+
+ return 0;
+}
+
+int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr *tb[ETHTOOL_A_LINKMODES_MAX + 1];
+ struct ethtool_link_ksettings ksettings = {};
+ struct ethnl_req_info req_info = {};
+ struct net_device *dev;
+ bool mod = false;
+ int ret;
+
+ ret = nlmsg_parse(info->nlhdr, GENL_HDRLEN, tb,
+ ETHTOOL_A_LINKMODES_MAX, linkmodes_set_policy,
+ info->extack);
+ if (ret < 0)
+ return ret;
+ ret = ethnl_parse_header(&req_info, tb[ETHTOOL_A_LINKMODES_HEADER],
+ genl_info_net(info), info->extack, true);
+ if (ret < 0)
+ return ret;
+ dev = req_info.dev;
+ if (!dev->ethtool_ops->get_link_ksettings ||
+ !dev->ethtool_ops->set_link_ksettings)
+ return -EOPNOTSUPP;
+
+ rtnl_lock();
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ goto out_rtnl;
+
+ ret = __ethtool_get_link_ksettings(dev, &ksettings);
+ if (ret < 0) {
+ if (info)
+ GENL_SET_ERR_MSG(info, "failed to retrieve link settings");
+ goto out_ops;
+ }
+
+ ret = ethnl_update_linkmodes(info, tb, &ksettings, &mod);
+ if (ret < 0)
+ goto out_ops;
+
+ if (mod) {
+ ret = dev->ethtool_ops->set_link_ksettings(dev, &ksettings);
+ if (ret < 0)
+ GENL_SET_ERR_MSG(info, "link settings update failed");
+ else
+ ethtool_notify(dev, ETHTOOL_MSG_LINKMODES_NTF, NULL);
+ }
+
+out_ops:
+ ethnl_ops_complete(dev);
+out_rtnl:
+ rtnl_unlock();
+ dev_put(dev);
+ return ret;
+}
diff --git a/net/ethtool/linkstate.c b/net/ethtool/linkstate.c
new file mode 100644
index 000000000000..2740cde0a182
--- /dev/null
+++ b/net/ethtool/linkstate.c
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include "netlink.h"
+#include "common.h"
+
+struct linkstate_req_info {
+ struct ethnl_req_info base;
+};
+
+struct linkstate_reply_data {
+ struct ethnl_reply_data base;
+ int link;
+};
+
+#define LINKSTATE_REPDATA(__reply_base) \
+ container_of(__reply_base, struct linkstate_reply_data, base)
+
+static const struct nla_policy
+linkstate_get_policy[ETHTOOL_A_LINKSTATE_MAX + 1] = {
+ [ETHTOOL_A_LINKSTATE_UNSPEC] = { .type = NLA_REJECT },
+ [ETHTOOL_A_LINKSTATE_HEADER] = { .type = NLA_NESTED },
+ [ETHTOOL_A_LINKSTATE_LINK] = { .type = NLA_REJECT },
+};
+
+static int linkstate_prepare_data(const struct ethnl_req_info *req_base,
+ struct ethnl_reply_data *reply_base,
+ struct genl_info *info)
+{
+ struct linkstate_reply_data *data = LINKSTATE_REPDATA(reply_base);
+ struct net_device *dev = reply_base->dev;
+ int ret;
+
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ return ret;
+ data->link = __ethtool_get_link(dev);
+ ethnl_ops_complete(dev);
+
+ return 0;
+}
+
+static int linkstate_reply_size(const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ return nla_total_size(sizeof(u8)) /* LINKSTATE_LINK */
+ + 0;
+}
+
+static int linkstate_fill_reply(struct sk_buff *skb,
+ const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ struct linkstate_reply_data *data = LINKSTATE_REPDATA(reply_base);
+
+ if (data->link >= 0 &&
+ nla_put_u8(skb, ETHTOOL_A_LINKSTATE_LINK, !!data->link))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+const struct ethnl_request_ops ethnl_linkstate_request_ops = {
+ .request_cmd = ETHTOOL_MSG_LINKSTATE_GET,
+ .reply_cmd = ETHTOOL_MSG_LINKSTATE_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_LINKSTATE_HEADER,
+ .max_attr = ETHTOOL_A_LINKSTATE_MAX,
+ .req_info_size = sizeof(struct linkstate_req_info),
+ .reply_data_size = sizeof(struct linkstate_reply_data),
+ .request_policy = linkstate_get_policy,
+
+ .prepare_data = linkstate_prepare_data,
+ .reply_size = linkstate_reply_size,
+ .fill_reply = linkstate_fill_reply,
+};
diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c
new file mode 100644
index 000000000000..180c194fab07
--- /dev/null
+++ b/net/ethtool/netlink.c
@@ -0,0 +1,729 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <net/sock.h>
+#include <linux/ethtool_netlink.h>
+#include "netlink.h"
+
+static struct genl_family ethtool_genl_family;
+
+static bool ethnl_ok __read_mostly;
+static u32 ethnl_bcast_seq;
+
+static const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_MAX + 1] = {
+ [ETHTOOL_A_HEADER_UNSPEC] = { .type = NLA_REJECT },
+ [ETHTOOL_A_HEADER_DEV_INDEX] = { .type = NLA_U32 },
+ [ETHTOOL_A_HEADER_DEV_NAME] = { .type = NLA_NUL_STRING,
+ .len = ALTIFNAMSIZ - 1 },
+ [ETHTOOL_A_HEADER_FLAGS] = { .type = NLA_U32 },
+};
+
+/**
+ * ethnl_parse_header() - parse request header
+ * @req_info: structure to put results into
+ * @header: nest attribute with request header
+ * @net: request netns
+ * @extack: netlink extack for error reporting
+ * @require_dev: fail if no device identified in header
+ *
+ * Parse request header in nested attribute @nest and puts results into
+ * the structure pointed to by @req_info. Extack from @info is used for error
+ * reporting. If req_info->dev is not null on return, reference to it has
+ * been taken. If error is returned, *req_info is null initialized and no
+ * reference is held.
+ *
+ * Return: 0 on success or negative error code
+ */
+int ethnl_parse_header(struct ethnl_req_info *req_info,
+ const struct nlattr *header, struct net *net,
+ struct netlink_ext_ack *extack, bool require_dev)
+{
+ struct nlattr *tb[ETHTOOL_A_HEADER_MAX + 1];
+ const struct nlattr *devname_attr;
+ struct net_device *dev = NULL;
+ int ret;
+
+ if (!header) {
+ NL_SET_ERR_MSG(extack, "request header missing");
+ return -EINVAL;
+ }
+ ret = nla_parse_nested(tb, ETHTOOL_A_HEADER_MAX, header,
+ ethnl_header_policy, extack);
+ if (ret < 0)
+ return ret;
+ devname_attr = tb[ETHTOOL_A_HEADER_DEV_NAME];
+
+ if (tb[ETHTOOL_A_HEADER_DEV_INDEX]) {
+ u32 ifindex = nla_get_u32(tb[ETHTOOL_A_HEADER_DEV_INDEX]);
+
+ dev = dev_get_by_index(net, ifindex);
+ if (!dev) {
+ NL_SET_ERR_MSG_ATTR(extack,
+ tb[ETHTOOL_A_HEADER_DEV_INDEX],
+ "no device matches ifindex");
+ return -ENODEV;
+ }
+ /* if both ifindex and ifname are passed, they must match */
+ if (devname_attr &&
+ strncmp(dev->name, nla_data(devname_attr), IFNAMSIZ)) {
+ dev_put(dev);
+ NL_SET_ERR_MSG_ATTR(extack, header,
+ "ifindex and name do not match");
+ return -ENODEV;
+ }
+ } else if (devname_attr) {
+ dev = dev_get_by_name(net, nla_data(devname_attr));
+ if (!dev) {
+ NL_SET_ERR_MSG_ATTR(extack, devname_attr,
+ "no device matches name");
+ return -ENODEV;
+ }
+ } else if (require_dev) {
+ NL_SET_ERR_MSG_ATTR(extack, header,
+ "neither ifindex nor name specified");
+ return -EINVAL;
+ }
+
+ if (dev && !netif_device_present(dev)) {
+ dev_put(dev);
+ NL_SET_ERR_MSG(extack, "device not present");
+ return -ENODEV;
+ }
+
+ req_info->dev = dev;
+ if (tb[ETHTOOL_A_HEADER_FLAGS])
+ req_info->flags = nla_get_u32(tb[ETHTOOL_A_HEADER_FLAGS]);
+
+ return 0;
+}
+
+/**
+ * ethnl_fill_reply_header() - Put common header into a reply message
+ * @skb: skb with the message
+ * @dev: network device to describe in header
+ * @attrtype: attribute type to use for the nest
+ *
+ * Create a nested attribute with attributes describing given network device.
+ *
+ * Return: 0 on success, error value (-EMSGSIZE only) on error
+ */
+int ethnl_fill_reply_header(struct sk_buff *skb, struct net_device *dev,
+ u16 attrtype)
+{
+ struct nlattr *nest;
+
+ if (!dev)
+ return 0;
+ nest = nla_nest_start(skb, attrtype);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(skb, ETHTOOL_A_HEADER_DEV_INDEX, (u32)dev->ifindex) ||
+ nla_put_string(skb, ETHTOOL_A_HEADER_DEV_NAME, dev->name))
+ goto nla_put_failure;
+ /* If more attributes are put into reply header, ethnl_header_size()
+ * must be updated to account for them.
+ */
+
+ nla_nest_end(skb, nest);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
+/**
+ * ethnl_reply_init() - Create skb for a reply and fill device identification
+ * @payload: payload length (without netlink and genetlink header)
+ * @dev: device the reply is about (may be null)
+ * @cmd: ETHTOOL_MSG_* message type for reply
+ * @hdr_attrtype: attribute type for common header
+ * @info: genetlink info of the received packet we respond to
+ * @ehdrp: place to store payload pointer returned by genlmsg_new()
+ *
+ * Return: pointer to allocated skb on success, NULL on error
+ */
+struct sk_buff *ethnl_reply_init(size_t payload, struct net_device *dev, u8 cmd,
+ u16 hdr_attrtype, struct genl_info *info,
+ void **ehdrp)
+{
+ struct sk_buff *skb;
+
+ skb = genlmsg_new(payload, GFP_KERNEL);
+ if (!skb)
+ goto err;
+ *ehdrp = genlmsg_put_reply(skb, info, &ethtool_genl_family, 0, cmd);
+ if (!*ehdrp)
+ goto err_free;
+
+ if (dev) {
+ int ret;
+
+ ret = ethnl_fill_reply_header(skb, dev, hdr_attrtype);
+ if (ret < 0)
+ goto err_free;
+ }
+ return skb;
+
+err_free:
+ nlmsg_free(skb);
+err:
+ if (info)
+ GENL_SET_ERR_MSG(info, "failed to setup reply message");
+ return NULL;
+}
+
+static void *ethnl_bcastmsg_put(struct sk_buff *skb, u8 cmd)
+{
+ return genlmsg_put(skb, 0, ++ethnl_bcast_seq, &ethtool_genl_family, 0,
+ cmd);
+}
+
+static int ethnl_multicast(struct sk_buff *skb, struct net_device *dev)
+{
+ return genlmsg_multicast_netns(&ethtool_genl_family, dev_net(dev), skb,
+ 0, ETHNL_MCGRP_MONITOR, GFP_KERNEL);
+}
+
+/* GET request helpers */
+
+/**
+ * struct ethnl_dump_ctx - context structure for generic dumpit() callback
+ * @ops: request ops of currently processed message type
+ * @req_info: parsed request header of processed request
+ * @reply_data: data needed to compose the reply
+ * @pos_hash: saved iteration position - hashbucket
+ * @pos_idx: saved iteration position - index
+ *
+ * These parameters are kept in struct netlink_callback as context preserved
+ * between iterations. They are initialized by ethnl_default_start() and used
+ * in ethnl_default_dumpit() and ethnl_default_done().
+ */
+struct ethnl_dump_ctx {
+ const struct ethnl_request_ops *ops;
+ struct ethnl_req_info *req_info;
+ struct ethnl_reply_data *reply_data;
+ int pos_hash;
+ int pos_idx;
+};
+
+static const struct ethnl_request_ops *
+ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = {
+ [ETHTOOL_MSG_STRSET_GET] = &ethnl_strset_request_ops,
+ [ETHTOOL_MSG_LINKINFO_GET] = &ethnl_linkinfo_request_ops,
+ [ETHTOOL_MSG_LINKMODES_GET] = &ethnl_linkmodes_request_ops,
+ [ETHTOOL_MSG_LINKSTATE_GET] = &ethnl_linkstate_request_ops,
+ [ETHTOOL_MSG_DEBUG_GET] = &ethnl_debug_request_ops,
+ [ETHTOOL_MSG_WOL_GET] = &ethnl_wol_request_ops,
+};
+
+static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb)
+{
+ return (struct ethnl_dump_ctx *)cb->ctx;
+}
+
+/**
+ * ethnl_default_parse() - Parse request message
+ * @req_info: pointer to structure to put data into
+ * @nlhdr: pointer to request message header
+ * @net: request netns
+ * @request_ops: struct request_ops for request type
+ * @extack: netlink extack for error reporting
+ * @require_dev: fail if no device identified in header
+ *
+ * Parse universal request header and call request specific ->parse_request()
+ * callback (if defined) to parse the rest of the message.
+ *
+ * Return: 0 on success or negative error code
+ */
+static int ethnl_default_parse(struct ethnl_req_info *req_info,
+ const struct nlmsghdr *nlhdr, struct net *net,
+ const struct ethnl_request_ops *request_ops,
+ struct netlink_ext_ack *extack, bool require_dev)
+{
+ struct nlattr **tb;
+ int ret;
+
+ tb = kmalloc_array(request_ops->max_attr + 1, sizeof(tb[0]),
+ GFP_KERNEL);
+ if (!tb)
+ return -ENOMEM;
+
+ ret = nlmsg_parse(nlhdr, GENL_HDRLEN, tb, request_ops->max_attr,
+ request_ops->request_policy, extack);
+ if (ret < 0)
+ goto out;
+ ret = ethnl_parse_header(req_info, tb[request_ops->hdr_attr], net,
+ extack, require_dev);
+ if (ret < 0)
+ goto out;
+
+ if (request_ops->parse_request) {
+ ret = request_ops->parse_request(req_info, tb, extack);
+ if (ret < 0)
+ goto out;
+ }
+
+ ret = 0;
+out:
+ kfree(tb);
+ return ret;
+}
+
+/**
+ * ethnl_init_reply_data() - Initialize reply data for GET request
+ * @reply_data: pointer to embedded struct ethnl_reply_data
+ * @ops: instance of struct ethnl_request_ops describing the layout
+ * @dev: network device to initialize the reply for
+ *
+ * Fills the reply data part with zeros and sets the dev member. Must be called
+ * before calling the ->fill_reply() callback (for each iteration when handling
+ * dump requests).
+ */
+static void ethnl_init_reply_data(struct ethnl_reply_data *reply_data,
+ const struct ethnl_request_ops *ops,
+ struct net_device *dev)
+{
+ memset(reply_data, 0, ops->reply_data_size);
+ reply_data->dev = dev;
+}
+
+/* default ->doit() handler for GET type requests */
+static int ethnl_default_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct ethnl_reply_data *reply_data = NULL;
+ struct ethnl_req_info *req_info = NULL;
+ const u8 cmd = info->genlhdr->cmd;
+ const struct ethnl_request_ops *ops;
+ struct sk_buff *rskb;
+ void *reply_payload;
+ int reply_len;
+ int ret;
+
+ ops = ethnl_default_requests[cmd];
+ if (WARN_ONCE(!ops, "cmd %u has no ethnl_request_ops\n", cmd))
+ return -EOPNOTSUPP;
+ req_info = kzalloc(ops->req_info_size, GFP_KERNEL);
+ if (!req_info)
+ return -ENOMEM;
+ reply_data = kmalloc(ops->reply_data_size, GFP_KERNEL);
+ if (!reply_data) {
+ kfree(req_info);
+ return -ENOMEM;
+ }
+
+ ret = ethnl_default_parse(req_info, info->nlhdr, genl_info_net(info), ops,
+ info->extack, !ops->allow_nodev_do);
+ if (ret < 0)
+ goto err_dev;
+ ethnl_init_reply_data(reply_data, ops, req_info->dev);
+
+ rtnl_lock();
+ ret = ops->prepare_data(req_info, reply_data, info);
+ rtnl_unlock();
+ if (ret < 0)
+ goto err_cleanup;
+ ret = ops->reply_size(req_info, reply_data);
+ if (ret < 0)
+ goto err_cleanup;
+ reply_len = ret;
+ ret = -ENOMEM;
+ rskb = ethnl_reply_init(reply_len, req_info->dev, ops->reply_cmd,
+ ops->hdr_attr, info, &reply_payload);
+ if (!rskb)
+ goto err_cleanup;
+ ret = ops->fill_reply(rskb, req_info, reply_data);
+ if (ret < 0)
+ goto err_msg;
+ if (ops->cleanup_data)
+ ops->cleanup_data(reply_data);
+
+ genlmsg_end(rskb, reply_payload);
+ if (req_info->dev)
+ dev_put(req_info->dev);
+ kfree(reply_data);
+ kfree(req_info);
+ return genlmsg_reply(rskb, info);
+
+err_msg:
+ WARN_ONCE(ret == -EMSGSIZE, "calculated message payload length (%d) not sufficient\n", reply_len);
+ nlmsg_free(rskb);
+err_cleanup:
+ if (ops->cleanup_data)
+ ops->cleanup_data(reply_data);
+err_dev:
+ if (req_info->dev)
+ dev_put(req_info->dev);
+ kfree(reply_data);
+ kfree(req_info);
+ return ret;
+}
+
+static int ethnl_default_dump_one(struct sk_buff *skb, struct net_device *dev,
+ const struct ethnl_dump_ctx *ctx)
+{
+ int ret;
+
+ ethnl_init_reply_data(ctx->reply_data, ctx->ops, dev);
+ rtnl_lock();
+ ret = ctx->ops->prepare_data(ctx->req_info, ctx->reply_data, NULL);
+ rtnl_unlock();
+ if (ret < 0)
+ goto out;
+ ret = ethnl_fill_reply_header(skb, dev, ctx->ops->hdr_attr);
+ if (ret < 0)
+ goto out;
+ ret = ctx->ops->fill_reply(skb, ctx->req_info, ctx->reply_data);
+
+out:
+ if (ctx->ops->cleanup_data)
+ ctx->ops->cleanup_data(ctx->reply_data);
+ ctx->reply_data->dev = NULL;
+ return ret;
+}
+
+/* Default ->dumpit() handler for GET requests. Device iteration copied from
+ * rtnl_dump_ifinfo(); we have to be more careful about device hashtable
+ * persistence as we cannot guarantee to hold RTNL lock through the whole
+ * function as rtnetnlink does.
+ */
+static int ethnl_default_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct ethnl_dump_ctx *ctx = ethnl_dump_context(cb);
+ struct net *net = sock_net(skb->sk);
+ int s_idx = ctx->pos_idx;
+ int h, idx = 0;
+ int ret = 0;
+ void *ehdr;
+
+ rtnl_lock();
+ for (h = ctx->pos_hash; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
+ struct hlist_head *head;
+ struct net_device *dev;
+ unsigned int seq;
+
+ head = &net->dev_index_head[h];
+
+restart_chain:
+ seq = net->dev_base_seq;
+ cb->seq = seq;
+ idx = 0;
+ hlist_for_each_entry(dev, head, index_hlist) {
+ if (idx < s_idx)
+ goto cont;
+ dev_hold(dev);
+ rtnl_unlock();
+
+ ehdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ &ethtool_genl_family, 0,
+ ctx->ops->reply_cmd);
+ if (!ehdr) {
+ dev_put(dev);
+ ret = -EMSGSIZE;
+ goto out;
+ }
+ ret = ethnl_default_dump_one(skb, dev, ctx);
+ dev_put(dev);
+ if (ret < 0) {
+ genlmsg_cancel(skb, ehdr);
+ if (ret == -EOPNOTSUPP)
+ goto lock_and_cont;
+ if (likely(skb->len))
+ ret = skb->len;
+ goto out;
+ }
+ genlmsg_end(skb, ehdr);
+lock_and_cont:
+ rtnl_lock();
+ if (net->dev_base_seq != seq) {
+ s_idx = idx + 1;
+ goto restart_chain;
+ }
+cont:
+ idx++;
+ }
+
+ }
+ rtnl_unlock();
+
+out:
+ ctx->pos_hash = h;
+ ctx->pos_idx = idx;
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+
+ return ret;
+}
+
+/* generic ->start() handler for GET requests */
+static int ethnl_default_start(struct netlink_callback *cb)
+{
+ struct ethnl_dump_ctx *ctx = ethnl_dump_context(cb);
+ struct ethnl_reply_data *reply_data;
+ const struct ethnl_request_ops *ops;
+ struct ethnl_req_info *req_info;
+ struct genlmsghdr *ghdr;
+ int ret;
+
+ BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx));
+
+ ghdr = nlmsg_data(cb->nlh);
+ ops = ethnl_default_requests[ghdr->cmd];
+ if (WARN_ONCE(!ops, "cmd %u has no ethnl_request_ops\n", ghdr->cmd))
+ return -EOPNOTSUPP;
+ req_info = kzalloc(ops->req_info_size, GFP_KERNEL);
+ if (!req_info)
+ return -ENOMEM;
+ reply_data = kmalloc(ops->reply_data_size, GFP_KERNEL);
+ if (!reply_data) {
+ ret = -ENOMEM;
+ goto free_req_info;
+ }
+
+ ret = ethnl_default_parse(req_info, cb->nlh, sock_net(cb->skb->sk), ops,
+ cb->extack, false);
+ if (req_info->dev) {
+ /* We ignore device specification in dump requests but as the
+ * same parser as for non-dump (doit) requests is used, it
+ * would take reference to the device if it finds one
+ */
+ dev_put(req_info->dev);
+ req_info->dev = NULL;
+ }
+ if (ret < 0)
+ goto free_reply_data;
+
+ ctx->ops = ops;
+ ctx->req_info = req_info;
+ ctx->reply_data = reply_data;
+ ctx->pos_hash = 0;
+ ctx->pos_idx = 0;
+
+ return 0;
+
+free_reply_data:
+ kfree(reply_data);
+free_req_info:
+ kfree(req_info);
+
+ return ret;
+}
+
+/* default ->done() handler for GET requests */
+static int ethnl_default_done(struct netlink_callback *cb)
+{
+ struct ethnl_dump_ctx *ctx = ethnl_dump_context(cb);
+
+ kfree(ctx->reply_data);
+ kfree(ctx->req_info);
+
+ return 0;
+}
+
+static const struct ethnl_request_ops *
+ethnl_default_notify_ops[ETHTOOL_MSG_KERNEL_MAX + 1] = {
+ [ETHTOOL_MSG_LINKINFO_NTF] = &ethnl_linkinfo_request_ops,
+ [ETHTOOL_MSG_LINKMODES_NTF] = &ethnl_linkmodes_request_ops,
+ [ETHTOOL_MSG_DEBUG_NTF] = &ethnl_debug_request_ops,
+ [ETHTOOL_MSG_WOL_NTF] = &ethnl_wol_request_ops,
+};
+
+/* default notification handler */
+static void ethnl_default_notify(struct net_device *dev, unsigned int cmd,
+ const void *data)
+{
+ struct ethnl_reply_data *reply_data;
+ const struct ethnl_request_ops *ops;
+ struct ethnl_req_info *req_info;
+ struct sk_buff *skb;
+ void *reply_payload;
+ int reply_len;
+ int ret;
+
+ if (WARN_ONCE(cmd > ETHTOOL_MSG_KERNEL_MAX ||
+ !ethnl_default_notify_ops[cmd],
+ "unexpected notification type %u\n", cmd))
+ return;
+ ops = ethnl_default_notify_ops[cmd];
+ req_info = kzalloc(ops->req_info_size, GFP_KERNEL);
+ if (!req_info)
+ return;
+ reply_data = kmalloc(ops->reply_data_size, GFP_KERNEL);
+ if (!reply_data) {
+ kfree(req_info);
+ return;
+ }
+
+ req_info->dev = dev;
+ req_info->flags |= ETHTOOL_FLAG_COMPACT_BITSETS;
+
+ ethnl_init_reply_data(reply_data, ops, dev);
+ ret = ops->prepare_data(req_info, reply_data, NULL);
+ if (ret < 0)
+ goto err_cleanup;
+ ret = ops->reply_size(req_info, reply_data);
+ if (ret < 0)
+ goto err_cleanup;
+ reply_len = ret;
+ ret = -ENOMEM;
+ skb = genlmsg_new(reply_len, GFP_KERNEL);
+ if (!skb)
+ goto err_cleanup;
+ reply_payload = ethnl_bcastmsg_put(skb, cmd);
+ if (!reply_payload)
+ goto err_skb;
+ ret = ethnl_fill_reply_header(skb, dev, ops->hdr_attr);
+ if (ret < 0)
+ goto err_msg;
+ ret = ops->fill_reply(skb, req_info, reply_data);
+ if (ret < 0)
+ goto err_msg;
+ if (ops->cleanup_data)
+ ops->cleanup_data(reply_data);
+
+ genlmsg_end(skb, reply_payload);
+ kfree(reply_data);
+ kfree(req_info);
+ ethnl_multicast(skb, dev);
+ return;
+
+err_msg:
+ WARN_ONCE(ret == -EMSGSIZE,
+ "calculated message payload length (%d) not sufficient\n",
+ reply_len);
+err_skb:
+ nlmsg_free(skb);
+err_cleanup:
+ if (ops->cleanup_data)
+ ops->cleanup_data(reply_data);
+ kfree(reply_data);
+ kfree(req_info);
+ return;
+}
+
+/* notifications */
+
+typedef void (*ethnl_notify_handler_t)(struct net_device *dev, unsigned int cmd,
+ const void *data);
+
+static const ethnl_notify_handler_t ethnl_notify_handlers[] = {
+ [ETHTOOL_MSG_LINKINFO_NTF] = ethnl_default_notify,
+ [ETHTOOL_MSG_LINKMODES_NTF] = ethnl_default_notify,
+ [ETHTOOL_MSG_DEBUG_NTF] = ethnl_default_notify,
+ [ETHTOOL_MSG_WOL_NTF] = ethnl_default_notify,
+};
+
+void ethtool_notify(struct net_device *dev, unsigned int cmd, const void *data)
+{
+ if (unlikely(!ethnl_ok))
+ return;
+ ASSERT_RTNL();
+
+ if (likely(cmd < ARRAY_SIZE(ethnl_notify_handlers) &&
+ ethnl_notify_handlers[cmd]))
+ ethnl_notify_handlers[cmd](dev, cmd, data);
+ else
+ WARN_ONCE(1, "notification %u not implemented (dev=%s)\n",
+ cmd, netdev_name(dev));
+}
+EXPORT_SYMBOL(ethtool_notify);
+
+/* genetlink setup */
+
+static const struct genl_ops ethtool_genl_ops[] = {
+ {
+ .cmd = ETHTOOL_MSG_STRSET_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ },
+ {
+ .cmd = ETHTOOL_MSG_LINKINFO_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ },
+ {
+ .cmd = ETHTOOL_MSG_LINKINFO_SET,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_set_linkinfo,
+ },
+ {
+ .cmd = ETHTOOL_MSG_LINKMODES_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ },
+ {
+ .cmd = ETHTOOL_MSG_LINKMODES_SET,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_set_linkmodes,
+ },
+ {
+ .cmd = ETHTOOL_MSG_LINKSTATE_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ },
+ {
+ .cmd = ETHTOOL_MSG_DEBUG_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ },
+ {
+ .cmd = ETHTOOL_MSG_DEBUG_SET,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_set_debug,
+ },
+ {
+ .cmd = ETHTOOL_MSG_WOL_GET,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ },
+ {
+ .cmd = ETHTOOL_MSG_WOL_SET,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_set_wol,
+ },
+};
+
+static const struct genl_multicast_group ethtool_nl_mcgrps[] = {
+ [ETHNL_MCGRP_MONITOR] = { .name = ETHTOOL_MCGRP_MONITOR_NAME },
+};
+
+static struct genl_family ethtool_genl_family = {
+ .name = ETHTOOL_GENL_NAME,
+ .version = ETHTOOL_GENL_VERSION,
+ .netnsok = true,
+ .parallel_ops = true,
+ .ops = ethtool_genl_ops,
+ .n_ops = ARRAY_SIZE(ethtool_genl_ops),
+ .mcgrps = ethtool_nl_mcgrps,
+ .n_mcgrps = ARRAY_SIZE(ethtool_nl_mcgrps),
+};
+
+/* module setup */
+
+static int __init ethnl_init(void)
+{
+ int ret;
+
+ ret = genl_register_family(&ethtool_genl_family);
+ if (WARN(ret < 0, "ethtool: genetlink family registration failed"))
+ return ret;
+ ethnl_ok = true;
+
+ return 0;
+}
+
+subsys_initcall(ethnl_init);
diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h
new file mode 100644
index 000000000000..60efd87686ad
--- /dev/null
+++ b/net/ethtool/netlink.h
@@ -0,0 +1,345 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef _NET_ETHTOOL_NETLINK_H
+#define _NET_ETHTOOL_NETLINK_H
+
+#include <linux/ethtool_netlink.h>
+#include <linux/netdevice.h>
+#include <net/genetlink.h>
+#include <net/sock.h>
+
+struct ethnl_req_info;
+
+int ethnl_parse_header(struct ethnl_req_info *req_info,
+ const struct nlattr *nest, struct net *net,
+ struct netlink_ext_ack *extack, bool require_dev);
+int ethnl_fill_reply_header(struct sk_buff *skb, struct net_device *dev,
+ u16 attrtype);
+struct sk_buff *ethnl_reply_init(size_t payload, struct net_device *dev, u8 cmd,
+ u16 hdr_attrtype, struct genl_info *info,
+ void **ehdrp);
+
+/**
+ * ethnl_strz_size() - calculate attribute length for fixed size string
+ * @s: ETH_GSTRING_LEN sized string (may not be null terminated)
+ *
+ * Return: total length of an attribute with null terminated string from @s
+ */
+static inline int ethnl_strz_size(const char *s)
+{
+ return nla_total_size(strnlen(s, ETH_GSTRING_LEN) + 1);
+}
+
+/**
+ * ethnl_put_strz() - put string attribute with fixed size string
+ * @skb: skb with the message
+ * @attrype: attribute type
+ * @s: ETH_GSTRING_LEN sized string (may not be null terminated)
+ *
+ * Puts an attribute with null terminated string from @s into the message.
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static inline int ethnl_put_strz(struct sk_buff *skb, u16 attrtype,
+ const char *s)
+{
+ unsigned int len = strnlen(s, ETH_GSTRING_LEN);
+ struct nlattr *attr;
+
+ attr = nla_reserve(skb, attrtype, len + 1);
+ if (!attr)
+ return -EMSGSIZE;
+
+ memcpy(nla_data(attr), s, len);
+ ((char *)nla_data(attr))[len] = '\0';
+ return 0;
+}
+
+/**
+ * ethnl_update_u32() - update u32 value from NLA_U32 attribute
+ * @dst: value to update
+ * @attr: netlink attribute with new value or null
+ * @mod: pointer to bool for modification tracking
+ *
+ * Copy the u32 value from NLA_U32 netlink attribute @attr into variable
+ * pointed to by @dst; do nothing if @attr is null. Bool pointed to by @mod
+ * is set to true if this function changed the value of *dst, otherwise it
+ * is left as is.
+ */
+static inline void ethnl_update_u32(u32 *dst, const struct nlattr *attr,
+ bool *mod)
+{
+ u32 val;
+
+ if (!attr)
+ return;
+ val = nla_get_u32(attr);
+ if (*dst == val)
+ return;
+
+ *dst = val;
+ *mod = true;
+}
+
+/**
+ * ethnl_update_u8() - update u8 value from NLA_U8 attribute
+ * @dst: value to update
+ * @attr: netlink attribute with new value or null
+ * @mod: pointer to bool for modification tracking
+ *
+ * Copy the u8 value from NLA_U8 netlink attribute @attr into variable
+ * pointed to by @dst; do nothing if @attr is null. Bool pointed to by @mod
+ * is set to true if this function changed the value of *dst, otherwise it
+ * is left as is.
+ */
+static inline void ethnl_update_u8(u8 *dst, const struct nlattr *attr,
+ bool *mod)
+{
+ u8 val;
+
+ if (!attr)
+ return;
+ val = nla_get_u8(attr);
+ if (*dst == val)
+ return;
+
+ *dst = val;
+ *mod = true;
+}
+
+/**
+ * ethnl_update_bool32() - update u32 used as bool from NLA_U8 attribute
+ * @dst: value to update
+ * @attr: netlink attribute with new value or null
+ * @mod: pointer to bool for modification tracking
+ *
+ * Use the u8 value from NLA_U8 netlink attribute @attr to set u32 variable
+ * pointed to by @dst to 0 (if zero) or 1 (if not); do nothing if @attr is
+ * null. Bool pointed to by @mod is set to true if this function changed the
+ * logical value of *dst, otherwise it is left as is.
+ */
+static inline void ethnl_update_bool32(u32 *dst, const struct nlattr *attr,
+ bool *mod)
+{
+ u8 val;
+
+ if (!attr)
+ return;
+ val = !!nla_get_u8(attr);
+ if (!!*dst == val)
+ return;
+
+ *dst = val;
+ *mod = true;
+}
+
+/**
+ * ethnl_update_binary() - update binary data from NLA_BINARY atribute
+ * @dst: value to update
+ * @len: destination buffer length
+ * @attr: netlink attribute with new value or null
+ * @mod: pointer to bool for modification tracking
+ *
+ * Use the u8 value from NLA_U8 netlink attribute @attr to rewrite data block
+ * of length @len at @dst by attribute payload; do nothing if @attr is null.
+ * Bool pointed to by @mod is set to true if this function changed the logical
+ * value of *dst, otherwise it is left as is.
+ */
+static inline void ethnl_update_binary(void *dst, unsigned int len,
+ const struct nlattr *attr, bool *mod)
+{
+ if (!attr)
+ return;
+ if (nla_len(attr) < len)
+ len = nla_len(attr);
+ if (!memcmp(dst, nla_data(attr), len))
+ return;
+
+ memcpy(dst, nla_data(attr), len);
+ *mod = true;
+}
+
+/**
+ * ethnl_update_bitfield32() - update u32 value from NLA_BITFIELD32 attribute
+ * @dst: value to update
+ * @attr: netlink attribute with new value or null
+ * @mod: pointer to bool for modification tracking
+ *
+ * Update bits in u32 value which are set in attribute's mask to values from
+ * attribute's value. Do nothing if @attr is null or the value wouldn't change;
+ * otherwise, set bool pointed to by @mod to true.
+ */
+static inline void ethnl_update_bitfield32(u32 *dst, const struct nlattr *attr,
+ bool *mod)
+{
+ struct nla_bitfield32 change;
+ u32 newval;
+
+ if (!attr)
+ return;
+ change = nla_get_bitfield32(attr);
+ newval = (*dst & ~change.selector) | (change.value & change.selector);
+ if (*dst == newval)
+ return;
+
+ *dst = newval;
+ *mod = true;
+}
+
+/**
+ * ethnl_reply_header_size() - total size of reply header
+ *
+ * This is an upper estimate so that we do not need to hold RTNL lock longer
+ * than necessary (to prevent rename between size estimate and composing the
+ * message). Accounts only for device ifindex and name as those are the only
+ * attributes ethnl_fill_reply_header() puts into the reply header.
+ */
+static inline unsigned int ethnl_reply_header_size(void)
+{
+ return nla_total_size(nla_total_size(sizeof(u32)) +
+ nla_total_size(IFNAMSIZ));
+}
+
+/* GET request handling */
+
+/* Unified processing of GET requests uses two data structures: request info
+ * and reply data. Request info holds information parsed from client request
+ * and its stays constant through all request processing. Reply data holds data
+ * retrieved from ethtool_ops callbacks or other internal sources which is used
+ * to compose the reply. When processing a dump request, request info is filled
+ * only once (when the request message is parsed) but reply data is filled for
+ * each reply message.
+ *
+ * Both structures consist of part common for all request types (struct
+ * ethnl_req_info and struct ethnl_reply_data defined below) and optional
+ * parts specific for each request type. Common part always starts at offset 0.
+ */
+
+/**
+ * struct ethnl_req_info - base type of request information for GET requests
+ * @dev: network device the request is for (may be null)
+ * @flags: request flags common for all request types
+ *
+ * This is a common base for request specific structures holding data from
+ * parsed userspace request. These always embed struct ethnl_req_info at
+ * zero offset.
+ */
+struct ethnl_req_info {
+ struct net_device *dev;
+ u32 flags;
+};
+
+/**
+ * struct ethnl_reply_data - base type of reply data for GET requests
+ * @dev: device for current reply message; in single shot requests it is
+ * equal to &ethnl_req_info.dev; in dumps it's different for each
+ * reply message
+ *
+ * This is a common base for request specific structures holding data for
+ * kernel reply message. These always embed struct ethnl_reply_data at zero
+ * offset.
+ */
+struct ethnl_reply_data {
+ struct net_device *dev;
+};
+
+static inline int ethnl_ops_begin(struct net_device *dev)
+{
+ if (dev && dev->ethtool_ops->begin)
+ return dev->ethtool_ops->begin(dev);
+ else
+ return 0;
+}
+
+static inline void ethnl_ops_complete(struct net_device *dev)
+{
+ if (dev && dev->ethtool_ops->complete)
+ dev->ethtool_ops->complete(dev);
+}
+
+/**
+ * struct ethnl_request_ops - unified handling of GET requests
+ * @request_cmd: command id for request (GET)
+ * @reply_cmd: command id for reply (GET_REPLY)
+ * @hdr_attr: attribute type for request header
+ * @max_attr: maximum (top level) attribute type
+ * @req_info_size: size of request info
+ * @reply_data_size: size of reply data
+ * @request_policy: netlink policy for message contents
+ * @allow_nodev_do: allow non-dump request with no device identification
+ * @parse_request:
+ * Parse request except common header (struct ethnl_req_info). Common
+ * header is already filled on entry, the rest up to @repdata_offset
+ * is zero initialized. This callback should only modify type specific
+ * request info by parsed attributes from request message.
+ * @prepare_data:
+ * Retrieve and prepare data needed to compose a reply message. Calls to
+ * ethtool_ops handlers are limited to this callback. Common reply data
+ * (struct ethnl_reply_data) is filled on entry, type specific part after
+ * it is zero initialized. This callback should only modify the type
+ * specific part of reply data. Device identification from struct
+ * ethnl_reply_data is to be used as for dump requests, it iterates
+ * through network devices while dev member of struct ethnl_req_info
+ * points to the device from client request.
+ * @reply_size:
+ * Estimate reply message size. Returned value must be sufficient for
+ * message payload without common reply header. The callback may returned
+ * estimate higher than actual message size if exact calculation would
+ * not be worth the saved memory space.
+ * @fill_reply:
+ * Fill reply message payload (except for common header) from reply data.
+ * The callback must not generate more payload than previously called
+ * ->reply_size() estimated.
+ * @cleanup_data:
+ * Optional cleanup called when reply data is no longer needed. Can be
+ * used e.g. to free any additional data structures outside the main
+ * structure which were allocated by ->prepare_data(). When processing
+ * dump requests, ->cleanup() is called for each message.
+ *
+ * Description of variable parts of GET request handling when using the
+ * unified infrastructure. When used, a pointer to an instance of this
+ * structure is to be added to &ethnl_default_requests array and generic
+ * handlers ethnl_default_doit(), ethnl_default_dumpit(),
+ * ethnl_default_start() and ethnl_default_done() used in @ethtool_genl_ops;
+ * ethnl_default_notify() can be used in @ethnl_notify_handlers to send
+ * notifications of the corresponding type.
+ */
+struct ethnl_request_ops {
+ u8 request_cmd;
+ u8 reply_cmd;
+ u16 hdr_attr;
+ unsigned int max_attr;
+ unsigned int req_info_size;
+ unsigned int reply_data_size;
+ const struct nla_policy *request_policy;
+ bool allow_nodev_do;
+
+ int (*parse_request)(struct ethnl_req_info *req_info,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack);
+ int (*prepare_data)(const struct ethnl_req_info *req_info,
+ struct ethnl_reply_data *reply_data,
+ struct genl_info *info);
+ int (*reply_size)(const struct ethnl_req_info *req_info,
+ const struct ethnl_reply_data *reply_data);
+ int (*fill_reply)(struct sk_buff *skb,
+ const struct ethnl_req_info *req_info,
+ const struct ethnl_reply_data *reply_data);
+ void (*cleanup_data)(struct ethnl_reply_data *reply_data);
+};
+
+/* request handlers */
+
+extern const struct ethnl_request_ops ethnl_strset_request_ops;
+extern const struct ethnl_request_ops ethnl_linkinfo_request_ops;
+extern const struct ethnl_request_ops ethnl_linkmodes_request_ops;
+extern const struct ethnl_request_ops ethnl_linkstate_request_ops;
+extern const struct ethnl_request_ops ethnl_debug_request_ops;
+extern const struct ethnl_request_ops ethnl_wol_request_ops;
+
+int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info);
+int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info);
+int ethnl_set_debug(struct sk_buff *skb, struct genl_info *info);
+int ethnl_set_wol(struct sk_buff *skb, struct genl_info *info);
+
+#endif /* _NET_ETHTOOL_NETLINK_H */
diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c
new file mode 100644
index 000000000000..8e5911887b4c
--- /dev/null
+++ b/net/ethtool/strset.c
@@ -0,0 +1,437 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/ethtool.h>
+#include <linux/phy.h>
+#include "netlink.h"
+#include "common.h"
+
+struct strset_info {
+ bool per_dev;
+ bool free_strings;
+ unsigned int count;
+ const char (*strings)[ETH_GSTRING_LEN];
+};
+
+static const struct strset_info info_template[] = {
+ [ETH_SS_TEST] = {
+ .per_dev = true,
+ },
+ [ETH_SS_STATS] = {
+ .per_dev = true,
+ },
+ [ETH_SS_PRIV_FLAGS] = {
+ .per_dev = true,
+ },
+ [ETH_SS_FEATURES] = {
+ .per_dev = false,
+ .count = ARRAY_SIZE(netdev_features_strings),
+ .strings = netdev_features_strings,
+ },
+ [ETH_SS_RSS_HASH_FUNCS] = {
+ .per_dev = false,
+ .count = ARRAY_SIZE(rss_hash_func_strings),
+ .strings = rss_hash_func_strings,
+ },
+ [ETH_SS_TUNABLES] = {
+ .per_dev = false,
+ .count = ARRAY_SIZE(tunable_strings),
+ .strings = tunable_strings,
+ },
+ [ETH_SS_PHY_STATS] = {
+ .per_dev = true,
+ },
+ [ETH_SS_PHY_TUNABLES] = {
+ .per_dev = false,
+ .count = ARRAY_SIZE(phy_tunable_strings),
+ .strings = phy_tunable_strings,
+ },
+ [ETH_SS_LINK_MODES] = {
+ .per_dev = false,
+ .count = __ETHTOOL_LINK_MODE_MASK_NBITS,
+ .strings = link_mode_names,
+ },
+ [ETH_SS_MSG_CLASSES] = {
+ .per_dev = false,
+ .count = NETIF_MSG_CLASS_COUNT,
+ .strings = netif_msg_class_names,
+ },
+ [ETH_SS_WOL_MODES] = {
+ .per_dev = false,
+ .count = WOL_MODE_COUNT,
+ .strings = wol_mode_names,
+ },
+};
+
+struct strset_req_info {
+ struct ethnl_req_info base;
+ u32 req_ids;
+ bool counts_only;
+};
+
+#define STRSET_REQINFO(__req_base) \
+ container_of(__req_base, struct strset_req_info, base)
+
+struct strset_reply_data {
+ struct ethnl_reply_data base;
+ struct strset_info sets[ETH_SS_COUNT];
+};
+
+#define STRSET_REPDATA(__reply_base) \
+ container_of(__reply_base, struct strset_reply_data, base)
+
+static const struct nla_policy strset_get_policy[ETHTOOL_A_STRSET_MAX + 1] = {
+ [ETHTOOL_A_STRSET_UNSPEC] = { .type = NLA_REJECT },
+ [ETHTOOL_A_STRSET_HEADER] = { .type = NLA_NESTED },
+ [ETHTOOL_A_STRSET_STRINGSETS] = { .type = NLA_NESTED },
+};
+
+static const struct nla_policy
+get_stringset_policy[ETHTOOL_A_STRINGSET_MAX + 1] = {
+ [ETHTOOL_A_STRINGSET_UNSPEC] = { .type = NLA_REJECT },
+ [ETHTOOL_A_STRINGSET_ID] = { .type = NLA_U32 },
+ [ETHTOOL_A_STRINGSET_COUNT] = { .type = NLA_REJECT },
+ [ETHTOOL_A_STRINGSET_STRINGS] = { .type = NLA_REJECT },
+};
+
+/**
+ * strset_include() - test if a string set should be included in reply
+ * @info: parsed client request
+ * @data: pointer to request data structure
+ * @id: id of string set to check (ETH_SS_* constants)
+ */
+static bool strset_include(const struct strset_req_info *info,
+ const struct strset_reply_data *data, u32 id)
+{
+ bool per_dev;
+
+ BUILD_BUG_ON(ETH_SS_COUNT >= BITS_PER_BYTE * sizeof(info->req_ids));
+
+ if (info->req_ids)
+ return info->req_ids & (1U << id);
+ per_dev = data->sets[id].per_dev;
+ if (!per_dev && !data->sets[id].strings)
+ return false;
+
+ return data->base.dev ? per_dev : !per_dev;
+}
+
+static int strset_get_id(const struct nlattr *nest, u32 *val,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[ETHTOOL_A_STRINGSET_MAX + 1];
+ int ret;
+
+ ret = nla_parse_nested(tb, ETHTOOL_A_STRINGSET_MAX, nest,
+ get_stringset_policy, extack);
+ if (ret < 0)
+ return ret;
+ if (!tb[ETHTOOL_A_STRINGSET_ID])
+ return -EINVAL;
+
+ *val = nla_get_u32(tb[ETHTOOL_A_STRINGSET_ID]);
+ return 0;
+}
+
+static const struct nla_policy
+strset_stringsets_policy[ETHTOOL_A_STRINGSETS_MAX + 1] = {
+ [ETHTOOL_A_STRINGSETS_UNSPEC] = { .type = NLA_REJECT },
+ [ETHTOOL_A_STRINGSETS_STRINGSET] = { .type = NLA_NESTED },
+};
+
+static int strset_parse_request(struct ethnl_req_info *req_base,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{
+ struct strset_req_info *req_info = STRSET_REQINFO(req_base);
+ struct nlattr *nest = tb[ETHTOOL_A_STRSET_STRINGSETS];
+ struct nlattr *attr;
+ int rem, ret;
+
+ if (!nest)
+ return 0;
+ ret = nla_validate_nested(nest, ETHTOOL_A_STRINGSETS_MAX,
+ strset_stringsets_policy, extack);
+ if (ret < 0)
+ return ret;
+
+ req_info->counts_only = tb[ETHTOOL_A_STRSET_COUNTS_ONLY];
+ nla_for_each_nested(attr, nest, rem) {
+ u32 id;
+
+ if (WARN_ONCE(nla_type(attr) != ETHTOOL_A_STRINGSETS_STRINGSET,
+ "unexpected attrtype %u in ETHTOOL_A_STRSET_STRINGSETS\n",
+ nla_type(attr)))
+ return -EINVAL;
+
+ ret = strset_get_id(attr, &id, extack);
+ if (ret < 0)
+ return ret;
+ if (ret >= ETH_SS_COUNT) {
+ NL_SET_ERR_MSG_ATTR(extack, attr,
+ "unknown string set id");
+ return -EOPNOTSUPP;
+ }
+
+ req_info->req_ids |= (1U << id);
+ }
+
+ return 0;
+}
+
+static void strset_cleanup_data(struct ethnl_reply_data *reply_base)
+{
+ struct strset_reply_data *data = STRSET_REPDATA(reply_base);
+ unsigned int i;
+
+ for (i = 0; i < ETH_SS_COUNT; i++)
+ if (data->sets[i].free_strings) {
+ kfree(data->sets[i].strings);
+ data->sets[i].strings = NULL;
+ data->sets[i].free_strings = false;
+ }
+}
+
+static int strset_prepare_set(struct strset_info *info, struct net_device *dev,
+ unsigned int id, bool counts_only)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ void *strings;
+ int count, ret;
+
+ if (id == ETH_SS_PHY_STATS && dev->phydev &&
+ !ops->get_ethtool_phy_stats)
+ ret = phy_ethtool_get_sset_count(dev->phydev);
+ else if (ops->get_sset_count && ops->get_strings)
+ ret = ops->get_sset_count(dev, id);
+ else
+ ret = -EOPNOTSUPP;
+ if (ret <= 0) {
+ info->count = 0;
+ return 0;
+ }
+
+ count = ret;
+ if (!counts_only) {
+ strings = kcalloc(count, ETH_GSTRING_LEN, GFP_KERNEL);
+ if (!strings)
+ return -ENOMEM;
+ if (id == ETH_SS_PHY_STATS && dev->phydev &&
+ !ops->get_ethtool_phy_stats)
+ phy_ethtool_get_strings(dev->phydev, strings);
+ else
+ ops->get_strings(dev, id, strings);
+ info->strings = strings;
+ info->free_strings = true;
+ }
+ info->count = count;
+
+ return 0;
+}
+
+static int strset_prepare_data(const struct ethnl_req_info *req_base,
+ struct ethnl_reply_data *reply_base,
+ struct genl_info *info)
+{
+ const struct strset_req_info *req_info = STRSET_REQINFO(req_base);
+ struct strset_reply_data *data = STRSET_REPDATA(reply_base);
+ struct net_device *dev = reply_base->dev;
+ unsigned int i;
+ int ret;
+
+ BUILD_BUG_ON(ARRAY_SIZE(info_template) != ETH_SS_COUNT);
+ memcpy(&data->sets, &info_template, sizeof(data->sets));
+
+ if (!dev) {
+ for (i = 0; i < ETH_SS_COUNT; i++) {
+ if ((req_info->req_ids & (1U << i)) &&
+ data->sets[i].per_dev) {
+ if (info)
+ GENL_SET_ERR_MSG(info, "requested per device strings without dev");
+ return -EINVAL;
+ }
+ }
+ return 0;
+ }
+
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ goto err_strset;
+ for (i = 0; i < ETH_SS_COUNT; i++) {
+ if (!strset_include(req_info, data, i) ||
+ !data->sets[i].per_dev)
+ continue;
+
+ ret = strset_prepare_set(&data->sets[i], dev, i,
+ req_info->counts_only);
+ if (ret < 0)
+ goto err_ops;
+ }
+ ethnl_ops_complete(dev);
+
+ return 0;
+err_ops:
+ ethnl_ops_complete(dev);
+err_strset:
+ strset_cleanup_data(reply_base);
+ return ret;
+}
+
+/* calculate size of ETHTOOL_A_STRSET_STRINGSET nest for one string set */
+static int strset_set_size(const struct strset_info *info, bool counts_only)
+{
+ unsigned int len = 0;
+ unsigned int i;
+
+ if (info->count == 0)
+ return 0;
+ if (counts_only)
+ return nla_total_size(2 * nla_total_size(sizeof(u32)));
+
+ for (i = 0; i < info->count; i++) {
+ const char *str = info->strings[i];
+
+ /* ETHTOOL_A_STRING_INDEX, ETHTOOL_A_STRING_VALUE, nest */
+ len += nla_total_size(nla_total_size(sizeof(u32)) +
+ ethnl_strz_size(str));
+ }
+ /* ETHTOOL_A_STRINGSET_ID, ETHTOOL_A_STRINGSET_COUNT */
+ len = 2 * nla_total_size(sizeof(u32)) + nla_total_size(len);
+
+ return nla_total_size(len);
+}
+
+static int strset_reply_size(const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct strset_req_info *req_info = STRSET_REQINFO(req_base);
+ const struct strset_reply_data *data = STRSET_REPDATA(reply_base);
+ unsigned int i;
+ int len = 0;
+ int ret;
+
+ len += ethnl_reply_header_size();
+ for (i = 0; i < ETH_SS_COUNT; i++) {
+ const struct strset_info *set_info = &data->sets[i];
+
+ if (!strset_include(req_info, data, i))
+ continue;
+
+ ret = strset_set_size(set_info, req_info->counts_only);
+ if (ret < 0)
+ return ret;
+ len += ret;
+ }
+
+ return len;
+}
+
+/* fill one string into reply */
+static int strset_fill_string(struct sk_buff *skb,
+ const struct strset_info *set_info, u32 idx)
+{
+ struct nlattr *string_attr;
+ const char *value;
+
+ value = set_info->strings[idx];
+
+ string_attr = nla_nest_start(skb, ETHTOOL_A_STRINGS_STRING);
+ if (!string_attr)
+ return -EMSGSIZE;
+ if (nla_put_u32(skb, ETHTOOL_A_STRING_INDEX, idx) ||
+ ethnl_put_strz(skb, ETHTOOL_A_STRING_VALUE, value))
+ goto nla_put_failure;
+ nla_nest_end(skb, string_attr);
+
+ return 0;
+nla_put_failure:
+ nla_nest_cancel(skb, string_attr);
+ return -EMSGSIZE;
+}
+
+/* fill one string set into reply */
+static int strset_fill_set(struct sk_buff *skb,
+ const struct strset_info *set_info, u32 id,
+ bool counts_only)
+{
+ struct nlattr *stringset_attr;
+ struct nlattr *strings_attr;
+ unsigned int i;
+
+ if (!set_info->per_dev && !set_info->strings)
+ return -EOPNOTSUPP;
+ if (set_info->count == 0)
+ return 0;
+ stringset_attr = nla_nest_start(skb, ETHTOOL_A_STRINGSETS_STRINGSET);
+ if (!stringset_attr)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(skb, ETHTOOL_A_STRINGSET_ID, id) ||
+ nla_put_u32(skb, ETHTOOL_A_STRINGSET_COUNT, set_info->count))
+ goto nla_put_failure;
+
+ if (!counts_only) {
+ strings_attr = nla_nest_start(skb, ETHTOOL_A_STRINGSET_STRINGS);
+ if (!strings_attr)
+ goto nla_put_failure;
+ for (i = 0; i < set_info->count; i++) {
+ if (strset_fill_string(skb, set_info, i) < 0)
+ goto nla_put_failure;
+ }
+ nla_nest_end(skb, strings_attr);
+ }
+
+ nla_nest_end(skb, stringset_attr);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, stringset_attr);
+ return -EMSGSIZE;
+}
+
+static int strset_fill_reply(struct sk_buff *skb,
+ const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct strset_req_info *req_info = STRSET_REQINFO(req_base);
+ const struct strset_reply_data *data = STRSET_REPDATA(reply_base);
+ struct nlattr *nest;
+ unsigned int i;
+ int ret;
+
+ nest = nla_nest_start(skb, ETHTOOL_A_STRSET_STRINGSETS);
+ if (!nest)
+ return -EMSGSIZE;
+
+ for (i = 0; i < ETH_SS_COUNT; i++) {
+ if (strset_include(req_info, data, i)) {
+ ret = strset_fill_set(skb, &data->sets[i], i,
+ req_info->counts_only);
+ if (ret < 0)
+ goto nla_put_failure;
+ }
+ }
+
+ nla_nest_end(skb, nest);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return ret;
+}
+
+const struct ethnl_request_ops ethnl_strset_request_ops = {
+ .request_cmd = ETHTOOL_MSG_STRSET_GET,
+ .reply_cmd = ETHTOOL_MSG_STRSET_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_STRSET_HEADER,
+ .max_attr = ETHTOOL_A_STRSET_MAX,
+ .req_info_size = sizeof(struct strset_req_info),
+ .reply_data_size = sizeof(struct strset_reply_data),
+ .request_policy = strset_get_policy,
+ .allow_nodev_do = true,
+
+ .parse_request = strset_parse_request,
+ .prepare_data = strset_prepare_data,
+ .reply_size = strset_reply_size,
+ .fill_reply = strset_fill_reply,
+ .cleanup_data = strset_cleanup_data,
+};
diff --git a/net/ethtool/wol.c b/net/ethtool/wol.c
new file mode 100644
index 000000000000..e1b8a65b64c4
--- /dev/null
+++ b/net/ethtool/wol.c
@@ -0,0 +1,177 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include "netlink.h"
+#include "common.h"
+#include "bitset.h"
+
+struct wol_req_info {
+ struct ethnl_req_info base;
+};
+
+struct wol_reply_data {
+ struct ethnl_reply_data base;
+ struct ethtool_wolinfo wol;
+ bool show_sopass;
+};
+
+#define WOL_REPDATA(__reply_base) \
+ container_of(__reply_base, struct wol_reply_data, base)
+
+static const struct nla_policy
+wol_get_policy[ETHTOOL_A_WOL_MAX + 1] = {
+ [ETHTOOL_A_WOL_UNSPEC] = { .type = NLA_REJECT },
+ [ETHTOOL_A_WOL_HEADER] = { .type = NLA_NESTED },
+ [ETHTOOL_A_WOL_MODES] = { .type = NLA_REJECT },
+ [ETHTOOL_A_WOL_SOPASS] = { .type = NLA_REJECT },
+};
+
+static int wol_prepare_data(const struct ethnl_req_info *req_base,
+ struct ethnl_reply_data *reply_base,
+ struct genl_info *info)
+{
+ struct wol_reply_data *data = WOL_REPDATA(reply_base);
+ struct net_device *dev = reply_base->dev;
+ int ret;
+
+ if (!dev->ethtool_ops->get_wol)
+ return -EOPNOTSUPP;
+
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ return ret;
+ dev->ethtool_ops->get_wol(dev, &data->wol);
+ ethnl_ops_complete(dev);
+ /* do not include password in notifications */
+ data->show_sopass = info && (data->wol.supported & WAKE_MAGICSECURE);
+
+ return 0;
+}
+
+static int wol_reply_size(const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+ const struct wol_reply_data *data = WOL_REPDATA(reply_base);
+ int len;
+
+ len = ethnl_bitset32_size(&data->wol.wolopts, &data->wol.supported,
+ WOL_MODE_COUNT, wol_mode_names, compact);
+ if (len < 0)
+ return len;
+ if (data->show_sopass)
+ len += nla_total_size(sizeof(data->wol.sopass));
+
+ return len;
+}
+
+static int wol_fill_reply(struct sk_buff *skb,
+ const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+ const struct wol_reply_data *data = WOL_REPDATA(reply_base);
+ int ret;
+
+ ret = ethnl_put_bitset32(skb, ETHTOOL_A_WOL_MODES, &data->wol.wolopts,
+ &data->wol.supported, WOL_MODE_COUNT,
+ wol_mode_names, compact);
+ if (ret < 0)
+ return ret;
+ if (data->show_sopass &&
+ nla_put(skb, ETHTOOL_A_WOL_SOPASS, sizeof(data->wol.sopass),
+ data->wol.sopass))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+const struct ethnl_request_ops ethnl_wol_request_ops = {
+ .request_cmd = ETHTOOL_MSG_WOL_GET,
+ .reply_cmd = ETHTOOL_MSG_WOL_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_WOL_HEADER,
+ .max_attr = ETHTOOL_A_WOL_MAX,
+ .req_info_size = sizeof(struct wol_req_info),
+ .reply_data_size = sizeof(struct wol_reply_data),
+ .request_policy = wol_get_policy,
+
+ .prepare_data = wol_prepare_data,
+ .reply_size = wol_reply_size,
+ .fill_reply = wol_fill_reply,
+};
+
+/* WOL_SET */
+
+static const struct nla_policy
+wol_set_policy[ETHTOOL_A_WOL_MAX + 1] = {
+ [ETHTOOL_A_WOL_UNSPEC] = { .type = NLA_REJECT },
+ [ETHTOOL_A_WOL_HEADER] = { .type = NLA_NESTED },
+ [ETHTOOL_A_WOL_MODES] = { .type = NLA_NESTED },
+ [ETHTOOL_A_WOL_SOPASS] = { .type = NLA_BINARY,
+ .len = SOPASS_MAX },
+};
+
+int ethnl_set_wol(struct sk_buff *skb, struct genl_info *info)
+{
+ struct ethtool_wolinfo wol = { .cmd = ETHTOOL_GWOL };
+ struct nlattr *tb[ETHTOOL_A_WOL_MAX + 1];
+ struct ethnl_req_info req_info = {};
+ struct net_device *dev;
+ bool mod = false;
+ int ret;
+
+ ret = nlmsg_parse(info->nlhdr, GENL_HDRLEN, tb, ETHTOOL_A_WOL_MAX,
+ wol_set_policy, info->extack);
+ if (ret < 0)
+ return ret;
+ ret = ethnl_parse_header(&req_info, tb[ETHTOOL_A_WOL_HEADER],
+ genl_info_net(info), info->extack, true);
+ if (ret < 0)
+ return ret;
+ dev = req_info.dev;
+ if (!dev->ethtool_ops->get_wol || !dev->ethtool_ops->set_wol)
+ return -EOPNOTSUPP;
+
+ rtnl_lock();
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ goto out_rtnl;
+
+ dev->ethtool_ops->get_wol(dev, &wol);
+ ret = ethnl_update_bitset32(&wol.wolopts, WOL_MODE_COUNT,
+ tb[ETHTOOL_A_WOL_MODES], wol_mode_names,
+ info->extack, &mod);
+ if (ret < 0)
+ goto out_ops;
+ if (wol.wolopts & ~wol.supported) {
+ NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_WOL_MODES],
+ "cannot enable unsupported WoL mode");
+ ret = -EINVAL;
+ goto out_ops;
+ }
+ if (tb[ETHTOOL_A_WOL_SOPASS]) {
+ if (!(wol.supported & WAKE_MAGICSECURE)) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ tb[ETHTOOL_A_WOL_SOPASS],
+ "magicsecure not supported, cannot set password");
+ ret = -EINVAL;
+ goto out_ops;
+ }
+ ethnl_update_binary(wol.sopass, sizeof(wol.sopass),
+ tb[ETHTOOL_A_WOL_SOPASS], &mod);
+ }
+
+ if (!mod)
+ goto out_ops;
+ ret = dev->ethtool_ops->set_wol(dev, &wol);
+ if (ret)
+ goto out_ops;
+ dev->wol_enabled = !!wol.wolopts;
+ ethtool_notify(dev, ETHTOOL_MSG_WOL_NTF, NULL);
+
+out_ops:
+ ethnl_ops_complete(dev);
+out_rtnl:
+ rtnl_unlock();
+ dev_put(dev);
+ return ret;
+}
diff --git a/net/hsr/hsr_debugfs.c b/net/hsr/hsr_debugfs.c
index 94447974a3c0..d5f709b940ff 100644
--- a/net/hsr/hsr_debugfs.c
+++ b/net/hsr/hsr_debugfs.c
@@ -20,6 +20,8 @@
#include "hsr_main.h"
#include "hsr_framereg.h"
+static struct dentry *hsr_debugfs_root_dir;
+
static void print_mac_address(struct seq_file *sfp, unsigned char *mac)
{
seq_printf(sfp, "%02x:%02x:%02x:%02x:%02x:%02x:",
@@ -63,8 +65,20 @@ hsr_node_table_open(struct inode *inode, struct file *filp)
return single_open(filp, hsr_node_table_show, inode->i_private);
}
+void hsr_debugfs_rename(struct net_device *dev)
+{
+ struct hsr_priv *priv = netdev_priv(dev);
+ struct dentry *d;
+
+ d = debugfs_rename(hsr_debugfs_root_dir, priv->node_tbl_root,
+ hsr_debugfs_root_dir, dev->name);
+ if (IS_ERR(d))
+ netdev_warn(dev, "failed to rename\n");
+ else
+ priv->node_tbl_root = d;
+}
+
static const struct file_operations hsr_fops = {
- .owner = THIS_MODULE,
.open = hsr_node_table_open,
.read = seq_read,
.llseek = seq_lseek,
@@ -78,15 +92,14 @@ static const struct file_operations hsr_fops = {
* When debugfs is configured this routine sets up the node_table file per
* hsr device for dumping the node_table entries
*/
-int hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev)
+void hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev)
{
- int rc = -1;
struct dentry *de = NULL;
- de = debugfs_create_dir(hsr_dev->name, NULL);
- if (!de) {
- pr_err("Cannot create hsr debugfs root\n");
- return rc;
+ de = debugfs_create_dir(hsr_dev->name, hsr_debugfs_root_dir);
+ if (IS_ERR(de)) {
+ pr_err("Cannot create hsr debugfs directory\n");
+ return;
}
priv->node_tbl_root = de;
@@ -94,13 +107,13 @@ int hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev)
de = debugfs_create_file("node_table", S_IFREG | 0444,
priv->node_tbl_root, priv,
&hsr_fops);
- if (!de) {
- pr_err("Cannot create hsr node_table directory\n");
- return rc;
+ if (IS_ERR(de)) {
+ pr_err("Cannot create hsr node_table file\n");
+ debugfs_remove(priv->node_tbl_root);
+ priv->node_tbl_root = NULL;
+ return;
}
priv->node_tbl_file = de;
-
- return 0;
}
/* hsr_debugfs_term - Tear down debugfs intrastructure
@@ -117,3 +130,18 @@ hsr_debugfs_term(struct hsr_priv *priv)
debugfs_remove(priv->node_tbl_root);
priv->node_tbl_root = NULL;
}
+
+void hsr_debugfs_create_root(void)
+{
+ hsr_debugfs_root_dir = debugfs_create_dir("hsr", NULL);
+ if (IS_ERR(hsr_debugfs_root_dir)) {
+ pr_err("Cannot create hsr debugfs root directory\n");
+ hsr_debugfs_root_dir = NULL;
+ }
+}
+
+void hsr_debugfs_remove_root(void)
+{
+ /* debugfs_remove() internally checks NULL and ERROR */
+ debugfs_remove(hsr_debugfs_root_dir);
+}
diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c
index f509b495451a..c7bd6c49fadf 100644
--- a/net/hsr/hsr_device.c
+++ b/net/hsr/hsr_device.c
@@ -227,8 +227,13 @@ static int hsr_dev_xmit(struct sk_buff *skb, struct net_device *dev)
struct hsr_port *master;
master = hsr_port_get_hsr(hsr, HSR_PT_MASTER);
- skb->dev = master->dev;
- hsr_forward_skb(skb, master);
+ if (master) {
+ skb->dev = master->dev;
+ hsr_forward_skb(skb, master);
+ } else {
+ atomic_long_inc(&dev->tx_dropped);
+ dev_kfree_skb_any(skb);
+ }
return NETDEV_TX_OK;
}
@@ -267,6 +272,8 @@ static void send_hsr_supervision_frame(struct hsr_port *master,
skb->dev->dev_addr, skb->len) <= 0)
goto out;
skb_reset_mac_header(skb);
+ skb_reset_network_header(skb);
+ skb_reset_transport_header(skb);
if (hsr_ver > 0) {
hsr_tag = skb_put(skb, sizeof(struct hsr_tag));
@@ -363,7 +370,7 @@ static void hsr_dev_destroy(struct net_device *hsr_dev)
del_timer_sync(&hsr->prune_timer);
del_timer_sync(&hsr->announce_timer);
- hsr_del_self_node(&hsr->self_node_db);
+ hsr_del_self_node(hsr);
hsr_del_nodes(&hsr->node_db);
}
@@ -435,11 +442,12 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2],
INIT_LIST_HEAD(&hsr->ports);
INIT_LIST_HEAD(&hsr->node_db);
INIT_LIST_HEAD(&hsr->self_node_db);
+ spin_lock_init(&hsr->list_lock);
ether_addr_copy(hsr_dev->dev_addr, slave[0]->dev_addr);
/* Make sure we recognize frames from ourselves in hsr_rcv() */
- res = hsr_create_self_node(&hsr->self_node_db, hsr_dev->dev_addr,
+ res = hsr_create_self_node(hsr, hsr_dev->dev_addr,
slave[1]->dev_addr);
if (res < 0)
return res;
@@ -472,31 +480,32 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2],
res = hsr_add_port(hsr, hsr_dev, HSR_PT_MASTER);
if (res)
- goto err_add_port;
+ goto err_add_master;
res = register_netdevice(hsr_dev);
if (res)
- goto fail;
+ goto err_unregister;
res = hsr_add_port(hsr, slave[0], HSR_PT_SLAVE_A);
if (res)
- goto fail;
+ goto err_add_slaves;
+
res = hsr_add_port(hsr, slave[1], HSR_PT_SLAVE_B);
if (res)
- goto fail;
+ goto err_add_slaves;
+ hsr_debugfs_init(hsr, hsr_dev);
mod_timer(&hsr->prune_timer, jiffies + msecs_to_jiffies(PRUNE_PERIOD));
- res = hsr_debugfs_init(hsr, hsr_dev);
- if (res)
- goto fail;
return 0;
-fail:
+err_add_slaves:
+ unregister_netdevice(hsr_dev);
+err_unregister:
list_for_each_entry_safe(port, tmp, &hsr->ports, port_list)
hsr_del_port(port);
-err_add_port:
- hsr_del_self_node(&hsr->self_node_db);
+err_add_master:
+ hsr_del_self_node(hsr);
return res;
}
diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c
index 292be446007b..364ea2cc028e 100644
--- a/net/hsr/hsr_framereg.c
+++ b/net/hsr/hsr_framereg.c
@@ -35,7 +35,6 @@ static bool seq_nr_after(u16 a, u16 b)
}
#define seq_nr_before(a, b) seq_nr_after((b), (a))
-#define seq_nr_after_or_eq(a, b) (!seq_nr_before((a), (b)))
#define seq_nr_before_or_eq(a, b) (!seq_nr_after((a), (b)))
bool hsr_addr_is_self(struct hsr_priv *hsr, unsigned char *addr)
@@ -75,10 +74,11 @@ static struct hsr_node *find_node_by_addr_A(struct list_head *node_db,
/* Helper for device init; the self_node_db is used in hsr_rcv() to recognize
* frames from self that's been looped over the HSR ring.
*/
-int hsr_create_self_node(struct list_head *self_node_db,
+int hsr_create_self_node(struct hsr_priv *hsr,
unsigned char addr_a[ETH_ALEN],
unsigned char addr_b[ETH_ALEN])
{
+ struct list_head *self_node_db = &hsr->self_node_db;
struct hsr_node *node, *oldnode;
node = kmalloc(sizeof(*node), GFP_KERNEL);
@@ -88,33 +88,33 @@ int hsr_create_self_node(struct list_head *self_node_db,
ether_addr_copy(node->macaddress_A, addr_a);
ether_addr_copy(node->macaddress_B, addr_b);
- rcu_read_lock();
+ spin_lock_bh(&hsr->list_lock);
oldnode = list_first_or_null_rcu(self_node_db,
struct hsr_node, mac_list);
if (oldnode) {
list_replace_rcu(&oldnode->mac_list, &node->mac_list);
- rcu_read_unlock();
- synchronize_rcu();
- kfree(oldnode);
+ spin_unlock_bh(&hsr->list_lock);
+ kfree_rcu(oldnode, rcu_head);
} else {
- rcu_read_unlock();
list_add_tail_rcu(&node->mac_list, self_node_db);
+ spin_unlock_bh(&hsr->list_lock);
}
return 0;
}
-void hsr_del_self_node(struct list_head *self_node_db)
+void hsr_del_self_node(struct hsr_priv *hsr)
{
+ struct list_head *self_node_db = &hsr->self_node_db;
struct hsr_node *node;
- rcu_read_lock();
+ spin_lock_bh(&hsr->list_lock);
node = list_first_or_null_rcu(self_node_db, struct hsr_node, mac_list);
- rcu_read_unlock();
if (node) {
list_del_rcu(&node->mac_list);
- kfree(node);
+ kfree_rcu(node, rcu_head);
}
+ spin_unlock_bh(&hsr->list_lock);
}
void hsr_del_nodes(struct list_head *node_db)
@@ -130,30 +130,43 @@ void hsr_del_nodes(struct list_head *node_db)
* seq_out is used to initialize filtering of outgoing duplicate frames
* originating from the newly added node.
*/
-struct hsr_node *hsr_add_node(struct list_head *node_db, unsigned char addr[],
- u16 seq_out)
+static struct hsr_node *hsr_add_node(struct hsr_priv *hsr,
+ struct list_head *node_db,
+ unsigned char addr[],
+ u16 seq_out)
{
- struct hsr_node *node;
+ struct hsr_node *new_node, *node;
unsigned long now;
int i;
- node = kzalloc(sizeof(*node), GFP_ATOMIC);
- if (!node)
+ new_node = kzalloc(sizeof(*new_node), GFP_ATOMIC);
+ if (!new_node)
return NULL;
- ether_addr_copy(node->macaddress_A, addr);
+ ether_addr_copy(new_node->macaddress_A, addr);
/* We are only interested in time diffs here, so use current jiffies
* as initialization. (0 could trigger an spurious ring error warning).
*/
now = jiffies;
for (i = 0; i < HSR_PT_PORTS; i++)
- node->time_in[i] = now;
+ new_node->time_in[i] = now;
for (i = 0; i < HSR_PT_PORTS; i++)
- node->seq_out[i] = seq_out;
-
- list_add_tail_rcu(&node->mac_list, node_db);
+ new_node->seq_out[i] = seq_out;
+ spin_lock_bh(&hsr->list_lock);
+ list_for_each_entry_rcu(node, node_db, mac_list) {
+ if (ether_addr_equal(node->macaddress_A, addr))
+ goto out;
+ if (ether_addr_equal(node->macaddress_B, addr))
+ goto out;
+ }
+ list_add_tail_rcu(&new_node->mac_list, node_db);
+ spin_unlock_bh(&hsr->list_lock);
+ return new_node;
+out:
+ spin_unlock_bh(&hsr->list_lock);
+ kfree(new_node);
return node;
}
@@ -163,6 +176,7 @@ struct hsr_node *hsr_get_node(struct hsr_port *port, struct sk_buff *skb,
bool is_sup)
{
struct list_head *node_db = &port->hsr->node_db;
+ struct hsr_priv *hsr = port->hsr;
struct hsr_node *node;
struct ethhdr *ethhdr;
u16 seq_out;
@@ -196,7 +210,7 @@ struct hsr_node *hsr_get_node(struct hsr_port *port, struct sk_buff *skb,
seq_out = HSR_SEQNR_START;
}
- return hsr_add_node(node_db, ethhdr->h_source, seq_out);
+ return hsr_add_node(hsr, node_db, ethhdr->h_source, seq_out);
}
/* Use the Supervision frame's info about an eventual macaddress_B for merging
@@ -206,10 +220,11 @@ struct hsr_node *hsr_get_node(struct hsr_port *port, struct sk_buff *skb,
void hsr_handle_sup_frame(struct sk_buff *skb, struct hsr_node *node_curr,
struct hsr_port *port_rcv)
{
- struct ethhdr *ethhdr;
- struct hsr_node *node_real;
+ struct hsr_priv *hsr = port_rcv->hsr;
struct hsr_sup_payload *hsr_sp;
+ struct hsr_node *node_real;
struct list_head *node_db;
+ struct ethhdr *ethhdr;
int i;
ethhdr = (struct ethhdr *)skb_mac_header(skb);
@@ -231,7 +246,7 @@ void hsr_handle_sup_frame(struct sk_buff *skb, struct hsr_node *node_curr,
node_real = find_node_by_addr_A(node_db, hsr_sp->macaddress_A);
if (!node_real)
/* No frame received from AddrA of this node yet */
- node_real = hsr_add_node(node_db, hsr_sp->macaddress_A,
+ node_real = hsr_add_node(hsr, node_db, hsr_sp->macaddress_A,
HSR_SEQNR_START - 1);
if (!node_real)
goto done; /* No mem */
@@ -252,7 +267,9 @@ void hsr_handle_sup_frame(struct sk_buff *skb, struct hsr_node *node_curr,
}
node_real->addr_B_port = port_rcv->type;
+ spin_lock_bh(&hsr->list_lock);
list_del_rcu(&node_curr->mac_list);
+ spin_unlock_bh(&hsr->list_lock);
kfree_rcu(node_curr, rcu_head);
done:
@@ -368,12 +385,13 @@ void hsr_prune_nodes(struct timer_list *t)
{
struct hsr_priv *hsr = from_timer(hsr, t, prune_timer);
struct hsr_node *node;
+ struct hsr_node *tmp;
struct hsr_port *port;
unsigned long timestamp;
unsigned long time_a, time_b;
- rcu_read_lock();
- list_for_each_entry_rcu(node, &hsr->node_db, mac_list) {
+ spin_lock_bh(&hsr->list_lock);
+ list_for_each_entry_safe(node, tmp, &hsr->node_db, mac_list) {
/* Don't prune own node. Neither time_in[HSR_PT_SLAVE_A]
* nor time_in[HSR_PT_SLAVE_B], will ever be updated for
* the master port. Thus the master node will be repeatedly
@@ -421,7 +439,7 @@ void hsr_prune_nodes(struct timer_list *t)
kfree_rcu(node, rcu_head);
}
}
- rcu_read_unlock();
+ spin_unlock_bh(&hsr->list_lock);
/* Restart timer */
mod_timer(&hsr->prune_timer,
diff --git a/net/hsr/hsr_framereg.h b/net/hsr/hsr_framereg.h
index 89a3ce38151d..0f0fa12b4329 100644
--- a/net/hsr/hsr_framereg.h
+++ b/net/hsr/hsr_framereg.h
@@ -12,10 +12,8 @@
struct hsr_node;
-void hsr_del_self_node(struct list_head *self_node_db);
+void hsr_del_self_node(struct hsr_priv *hsr);
void hsr_del_nodes(struct list_head *node_db);
-struct hsr_node *hsr_add_node(struct list_head *node_db, unsigned char addr[],
- u16 seq_out);
struct hsr_node *hsr_get_node(struct hsr_port *port, struct sk_buff *skb,
bool is_sup);
void hsr_handle_sup_frame(struct sk_buff *skb, struct hsr_node *node_curr,
@@ -33,7 +31,7 @@ int hsr_register_frame_out(struct hsr_port *port, struct hsr_node *node,
void hsr_prune_nodes(struct timer_list *t);
-int hsr_create_self_node(struct list_head *self_node_db,
+int hsr_create_self_node(struct hsr_priv *hsr,
unsigned char addr_a[ETH_ALEN],
unsigned char addr_b[ETH_ALEN]);
diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c
index b9988a662ee1..9e389accbfc7 100644
--- a/net/hsr/hsr_main.c
+++ b/net/hsr/hsr_main.c
@@ -45,6 +45,10 @@ static int hsr_netdev_notify(struct notifier_block *nb, unsigned long event,
case NETDEV_CHANGE: /* Link (carrier) state changes */
hsr_check_carrier_and_operstate(hsr);
break;
+ case NETDEV_CHANGENAME:
+ if (is_hsr_master(dev))
+ hsr_debugfs_rename(dev);
+ break;
case NETDEV_CHANGEADDR:
if (port->type == HSR_PT_MASTER) {
/* This should not happen since there's no
@@ -64,7 +68,7 @@ static int hsr_netdev_notify(struct notifier_block *nb, unsigned long event,
/* Make sure we recognize frames from ourselves in hsr_rcv() */
port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B);
- res = hsr_create_self_node(&hsr->self_node_db,
+ res = hsr_create_self_node(hsr,
master->dev->dev_addr,
port ?
port->dev->dev_addr :
@@ -123,6 +127,7 @@ static void __exit hsr_exit(void)
{
unregister_netdevice_notifier(&hsr_nb);
hsr_netlink_exit();
+ hsr_debugfs_remove_root();
}
module_init(hsr_init);
diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h
index 96fac696a1e1..754d84b217f0 100644
--- a/net/hsr/hsr_main.h
+++ b/net/hsr/hsr_main.h
@@ -160,8 +160,9 @@ struct hsr_priv {
int announce_count;
u16 sequence_nr;
u16 sup_sequence_nr; /* For HSRv1 separate seq_nr for supervision */
- u8 prot_version; /* Indicate if HSRv0 or HSRv1. */
- spinlock_t seqnr_lock; /* locking for sequence_nr */
+ u8 prot_version; /* Indicate if HSRv0 or HSRv1. */
+ spinlock_t seqnr_lock; /* locking for sequence_nr */
+ spinlock_t list_lock; /* locking for node list */
unsigned char sup_multicast_addr[ETH_ALEN];
#ifdef CONFIG_DEBUG_FS
struct dentry *node_tbl_root;
@@ -184,17 +185,24 @@ static inline u16 hsr_get_skb_sequence_nr(struct sk_buff *skb)
}
#if IS_ENABLED(CONFIG_DEBUG_FS)
-int hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev);
+void hsr_debugfs_rename(struct net_device *dev);
+void hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev);
void hsr_debugfs_term(struct hsr_priv *priv);
+void hsr_debugfs_create_root(void);
+void hsr_debugfs_remove_root(void);
#else
-static inline int hsr_debugfs_init(struct hsr_priv *priv,
- struct net_device *hsr_dev)
+static inline void hsr_debugfs_rename(struct net_device *dev)
{
- return 0;
}
-
+static inline void hsr_debugfs_init(struct hsr_priv *priv,
+ struct net_device *hsr_dev)
+{}
static inline void hsr_debugfs_term(struct hsr_priv *priv)
{}
+static inline void hsr_debugfs_create_root(void)
+{}
+static inline void hsr_debugfs_remove_root(void)
+{}
#endif
#endif /* __HSR_PRIVATE_H */
diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c
index 8f8337f893ba..8dc0547f01d0 100644
--- a/net/hsr/hsr_netlink.c
+++ b/net/hsr/hsr_netlink.c
@@ -476,6 +476,7 @@ int __init hsr_netlink_init(void)
if (rc)
goto fail_genl_register_family;
+ hsr_debugfs_create_root();
return 0;
fail_genl_register_family:
diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c
index ee561297d8a7..fbfd0db182b7 100644
--- a/net/hsr/hsr_slave.c
+++ b/net/hsr/hsr_slave.c
@@ -27,6 +27,8 @@ static rx_handler_result_t hsr_handle_frame(struct sk_buff **pskb)
rcu_read_lock(); /* hsr->node_db, hsr->ports */
port = hsr_port_get_rcu(skb->dev);
+ if (!port)
+ goto finish_pass;
if (hsr_addr_is_self(port->hsr, eth_hdr(skb)->h_source)) {
/* Directly kill frames sent by ourselves */
diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c
index 3297e7fa9945..c0b107cdd715 100644
--- a/net/ieee802154/6lowpan/core.c
+++ b/net/ieee802154/6lowpan/core.c
@@ -58,13 +58,6 @@ static const struct header_ops lowpan_header_ops = {
.create = lowpan_header_create,
};
-static int lowpan_dev_init(struct net_device *ldev)
-{
- netdev_lockdep_set_classes(ldev);
-
- return 0;
-}
-
static int lowpan_open(struct net_device *dev)
{
if (!open_count)
@@ -96,7 +89,6 @@ static int lowpan_get_iflink(const struct net_device *dev)
}
static const struct net_device_ops lowpan_netdev_ops = {
- .ndo_init = lowpan_dev_init,
.ndo_start_xmit = lowpan_xmit,
.ndo_open = lowpan_open,
.ndo_stop = lowpan_stop,
diff --git a/net/ieee802154/core.c b/net/ieee802154/core.c
index 60b7ac56a1f5..de259b5170ab 100644
--- a/net/ieee802154/core.c
+++ b/net/ieee802154/core.c
@@ -23,11 +23,6 @@
LIST_HEAD(cfg802154_rdev_list);
int cfg802154_rdev_list_generation;
-static int wpan_phy_match(struct device *dev, const void *data)
-{
- return !strcmp(dev_name(dev), (const char *)data);
-}
-
struct wpan_phy *wpan_phy_find(const char *str)
{
struct device *dev;
@@ -35,7 +30,7 @@ struct wpan_phy *wpan_phy_find(const char *str)
if (WARN_ON(!str))
return NULL;
- dev = class_find_device(&wpan_phy_class, NULL, str, wpan_phy_match);
+ dev = class_find_device_by_name(&wpan_phy_class, str);
if (!dev)
return NULL;
diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c
index ffcfcef76291..7c5a1aa5adb4 100644
--- a/net/ieee802154/nl802154.c
+++ b/net/ieee802154/nl802154.c
@@ -236,21 +236,14 @@ nl802154_prepare_wpan_dev_dump(struct sk_buff *skb,
struct cfg802154_registered_device **rdev,
struct wpan_dev **wpan_dev)
{
+ const struct genl_dumpit_info *info = genl_dumpit_info(cb);
int err;
rtnl_lock();
if (!cb->args[0]) {
- err = nlmsg_parse_deprecated(cb->nlh,
- GENL_HDRLEN + nl802154_fam.hdrsize,
- genl_family_attrbuf(&nl802154_fam),
- nl802154_fam.maxattr,
- nl802154_policy, NULL);
- if (err)
- goto out_unlock;
-
*wpan_dev = __cfg802154_wpan_dev_from_attrs(sock_net(skb->sk),
- genl_family_attrbuf(&nl802154_fam));
+ info->attrs);
if (IS_ERR(*wpan_dev)) {
err = PTR_ERR(*wpan_dev);
goto out_unlock;
@@ -557,17 +550,8 @@ static int nl802154_dump_wpan_phy_parse(struct sk_buff *skb,
struct netlink_callback *cb,
struct nl802154_dump_wpan_phy_state *state)
{
- struct nlattr **tb = genl_family_attrbuf(&nl802154_fam);
- int ret = nlmsg_parse_deprecated(cb->nlh,
- GENL_HDRLEN + nl802154_fam.hdrsize,
- tb, nl802154_fam.maxattr,
- nl802154_policy, NULL);
-
- /* TODO check if we can handle error here,
- * we have no backward compatibility
- */
- if (ret)
- return 0;
+ const struct genl_dumpit_info *info = genl_dumpit_info(cb);
+ struct nlattr **tb = info->attrs;
if (tb[NL802154_ATTR_WPAN_PHY])
state->filter_wpan_phy = nla_get_u32(tb[NL802154_ATTR_WPAN_PHY]);
@@ -2203,7 +2187,8 @@ static void nl802154_post_doit(const struct genl_ops *ops, struct sk_buff *skb,
static const struct genl_ops nl802154_ops[] = {
{
.cmd = NL802154_CMD_GET_WPAN_PHY,
- .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .validate = GENL_DONT_VALIDATE_STRICT |
+ GENL_DONT_VALIDATE_DUMP_STRICT,
.doit = nl802154_get_wpan_phy,
.dumpit = nl802154_dump_wpan_phy,
.done = nl802154_dump_wpan_phy_done,
@@ -2343,7 +2328,8 @@ static const struct genl_ops nl802154_ops[] = {
},
{
.cmd = NL802154_CMD_GET_SEC_KEY,
- .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .validate = GENL_DONT_VALIDATE_STRICT |
+ GENL_DONT_VALIDATE_DUMP_STRICT,
/* TODO .doit by matching key id? */
.dumpit = nl802154_dump_llsec_key,
.flags = GENL_ADMIN_PERM,
@@ -2369,7 +2355,8 @@ static const struct genl_ops nl802154_ops[] = {
/* TODO unique identifier must short+pan OR extended_addr */
{
.cmd = NL802154_CMD_GET_SEC_DEV,
- .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .validate = GENL_DONT_VALIDATE_STRICT |
+ GENL_DONT_VALIDATE_DUMP_STRICT,
/* TODO .doit by matching extended_addr? */
.dumpit = nl802154_dump_llsec_dev,
.flags = GENL_ADMIN_PERM,
@@ -2395,7 +2382,8 @@ static const struct genl_ops nl802154_ops[] = {
/* TODO remove complete devkey, put it as nested? */
{
.cmd = NL802154_CMD_GET_SEC_DEVKEY,
- .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .validate = GENL_DONT_VALIDATE_STRICT |
+ GENL_DONT_VALIDATE_DUMP_STRICT,
/* TODO doit by matching ??? */
.dumpit = nl802154_dump_llsec_devkey,
.flags = GENL_ADMIN_PERM,
@@ -2420,7 +2408,8 @@ static const struct genl_ops nl802154_ops[] = {
},
{
.cmd = NL802154_CMD_GET_SEC_LEVEL,
- .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .validate = GENL_DONT_VALIDATE_STRICT |
+ GENL_DONT_VALIDATE_DUMP_STRICT,
/* TODO .doit by matching frame_type? */
.dumpit = nl802154_dump_llsec_seclevel,
.flags = GENL_ADMIN_PERM,
diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c
index dacbd58e1799..d93d4531aa9b 100644
--- a/net/ieee802154/socket.c
+++ b/net/ieee802154/socket.c
@@ -1008,6 +1008,9 @@ static int ieee802154_create(struct net *net, struct socket *sock,
switch (sock->type) {
case SOCK_RAW:
+ rc = -EPERM;
+ if (!capable(CAP_NET_RAW))
+ goto out;
proto = &ieee802154_raw_prot;
ops = &ieee802154_raw_ops;
break;
@@ -1092,7 +1095,7 @@ static struct packet_type ieee802154_packet_type = {
static int __init af_ieee802154_init(void)
{
- int rc = -EINVAL;
+ int rc;
rc = proto_register(&ieee802154_raw_prot, 1);
if (rc)
diff --git a/net/ife/Kconfig b/net/ife/Kconfig
index 6cd1f6d18f30..bcf650564db4 100644
--- a/net/ife/Kconfig
+++ b/net/ife/Kconfig
@@ -5,7 +5,7 @@
menuconfig NET_IFE
depends on NET
- tristate "Inter-FE based on IETF ForCES InterFE LFB"
+ tristate "Inter-FE based on IETF ForCES InterFE LFB"
default n
help
Say Y here to add support of IFE encapsulation protocol
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 974de4d20f25..f96bd489b362 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -180,8 +180,8 @@ config NET_IPIP
config NET_IPGRE_DEMUX
tristate "IP: GRE demultiplexer"
help
- This is helper module to demultiplex GRE packets on GRE version field criteria.
- Required by ip_gre and pptp modules.
+ This is helper module to demultiplex GRE packets on GRE version field criteria.
+ Required by ip_gre and pptp modules.
config NET_IP_TUNNEL
tristate
@@ -378,6 +378,17 @@ config INET_ESP_OFFLOAD
If unsure, say N.
+config INET_ESPINTCP
+ bool "IP: ESP in TCP encapsulation (RFC 8229)"
+ depends on XFRM && INET_ESP
+ select STREAM_PARSER
+ select NET_SOCK_MSG
+ help
+ Support for RFC 8229 encapsulation of ESP and IKE over
+ TCP/IPv4 sockets.
+
+ If unsure, say N.
+
config INET_IPCOMP
tristate "IP: IPComp transformation"
select INET_XFRM_TUNNEL
@@ -459,200 +470,200 @@ config TCP_CONG_BIC
tristate "Binary Increase Congestion (BIC) control"
default m
---help---
- BIC-TCP is a sender-side only change that ensures a linear RTT
- fairness under large windows while offering both scalability and
- bounded TCP-friendliness. The protocol combines two schemes
- called additive increase and binary search increase. When the
- congestion window is large, additive increase with a large
- increment ensures linear RTT fairness as well as good
- scalability. Under small congestion windows, binary search
- increase provides TCP friendliness.
- See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/
+ BIC-TCP is a sender-side only change that ensures a linear RTT
+ fairness under large windows while offering both scalability and
+ bounded TCP-friendliness. The protocol combines two schemes
+ called additive increase and binary search increase. When the
+ congestion window is large, additive increase with a large
+ increment ensures linear RTT fairness as well as good
+ scalability. Under small congestion windows, binary search
+ increase provides TCP friendliness.
+ See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/
config TCP_CONG_CUBIC
tristate "CUBIC TCP"
default y
---help---
- This is version 2.0 of BIC-TCP which uses a cubic growth function
- among other techniques.
- See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf
+ This is version 2.0 of BIC-TCP which uses a cubic growth function
+ among other techniques.
+ See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf
config TCP_CONG_WESTWOOD
tristate "TCP Westwood+"
default m
---help---
- TCP Westwood+ is a sender-side only modification of the TCP Reno
- protocol stack that optimizes the performance of TCP congestion
- control. It is based on end-to-end bandwidth estimation to set
- congestion window and slow start threshold after a congestion
- episode. Using this estimation, TCP Westwood+ adaptively sets a
- slow start threshold and a congestion window which takes into
- account the bandwidth used at the time congestion is experienced.
- TCP Westwood+ significantly increases fairness wrt TCP Reno in
- wired networks and throughput over wireless links.
+ TCP Westwood+ is a sender-side only modification of the TCP Reno
+ protocol stack that optimizes the performance of TCP congestion
+ control. It is based on end-to-end bandwidth estimation to set
+ congestion window and slow start threshold after a congestion
+ episode. Using this estimation, TCP Westwood+ adaptively sets a
+ slow start threshold and a congestion window which takes into
+ account the bandwidth used at the time congestion is experienced.
+ TCP Westwood+ significantly increases fairness wrt TCP Reno in
+ wired networks and throughput over wireless links.
config TCP_CONG_HTCP
- tristate "H-TCP"
- default m
+ tristate "H-TCP"
+ default m
---help---
- H-TCP is a send-side only modifications of the TCP Reno
- protocol stack that optimizes the performance of TCP
- congestion control for high speed network links. It uses a
- modeswitch to change the alpha and beta parameters of TCP Reno
- based on network conditions and in a way so as to be fair with
- other Reno and H-TCP flows.
+ H-TCP is a send-side only modifications of the TCP Reno
+ protocol stack that optimizes the performance of TCP
+ congestion control for high speed network links. It uses a
+ modeswitch to change the alpha and beta parameters of TCP Reno
+ based on network conditions and in a way so as to be fair with
+ other Reno and H-TCP flows.
config TCP_CONG_HSTCP
tristate "High Speed TCP"
default n
---help---
- Sally Floyd's High Speed TCP (RFC 3649) congestion control.
- A modification to TCP's congestion control mechanism for use
- with large congestion windows. A table indicates how much to
- increase the congestion window by when an ACK is received.
- For more detail see http://www.icir.org/floyd/hstcp.html
+ Sally Floyd's High Speed TCP (RFC 3649) congestion control.
+ A modification to TCP's congestion control mechanism for use
+ with large congestion windows. A table indicates how much to
+ increase the congestion window by when an ACK is received.
+ For more detail see http://www.icir.org/floyd/hstcp.html
config TCP_CONG_HYBLA
tristate "TCP-Hybla congestion control algorithm"
default n
---help---
- TCP-Hybla is a sender-side only change that eliminates penalization of
- long-RTT, large-bandwidth connections, like when satellite legs are
- involved, especially when sharing a common bottleneck with normal
- terrestrial connections.
+ TCP-Hybla is a sender-side only change that eliminates penalization of
+ long-RTT, large-bandwidth connections, like when satellite legs are
+ involved, especially when sharing a common bottleneck with normal
+ terrestrial connections.
config TCP_CONG_VEGAS
tristate "TCP Vegas"
default n
---help---
- TCP Vegas is a sender-side only change to TCP that anticipates
- the onset of congestion by estimating the bandwidth. TCP Vegas
- adjusts the sending rate by modifying the congestion
- window. TCP Vegas should provide less packet loss, but it is
- not as aggressive as TCP Reno.
+ TCP Vegas is a sender-side only change to TCP that anticipates
+ the onset of congestion by estimating the bandwidth. TCP Vegas
+ adjusts the sending rate by modifying the congestion
+ window. TCP Vegas should provide less packet loss, but it is
+ not as aggressive as TCP Reno.
config TCP_CONG_NV
- tristate "TCP NV"
- default n
- ---help---
- TCP NV is a follow up to TCP Vegas. It has been modified to deal with
- 10G networks, measurement noise introduced by LRO, GRO and interrupt
- coalescence. In addition, it will decrease its cwnd multiplicatively
- instead of linearly.
+ tristate "TCP NV"
+ default n
+ ---help---
+ TCP NV is a follow up to TCP Vegas. It has been modified to deal with
+ 10G networks, measurement noise introduced by LRO, GRO and interrupt
+ coalescence. In addition, it will decrease its cwnd multiplicatively
+ instead of linearly.
- Note that in general congestion avoidance (cwnd decreased when # packets
- queued grows) cannot coexist with congestion control (cwnd decreased only
- when there is packet loss) due to fairness issues. One scenario when they
- can coexist safely is when the CA flows have RTTs << CC flows RTTs.
+ Note that in general congestion avoidance (cwnd decreased when # packets
+ queued grows) cannot coexist with congestion control (cwnd decreased only
+ when there is packet loss) due to fairness issues. One scenario when they
+ can coexist safely is when the CA flows have RTTs << CC flows RTTs.
- For further details see http://www.brakmo.org/networking/tcp-nv/
+ For further details see http://www.brakmo.org/networking/tcp-nv/
config TCP_CONG_SCALABLE
tristate "Scalable TCP"
default n
---help---
- Scalable TCP is a sender-side only change to TCP which uses a
- MIMD congestion control algorithm which has some nice scaling
- properties, though is known to have fairness issues.
- See http://www.deneholme.net/tom/scalable/
+ Scalable TCP is a sender-side only change to TCP which uses a
+ MIMD congestion control algorithm which has some nice scaling
+ properties, though is known to have fairness issues.
+ See http://www.deneholme.net/tom/scalable/
config TCP_CONG_LP
tristate "TCP Low Priority"
default n
---help---
- TCP Low Priority (TCP-LP), a distributed algorithm whose goal is
- to utilize only the excess network bandwidth as compared to the
- ``fair share`` of bandwidth as targeted by TCP.
- See http://www-ece.rice.edu/networks/TCP-LP/
+ TCP Low Priority (TCP-LP), a distributed algorithm whose goal is
+ to utilize only the excess network bandwidth as compared to the
+ ``fair share`` of bandwidth as targeted by TCP.
+ See http://www-ece.rice.edu/networks/TCP-LP/
config TCP_CONG_VENO
tristate "TCP Veno"
default n
---help---
- TCP Veno is a sender-side only enhancement of TCP to obtain better
- throughput over wireless networks. TCP Veno makes use of state
- distinguishing to circumvent the difficult judgment of the packet loss
- type. TCP Veno cuts down less congestion window in response to random
- loss packets.
- See <http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=1177186>
+ TCP Veno is a sender-side only enhancement of TCP to obtain better
+ throughput over wireless networks. TCP Veno makes use of state
+ distinguishing to circumvent the difficult judgment of the packet loss
+ type. TCP Veno cuts down less congestion window in response to random
+ loss packets.
+ See <http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=1177186>
config TCP_CONG_YEAH
tristate "YeAH TCP"
select TCP_CONG_VEGAS
default n
---help---
- YeAH-TCP is a sender-side high-speed enabled TCP congestion control
- algorithm, which uses a mixed loss/delay approach to compute the
- congestion window. It's design goals target high efficiency,
- internal, RTT and Reno fairness, resilience to link loss while
- keeping network elements load as low as possible.
+ YeAH-TCP is a sender-side high-speed enabled TCP congestion control
+ algorithm, which uses a mixed loss/delay approach to compute the
+ congestion window. It's design goals target high efficiency,
+ internal, RTT and Reno fairness, resilience to link loss while
+ keeping network elements load as low as possible.
- For further details look here:
- http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf
+ For further details look here:
+ http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf
config TCP_CONG_ILLINOIS
tristate "TCP Illinois"
default n
---help---
- TCP-Illinois is a sender-side modification of TCP Reno for
- high speed long delay links. It uses round-trip-time to
- adjust the alpha and beta parameters to achieve a higher average
- throughput and maintain fairness.
+ TCP-Illinois is a sender-side modification of TCP Reno for
+ high speed long delay links. It uses round-trip-time to
+ adjust the alpha and beta parameters to achieve a higher average
+ throughput and maintain fairness.
- For further details see:
- http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html
+ For further details see:
+ http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html
config TCP_CONG_DCTCP
tristate "DataCenter TCP (DCTCP)"
default n
---help---
- DCTCP leverages Explicit Congestion Notification (ECN) in the network to
- provide multi-bit feedback to the end hosts. It is designed to provide:
+ DCTCP leverages Explicit Congestion Notification (ECN) in the network to
+ provide multi-bit feedback to the end hosts. It is designed to provide:
- - High burst tolerance (incast due to partition/aggregate),
- - Low latency (short flows, queries),
- - High throughput (continuous data updates, large file transfers) with
- commodity, shallow-buffered switches.
+ - High burst tolerance (incast due to partition/aggregate),
+ - Low latency (short flows, queries),
+ - High throughput (continuous data updates, large file transfers) with
+ commodity, shallow-buffered switches.
- All switches in the data center network running DCTCP must support
- ECN marking and be configured for marking when reaching defined switch
- buffer thresholds. The default ECN marking threshold heuristic for
- DCTCP on switches is 20 packets (30KB) at 1Gbps, and 65 packets
- (~100KB) at 10Gbps, but might need further careful tweaking.
+ All switches in the data center network running DCTCP must support
+ ECN marking and be configured for marking when reaching defined switch
+ buffer thresholds. The default ECN marking threshold heuristic for
+ DCTCP on switches is 20 packets (30KB) at 1Gbps, and 65 packets
+ (~100KB) at 10Gbps, but might need further careful tweaking.
- For further details see:
- http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf
+ For further details see:
+ http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf
config TCP_CONG_CDG
tristate "CAIA Delay-Gradient (CDG)"
default n
---help---
- CAIA Delay-Gradient (CDG) is a TCP congestion control that modifies
- the TCP sender in order to:
+ CAIA Delay-Gradient (CDG) is a TCP congestion control that modifies
+ the TCP sender in order to:
o Use the delay gradient as a congestion signal.
o Back off with an average probability that is independent of the RTT.
o Coexist with flows that use loss-based congestion control.
o Tolerate packet loss unrelated to congestion.
- For further details see:
- D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using
- delay gradients." In Networking 2011. Preprint: http://goo.gl/No3vdg
+ For further details see:
+ D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using
+ delay gradients." In Networking 2011. Preprint: http://goo.gl/No3vdg
config TCP_CONG_BBR
tristate "BBR TCP"
default n
---help---
- BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to
- maximize network utilization and minimize queues. It builds an explicit
- model of the the bottleneck delivery rate and path round-trip
- propagation delay. It tolerates packet loss and delay unrelated to
- congestion. It can operate over LAN, WAN, cellular, wifi, or cable
- modem links. It can coexist with flows that use loss-based congestion
- control, and can operate with shallow buffers, deep buffers,
- bufferbloat, policers, or AQM schemes that do not provide a delay
- signal. It requires the fq ("Fair Queue") pacing packet scheduler.
+ BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to
+ maximize network utilization and minimize queues. It builds an explicit
+ model of the the bottleneck delivery rate and path round-trip
+ propagation delay. It tolerates packet loss and delay unrelated to
+ congestion. It can operate over LAN, WAN, cellular, wifi, or cable
+ modem links. It can coexist with flows that use loss-based congestion
+ control, and can operate with shallow buffers, deep buffers,
+ bufferbloat, policers, or AQM schemes that do not provide a delay
+ signal. It requires the fq ("Fair Queue") pacing packet scheduler.
choice
prompt "Default TCP congestion control"
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index d57ecfaf89d4..9d97bace13c8 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -65,3 +65,7 @@ obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
xfrm4_output.o xfrm4_protocol.o
+
+ifeq ($(CONFIG_BPF_JIT),y)
+obj-$(CONFIG_BPF_SYSCALL) += bpf_tcp_ca.o
+endif
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index ed2301ef872e..2fe295432c24 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -208,7 +208,7 @@ int inet_listen(struct socket *sock, int backlog)
if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN)))
goto out;
- sk->sk_max_ack_backlog = backlog;
+ WRITE_ONCE(sk->sk_max_ack_backlog, backlog);
/* Really, if the socket is already in listen state
* we can only allow the backlog to be adjusted.
*/
@@ -495,7 +495,7 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
snum = ntohs(addr->sin_port);
err = -EACCES;
- if (snum && snum < inet_prot_sock(net) &&
+ if (snum && inet_port_requires_bind_service(net, snum) &&
!ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
goto out;
@@ -1845,13 +1845,8 @@ static __net_init int inet_init_net(struct net *net)
return 0;
}
-static __net_exit void inet_exit_net(struct net *net)
-{
-}
-
static __net_initdata struct pernet_operations af_inet_ops = {
.init = inet_init_net,
- .exit = inet_exit_net,
};
static int __init init_inet_pernet_ops(void)
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
new file mode 100644
index 000000000000..574972bc7299
--- /dev/null
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -0,0 +1,252 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook */
+
+#include <linux/types.h>
+#include <linux/bpf_verifier.h>
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/filter.h>
+#include <net/tcp.h>
+
+static u32 optional_ops[] = {
+ offsetof(struct tcp_congestion_ops, init),
+ offsetof(struct tcp_congestion_ops, release),
+ offsetof(struct tcp_congestion_ops, set_state),
+ offsetof(struct tcp_congestion_ops, cwnd_event),
+ offsetof(struct tcp_congestion_ops, in_ack_event),
+ offsetof(struct tcp_congestion_ops, pkts_acked),
+ offsetof(struct tcp_congestion_ops, min_tso_segs),
+ offsetof(struct tcp_congestion_ops, sndbuf_expand),
+ offsetof(struct tcp_congestion_ops, cong_control),
+};
+
+static u32 unsupported_ops[] = {
+ offsetof(struct tcp_congestion_ops, get_info),
+};
+
+static const struct btf_type *tcp_sock_type;
+static u32 tcp_sock_id, sock_id;
+
+static int bpf_tcp_ca_init(struct btf *btf)
+{
+ s32 type_id;
+
+ type_id = btf_find_by_name_kind(btf, "sock", BTF_KIND_STRUCT);
+ if (type_id < 0)
+ return -EINVAL;
+ sock_id = type_id;
+
+ type_id = btf_find_by_name_kind(btf, "tcp_sock", BTF_KIND_STRUCT);
+ if (type_id < 0)
+ return -EINVAL;
+ tcp_sock_id = type_id;
+ tcp_sock_type = btf_type_by_id(btf, tcp_sock_id);
+
+ return 0;
+}
+
+static bool is_optional(u32 member_offset)
+{
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(optional_ops); i++) {
+ if (member_offset == optional_ops[i])
+ return true;
+ }
+
+ return false;
+}
+
+static bool is_unsupported(u32 member_offset)
+{
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(unsupported_ops); i++) {
+ if (member_offset == unsupported_ops[i])
+ return true;
+ }
+
+ return false;
+}
+
+extern struct btf *btf_vmlinux;
+
+static bool bpf_tcp_ca_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS)
+ return false;
+ if (type != BPF_READ)
+ return false;
+ if (off % size != 0)
+ return false;
+
+ if (!btf_ctx_access(off, size, type, prog, info))
+ return false;
+
+ if (info->reg_type == PTR_TO_BTF_ID && info->btf_id == sock_id)
+ /* promote it to tcp_sock */
+ info->btf_id = tcp_sock_id;
+
+ return true;
+}
+
+static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log,
+ const struct btf_type *t, int off,
+ int size, enum bpf_access_type atype,
+ u32 *next_btf_id)
+{
+ size_t end;
+
+ if (atype == BPF_READ)
+ return btf_struct_access(log, t, off, size, atype, next_btf_id);
+
+ if (t != tcp_sock_type) {
+ bpf_log(log, "only read is supported\n");
+ return -EACCES;
+ }
+
+ switch (off) {
+ case bpf_ctx_range(struct inet_connection_sock, icsk_ca_priv):
+ end = offsetofend(struct inet_connection_sock, icsk_ca_priv);
+ break;
+ case offsetof(struct inet_connection_sock, icsk_ack.pending):
+ end = offsetofend(struct inet_connection_sock,
+ icsk_ack.pending);
+ break;
+ case offsetof(struct tcp_sock, snd_cwnd):
+ end = offsetofend(struct tcp_sock, snd_cwnd);
+ break;
+ case offsetof(struct tcp_sock, snd_cwnd_cnt):
+ end = offsetofend(struct tcp_sock, snd_cwnd_cnt);
+ break;
+ case offsetof(struct tcp_sock, snd_ssthresh):
+ end = offsetofend(struct tcp_sock, snd_ssthresh);
+ break;
+ case offsetof(struct tcp_sock, ecn_flags):
+ end = offsetofend(struct tcp_sock, ecn_flags);
+ break;
+ default:
+ bpf_log(log, "no write support to tcp_sock at off %d\n", off);
+ return -EACCES;
+ }
+
+ if (off + size > end) {
+ bpf_log(log,
+ "write access at off %d with size %d beyond the member of tcp_sock ended at %zu\n",
+ off, size, end);
+ return -EACCES;
+ }
+
+ return NOT_INIT;
+}
+
+BPF_CALL_2(bpf_tcp_send_ack, struct tcp_sock *, tp, u32, rcv_nxt)
+{
+ /* bpf_tcp_ca prog cannot have NULL tp */
+ __tcp_send_ack((struct sock *)tp, rcv_nxt);
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_tcp_send_ack_proto = {
+ .func = bpf_tcp_send_ack,
+ .gpl_only = false,
+ /* In case we want to report error later */
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg2_type = ARG_ANYTHING,
+ .btf_id = &tcp_sock_id,
+};
+
+static const struct bpf_func_proto *
+bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id,
+ const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_tcp_send_ack:
+ return &bpf_tcp_send_ack_proto;
+ default:
+ return bpf_base_func_proto(func_id);
+ }
+}
+
+static const struct bpf_verifier_ops bpf_tcp_ca_verifier_ops = {
+ .get_func_proto = bpf_tcp_ca_get_func_proto,
+ .is_valid_access = bpf_tcp_ca_is_valid_access,
+ .btf_struct_access = bpf_tcp_ca_btf_struct_access,
+};
+
+static int bpf_tcp_ca_init_member(const struct btf_type *t,
+ const struct btf_member *member,
+ void *kdata, const void *udata)
+{
+ const struct tcp_congestion_ops *utcp_ca;
+ struct tcp_congestion_ops *tcp_ca;
+ size_t tcp_ca_name_len;
+ int prog_fd;
+ u32 moff;
+
+ utcp_ca = (const struct tcp_congestion_ops *)udata;
+ tcp_ca = (struct tcp_congestion_ops *)kdata;
+
+ moff = btf_member_bit_offset(t, member) / 8;
+ switch (moff) {
+ case offsetof(struct tcp_congestion_ops, flags):
+ if (utcp_ca->flags & ~TCP_CONG_MASK)
+ return -EINVAL;
+ tcp_ca->flags = utcp_ca->flags;
+ return 1;
+ case offsetof(struct tcp_congestion_ops, name):
+ tcp_ca_name_len = strnlen(utcp_ca->name, sizeof(utcp_ca->name));
+ if (!tcp_ca_name_len ||
+ tcp_ca_name_len == sizeof(utcp_ca->name))
+ return -EINVAL;
+ if (tcp_ca_find(utcp_ca->name))
+ return -EEXIST;
+ memcpy(tcp_ca->name, utcp_ca->name, sizeof(tcp_ca->name));
+ return 1;
+ }
+
+ if (!btf_type_resolve_func_ptr(btf_vmlinux, member->type, NULL))
+ return 0;
+
+ /* Ensure bpf_prog is provided for compulsory func ptr */
+ prog_fd = (int)(*(unsigned long *)(udata + moff));
+ if (!prog_fd && !is_optional(moff) && !is_unsupported(moff))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int bpf_tcp_ca_check_member(const struct btf_type *t,
+ const struct btf_member *member)
+{
+ if (is_unsupported(btf_member_bit_offset(t, member) / 8))
+ return -ENOTSUPP;
+ return 0;
+}
+
+static int bpf_tcp_ca_reg(void *kdata)
+{
+ return tcp_register_congestion_control(kdata);
+}
+
+static void bpf_tcp_ca_unreg(void *kdata)
+{
+ tcp_unregister_congestion_control(kdata);
+}
+
+/* Avoid sparse warning. It is only used in bpf_struct_ops.c. */
+extern struct bpf_struct_ops bpf_tcp_congestion_ops;
+
+struct bpf_struct_ops bpf_tcp_congestion_ops = {
+ .verifier_ops = &bpf_tcp_ca_verifier_ops,
+ .reg = bpf_tcp_ca_reg,
+ .unreg = bpf_tcp_ca_unreg,
+ .check_member = bpf_tcp_ca_check_member,
+ .init_member = bpf_tcp_ca_init_member,
+ .init = bpf_tcp_ca_init,
+ .name = "tcp_congestion_ops",
+};
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index 7bd29e694603..4a8550c49202 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -15,6 +15,7 @@
#include <net/sock.h>
#include <net/route.h>
#include <net/tcp_states.h>
+#include <net/sock_reuseport.h>
int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
@@ -69,9 +70,10 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len
}
inet->inet_daddr = fl4->daddr;
inet->inet_dport = usin->sin_port;
+ reuseport_has_conns(sk, true);
sk->sk_state = TCP_ESTABLISHED;
sk_set_txhash(sk);
- inet->inet_id = jiffies;
+ inet->inet_id = prandom_u32();
sk_dst_set(sk, &rt->dst);
err = 0;
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index a4b5bd4d2c89..e4632bd2026d 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1496,11 +1496,6 @@ skip:
}
}
-static bool inetdev_valid_mtu(unsigned int mtu)
-{
- return mtu >= IPV4_MIN_MTU;
-}
-
static void inetdev_send_gratuitous_arp(struct net_device *dev,
struct in_device *in_dev)
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 5c967764041f..103c7d599a3c 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -18,6 +18,8 @@
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/udp.h>
+#include <net/tcp.h>
+#include <net/espintcp.h>
#include <linux/highmem.h>
@@ -117,6 +119,132 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp)
put_page(sg_page(sg));
}
+#ifdef CONFIG_INET_ESPINTCP
+struct esp_tcp_sk {
+ struct sock *sk;
+ struct rcu_head rcu;
+};
+
+static void esp_free_tcp_sk(struct rcu_head *head)
+{
+ struct esp_tcp_sk *esk = container_of(head, struct esp_tcp_sk, rcu);
+
+ sock_put(esk->sk);
+ kfree(esk);
+}
+
+static struct sock *esp_find_tcp_sk(struct xfrm_state *x)
+{
+ struct xfrm_encap_tmpl *encap = x->encap;
+ struct esp_tcp_sk *esk;
+ __be16 sport, dport;
+ struct sock *nsk;
+ struct sock *sk;
+
+ sk = rcu_dereference(x->encap_sk);
+ if (sk && sk->sk_state == TCP_ESTABLISHED)
+ return sk;
+
+ spin_lock_bh(&x->lock);
+ sport = encap->encap_sport;
+ dport = encap->encap_dport;
+ nsk = rcu_dereference_protected(x->encap_sk,
+ lockdep_is_held(&x->lock));
+ if (sk && sk == nsk) {
+ esk = kmalloc(sizeof(*esk), GFP_ATOMIC);
+ if (!esk) {
+ spin_unlock_bh(&x->lock);
+ return ERR_PTR(-ENOMEM);
+ }
+ RCU_INIT_POINTER(x->encap_sk, NULL);
+ esk->sk = sk;
+ call_rcu(&esk->rcu, esp_free_tcp_sk);
+ }
+ spin_unlock_bh(&x->lock);
+
+ sk = inet_lookup_established(xs_net(x), &tcp_hashinfo, x->id.daddr.a4,
+ dport, x->props.saddr.a4, sport, 0);
+ if (!sk)
+ return ERR_PTR(-ENOENT);
+
+ if (!tcp_is_ulp_esp(sk)) {
+ sock_put(sk);
+ return ERR_PTR(-EINVAL);
+ }
+
+ spin_lock_bh(&x->lock);
+ nsk = rcu_dereference_protected(x->encap_sk,
+ lockdep_is_held(&x->lock));
+ if (encap->encap_sport != sport ||
+ encap->encap_dport != dport) {
+ sock_put(sk);
+ sk = nsk ?: ERR_PTR(-EREMCHG);
+ } else if (sk == nsk) {
+ sock_put(sk);
+ } else {
+ rcu_assign_pointer(x->encap_sk, sk);
+ }
+ spin_unlock_bh(&x->lock);
+
+ return sk;
+}
+
+static int esp_output_tcp_finish(struct xfrm_state *x, struct sk_buff *skb)
+{
+ struct sock *sk;
+ int err;
+
+ rcu_read_lock();
+
+ sk = esp_find_tcp_sk(x);
+ err = PTR_ERR_OR_ZERO(sk);
+ if (err)
+ goto out;
+
+ bh_lock_sock(sk);
+ if (sock_owned_by_user(sk))
+ err = espintcp_queue_out(sk, skb);
+ else
+ err = espintcp_push_skb(sk, skb);
+ bh_unlock_sock(sk);
+
+out:
+ rcu_read_unlock();
+ return err;
+}
+
+static int esp_output_tcp_encap_cb(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct xfrm_state *x = dst->xfrm;
+
+ return esp_output_tcp_finish(x, skb);
+}
+
+static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb)
+{
+ int err;
+
+ local_bh_disable();
+ err = xfrm_trans_queue_net(xs_net(x), skb, esp_output_tcp_encap_cb);
+ local_bh_enable();
+
+ /* EINPROGRESS just happens to do the right thing. It
+ * actually means that the skb has been consumed and
+ * isn't coming back.
+ */
+ return err ?: -EINPROGRESS;
+}
+#else
+static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb)
+{
+ kfree_skb(skb);
+
+ return -EOPNOTSUPP;
+}
+#endif
+
static void esp_output_done(struct crypto_async_request *base, int err)
{
struct sk_buff *skb = base->data;
@@ -147,7 +275,11 @@ static void esp_output_done(struct crypto_async_request *base, int err)
secpath_reset(skb);
xfrm_dev_resume(skb);
} else {
- xfrm_output_resume(skb, err);
+ if (!err &&
+ x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP)
+ esp_output_tail_tcp(x, skb);
+ else
+ xfrm_output_resume(skb, err);
}
}
@@ -225,45 +357,100 @@ static void esp_output_fill_trailer(u8 *tail, int tfclen, int plen, __u8 proto)
tail[plen - 1] = proto;
}
-static int esp_output_udp_encap(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp)
+static struct ip_esp_hdr *esp_output_udp_encap(struct sk_buff *skb,
+ int encap_type,
+ struct esp_info *esp,
+ __be16 sport,
+ __be16 dport)
{
- int encap_type;
struct udphdr *uh;
__be32 *udpdata32;
- __be16 sport, dport;
- struct xfrm_encap_tmpl *encap = x->encap;
- struct ip_esp_hdr *esph = esp->esph;
unsigned int len;
- spin_lock_bh(&x->lock);
- sport = encap->encap_sport;
- dport = encap->encap_dport;
- encap_type = encap->encap_type;
- spin_unlock_bh(&x->lock);
-
len = skb->len + esp->tailen - skb_transport_offset(skb);
- if (len + sizeof(struct iphdr) >= IP_MAX_MTU)
- return -EMSGSIZE;
+ if (len + sizeof(struct iphdr) > IP_MAX_MTU)
+ return ERR_PTR(-EMSGSIZE);
- uh = (struct udphdr *)esph;
+ uh = (struct udphdr *)esp->esph;
uh->source = sport;
uh->dest = dport;
uh->len = htons(len);
uh->check = 0;
+ *skb_mac_header(skb) = IPPROTO_UDP;
+
+ if (encap_type == UDP_ENCAP_ESPINUDP_NON_IKE) {
+ udpdata32 = (__be32 *)(uh + 1);
+ udpdata32[0] = udpdata32[1] = 0;
+ return (struct ip_esp_hdr *)(udpdata32 + 2);
+ }
+
+ return (struct ip_esp_hdr *)(uh + 1);
+}
+
+#ifdef CONFIG_INET_ESPINTCP
+static struct ip_esp_hdr *esp_output_tcp_encap(struct xfrm_state *x,
+ struct sk_buff *skb,
+ struct esp_info *esp)
+{
+ __be16 *lenp = (void *)esp->esph;
+ struct ip_esp_hdr *esph;
+ unsigned int len;
+ struct sock *sk;
+
+ len = skb->len + esp->tailen - skb_transport_offset(skb);
+ if (len > IP_MAX_MTU)
+ return ERR_PTR(-EMSGSIZE);
+
+ rcu_read_lock();
+ sk = esp_find_tcp_sk(x);
+ rcu_read_unlock();
+
+ if (IS_ERR(sk))
+ return ERR_CAST(sk);
+
+ *lenp = htons(len);
+ esph = (struct ip_esp_hdr *)(lenp + 1);
+
+ return esph;
+}
+#else
+static struct ip_esp_hdr *esp_output_tcp_encap(struct xfrm_state *x,
+ struct sk_buff *skb,
+ struct esp_info *esp)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+#endif
+
+static int esp_output_encap(struct xfrm_state *x, struct sk_buff *skb,
+ struct esp_info *esp)
+{
+ struct xfrm_encap_tmpl *encap = x->encap;
+ struct ip_esp_hdr *esph;
+ __be16 sport, dport;
+ int encap_type;
+
+ spin_lock_bh(&x->lock);
+ sport = encap->encap_sport;
+ dport = encap->encap_dport;
+ encap_type = encap->encap_type;
+ spin_unlock_bh(&x->lock);
+
switch (encap_type) {
default:
case UDP_ENCAP_ESPINUDP:
- esph = (struct ip_esp_hdr *)(uh + 1);
- break;
case UDP_ENCAP_ESPINUDP_NON_IKE:
- udpdata32 = (__be32 *)(uh + 1);
- udpdata32[0] = udpdata32[1] = 0;
- esph = (struct ip_esp_hdr *)(udpdata32 + 2);
+ esph = esp_output_udp_encap(skb, encap_type, esp, sport, dport);
+ break;
+ case TCP_ENCAP_ESPINTCP:
+ esph = esp_output_tcp_encap(x, skb, esp);
break;
}
- *skb_mac_header(skb) = IPPROTO_UDP;
+ if (IS_ERR(esph))
+ return PTR_ERR(esph);
+
esp->esph = esph;
return 0;
@@ -279,9 +466,9 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *
struct sk_buff *trailer;
int tailen = esp->tailen;
- /* this is non-NULL only with UDP Encapsulation */
+ /* this is non-NULL only with TCP/UDP Encapsulation */
if (x->encap) {
- int err = esp_output_udp_encap(x, skb, esp);
+ int err = esp_output_encap(x, skb, esp);
if (err < 0)
return err;
@@ -474,6 +661,9 @@ int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *
if (sg != dsg)
esp_ssg_unref(x, tmp);
+ if (!err && x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP)
+ err = esp_output_tail_tcp(x, skb);
+
error_free:
kfree(tmp);
error:
@@ -600,7 +790,23 @@ int esp_input_done2(struct sk_buff *skb, int err)
if (x->encap) {
struct xfrm_encap_tmpl *encap = x->encap;
+ struct tcphdr *th = (void *)(skb_network_header(skb) + ihl);
struct udphdr *uh = (void *)(skb_network_header(skb) + ihl);
+ __be16 source;
+
+ switch (x->encap->encap_type) {
+ case TCP_ENCAP_ESPINTCP:
+ source = th->source;
+ break;
+ case UDP_ENCAP_ESPINUDP:
+ case UDP_ENCAP_ESPINUDP_NON_IKE:
+ source = uh->source;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ err = -EINVAL;
+ goto out;
+ }
/*
* 1) if the NAT-T peer's IP or port changed then
@@ -609,11 +815,11 @@ int esp_input_done2(struct sk_buff *skb, int err)
* SRC ports.
*/
if (iph->saddr != x->props.saddr.a4 ||
- uh->source != encap->encap_sport) {
+ source != encap->encap_sport) {
xfrm_address_t ipaddr;
ipaddr.a4 = iph->saddr;
- km_new_mapping(x, &ipaddr, uh->source);
+ km_new_mapping(x, &ipaddr, source);
/* XXX: perhaps add an extra
* policy check here, to see
@@ -988,6 +1194,14 @@ static int esp_init_state(struct xfrm_state *x)
case UDP_ENCAP_ESPINUDP_NON_IKE:
x->props.header_len += sizeof(struct udphdr) + 2 * sizeof(u32);
break;
+#ifdef CONFIG_INET_ESPINTCP
+ case TCP_ENCAP_ESPINTCP:
+ /* only the length field, TCP encap is done by
+ * the socket
+ */
+ x->props.header_len += 2;
+ break;
+#endif
}
}
diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c
index 0e4a7cf6bc87..e2e219c7854a 100644
--- a/net/ipv4/esp4_offload.c
+++ b/net/ipv4/esp4_offload.c
@@ -57,6 +57,8 @@ static struct sk_buff *esp4_gro_receive(struct list_head *head,
if (!x)
goto out_reset;
+ skb->mark = xfrm_smark_get(skb->mark, x);
+
sp->xvec[sp->len++] = x;
sp->olen++;
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index e8bc939b56dd..577db1d50a24 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -70,11 +70,6 @@ fail:
fib_free_table(main_table);
return -ENOMEM;
}
-
-static bool fib4_has_custom_rules(struct net *net)
-{
- return false;
-}
#else
struct fib_table *fib_new_table(struct net *net, u32 id)
@@ -124,17 +119,13 @@ struct fib_table *fib_get_table(struct net *net, u32 id)
h = id & (FIB_TABLE_HASHSZ - 1);
head = &net->ipv4.fib_table_hash[h];
- hlist_for_each_entry_rcu(tb, head, tb_hlist) {
+ hlist_for_each_entry_rcu(tb, head, tb_hlist,
+ lockdep_rtnl_is_held()) {
if (tb->tb_id == id)
return tb;
}
return NULL;
}
-
-static bool fib4_has_custom_rules(struct net *net)
-{
- return net->ipv4.fib_has_custom_rules;
-}
#endif /* CONFIG_IP_MULTIPLE_TABLES */
static void fib_replace_table(struct net *net, struct fib_table *old,
@@ -1147,7 +1138,7 @@ void fib_modify_prefix_metric(struct in_ifaddr *ifa, u32 new_metric)
if (!(dev->flags & IFF_UP) ||
ifa->ifa_flags & (IFA_F_SECONDARY | IFA_F_NOPREFIXROUTE) ||
ipv4_is_zeronet(prefix) ||
- prefix == ifa->ifa_local || ifa->ifa_prefixlen == 32)
+ (prefix == ifa->ifa_local && ifa->ifa_prefixlen == 32))
return;
/* add the new */
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index a68b5e21ec51..c092e9a55790 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -16,6 +16,9 @@ struct fib_alias {
u8 fa_slen;
u32 tb_id;
s16 fa_default;
+ u8 offload:1,
+ trap:1,
+ unused:6;
struct rcu_head rcu;
};
@@ -35,9 +38,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
int fib_nh_match(struct fib_config *cfg, struct fib_info *fi,
struct netlink_ext_ack *extack);
bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi);
-int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, u32 tb_id,
- u8 type, __be32 dst, int dst_len, u8 tos, struct fib_info *fi,
- unsigned int);
+int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
+ struct fib_rt_info *fri, unsigned int flags);
void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len,
u32 tb_id, const struct nl_info *info, unsigned int nlm_flags);
diff --git a/net/ipv4/fib_notifier.c b/net/ipv4/fib_notifier.c
index b804ccbdb241..0c28bd469a68 100644
--- a/net/ipv4/fib_notifier.c
+++ b/net/ipv4/fib_notifier.c
@@ -9,12 +9,12 @@
#include <net/netns/ipv4.h>
#include <net/ip_fib.h>
-int call_fib4_notifier(struct notifier_block *nb, struct net *net,
+int call_fib4_notifier(struct notifier_block *nb,
enum fib_event_type event_type,
struct fib_notifier_info *info)
{
info->family = AF_INET;
- return call_fib_notifier(nb, net, event_type, info);
+ return call_fib_notifier(nb, event_type, info);
}
int call_fib4_notifiers(struct net *net, enum fib_event_type event_type,
@@ -34,17 +34,16 @@ static unsigned int fib4_seq_read(struct net *net)
return net->ipv4.fib_seq + fib4_rules_seq_read(net);
}
-static int fib4_dump(struct net *net, struct notifier_block *nb)
+static int fib4_dump(struct net *net, struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
{
int err;
- err = fib4_rules_dump(net, nb);
+ err = fib4_rules_dump(net, nb, extack);
if (err)
return err;
- fib_notify(net, nb);
-
- return 0;
+ return fib_notify(net, nb, extack);
}
static const struct fib_notifier_ops fib4_notifier_ops_template = {
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index b43a7ba5c6a4..f99e3bac5cab 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -65,9 +65,10 @@ bool fib4_rule_default(const struct fib_rule *rule)
}
EXPORT_SYMBOL_GPL(fib4_rule_default);
-int fib4_rules_dump(struct net *net, struct notifier_block *nb)
+int fib4_rules_dump(struct net *net, struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
{
- return fib_rules_dump(net, nb, AF_INET);
+ return fib_rules_dump(net, nb, AF_INET, extack);
}
unsigned int fib4_rules_seq_read(struct net *net)
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 2db089e10ba0..a803cdd9400a 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -504,6 +504,7 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
int dst_len, u32 tb_id, const struct nl_info *info,
unsigned int nlm_flags)
{
+ struct fib_rt_info fri;
struct sk_buff *skb;
u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
int err = -ENOBUFS;
@@ -512,9 +513,15 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
if (!skb)
goto errout;
- err = fib_dump_info(skb, info->portid, seq, event, tb_id,
- fa->fa_type, key, dst_len,
- fa->fa_tos, fa->fa_info, nlm_flags);
+ fri.fi = fa->fa_info;
+ fri.tb_id = tb_id;
+ fri.dst = key;
+ fri.dst_len = dst_len;
+ fri.tos = fa->fa_tos;
+ fri.type = fa->fa_type;
+ fri.offload = fa->offload;
+ fri.trap = fa->trap;
+ err = fib_dump_info(skb, info->portid, seq, event, &fri, nlm_flags);
if (err < 0) {
/* -EMSGSIZE implies BUG in fib_nlmsg_size() */
WARN_ON(err == -EMSGSIZE);
@@ -1582,7 +1589,7 @@ failure:
}
int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nhc,
- unsigned char *flags, bool skip_oif)
+ u8 rt_family, unsigned char *flags, bool skip_oif)
{
if (nhc->nhc_flags & RTNH_F_DEAD)
*flags |= RTNH_F_DEAD;
@@ -1613,7 +1620,7 @@ int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nhc,
/* if gateway family does not match nexthop family
* gateway is encoded as RTA_VIA
*/
- if (nhc->nhc_gw_family != nhc->nhc_family) {
+ if (rt_family != nhc->nhc_gw_family) {
int alen = sizeof(struct in6_addr);
struct nlattr *nla;
struct rtvia *via;
@@ -1654,7 +1661,7 @@ EXPORT_SYMBOL_GPL(fib_nexthop_info);
#if IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) || IS_ENABLED(CONFIG_IPV6)
int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nhc,
- int nh_weight)
+ int nh_weight, u8 rt_family)
{
const struct net_device *dev = nhc->nhc_dev;
struct rtnexthop *rtnh;
@@ -1667,7 +1674,7 @@ int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nhc,
rtnh->rtnh_hops = nh_weight - 1;
rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
- if (fib_nexthop_info(skb, nhc, &flags, true) < 0)
+ if (fib_nexthop_info(skb, nhc, rt_family, &flags, true) < 0)
goto nla_put_failure;
rtnh->rtnh_flags = flags;
@@ -1693,13 +1700,14 @@ static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi)
goto nla_put_failure;
if (unlikely(fi->nh)) {
- if (nexthop_mpath_fill_node(skb, fi->nh) < 0)
+ if (nexthop_mpath_fill_node(skb, fi->nh, AF_INET) < 0)
goto nla_put_failure;
goto mp_end;
}
for_nexthops(fi) {
- if (fib_add_nexthop(skb, &nh->nh_common, nh->fib_nh_weight) < 0)
+ if (fib_add_nexthop(skb, &nh->nh_common, nh->fib_nh_weight,
+ AF_INET) < 0)
goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
if (nh->nh_tclassid &&
@@ -1724,10 +1732,11 @@ static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi)
#endif
int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
- u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos,
- struct fib_info *fi, unsigned int flags)
+ struct fib_rt_info *fri, unsigned int flags)
{
- unsigned int nhs = fib_info_num_path(fi);
+ unsigned int nhs = fib_info_num_path(fri->fi);
+ struct fib_info *fi = fri->fi;
+ u32 tb_id = fri->tb_id;
struct nlmsghdr *nlh;
struct rtmsg *rtm;
@@ -1737,22 +1746,22 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
rtm = nlmsg_data(nlh);
rtm->rtm_family = AF_INET;
- rtm->rtm_dst_len = dst_len;
+ rtm->rtm_dst_len = fri->dst_len;
rtm->rtm_src_len = 0;
- rtm->rtm_tos = tos;
+ rtm->rtm_tos = fri->tos;
if (tb_id < 256)
rtm->rtm_table = tb_id;
else
rtm->rtm_table = RT_TABLE_COMPAT;
if (nla_put_u32(skb, RTA_TABLE, tb_id))
goto nla_put_failure;
- rtm->rtm_type = type;
+ rtm->rtm_type = fri->type;
rtm->rtm_flags = fi->fib_flags;
rtm->rtm_scope = fi->fib_scope;
rtm->rtm_protocol = fi->fib_protocol;
if (rtm->rtm_dst_len &&
- nla_put_in_addr(skb, RTA_DST, dst))
+ nla_put_in_addr(skb, RTA_DST, fri->dst))
goto nla_put_failure;
if (fi->fib_priority &&
nla_put_u32(skb, RTA_PRIORITY, fi->fib_priority))
@@ -1775,7 +1784,7 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
const struct fib_nh_common *nhc = fib_info_nhc(fi, 0);
unsigned char flags = 0;
- if (fib_nexthop_info(skb, nhc, &flags, false) < 0)
+ if (fib_nexthop_info(skb, nhc, AF_INET, &flags, false) < 0)
goto nla_put_failure;
rtm->rtm_flags = flags;
@@ -1794,6 +1803,11 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
goto nla_put_failure;
}
+ if (fri->offload)
+ rtm->rtm_flags |= RTM_F_OFFLOAD;
+ if (fri->trap)
+ rtm->rtm_flags |= RTM_F_TRAP;
+
nlmsg_end(skb, nlh);
return 0;
@@ -1813,8 +1827,8 @@ int fib_sync_down_addr(struct net_device *dev, __be32 local)
int ret = 0;
unsigned int hash = fib_laddr_hashfn(local);
struct hlist_head *head = &fib_info_laddrhash[hash];
+ int tb_id = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
struct net *net = dev_net(dev);
- int tb_id = l3mdev_fib_table(dev);
struct fib_info *fi;
if (!fib_info_laddrhash || local == 0)
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 2b2b3d291ab0..ff0c24371e33 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -74,11 +74,13 @@
#include <trace/events/fib.h>
#include "fib_lookup.h"
-static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net,
+static int call_fib_entry_notifier(struct notifier_block *nb,
enum fib_event_type event_type, u32 dst,
- int dst_len, struct fib_alias *fa)
+ int dst_len, struct fib_alias *fa,
+ struct netlink_ext_ack *extack)
{
struct fib_entry_notifier_info info = {
+ .info.extack = extack,
.dst = dst,
.dst_len = dst_len,
.fi = fa->fa_info,
@@ -86,7 +88,7 @@ static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net,
.type = fa->fa_type,
.tb_id = fa->tb_id,
};
- return call_fib4_notifier(nb, net, event_type, &info.info);
+ return call_fib4_notifier(nb, event_type, &info.info);
}
static int call_fib_entry_notifiers(struct net *net,
@@ -978,9 +980,12 @@ static struct key_vector *fib_find_node(struct trie *t,
/* Return the first fib alias matching TOS with
* priority less than or equal to PRIO.
+ * If 'find_first' is set, return the first matching
+ * fib alias, regardless of TOS and priority.
*/
static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen,
- u8 tos, u32 prio, u32 tb_id)
+ u8 tos, u32 prio, u32 tb_id,
+ bool find_first)
{
struct fib_alias *fa;
@@ -996,6 +1001,8 @@ static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen,
continue;
if (fa->tb_id != tb_id)
break;
+ if (find_first)
+ return fa;
if (fa->fa_tos > tos)
continue;
if (fa->fa_info->fib_priority >= prio || fa->fa_tos < tos)
@@ -1005,6 +1012,52 @@ static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen,
return NULL;
}
+static struct fib_alias *
+fib_find_matching_alias(struct net *net, const struct fib_rt_info *fri)
+{
+ u8 slen = KEYLENGTH - fri->dst_len;
+ struct key_vector *l, *tp;
+ struct fib_table *tb;
+ struct fib_alias *fa;
+ struct trie *t;
+
+ tb = fib_get_table(net, fri->tb_id);
+ if (!tb)
+ return NULL;
+
+ t = (struct trie *)tb->tb_data;
+ l = fib_find_node(t, &tp, be32_to_cpu(fri->dst));
+ if (!l)
+ return NULL;
+
+ hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
+ if (fa->fa_slen == slen && fa->tb_id == fri->tb_id &&
+ fa->fa_tos == fri->tos && fa->fa_info == fri->fi &&
+ fa->fa_type == fri->type)
+ return fa;
+ }
+
+ return NULL;
+}
+
+void fib_alias_hw_flags_set(struct net *net, const struct fib_rt_info *fri)
+{
+ struct fib_alias *fa_match;
+
+ rcu_read_lock();
+
+ fa_match = fib_find_matching_alias(net, fri);
+ if (!fa_match)
+ goto out;
+
+ fa_match->offload = fri->offload;
+ fa_match->trap = fri->trap;
+
+out:
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(fib_alias_hw_flags_set);
+
static void trie_rebalance(struct trie *t, struct key_vector *tn)
{
while (!IS_TRIE(tn))
@@ -1061,9 +1114,6 @@ noleaf:
return -ENOMEM;
}
-/* fib notifier for ADD is sent before calling fib_insert_alias with
- * the expectation that the only possible failure ENOMEM
- */
static int fib_insert_alias(struct trie *t, struct key_vector *tp,
struct key_vector *l, struct fib_alias *new,
struct fib_alias *fa, t_key key)
@@ -1116,11 +1166,13 @@ static bool fib_valid_key_len(u32 key, u8 plen, struct netlink_ext_ack *extack)
return true;
}
+static void fib_remove_alias(struct trie *t, struct key_vector *tp,
+ struct key_vector *l, struct fib_alias *old);
+
/* Caller must hold RTNL. */
int fib_table_insert(struct net *net, struct fib_table *tb,
struct fib_config *cfg, struct netlink_ext_ack *extack)
{
- enum fib_event_type event = FIB_EVENT_ENTRY_ADD;
struct trie *t = (struct trie *)tb->tb_data;
struct fib_alias *fa, *new_fa;
struct key_vector *l, *tp;
@@ -1147,7 +1199,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
l = fib_find_node(t, &tp, key);
fa = l ? fib_find_alias(&l->leaf, slen, tos, fi->fib_priority,
- tb->tb_id) : NULL;
+ tb->tb_id, false) : NULL;
/* Now fa, if non-NULL, points to the first fib alias
* with the same keys [prefix,tos,priority], if such key already
@@ -1214,19 +1266,29 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
new_fa->fa_slen = fa->fa_slen;
new_fa->tb_id = tb->tb_id;
new_fa->fa_default = -1;
+ new_fa->offload = 0;
+ new_fa->trap = 0;
- err = call_fib_entry_notifiers(net,
- FIB_EVENT_ENTRY_REPLACE,
- key, plen, new_fa,
- extack);
- if (err)
- goto out_free_new_fa;
+ hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list);
+
+ if (fib_find_alias(&l->leaf, fa->fa_slen, 0, 0,
+ tb->tb_id, true) == new_fa) {
+ enum fib_event_type fib_event;
+
+ fib_event = FIB_EVENT_ENTRY_REPLACE;
+ err = call_fib_entry_notifiers(net, fib_event,
+ key, plen,
+ new_fa, extack);
+ if (err) {
+ hlist_replace_rcu(&new_fa->fa_list,
+ &fa->fa_list);
+ goto out_free_new_fa;
+ }
+ }
rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
tb->tb_id, &cfg->fc_nlinfo, nlflags);
- hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list);
-
alias_free_mem_rcu(fa);
fib_release_info(fi_drop);
@@ -1242,12 +1304,10 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
if (fa_match)
goto out;
- if (cfg->fc_nlflags & NLM_F_APPEND) {
- event = FIB_EVENT_ENTRY_APPEND;
+ if (cfg->fc_nlflags & NLM_F_APPEND)
nlflags |= NLM_F_APPEND;
- } else {
+ else
fa = fa_first;
- }
}
err = -ENOENT;
if (!(cfg->fc_nlflags & NLM_F_CREATE))
@@ -1266,15 +1326,29 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
new_fa->fa_slen = slen;
new_fa->tb_id = tb->tb_id;
new_fa->fa_default = -1;
-
- err = call_fib_entry_notifiers(net, event, key, plen, new_fa, extack);
- if (err)
- goto out_free_new_fa;
+ new_fa->offload = 0;
+ new_fa->trap = 0;
/* Insert new entry to the list. */
err = fib_insert_alias(t, tp, l, new_fa, fa, key);
if (err)
- goto out_fib_notif;
+ goto out_free_new_fa;
+
+ /* The alias was already inserted, so the node must exist. */
+ l = l ? l : fib_find_node(t, &tp, key);
+ if (WARN_ON_ONCE(!l))
+ goto out_free_new_fa;
+
+ if (fib_find_alias(&l->leaf, new_fa->fa_slen, 0, 0, tb->tb_id, true) ==
+ new_fa) {
+ enum fib_event_type fib_event;
+
+ fib_event = FIB_EVENT_ENTRY_REPLACE;
+ err = call_fib_entry_notifiers(net, fib_event, key, plen,
+ new_fa, extack);
+ if (err)
+ goto out_remove_new_fa;
+ }
if (!plen)
tb->tb_num_default++;
@@ -1285,14 +1359,8 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
succeeded:
return 0;
-out_fib_notif:
- /* notifier was sent that entry would be added to trie, but
- * the add failed and need to recover. Only failure for
- * fib_insert_alias is ENOMEM.
- */
- NL_SET_ERR_MSG(extack, "Failed to insert route into trie");
- call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key,
- plen, new_fa, NULL);
+out_remove_new_fa:
+ fib_remove_alias(t, tp, l, new_fa);
out_free_new_fa:
kmem_cache_free(fn_alias_kmem, new_fa);
out:
@@ -1543,6 +1611,36 @@ static void fib_remove_alias(struct trie *t, struct key_vector *tp,
node_pull_suffix(tp, fa->fa_slen);
}
+static void fib_notify_alias_delete(struct net *net, u32 key,
+ struct hlist_head *fah,
+ struct fib_alias *fa_to_delete,
+ struct netlink_ext_ack *extack)
+{
+ struct fib_alias *fa_next, *fa_to_notify;
+ u32 tb_id = fa_to_delete->tb_id;
+ u8 slen = fa_to_delete->fa_slen;
+ enum fib_event_type fib_event;
+
+ /* Do not notify if we do not care about the route. */
+ if (fib_find_alias(fah, slen, 0, 0, tb_id, true) != fa_to_delete)
+ return;
+
+ /* Determine if the route should be replaced by the next route in the
+ * list.
+ */
+ fa_next = hlist_entry_safe(fa_to_delete->fa_list.next,
+ struct fib_alias, fa_list);
+ if (fa_next && fa_next->fa_slen == slen && fa_next->tb_id == tb_id) {
+ fib_event = FIB_EVENT_ENTRY_REPLACE;
+ fa_to_notify = fa_next;
+ } else {
+ fib_event = FIB_EVENT_ENTRY_DEL;
+ fa_to_notify = fa_to_delete;
+ }
+ call_fib_entry_notifiers(net, fib_event, key, KEYLENGTH - slen,
+ fa_to_notify, extack);
+}
+
/* Caller must hold RTNL. */
int fib_table_delete(struct net *net, struct fib_table *tb,
struct fib_config *cfg, struct netlink_ext_ack *extack)
@@ -1564,7 +1662,7 @@ int fib_table_delete(struct net *net, struct fib_table *tb,
if (!l)
return -ESRCH;
- fa = fib_find_alias(&l->leaf, slen, tos, 0, tb->tb_id);
+ fa = fib_find_alias(&l->leaf, slen, tos, 0, tb->tb_id, false);
if (!fa)
return -ESRCH;
@@ -1596,8 +1694,7 @@ int fib_table_delete(struct net *net, struct fib_table *tb,
if (!fa_to_delete)
return -ESRCH;
- call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key, plen,
- fa_to_delete, extack);
+ fib_notify_alias_delete(net, key, &l->leaf, fa_to_delete, extack);
rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id,
&cfg->fc_nlinfo, 0);
@@ -1921,10 +2018,8 @@ int fib_table_flush(struct net *net, struct fib_table *tb, bool flush_all)
continue;
}
- call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL,
- n->key,
- KEYLENGTH - fa->fa_slen, fa,
- NULL);
+ fib_notify_alias_delete(net, n->key, &n->leaf, fa,
+ NULL);
hlist_del_rcu(&fa->fa_list);
fib_release_info(fa->fa_info);
alias_free_mem_rcu(fa);
@@ -2015,10 +2110,13 @@ void fib_info_notify_update(struct net *net, struct nl_info *info)
}
}
-static void fib_leaf_notify(struct net *net, struct key_vector *l,
- struct fib_table *tb, struct notifier_block *nb)
+static int fib_leaf_notify(struct key_vector *l, struct fib_table *tb,
+ struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
{
struct fib_alias *fa;
+ int last_slen = -1;
+ int err;
hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
struct fib_info *fi = fa->fa_info;
@@ -2032,39 +2130,57 @@ static void fib_leaf_notify(struct net *net, struct key_vector *l,
if (tb->tb_id != fa->tb_id)
continue;
- call_fib_entry_notifier(nb, net, FIB_EVENT_ENTRY_ADD, l->key,
- KEYLENGTH - fa->fa_slen, fa);
+ if (fa->fa_slen == last_slen)
+ continue;
+
+ last_slen = fa->fa_slen;
+ err = call_fib_entry_notifier(nb, FIB_EVENT_ENTRY_REPLACE,
+ l->key, KEYLENGTH - fa->fa_slen,
+ fa, extack);
+ if (err)
+ return err;
}
+ return 0;
}
-static void fib_table_notify(struct net *net, struct fib_table *tb,
- struct notifier_block *nb)
+static int fib_table_notify(struct fib_table *tb, struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
{
struct trie *t = (struct trie *)tb->tb_data;
struct key_vector *l, *tp = t->kv;
t_key key = 0;
+ int err;
while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
- fib_leaf_notify(net, l, tb, nb);
+ err = fib_leaf_notify(l, tb, nb, extack);
+ if (err)
+ return err;
key = l->key + 1;
/* stop in case of wrap around */
if (key < l->key)
break;
}
+ return 0;
}
-void fib_notify(struct net *net, struct notifier_block *nb)
+int fib_notify(struct net *net, struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
{
unsigned int h;
+ int err;
for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
struct hlist_head *head = &net->ipv4.fib_table_hash[h];
struct fib_table *tb;
- hlist_for_each_entry_rcu(tb, head, tb_hlist)
- fib_table_notify(net, tb, nb);
+ hlist_for_each_entry_rcu(tb, head, tb_hlist) {
+ err = fib_table_notify(tb, nb, extack);
+ if (err)
+ return err;
+ }
}
+ return 0;
}
static void __trie_free_rcu(struct rcu_head *head)
@@ -2128,14 +2244,20 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
if (filter->dump_routes) {
if (!s_fa) {
+ struct fib_rt_info fri;
+
+ fri.fi = fi;
+ fri.tb_id = tb->tb_id;
+ fri.dst = xkey;
+ fri.dst_len = KEYLENGTH - fa->fa_slen;
+ fri.tos = fa->fa_tos;
+ fri.type = fa->fa_type;
+ fri.offload = fa->offload;
+ fri.trap = fa->trap;
err = fib_dump_info(skb,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
- RTM_NEWROUTE,
- tb->tb_id, fa->fa_type,
- xkey,
- KEYLENGTH - fa->fa_slen,
- fa->fa_tos, fi, flags);
+ RTM_NEWROUTE, &fri, flags);
if (err < 0)
goto stop;
}
@@ -2145,7 +2267,7 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
if (filter->dump_exceptions) {
err = fib_dump_info_fnhe(skb, cb, tb->tb_id, fi,
- &i_fa, s_fa);
+ &i_fa, s_fa, flags);
if (err < 0)
goto stop;
}
@@ -2175,6 +2297,12 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
int count = cb->args[2];
t_key key = cb->args[3];
+ /* First time here, count and key are both always 0. Count > 0
+ * and key == 0 means the dump has wrapped around and we are done.
+ */
+ if (count && !key)
+ return skb->len;
+
while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
int err;
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index 30fa771d382a..dcc79ff54b41 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -662,8 +662,8 @@ static const struct nla_policy fou_nl_policy[FOU_ATTR_MAX + 1] = {
[FOU_ATTR_REMCSUM_NOPARTIAL] = { .type = NLA_FLAG, },
[FOU_ATTR_LOCAL_V4] = { .type = NLA_U32, },
[FOU_ATTR_PEER_V4] = { .type = NLA_U32, },
- [FOU_ATTR_LOCAL_V6] = { .type = sizeof(struct in6_addr), },
- [FOU_ATTR_PEER_V6] = { .type = sizeof(struct in6_addr), },
+ [FOU_ATTR_LOCAL_V6] = { .len = sizeof(struct in6_addr), },
+ [FOU_ATTR_PEER_V6] = { .len = sizeof(struct in6_addr), },
[FOU_ATTR_PEER_PORT] = { .type = NLA_U16, },
[FOU_ATTR_IFINDEX] = { .type = NLA_S32, },
};
diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c
index 44bfeecac33e..5fd6e8ed02b5 100644
--- a/net/ipv4/gre_demux.c
+++ b/net/ipv4/gre_demux.c
@@ -127,7 +127,7 @@ int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
if (!pskb_may_pull(skb, nhs + hdr_len + sizeof(*ershdr)))
return -EINVAL;
- ershdr = (struct erspan_base_hdr *)options;
+ ershdr = (struct erspan_base_hdr *)(skb->data + nhs + hdr_len);
tpi->key = cpu_to_be32(get_session_id(ershdr));
}
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index 4de7e962d3da..2e6d1b7a7bc9 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -174,7 +174,7 @@ static struct sk_buff *gre_gro_receive(struct list_head *head,
if (skb_gro_checksum_simple_validate(skb))
goto out_unlock;
- skb_gro_checksum_try_convert(skb, IPPROTO_GRE, 0,
+ skb_gro_checksum_try_convert(skb, IPPROTO_GRE,
null_compute_pseudo);
}
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 1510e951f451..f369e7ce685b 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -249,10 +249,11 @@ bool icmp_global_allow(void)
bool rc = false;
/* Check if token bucket is empty and cannot be refilled
- * without taking the spinlock.
+ * without taking the spinlock. The READ_ONCE() are paired
+ * with the following WRITE_ONCE() in this same function.
*/
- if (!icmp_global.credit) {
- delta = min_t(u32, now - icmp_global.stamp, HZ);
+ if (!READ_ONCE(icmp_global.credit)) {
+ delta = min_t(u32, now - READ_ONCE(icmp_global.stamp), HZ);
if (delta < HZ / 50)
return false;
}
@@ -262,14 +263,14 @@ bool icmp_global_allow(void)
if (delta >= HZ / 50) {
incr = sysctl_icmp_msgs_per_sec * delta / HZ ;
if (incr)
- icmp_global.stamp = now;
+ WRITE_ONCE(icmp_global.stamp, now);
}
credit = min_t(u32, icmp_global.credit + incr, sysctl_icmp_msgs_burst);
if (credit) {
credit--;
rc = true;
}
- icmp_global.credit = credit;
+ WRITE_ONCE(icmp_global.credit, credit);
spin_unlock(&icmp_global.lock);
return rc;
}
@@ -582,7 +583,13 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
if (!rt)
goto out;
- net = dev_net(rt->dst.dev);
+
+ if (rt->dst.dev)
+ net = dev_net(rt->dst.dev);
+ else if (skb_in->dev)
+ net = dev_net(skb_in->dev);
+ else
+ goto out;
/*
* Find the original header. It is expected to be valid, of course.
@@ -676,7 +683,8 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
dev = dev_get_by_index_rcu(net, inet_iif(skb_in));
if (dev)
- saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK);
+ saddr = inet_select_addr(dev, iph->saddr,
+ RT_SCOPE_LINK);
else
saddr = 0;
rcu_read_unlock();
@@ -740,6 +748,39 @@ out:;
}
EXPORT_SYMBOL(__icmp_send);
+#if IS_ENABLED(CONFIG_NF_NAT)
+#include <net/netfilter/nf_conntrack.h>
+void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info)
+{
+ struct sk_buff *cloned_skb = NULL;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+ __be32 orig_ip;
+
+ ct = nf_ct_get(skb_in, &ctinfo);
+ if (!ct || !(ct->status & IPS_SRC_NAT)) {
+ icmp_send(skb_in, type, code, info);
+ return;
+ }
+
+ if (skb_shared(skb_in))
+ skb_in = cloned_skb = skb_clone(skb_in, GFP_ATOMIC);
+
+ if (unlikely(!skb_in || skb_network_header(skb_in) < skb_in->head ||
+ (skb_network_header(skb_in) + sizeof(struct iphdr)) >
+ skb_tail_pointer(skb_in) || skb_ensure_writable(skb_in,
+ skb_network_offset(skb_in) + sizeof(struct iphdr))))
+ goto out;
+
+ orig_ip = ip_hdr(skb_in)->saddr;
+ ip_hdr(skb_in)->saddr = ct->tuplehash[0].tuple.src.u3.ip;
+ icmp_send(skb_in, type, code, info);
+ ip_hdr(skb_in)->saddr = orig_ip;
+out:
+ consume_skb(cloned_skb);
+}
+EXPORT_SYMBOL(icmp_ndo_send);
+#endif
static void icmp_socket_deliver(struct sk_buff *skb, u32 info)
{
@@ -902,7 +943,7 @@ static bool icmp_redirect(struct sk_buff *skb)
return false;
}
- icmp_socket_deliver(skb, icmp_hdr(skb)->un.gateway);
+ icmp_socket_deliver(skb, ntohl(icmp_hdr(skb)->un.gateway));
return true;
}
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 180f6896b98b..3b9c7a2725a9 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1475,7 +1475,7 @@ EXPORT_SYMBOL(__ip_mc_inc_group);
void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
{
- __ip_mc_inc_group(in_dev, addr, MCAST_EXCLUDE);
+ __ip_mc_inc_group(in_dev, addr, GFP_KERNEL);
}
EXPORT_SYMBOL(ip_mc_inc_group);
@@ -1563,7 +1563,7 @@ static int ip_mc_check_igmp_msg(struct sk_buff *skb)
}
}
-static inline __sum16 ip_mc_validate_checksum(struct sk_buff *skb)
+static __sum16 ip_mc_validate_checksum(struct sk_buff *skb)
{
return skb_checksum_simple_validate(skb);
}
@@ -2197,7 +2197,7 @@ static int __ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr,
iml->sflist = NULL;
iml->sfmode = mode;
rcu_assign_pointer(inet->mc_list, iml);
- __ip_mc_inc_group(in_dev, addr, mode);
+ ____ip_mc_inc_group(in_dev, addr, mode, GFP_KERNEL);
err = 0;
done:
return err;
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index f5c163d4771b..a4db79b1b643 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -560,7 +560,7 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk,
rt = ip_route_output_flow(net, fl4, sk);
if (IS_ERR(rt))
goto no_route;
- if (opt && opt->opt.is_strictroute && rt->rt_gw_family)
+ if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway)
goto route_err;
rcu_read_unlock();
return &rt->dst;
@@ -598,7 +598,7 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk,
rt = ip_route_output_flow(net, fl4, sk);
if (IS_ERR(rt))
goto no_route;
- if (opt && opt->opt.is_strictroute && rt->rt_gw_family)
+ if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway)
goto route_err;
return &rt->dst;
@@ -610,12 +610,6 @@ no_route:
}
EXPORT_SYMBOL_GPL(inet_csk_route_child_sock);
-#if IS_ENABLED(CONFIG_IPV6)
-#define AF_INET_FAMILY(fam) ((fam) == AF_INET)
-#else
-#define AF_INET_FAMILY(fam) true
-#endif
-
/* Decide when to expire the request and when to resend SYN-ACK */
static inline void syn_ack_recalc(struct request_sock *req, const int thresh,
const int max_retries,
@@ -716,7 +710,7 @@ static void reqsk_timer_handler(struct timer_list *t)
* ones are about to clog our table.
*/
qlen = reqsk_queue_len(queue);
- if ((qlen << 1) > max(8U, sk_listener->sk_max_ack_backlog)) {
+ if ((qlen << 1) > max(8U, READ_ONCE(sk_listener->sk_max_ack_backlog))) {
int young = reqsk_queue_len_young(queue) << 1;
while (thresh > 2) {
@@ -770,6 +764,18 @@ void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
}
EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);
+static void inet_clone_ulp(const struct request_sock *req, struct sock *newsk,
+ const gfp_t priority)
+{
+ struct inet_connection_sock *icsk = inet_csk(newsk);
+
+ if (!icsk->icsk_ulp_ops)
+ return;
+
+ if (icsk->icsk_ulp_ops->clone)
+ icsk->icsk_ulp_ops->clone(req, newsk, priority);
+}
+
/**
* inet_csk_clone_lock - clone an inet socket, and lock its clone
* @sk: the socket to clone
@@ -810,6 +816,8 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
/* Deinitialize accept_queue to trap illegal accesses. */
memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue));
+ inet_clone_ulp(req, newsk, priority);
+
security_inet_csk_clone(newsk, req);
}
return newsk;
@@ -906,7 +914,7 @@ static void inet_child_forget(struct sock *sk, struct request_sock *req,
percpu_counter_inc(sk->sk_prot->orphan_count);
if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) {
- BUG_ON(tcp_sk(child)->fastopen_rsk != req);
+ BUG_ON(rcu_access_pointer(tcp_sk(child)->fastopen_rsk) != req);
BUG_ON(sk != req->rsk_listener);
/* Paranoid, to prevent race condition if
@@ -915,7 +923,7 @@ static void inet_child_forget(struct sock *sk, struct request_sock *req,
* Also to satisfy an assertion in
* tcp_v4_destroy_sock().
*/
- tcp_sk(child)->fastopen_rsk = NULL;
+ RCU_INIT_POINTER(tcp_sk(child)->fastopen_rsk, NULL);
}
inet_csk_destroy_sock(child);
}
@@ -934,7 +942,7 @@ struct sock *inet_csk_reqsk_queue_add(struct sock *sk,
req->sk = child;
req->dl_next = NULL;
if (queue->rskq_accept_head == NULL)
- queue->rskq_accept_head = req;
+ WRITE_ONCE(queue->rskq_accept_head, req);
else
queue->rskq_accept_tail->dl_next = req;
queue->rskq_accept_tail = req;
@@ -1086,7 +1094,7 @@ struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu)
if (!dst)
goto out;
}
- dst->ops->update_pmtu(dst, sk, NULL, mtu);
+ dst->ops->update_pmtu(dst, sk, NULL, mtu, true);
dst = __sk_dst_check(sk, 0);
if (!dst)
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index bbb005eb5218..f11e997e517b 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -193,7 +193,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
if (ext & (1 << (INET_DIAG_MEMINFO - 1))) {
struct inet_diag_meminfo minfo = {
.idiag_rmem = sk_rmem_alloc_get(sk),
- .idiag_wmem = sk->sk_wmem_queued,
+ .idiag_wmem = READ_ONCE(sk->sk_wmem_queued),
.idiag_fmem = sk->sk_forward_alloc,
.idiag_tmem = sk_wmem_alloc_get(sk),
};
@@ -226,17 +226,17 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
r->idiag_timer = 1;
r->idiag_retrans = icsk->icsk_retransmits;
r->idiag_expires =
- jiffies_to_msecs(icsk->icsk_timeout - jiffies);
+ jiffies_delta_to_msecs(icsk->icsk_timeout - jiffies);
} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
r->idiag_timer = 4;
r->idiag_retrans = icsk->icsk_probes_out;
r->idiag_expires =
- jiffies_to_msecs(icsk->icsk_timeout - jiffies);
+ jiffies_delta_to_msecs(icsk->icsk_timeout - jiffies);
} else if (timer_pending(&sk->sk_timer)) {
r->idiag_timer = 2;
r->idiag_retrans = icsk->icsk_probes_out;
r->idiag_expires =
- jiffies_to_msecs(sk->sk_timer.expires - jiffies);
+ jiffies_delta_to_msecs(sk->sk_timer.expires - jiffies);
} else {
r->idiag_timer = 0;
r->idiag_expires = 0;
@@ -342,16 +342,13 @@ static int inet_twsk_diag_fill(struct sock *sk,
r = nlmsg_data(nlh);
BUG_ON(tw->tw_state != TCP_TIME_WAIT);
- tmo = tw->tw_timer.expires - jiffies;
- if (tmo < 0)
- tmo = 0;
-
inet_diag_msg_common_fill(r, sk);
r->idiag_retrans = 0;
r->idiag_state = tw->tw_substate;
r->idiag_timer = 3;
- r->idiag_expires = jiffies_to_msecs(tmo);
+ tmo = tw->tw_timer.expires - jiffies;
+ r->idiag_expires = jiffies_delta_to_msecs(tmo);
r->idiag_rqueue = 0;
r->idiag_wqueue = 0;
r->idiag_uid = 0;
@@ -385,7 +382,7 @@ static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb,
offsetof(struct sock, sk_cookie));
tmo = inet_reqsk(sk)->rsk_timer.expires - jiffies;
- r->idiag_expires = (tmo >= 0) ? jiffies_to_msecs(tmo) : 0;
+ r->idiag_expires = jiffies_delta_to_msecs(tmo);
r->idiag_rqueue = 0;
r->idiag_wqueue = 0;
r->idiag_uid = 0;
@@ -914,11 +911,12 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
for (i = s_i; i < INET_LHTABLE_SIZE; i++) {
struct inet_listen_hashbucket *ilb;
+ struct hlist_nulls_node *node;
num = 0;
ilb = &hashinfo->listening_hash[i];
spin_lock(&ilb->lock);
- sk_for_each(sk, &ilb->head) {
+ sk_nulls_for_each(sk, node, &ilb->nulls_head) {
struct inet_sock *inet = inet_sk(sk);
if (!net_eq(sock_net(sk), net))
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 97824864e40d..2bbaaf0c7176 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -240,7 +240,7 @@ static inline int compute_score(struct sock *sk, struct net *net,
return -1;
score = sk->sk_family == PF_INET ? 2 : 1;
- if (sk->sk_incoming_cpu == raw_smp_processor_id())
+ if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id())
score++;
}
return score;
@@ -516,10 +516,11 @@ static int inet_reuseport_add_sock(struct sock *sk,
struct inet_listen_hashbucket *ilb)
{
struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash;
+ const struct hlist_nulls_node *node;
struct sock *sk2;
kuid_t uid = sock_i_uid(sk);
- sk_for_each_rcu(sk2, &ilb->head) {
+ sk_nulls_for_each_rcu(sk2, node, &ilb->nulls_head) {
if (sk2 != sk &&
sk2->sk_family == sk->sk_family &&
ipv6_only_sock(sk2) == ipv6_only_sock(sk) &&
@@ -555,9 +556,9 @@ int __inet_hash(struct sock *sk, struct sock *osk)
}
if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
sk->sk_family == AF_INET6)
- hlist_add_tail_rcu(&sk->sk_node, &ilb->head);
+ __sk_nulls_add_node_tail_rcu(sk, &ilb->nulls_head);
else
- hlist_add_head_rcu(&sk->sk_node, &ilb->head);
+ __sk_nulls_add_node_rcu(sk, &ilb->nulls_head);
inet_hash2(hashinfo, sk);
ilb->count++;
sock_set_flag(sk, SOCK_RCU_FREE);
@@ -606,11 +607,9 @@ void inet_unhash(struct sock *sk)
reuseport_detach_sock(sk);
if (ilb) {
inet_unhash2(hashinfo, sk);
- __sk_del_node_init(sk);
- ilb->count--;
- } else {
- __sk_nulls_del_node_init_rcu(sk);
+ ilb->count--;
}
+ __sk_nulls_del_node_init_rcu(sk);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
unlock:
spin_unlock_bh(lock);
@@ -750,7 +749,8 @@ void inet_hashinfo_init(struct inet_hashinfo *h)
for (i = 0; i < INET_LHTABLE_SIZE; i++) {
spin_lock_init(&h->listening_hash[i].lock);
- INIT_HLIST_HEAD(&h->listening_hash[i].head);
+ INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].nulls_head,
+ i + LISTENING_NULLS_BASE);
h->listening_hash[i].count = 0;
}
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index be778599bfed..ff327a62c9ce 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -160,7 +160,12 @@ static void inet_peer_gc(struct inet_peer_base *base,
base->total / inet_peer_threshold * HZ;
for (i = 0; i < gc_cnt; i++) {
p = gc_stack[i];
- delta = (__u32)jiffies - p->dtime;
+
+ /* The READ_ONCE() pairs with the WRITE_ONCE()
+ * in inet_putpeer()
+ */
+ delta = (__u32)jiffies - READ_ONCE(p->dtime);
+
if (delta < ttl || !refcount_dec_if_one(&p->refcnt))
gc_stack[i] = NULL;
}
@@ -237,7 +242,10 @@ EXPORT_SYMBOL_GPL(inet_getpeer);
void inet_putpeer(struct inet_peer *p)
{
- p->dtime = (__u32)jiffies;
+ /* The WRITE_ONCE() pairs with itself (we run lockless)
+ * and the READ_ONCE() in inet_peer_gc()
+ */
+ WRITE_ONCE(p->dtime, (__u32)jiffies);
if (refcount_dec_and_test(&p->refcnt))
call_rcu(&p->rcu, inetpeer_free_rcu);
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 06f6f280b9ff..00ec819f949b 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -123,7 +123,7 @@ int ip_forward(struct sk_buff *skb)
rt = skb_rtable(skb);
- if (opt->is_strictroute && rt->rt_gw_family)
+ if (opt->is_strictroute && rt->rt_uses_gateway)
goto sr_failed;
IPCB(skb)->flags |= IPSKB_FORWARDED;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index a53a543fe055..8274f98c511c 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -340,6 +340,8 @@ static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
iph->saddr, iph->daddr, tpi->key);
if (tunnel) {
+ const struct iphdr *tnl_params;
+
if (__iptunnel_pull_header(skb, hdr_len, tpi->proto,
raw_proto, false) < 0)
goto drop;
@@ -348,7 +350,9 @@ static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
skb_pop_mac_header(skb);
else
skb_reset_mac_header(skb);
- if (tunnel->collect_md) {
+
+ tnl_params = &tunnel->parms.iph;
+ if (tunnel->collect_md || tnl_params->daddr == 0) {
__be16 flags;
__be64 tun_id;
@@ -509,9 +513,9 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev)
key = &tun_info->key;
if (!(tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT))
goto err_free_skb;
- md = ip_tunnel_info_opts(tun_info);
- if (!md)
+ if (tun_info->options_len < sizeof(*md))
goto err_free_skb;
+ md = ip_tunnel_info_opts(tun_info);
/* ERSPAN has fixed 8 byte GRE header */
version = md->version;
@@ -1446,6 +1450,7 @@ static void erspan_setup(struct net_device *dev)
struct ip_tunnel *t = netdev_priv(dev);
ether_setup(dev);
+ dev->max_mtu = 0;
dev->netdev_ops = &erspan_netdev_ops;
dev->priv_flags &= ~IFF_TX_SKB_SHARING;
dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
@@ -1459,8 +1464,8 @@ static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
[IFLA_GRE_OFLAGS] = { .type = NLA_U16 },
[IFLA_GRE_IKEY] = { .type = NLA_U32 },
[IFLA_GRE_OKEY] = { .type = NLA_U32 },
- [IFLA_GRE_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
- [IFLA_GRE_REMOTE] = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
+ [IFLA_GRE_LOCAL] = { .len = sizeof_field(struct iphdr, saddr) },
+ [IFLA_GRE_REMOTE] = { .len = sizeof_field(struct iphdr, daddr) },
[IFLA_GRE_TTL] = { .type = NLA_U8 },
[IFLA_GRE_TOS] = { .type = NLA_U8 },
[IFLA_GRE_PMTUDISC] = { .type = NLA_U8 },
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 1e2392b7c64e..aa438c6758a7 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -199,7 +199,7 @@ resubmit:
kfree_skb(skb);
return;
}
- nf_reset(skb);
+ nf_reset_ct(skb);
}
ret = INDIRECT_CALL_2(ipprot->handler, tcp_v4_rcv, udp_rcv,
skb);
@@ -302,16 +302,31 @@ drop:
return true;
}
+static bool ip_can_use_hint(const struct sk_buff *skb, const struct iphdr *iph,
+ const struct sk_buff *hint)
+{
+ return hint && !skb_dst(skb) && ip_hdr(hint)->daddr == iph->daddr &&
+ ip_hdr(hint)->tos == iph->tos;
+}
+
INDIRECT_CALLABLE_DECLARE(int udp_v4_early_demux(struct sk_buff *));
INDIRECT_CALLABLE_DECLARE(int tcp_v4_early_demux(struct sk_buff *));
static int ip_rcv_finish_core(struct net *net, struct sock *sk,
- struct sk_buff *skb, struct net_device *dev)
+ struct sk_buff *skb, struct net_device *dev,
+ const struct sk_buff *hint)
{
const struct iphdr *iph = ip_hdr(skb);
int (*edemux)(struct sk_buff *skb);
struct rtable *rt;
int err;
+ if (ip_can_use_hint(skb, iph, hint)) {
+ err = ip_route_use_hint(skb, iph->daddr, iph->saddr, iph->tos,
+ dev, hint);
+ if (unlikely(err))
+ goto drop_error;
+ }
+
if (net->ipv4.sysctl_ip_early_demux &&
!skb_dst(skb) &&
!skb->sk &&
@@ -408,7 +423,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
if (!skb)
return NET_RX_SUCCESS;
- ret = ip_rcv_finish_core(net, sk, skb, dev);
+ ret = ip_rcv_finish_core(net, sk, skb, dev, NULL);
if (ret != NET_RX_DROP)
ret = dst_input(skb);
return ret;
@@ -535,11 +550,20 @@ static void ip_sublist_rcv_finish(struct list_head *head)
}
}
+static struct sk_buff *ip_extract_route_hint(const struct net *net,
+ struct sk_buff *skb, int rt_type)
+{
+ if (fib4_has_custom_rules(net) || rt_type == RTN_BROADCAST)
+ return NULL;
+
+ return skb;
+}
+
static void ip_list_rcv_finish(struct net *net, struct sock *sk,
struct list_head *head)
{
+ struct sk_buff *skb, *next, *hint = NULL;
struct dst_entry *curr_dst = NULL;
- struct sk_buff *skb, *next;
struct list_head sublist;
INIT_LIST_HEAD(&sublist);
@@ -554,11 +578,14 @@ static void ip_list_rcv_finish(struct net *net, struct sock *sk,
skb = l3mdev_ip_rcv(skb);
if (!skb)
continue;
- if (ip_rcv_finish_core(net, sk, skb, dev) == NET_RX_DROP)
+ if (ip_rcv_finish_core(net, sk, skb, dev, hint) == NET_RX_DROP)
continue;
dst = skb_dst(skb);
if (curr_dst != dst) {
+ hint = ip_extract_route_hint(net, skb,
+ ((struct rtable *)dst)->rt_type);
+
/* dispatch old sublist */
if (!list_empty(&sublist))
ip_sublist_rcv_finish(&sublist);
@@ -611,5 +638,6 @@ void ip_list_rcv(struct list_head *head, struct packet_type *pt,
list_add_tail(&skb->list, &sublist);
}
/* dispatch final sublist */
- ip_sublist_rcv(&sublist, curr_dev, curr_net);
+ if (!list_empty(&sublist))
+ ip_sublist_rcv(&sublist, curr_dev, curr_net);
}
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index cc7ef0d05bbd..d84819893db9 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -240,8 +240,8 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s
static int ip_finish_output_gso(struct net *net, struct sock *sk,
struct sk_buff *skb, unsigned int mtu)
{
+ struct sk_buff *segs, *nskb;
netdev_features_t features;
- struct sk_buff *segs;
int ret = 0;
/* common case: seglen is <= mtu
@@ -272,8 +272,7 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk,
consume_skb(skb);
- do {
- struct sk_buff *nskb = segs->next;
+ skb_list_walk_safe(segs, segs, nskb) {
int err;
skb_mark_not_on_list(segs);
@@ -281,8 +280,7 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk,
if (err && ret == 0)
ret = err;
- segs = nskb;
- } while (segs);
+ }
return ret;
}
@@ -422,7 +420,7 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- struct net_device *dev = skb_dst(skb)->dev;
+ struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);
@@ -430,7 +428,7 @@ int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb)
skb->protocol = htons(ETH_P_IP);
return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
- net, sk, skb, NULL, dev,
+ net, sk, skb, indev, dev,
ip_finish_output,
!(IPCB(skb)->flags & IPSKB_REROUTED));
}
@@ -499,7 +497,7 @@ int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
skb_dst_set_noref(skb, &rt->dst);
packet_routed:
- if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_gw_family)
+ if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway)
goto no_route;
/* OK, we know where to send it, allocate and build IP header. */
@@ -645,11 +643,12 @@ void ip_fraglist_prepare(struct sk_buff *skb, struct ip_fraglist_iter *iter)
EXPORT_SYMBOL(ip_fraglist_prepare);
void ip_frag_init(struct sk_buff *skb, unsigned int hlen,
- unsigned int ll_rs, unsigned int mtu,
+ unsigned int ll_rs, unsigned int mtu, bool DF,
struct ip_frag_state *state)
{
struct iphdr *iph = ip_hdr(skb);
+ state->DF = DF;
state->hlen = hlen;
state->ll_rs = ll_rs;
state->mtu = mtu;
@@ -668,9 +667,6 @@ static void ip_frag_ipcb(struct sk_buff *from, struct sk_buff *to,
/* Copy the flags to each fragment. */
IPCB(to)->flags = IPCB(from)->flags;
- if (IPCB(from)->flags & IPSKB_FRAG_PMTU)
- state->iph->frag_off |= htons(IP_DF);
-
/* ANK: dirty, but effective trick. Upgrade options only if
* the segment to be fragmented was THE FIRST (otherwise,
* options are already fixed) and make it ONCE
@@ -738,6 +734,8 @@ struct sk_buff *ip_frag_next(struct sk_buff *skb, struct ip_frag_state *state)
*/
iph = ip_hdr(skb2);
iph->frag_off = htons((state->offset >> 3));
+ if (state->DF)
+ iph->frag_off |= htons(IP_DF);
/*
* Added AC : If we are fragmenting a fragment that's not the
@@ -771,6 +769,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
struct rtable *rt = skb_rtable(skb);
unsigned int mtu, hlen, ll_rs;
struct ip_fraglist_iter iter;
+ ktime_t tstamp = skb->tstamp;
struct ip_frag_state state;
int err = 0;
@@ -846,6 +845,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
ip_fraglist_prepare(skb, &iter);
}
+ skb->tstamp = tstamp;
err = output(net, sk, skb);
if (!err)
@@ -881,7 +881,8 @@ slow_path:
* Fragment the datagram.
*/
- ip_frag_init(skb, hlen, ll_rs, mtu, &state);
+ ip_frag_init(skb, hlen, ll_rs, mtu, IPCB(skb)->flags & IPSKB_FRAG_PMTU,
+ &state);
/*
* Keep copying data until we run out.
@@ -900,6 +901,7 @@ slow_path:
/*
* Put this fragment into the sending queue.
*/
+ skb2->tstamp = tstamp;
err = output(net, sk, skb2);
if (err)
goto fail;
@@ -1254,18 +1256,22 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
cork->addr = ipc->addr;
}
- /*
- * We steal reference to this route, caller should not release it
- */
- *rtp = NULL;
cork->fragsize = ip_sk_use_pmtu(sk) ?
- dst_mtu(&rt->dst) : rt->dst.dev->mtu;
+ dst_mtu(&rt->dst) : READ_ONCE(rt->dst.dev->mtu);
+
+ if (!inetdev_valid_mtu(cork->fragsize))
+ return -ENETUNREACH;
cork->gso_size = ipc->gso_size;
+
cork->dst = &rt->dst;
+ /* We stole this route, caller should not release it. */
+ *rtp = NULL;
+
cork->length = 0;
cork->ttl = ipc->ttl;
cork->tos = ipc->tos;
+ cork->mark = ipc->sockc.mark;
cork->priority = ipc->priority;
cork->transmit_time = ipc->sockc.transmit_time;
cork->tx_flags = 0;
@@ -1529,7 +1535,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
}
skb->priority = (cork->tos != -1) ? cork->priority: sk->sk_priority;
- skb->mark = sk->sk_mark;
+ skb->mark = cork->mark;
skb->tstamp = cork->transmit_time;
/*
* Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
@@ -1693,7 +1699,6 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
inet_sk(sk)->tos = arg->tos;
- sk->sk_priority = skb->priority;
sk->sk_protocol = ip_hdr(skb)->protocol;
sk->sk_bound_dev_if = arg->bound_dev_if;
sk->sk_sndbuf = sysctl_wmem_default;
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 38c02bb62e2c..74e1d964a615 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -505,7 +505,7 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
if (skb_valid_dst(skb))
- skb_dst_update_pmtu(skb, mtu);
+ skb_dst_update_pmtu_no_confirm(skb, mtu);
if (skb->protocol == htons(ETH_P_IP)) {
if (!skb_is_gso(skb) &&
@@ -1236,10 +1236,8 @@ int ip_tunnel_init(struct net_device *dev)
iph->version = 4;
iph->ihl = 5;
- if (tunnel->collect_md) {
- dev->features |= NETIF_F_NETNS_LOCAL;
+ if (tunnel->collect_md)
netif_keep_dst(dev);
- }
return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 1452a97914a0..47f8b947eef1 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -34,6 +34,9 @@
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/dst_metadata.h>
+#include <net/geneve.h>
+#include <net/vxlan.h>
+#include <net/erspan.h>
const struct ip_tunnel_encap_ops __rcu *
iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;
@@ -126,15 +129,14 @@ struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md,
if (!md || md->type != METADATA_IP_TUNNEL ||
md->u.tun_info.mode & IP_TUNNEL_INFO_TX)
-
return NULL;
- res = metadata_dst_alloc(0, METADATA_IP_TUNNEL, flags);
+ src = &md->u.tun_info;
+ res = metadata_dst_alloc(src->options_len, METADATA_IP_TUNNEL, flags);
if (!res)
return NULL;
dst = &res->u.tun_info;
- src = &md->u.tun_info;
dst->key.tun_id = src->key.tun_id;
if (src->mode & IP_TUNNEL_INFO_IPV6)
memcpy(&dst->key.u.ipv6.dst, &src->key.u.ipv6.src,
@@ -143,6 +145,8 @@ struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md,
dst->key.u.ipv4.dst = src->key.u.ipv4.src;
dst->key.tun_flags = src->key.tun_flags;
dst->mode = src->mode | IP_TUNNEL_INFO_TX;
+ ip_tunnel_info_opts_set(dst, ip_tunnel_info_opts(src),
+ src->options_len, 0);
return res;
}
@@ -211,30 +215,243 @@ void ip_tunnel_get_stats64(struct net_device *dev,
EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64);
static const struct nla_policy ip_tun_policy[LWTUNNEL_IP_MAX + 1] = {
+ [LWTUNNEL_IP_UNSPEC] = { .strict_start_type = LWTUNNEL_IP_OPTS },
[LWTUNNEL_IP_ID] = { .type = NLA_U64 },
[LWTUNNEL_IP_DST] = { .type = NLA_U32 },
[LWTUNNEL_IP_SRC] = { .type = NLA_U32 },
[LWTUNNEL_IP_TTL] = { .type = NLA_U8 },
[LWTUNNEL_IP_TOS] = { .type = NLA_U8 },
[LWTUNNEL_IP_FLAGS] = { .type = NLA_U16 },
+ [LWTUNNEL_IP_OPTS] = { .type = NLA_NESTED },
+};
+
+static const struct nla_policy ip_opts_policy[LWTUNNEL_IP_OPTS_MAX + 1] = {
+ [LWTUNNEL_IP_OPTS_GENEVE] = { .type = NLA_NESTED },
+ [LWTUNNEL_IP_OPTS_VXLAN] = { .type = NLA_NESTED },
+ [LWTUNNEL_IP_OPTS_ERSPAN] = { .type = NLA_NESTED },
+};
+
+static const struct nla_policy
+geneve_opt_policy[LWTUNNEL_IP_OPT_GENEVE_MAX + 1] = {
+ [LWTUNNEL_IP_OPT_GENEVE_CLASS] = { .type = NLA_U16 },
+ [LWTUNNEL_IP_OPT_GENEVE_TYPE] = { .type = NLA_U8 },
+ [LWTUNNEL_IP_OPT_GENEVE_DATA] = { .type = NLA_BINARY, .len = 128 },
+};
+
+static const struct nla_policy
+vxlan_opt_policy[LWTUNNEL_IP_OPT_VXLAN_MAX + 1] = {
+ [LWTUNNEL_IP_OPT_VXLAN_GBP] = { .type = NLA_U32 },
};
+static const struct nla_policy
+erspan_opt_policy[LWTUNNEL_IP_OPT_ERSPAN_MAX + 1] = {
+ [LWTUNNEL_IP_OPT_ERSPAN_VER] = { .type = NLA_U8 },
+ [LWTUNNEL_IP_OPT_ERSPAN_INDEX] = { .type = NLA_U32 },
+ [LWTUNNEL_IP_OPT_ERSPAN_DIR] = { .type = NLA_U8 },
+ [LWTUNNEL_IP_OPT_ERSPAN_HWID] = { .type = NLA_U8 },
+};
+
+static int ip_tun_parse_opts_geneve(struct nlattr *attr,
+ struct ip_tunnel_info *info, int opts_len,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[LWTUNNEL_IP_OPT_GENEVE_MAX + 1];
+ int data_len, err;
+
+ err = nla_parse_nested(tb, LWTUNNEL_IP_OPT_GENEVE_MAX, attr,
+ geneve_opt_policy, extack);
+ if (err)
+ return err;
+
+ if (!tb[LWTUNNEL_IP_OPT_GENEVE_CLASS] ||
+ !tb[LWTUNNEL_IP_OPT_GENEVE_TYPE] ||
+ !tb[LWTUNNEL_IP_OPT_GENEVE_DATA])
+ return -EINVAL;
+
+ attr = tb[LWTUNNEL_IP_OPT_GENEVE_DATA];
+ data_len = nla_len(attr);
+ if (data_len % 4)
+ return -EINVAL;
+
+ if (info) {
+ struct geneve_opt *opt = ip_tunnel_info_opts(info) + opts_len;
+
+ memcpy(opt->opt_data, nla_data(attr), data_len);
+ opt->length = data_len / 4;
+ attr = tb[LWTUNNEL_IP_OPT_GENEVE_CLASS];
+ opt->opt_class = nla_get_be16(attr);
+ attr = tb[LWTUNNEL_IP_OPT_GENEVE_TYPE];
+ opt->type = nla_get_u8(attr);
+ info->key.tun_flags |= TUNNEL_GENEVE_OPT;
+ }
+
+ return sizeof(struct geneve_opt) + data_len;
+}
+
+static int ip_tun_parse_opts_vxlan(struct nlattr *attr,
+ struct ip_tunnel_info *info, int opts_len,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[LWTUNNEL_IP_OPT_VXLAN_MAX + 1];
+ int err;
+
+ err = nla_parse_nested(tb, LWTUNNEL_IP_OPT_VXLAN_MAX, attr,
+ vxlan_opt_policy, extack);
+ if (err)
+ return err;
+
+ if (!tb[LWTUNNEL_IP_OPT_VXLAN_GBP])
+ return -EINVAL;
+
+ if (info) {
+ struct vxlan_metadata *md =
+ ip_tunnel_info_opts(info) + opts_len;
+
+ attr = tb[LWTUNNEL_IP_OPT_VXLAN_GBP];
+ md->gbp = nla_get_u32(attr);
+ info->key.tun_flags |= TUNNEL_VXLAN_OPT;
+ }
+
+ return sizeof(struct vxlan_metadata);
+}
+
+static int ip_tun_parse_opts_erspan(struct nlattr *attr,
+ struct ip_tunnel_info *info, int opts_len,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[LWTUNNEL_IP_OPT_ERSPAN_MAX + 1];
+ int err;
+ u8 ver;
+
+ err = nla_parse_nested(tb, LWTUNNEL_IP_OPT_ERSPAN_MAX, attr,
+ erspan_opt_policy, extack);
+ if (err)
+ return err;
+
+ if (!tb[LWTUNNEL_IP_OPT_ERSPAN_VER])
+ return -EINVAL;
+
+ ver = nla_get_u8(tb[LWTUNNEL_IP_OPT_ERSPAN_VER]);
+ if (ver == 1) {
+ if (!tb[LWTUNNEL_IP_OPT_ERSPAN_INDEX])
+ return -EINVAL;
+ } else if (ver == 2) {
+ if (!tb[LWTUNNEL_IP_OPT_ERSPAN_DIR] ||
+ !tb[LWTUNNEL_IP_OPT_ERSPAN_HWID])
+ return -EINVAL;
+ } else {
+ return -EINVAL;
+ }
+
+ if (info) {
+ struct erspan_metadata *md =
+ ip_tunnel_info_opts(info) + opts_len;
+
+ md->version = ver;
+ if (ver == 1) {
+ attr = tb[LWTUNNEL_IP_OPT_ERSPAN_INDEX];
+ md->u.index = nla_get_be32(attr);
+ } else {
+ attr = tb[LWTUNNEL_IP_OPT_ERSPAN_DIR];
+ md->u.md2.dir = nla_get_u8(attr);
+ attr = tb[LWTUNNEL_IP_OPT_ERSPAN_HWID];
+ set_hwid(&md->u.md2, nla_get_u8(attr));
+ }
+
+ info->key.tun_flags |= TUNNEL_ERSPAN_OPT;
+ }
+
+ return sizeof(struct erspan_metadata);
+}
+
+static int ip_tun_parse_opts(struct nlattr *attr, struct ip_tunnel_info *info,
+ struct netlink_ext_ack *extack)
+{
+ int err, rem, opt_len, opts_len = 0, type = 0;
+ struct nlattr *nla;
+
+ if (!attr)
+ return 0;
+
+ err = nla_validate(nla_data(attr), nla_len(attr), LWTUNNEL_IP_OPTS_MAX,
+ ip_opts_policy, extack);
+ if (err)
+ return err;
+
+ nla_for_each_attr(nla, nla_data(attr), nla_len(attr), rem) {
+ switch (nla_type(nla)) {
+ case LWTUNNEL_IP_OPTS_GENEVE:
+ if (type && type != TUNNEL_GENEVE_OPT)
+ return -EINVAL;
+ opt_len = ip_tun_parse_opts_geneve(nla, info, opts_len,
+ extack);
+ if (opt_len < 0)
+ return opt_len;
+ opts_len += opt_len;
+ if (opts_len > IP_TUNNEL_OPTS_MAX)
+ return -EINVAL;
+ type = TUNNEL_GENEVE_OPT;
+ break;
+ case LWTUNNEL_IP_OPTS_VXLAN:
+ if (type)
+ return -EINVAL;
+ opt_len = ip_tun_parse_opts_vxlan(nla, info, opts_len,
+ extack);
+ if (opt_len < 0)
+ return opt_len;
+ opts_len += opt_len;
+ type = TUNNEL_VXLAN_OPT;
+ break;
+ case LWTUNNEL_IP_OPTS_ERSPAN:
+ if (type)
+ return -EINVAL;
+ opt_len = ip_tun_parse_opts_erspan(nla, info, opts_len,
+ extack);
+ if (opt_len < 0)
+ return opt_len;
+ opts_len += opt_len;
+ type = TUNNEL_ERSPAN_OPT;
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+
+ return opts_len;
+}
+
+static int ip_tun_get_optlen(struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ return ip_tun_parse_opts(attr, NULL, extack);
+}
+
+static int ip_tun_set_opts(struct nlattr *attr, struct ip_tunnel_info *info,
+ struct netlink_ext_ack *extack)
+{
+ return ip_tun_parse_opts(attr, info, extack);
+}
+
static int ip_tun_build_state(struct nlattr *attr,
unsigned int family, const void *cfg,
struct lwtunnel_state **ts,
struct netlink_ext_ack *extack)
{
- struct ip_tunnel_info *tun_info;
- struct lwtunnel_state *new_state;
struct nlattr *tb[LWTUNNEL_IP_MAX + 1];
- int err;
+ struct lwtunnel_state *new_state;
+ struct ip_tunnel_info *tun_info;
+ int err, opt_len;
err = nla_parse_nested_deprecated(tb, LWTUNNEL_IP_MAX, attr,
ip_tun_policy, extack);
if (err < 0)
return err;
- new_state = lwtunnel_state_alloc(sizeof(*tun_info));
+ opt_len = ip_tun_get_optlen(tb[LWTUNNEL_IP_OPTS], extack);
+ if (opt_len < 0)
+ return opt_len;
+
+ new_state = lwtunnel_state_alloc(sizeof(*tun_info) + opt_len);
if (!new_state)
return -ENOMEM;
@@ -242,6 +459,12 @@ static int ip_tun_build_state(struct nlattr *attr,
tun_info = lwt_tun_info(new_state);
+ err = ip_tun_set_opts(tb[LWTUNNEL_IP_OPTS], tun_info, extack);
+ if (err < 0) {
+ lwtstate_free(new_state);
+ return err;
+ }
+
#ifdef CONFIG_DST_CACHE
err = dst_cache_init(&tun_info->dst_cache, GFP_KERNEL);
if (err) {
@@ -266,10 +489,12 @@ static int ip_tun_build_state(struct nlattr *attr,
tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP_TOS]);
if (tb[LWTUNNEL_IP_FLAGS])
- tun_info->key.tun_flags = nla_get_be16(tb[LWTUNNEL_IP_FLAGS]);
+ tun_info->key.tun_flags |=
+ (nla_get_be16(tb[LWTUNNEL_IP_FLAGS]) &
+ ~TUNNEL_OPTIONS_PRESENT);
tun_info->mode = IP_TUNNEL_INFO_TX;
- tun_info->options_len = 0;
+ tun_info->options_len = opt_len;
*ts = new_state;
@@ -285,6 +510,114 @@ static void ip_tun_destroy_state(struct lwtunnel_state *lwtstate)
#endif
}
+static int ip_tun_fill_encap_opts_geneve(struct sk_buff *skb,
+ struct ip_tunnel_info *tun_info)
+{
+ struct geneve_opt *opt;
+ struct nlattr *nest;
+ int offset = 0;
+
+ nest = nla_nest_start_noflag(skb, LWTUNNEL_IP_OPTS_GENEVE);
+ if (!nest)
+ return -ENOMEM;
+
+ while (tun_info->options_len > offset) {
+ opt = ip_tunnel_info_opts(tun_info) + offset;
+ if (nla_put_be16(skb, LWTUNNEL_IP_OPT_GENEVE_CLASS,
+ opt->opt_class) ||
+ nla_put_u8(skb, LWTUNNEL_IP_OPT_GENEVE_TYPE, opt->type) ||
+ nla_put(skb, LWTUNNEL_IP_OPT_GENEVE_DATA, opt->length * 4,
+ opt->opt_data)) {
+ nla_nest_cancel(skb, nest);
+ return -ENOMEM;
+ }
+ offset += sizeof(*opt) + opt->length * 4;
+ }
+
+ nla_nest_end(skb, nest);
+ return 0;
+}
+
+static int ip_tun_fill_encap_opts_vxlan(struct sk_buff *skb,
+ struct ip_tunnel_info *tun_info)
+{
+ struct vxlan_metadata *md;
+ struct nlattr *nest;
+
+ nest = nla_nest_start_noflag(skb, LWTUNNEL_IP_OPTS_VXLAN);
+ if (!nest)
+ return -ENOMEM;
+
+ md = ip_tunnel_info_opts(tun_info);
+ if (nla_put_u32(skb, LWTUNNEL_IP_OPT_VXLAN_GBP, md->gbp)) {
+ nla_nest_cancel(skb, nest);
+ return -ENOMEM;
+ }
+
+ nla_nest_end(skb, nest);
+ return 0;
+}
+
+static int ip_tun_fill_encap_opts_erspan(struct sk_buff *skb,
+ struct ip_tunnel_info *tun_info)
+{
+ struct erspan_metadata *md;
+ struct nlattr *nest;
+
+ nest = nla_nest_start_noflag(skb, LWTUNNEL_IP_OPTS_ERSPAN);
+ if (!nest)
+ return -ENOMEM;
+
+ md = ip_tunnel_info_opts(tun_info);
+ if (nla_put_u8(skb, LWTUNNEL_IP_OPT_ERSPAN_VER, md->version))
+ goto err;
+
+ if (md->version == 1 &&
+ nla_put_be32(skb, LWTUNNEL_IP_OPT_ERSPAN_INDEX, md->u.index))
+ goto err;
+
+ if (md->version == 2 &&
+ (nla_put_u8(skb, LWTUNNEL_IP_OPT_ERSPAN_DIR, md->u.md2.dir) ||
+ nla_put_u8(skb, LWTUNNEL_IP_OPT_ERSPAN_HWID,
+ get_hwid(&md->u.md2))))
+ goto err;
+
+ nla_nest_end(skb, nest);
+ return 0;
+err:
+ nla_nest_cancel(skb, nest);
+ return -ENOMEM;
+}
+
+static int ip_tun_fill_encap_opts(struct sk_buff *skb, int type,
+ struct ip_tunnel_info *tun_info)
+{
+ struct nlattr *nest;
+ int err = 0;
+
+ if (!(tun_info->key.tun_flags & TUNNEL_OPTIONS_PRESENT))
+ return 0;
+
+ nest = nla_nest_start_noflag(skb, type);
+ if (!nest)
+ return -ENOMEM;
+
+ if (tun_info->key.tun_flags & TUNNEL_GENEVE_OPT)
+ err = ip_tun_fill_encap_opts_geneve(skb, tun_info);
+ else if (tun_info->key.tun_flags & TUNNEL_VXLAN_OPT)
+ err = ip_tun_fill_encap_opts_vxlan(skb, tun_info);
+ else if (tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT)
+ err = ip_tun_fill_encap_opts_erspan(skb, tun_info);
+
+ if (err) {
+ nla_nest_cancel(skb, nest);
+ return err;
+ }
+
+ nla_nest_end(skb, nest);
+ return 0;
+}
+
static int ip_tun_fill_encap_info(struct sk_buff *skb,
struct lwtunnel_state *lwtstate)
{
@@ -296,12 +629,52 @@ static int ip_tun_fill_encap_info(struct sk_buff *skb,
nla_put_in_addr(skb, LWTUNNEL_IP_SRC, tun_info->key.u.ipv4.src) ||
nla_put_u8(skb, LWTUNNEL_IP_TOS, tun_info->key.tos) ||
nla_put_u8(skb, LWTUNNEL_IP_TTL, tun_info->key.ttl) ||
- nla_put_be16(skb, LWTUNNEL_IP_FLAGS, tun_info->key.tun_flags))
+ nla_put_be16(skb, LWTUNNEL_IP_FLAGS, tun_info->key.tun_flags) ||
+ ip_tun_fill_encap_opts(skb, LWTUNNEL_IP_OPTS, tun_info))
return -ENOMEM;
return 0;
}
+static int ip_tun_opts_nlsize(struct ip_tunnel_info *info)
+{
+ int opt_len;
+
+ if (!(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT))
+ return 0;
+
+ opt_len = nla_total_size(0); /* LWTUNNEL_IP_OPTS */
+ if (info->key.tun_flags & TUNNEL_GENEVE_OPT) {
+ struct geneve_opt *opt;
+ int offset = 0;
+
+ opt_len += nla_total_size(0); /* LWTUNNEL_IP_OPTS_GENEVE */
+ while (info->options_len > offset) {
+ opt = ip_tunnel_info_opts(info) + offset;
+ opt_len += nla_total_size(2) /* OPT_GENEVE_CLASS */
+ + nla_total_size(1) /* OPT_GENEVE_TYPE */
+ + nla_total_size(opt->length * 4);
+ /* OPT_GENEVE_DATA */
+ offset += sizeof(*opt) + opt->length * 4;
+ }
+ } else if (info->key.tun_flags & TUNNEL_VXLAN_OPT) {
+ opt_len += nla_total_size(0) /* LWTUNNEL_IP_OPTS_VXLAN */
+ + nla_total_size(4); /* OPT_VXLAN_GBP */
+ } else if (info->key.tun_flags & TUNNEL_ERSPAN_OPT) {
+ struct erspan_metadata *md = ip_tunnel_info_opts(info);
+
+ opt_len += nla_total_size(0) /* LWTUNNEL_IP_OPTS_ERSPAN */
+ + nla_total_size(1) /* OPT_ERSPAN_VER */
+ + (md->version == 1 ? nla_total_size(4)
+ /* OPT_ERSPAN_INDEX (v1) */
+ : nla_total_size(1) +
+ nla_total_size(1));
+ /* OPT_ERSPAN_DIR + HWID (v2) */
+ }
+
+ return opt_len;
+}
+
static int ip_tun_encap_nlsize(struct lwtunnel_state *lwtstate)
{
return nla_total_size_64bit(8) /* LWTUNNEL_IP_ID */
@@ -309,13 +682,21 @@ static int ip_tun_encap_nlsize(struct lwtunnel_state *lwtstate)
+ nla_total_size(4) /* LWTUNNEL_IP_SRC */
+ nla_total_size(1) /* LWTUNNEL_IP_TOS */
+ nla_total_size(1) /* LWTUNNEL_IP_TTL */
- + nla_total_size(2); /* LWTUNNEL_IP_FLAGS */
+ + nla_total_size(2) /* LWTUNNEL_IP_FLAGS */
+ + ip_tun_opts_nlsize(lwt_tun_info(lwtstate));
+ /* LWTUNNEL_IP_OPTS */
}
static int ip_tun_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b)
{
- return memcmp(lwt_tun_info(a), lwt_tun_info(b),
- sizeof(struct ip_tunnel_info));
+ struct ip_tunnel_info *info_a = lwt_tun_info(a);
+ struct ip_tunnel_info *info_b = lwt_tun_info(b);
+
+ return memcmp(info_a, info_b, sizeof(info_a->key)) ||
+ info_a->mode != info_b->mode ||
+ info_a->options_len != info_b->options_len ||
+ memcmp(ip_tunnel_info_opts(info_a),
+ ip_tunnel_info_opts(info_b), info_a->options_len);
}
static const struct lwtunnel_encap_ops ip_tun_lwt_ops = {
@@ -328,12 +709,14 @@ static const struct lwtunnel_encap_ops ip_tun_lwt_ops = {
};
static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = {
+ [LWTUNNEL_IP6_UNSPEC] = { .strict_start_type = LWTUNNEL_IP6_OPTS },
[LWTUNNEL_IP6_ID] = { .type = NLA_U64 },
[LWTUNNEL_IP6_DST] = { .len = sizeof(struct in6_addr) },
[LWTUNNEL_IP6_SRC] = { .len = sizeof(struct in6_addr) },
[LWTUNNEL_IP6_HOPLIMIT] = { .type = NLA_U8 },
[LWTUNNEL_IP6_TC] = { .type = NLA_U8 },
[LWTUNNEL_IP6_FLAGS] = { .type = NLA_U16 },
+ [LWTUNNEL_IP6_OPTS] = { .type = NLA_NESTED },
};
static int ip6_tun_build_state(struct nlattr *attr,
@@ -341,17 +724,21 @@ static int ip6_tun_build_state(struct nlattr *attr,
struct lwtunnel_state **ts,
struct netlink_ext_ack *extack)
{
- struct ip_tunnel_info *tun_info;
- struct lwtunnel_state *new_state;
struct nlattr *tb[LWTUNNEL_IP6_MAX + 1];
- int err;
+ struct lwtunnel_state *new_state;
+ struct ip_tunnel_info *tun_info;
+ int err, opt_len;
err = nla_parse_nested_deprecated(tb, LWTUNNEL_IP6_MAX, attr,
ip6_tun_policy, extack);
if (err < 0)
return err;
- new_state = lwtunnel_state_alloc(sizeof(*tun_info));
+ opt_len = ip_tun_get_optlen(tb[LWTUNNEL_IP6_OPTS], extack);
+ if (opt_len < 0)
+ return opt_len;
+
+ new_state = lwtunnel_state_alloc(sizeof(*tun_info) + opt_len);
if (!new_state)
return -ENOMEM;
@@ -359,6 +746,12 @@ static int ip6_tun_build_state(struct nlattr *attr,
tun_info = lwt_tun_info(new_state);
+ err = ip_tun_set_opts(tb[LWTUNNEL_IP6_OPTS], tun_info, extack);
+ if (err < 0) {
+ lwtstate_free(new_state);
+ return err;
+ }
+
if (tb[LWTUNNEL_IP6_ID])
tun_info->key.tun_id = nla_get_be64(tb[LWTUNNEL_IP6_ID]);
@@ -375,10 +768,12 @@ static int ip6_tun_build_state(struct nlattr *attr,
tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP6_TC]);
if (tb[LWTUNNEL_IP6_FLAGS])
- tun_info->key.tun_flags = nla_get_be16(tb[LWTUNNEL_IP6_FLAGS]);
+ tun_info->key.tun_flags |=
+ (nla_get_be16(tb[LWTUNNEL_IP6_FLAGS]) &
+ ~TUNNEL_OPTIONS_PRESENT);
tun_info->mode = IP_TUNNEL_INFO_TX | IP_TUNNEL_INFO_IPV6;
- tun_info->options_len = 0;
+ tun_info->options_len = opt_len;
*ts = new_state;
@@ -396,7 +791,8 @@ static int ip6_tun_fill_encap_info(struct sk_buff *skb,
nla_put_in6_addr(skb, LWTUNNEL_IP6_SRC, &tun_info->key.u.ipv6.src) ||
nla_put_u8(skb, LWTUNNEL_IP6_TC, tun_info->key.tos) ||
nla_put_u8(skb, LWTUNNEL_IP6_HOPLIMIT, tun_info->key.ttl) ||
- nla_put_be16(skb, LWTUNNEL_IP6_FLAGS, tun_info->key.tun_flags))
+ nla_put_be16(skb, LWTUNNEL_IP6_FLAGS, tun_info->key.tun_flags) ||
+ ip_tun_fill_encap_opts(skb, LWTUNNEL_IP6_OPTS, tun_info))
return -ENOMEM;
return 0;
@@ -409,7 +805,9 @@ static int ip6_tun_encap_nlsize(struct lwtunnel_state *lwtstate)
+ nla_total_size(16) /* LWTUNNEL_IP6_SRC */
+ nla_total_size(1) /* LWTUNNEL_IP6_HOPLIMIT */
+ nla_total_size(1) /* LWTUNNEL_IP6_TC */
- + nla_total_size(2); /* LWTUNNEL_IP6_FLAGS */
+ + nla_total_size(2) /* LWTUNNEL_IP6_FLAGS */
+ + ip_tun_opts_nlsize(lwt_tun_info(lwtstate));
+ /* LWTUNNEL_IP6_OPTS */
}
static const struct lwtunnel_encap_ops ip6_tun_lwt_ops = {
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index cfb025606793..37cddd18f282 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -187,8 +187,17 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev,
int mtu;
if (!dst) {
- dev->stats.tx_carrier_errors++;
- goto tx_error_icmp;
+ struct rtable *rt;
+
+ fl->u.ip4.flowi4_oif = dev->ifindex;
+ fl->u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC;
+ rt = __ip_route_output_key(dev_net(dev), &fl->u.ip4);
+ if (IS_ERR(rt)) {
+ dev->stats.tx_carrier_errors++;
+ goto tx_error_icmp;
+ }
+ dst = &rt->dst;
+ skb_dst_set(skb, dst);
}
dst_hold(dst);
@@ -214,7 +223,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev,
mtu = dst_mtu(dst);
if (skb->len > mtu) {
- skb_dst_update_pmtu(skb, mtu);
+ skb_dst_update_pmtu_no_confirm(skb, mtu);
if (skb->protocol == htons(ETH_P_IP)) {
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
htonl(mtu));
@@ -580,8 +589,8 @@ static const struct nla_policy vti_policy[IFLA_VTI_MAX + 1] = {
[IFLA_VTI_LINK] = { .type = NLA_U32 },
[IFLA_VTI_IKEY] = { .type = NLA_U32 },
[IFLA_VTI_OKEY] = { .type = NLA_U32 },
- [IFLA_VTI_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
- [IFLA_VTI_REMOTE] = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
+ [IFLA_VTI_LOCAL] = { .len = sizeof_field(struct iphdr, saddr) },
+ [IFLA_VTI_REMOTE] = { .len = sizeof_field(struct iphdr, daddr) },
[IFLA_VTI_FWMARK] = { .type = NLA_U32 },
};
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 9bcca08efec9..4438f6b12335 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -1334,7 +1334,7 @@ static int __init ipconfig_proc_net_init(void)
/* Create a new file under /proc/net/ipconfig */
static int ipconfig_proc_net_create(const char *name,
- const struct file_operations *fops)
+ const struct proc_ops *proc_ops)
{
char *pname;
struct proc_dir_entry *p;
@@ -1346,7 +1346,7 @@ static int ipconfig_proc_net_create(const char *name,
if (!pname)
return -ENOMEM;
- p = proc_create(pname, 0444, init_net.proc_net, fops);
+ p = proc_create(pname, 0444, init_net.proc_net, proc_ops);
kfree(pname);
if (!p)
return -ENOMEM;
@@ -1355,7 +1355,7 @@ static int ipconfig_proc_net_create(const char *name,
}
/* Write NTP server IP addresses to /proc/net/ipconfig/ntp_servers */
-static int ntp_servers_seq_show(struct seq_file *seq, void *v)
+static int ntp_servers_show(struct seq_file *seq, void *v)
{
int i;
@@ -1365,7 +1365,7 @@ static int ntp_servers_seq_show(struct seq_file *seq, void *v)
}
return 0;
}
-DEFINE_SHOW_ATTRIBUTE(ntp_servers_seq);
+DEFINE_PROC_SHOW_ATTRIBUTE(ntp_servers);
#endif /* CONFIG_PROC_FS */
/*
@@ -1412,6 +1412,9 @@ static int __init wait_for_devices(void)
struct net_device *dev;
int found = 0;
+ /* make sure deferred device probes are finished */
+ wait_for_device_probe();
+
rtnl_lock();
for_each_netdev(&init_net, dev) {
if (ic_is_init_dev(dev)) {
@@ -1453,7 +1456,7 @@ static int __init ip_auto_config(void)
proc_create_single("pnp", 0444, init_net.proc_net, pnp_seq_show);
if (ipconfig_proc_net_init() == 0)
- ipconfig_proc_net_create("ntp_servers", &ntp_servers_seq_fops);
+ ipconfig_proc_net_create("ntp_servers", &ntp_servers_proc_ops);
#endif /* CONFIG_PROC_FS */
if (!ic_enable)
@@ -1483,10 +1486,10 @@ static int __init ip_auto_config(void)
* missing values.
*/
if (ic_myaddr == NONE ||
-#ifdef CONFIG_ROOT_NFS
+#if defined(CONFIG_ROOT_NFS) || defined(CONFIG_CIFS_ROOT)
(root_server_addr == NONE &&
ic_servaddr == NONE &&
- ROOT_DEV == Root_NFS) ||
+ (ROOT_DEV == Root_NFS || ROOT_DEV == Root_CIFS)) ||
#endif
ic_first_dev->next) {
#ifdef IPCONFIG_DYNAMIC
@@ -1513,6 +1516,12 @@ static int __init ip_auto_config(void)
goto try_try_again;
}
#endif
+#ifdef CONFIG_CIFS_ROOT
+ if (ROOT_DEV == Root_CIFS) {
+ pr_err("IP-Config: Retrying forever (CIFS root)...\n");
+ goto try_try_again;
+ }
+#endif
if (--retries) {
pr_err("IP-Config: Reopening network devices...\n");
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index c07bc82cbbe9..6e68def66822 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -278,9 +278,10 @@ static void __net_exit ipmr_rules_exit(struct net *net)
rtnl_unlock();
}
-static int ipmr_rules_dump(struct net *net, struct notifier_block *nb)
+static int ipmr_rules_dump(struct net *net, struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
{
- return fib_rules_dump(net, nb, RTNL_FAMILY_IPMR);
+ return fib_rules_dump(net, nb, RTNL_FAMILY_IPMR, extack);
}
static unsigned int ipmr_rules_seq_read(struct net *net)
@@ -336,7 +337,8 @@ static void __net_exit ipmr_rules_exit(struct net *net)
rtnl_unlock();
}
-static int ipmr_rules_dump(struct net *net, struct notifier_block *nb)
+static int ipmr_rules_dump(struct net *net, struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
{
return 0;
}
@@ -1134,8 +1136,8 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
if (!found) {
/* Create a new entry if allowable */
- if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 ||
- (c = ipmr_cache_alloc_unres()) == NULL) {
+ c = ipmr_cache_alloc_unres();
+ if (!c) {
spin_unlock_bh(&mfc_unres_lock);
kfree_skb(skb);
@@ -1794,7 +1796,7 @@ static void ip_encap(struct net *net, struct sk_buff *skb,
ip_send_check(iph);
memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
- nf_reset(skb);
+ nf_reset_ct(skb);
}
static inline int ipmr_forward_finish(struct net *net, struct sock *sk,
@@ -2140,7 +2142,7 @@ int ip_mr_input(struct sk_buff *skb)
mroute_sk = rcu_dereference(mrt->mroute_sk);
if (mroute_sk) {
- nf_reset(skb);
+ nf_reset_ct(skb);
raw_rcv(mroute_sk, skb);
return 0;
}
@@ -2289,7 +2291,8 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb,
rcu_read_unlock();
return -ENODEV;
}
- skb2 = skb_clone(skb, GFP_ATOMIC);
+
+ skb2 = skb_realloc_headroom(skb, sizeof(struct iphdr));
if (!skb2) {
read_unlock(&mrt_lock);
rcu_read_unlock();
@@ -3040,10 +3043,11 @@ static unsigned int ipmr_seq_read(struct net *net)
return net->ipv4.ipmr_seq + ipmr_rules_seq_read(net);
}
-static int ipmr_dump(struct net *net, struct notifier_block *nb)
+static int ipmr_dump(struct net *net, struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
{
return mr_dump(net, nb, RTNL_FAMILY_IPMR, ipmr_rules_dump,
- ipmr_mr_table_iter, &mrt_lock);
+ ipmr_mr_table_iter, &mrt_lock, extack);
}
static const struct fib_notifier_ops ipmr_notifier_ops_template = {
diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c
index ea48bd15a575..aa8738a91210 100644
--- a/net/ipv4/ipmr_base.c
+++ b/net/ipv4/ipmr_base.c
@@ -386,15 +386,17 @@ EXPORT_SYMBOL(mr_rtm_dumproute);
int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family,
int (*rules_dump)(struct net *net,
- struct notifier_block *nb),
+ struct notifier_block *nb,
+ struct netlink_ext_ack *extack),
struct mr_table *(*mr_iter)(struct net *net,
struct mr_table *mrt),
- rwlock_t *mrt_lock)
+ rwlock_t *mrt_lock,
+ struct netlink_ext_ack *extack)
{
struct mr_table *mrt;
int err;
- err = rules_dump(net, nb);
+ err = rules_dump(net, nb, extack);
if (err)
return err;
@@ -409,17 +411,25 @@ int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family,
if (!v->dev)
continue;
- mr_call_vif_notifier(nb, net, family,
- FIB_EVENT_VIF_ADD,
- v, vifi, mrt->id);
+ err = mr_call_vif_notifier(nb, family,
+ FIB_EVENT_VIF_ADD,
+ v, vifi, mrt->id, extack);
+ if (err)
+ break;
}
read_unlock(mrt_lock);
+ if (err)
+ return err;
+
/* Notify on table MFC entries */
- list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list)
- mr_call_mfc_notifier(nb, net, family,
- FIB_EVENT_ENTRY_ADD,
- mfc, mrt->id);
+ list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) {
+ err = mr_call_mfc_notifier(nb, family,
+ FIB_EVENT_ENTRY_ADD,
+ mfc, mrt->id, extack);
+ if (err)
+ return err;
+ }
}
return 0;
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 69e76d677f9e..f17b402111ce 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -272,7 +272,7 @@ config IP_NF_TARGET_CLUSTERIP
The CLUSTERIP target allows you to build load-balancing clusters of
network servers without having a dedicated load-balancing
router/server/switch.
-
+
To compile it as a module, choose M here. If unsure, say N.
config IP_NF_TARGET_ECN
@@ -281,7 +281,7 @@ config IP_NF_TARGET_ECN
depends on NETFILTER_ADVANCED
---help---
This option adds a `ECN' target, which can be used in the iptables mangle
- table.
+ table.
You can use this target to remove the ECN bits from the IPv4 header of
an IP packet. This is particularly useful, if you need to work around
@@ -306,7 +306,7 @@ config IP_NF_RAW
This option adds a `raw' table to iptables. This table is the very
first in the netfilter framework and hooks in at the PREROUTING
and OUTPUT chains.
-
+
If you want to compile it as a module, say M here and read
<file:Documentation/kbuild/modules.rst>. If unsure, say `N'.
@@ -318,7 +318,7 @@ config IP_NF_SECURITY
help
This option adds a `security' table to iptables, for use
with Mandatory Access Control (MAC) policy.
-
+
If unsure, say N.
endif # IP_NF_IPTABLES
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index c50e0ec095d2..7c497c78105f 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -31,7 +31,7 @@ obj-$(CONFIG_NFT_DUP_IPV4) += nft_dup_ipv4.o
# flow table support
obj-$(CONFIG_NF_FLOW_TABLE_IPV4) += nf_flow_table_ipv4.o
-# generic IP tables
+# generic IP tables
obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
# the three instances of ip_tables
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 214154b47d56..f1f78a742b36 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -384,10 +384,11 @@ next: ;
return 1;
}
-static inline int check_target(struct arpt_entry *e, const char *name)
+static int check_target(struct arpt_entry *e, struct net *net, const char *name)
{
struct xt_entry_target *t = arpt_get_target(e);
struct xt_tgchk_param par = {
+ .net = net,
.table = name,
.entryinfo = e,
.target = t->u.kernel.target,
@@ -399,8 +400,9 @@ static inline int check_target(struct arpt_entry *e, const char *name)
return xt_check_target(&par, t->u.target_size - sizeof(*t), 0, false);
}
-static inline int
-find_check_entry(struct arpt_entry *e, const char *name, unsigned int size,
+static int
+find_check_entry(struct arpt_entry *e, struct net *net, const char *name,
+ unsigned int size,
struct xt_percpu_counter_alloc_state *alloc_state)
{
struct xt_entry_target *t;
@@ -419,7 +421,7 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size,
}
t->u.kernel.target = target;
- ret = check_target(e, name);
+ ret = check_target(e, net, name);
if (ret)
goto err;
return 0;
@@ -494,12 +496,13 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e,
return 0;
}
-static inline void cleanup_entry(struct arpt_entry *e)
+static void cleanup_entry(struct arpt_entry *e, struct net *net)
{
struct xt_tgdtor_param par;
struct xt_entry_target *t;
t = arpt_get_target(e);
+ par.net = net;
par.target = t->u.kernel.target;
par.targinfo = t->data;
par.family = NFPROTO_ARP;
@@ -512,7 +515,9 @@ static inline void cleanup_entry(struct arpt_entry *e)
/* Checks and translates the user-supplied table segment (held in
* newinfo).
*/
-static int translate_table(struct xt_table_info *newinfo, void *entry0,
+static int translate_table(struct net *net,
+ struct xt_table_info *newinfo,
+ void *entry0,
const struct arpt_replace *repl)
{
struct xt_percpu_counter_alloc_state alloc_state = { 0 };
@@ -569,7 +574,7 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0,
/* Finally, each sanity check must pass */
i = 0;
xt_entry_foreach(iter, entry0, newinfo->size) {
- ret = find_check_entry(iter, repl->name, repl->size,
+ ret = find_check_entry(iter, net, repl->name, repl->size,
&alloc_state);
if (ret != 0)
break;
@@ -580,7 +585,7 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0,
xt_entry_foreach(iter, entry0, newinfo->size) {
if (i-- == 0)
break;
- cleanup_entry(iter);
+ cleanup_entry(iter, net);
}
return ret;
}
@@ -923,7 +928,7 @@ static int __do_replace(struct net *net, const char *name,
/* Decrease module usage counts and free resource */
loc_cpu_old_entry = oldinfo->entries;
xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size)
- cleanup_entry(iter);
+ cleanup_entry(iter, net);
xt_free_table_info(oldinfo);
if (copy_to_user(counters_ptr, counters,
@@ -974,7 +979,7 @@ static int do_replace(struct net *net, const void __user *user,
goto free_newinfo;
}
- ret = translate_table(newinfo, loc_cpu_entry, &tmp);
+ ret = translate_table(net, newinfo, loc_cpu_entry, &tmp);
if (ret != 0)
goto free_newinfo;
@@ -986,7 +991,7 @@ static int do_replace(struct net *net, const void __user *user,
free_newinfo_untrans:
xt_entry_foreach(iter, loc_cpu_entry, newinfo->size)
- cleanup_entry(iter);
+ cleanup_entry(iter, net);
free_newinfo:
xt_free_table_info(newinfo);
return ret;
@@ -1149,7 +1154,8 @@ compat_copy_entry_from_user(struct compat_arpt_entry *e, void **dstptr,
}
}
-static int translate_compat_table(struct xt_table_info **pinfo,
+static int translate_compat_table(struct net *net,
+ struct xt_table_info **pinfo,
void **pentry0,
const struct compat_arpt_replace *compatr)
{
@@ -1217,7 +1223,7 @@ static int translate_compat_table(struct xt_table_info **pinfo,
repl.num_counters = 0;
repl.counters = NULL;
repl.size = newinfo->size;
- ret = translate_table(newinfo, entry1, &repl);
+ ret = translate_table(net, newinfo, entry1, &repl);
if (ret)
goto free_newinfo;
@@ -1270,7 +1276,7 @@ static int compat_do_replace(struct net *net, void __user *user,
goto free_newinfo;
}
- ret = translate_compat_table(&newinfo, &loc_cpu_entry, &tmp);
+ ret = translate_compat_table(net, &newinfo, &loc_cpu_entry, &tmp);
if (ret != 0)
goto free_newinfo;
@@ -1282,7 +1288,7 @@ static int compat_do_replace(struct net *net, void __user *user,
free_newinfo_untrans:
xt_entry_foreach(iter, loc_cpu_entry, newinfo->size)
- cleanup_entry(iter);
+ cleanup_entry(iter, net);
free_newinfo:
xt_free_table_info(newinfo);
return ret;
@@ -1509,7 +1515,7 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len
return ret;
}
-static void __arpt_unregister_table(struct xt_table *table)
+static void __arpt_unregister_table(struct net *net, struct xt_table *table)
{
struct xt_table_info *private;
void *loc_cpu_entry;
@@ -1521,7 +1527,7 @@ static void __arpt_unregister_table(struct xt_table *table)
/* Decrease module usage counts and free resources */
loc_cpu_entry = private->entries;
xt_entry_foreach(iter, loc_cpu_entry, private->size)
- cleanup_entry(iter);
+ cleanup_entry(iter, net);
if (private->number > private->initial_entries)
module_put(table_owner);
xt_free_table_info(private);
@@ -1546,7 +1552,7 @@ int arpt_register_table(struct net *net,
loc_cpu_entry = newinfo->entries;
memcpy(loc_cpu_entry, repl->entries, repl->size);
- ret = translate_table(newinfo, loc_cpu_entry, repl);
+ ret = translate_table(net, newinfo, loc_cpu_entry, repl);
if (ret != 0)
goto out_free;
@@ -1561,7 +1567,7 @@ int arpt_register_table(struct net *net,
ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks));
if (ret != 0) {
- __arpt_unregister_table(new_table);
+ __arpt_unregister_table(net, new_table);
*res = NULL;
}
@@ -1576,7 +1582,7 @@ void arpt_unregister_table(struct net *net, struct xt_table *table,
const struct nf_hook_ops *ops)
{
nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks));
- __arpt_unregister_table(table);
+ __arpt_unregister_table(net, table);
}
/* The built-in targets: standard (NULL) and error. */
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 6bdb1ab8af61..f8755a4ae9d4 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -58,7 +58,7 @@ struct clusterip_config {
};
#ifdef CONFIG_PROC_FS
-static const struct file_operations clusterip_proc_fops;
+static const struct proc_ops clusterip_proc_ops;
#endif
struct clusterip_net {
@@ -280,7 +280,7 @@ clusterip_config_init(struct net *net, const struct ipt_clusterip_tgt_info *i,
mutex_lock(&cn->mutex);
c->pde = proc_create_data(buffer, 0600,
cn->procdir,
- &clusterip_proc_fops, c);
+ &clusterip_proc_ops, c);
mutex_unlock(&cn->mutex);
if (!c->pde) {
err = -ENOMEM;
@@ -804,12 +804,12 @@ static ssize_t clusterip_proc_write(struct file *file, const char __user *input,
return size;
}
-static const struct file_operations clusterip_proc_fops = {
- .open = clusterip_proc_open,
- .read = seq_read,
- .write = clusterip_proc_write,
- .llseek = seq_lseek,
- .release = clusterip_proc_release,
+static const struct proc_ops clusterip_proc_ops = {
+ .proc_open = clusterip_proc_open,
+ .proc_read = seq_read,
+ .proc_write = clusterip_proc_write,
+ .proc_lseek = seq_lseek,
+ .proc_release = clusterip_proc_release,
};
#endif /* CONFIG_PROC_FS */
diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c
index 0e70f3f65f6f..748dc3ce58d3 100644
--- a/net/ipv4/netfilter/ipt_SYNPROXY.c
+++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -36,8 +36,8 @@ synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par)
opts.options |= XT_SYNPROXY_OPT_ECN;
opts.options &= info->options;
- opts.mss_encode = opts.mss;
- opts.mss = info->mss;
+ opts.mss_encode = opts.mss_option;
+ opts.mss_option = info->mss;
if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
synproxy_init_timestamp_cookie(info, &opts);
else
diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c
index af3fbf76dbd3..6cc5743c553a 100644
--- a/net/ipv4/netfilter/nf_dup_ipv4.c
+++ b/net/ipv4/netfilter/nf_dup_ipv4.c
@@ -65,7 +65,7 @@ void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum,
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
/* Avoid counting cloned packets towards the original connection. */
- nf_reset(skb);
+ nf_reset_ct(skb);
nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
#endif
/*
diff --git a/net/ipv4/netfilter/nf_flow_table_ipv4.c b/net/ipv4/netfilter/nf_flow_table_ipv4.c
index 012c4047c788..e32e41b99f0f 100644
--- a/net/ipv4/netfilter/nf_flow_table_ipv4.c
+++ b/net/ipv4/netfilter/nf_flow_table_ipv4.c
@@ -9,6 +9,8 @@
static struct nf_flowtable_type flowtable_ipv4 = {
.family = NFPROTO_IPV4,
.init = nf_flow_table_init,
+ .setup = nf_flow_table_offload_setup,
+ .action = nf_flow_rule_route_ipv4,
.free = nf_flow_table_free,
.hook = nf_flow_offload_ip_hook,
.owner = THIS_MODULE,
diff --git a/net/ipv4/netfilter/nf_socket_ipv4.c b/net/ipv4/netfilter/nf_socket_ipv4.c
index 36a28d46149c..c94445b44d8c 100644
--- a/net/ipv4/netfilter/nf_socket_ipv4.c
+++ b/net/ipv4/netfilter/nf_socket_ipv4.c
@@ -31,16 +31,8 @@ extract_icmp4_fields(const struct sk_buff *skb, u8 *protocol,
if (icmph == NULL)
return 1;
- switch (icmph->type) {
- case ICMP_DEST_UNREACH:
- case ICMP_SOURCE_QUENCH:
- case ICMP_REDIRECT:
- case ICMP_TIME_EXCEEDED:
- case ICMP_PARAMETERPROB:
- break;
- default:
+ if (!icmp_is_err(icmph->type))
return 1;
- }
inside_iph = skb_header_pointer(skb, outside_hdrlen +
sizeof(struct icmphdr),
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index 5fe5a3981d43..d072c326dd64 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -23,7 +23,6 @@ static void remove_nexthop(struct net *net, struct nexthop *nh,
#define NH_DEV_HASHSIZE (1U << NH_DEV_HASHBITS)
static const struct nla_policy rtm_nh_policy[NHA_MAX + 1] = {
- [NHA_UNSPEC] = { .strict_start_type = NHA_UNSPEC + 1 },
[NHA_ID] = { .type = NLA_U32 },
[NHA_GROUP] = { .type = NLA_BINARY },
[NHA_GROUP_TYPE] = { .type = NLA_U16 },
@@ -322,7 +321,9 @@ static size_t nh_nlmsg_size_single(struct nexthop *nh)
static size_t nh_nlmsg_size(struct nexthop *nh)
{
- size_t sz = nla_total_size(4); /* NHA_ID */
+ size_t sz = NLMSG_ALIGN(sizeof(struct nhmsg));
+
+ sz += nla_total_size(4); /* NHA_ID */
if (nh->is_group)
sz += nh_nlmsg_size_grp(nh);
@@ -1151,7 +1152,7 @@ static int nh_create_ipv4(struct net *net, struct nexthop *nh,
.fc_encap_type = cfg->nh_encap_type,
};
u32 tb_id = l3mdev_fib_table(cfg->dev);
- int err = -EINVAL;
+ int err;
err = fib_nh_init(net, fib_nh, &fib_cfg, 1, extack);
if (err) {
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 9d24ef5c5d8f..535427292194 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -781,7 +781,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
} else if (!ipc.oif)
ipc.oif = inet->uc_index;
- flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos,
+ flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, tos,
RT_SCOPE_UNIVERSE, sk->sk_protocol,
inet_sk_flowi_flags(sk), faddr, saddr, 0, 0,
sk->sk_uid);
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index cc90243ccf76..2580303249e2 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -289,6 +289,8 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPRcvQDrop", LINUX_MIB_TCPRCVQDROP),
SNMP_MIB_ITEM("TCPWqueueTooBig", LINUX_MIB_TCPWQUEUETOOBIG),
SNMP_MIB_ITEM("TCPFastOpenPassiveAltKey", LINUX_MIB_TCPFASTOPENPASSIVEALTKEY),
+ SNMP_MIB_ITEM("TcpTimeoutRehash", LINUX_MIB_TCPTIMEOUTREHASH),
+ SNMP_MIB_ITEM("TcpDuplicateDataRehash", LINUX_MIB_TCPDUPLICATEDATAREHASH),
SNMP_MIB_SENTINEL
};
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 40a6abbc9cf6..3183413ebc6c 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -332,7 +332,7 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb)
kfree_skb(skb);
return NET_RX_DROP;
}
- nf_reset(skb);
+ nf_reset_ct(skb);
skb_push(skb, skb->data - skb_network_header(skb));
@@ -375,7 +375,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
skb_reserve(skb, hlen);
skb->priority = sk->sk_priority;
- skb->mark = sk->sk_mark;
+ skb->mark = sockc->mark;
skb->tstamp = sockc->transmit_time;
skb_dst_set(skb, &rt->dst);
*rtp = NULL;
@@ -623,7 +623,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
}
}
- flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos,
+ flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, tos,
RT_SCOPE_UNIVERSE,
hdrincl ? IPPROTO_RAW : sk->sk_protocol,
inet_sk_flowi_flags(sk) |
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 517300d587a7..ebe7060d0fc9 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -139,7 +139,8 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void ipv4_link_failure(struct sk_buff *skb);
static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
- struct sk_buff *skb, u32 mtu);
+ struct sk_buff *skb, u32 mtu,
+ bool confirm_neigh);
static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
struct sk_buff *skb);
static void ipv4_dst_destroy(struct dst_entry *dst);
@@ -236,11 +237,11 @@ static int rt_cache_seq_open(struct inode *inode, struct file *file)
return seq_open(file, &rt_cache_seq_ops);
}
-static const struct file_operations rt_cache_seq_fops = {
- .open = rt_cache_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
+static const struct proc_ops rt_cache_proc_ops = {
+ .proc_open = rt_cache_seq_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = seq_release,
};
@@ -270,6 +271,7 @@ static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
*pos = cpu+1;
return &per_cpu(rt_cache_stat, cpu);
}
+ (*pos)++;
return NULL;
}
@@ -326,11 +328,11 @@ static int rt_cpu_seq_open(struct inode *inode, struct file *file)
return seq_open(file, &rt_cpu_seq_ops);
}
-static const struct file_operations rt_cpu_seq_fops = {
- .open = rt_cpu_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
+static const struct proc_ops rt_cpu_proc_ops = {
+ .proc_open = rt_cpu_seq_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = seq_release,
};
#ifdef CONFIG_IP_ROUTE_CLASSID
@@ -364,12 +366,12 @@ static int __net_init ip_rt_do_proc_init(struct net *net)
struct proc_dir_entry *pde;
pde = proc_create("rt_cache", 0444, net->proc_net,
- &rt_cache_seq_fops);
+ &rt_cache_proc_ops);
if (!pde)
goto err1;
pde = proc_create("rt_cache", 0444,
- net->proc_net_stat, &rt_cpu_seq_fops);
+ net->proc_net_stat, &rt_cpu_proc_ops);
if (!pde)
goto err2;
@@ -635,6 +637,7 @@ static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnh
if (fnhe->fnhe_gw) {
rt->rt_flags |= RTCF_REDIRECTED;
+ rt->rt_uses_gateway = 1;
rt->rt_gw_family = AF_INET;
rt->rt_gw4 = fnhe->fnhe_gw;
}
@@ -915,16 +918,15 @@ void ip_rt_send_redirect(struct sk_buff *skb)
if (peer->rate_tokens == 0 ||
time_after(jiffies,
(peer->rate_last +
- (ip_rt_redirect_load << peer->rate_tokens)))) {
+ (ip_rt_redirect_load << peer->n_redirects)))) {
__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
peer->rate_last = jiffies;
- ++peer->rate_tokens;
++peer->n_redirects;
#ifdef CONFIG_IP_ROUTE_VERBOSE
if (log_martians &&
- peer->rate_tokens == ip_rt_redirect_number)
+ peer->n_redirects == ip_rt_redirect_number)
net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
&ip_hdr(skb)->saddr, inet_iif(skb),
&ip_hdr(skb)->daddr, &gw);
@@ -1043,7 +1045,8 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
}
static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
- struct sk_buff *skb, u32 mtu)
+ struct sk_buff *skb, u32 mtu,
+ bool confirm_neigh)
{
struct rtable *rt = (struct rtable *) dst;
struct flowi4 fl4;
@@ -1313,7 +1316,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
mtu = READ_ONCE(dst->dev->mtu);
if (unlikely(ip_mtu_locked(dst))) {
- if (rt->rt_gw_family && mtu > 576)
+ if (rt->rt_uses_gateway && mtu > 576)
mtu = 576;
}
@@ -1482,7 +1485,7 @@ static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
prev = cmpxchg(p, orig, rt);
if (prev == orig) {
if (orig) {
- dst_dev_put(&orig->dst);
+ rt_add_uncached_list(orig);
dst_release(&orig->dst);
}
} else {
@@ -1569,6 +1572,7 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
struct fib_nh_common *nhc = FIB_RES_NHC(*res);
if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
+ rt->rt_uses_gateway = 1;
rt->rt_gw_family = nhc->nhc_gw_family;
/* only INET and INET6 are supported */
if (likely(nhc->nhc_gw_family == AF_INET))
@@ -1634,6 +1638,7 @@ struct rtable *rt_dst_alloc(struct net_device *dev,
rt->rt_iif = 0;
rt->rt_pmtu = 0;
rt->rt_mtu_locked = 0;
+ rt->rt_uses_gateway = 0;
rt->rt_gw_family = 0;
rt->rt_gw4 = 0;
INIT_LIST_HEAD(&rt->rt_uncached);
@@ -1892,10 +1897,7 @@ static void ip_multipath_l3_keys(const struct sk_buff *skb,
if (!icmph)
goto out;
- if (icmph->type != ICMP_DEST_UNREACH &&
- icmph->type != ICMP_REDIRECT &&
- icmph->type != ICMP_TIME_EXCEEDED &&
- icmph->type != ICMP_PARAMETERPROB)
+ if (!icmp_is_err(icmph->type))
goto out;
inner_iph = skb_header_pointer(skb,
@@ -2020,10 +2022,52 @@ static int ip_mkroute_input(struct sk_buff *skb,
return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
}
+/* Implements all the saddr-related checks as ip_route_input_slow(),
+ * assuming daddr is valid and the destination is not a local broadcast one.
+ * Uses the provided hint instead of performing a route lookup.
+ */
+int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+ u8 tos, struct net_device *dev,
+ const struct sk_buff *hint)
+{
+ struct in_device *in_dev = __in_dev_get_rcu(dev);
+ struct rtable *rt = (struct rtable *)hint;
+ struct net *net = dev_net(dev);
+ int err = -EINVAL;
+ u32 tag = 0;
+
+ if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
+ goto martian_source;
+
+ if (ipv4_is_zeronet(saddr))
+ goto martian_source;
+
+ if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
+ goto martian_source;
+
+ if (rt->rt_type != RTN_LOCAL)
+ goto skip_validate_source;
+
+ tos &= IPTOS_RT_MASK;
+ err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag);
+ if (err < 0)
+ goto martian_source;
+
+skip_validate_source:
+ skb_dst_copy(skb, hint);
+ return 0;
+
+martian_source:
+ ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
+ return err;
+}
+
/*
* NOTE. We drop all the packets that has local source
* addresses, because every properly looped back packet
* must have correct destination already attached by output routine.
+ * Changes in the enforced policies must be applied also to
+ * ip_route_use_hint().
*
* Such approach solves two big problems:
* 1. Not simplex devices are handled properly.
@@ -2468,14 +2512,17 @@ struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
int orig_oif = fl4->flowi4_oif;
unsigned int flags = 0;
struct rtable *rth;
- int err = -ENETUNREACH;
+ int err;
if (fl4->saddr) {
- rth = ERR_PTR(-EINVAL);
if (ipv4_is_multicast(fl4->saddr) ||
ipv4_is_lbcast(fl4->saddr) ||
- ipv4_is_zeronet(fl4->saddr))
+ ipv4_is_zeronet(fl4->saddr)) {
+ rth = ERR_PTR(-EINVAL);
goto out;
+ }
+
+ rth = ERR_PTR(-ENETUNREACH);
/* I removed check for oif == dev_out->oif here.
It was wrong for two reasons:
@@ -2643,7 +2690,8 @@ static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
}
static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
- struct sk_buff *skb, u32 mtu)
+ struct sk_buff *skb, u32 mtu,
+ bool confirm_neigh)
{
}
@@ -2694,6 +2742,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
rt->rt_genid = rt_genid_ipv4(net);
rt->rt_flags = ort->rt_flags;
rt->rt_type = ort->rt_type;
+ rt->rt_uses_gateway = ort->rt_uses_gateway;
rt->rt_gw_family = ort->rt_gw_family;
if (rt->rt_gw_family == AF_INET)
rt->rt_gw4 = ort->rt_gw4;
@@ -2728,7 +2777,8 @@ EXPORT_SYMBOL_GPL(ip_route_output_flow);
/* called with rcu_read_lock held */
static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
struct rtable *rt, u32 table_id, struct flowi4 *fl4,
- struct sk_buff *skb, u32 portid, u32 seq)
+ struct sk_buff *skb, u32 portid, u32 seq,
+ unsigned int flags)
{
struct rtmsg *r;
struct nlmsghdr *nlh;
@@ -2736,7 +2786,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
u32 error;
u32 metrics[RTAX_MAX];
- nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
+ nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
if (!nlh)
return -EMSGSIZE;
@@ -2777,21 +2827,23 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
goto nla_put_failure;
}
- if (rt->rt_gw_family == AF_INET &&
- nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
- goto nla_put_failure;
- } else if (rt->rt_gw_family == AF_INET6) {
- int alen = sizeof(struct in6_addr);
- struct nlattr *nla;
- struct rtvia *via;
-
- nla = nla_reserve(skb, RTA_VIA, alen + 2);
- if (!nla)
+ if (rt->rt_uses_gateway) {
+ if (rt->rt_gw_family == AF_INET &&
+ nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
goto nla_put_failure;
-
- via = nla_data(nla);
- via->rtvia_family = AF_INET6;
- memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
+ } else if (rt->rt_gw_family == AF_INET6) {
+ int alen = sizeof(struct in6_addr);
+ struct nlattr *nla;
+ struct rtvia *via;
+
+ nla = nla_reserve(skb, RTA_VIA, alen + 2);
+ if (!nla)
+ goto nla_put_failure;
+
+ via = nla_data(nla);
+ via->rtvia_family = AF_INET6;
+ memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
+ }
}
expires = rt->dst.expires;
@@ -2860,7 +2912,7 @@ nla_put_failure:
static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
struct netlink_callback *cb, u32 table_id,
struct fnhe_hash_bucket *bucket, int genid,
- int *fa_index, int fa_start)
+ int *fa_index, int fa_start, unsigned int flags)
{
int i;
@@ -2891,7 +2943,7 @@ static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
table_id, NULL, skb,
NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq);
+ cb->nlh->nlmsg_seq, flags);
if (err)
return err;
next:
@@ -2904,7 +2956,7 @@ next:
int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
u32 table_id, struct fib_info *fi,
- int *fa_index, int fa_start)
+ int *fa_index, int fa_start, unsigned int flags)
{
struct net *net = sock_net(cb->skb->sk);
int nhsel, genid = fnhe_genid(net);
@@ -2922,7 +2974,8 @@ int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
err = 0;
if (bucket)
err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
- genid, fa_index, fa_start);
+ genid, fa_index, fa_start,
+ flags);
rcu_read_unlock();
if (err)
return err;
@@ -3171,19 +3224,45 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
skb_reset_mac_header(skb);
if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
+ struct fib_rt_info fri;
+
if (!res.fi) {
err = fib_props[res.type].error;
if (!err)
err = -EHOSTUNREACH;
goto errout_rcu;
}
+ fri.fi = res.fi;
+ fri.tb_id = table_id;
+ fri.dst = res.prefix;
+ fri.dst_len = res.prefixlen;
+ fri.tos = fl4.flowi4_tos;
+ fri.type = rt->rt_type;
+ fri.offload = 0;
+ fri.trap = 0;
+ if (res.fa_head) {
+ struct fib_alias *fa;
+
+ hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
+ u8 slen = 32 - fri.dst_len;
+
+ if (fa->fa_slen == slen &&
+ fa->tb_id == fri.tb_id &&
+ fa->fa_tos == fri.tos &&
+ fa->fa_info == res.fi &&
+ fa->fa_type == fri.type) {
+ fri.offload = fa->offload;
+ fri.trap = fa->trap;
+ break;
+ }
+ }
+ }
err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
- nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
- rt->rt_type, res.prefix, res.prefixlen,
- fl4.flowi4_tos, res.fi, 0);
+ nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
} else {
err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
- NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
+ NETLINK_CB(in_skb).portid,
+ nlh->nlmsg_seq, 0);
}
if (err < 0)
goto errout_rcu;
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 535b69326f66..9a4f6b16c9bc 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -62,10 +62,10 @@ static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
* Since subsequent timestamps use the normal tcp_time_stamp value, we
* must make sure that the resulting initial timestamp is <= tcp_time_stamp.
*/
-u64 cookie_init_timestamp(struct request_sock *req)
+u64 cookie_init_timestamp(struct request_sock *req, u64 now)
{
struct inet_request_sock *ireq;
- u32 ts, ts_now = tcp_time_stamp_raw();
+ u32 ts, ts_now = tcp_ns_to_ts(now);
u32 options = 0;
ireq = inet_rsk(req);
@@ -349,6 +349,10 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
treq->snt_synack = 0;
treq->tfo_listener = false;
+
+ if (IS_ENABLED(CONFIG_MPTCP))
+ treq->is_mptcp = 0;
+
if (IS_ENABLED(CONFIG_SMC))
ireq->smc_ok = 0;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 0b980e841927..9684af02e0a5 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -340,6 +340,10 @@ static int proc_tcp_fastopen_key(struct ctl_table *table, int write,
user_key[i * 4 + 1],
user_key[i * 4 + 2],
user_key[i * 4 + 3]);
+
+ if (WARN_ON_ONCE(off >= tbl.maxlen - 1))
+ break;
+
if (i + 1 < n_keys)
off += snprintf(tbl.data + off, tbl.maxlen - off, ",");
}
@@ -820,6 +824,15 @@ static struct ctl_table ipv4_net_table[] = {
.extra2 = &tcp_min_snd_mss_max,
},
{
+ .procname = "tcp_mtu_probe_floor",
+ .data = &init_net.ipv4.sysctl_tcp_mtu_probe_floor,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &tcp_min_snd_mss_min,
+ .extra2 = &tcp_min_snd_mss_max,
+ },
+ {
.procname = "tcp_probe_threshold",
.data = &init_net.ipv4.sysctl_tcp_probe_threshold,
.maxlen = sizeof(int),
@@ -1028,7 +1041,7 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644,
.proc_handler = proc_fib_multipath_hash_policy,
.extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
+ .extra2 = &two,
},
#endif
{
@@ -1180,6 +1193,15 @@ static struct ctl_table ipv4_net_table[] = {
.proc_handler = proc_dointvec,
},
{
+ .procname = "tcp_no_ssthresh_metrics_save",
+ .data = &init_net.ipv4.sysctl_tcp_no_ssthresh_metrics_save,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
.procname = "tcp_moderate_rcvbuf",
.data = &init_net.ipv4.sysctl_tcp_moderate_rcvbuf,
.maxlen = sizeof(int),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 77b485d60b9d..eb2d80519f8e 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -271,6 +271,7 @@
#include <net/icmp.h>
#include <net/inet_common.h>
#include <net/tcp.h>
+#include <net/mptcp.h>
#include <net/xfrm.h>
#include <net/ip.h>
#include <net/sock.h>
@@ -326,7 +327,7 @@ void tcp_enter_memory_pressure(struct sock *sk)
{
unsigned long val;
- if (tcp_memory_pressure)
+ if (READ_ONCE(tcp_memory_pressure))
return;
val = jiffies;
@@ -341,7 +342,7 @@ void tcp_leave_memory_pressure(struct sock *sk)
{
unsigned long val;
- if (!tcp_memory_pressure)
+ if (!READ_ONCE(tcp_memory_pressure))
return;
val = xchg(&tcp_memory_pressure, 0);
if (val)
@@ -443,15 +444,13 @@ void tcp_init_sock(struct sock *sk)
tp->tsoffset = 0;
tp->rack.reo_wnd_steps = 1;
- sk->sk_state = TCP_CLOSE;
-
sk->sk_write_space = sk_stream_write_space;
sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
icsk->icsk_sync_mss = tcp_sync_mss;
- sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[1];
- sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1];
+ WRITE_ONCE(sk->sk_sndbuf, sock_net(sk)->ipv4.sysctl_tcp_wmem[1]);
+ WRITE_ONCE(sk->sk_rcvbuf, sock_net(sk)->ipv4.sysctl_tcp_rmem[1]);
sk_sockets_allocated_inc(sk);
sk->sk_route_forced_caps = NETIF_F_GSO;
@@ -477,7 +476,7 @@ static void tcp_tx_timestamp(struct sock *sk, u16 tsflags)
static inline bool tcp_stream_is_readable(const struct tcp_sock *tp,
int target, struct sock *sk)
{
- return (tp->rcv_nxt - tp->copied_seq >= target) ||
+ return (READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->copied_seq) >= target) ||
(sk->sk_prot->stream_memory_read ?
sk->sk_prot->stream_memory_read(sk) : false);
}
@@ -543,10 +542,10 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
/* Connected or passive Fast Open socket? */
if (state != TCP_SYN_SENT &&
- (state != TCP_SYN_RECV || tp->fastopen_rsk)) {
+ (state != TCP_SYN_RECV || rcu_access_pointer(tp->fastopen_rsk))) {
int target = sock_rcvlowat(sk, 0, INT_MAX);
- if (tp->urg_seq == tp->copied_seq &&
+ if (READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq) &&
!sock_flag(sk, SOCK_URGINLINE) &&
tp->urg_data)
target++;
@@ -584,7 +583,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
}
/* This barrier is coupled with smp_wmb() in tcp_reset() */
smp_rmb();
- if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
+ if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
mask |= EPOLLERR;
return mask;
@@ -607,7 +606,8 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
unlock_sock_fast(sk, slow);
break;
case SIOCATMARK:
- answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
+ answ = tp->urg_data &&
+ READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq);
break;
case SIOCOUTQ:
if (sk->sk_state == TCP_LISTEN)
@@ -616,7 +616,7 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
answ = 0;
else
- answ = tp->write_seq - tp->snd_una;
+ answ = READ_ONCE(tp->write_seq) - tp->snd_una;
break;
case SIOCOUTQNSD:
if (sk->sk_state == TCP_LISTEN)
@@ -625,7 +625,8 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
answ = 0;
else
- answ = tp->write_seq - tp->snd_nxt;
+ answ = READ_ONCE(tp->write_seq) -
+ READ_ONCE(tp->snd_nxt);
break;
default:
return -ENOIOCTLCMD;
@@ -657,7 +658,7 @@ static void skb_entail(struct sock *sk, struct sk_buff *skb)
tcb->sacked = 0;
__skb_header_release(skb);
tcp_add_write_queue_tail(sk, skb);
- sk->sk_wmem_queued += skb->truesize;
+ sk_wmem_queued_add(sk, skb->truesize);
sk_mem_charge(sk, skb->truesize);
if (tp->nonagle & TCP_NAGLE_PUSH)
tp->nonagle &= ~TCP_NAGLE_PUSH;
@@ -690,8 +691,8 @@ static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb,
refcount_read(&sk->sk_wmem_alloc) > skb->truesize;
}
-static void tcp_push(struct sock *sk, int flags, int mss_now,
- int nonagle, int size_goal)
+void tcp_push(struct sock *sk, int flags, int mss_now,
+ int nonagle, int size_goal)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
@@ -925,7 +926,7 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
return max(size_goal, mss_now);
}
-static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
+int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
{
int mss_now;
@@ -935,6 +936,22 @@ static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
return mss_now;
}
+/* In some cases, both sendpage() and sendmsg() could have added
+ * an skb to the write queue, but failed adding payload on it.
+ * We need to remove it to consume less memory, but more
+ * importantly be able to generate EPOLLOUT for Edge Trigger epoll()
+ * users.
+ */
+static void tcp_remove_empty_skb(struct sock *sk, struct sk_buff *skb)
+{
+ if (skb && !skb->len) {
+ tcp_unlink_write_queue(skb, sk);
+ if (tcp_write_queue_empty(sk))
+ tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
+ sk_wmem_free_skb(sk, skb);
+ }
+}
+
ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
size_t size, int flags)
{
@@ -1016,10 +1033,10 @@ new_segment:
skb->len += copy;
skb->data_len += copy;
skb->truesize += copy;
- sk->sk_wmem_queued += copy;
+ sk_wmem_queued_add(sk, copy);
sk_mem_charge(sk, copy);
skb->ip_summed = CHECKSUM_PARTIAL;
- tp->write_seq += copy;
+ WRITE_ONCE(tp->write_seq, tp->write_seq + copy);
TCP_SKB_CB(skb)->end_seq += copy;
tcp_skb_pcount_set(skb, 0);
@@ -1064,12 +1081,12 @@ out:
return copied;
do_error:
+ tcp_remove_empty_skb(sk, tcp_write_queue_tail(sk));
if (copied)
goto out;
out_err:
/* make sure we wake any epoll edge trigger waiter */
- if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 &&
- err == -EAGAIN)) {
+ if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) {
sk->sk_write_space(sk);
tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
}
@@ -1165,7 +1182,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
struct sockcm_cookie sockc;
int flags, err, copied = 0;
int mss_now = 0, size_goal, copied_syn = 0;
- bool process_backlog = false;
+ int process_backlog = 0;
bool zc = false;
long timeo;
@@ -1257,9 +1274,10 @@ new_segment:
if (!sk_stream_memory_free(sk))
goto wait_for_sndbuf;
- if (process_backlog && sk_flush_backlog(sk)) {
- process_backlog = false;
- goto restart;
+ if (unlikely(process_backlog >= 16)) {
+ process_backlog = 0;
+ if (sk_flush_backlog(sk))
+ goto restart;
}
first_skb = tcp_rtx_and_write_queues_empty(sk);
skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
@@ -1267,7 +1285,7 @@ new_segment:
if (!skb)
goto wait_for_memory;
- process_backlog = true;
+ process_backlog++;
skb->ip_summed = CHECKSUM_PARTIAL;
skb_entail(sk, skb);
@@ -1344,7 +1362,7 @@ new_segment:
if (!copied)
TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
- tp->write_seq += copy;
+ WRITE_ONCE(tp->write_seq, tp->write_seq + copy);
TCP_SKB_CB(skb)->end_seq += copy;
tcp_skb_pcount_set(skb, 0);
@@ -1388,26 +1406,18 @@ out_nopush:
sock_zerocopy_put(uarg);
return copied + copied_syn;
+do_error:
+ skb = tcp_write_queue_tail(sk);
do_fault:
- if (!skb->len) {
- tcp_unlink_write_queue(skb, sk);
- /* It is the one place in all of TCP, except connection
- * reset, where we can be unlinking the send_head.
- */
- if (tcp_write_queue_empty(sk))
- tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
- sk_wmem_free_skb(sk, skb);
- }
+ tcp_remove_empty_skb(sk, skb);
-do_error:
if (copied + copied_syn)
goto out;
out_err:
sock_zerocopy_put_abort(uarg, true);
err = sk_stream_error(sk, flags, err);
/* make sure we wake any epoll edge trigger waiter */
- if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 &&
- err == -EAGAIN)) {
+ if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) {
sk->sk_write_space(sk);
tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
}
@@ -1657,9 +1667,9 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
sk_eat_skb(sk, skb);
if (!desc->count)
break;
- tp->copied_seq = seq;
+ WRITE_ONCE(tp->copied_seq, seq);
}
- tp->copied_seq = seq;
+ WRITE_ONCE(tp->copied_seq, seq);
tcp_rcv_space_adjust(sk);
@@ -1688,7 +1698,7 @@ int tcp_set_rcvlowat(struct sock *sk, int val)
else
cap = sock_net(sk)->ipv4.sysctl_tcp_rmem[2] >> 1;
val = min(val, cap);
- sk->sk_rcvlowat = val ? : 1;
+ WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
/* Check if we need to signal EPOLLIN right now */
tcp_data_ready(sk);
@@ -1698,7 +1708,7 @@ int tcp_set_rcvlowat(struct sock *sk, int val)
val <<= 1;
if (val > sk->sk_rcvbuf) {
- sk->sk_rcvbuf = val;
+ WRITE_ONCE(sk->sk_rcvbuf, val);
tcp_sk(sk)->window_clamp = tcp_win_from_space(sk, val);
}
return 0;
@@ -1728,8 +1738,8 @@ static int tcp_zerocopy_receive(struct sock *sk,
struct tcp_zerocopy_receive *zc)
{
unsigned long address = (unsigned long)zc->address;
+ u32 length = 0, seq, offset, zap_len;
const skb_frag_t *frags = NULL;
- u32 length = 0, seq, offset;
struct vm_area_struct *vma;
struct sk_buff *skb = NULL;
struct tcp_sock *tp;
@@ -1756,17 +1766,19 @@ static int tcp_zerocopy_receive(struct sock *sk,
seq = tp->copied_seq;
inq = tcp_inq(sk);
zc->length = min_t(u32, zc->length, inq);
- zc->length &= ~(PAGE_SIZE - 1);
- if (zc->length) {
- zap_page_range(vma, address, zc->length);
+ zap_len = zc->length & ~(PAGE_SIZE - 1);
+ if (zap_len) {
+ zap_page_range(vma, address, zap_len);
zc->recv_skip_hint = 0;
} else {
- zc->recv_skip_hint = inq;
+ zc->recv_skip_hint = zc->length;
}
ret = 0;
while (length + PAGE_SIZE <= zc->length) {
if (zc->recv_skip_hint < PAGE_SIZE) {
if (skb) {
+ if (zc->recv_skip_hint > 0)
+ break;
skb = skb->next;
offset = seq - TCP_SKB_CB(skb)->seq;
} else {
@@ -1779,18 +1791,18 @@ static int tcp_zerocopy_receive(struct sock *sk,
break;
frags = skb_shinfo(skb)->frags;
while (offset) {
- if (frags->size > offset)
+ if (skb_frag_size(frags) > offset)
goto out;
- offset -= frags->size;
+ offset -= skb_frag_size(frags);
frags++;
}
}
- if (frags->size != PAGE_SIZE || frags->page_offset) {
+ if (skb_frag_size(frags) != PAGE_SIZE || skb_frag_off(frags)) {
int remaining = zc->recv_skip_hint;
- while (remaining && (frags->size != PAGE_SIZE ||
- frags->page_offset)) {
- remaining -= frags->size;
+ while (remaining && (skb_frag_size(frags) != PAGE_SIZE ||
+ skb_frag_off(frags))) {
+ remaining -= skb_frag_size(frags);
frags++;
}
zc->recv_skip_hint -= remaining;
@@ -1808,7 +1820,7 @@ static int tcp_zerocopy_receive(struct sock *sk,
out:
up_read(&current->mm->mmap_sem);
if (length) {
- tp->copied_seq = seq;
+ WRITE_ONCE(tp->copied_seq, seq);
tcp_rcv_space_adjust(sk);
/* Clean up data we have read: This will do ACK frames. */
@@ -1851,29 +1863,33 @@ static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
if (sock_flag(sk, SOCK_RCVTSTAMP)) {
if (sock_flag(sk, SOCK_RCVTSTAMPNS)) {
if (new_tstamp) {
- struct __kernel_timespec kts = {tss->ts[0].tv_sec, tss->ts[0].tv_nsec};
-
+ struct __kernel_timespec kts = {
+ .tv_sec = tss->ts[0].tv_sec,
+ .tv_nsec = tss->ts[0].tv_nsec,
+ };
put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_NEW,
sizeof(kts), &kts);
} else {
- struct timespec ts_old = timespec64_to_timespec(tss->ts[0]);
-
+ struct __kernel_old_timespec ts_old = {
+ .tv_sec = tss->ts[0].tv_sec,
+ .tv_nsec = tss->ts[0].tv_nsec,
+ };
put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD,
sizeof(ts_old), &ts_old);
}
} else {
if (new_tstamp) {
- struct __kernel_sock_timeval stv;
-
- stv.tv_sec = tss->ts[0].tv_sec;
- stv.tv_usec = tss->ts[0].tv_nsec / 1000;
+ struct __kernel_sock_timeval stv = {
+ .tv_sec = tss->ts[0].tv_sec,
+ .tv_usec = tss->ts[0].tv_nsec / 1000,
+ };
put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW,
sizeof(stv), &stv);
} else {
- struct __kernel_old_timeval tv;
-
- tv.tv_sec = tss->ts[0].tv_sec;
- tv.tv_usec = tss->ts[0].tv_nsec / 1000;
+ struct __kernel_old_timeval tv = {
+ .tv_sec = tss->ts[0].tv_sec,
+ .tv_usec = tss->ts[0].tv_nsec / 1000,
+ };
put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD,
sizeof(tv), &tv);
}
@@ -1945,13 +1961,12 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
struct sk_buff *skb, *last;
u32 urg_hole = 0;
struct scm_timestamping_internal tss;
- bool has_tss = false;
- bool has_cmsg;
+ int cmsg_flags;
if (unlikely(flags & MSG_ERRQUEUE))
return inet_recv_error(sk, msg, len, addr_len);
- if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue) &&
+ if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue) &&
(sk->sk_state == TCP_ESTABLISHED))
sk_busy_loop(sk, nonblock);
@@ -1961,7 +1976,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
if (sk->sk_state == TCP_LISTEN)
goto out;
- has_cmsg = tp->recvmsg_inq;
+ cmsg_flags = tp->recvmsg_inq ? 1 : 0;
timeo = sock_rcvtimeo(sk, nonblock);
/* Urgent data needs to be handled specially. */
@@ -2034,7 +2049,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
/* Well, if we have backlog, try to process it now yet. */
- if (copied >= target && !sk->sk_backlog.tail)
+ if (copied >= target && !READ_ONCE(sk->sk_backlog.tail))
break;
if (copied) {
@@ -2106,7 +2121,7 @@ found_ok_skb:
if (urg_offset < used) {
if (!urg_offset) {
if (!sock_flag(sk, SOCK_URGINLINE)) {
- ++*seq;
+ WRITE_ONCE(*seq, *seq + 1);
urg_hole++;
offset++;
used--;
@@ -2128,7 +2143,7 @@ found_ok_skb:
}
}
- *seq += used;
+ WRITE_ONCE(*seq, *seq + used);
copied += used;
len -= used;
@@ -2144,8 +2159,7 @@ skip_copy:
if (TCP_SKB_CB(skb)->has_rxtstamp) {
tcp_update_recv_tstamps(skb, &tss);
- has_tss = true;
- has_cmsg = true;
+ cmsg_flags |= 2;
}
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
goto found_fin_ok;
@@ -2155,7 +2169,7 @@ skip_copy:
found_fin_ok:
/* Process the FIN. */
- ++*seq;
+ WRITE_ONCE(*seq, *seq + 1);
if (!(flags & MSG_PEEK))
sk_eat_skb(sk, skb);
break;
@@ -2170,10 +2184,10 @@ found_fin_ok:
release_sock(sk);
- if (has_cmsg) {
- if (has_tss)
+ if (cmsg_flags) {
+ if (cmsg_flags & 2)
tcp_recv_timestamp(msg, sk, &tss);
- if (tp->recvmsg_inq) {
+ if (cmsg_flags & 1) {
inq = tcp_inq_hint(sk);
put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq);
}
@@ -2476,7 +2490,10 @@ adjudge_to_death:
}
if (sk->sk_state == TCP_CLOSE) {
- struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
+ struct request_sock *req;
+
+ req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk,
+ lockdep_sock_is_held(sk));
/* We could get here with a non-NULL req if the socket is
* aborted (e.g., closed with unread data) before 3WHS
* finishes.
@@ -2508,6 +2525,7 @@ static void tcp_rtx_queue_purge(struct sock *sk)
{
struct rb_node *p = rb_first(&sk->tcp_rtx_queue);
+ tcp_sk(sk)->highest_sack = NULL;
while (p) {
struct sk_buff *skb = rb_to_skb(p);
@@ -2548,6 +2566,7 @@ int tcp_disconnect(struct sock *sk, int flags)
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
int old_state = sk->sk_state;
+ u32 seq;
if (old_state != TCP_CLOSE)
tcp_set_state(sk, TCP_CLOSE);
@@ -2574,7 +2593,7 @@ int tcp_disconnect(struct sock *sk, int flags)
__kfree_skb(sk->sk_rx_skb_cache);
sk->sk_rx_skb_cache = NULL;
}
- tp->copied_seq = tp->rcv_nxt;
+ WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
tp->urg_data = 0;
tcp_write_queue_purge(sk);
tcp_fastopen_active_disable_ofo_check(sk);
@@ -2590,21 +2609,25 @@ int tcp_disconnect(struct sock *sk, int flags)
tp->srtt_us = 0;
tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
tp->rcv_rtt_last_tsecr = 0;
- tp->write_seq += tp->max_window + 2;
- if (tp->write_seq == 0)
- tp->write_seq = 1;
+
+ seq = tp->write_seq + tp->max_window + 2;
+ if (!seq)
+ seq = 1;
+ WRITE_ONCE(tp->write_seq, seq);
+
icsk->icsk_backoff = 0;
- tp->snd_cwnd = 2;
icsk->icsk_probes_out = 0;
icsk->icsk_rto = TCP_TIMEOUT_INIT;
tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
tp->snd_cwnd = TCP_INIT_CWND;
tp->snd_cwnd_cnt = 0;
tp->window_clamp = 0;
+ tp->delivered = 0;
tp->delivered_ce = 0;
tcp_set_ca_state(sk, TCP_CA_Open);
tp->is_sack_reneg = 0;
tcp_clear_retrans(tp);
+ tp->total_retrans = 0;
inet_csk_delack_init(sk);
/* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0
* issue in __tcp_select_window()
@@ -2616,10 +2639,14 @@ int tcp_disconnect(struct sock *sk, int flags)
sk->sk_rx_dst = NULL;
tcp_saved_syn_free(tp);
tp->compressed_ack = 0;
+ tp->segs_in = 0;
+ tp->segs_out = 0;
tp->bytes_sent = 0;
tp->bytes_acked = 0;
tp->bytes_received = 0;
tp->bytes_retrans = 0;
+ tp->data_segs_in = 0;
+ tp->data_segs_out = 0;
tp->duplicate_sack[0].start_seq = 0;
tp->duplicate_sack[0].end_seq = 0;
tp->dsack_dups = 0;
@@ -2640,11 +2667,13 @@ int tcp_disconnect(struct sock *sk, int flags)
tp->rx_opt.saw_tstamp = 0;
tp->rx_opt.dsack = 0;
tp->rx_opt.num_sacks = 0;
+ tp->rcv_ooopack = 0;
/* Clean up fastopen related fields */
tcp_free_fastopen_req(tp);
inet->defer_connect = 0;
+ tp->fastopen_client_fail = 0;
WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
@@ -2918,9 +2947,9 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
if (sk->sk_state != TCP_CLOSE)
err = -EPERM;
else if (tp->repair_queue == TCP_SEND_QUEUE)
- tp->write_seq = val;
+ WRITE_ONCE(tp->write_seq, val);
else if (tp->repair_queue == TCP_RECV_QUEUE)
- tp->rcv_nxt = val;
+ WRITE_ONCE(tp->rcv_nxt, val);
else
err = -EINVAL;
break;
@@ -3203,8 +3232,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
* tcpi_unacked -> Number of children ready for accept()
* tcpi_sacked -> max backlog
*/
- info->tcpi_unacked = sk->sk_ack_backlog;
- info->tcpi_sacked = sk->sk_max_ack_backlog;
+ info->tcpi_unacked = READ_ONCE(sk->sk_ack_backlog);
+ info->tcpi_sacked = READ_ONCE(sk->sk_max_ack_backlog);
return;
}
@@ -3282,6 +3311,9 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_bytes_retrans = tp->bytes_retrans;
info->tcpi_dsack_dups = tp->dsack_dups;
info->tcpi_reord_seen = tp->reord_seen;
+ info->tcpi_rcv_ooopack = tp->rcv_ooopack;
+ info->tcpi_snd_wnd = tp->snd_wnd;
+ info->tcpi_fastopen_client_fail = tp->fastopen_client_fail;
unlock_sock_fast(sk, slow);
}
EXPORT_SYMBOL_GPL(tcp_get_info);
@@ -3311,6 +3343,7 @@ static size_t tcp_opt_stats_get_size(void)
nla_total_size(sizeof(u32)) + /* TCP_NLA_DSACK_DUPS */
nla_total_size(sizeof(u32)) + /* TCP_NLA_REORD_SEEN */
nla_total_size(sizeof(u32)) + /* TCP_NLA_SRTT */
+ nla_total_size(sizeof(u16)) + /* TCP_NLA_TIMEOUT_REHASH */
0;
}
@@ -3365,6 +3398,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups);
nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen);
nla_put_u32(stats, TCP_NLA_SRTT, tp->srtt_us >> 3);
+ nla_put_u16(stats, TCP_NLA_TIMEOUT_REHASH, tp->timeout_rehash);
return stats;
}
@@ -3784,8 +3818,8 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
return 1;
for (i = 0; i < shi->nr_frags; ++i) {
- const struct skb_frag_struct *f = &shi->frags[i];
- unsigned int offset = f->page_offset;
+ const skb_frag_t *f = &shi->frags[i];
+ unsigned int offset = skb_frag_off(f);
struct page *page = skb_frag_page(f) + (offset >> PAGE_SHIFT);
sg_set_page(&sg, page, skb_frag_size(f),
@@ -3817,7 +3851,13 @@ EXPORT_SYMBOL(tcp_md5_hash_key);
void tcp_done(struct sock *sk)
{
- struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
+ struct request_sock *req;
+
+ /* We might be called with a new socket, after
+ * inet_csk_prepare_forced_close() has been called
+ * so we can not use lockdep_sock_is_held(sk)
+ */
+ req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, 1);
if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
@@ -3916,7 +3956,7 @@ void __init tcp_init(void)
BUILD_BUG_ON(TCP_MIN_SND_MSS <= MAX_TCP_OPTION_SPACE);
BUILD_BUG_ON(sizeof(struct tcp_skb_cb) >
- FIELD_SIZEOF(struct sk_buff, cb));
+ sizeof_field(struct sk_buff, cb));
percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
@@ -3990,4 +4030,5 @@ void __init tcp_init(void)
tcp_metrics_init();
BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
tcp_tasklet_init();
+ mptcp_init();
}
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index 56be7d27f208..6c4d79baff26 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -306,7 +306,8 @@ static u32 bbr_tso_segs_goal(struct sock *sk)
/* Sort of tcp_tso_autosize() but ignoring
* driver provided sk_gso_max_size.
*/
- bytes = min_t(unsigned long, sk->sk_pacing_rate >> sk->sk_pacing_shift,
+ bytes = min_t(unsigned long,
+ sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift),
GSO_MAX_SIZE - 1 - MAX_TCP_HEADER);
segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk));
@@ -346,7 +347,7 @@ static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event)
/* Calculate bdp based on min RTT and the estimated bottleneck bandwidth:
*
- * bdp = bw * min_rtt * gain
+ * bdp = ceil(bw * min_rtt * gain)
*
* The key factor, gain, controls the amount of queue. While a small gain
* builds a smaller queue, it becomes more vulnerable to noise in RTT
@@ -370,7 +371,9 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain)
w = (u64)bw * bbr->min_rtt_us;
- /* Apply a gain to the given value, then remove the BW_SCALE shift. */
+ /* Apply a gain to the given value, remove the BW_SCALE shift, and
+ * round the value up to avoid a negative feedback loop.
+ */
bdp = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT;
return bdp;
@@ -386,7 +389,7 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain)
* which allows 2 outstanding 2-packet sequences, to try to keep pipe
* full even with ACK-every-other-packet delayed ACKs.
*/
-static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd, int gain)
+static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd)
{
struct bbr *bbr = inet_csk_ca(sk);
@@ -397,7 +400,7 @@ static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd, int gain)
cwnd = (cwnd + 1) & ~1U;
/* Ensure gain cycling gets inflight above BDP even for small BDPs. */
- if (bbr->mode == BBR_PROBE_BW && gain > BBR_UNIT)
+ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == 0)
cwnd += 2;
return cwnd;
@@ -409,7 +412,7 @@ static u32 bbr_inflight(struct sock *sk, u32 bw, int gain)
u32 inflight;
inflight = bbr_bdp(sk, bw, gain);
- inflight = bbr_quantization_budget(sk, inflight, gain);
+ inflight = bbr_quantization_budget(sk, inflight);
return inflight;
}
@@ -529,7 +532,7 @@ static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs,
* due to aggregation (of data and/or ACKs) visible in the ACK stream.
*/
target_cwnd += bbr_ack_aggregation_cwnd(sk);
- target_cwnd = bbr_quantization_budget(sk, target_cwnd, gain);
+ target_cwnd = bbr_quantization_budget(sk, target_cwnd);
/* If we're below target cwnd, slow start cwnd toward target cwnd. */
if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */
@@ -776,8 +779,7 @@ static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs)
* bandwidth sample. Delivered is in packets and interval_us in uS and
* ratio will be <<1 for most connections. So delivered is first scaled.
*/
- bw = (u64)rs->delivered * BW_UNIT;
- do_div(bw, rs->interval_us);
+ bw = div64_long((u64)rs->delivered * BW_UNIT, rs->interval_us);
/* If this sample is application-limited, it is likely to have a very
* low delivered count that represents application behavior rather than
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index 8a56e09cfb0e..8a01428f80c1 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -121,14 +121,14 @@ int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
struct sk_psock *psock;
int copied, ret;
- if (unlikely(flags & MSG_ERRQUEUE))
- return inet_recv_error(sk, msg, len, addr_len);
- if (!skb_queue_empty(&sk->sk_receive_queue))
- return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
-
psock = sk_psock_get(sk);
if (unlikely(!psock))
return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+ if (unlikely(flags & MSG_ERRQUEUE))
+ return inet_recv_error(sk, msg, len, addr_len);
+ if (!skb_queue_empty(&sk->sk_receive_queue) &&
+ sk_psock_queue_empty(psock))
+ return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
lock_sock(sk);
msg_bytes_ready:
copied = __tcp_bpf_recvmsg(sk, psock, msg, len, flags);
@@ -139,7 +139,7 @@ msg_bytes_ready:
timeo = sock_rcvtimeo(sk, nonblock);
data = tcp_bpf_wait_data(sk, psock, flags, timeo, &err);
if (data) {
- if (skb_queue_empty(&sk->sk_receive_queue))
+ if (!sk_psock_queue_empty(psock))
goto msg_bytes_ready;
release_sock(sk);
sk_psock_put(sk, psock);
@@ -301,7 +301,7 @@ EXPORT_SYMBOL_GPL(tcp_bpf_sendmsg_redir);
static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock,
struct sk_msg *msg, int *copied, int flags)
{
- bool cork = false, enospc = msg->sg.start == msg->sg.end;
+ bool cork = false, enospc = sk_msg_full(msg);
struct sock *sk_redir;
u32 tosend, delta = 0;
int ret;
@@ -315,10 +315,7 @@ more_data:
*/
delta = msg->sg.size;
psock->eval = sk_psock_msg_verdict(sk, psock, msg);
- if (msg->sg.size < delta)
- delta -= msg->sg.size;
- else
- delta = 0;
+ delta -= msg->sg.size;
}
if (msg->cork_bytes &&
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index c445a81d144e..3172e31987be 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -21,7 +21,7 @@ static DEFINE_SPINLOCK(tcp_cong_list_lock);
static LIST_HEAD(tcp_cong_list);
/* Simple linear search, don't expect many entries! */
-static struct tcp_congestion_ops *tcp_ca_find(const char *name)
+struct tcp_congestion_ops *tcp_ca_find(const char *name)
{
struct tcp_congestion_ops *e;
@@ -162,7 +162,7 @@ void tcp_assign_congestion_control(struct sock *sk)
rcu_read_lock();
ca = rcu_dereference(net->ipv4.tcp_congestion_control);
- if (unlikely(!try_module_get(ca->owner)))
+ if (unlikely(!bpf_try_module_get(ca, ca->owner)))
ca = &tcp_reno;
icsk->icsk_ca_ops = ca;
rcu_read_unlock();
@@ -208,7 +208,7 @@ void tcp_cleanup_congestion_control(struct sock *sk)
if (icsk->icsk_ca_ops->release)
icsk->icsk_ca_ops->release(sk);
- module_put(icsk->icsk_ca_ops->owner);
+ bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner);
}
/* Used by sysctl to change default congestion control */
@@ -222,12 +222,12 @@ int tcp_set_default_congestion_control(struct net *net, const char *name)
ca = tcp_ca_find_autoload(net, name);
if (!ca) {
ret = -ENOENT;
- } else if (!try_module_get(ca->owner)) {
+ } else if (!bpf_try_module_get(ca, ca->owner)) {
ret = -EBUSY;
} else {
prev = xchg(&net->ipv4.tcp_congestion_control, ca);
if (prev)
- module_put(prev->owner);
+ bpf_module_put(prev, prev->owner);
ca->flags |= TCP_CONG_NON_RESTRICTED;
ret = 0;
@@ -256,6 +256,9 @@ void tcp_get_available_congestion_control(char *buf, size_t maxlen)
offs += snprintf(buf + offs, maxlen - offs,
"%s%s",
offs == 0 ? "" : " ", ca->name);
+
+ if (WARN_ON_ONCE(offs >= maxlen))
+ break;
}
rcu_read_unlock();
}
@@ -285,6 +288,9 @@ void tcp_get_allowed_congestion_control(char *buf, size_t maxlen)
offs += snprintf(buf + offs, maxlen - offs,
"%s%s",
offs == 0 ? "" : " ", ca->name);
+
+ if (WARN_ON_ONCE(offs >= maxlen))
+ break;
}
rcu_read_unlock();
}
@@ -360,19 +366,19 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load,
} else if (!load) {
const struct tcp_congestion_ops *old_ca = icsk->icsk_ca_ops;
- if (try_module_get(ca->owner)) {
+ if (bpf_try_module_get(ca, ca->owner)) {
if (reinit) {
tcp_reinit_congestion_control(sk, ca);
} else {
icsk->icsk_ca_ops = ca;
- module_put(old_ca->owner);
+ bpf_module_put(old_ca, old_ca->owner);
}
} else {
err = -EBUSY;
}
} else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || cap_net_admin)) {
err = -EPERM;
- } else if (!try_module_get(ca->owner)) {
+ } else if (!bpf_try_module_get(ca, ca->owner)) {
err = -EBUSY;
} else {
tcp_reinit_congestion_control(sk, ca);
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 1b3d032a4df2..8f8eefd3a3ce 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -40,8 +40,8 @@
/* Number of delay samples for detecting the increase of delay */
#define HYSTART_MIN_SAMPLES 8
-#define HYSTART_DELAY_MIN (4U<<3)
-#define HYSTART_DELAY_MAX (16U<<3)
+#define HYSTART_DELAY_MIN (4000U) /* 4 ms */
+#define HYSTART_DELAY_MAX (16000U) /* 16 ms */
#define HYSTART_DELAY_THRESH(x) clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX)
static int fast_convergence __read_mostly = 1;
@@ -53,7 +53,7 @@ static int tcp_friendliness __read_mostly = 1;
static int hystart __read_mostly = 1;
static int hystart_detect __read_mostly = HYSTART_ACK_TRAIN | HYSTART_DELAY;
static int hystart_low_window __read_mostly = 16;
-static int hystart_ack_delta __read_mostly = 2;
+static int hystart_ack_delta_us __read_mostly = 2000;
static u32 cube_rtt_scale __read_mostly;
static u32 beta_scale __read_mostly;
@@ -77,8 +77,8 @@ MODULE_PARM_DESC(hystart_detect, "hybrid slow start detection mechanisms"
" 1: packet-train 2: delay 3: both packet-train and delay");
module_param(hystart_low_window, int, 0644);
MODULE_PARM_DESC(hystart_low_window, "lower bound cwnd for hybrid slow start");
-module_param(hystart_ack_delta, int, 0644);
-MODULE_PARM_DESC(hystart_ack_delta, "spacing between ack's indicating train (msecs)");
+module_param(hystart_ack_delta_us, int, 0644);
+MODULE_PARM_DESC(hystart_ack_delta_us, "spacing between ack's indicating train (usecs)");
/* BIC TCP Parameters */
struct bictcp {
@@ -89,7 +89,7 @@ struct bictcp {
u32 bic_origin_point;/* origin point of bic function */
u32 bic_K; /* time to origin point
from the beginning of the current epoch */
- u32 delay_min; /* min delay (msec << 3) */
+ u32 delay_min; /* min delay (usec) */
u32 epoch_start; /* beginning of an epoch */
u32 ack_cnt; /* number of acks */
u32 tcp_cwnd; /* estimated tcp cwnd */
@@ -117,13 +117,9 @@ static inline void bictcp_reset(struct bictcp *ca)
ca->found = 0;
}
-static inline u32 bictcp_clock(void)
+static inline u32 bictcp_clock_us(const struct sock *sk)
{
-#if HZ < 1000
- return ktime_to_ms(ktime_get_real());
-#else
- return jiffies_to_msecs(jiffies);
-#endif
+ return tcp_sk(sk)->tcp_mstamp;
}
static inline void bictcp_hystart_reset(struct sock *sk)
@@ -131,9 +127,9 @@ static inline void bictcp_hystart_reset(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
struct bictcp *ca = inet_csk_ca(sk);
- ca->round_start = ca->last_ack = bictcp_clock();
+ ca->round_start = ca->last_ack = bictcp_clock_us(sk);
ca->end_seq = tp->snd_nxt;
- ca->curr_rtt = 0;
+ ca->curr_rtt = ~0U;
ca->sample_cnt = 0;
}
@@ -276,7 +272,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd, u32 acked)
*/
t = (s32)(tcp_jiffies32 - ca->epoch_start);
- t += msecs_to_jiffies(ca->delay_min >> 3);
+ t += usecs_to_jiffies(ca->delay_min);
/* change the unit from HZ to bictcp_HZ */
t <<= BICTCP_HZ;
do_div(t, HZ);
@@ -376,22 +372,54 @@ static void bictcp_state(struct sock *sk, u8 new_state)
}
}
+/* Account for TSO/GRO delays.
+ * Otherwise short RTT flows could get too small ssthresh, since during
+ * slow start we begin with small TSO packets and ca->delay_min would
+ * not account for long aggregation delay when TSO packets get bigger.
+ * Ideally even with a very small RTT we would like to have at least one
+ * TSO packet being sent and received by GRO, and another one in qdisc layer.
+ * We apply another 100% factor because @rate is doubled at this point.
+ * We cap the cushion to 1ms.
+ */
+static u32 hystart_ack_delay(struct sock *sk)
+{
+ unsigned long rate;
+
+ rate = READ_ONCE(sk->sk_pacing_rate);
+ if (!rate)
+ return 0;
+ return min_t(u64, USEC_PER_MSEC,
+ div64_ul((u64)GSO_MAX_SIZE * 4 * USEC_PER_SEC, rate));
+}
+
static void hystart_update(struct sock *sk, u32 delay)
{
struct tcp_sock *tp = tcp_sk(sk);
struct bictcp *ca = inet_csk_ca(sk);
-
- if (ca->found & hystart_detect)
- return;
+ u32 threshold;
if (hystart_detect & HYSTART_ACK_TRAIN) {
- u32 now = bictcp_clock();
+ u32 now = bictcp_clock_us(sk);
/* first detection parameter - ack-train detection */
- if ((s32)(now - ca->last_ack) <= hystart_ack_delta) {
+ if ((s32)(now - ca->last_ack) <= hystart_ack_delta_us) {
ca->last_ack = now;
- if ((s32)(now - ca->round_start) > ca->delay_min >> 4) {
- ca->found |= HYSTART_ACK_TRAIN;
+
+ threshold = ca->delay_min + hystart_ack_delay(sk);
+
+ /* Hystart ack train triggers if we get ack past
+ * ca->delay_min/2.
+ * Pacing might have delayed packets up to RTT/2
+ * during slow start.
+ */
+ if (sk->sk_pacing_status == SK_PACING_NONE)
+ threshold >>= 1;
+
+ if ((s32)(now - ca->round_start) > threshold) {
+ ca->found = 1;
+ pr_debug("hystart_ack_train (%u > %u) delay_min %u (+ ack_delay %u) cwnd %u\n",
+ now - ca->round_start, threshold,
+ ca->delay_min, hystart_ack_delay(sk), tp->snd_cwnd);
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPHYSTARTTRAINDETECT);
NET_ADD_STATS(sock_net(sk),
@@ -405,14 +433,14 @@ static void hystart_update(struct sock *sk, u32 delay)
if (hystart_detect & HYSTART_DELAY) {
/* obtain the minimum delay of more than sampling packets */
if (ca->sample_cnt < HYSTART_MIN_SAMPLES) {
- if (ca->curr_rtt == 0 || ca->curr_rtt > delay)
+ if (ca->curr_rtt > delay)
ca->curr_rtt = delay;
ca->sample_cnt++;
} else {
if (ca->curr_rtt > ca->delay_min +
HYSTART_DELAY_THRESH(ca->delay_min >> 3)) {
- ca->found |= HYSTART_DELAY;
+ ca->found = 1;
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPHYSTARTDELAYDETECT);
NET_ADD_STATS(sock_net(sk),
@@ -424,9 +452,6 @@ static void hystart_update(struct sock *sk, u32 delay)
}
}
-/* Track delayed acknowledgment ratio using sliding window
- * ratio = (15*ratio + sample) / 16
- */
static void bictcp_acked(struct sock *sk, const struct ack_sample *sample)
{
const struct tcp_sock *tp = tcp_sk(sk);
@@ -441,7 +466,7 @@ static void bictcp_acked(struct sock *sk, const struct ack_sample *sample)
if (ca->epoch_start && (s32)(tcp_jiffies32 - ca->epoch_start) < HZ)
return;
- delay = (sample->rtt_us << 3) / USEC_PER_MSEC;
+ delay = sample->rtt_us;
if (delay == 0)
delay = 1;
@@ -450,7 +475,7 @@ static void bictcp_acked(struct sock *sk, const struct ack_sample *sample)
ca->delay_min = delay;
/* hystart triggers when cwnd is larger than some threshold */
- if (hystart && tcp_in_slow_start(tp) &&
+ if (!ca->found && tcp_in_slow_start(tp) && hystart &&
tp->snd_cwnd >= hystart_low_window)
hystart_update(sk, delay);
}
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index a3a386236d93..0d08f9e2d8d0 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -21,13 +21,14 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
struct tcp_info *info = _info;
if (inet_sk_state_load(sk) == TCP_LISTEN) {
- r->idiag_rqueue = sk->sk_ack_backlog;
- r->idiag_wqueue = sk->sk_max_ack_backlog;
+ r->idiag_rqueue = READ_ONCE(sk->sk_ack_backlog);
+ r->idiag_wqueue = READ_ONCE(sk->sk_max_ack_backlog);
} else if (sk->sk_type == SOCK_STREAM) {
const struct tcp_sock *tp = tcp_sk(sk);
- r->idiag_rqueue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
- r->idiag_wqueue = tp->write_seq - tp->snd_una;
+ r->idiag_rqueue = max_t(int, READ_ONCE(tp->rcv_nxt) -
+ READ_ONCE(tp->copied_seq), 0);
+ r->idiag_wqueue = READ_ONCE(tp->write_seq) - tp->snd_una;
}
if (info)
tcp_get_info(sk, info);
@@ -81,13 +82,42 @@ static int tcp_diag_put_md5sig(struct sk_buff *skb,
}
#endif
+static int tcp_diag_put_ulp(struct sk_buff *skb, struct sock *sk,
+ const struct tcp_ulp_ops *ulp_ops)
+{
+ struct nlattr *nest;
+ int err;
+
+ nest = nla_nest_start_noflag(skb, INET_DIAG_ULP_INFO);
+ if (!nest)
+ return -EMSGSIZE;
+
+ err = nla_put_string(skb, INET_ULP_INFO_NAME, ulp_ops->name);
+ if (err)
+ goto nla_failure;
+
+ if (ulp_ops->get_info)
+ err = ulp_ops->get_info(sk, skb);
+ if (err)
+ goto nla_failure;
+
+ nla_nest_end(skb, nest);
+ return 0;
+
+nla_failure:
+ nla_nest_cancel(skb, nest);
+ return err;
+}
+
static int tcp_diag_get_aux(struct sock *sk, bool net_admin,
struct sk_buff *skb)
{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ int err = 0;
+
#ifdef CONFIG_TCP_MD5SIG
if (net_admin) {
struct tcp_md5sig_info *md5sig;
- int err = 0;
rcu_read_lock();
md5sig = rcu_dereference(tcp_sk(sk)->md5sig_info);
@@ -99,11 +129,21 @@ static int tcp_diag_get_aux(struct sock *sk, bool net_admin,
}
#endif
+ if (net_admin) {
+ const struct tcp_ulp_ops *ulp_ops;
+
+ ulp_ops = icsk->icsk_ulp_ops;
+ if (ulp_ops)
+ err = tcp_diag_put_ulp(skb, sk, ulp_ops);
+ if (err)
+ return err;
+ }
return 0;
}
static size_t tcp_diag_get_aux_size(struct sock *sk, bool net_admin)
{
+ struct inet_connection_sock *icsk = inet_csk(sk);
size_t size = 0;
#ifdef CONFIG_TCP_MD5SIG
@@ -124,6 +164,17 @@ static size_t tcp_diag_get_aux_size(struct sock *sk, bool net_admin)
}
#endif
+ if (net_admin && sk_fullsock(sk)) {
+ const struct tcp_ulp_ops *ulp_ops;
+
+ ulp_ops = icsk->icsk_ulp_ops;
+ if (ulp_ops) {
+ size += nla_total_size(0) +
+ nla_total_size(TCP_ULP_NAME_MAX);
+ if (ulp_ops->get_info_size)
+ size += ulp_ops->get_info_size(sk);
+ }
+ }
return size;
}
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 3fd451271a70..19ad9586c720 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -253,7 +253,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
*/
tp = tcp_sk(child);
- tp->fastopen_rsk = req;
+ rcu_assign_pointer(tp->fastopen_rsk, req);
tcp_rsk(req)->tfo_listener = true;
/* RFC1323: The window in SYN & SYN/ACK segments is never
@@ -422,7 +422,10 @@ bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
cookie->len = -1;
return true;
}
- return cookie->len > 0;
+ if (cookie->len > 0)
+ return true;
+ tcp_sk(sk)->fastopen_client_fail = TFO_COOKIE_UNAVAILABLE;
+ return false;
}
/* This function checks if we want to defer sending SYN until the first
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index c21e8a22fb3b..316ebdf8151d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -79,6 +79,7 @@
#include <trace/events/tcp.h>
#include <linux/jump_label_ratelimit.h>
#include <net/busy_poll.h>
+#include <net/mptcp.h>
int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
@@ -266,7 +267,7 @@ static void tcp_ecn_accept_cwr(struct sock *sk, const struct sk_buff *skb)
static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
{
- tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
+ tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
}
static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
@@ -359,7 +360,8 @@ static void tcp_sndbuf_expand(struct sock *sk)
sndmem *= nr_segs * per_mss;
if (sk->sk_sndbuf < sndmem)
- sk->sk_sndbuf = min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2]);
+ WRITE_ONCE(sk->sk_sndbuf,
+ min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2]));
}
/* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
@@ -483,8 +485,9 @@ static void tcp_clamp_window(struct sock *sk)
!(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
!tcp_under_memory_pressure(sk) &&
sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
- sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
- net->ipv4.sysctl_tcp_rmem[2]);
+ WRITE_ONCE(sk->sk_rcvbuf,
+ min(atomic_read(&sk->sk_rmem_alloc),
+ net->ipv4.sysctl_tcp_rmem[2]));
}
if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
@@ -648,7 +651,7 @@ void tcp_rcv_space_adjust(struct sock *sk)
rcvbuf = min_t(u64, rcvwin * rcvmem,
sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
if (rcvbuf > sk->sk_rcvbuf) {
- sk->sk_rcvbuf = rcvbuf;
+ WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
/* Make the window clamp follow along. */
tp->window_clamp = tcp_win_from_space(sk, rcvbuf);
@@ -913,9 +916,10 @@ static void tcp_check_sack_reordering(struct sock *sk, const u32 low_seq,
/* This must be called before lost_out is incremented */
static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
{
- if (!tp->retransmit_skb_hint ||
- before(TCP_SKB_CB(skb)->seq,
- TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
+ if ((!tp->retransmit_skb_hint && tp->retrans_out >= tp->lost_out) ||
+ (tp->retransmit_skb_hint &&
+ before(TCP_SKB_CB(skb)->seq,
+ TCP_SKB_CB(tp->retransmit_skb_hint)->seq)))
tp->retransmit_skb_hint = skb;
}
@@ -1420,7 +1424,7 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
goto fallback;
- if (!tcp_skb_can_collapse_to(prev))
+ if (!tcp_skb_can_collapse(prev, skb))
goto fallback;
in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
@@ -1725,8 +1729,11 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
}
/* Ignore very old stuff early */
- if (!after(sp[used_sacks].end_seq, prior_snd_una))
+ if (!after(sp[used_sacks].end_seq, prior_snd_una)) {
+ if (i == 0)
+ first_sack_index = -1;
continue;
+ }
used_sacks++;
}
@@ -2666,7 +2673,7 @@ static void tcp_process_loss(struct sock *sk, int flag, int num_dupack,
struct tcp_sock *tp = tcp_sk(sk);
bool recovered = !before(tp->snd_una, tp->high_seq);
- if ((flag & FLAG_SND_UNA_ADVANCED || tp->fastopen_rsk) &&
+ if ((flag & FLAG_SND_UNA_ADVANCED || rcu_access_pointer(tp->fastopen_rsk)) &&
tcp_try_undo_loss(sk, false))
return;
@@ -2990,7 +2997,7 @@ void tcp_rearm_rto(struct sock *sk)
/* If the retrans timer is currently being used by Fast Open
* for SYN-ACK retrans purpose, stay put.
*/
- if (tp->fastopen_rsk)
+ if (rcu_access_pointer(tp->fastopen_rsk))
return;
if (!tp->packets_out) {
@@ -3158,6 +3165,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
tp->retransmit_skb_hint = NULL;
if (unlikely(skb == tp->lost_skb_hint))
tp->lost_skb_hint = NULL;
+ tcp_highest_sack_replace(sk, skb, next);
tcp_rtx_queue_unlink_and_free(skb, sk);
}
@@ -3362,7 +3370,7 @@ static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq)
sock_owned_by_me((struct sock *)tp);
tp->bytes_received += delta;
- tp->rcv_nxt = seq;
+ WRITE_ONCE(tp->rcv_nxt, seq);
}
/* Update our send window.
@@ -3548,7 +3556,7 @@ static void tcp_xmit_recovery(struct sock *sk, int rexmit)
if (rexmit == REXMIT_NONE || sk->sk_state == TCP_SYN_SENT)
return;
- if (unlikely(rexmit == 2)) {
+ if (unlikely(rexmit == REXMIT_NEW)) {
__tcp_push_pending_frames(sk, tcp_current_mss(sk),
TCP_NAGLE_OFF);
if (after(tp->snd_nxt, tp->high_seq))
@@ -3782,6 +3790,49 @@ static void smc_parse_options(const struct tcphdr *th,
#endif
}
+/* Try to parse the MSS option from the TCP header. Return 0 on failure, clamped
+ * value on success.
+ */
+static u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss)
+{
+ const unsigned char *ptr = (const unsigned char *)(th + 1);
+ int length = (th->doff * 4) - sizeof(struct tcphdr);
+ u16 mss = 0;
+
+ while (length > 0) {
+ int opcode = *ptr++;
+ int opsize;
+
+ switch (opcode) {
+ case TCPOPT_EOL:
+ return mss;
+ case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
+ length--;
+ continue;
+ default:
+ if (length < 2)
+ return mss;
+ opsize = *ptr++;
+ if (opsize < 2) /* "silly options" */
+ return mss;
+ if (opsize > length)
+ return mss; /* fail on partial options */
+ if (opcode == TCPOPT_MSS && opsize == TCPOLEN_MSS) {
+ u16 in_mss = get_unaligned_be16(ptr);
+
+ if (in_mss) {
+ if (user_mss && user_mss < in_mss)
+ in_mss = user_mss;
+ mss = in_mss;
+ }
+ }
+ ptr += opsize - 2;
+ length -= opsize;
+ }
+ }
+ return mss;
+}
+
/* Look for tcp options. Normally only called on SYN and SYNACK packets.
* But, this can also be called on packets in the established flow when
* the fast version below fails.
@@ -3875,6 +3926,10 @@ void tcp_parse_options(const struct net *net,
*/
break;
#endif
+ case TCPOPT_MPTCP:
+ mptcp_parse_option(skb, ptr, opsize, opt_rx);
+ break;
+
case TCPOPT_FASTOPEN:
tcp_parse_fastopen_option(
opsize - TCPOLEN_FASTOPEN_BASE,
@@ -4216,8 +4271,10 @@ static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb)
* The receiver remembers and reflects via DSACKs. Leverage the
* DSACK state and change the txhash to re-route speculatively.
*/
- if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq)
+ if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq) {
sk_rethink_txhash(sk);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDUPLICATEDATAREHASH);
+ }
}
static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
@@ -4375,6 +4432,9 @@ static bool tcp_try_coalesce(struct sock *sk,
if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
return false;
+ if (!mptcp_skb_can_collapse(to, from))
+ return false;
+
#ifdef CONFIG_TLS_DEVICE
if (from->decrypted != to->decrypted)
return false;
@@ -4512,6 +4572,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
tp->pred_flags = 0;
inet_csk_schedule_ack(sk);
+ tp->rcv_ooopack += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
seq = TCP_SKB_CB(skb)->seq;
end_seq = TCP_SKB_CB(skb)->end_seq;
@@ -4712,6 +4773,9 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
bool fragstolen;
int eaten;
+ if (sk_is_mptcp(sk))
+ mptcp_incoming_options(sk, skb, &tp->rx_opt);
+
if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
__kfree_skb(skb);
return;
@@ -4883,7 +4947,7 @@ restart:
/* The first skb to collapse is:
* - not SYN/FIN and
* - bloated or contains data before "start" or
- * overlaps to the next one.
+ * overlaps to the next one and mptcp allow collapsing.
*/
if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) &&
(tcp_win_from_space(sk, skb->truesize) > skb->len ||
@@ -4892,7 +4956,7 @@ restart:
break;
}
- if (n && n != tail &&
+ if (n && n != tail && mptcp_skb_can_collapse(skb, n) &&
TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) {
end_of_skbs = false;
break;
@@ -4925,6 +4989,7 @@ restart:
else
__skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */
skb_set_owner_r(nskb, sk);
+ mptcp_skb_ext_move(nskb, skb);
/* Copy data, releasing collapsed skbs. */
while (copy > 0) {
@@ -4944,6 +5009,7 @@ restart:
skb = tcp_collapse_one(sk, skb, list, root);
if (!skb ||
skb == tail ||
+ !mptcp_skb_can_collapse(nskb, skb) ||
(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
goto end;
#ifdef CONFIG_TLS_DEVICE
@@ -5312,7 +5378,7 @@ static void tcp_check_urg(struct sock *sk, const struct tcphdr *th)
}
tp->urg_data = TCP_URG_NOTYET;
- tp->urg_seq = ptr;
+ WRITE_ONCE(tp->urg_seq, ptr);
/* Disable header prediction. */
tp->pred_flags = 0;
@@ -5768,6 +5834,10 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp);
if (data) { /* Retransmit unacked data in SYN */
+ if (tp->total_retrans)
+ tp->fastopen_client_fail = TFO_SYN_RETRANSMITTED;
+ else
+ tp->fastopen_client_fail = TFO_DATA_NOT_ACKED;
skb_rbtree_walk_from(data) {
if (__tcp_retransmit_skb(sk, data, 1))
break;
@@ -5838,8 +5908,14 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
* the segment and return)"
*/
if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) ||
- after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt))
+ after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
+ /* Previous FIN/ACK or RST/ACK might be ignored. */
+ if (icsk->icsk_retransmits == 0)
+ inet_csk_reset_xmit_timer(sk,
+ ICSK_TIME_RETRANS,
+ TCP_TIMEOUT_MIN, TCP_RTO_MAX);
goto reset_and_undo;
+ }
if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
!between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp,
@@ -5888,7 +5964,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
/* Ok.. it's good. Set up sequence numbers and
* move to established.
*/
- tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
+ WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1);
tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
/* RFC1323: The window in SYN & SYN/ACK segments is
@@ -5914,10 +5990,13 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
tcp_initialize_rcv_mss(sk);
+ if (sk_is_mptcp(sk))
+ mptcp_rcv_synsent(sk);
+
/* Remember, tcp_poll() does not lock socket!
* Change state from SYN-SENT only after copied_seq
* is initialized. */
- tp->copied_seq = tp->rcv_nxt;
+ WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
smc_check_reset_syn(tp);
@@ -5991,8 +6070,8 @@ discard:
tp->tcp_header_len = sizeof(struct tcphdr);
}
- tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
- tp->copied_seq = tp->rcv_nxt;
+ WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1);
+ WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
/* RFC1323: The window in SYN & SYN/ACK segments is
@@ -6043,6 +6122,8 @@ reset_and_undo:
static void tcp_rcv_synrecv_state_fastopen(struct sock *sk)
{
+ struct request_sock *req;
+
tcp_try_undo_loss(sk, false);
/* Reset rtx states to prevent spurious retransmits_timed_out() */
@@ -6052,7 +6133,9 @@ static void tcp_rcv_synrecv_state_fastopen(struct sock *sk)
/* Once we leave TCP_SYN_RECV or TCP_FIN_WAIT_1,
* we no longer need req so release it.
*/
- reqsk_fastopen_remove(sk, tcp_sk(sk)->fastopen_rsk, false);
+ req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk,
+ lockdep_sock_is_held(sk));
+ reqsk_fastopen_remove(sk, req, false);
/* Re-arm the timer because data may have been sent out.
* This is similar to the regular data transmission case
@@ -6127,7 +6210,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
tcp_mstamp_refresh(tp);
tp->rx_opt.saw_tstamp = 0;
- req = tp->fastopen_rsk;
+ req = rcu_dereference_protected(tp->fastopen_rsk,
+ lockdep_sock_is_held(sk));
if (req) {
bool req_stolen;
@@ -6167,7 +6251,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
tcp_try_undo_spurious_syn(sk);
tp->retrans_stamp = 0;
tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB);
- tp->copied_seq = tp->rcv_nxt;
+ WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
}
smp_mb();
tcp_set_state(sk, TCP_ESTABLISHED);
@@ -6274,8 +6358,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
case TCP_CLOSE_WAIT:
case TCP_CLOSING:
case TCP_LAST_ACK:
- if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
+ if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
+ if (sk_is_mptcp(sk))
+ mptcp_incoming_options(sk, skb, &tp->rx_opt);
break;
+ }
/* fall through */
case TCP_FIN_WAIT1:
case TCP_FIN_WAIT2:
@@ -6422,9 +6509,7 @@ EXPORT_SYMBOL(inet_reqsk_alloc);
/*
* Return true if a syncookie should be sent
*/
-static bool tcp_syn_flood_action(const struct sock *sk,
- const struct sk_buff *skb,
- const char *proto)
+static bool tcp_syn_flood_action(const struct sock *sk, const char *proto)
{
struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
const char *msg = "Dropping request";
@@ -6444,7 +6529,7 @@ static bool tcp_syn_flood_action(const struct sock *sk,
net->ipv4.sysctl_tcp_syncookies != 2 &&
xchg(&queue->synflood_warned, 1) == 0)
net_info_ratelimited("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
- proto, ntohs(tcp_hdr(skb)->dest), msg);
+ proto, sk->sk_num, msg);
return want_cookie;
}
@@ -6466,6 +6551,36 @@ static void tcp_reqsk_record_syn(const struct sock *sk,
}
}
+/* If a SYN cookie is required and supported, returns a clamped MSS value to be
+ * used for SYN cookie generation.
+ */
+u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops,
+ const struct tcp_request_sock_ops *af_ops,
+ struct sock *sk, struct tcphdr *th)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u16 mss;
+
+ if (sock_net(sk)->ipv4.sysctl_tcp_syncookies != 2 &&
+ !inet_csk_reqsk_queue_is_full(sk))
+ return 0;
+
+ if (!tcp_syn_flood_action(sk, rsk_ops->slab_name))
+ return 0;
+
+ if (sk_acceptq_is_full(sk)) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+ return 0;
+ }
+
+ mss = tcp_parse_mss_option(th, tp->rx_opt.user_mss);
+ if (!mss)
+ mss = af_ops->mss_clamp;
+
+ return mss;
+}
+EXPORT_SYMBOL_GPL(tcp_get_syncookie_mss);
+
int tcp_conn_request(struct request_sock_ops *rsk_ops,
const struct tcp_request_sock_ops *af_ops,
struct sock *sk, struct sk_buff *skb)
@@ -6487,7 +6602,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
*/
if ((net->ipv4.sysctl_tcp_syncookies == 2 ||
inet_csk_reqsk_queue_is_full(sk)) && !isn) {
- want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name);
+ want_cookie = tcp_syn_flood_action(sk, rsk_ops->slab_name);
if (!want_cookie)
goto drop;
}
@@ -6503,6 +6618,9 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
tcp_rsk(req)->af_specific = af_ops;
tcp_rsk(req)->ts_off = 0;
+#if IS_ENABLED(CONFIG_MPTCP)
+ tcp_rsk(req)->is_mptcp = 0;
+#endif
tcp_clear_options(&tmp_opt);
tmp_opt.mss_clamp = af_ops->mss_clamp;
@@ -6525,6 +6643,9 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
af_ops->init_req(req, sk, skb);
+ if (IS_ENABLED(CONFIG_MPTCP) && want_cookie)
+ tcp_rsk(req)->is_mptcp = 0;
+
if (security_inet_conn_request(sk, skb, req))
goto drop_and_free;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index d57641cb3477..df1166b76126 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -121,11 +121,9 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
#if IS_ENABLED(CONFIG_IPV6)
if (tw->tw_family == AF_INET6) {
if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
- (ipv6_addr_v4mapped(&tw->tw_v6_daddr) &&
- (tw->tw_v6_daddr.s6_addr[12] == 127)) ||
+ ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
- (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) &&
- (tw->tw_v6_rcv_saddr.s6_addr[12] == 127)))
+ ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
loopback = true;
} else
#endif
@@ -164,9 +162,11 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
* without appearing to create any others.
*/
if (likely(!tp->repair)) {
- tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
- if (tp->write_seq == 0)
- tp->write_seq = 1;
+ u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
+
+ if (!seq)
+ seq = 1;
+ WRITE_ONCE(tp->write_seq, seq);
tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
}
@@ -253,7 +253,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
tp->rx_opt.ts_recent = 0;
tp->rx_opt.ts_recent_stamp = 0;
if (likely(!tp->repair))
- tp->write_seq = 0;
+ WRITE_ONCE(tp->write_seq, 0);
}
inet->inet_dport = usin->sin_port;
@@ -291,16 +291,17 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
if (likely(!tp->repair)) {
if (!tp->write_seq)
- tp->write_seq = secure_tcp_seq(inet->inet_saddr,
- inet->inet_daddr,
- inet->inet_sport,
- usin->sin_port);
+ WRITE_ONCE(tp->write_seq,
+ secure_tcp_seq(inet->inet_saddr,
+ inet->inet_daddr,
+ inet->inet_sport,
+ usin->sin_port));
tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
inet->inet_saddr,
inet->inet_daddr);
}
- inet->inet_id = tp->write_seq ^ jiffies;
+ inet->inet_id = prandom_u32();
if (tcp_fastopen_defer_connect(sk, &err))
return err;
@@ -478,7 +479,7 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
icsk = inet_csk(sk);
tp = tcp_sk(sk);
/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
- fastopen = tp->fastopen_rsk;
+ fastopen = rcu_dereference(tp->fastopen_rsk);
snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
if (sk->sk_state != TCP_LISTEN &&
!between(seq, snd_una, tp->snd_nxt)) {
@@ -700,9 +701,21 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
rcu_read_lock();
hash_location = tcp_parse_md5sig_option(th);
if (sk && sk_fullsock(sk)) {
- key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
- &ip_hdr(skb)->saddr, AF_INET);
+ const union tcp_md5_addr *addr;
+ int l3index;
+
+ /* sdif set, means packet ingressed via a device
+ * in an L3 domain and inet_iif is set to it.
+ */
+ l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
+ addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
+ key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
} else if (hash_location) {
+ const union tcp_md5_addr *addr;
+ int sdif = tcp_v4_sdif(skb);
+ int dif = inet_iif(skb);
+ int l3index;
+
/*
* active side is lost. Try to find listening socket through
* source port, and then find md5 key through listening socket.
@@ -713,14 +726,17 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
ip_hdr(skb)->saddr,
th->source, ip_hdr(skb)->daddr,
- ntohs(th->source), inet_iif(skb),
- tcp_v4_sdif(skb));
+ ntohs(th->source), dif, sdif);
/* don't send rst if it can't find key */
if (!sk1)
goto out;
- key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
- &ip_hdr(skb)->saddr, AF_INET);
+ /* sdif set, means packet ingressed via a device
+ * in an L3 domain and dif is set to it.
+ */
+ l3index = sdif ? dif : 0;
+ addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
+ key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
if (!key)
goto out;
@@ -771,6 +787,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
if (sk) {
ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
inet_twsk(sk)->tw_mark : sk->sk_mark;
+ ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
+ inet_twsk(sk)->tw_priority : sk->sk_priority;
transmit_time = tcp_transmit_time(sk);
}
ip_send_unicast_reply(ctl_sk,
@@ -866,6 +884,8 @@ static void tcp_v4_send_ack(const struct sock *sk,
ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
inet_twsk(sk)->tw_mark : sk->sk_mark;
+ ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
+ inet_twsk(sk)->tw_priority : sk->sk_priority;
transmit_time = tcp_transmit_time(sk);
ip_send_unicast_reply(ctl_sk,
skb, &TCP_SKB_CB(skb)->header.h4.opt,
@@ -900,6 +920,9 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
struct request_sock *req)
{
+ const union tcp_md5_addr *addr;
+ int l3index;
+
/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
* sk->sk_state == TCP_SYN_RECV -> for Fast Open.
*/
@@ -911,14 +934,15 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
* exception of <SYN> segments, MUST be right-shifted by
* Rcv.Wind.Shift bits:
*/
+ addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
+ l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
tcp_v4_send_ack(sk, skb, seq,
tcp_rsk(req)->rcv_nxt,
req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
req->ts_recent,
0,
- tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
- AF_INET),
+ tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
ip_hdr(skb)->tos);
}
@@ -978,7 +1002,7 @@ DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
EXPORT_SYMBOL(tcp_md5_needed);
/* Find the Key structure for an address. */
-struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk,
+struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
const union tcp_md5_addr *addr,
int family)
{
@@ -998,7 +1022,8 @@ struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk,
hlist_for_each_entry_rcu(key, &md5sig->head, node) {
if (key->family != family)
continue;
-
+ if (key->l3index && key->l3index != l3index)
+ continue;
if (family == AF_INET) {
mask = inet_make_mask(key->prefixlen);
match = (key->addr.a4.s_addr & mask) ==
@@ -1022,7 +1047,8 @@ EXPORT_SYMBOL(__tcp_md5_do_lookup);
static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
const union tcp_md5_addr *addr,
- int family, u8 prefixlen)
+ int family, u8 prefixlen,
+ int l3index)
{
const struct tcp_sock *tp = tcp_sk(sk);
struct tcp_md5sig_key *key;
@@ -1041,6 +1067,8 @@ static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
hlist_for_each_entry_rcu(key, &md5sig->head, node) {
if (key->family != family)
continue;
+ if (key->l3index && key->l3index != l3index)
+ continue;
if (!memcmp(&key->addr, addr, size) &&
key->prefixlen == prefixlen)
return key;
@@ -1052,23 +1080,26 @@ struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
const struct sock *addr_sk)
{
const union tcp_md5_addr *addr;
+ int l3index;
+ l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
+ addr_sk->sk_bound_dev_if);
addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
- return tcp_md5_do_lookup(sk, addr, AF_INET);
+ return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);
/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
- int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
- gfp_t gfp)
+ int family, u8 prefixlen, int l3index,
+ const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
/* Add Key to the list */
struct tcp_md5sig_key *key;
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_md5sig_info *md5sig;
- key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
+ key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
if (key) {
/* Pre-existing entry - just update that one. */
memcpy(key->key, newkey, newkeylen);
@@ -1100,6 +1131,7 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
key->keylen = newkeylen;
key->family = family;
key->prefixlen = prefixlen;
+ key->l3index = l3index;
memcpy(&key->addr, addr,
(family == AF_INET6) ? sizeof(struct in6_addr) :
sizeof(struct in_addr));
@@ -1109,11 +1141,11 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
EXPORT_SYMBOL(tcp_md5_do_add);
int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
- u8 prefixlen)
+ u8 prefixlen, int l3index)
{
struct tcp_md5sig_key *key;
- key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
+ key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
if (!key)
return -ENOENT;
hlist_del_rcu(&key->node);
@@ -1144,7 +1176,9 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
{
struct tcp_md5sig cmd;
struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
+ const union tcp_md5_addr *addr;
u8 prefixlen = 32;
+ int l3index = 0;
if (optlen < sizeof(cmd))
return -EINVAL;
@@ -1162,16 +1196,34 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
return -EINVAL;
}
+ if (optname == TCP_MD5SIG_EXT &&
+ cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
+ struct net_device *dev;
+
+ rcu_read_lock();
+ dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
+ if (dev && netif_is_l3_master(dev))
+ l3index = dev->ifindex;
+
+ rcu_read_unlock();
+
+ /* ok to reference set/not set outside of rcu;
+ * right now device MUST be an L3 master
+ */
+ if (!dev || !l3index)
+ return -EINVAL;
+ }
+
+ addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
+
if (!cmd.tcpm_keylen)
- return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
- AF_INET, prefixlen);
+ return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index);
if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
return -EINVAL;
- return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
- AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
- GFP_KERNEL);
+ return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index,
+ cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
}
static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
@@ -1281,7 +1333,8 @@ EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
/* Called with rcu_read_lock() */
static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
- const struct sk_buff *skb)
+ const struct sk_buff *skb,
+ int dif, int sdif)
{
#ifdef CONFIG_TCP_MD5SIG
/*
@@ -1296,11 +1349,17 @@ static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
struct tcp_md5sig_key *hash_expected;
const struct iphdr *iph = ip_hdr(skb);
const struct tcphdr *th = tcp_hdr(skb);
- int genhash;
+ const union tcp_md5_addr *addr;
unsigned char newhash[16];
+ int genhash, l3index;
+
+ /* sdif set, means packet ingressed via a device
+ * in an L3 domain and dif is set to the l3mdev
+ */
+ l3index = sdif ? dif : 0;
- hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
- AF_INET);
+ addr = (union tcp_md5_addr *)&iph->saddr;
+ hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
hash_location = tcp_parse_md5sig_option(th);
/* We've parsed the options - do we have a hash? */
@@ -1326,11 +1385,11 @@ static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
if (genhash || memcmp(hash_location, newhash, 16) != 0) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
- net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
+ net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
&iph->saddr, ntohs(th->source),
&iph->daddr, ntohs(th->dest),
genhash ? " tcp_v4_calc_md5_hash failed"
- : "");
+ : "", l3index);
return true;
}
return false;
@@ -1367,7 +1426,7 @@ struct request_sock_ops tcp_request_sock_ops __read_mostly = {
.syn_ack_timeout = tcp_syn_ack_timeout,
};
-static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
+const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
.mss_clamp = TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
.req_md5_lookup = tcp_v4_md5_lookup,
@@ -1414,7 +1473,9 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
struct tcp_sock *newtp;
struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
+ const union tcp_md5_addr *addr;
struct tcp_md5sig_key *key;
+ int l3index;
#endif
struct ip_options_rcu *inet_opt;
@@ -1443,7 +1504,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
inet_csk(newsk)->icsk_ext_hdr_len = 0;
if (inet_opt)
inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
- newinet->inet_id = newtp->write_seq ^ jiffies;
+ newinet->inet_id = prandom_u32();
if (!dst) {
dst = inet_csk_route_child_sock(sk, newsk, req);
@@ -1462,9 +1523,10 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
tcp_initialize_rcv_mss(newsk);
#ifdef CONFIG_TCP_MD5SIG
+ l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
/* Copy over the MD5 key from the original socket */
- key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
- AF_INET);
+ addr = (union tcp_md5_addr *)&newinet->inet_daddr;
+ key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
if (key) {
/*
* We're using one, so create a matching key
@@ -1472,8 +1534,8 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
* memory, then we end up not copying the key
* across. Shucks.
*/
- tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
- AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
+ tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index,
+ key->key, key->keylen, GFP_ATOMIC);
sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
}
#endif
@@ -1515,6 +1577,21 @@ static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
return sk;
}
+u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
+ struct tcphdr *th, u32 *cookie)
+{
+ u16 mss = 0;
+#ifdef CONFIG_SYN_COOKIES
+ mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
+ &tcp_request_sock_ipv4_ops, sk, th);
+ if (mss) {
+ *cookie = __cookie_v4_init_sequence(iph, th, &mss);
+ tcp_synq_overflow(sk);
+ }
+#endif
+ return mss;
+}
+
/* The socket must have it's spinlock held when we get
* here, unless it is a TCP_LISTEN socket.
*
@@ -1625,7 +1702,7 @@ int tcp_v4_early_demux(struct sk_buff *skb)
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
{
- u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
+ u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
struct skb_shared_info *shinfo;
const struct tcphdr *th;
struct tcphdr *thtail;
@@ -1788,6 +1865,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
struct net *net = dev_net(skb->dev);
struct sk_buff *skb_to_free;
int sdif = inet_sdif(skb);
+ int dif = inet_iif(skb);
const struct iphdr *iph;
const struct tcphdr *th;
bool refcounted;
@@ -1836,7 +1914,7 @@ process:
struct sock *nsk;
sk = req->rsk_listener;
- if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
+ if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
sk_drops_add(sk, skb);
reqsk_put(req);
goto discard_it;
@@ -1894,10 +1972,10 @@ process:
if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
goto discard_and_relse;
- if (tcp_v4_inbound_md5_hash(sk, skb))
+ if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
goto discard_and_relse;
- nf_reset(skb);
+ nf_reset_ct(skb);
if (tcp_filter(sk, skb))
goto discard_and_relse;
@@ -2102,7 +2180,7 @@ void tcp_v4_destroy_sock(struct sock *sk)
if (inet_csk(sk)->icsk_bind_hash)
inet_put_port(sk);
- BUG_ON(tp->fastopen_rsk);
+ BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
/* If socket is aborted during connect operation */
tcp_free_fastopen_req(tp);
@@ -2127,13 +2205,14 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
struct tcp_iter_state *st = seq->private;
struct net *net = seq_file_net(seq);
struct inet_listen_hashbucket *ilb;
+ struct hlist_nulls_node *node;
struct sock *sk = cur;
if (!sk) {
get_head:
ilb = &tcp_hashinfo.listening_hash[st->bucket];
spin_lock(&ilb->lock);
- sk = sk_head(&ilb->head);
+ sk = sk_nulls_head(&ilb->nulls_head);
st->offset = 0;
goto get_sk;
}
@@ -2141,9 +2220,9 @@ get_head:
++st->num;
++st->offset;
- sk = sk_next(sk);
+ sk = sk_nulls_next(sk);
get_sk:
- sk_for_each_from(sk) {
+ sk_nulls_for_each_from(sk, node) {
if (!net_eq(sock_net(sk), net))
continue;
if (sk->sk_family == afinfo->family)
@@ -2431,17 +2510,18 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
state = inet_sk_state_load(sk);
if (state == TCP_LISTEN)
- rx_queue = sk->sk_ack_backlog;
+ rx_queue = READ_ONCE(sk->sk_ack_backlog);
else
/* Because we don't lock the socket,
* we might find a transient negative value.
*/
- rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
+ rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
+ READ_ONCE(tp->copied_seq), 0);
seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
i, src, srcp, dest, destp, state,
- tp->write_seq - tp->snd_una,
+ READ_ONCE(tp->write_seq) - tp->snd_una,
rx_queue,
timer_active,
jiffies_delta_to_clock_t(timer_expires - jiffies),
@@ -2598,7 +2678,8 @@ static void __net_exit tcp_sk_exit(struct net *net)
int cpu;
if (net->ipv4.tcp_congestion_control)
- module_put(net->ipv4.tcp_congestion_control->owner);
+ bpf_module_put(net->ipv4.tcp_congestion_control,
+ net->ipv4.tcp_congestion_control->owner);
for_each_possible_cpu(cpu)
inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
@@ -2637,6 +2718,7 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
+ net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
@@ -2652,12 +2734,13 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
net->ipv4.sysctl_tcp_tw_reuse = 2;
+ net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
cnt = tcp_hashinfo.ehash_mask + 1;
net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
- net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
+ net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
net->ipv4.sysctl_tcp_sack = 1;
net->ipv4.sysctl_tcp_window_scaling = 1;
net->ipv4.sysctl_tcp_timestamps = 1;
@@ -2703,7 +2786,8 @@ static int __net_init tcp_sk_init(struct net *net)
/* Reno is always built in */
if (!net_eq(net, &init_net) &&
- try_module_get(init_net.ipv4.tcp_congestion_control->owner))
+ bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
+ init_net.ipv4.tcp_congestion_control->owner))
net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
else
net->ipv4.tcp_congestion_control = &tcp_reno;
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index c4848e7a0aad..279db8822439 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -385,7 +385,8 @@ void tcp_update_metrics(struct sock *sk)
if (tcp_in_initial_slowstart(tp)) {
/* Slow start still did not finish. */
- if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
+ if (!net->ipv4.sysctl_tcp_no_ssthresh_metrics_save &&
+ !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
if (val && (tp->snd_cwnd >> 1) > val)
tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
@@ -400,7 +401,8 @@ void tcp_update_metrics(struct sock *sk)
} else if (!tcp_in_slow_start(tp) &&
icsk->icsk_ca_state == TCP_CA_Open) {
/* Cong. avoidance phase, cwnd is reliable. */
- if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH))
+ if (!net->ipv4.sysctl_tcp_no_ssthresh_metrics_save &&
+ !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH))
tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
@@ -416,7 +418,8 @@ void tcp_update_metrics(struct sock *sk)
tcp_metric_set(tm, TCP_METRIC_CWND,
(val + tp->snd_ssthresh) >> 1);
}
- if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
+ if (!net->ipv4.sysctl_tcp_no_ssthresh_metrics_save &&
+ !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
if (val && tp->snd_ssthresh > val)
tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
@@ -441,6 +444,7 @@ void tcp_init_metrics(struct sock *sk)
{
struct dst_entry *dst = __sk_dst_get(sk);
struct tcp_sock *tp = tcp_sk(sk);
+ struct net *net = sock_net(sk);
struct tcp_metrics_block *tm;
u32 val, crtt = 0; /* cached RTT scaled by 8 */
@@ -458,7 +462,8 @@ void tcp_init_metrics(struct sock *sk)
if (tcp_metric_locked(tm, TCP_METRIC_CWND))
tp->snd_cwnd_clamp = tcp_metric_get(tm, TCP_METRIC_CWND);
- val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
+ val = net->ipv4.sysctl_tcp_no_ssthresh_metrics_save ?
+ 0 : tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
if (val) {
tp->snd_ssthresh = val;
if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 8bcaf2586b68..ad3b56d9fa71 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -266,6 +266,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
tw->tw_transparent = inet->transparent;
tw->tw_mark = sk->sk_mark;
+ tw->tw_priority = sk->sk_priority;
tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale;
tcptw->tw_rcv_nxt = tp->rcv_nxt;
tcptw->tw_snd_nxt = tp->snd_nxt;
@@ -413,7 +414,7 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst)
rcu_read_lock();
ca = tcp_ca_find_key(ca_key);
- if (likely(ca && try_module_get(ca->owner))) {
+ if (likely(ca && bpf_try_module_get(ca, ca->owner))) {
icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
icsk->icsk_ca_ops = ca;
ca_got_dst = true;
@@ -424,7 +425,7 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst)
/* If no valid choice made yet, assign current system default ca. */
if (!ca_got_dst &&
(!icsk->icsk_ca_setsockopt ||
- !try_module_get(icsk->icsk_ca_ops->owner)))
+ !bpf_try_module_get(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner)))
tcp_assign_congestion_control(sk);
tcp_set_ca_state(sk, TCP_CA_Open);
@@ -461,6 +462,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
struct tcp_request_sock *treq = tcp_rsk(req);
struct inet_connection_sock *newicsk;
struct tcp_sock *oldtp, *newtp;
+ u32 seq;
if (!newsk)
return NULL;
@@ -474,12 +476,16 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
/* Now setup tcp_sock */
newtp->pred_flags = 0;
- newtp->rcv_wup = newtp->copied_seq =
- newtp->rcv_nxt = treq->rcv_isn + 1;
+ seq = treq->rcv_isn + 1;
+ newtp->rcv_wup = seq;
+ WRITE_ONCE(newtp->copied_seq, seq);
+ WRITE_ONCE(newtp->rcv_nxt, seq);
newtp->segs_in = 1;
- newtp->snd_sml = newtp->snd_una =
- newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1;
+ seq = treq->snt_isn + 1;
+ newtp->snd_sml = newtp->snd_una = seq;
+ WRITE_ONCE(newtp->snd_nxt, seq);
+ newtp->snd_up = seq;
INIT_LIST_HEAD(&newtp->tsq_node);
INIT_LIST_HEAD(&newtp->tsorted_sent_queue);
@@ -494,7 +500,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
newtp->total_retrans = req->num_retrans;
tcp_init_xmit_timers(newsk);
- newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1;
+ WRITE_ONCE(newtp->write_seq, newtp->pushed_seq = treq->snt_isn + 1);
if (sock_flag(newsk, SOCK_KEEPOPEN))
inet_csk_reset_keepalive_timer(newsk,
@@ -540,7 +546,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
newtp->rx_opt.mss_clamp = req->mss;
tcp_ecn_openreq_child(newtp, req);
newtp->fastopen_req = NULL;
- newtp->fastopen_rsk = NULL;
+ RCU_INIT_POINTER(newtp->fastopen_rsk, NULL);
__TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 979520e46e33..306e25d743e8 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -38,6 +38,7 @@
#define pr_fmt(fmt) "TCP: " fmt
#include <net/tcp.h>
+#include <net/mptcp.h>
#include <linux/compiler.h>
#include <linux/gfp.h>
@@ -67,11 +68,14 @@ static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
struct tcp_sock *tp = tcp_sk(sk);
unsigned int prior_packets = tp->packets_out;
- tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
+ WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(skb)->end_seq);
__skb_unlink(skb, &sk->sk_write_queue);
tcp_rbtree_insert(&sk->tcp_rtx_queue, skb);
+ if (tp->highest_sack == NULL)
+ tp->highest_sack = skb;
+
tp->packets_out += tcp_skb_pcount(skb);
if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
tcp_rearm_rto(sk);
@@ -411,6 +415,7 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp)
#define OPTION_WSCALE (1 << 3)
#define OPTION_FAST_OPEN_COOKIE (1 << 8)
#define OPTION_SMC (1 << 9)
+#define OPTION_MPTCP (1 << 10)
static void smc_options_write(__be32 *ptr, u16 *options)
{
@@ -436,8 +441,17 @@ struct tcp_out_options {
__u8 *hash_location; /* temporary pointer, overloaded */
__u32 tsval, tsecr; /* need to include OPTION_TS */
struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */
+ struct mptcp_out_options mptcp;
};
+static void mptcp_options_write(__be32 *ptr, struct tcp_out_options *opts)
+{
+#if IS_ENABLED(CONFIG_MPTCP)
+ if (unlikely(OPTION_MPTCP & opts->options))
+ mptcp_write_options(ptr, &opts->mptcp);
+#endif
+}
+
/* Write previously computed TCP options to the packet.
*
* Beware: Something in the Internet is very sensitive to the ordering of
@@ -546,6 +560,8 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
}
smc_options_write(ptr, &options);
+
+ mptcp_options_write(ptr, opts);
}
static void smc_set_option(const struct tcp_sock *tp,
@@ -581,6 +597,22 @@ static void smc_set_option_cond(const struct tcp_sock *tp,
#endif
}
+static void mptcp_set_option_cond(const struct request_sock *req,
+ struct tcp_out_options *opts,
+ unsigned int *remaining)
+{
+ if (rsk_is_mptcp(req)) {
+ unsigned int size;
+
+ if (mptcp_synack_options(req, &size, &opts->mptcp)) {
+ if (*remaining >= size) {
+ opts->options |= OPTION_MPTCP;
+ *remaining -= size;
+ }
+ }
+ }
+}
+
/* Compute TCP options for SYN packets. This is not the final
* network wire format yet.
*/
@@ -650,6 +682,15 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
smc_set_option(tp, opts, &remaining);
+ if (sk_is_mptcp(sk)) {
+ unsigned int size;
+
+ if (mptcp_syn_options(sk, skb, &size, &opts->mptcp)) {
+ opts->options |= OPTION_MPTCP;
+ remaining -= size;
+ }
+ }
+
return MAX_TCP_OPTION_SPACE - remaining;
}
@@ -711,6 +752,8 @@ static unsigned int tcp_synack_options(const struct sock *sk,
}
}
+ mptcp_set_option_cond(req, opts, &remaining);
+
smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining);
return MAX_TCP_OPTION_SPACE - remaining;
@@ -748,13 +791,35 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
size += TCPOLEN_TSTAMP_ALIGNED;
}
+ /* MPTCP options have precedence over SACK for the limited TCP
+ * option space because a MPTCP connection would be forced to
+ * fall back to regular TCP if a required multipath option is
+ * missing. SACK still gets a chance to use whatever space is
+ * left.
+ */
+ if (sk_is_mptcp(sk)) {
+ unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
+ unsigned int opt_size = 0;
+
+ if (mptcp_established_options(sk, skb, &opt_size, remaining,
+ &opts->mptcp)) {
+ opts->options |= OPTION_MPTCP;
+ size += opt_size;
+ }
+ }
+
eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
if (unlikely(eff_sacks)) {
const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
+ if (unlikely(remaining < TCPOLEN_SACK_BASE_ALIGNED +
+ TCPOLEN_SACK_PERBLOCK))
+ return size;
+
opts->num_sack_blocks =
min_t(unsigned int, eff_sacks,
(remaining - TCPOLEN_SACK_BASE_ALIGNED) /
TCPOLEN_SACK_PERBLOCK);
+
size += TCPOLEN_SACK_BASE_ALIGNED +
opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
}
@@ -1050,11 +1115,22 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
tcb = TCP_SKB_CB(skb);
memset(&opts, 0, sizeof(opts));
- if (unlikely(tcb->tcp_flags & TCPHDR_SYN))
+ if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
- else
+ } else {
tcp_options_size = tcp_established_options(sk, skb, &opts,
&md5);
+ /* Force a PSH flag on all (GSO) packets to expedite GRO flush
+ * at receiver : This slightly improve GRO performance.
+ * Note that we do not force the PSH flag for non GSO packets,
+ * because they might be sent under high congestion events,
+ * and in this case it is better to delay the delivery of 1-MSS
+ * packets and thus the corresponding ACK packet that would
+ * release the following packet.
+ */
+ if (tcp_skb_pcount(skb) > 1)
+ tcb->tcp_flags |= TCPHDR_PSH;
+ }
tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
/* if no packet is in qdisc/device queue, then allow XPS to select
@@ -1185,10 +1261,10 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
struct tcp_sock *tp = tcp_sk(sk);
/* Advance write_seq and place onto the write_queue. */
- tp->write_seq = TCP_SKB_CB(skb)->end_seq;
+ WRITE_ONCE(tp->write_seq, TCP_SKB_CB(skb)->end_seq);
__skb_header_release(skb);
tcp_add_write_queue_tail(sk, skb);
- sk->sk_wmem_queued += skb->truesize;
+ sk_wmem_queued_add(sk, skb->truesize);
sk_mem_charge(sk, skb->truesize);
}
@@ -1322,7 +1398,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
return -ENOMEM; /* We'll just try again later. */
skb_copy_decrypted(buff, skb);
- sk->sk_wmem_queued += buff->truesize;
+ sk_wmem_queued_add(sk, buff->truesize);
sk_mem_charge(sk, buff->truesize);
nlen = skb->len - len - nsize;
buff->truesize += nlen;
@@ -1403,7 +1479,7 @@ static int __pskb_trim_head(struct sk_buff *skb, int len)
} else {
shinfo->frags[k] = shinfo->frags[i];
if (eat) {
- shinfo->frags[k].page_offset += eat;
+ skb_frag_off_add(&shinfo->frags[k], eat);
skb_frag_size_sub(&shinfo->frags[k], eat);
eat = 0;
}
@@ -1432,7 +1508,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
if (delta_truesize) {
skb->truesize -= delta_truesize;
- sk->sk_wmem_queued -= delta_truesize;
+ sk_wmem_queued_add(sk, -delta_truesize);
sk_mem_uncharge(sk, delta_truesize);
sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
}
@@ -1713,7 +1789,7 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
u32 bytes, segs;
bytes = min_t(unsigned long,
- sk->sk_pacing_rate >> sk->sk_pacing_shift,
+ sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift),
sk->sk_gso_max_size - 1 - MAX_TCP_HEADER);
/* Goal is to send at least one packet per ms,
@@ -1877,7 +1953,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
return -ENOMEM;
skb_copy_decrypted(buff, skb);
- sk->sk_wmem_queued += buff->truesize;
+ sk_wmem_queued_add(sk, buff->truesize);
sk_mem_charge(sk, buff->truesize);
buff->truesize += nlen;
skb->truesize -= nlen;
@@ -2053,7 +2129,7 @@ static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len)
if (len <= skb->len)
break;
- if (unlikely(TCP_SKB_CB(skb)->eor))
+ if (unlikely(TCP_SKB_CB(skb)->eor) || tcp_has_tx_tstamp(skb))
return false;
len -= skb->len;
@@ -2141,7 +2217,7 @@ static int tcp_mtu_probe(struct sock *sk)
nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC, false);
if (!nskb)
return -1;
- sk->sk_wmem_queued += nskb->truesize;
+ sk_wmem_queued_add(sk, nskb->truesize);
sk_mem_charge(sk, nskb->truesize);
skb = tcp_send_head(sk);
@@ -2170,6 +2246,7 @@ static int tcp_mtu_probe(struct sock *sk)
* we need to propagate it to the new skb.
*/
TCP_SKB_CB(nskb)->eor = TCP_SKB_CB(skb)->eor;
+ tcp_skb_collapse_tstamp(nskb, skb);
tcp_unlink_write_queue(skb, sk);
sk_wmem_free_skb(sk, skb);
} else {
@@ -2247,7 +2324,7 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
limit = max_t(unsigned long,
2 * skb->truesize,
- sk->sk_pacing_rate >> sk->sk_pacing_shift);
+ sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift));
if (sk->sk_pacing_status == SK_PACING_NONE)
limit = min_t(unsigned long, limit,
sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes);
@@ -2425,6 +2502,14 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
if (tcp_small_queue_check(sk, skb, 0))
break;
+ /* Argh, we hit an empty skb(), presumably a thread
+ * is sleeping in sendmsg()/sk_stream_wait_memory().
+ * We do not want to send a pure-ack packet and have
+ * a strange looking rtx queue with empty packet(s).
+ */
+ if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq)
+ break;
+
if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
break;
@@ -2470,7 +2555,7 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto)
/* Don't do any loss probe on a Fast Open connection before 3WHS
* finishes.
*/
- if (tp->fastopen_rsk)
+ if (rcu_access_pointer(tp->fastopen_rsk))
return false;
early_retrans = sock_net(sk)->ipv4.sysctl_tcp_early_retrans;
@@ -2841,7 +2926,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
if (!tcp_can_collapse(sk, skb))
break;
- if (!tcp_skb_can_collapse_to(to))
+ if (!tcp_skb_can_collapse(to, skb))
break;
space -= skb->len;
@@ -3108,7 +3193,7 @@ void sk_forced_mem_schedule(struct sock *sk, int size)
*/
void tcp_send_fin(struct sock *sk)
{
- struct sk_buff *skb, *tskb = tcp_write_queue_tail(sk);
+ struct sk_buff *skb, *tskb, *tail = tcp_write_queue_tail(sk);
struct tcp_sock *tp = tcp_sk(sk);
/* Optimization, tack on the FIN if we have one skb in write queue and
@@ -3116,6 +3201,7 @@ void tcp_send_fin(struct sock *sk)
* Note: in the latter case, FIN packet will be sent after a timeout,
* as TCP stack thinks it has already been transmitted.
*/
+ tskb = tail;
if (!tskb && tcp_under_memory_pressure(sk))
tskb = skb_rb_last(&sk->tcp_rtx_queue);
@@ -3123,14 +3209,14 @@ void tcp_send_fin(struct sock *sk)
TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
TCP_SKB_CB(tskb)->end_seq++;
tp->write_seq++;
- if (tcp_write_queue_empty(sk)) {
+ if (!tail) {
/* This means tskb was already sent.
* Pretend we included the FIN on previous transmit.
* We need to set tp->snd_nxt to the value it would have
* if FIN had been sent. This is because retransmit path
* does not change tp->snd_nxt.
*/
- tp->snd_nxt++;
+ WRITE_ONCE(tp->snd_nxt, tp->snd_nxt + 1);
return;
}
} else {
@@ -3207,10 +3293,11 @@ int tcp_send_synack(struct sock *sk)
if (!nskb)
return -ENOMEM;
INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor);
+ tcp_highest_sack_replace(sk, skb, nskb);
tcp_rtx_queue_unlink_and_free(skb, sk);
__skb_header_release(nskb);
tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb);
- sk->sk_wmem_queued += nskb->truesize;
+ sk_wmem_queued_add(sk, nskb->truesize);
sk_mem_charge(sk, nskb->truesize);
skb = nskb;
}
@@ -3278,7 +3365,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
now = tcp_clock_ns();
#ifdef CONFIG_SYN_COOKIES
if (unlikely(req->cookie_ts))
- skb->skb_mstamp_ns = cookie_init_timestamp(req);
+ skb->skb_mstamp_ns = cookie_init_timestamp(req, now);
else
#endif
{
@@ -3343,8 +3430,8 @@ static void tcp_ca_dst_init(struct sock *sk, const struct dst_entry *dst)
rcu_read_lock();
ca = tcp_ca_find_key(ca_key);
- if (likely(ca && try_module_get(ca->owner))) {
- module_put(icsk->icsk_ca_ops->owner);
+ if (likely(ca && bpf_try_module_get(ca, ca->owner))) {
+ bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner);
icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
icsk->icsk_ca_ops = ca;
}
@@ -3414,14 +3501,14 @@ static void tcp_connect_init(struct sock *sk)
tp->snd_una = tp->write_seq;
tp->snd_sml = tp->write_seq;
tp->snd_up = tp->write_seq;
- tp->snd_nxt = tp->write_seq;
+ WRITE_ONCE(tp->snd_nxt, tp->write_seq);
if (likely(!tp->repair))
tp->rcv_nxt = 0;
else
tp->rcv_tstamp = tcp_jiffies32;
tp->rcv_wup = tp->rcv_nxt;
- tp->copied_seq = tp->rcv_nxt;
+ WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
inet_csk(sk)->icsk_rto = tcp_timeout_init(sk);
inet_csk(sk)->icsk_retransmits = 0;
@@ -3435,9 +3522,9 @@ static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
tcb->end_seq += skb->len;
__skb_header_release(skb);
- sk->sk_wmem_queued += skb->truesize;
+ sk_wmem_queued_add(sk, skb->truesize);
sk_mem_charge(sk, skb->truesize);
- tp->write_seq = tcb->end_seq;
+ WRITE_ONCE(tp->write_seq, tcb->end_seq);
tp->packets_out += tcp_skb_pcount(skb);
}
@@ -3574,11 +3661,11 @@ int tcp_connect(struct sock *sk)
/* We change tp->snd_nxt after the tcp_transmit_skb() call
* in order to make this packet get counted in tcpOutSegs.
*/
- tp->snd_nxt = tp->write_seq;
+ WRITE_ONCE(tp->snd_nxt, tp->write_seq);
tp->pushed_seq = tp->write_seq;
buff = tcp_send_head(sk);
if (unlikely(buff)) {
- tp->snd_nxt = TCP_SKB_CB(buff)->seq;
+ WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(buff)->seq);
tp->pushed_seq = TCP_SKB_CB(buff)->seq;
}
TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index c801cd37cc2a..c3f26dcd6704 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -154,7 +154,7 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
} else {
mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1;
mss = min(net->ipv4.sysctl_tcp_base_mss, mss);
- mss = max(mss, 68 - tcp_sk(sk)->tcp_header_len);
+ mss = max(mss, net->ipv4.sysctl_tcp_mtu_probe_floor);
mss = max(mss, net->ipv4.sysctl_tcp_min_snd_mss);
icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
}
@@ -198,8 +198,13 @@ static bool retransmits_timed_out(struct sock *sk,
return false;
start_ts = tcp_sk(sk)->retrans_stamp;
- if (likely(timeout == 0))
- timeout = tcp_model_timeout(sk, boundary, TCP_RTO_MIN);
+ if (likely(timeout == 0)) {
+ unsigned int rto_base = TCP_RTO_MIN;
+
+ if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
+ rto_base = tcp_timeout_init(sk);
+ timeout = tcp_model_timeout(sk, boundary, rto_base);
+ }
return (s32)(tcp_time_stamp(tcp_sk(sk)) - start_ts - timeout) >= 0;
}
@@ -210,7 +215,7 @@ static int tcp_write_timeout(struct sock *sk)
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk);
- bool expired, do_reset;
+ bool expired = false, do_reset;
int retry_until;
if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
@@ -218,6 +223,9 @@ static int tcp_write_timeout(struct sock *sk)
dst_negative_advice(sk);
} else {
sk_rethink_txhash(sk);
+ tp->timeout_rehash++;
+ __NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPTIMEOUTREHASH);
}
retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;
expired = icsk->icsk_retransmits >= retry_until;
@@ -229,6 +237,9 @@ static int tcp_write_timeout(struct sock *sk)
dst_negative_advice(sk);
} else {
sk_rethink_txhash(sk);
+ tp->timeout_rehash++;
+ __NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPTIMEOUTREHASH);
}
retry_until = net->ipv4.sysctl_tcp_retries2;
@@ -242,9 +253,10 @@ static int tcp_write_timeout(struct sock *sk)
if (tcp_out_of_resources(sk, do_reset))
return 1;
}
+ }
+ if (!expired)
expired = retransmits_timed_out(sk, retry_until,
icsk->icsk_user_timeout);
- }
tcp_fastopen_active_detect_blackhole(sk, expired);
if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RTO_CB_FLAG))
@@ -380,15 +392,13 @@ abort: tcp_write_err(sk);
* Timer for Fast Open socket to retransmit SYNACK. Note that the
* sk here is the child socket, not the parent (listener) socket.
*/
-static void tcp_fastopen_synack_timer(struct sock *sk)
+static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req)
{
struct inet_connection_sock *icsk = inet_csk(sk);
int max_retries = icsk->icsk_syn_retries ? :
sock_net(sk)->ipv4.sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */
struct tcp_sock *tp = tcp_sk(sk);
- struct request_sock *req;
- req = tcp_sk(sk)->fastopen_rsk;
req->rsk_ops->syn_ack_timeout(req);
if (req->num_timeout >= max_retries) {
@@ -429,17 +439,26 @@ void tcp_retransmit_timer(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
+ struct request_sock *req;
+ struct sk_buff *skb;
- if (tp->fastopen_rsk) {
+ req = rcu_dereference_protected(tp->fastopen_rsk,
+ lockdep_sock_is_held(sk));
+ if (req) {
WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
sk->sk_state != TCP_FIN_WAIT1);
- tcp_fastopen_synack_timer(sk);
+ tcp_fastopen_synack_timer(sk, req);
/* Before we receive ACK to our SYN-ACK don't retransmit
* anything else (e.g., data or FIN segments).
*/
return;
}
- if (!tp->packets_out || WARN_ON_ONCE(tcp_rtx_queue_empty(sk)))
+
+ if (!tp->packets_out)
+ return;
+
+ skb = tcp_rtx_queue_head(sk);
+ if (WARN_ON_ONCE(!skb))
return;
tp->tlp_high_seq = 0;
@@ -473,7 +492,7 @@ void tcp_retransmit_timer(struct sock *sk)
goto out;
}
tcp_enter_loss(sk);
- tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1);
+ tcp_retransmit_skb(sk, skb, 1);
__sk_dst_reset(sk);
goto out_reset_timer;
}
diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c
index 4849edb62d52..38d3ad141161 100644
--- a/net/ipv4/tcp_ulp.c
+++ b/net/ipv4/tcp_ulp.c
@@ -92,21 +92,26 @@ void tcp_get_available_ulp(char *buf, size_t maxlen)
offs += snprintf(buf + offs, maxlen - offs,
"%s%s",
offs == 0 ? "" : " ", ulp_ops->name);
+
+ if (WARN_ON_ONCE(offs >= maxlen))
+ break;
}
rcu_read_unlock();
}
-void tcp_update_ulp(struct sock *sk, struct proto *proto)
+void tcp_update_ulp(struct sock *sk, struct proto *proto,
+ void (*write_space)(struct sock *sk))
{
struct inet_connection_sock *icsk = inet_csk(sk);
if (!icsk->icsk_ulp_ops) {
+ sk->sk_write_space = write_space;
sk->sk_prot = proto;
return;
}
if (icsk->icsk_ulp_ops->update)
- icsk->icsk_ulp_ops->update(sk, proto);
+ icsk->icsk_ulp_ops->update(sk, proto, write_space);
}
void tcp_cleanup_ulp(struct sock *sk)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index d88821c794fb..db76b9609299 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -388,7 +388,7 @@ static int compute_score(struct sock *sk, struct net *net,
return -1;
score += 4;
- if (sk->sk_incoming_cpu == raw_smp_processor_id())
+ if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id())
score++;
return score;
}
@@ -423,12 +423,13 @@ static struct sock *udp4_lib_lookup2(struct net *net,
score = compute_score(sk, net, saddr, sport,
daddr, hnum, dif, sdif);
if (score > badness) {
- if (sk->sk_reuseport) {
+ if (sk->sk_reuseport &&
+ sk->sk_state != TCP_ESTABLISHED) {
hash = udp_ehashfn(net, daddr, hnum,
saddr, sport);
result = reuseport_select_sock(sk, hash, skb,
sizeof(struct udphdr));
- if (result)
+ if (result && !reuseport_has_conns(sk, false))
return result;
}
badness = score;
@@ -820,6 +821,7 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4,
int is_udplite = IS_UDPLITE(sk);
int offset = skb_transport_offset(skb);
int len = skb->len - offset;
+ int datalen = len - sizeof(*uh);
__wsum csum = 0;
/*
@@ -853,10 +855,12 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4,
return -EIO;
}
- skb_shinfo(skb)->gso_size = cork->gso_size;
- skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
- skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(len - sizeof(uh),
- cork->gso_size);
+ if (datalen > cork->gso_size) {
+ skb_shinfo(skb)->gso_size = cork->gso_size;
+ skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
+ skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(datalen,
+ cork->gso_size);
+ }
goto csum_partial;
}
@@ -1130,7 +1134,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
fl4 = &fl4_stack;
- flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos,
+ flowi4_init_output(fl4, ipc.oif, ipc.sockc.mark, tos,
RT_SCOPE_UNIVERSE, sk->sk_protocol,
flow_flags,
faddr, saddr, dport, inet->inet_sport,
@@ -1293,6 +1297,27 @@ out:
#define UDP_SKB_IS_STATELESS 0x80000000
+/* all head states (dst, sk, nf conntrack) except skb extensions are
+ * cleared by udp_rcv().
+ *
+ * We need to preserve secpath, if present, to eventually process
+ * IP_CMSG_PASSSEC at recvmsg() time.
+ *
+ * Other extensions can be cleared.
+ */
+static bool udp_try_make_stateless(struct sk_buff *skb)
+{
+ if (!skb_has_extensions(skb))
+ return true;
+
+ if (!secpath_exists(skb)) {
+ skb_ext_reset(skb);
+ return true;
+ }
+
+ return false;
+}
+
static void udp_set_dev_scratch(struct sk_buff *skb)
{
struct udp_dev_scratch *scratch = udp_skb_scratch(skb);
@@ -1304,14 +1329,24 @@ static void udp_set_dev_scratch(struct sk_buff *skb)
scratch->csum_unnecessary = !!skb_csum_unnecessary(skb);
scratch->is_linear = !skb_is_nonlinear(skb);
#endif
- /* all head states execept sp (dst, sk, nf) are always cleared by
- * udp_rcv() and we need to preserve secpath, if present, to eventually
- * process IP_CMSG_PASSSEC at recvmsg() time
- */
- if (likely(!skb_sec_path(skb)))
+ if (udp_try_make_stateless(skb))
scratch->_tsize_state |= UDP_SKB_IS_STATELESS;
}
+static void udp_skb_csum_unnecessary_set(struct sk_buff *skb)
+{
+ /* We come here after udp_lib_checksum_complete() returned 0.
+ * This means that __skb_checksum_complete() might have
+ * set skb->csum_valid to 1.
+ * On 64bit platforms, we can set csum_unnecessary
+ * to true, but only if the skb is not shared.
+ */
+#if BITS_PER_LONG == 64
+ if (!skb_shared(skb))
+ udp_skb_scratch(skb)->csum_unnecessary = true;
+#endif
+}
+
static int udp_skb_truesize(struct sk_buff *skb)
{
return udp_skb_scratch(skb)->_tsize_state & ~UDP_SKB_IS_STATELESS;
@@ -1333,7 +1368,8 @@ static void udp_rmem_release(struct sock *sk, int size, int partial,
if (likely(partial)) {
up->forward_deficit += size;
size = up->forward_deficit;
- if (size < (sk->sk_rcvbuf >> 2))
+ if (size < (sk->sk_rcvbuf >> 2) &&
+ !skb_queue_empty(&up->reader_queue))
return;
} else {
size += up->forward_deficit;
@@ -1440,7 +1476,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
* queue contains some other skb
*/
rmem = atomic_add_return(size, &sk->sk_rmem_alloc);
- if (rmem > (size + sk->sk_rcvbuf))
+ if (rmem > (size + (unsigned int)sk->sk_rcvbuf))
goto uncharge_drop;
spin_lock(&list->lock);
@@ -1546,10 +1582,7 @@ static struct sk_buff *__first_packet_length(struct sock *sk,
*total += skb->truesize;
kfree_skb(skb);
} else {
- /* the csum related bits could be changed, refresh
- * the scratch area
- */
- udp_set_dev_scratch(skb);
+ udp_skb_csum_unnecessary_set(skb);
break;
}
}
@@ -1573,7 +1606,7 @@ static int first_packet_length(struct sock *sk)
spin_lock_bh(&rcvq->lock);
skb = __first_packet_length(sk, rcvq, &total);
- if (!skb && !skb_queue_empty(sk_queue)) {
+ if (!skb && !skb_queue_empty_lockless(sk_queue)) {
spin_lock(&sk_queue->lock);
skb_queue_splice_tail_init(sk_queue, rcvq);
spin_unlock(&sk_queue->lock);
@@ -1646,7 +1679,7 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags,
return skb;
}
- if (skb_queue_empty(sk_queue)) {
+ if (skb_queue_empty_lockless(sk_queue)) {
spin_unlock_bh(&queue->lock);
goto busy_check;
}
@@ -1672,11 +1705,12 @@ busy_check:
break;
sk_busy_loop(sk, flags & MSG_DONTWAIT);
- } while (!skb_queue_empty(sk_queue));
+ } while (!skb_queue_empty_lockless(sk_queue));
/* sk_queue is empty, reader_queue may contain peeked packets */
} while (timeo &&
- !__skb_wait_for_more_packets(sk, &error, &timeo,
+ !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
+ &error, &timeo,
(struct sk_buff *)sk_queue));
*err = error;
@@ -1968,7 +2002,7 @@ static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
*/
if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
goto drop;
- nf_reset(skb);
+ nf_reset_ct(skb);
if (static_branch_unlikely(&udp_encap_needed_key) && up->encap_type) {
int (*encap_rcv)(struct sock *sk, struct sk_buff *skb);
@@ -2072,8 +2106,7 @@ static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
BUILD_BUG_ON(sizeof(struct udp_skb_cb) > SKB_SGO_CB_OFFSET);
__skb_push(skb, -skb_mac_offset(skb));
segs = udp_rcv_segment(sk, skb, true);
- for (skb = segs; skb; skb = next) {
- next = skb->next;
+ skb_list_walk_safe(segs, skb, next) {
__skb_pull(skb, skb_transport_offset(skb));
ret = udp_queue_rcv_one_skb(sk, skb);
if (ret > 0)
@@ -2297,7 +2330,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
goto drop;
- nf_reset(skb);
+ nf_reset_ct(skb);
/* No socket. Drop packet silently, if checksum is wrong */
if (udp_lib_checksum_complete(skb))
@@ -2519,9 +2552,11 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
case UDP_ENCAP:
switch (val) {
case 0:
+#ifdef CONFIG_XFRM
case UDP_ENCAP_ESPINUDP:
case UDP_ENCAP_ESPINUDP_NON_IKE:
up->encap_rcv = xfrm4_udp_encap_rcv;
+#endif
/* FALLTHROUGH */
case UDP_ENCAP_L2TPINUDP:
up->encap_type = val;
@@ -2708,7 +2743,7 @@ __poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait)
__poll_t mask = datagram_poll(file, sock, wait);
struct sock *sk = sock->sk;
- if (!skb_queue_empty(&udp_sk(sk)->reader_queue))
+ if (!skb_queue_empty_lockless(&udp_sk(sk)->reader_queue))
mask |= EPOLLIN | EPOLLRDNORM;
/* Check for false positives due to checksum errors */
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index a3908e55ed89..1a98583a79f4 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -184,6 +184,20 @@ out_unlock:
}
EXPORT_SYMBOL(skb_udp_tunnel_segment);
+static struct sk_buff *__udp_gso_segment_list(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ unsigned int mss = skb_shinfo(skb)->gso_size;
+
+ skb = skb_segment_list(skb, features, skb_mac_header_len(skb));
+ if (IS_ERR(skb))
+ return skb;
+
+ udp_hdr(skb)->len = htons(sizeof(struct udphdr) + mss);
+
+ return skb;
+}
+
struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
netdev_features_t features)
{
@@ -196,6 +210,9 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
__sum16 check;
__be16 newlen;
+ if (skb_shinfo(gso_skb)->gso_type & SKB_GSO_FRAGLIST)
+ return __udp_gso_segment_list(gso_skb, features);
+
mss = skb_shinfo(gso_skb)->gso_size;
if (gso_skb->len <= sizeof(*uh) + mss)
return ERR_PTR(-EINVAL);
@@ -354,6 +371,7 @@ static struct sk_buff *udp_gro_receive_segment(struct list_head *head,
struct udphdr *uh2;
struct sk_buff *p;
unsigned int ulen;
+ int ret = 0;
/* requires non zero csum, for symmetry with GSO */
if (!uh->check) {
@@ -369,7 +387,6 @@ static struct sk_buff *udp_gro_receive_segment(struct list_head *head,
}
/* pull encapsulating udp header */
skb_gro_pull(skb, sizeof(struct udphdr));
- skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr));
list_for_each_entry(p, head, list) {
if (!NAPI_GRO_CB(p)->same_flow)
@@ -383,14 +400,40 @@ static struct sk_buff *udp_gro_receive_segment(struct list_head *head,
continue;
}
+ if (NAPI_GRO_CB(skb)->is_flist != NAPI_GRO_CB(p)->is_flist) {
+ NAPI_GRO_CB(skb)->flush = 1;
+ return p;
+ }
+
/* Terminate the flow on len mismatch or if it grow "too much".
* Under small packet flood GRO count could elsewhere grow a lot
* leading to excessive truesize values.
* On len mismatch merge the first packet shorter than gso_size,
* otherwise complete the GRO packet.
*/
- if (ulen > ntohs(uh2->len) || skb_gro_receive(p, skb) ||
- ulen != ntohs(uh2->len) ||
+ if (ulen > ntohs(uh2->len)) {
+ pp = p;
+ } else {
+ if (NAPI_GRO_CB(skb)->is_flist) {
+ if (!pskb_may_pull(skb, skb_gro_offset(skb))) {
+ NAPI_GRO_CB(skb)->flush = 1;
+ return NULL;
+ }
+ if ((skb->ip_summed != p->ip_summed) ||
+ (skb->csum_level != p->csum_level)) {
+ NAPI_GRO_CB(skb)->flush = 1;
+ return NULL;
+ }
+ ret = skb_gro_receive_list(p, skb);
+ } else {
+ skb_gro_postpull_rcsum(skb, uh,
+ sizeof(struct udphdr));
+
+ ret = skb_gro_receive(p, skb);
+ }
+ }
+
+ if (ret || ulen != ntohs(uh2->len) ||
NAPI_GRO_CB(p)->count >= UDP_GRO_CNT_MAX)
pp = p;
@@ -401,36 +444,29 @@ static struct sk_buff *udp_gro_receive_segment(struct list_head *head,
return NULL;
}
-INDIRECT_CALLABLE_DECLARE(struct sock *udp6_lib_lookup_skb(struct sk_buff *skb,
- __be16 sport, __be16 dport));
struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
- struct udphdr *uh, udp_lookup_t lookup)
+ struct udphdr *uh, struct sock *sk)
{
struct sk_buff *pp = NULL;
struct sk_buff *p;
struct udphdr *uh2;
unsigned int off = skb_gro_offset(skb);
int flush = 1;
- struct sock *sk;
- rcu_read_lock();
- sk = INDIRECT_CALL_INET(lookup, udp6_lib_lookup_skb,
- udp4_lib_lookup_skb, skb, uh->source, uh->dest);
- if (!sk)
- goto out_unlock;
+ if (skb->dev->features & NETIF_F_GRO_FRAGLIST)
+ NAPI_GRO_CB(skb)->is_flist = sk ? !udp_sk(sk)->gro_enabled: 1;
- if (udp_sk(sk)->gro_enabled) {
+ if ((sk && udp_sk(sk)->gro_enabled) || NAPI_GRO_CB(skb)->is_flist) {
pp = call_gro_receive(udp_gro_receive_segment, head, skb);
- rcu_read_unlock();
return pp;
}
- if (NAPI_GRO_CB(skb)->encap_mark ||
+ if (!sk || NAPI_GRO_CB(skb)->encap_mark ||
(skb->ip_summed != CHECKSUM_PARTIAL &&
NAPI_GRO_CB(skb)->csum_cnt == 0 &&
!NAPI_GRO_CB(skb)->csum_valid) ||
!udp_sk(sk)->gro_receive)
- goto out_unlock;
+ goto out;
/* mark that this skb passed once through the tunnel gro layer */
NAPI_GRO_CB(skb)->encap_mark = 1;
@@ -457,8 +493,7 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr));
pp = call_gro_receive_sk(udp_sk(sk)->gro_receive, sk, head, skb);
-out_unlock:
- rcu_read_unlock();
+out:
skb_gro_flush_final(skb, pp, flush);
return pp;
}
@@ -468,8 +503,10 @@ INDIRECT_CALLABLE_SCOPE
struct sk_buff *udp4_gro_receive(struct list_head *head, struct sk_buff *skb)
{
struct udphdr *uh = udp_gro_udphdr(skb);
+ struct sk_buff *pp;
+ struct sock *sk;
- if (unlikely(!uh) || !static_branch_unlikely(&udp_encap_needed_key))
+ if (unlikely(!uh))
goto flush;
/* Don't bother verifying checksum if we're going to flush anyway. */
@@ -480,11 +517,15 @@ struct sk_buff *udp4_gro_receive(struct list_head *head, struct sk_buff *skb)
inet_gro_compute_pseudo))
goto flush;
else if (uh->check)
- skb_gro_checksum_try_convert(skb, IPPROTO_UDP, uh->check,
+ skb_gro_checksum_try_convert(skb, IPPROTO_UDP,
inet_gro_compute_pseudo);
skip:
NAPI_GRO_CB(skb)->is_ipv6 = 0;
- return udp_gro_receive(head, skb, uh, udp4_lib_lookup_skb);
+ rcu_read_lock();
+ sk = static_branch_unlikely(&udp_encap_needed_key) ? udp4_lib_lookup_skb(skb, uh->source, uh->dest) : NULL;
+ pp = udp_gro_receive(head, skb, uh, sk);
+ rcu_read_unlock();
+ return pp;
flush:
NAPI_GRO_CB(skb)->flush = 1;
@@ -517,9 +558,7 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff,
rcu_read_lock();
sk = INDIRECT_CALL_INET(lookup, udp6_lib_lookup_skb,
udp4_lib_lookup_skb, skb, uh->source, uh->dest);
- if (sk && udp_sk(sk)->gro_enabled) {
- err = udp_gro_complete_segment(skb);
- } else if (sk && udp_sk(sk)->gro_complete) {
+ if (sk && udp_sk(sk)->gro_complete) {
skb_shinfo(skb)->gso_type = uh->check ? SKB_GSO_UDP_TUNNEL_CSUM
: SKB_GSO_UDP_TUNNEL;
@@ -529,6 +568,8 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff,
skb->encapsulation = 1;
err = udp_sk(sk)->gro_complete(sk, skb,
nhoff + sizeof(struct udphdr));
+ } else {
+ err = udp_gro_complete_segment(skb);
}
rcu_read_unlock();
@@ -544,6 +585,23 @@ INDIRECT_CALLABLE_SCOPE int udp4_gro_complete(struct sk_buff *skb, int nhoff)
const struct iphdr *iph = ip_hdr(skb);
struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
+ if (NAPI_GRO_CB(skb)->is_flist) {
+ uh->len = htons(skb->len - nhoff);
+
+ skb_shinfo(skb)->gso_type |= (SKB_GSO_FRAGLIST|SKB_GSO_UDP_L4);
+ skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
+
+ if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
+ if (skb->csum_level < SKB_MAX_CSUM_LEVEL)
+ skb->csum_level++;
+ } else {
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ skb->csum_level = 0;
+ }
+
+ return 0;
+ }
+
if (uh->check)
uh->check = ~udp_v4_check(skb->len - nhoff, iph->saddr,
iph->daddr, 0);
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index ecff3fce9807..89ba7c87de5d 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -92,7 +92,7 @@ static int __xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb)
int xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
- net, sk, skb, NULL, skb_dst(skb)->dev,
+ net, sk, skb, skb->dev, skb_dst(skb)->dev,
__xfrm4_output,
!(IPCB(skb)->flags & IPSKB_REROUTED));
}
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index cdef8f9a3b01..9ebd54752e03 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -85,6 +85,7 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST |
RTCF_LOCAL);
xdst->u.rt.rt_type = rt->rt_type;
+ xdst->u.rt.rt_uses_gateway = rt->rt_uses_gateway;
xdst->u.rt.rt_gw_family = rt->rt_gw_family;
if (rt->rt_gw_family == AF_INET)
xdst->u.rt.rt_gw4 = rt->rt_gw4;
@@ -99,12 +100,13 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
}
static void xfrm4_update_pmtu(struct dst_entry *dst, struct sock *sk,
- struct sk_buff *skb, u32 mtu)
+ struct sk_buff *skb, u32 mtu,
+ bool confirm_neigh)
{
struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
struct dst_entry *path = xdst->route;
- path->ops->update_pmtu(path, sk, skb, mtu);
+ path->ops->update_pmtu(path, sk, skb, mtu, confirm_neigh);
}
static void xfrm4_redirect(struct dst_entry *dst, struct sock *sk,
diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c
index 8a4285712808..ea595c8549c7 100644
--- a/net/ipv4/xfrm4_protocol.c
+++ b/net/ipv4/xfrm4_protocol.c
@@ -72,6 +72,14 @@ int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi,
if (!head)
goto out;
+ if (!skb_dst(skb)) {
+ const struct iphdr *iph = ip_hdr(skb);
+
+ if (ip_route_input_noref(skb, iph->daddr, iph->saddr,
+ iph->tos, skb->dev))
+ goto drop;
+ }
+
for_each_protocol_rcu(*head, handler)
if ((ret = handler->input_handler(skb, nexthdr, spi, encap_type)) != -EINVAL)
return ret;
@@ -79,6 +87,7 @@ int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi,
out:
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+drop:
kfree_skb(skb);
return 0;
}
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index dc73888c7859..cb493e15959c 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -478,7 +478,7 @@ static struct inet6_dev *ipv6_find_idev(struct net_device *dev)
if (!idev) {
idev = ipv6_add_dev(dev);
if (IS_ERR(idev))
- return NULL;
+ return idev;
}
if (dev->flags&IFF_UP)
@@ -1045,7 +1045,8 @@ ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config *cfg,
int err = 0;
if (addr_type == IPV6_ADDR_ANY ||
- addr_type & IPV6_ADDR_MULTICAST ||
+ (addr_type & IPV6_ADDR_MULTICAST &&
+ !(cfg->ifa_flags & IFA_F_MCAUTOJOIN)) ||
(!(idev->dev->flags & IFF_LOOPBACK) &&
!netif_is_l3_master(idev->dev) &&
addr_type & IPV6_ADDR_LOOPBACK))
@@ -2465,8 +2466,8 @@ static struct inet6_dev *addrconf_add_dev(struct net_device *dev)
ASSERT_RTNL();
idev = ipv6_find_idev(dev);
- if (!idev)
- return ERR_PTR(-ENOBUFS);
+ if (IS_ERR(idev))
+ return idev;
if (idev->cnf.disable_ipv6)
return ERR_PTR(-EACCES);
@@ -3158,7 +3159,7 @@ static void init_loopback(struct net_device *dev)
ASSERT_RTNL();
idev = ipv6_find_idev(dev);
- if (!idev) {
+ if (IS_ERR(idev)) {
pr_debug("%s: add_dev failed\n", __func__);
return;
}
@@ -3373,7 +3374,7 @@ static void addrconf_sit_config(struct net_device *dev)
*/
idev = ipv6_find_idev(dev);
- if (!idev) {
+ if (IS_ERR(idev)) {
pr_debug("%s: add_dev failed\n", __func__);
return;
}
@@ -3398,7 +3399,7 @@ static void addrconf_gre_config(struct net_device *dev)
ASSERT_RTNL();
idev = ipv6_find_idev(dev);
- if (!idev) {
+ if (IS_ERR(idev)) {
pr_debug("%s: add_dev failed\n", __func__);
return;
}
@@ -4772,8 +4773,8 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
IFA_F_MCAUTOJOIN | IFA_F_OPTIMISTIC;
idev = ipv6_find_idev(dev);
- if (!idev)
- return -ENOBUFS;
+ if (IS_ERR(idev))
+ return PTR_ERR(idev);
if (!ipv6_allow_optimistic_dad(net, idev))
cfg.ifa_flags &= ~IFA_F_OPTIMISTIC;
@@ -5230,16 +5231,16 @@ static int inet6_rtm_valid_getaddr_req(struct sk_buff *skb,
return -EINVAL;
}
+ if (!netlink_strict_get_check(skb))
+ return nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX,
+ ifa_ipv6_policy, extack);
+
ifm = nlmsg_data(nlh);
if (ifm->ifa_prefixlen || ifm->ifa_flags || ifm->ifa_scope) {
NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get address request");
return -EINVAL;
}
- if (!netlink_strict_get_check(skb))
- return nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX,
- ifa_ipv6_policy, extack);
-
err = nlmsg_parse_deprecated_strict(nlh, sizeof(*ifm), tb, IFA_MAX,
ifa_ipv6_policy, extack);
if (err)
@@ -5551,14 +5552,13 @@ static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev,
nla = nla_reserve(skb, IFLA_INET6_TOKEN, sizeof(struct in6_addr));
if (!nla)
goto nla_put_failure;
-
- if (nla_put_u8(skb, IFLA_INET6_ADDR_GEN_MODE, idev->cnf.addr_gen_mode))
- goto nla_put_failure;
-
read_lock_bh(&idev->lock);
memcpy(nla_data(nla), idev->token.s6_addr, nla_len(nla));
read_unlock_bh(&idev->lock);
+ if (nla_put_u8(skb, IFLA_INET6_ADDR_GEN_MODE, idev->cnf.addr_gen_mode))
+ goto nla_put_failure;
+
return 0;
nla_put_failure:
@@ -5718,6 +5718,9 @@ static int inet6_set_link_af(struct net_device *dev, const struct nlattr *nla)
struct nlattr *tb[IFLA_INET6_MAX + 1];
int err;
+ if (!idev)
+ return -EAFNOSUPPORT;
+
if (nla_parse_nested_deprecated(tb, IFLA_INET6_MAX, nla, NULL, NULL) < 0)
BUG();
@@ -5963,13 +5966,20 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
switch (event) {
case RTM_NEWADDR:
/*
- * If the address was optimistic
- * we inserted the route at the start of
- * our DAD process, so we don't need
- * to do it again
+ * If the address was optimistic we inserted the route at the
+ * start of our DAD process, so we don't need to do it again.
+ * If the device was taken down in the middle of the DAD
+ * cycle there is a race where we could get here without a
+ * host route, so nothing to insert. That will be fixed when
+ * the device is brought up.
*/
- if (!rcu_access_pointer(ifp->rt->fib6_node))
+ if (ifp->rt && !rcu_access_pointer(ifp->rt->fib6_node)) {
ip6_ins_rt(net, ifp->rt);
+ } else if (!ifp->rt && (ifp->idev->dev->flags & IFF_UP)) {
+ pr_warn("BUG: Address %pI6c on device %s is missing its host route.\n",
+ &ifp->addr, ifp->idev->dev->name);
+ }
+
if (ifp->idev->cnf.forwarding)
addrconf_join_anycast(ifp);
if (!ipv6_addr_any(&ifp->peer_addr))
diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c
index 783f3c1466da..ea00ce3d4117 100644
--- a/net/ipv6/addrconf_core.c
+++ b/net/ipv6/addrconf_core.c
@@ -7,6 +7,7 @@
#include <linux/export.h>
#include <net/ipv6.h>
#include <net/ipv6_stubs.h>
+#include <net/addrconf.h>
#include <net/ip.h>
/* if ipv6 module registers this function is used by xfrm to force all
@@ -128,11 +129,12 @@ int inet6addr_validator_notifier_call_chain(unsigned long val, void *v)
}
EXPORT_SYMBOL(inet6addr_validator_notifier_call_chain);
-static int eafnosupport_ipv6_dst_lookup(struct net *net, struct sock *u1,
- struct dst_entry **u2,
- struct flowi6 *u3)
+static struct dst_entry *eafnosupport_ipv6_dst_lookup_flow(struct net *net,
+ const struct sock *sk,
+ struct flowi6 *fl6,
+ const struct in6_addr *final_dst)
{
- return -EAFNOSUPPORT;
+ return ERR_PTR(-EAFNOSUPPORT);
}
static int eafnosupport_ipv6_route_input(struct sk_buff *skb)
@@ -189,7 +191,7 @@ static int eafnosupport_ip6_del_rt(struct net *net, struct fib6_info *rt)
}
const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) {
- .ipv6_dst_lookup = eafnosupport_ipv6_dst_lookup,
+ .ipv6_dst_lookup_flow = eafnosupport_ipv6_dst_lookup_flow,
.ipv6_route_input = eafnosupport_ipv6_route_input,
.fib6_get_table = eafnosupport_fib6_get_table,
.fib6_table_lookup = eafnosupport_fib6_table_lookup,
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index ef37e0574f54..d727c3b41495 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -292,7 +292,7 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
return -EINVAL;
snum = ntohs(addr->sin6_port);
- if (snum && snum < inet_prot_sock(net) &&
+ if (snum && inet_port_requires_bind_service(net, snum) &&
!ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
return -EACCES;
@@ -765,7 +765,7 @@ int inet6_sk_rebuild_header(struct sock *sk)
&final);
rcu_read_unlock();
- dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
+ dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p);
if (IS_ERR(dst)) {
sk->sk_route_caps = 0;
sk->sk_err_soft = -PTR_ERR(dst);
@@ -946,7 +946,7 @@ static int ipv6_route_input(struct sk_buff *skb)
static const struct ipv6_stub ipv6_stub_impl = {
.ipv6_sock_mc_join = ipv6_sock_mc_join,
.ipv6_sock_mc_drop = ipv6_sock_mc_drop,
- .ipv6_dst_lookup = ip6_dst_lookup,
+ .ipv6_dst_lookup_flow = ip6_dst_lookup_flow,
.ipv6_route_input = ipv6_route_input,
.fib6_get_table = fib6_get_table,
.fib6_table_lookup = fib6_table_lookup,
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 9ab897ded4df..390bedde21a5 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -27,6 +27,7 @@
#include <net/ip6_route.h>
#include <net/tcp_states.h>
#include <net/dsfield.h>
+#include <net/sock_reuseport.h>
#include <linux/errqueue.h>
#include <linux/uaccess.h>
@@ -84,7 +85,7 @@ int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr)
final_p = fl6_update_dst(&fl6, opt, &final);
rcu_read_unlock();
- dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
+ dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p);
if (IS_ERR(dst)) {
err = PTR_ERR(dst);
goto out;
@@ -254,6 +255,7 @@ ipv4_connected:
goto out;
}
+ reuseport_has_conns(sk, true);
sk->sk_state = TCP_ESTABLISHED;
sk_set_txhash(sk);
out:
diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c
index e31626ffccd1..fd535053245b 100644
--- a/net/ipv6/esp6_offload.c
+++ b/net/ipv6/esp6_offload.c
@@ -79,6 +79,8 @@ static struct sk_buff *esp6_gro_receive(struct list_head *head,
if (!x)
goto out_reset;
+ skb->mark = xfrm_smark_get(skb->mark, x);
+
sp->xvec[sp->len++] = x;
sp->olen++;
diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c
index b358f1a4dd08..da46c4284676 100644
--- a/net/ipv6/exthdrs_core.c
+++ b/net/ipv6/exthdrs_core.c
@@ -197,10 +197,8 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset,
struct ipv6hdr _ip6, *ip6;
ip6 = skb_header_pointer(skb, *offset, sizeof(_ip6), &_ip6);
- if (!ip6 || (ip6->version != 6)) {
- printk(KERN_ERR "IPv6 header not found\n");
+ if (!ip6 || (ip6->version != 6))
return -EBADMSG;
- }
start = *offset + sizeof(struct ipv6hdr);
nexthdr = ip6->nexthdr;
}
diff --git a/net/ipv6/fib6_notifier.c b/net/ipv6/fib6_notifier.c
index 05f82baaa99e..f87ae33e1d01 100644
--- a/net/ipv6/fib6_notifier.c
+++ b/net/ipv6/fib6_notifier.c
@@ -7,12 +7,12 @@
#include <net/netns/ipv6.h>
#include <net/ip6_fib.h>
-int call_fib6_notifier(struct notifier_block *nb, struct net *net,
+int call_fib6_notifier(struct notifier_block *nb,
enum fib_event_type event_type,
struct fib_notifier_info *info)
{
info->family = AF_INET6;
- return call_fib_notifier(nb, net, event_type, info);
+ return call_fib_notifier(nb, event_type, info);
}
int call_fib6_notifiers(struct net *net, enum fib_event_type event_type,
@@ -27,15 +27,16 @@ static unsigned int fib6_seq_read(struct net *net)
return fib6_tables_seq_read(net) + fib6_rules_seq_read(net);
}
-static int fib6_dump(struct net *net, struct notifier_block *nb)
+static int fib6_dump(struct net *net, struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
{
int err;
- err = fib6_rules_dump(net, nb);
+ err = fib6_rules_dump(net, nb, extack);
if (err)
return err;
- return fib6_tables_dump(net, nb);
+ return fib6_tables_dump(net, nb, extack);
}
static const struct fib_notifier_ops fib6_notifier_ops_template = {
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index d22b6c140f23..fafe556d21e0 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -47,9 +47,10 @@ bool fib6_rule_default(const struct fib_rule *rule)
}
EXPORT_SYMBOL_GPL(fib6_rule_default);
-int fib6_rules_dump(struct net *net, struct notifier_block *nb)
+int fib6_rules_dump(struct net *net, struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
{
- return fib_rules_dump(net, nb, AF_INET6);
+ return fib_rules_dump(net, nb, AF_INET6, extack);
}
unsigned int fib6_rules_seq_read(struct net *net)
@@ -287,7 +288,8 @@ static bool fib6_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg
return false;
suppress_route:
- ip6_rt_put(rt);
+ if (!(arg->flags & FIB_LOOKUP_NOREF))
+ ip6_rt_put(rt);
return true;
}
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 62c997201970..ef408a5090a2 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -516,13 +516,29 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
mip6_addr_swap(skb);
+ sk = icmpv6_xmit_lock(net);
+ if (!sk)
+ goto out_bh_enable;
+
memset(&fl6, 0, sizeof(fl6));
fl6.flowi6_proto = IPPROTO_ICMPV6;
fl6.daddr = hdr->saddr;
if (force_saddr)
saddr = force_saddr;
- if (saddr)
+ if (saddr) {
fl6.saddr = *saddr;
+ } else {
+ /* select a more meaningful saddr from input if */
+ struct net_device *in_netdev;
+
+ in_netdev = dev_get_by_index(net, IP6CB(skb)->iif);
+ if (in_netdev) {
+ ipv6_dev_get_saddr(net, in_netdev, &fl6.daddr,
+ inet6_sk(sk)->srcprefs,
+ &fl6.saddr);
+ dev_put(in_netdev);
+ }
+ }
fl6.flowi6_mark = mark;
fl6.flowi6_oif = iif;
fl6.fl6_icmp_type = type;
@@ -531,10 +547,6 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, NULL);
security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
- sk = icmpv6_xmit_lock(net);
- if (!sk)
- goto out_bh_enable;
-
sk->sk_mark = mark;
np = inet6_sk(sk);
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index 4da24aa6c696..e315526fa244 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -48,7 +48,7 @@ struct dst_entry *inet6_csk_route_req(const struct sock *sk,
fl6->flowi6_uid = sk->sk_uid;
security_req_classify_flow(req, flowi6_to_flowi(fl6));
- dst = ip6_dst_lookup_flow(sk, fl6, final_p);
+ dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p);
if (IS_ERR(dst))
return NULL;
@@ -103,7 +103,7 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk,
dst = __inet6_csk_dst_check(sk, np->dst_cookie);
if (!dst) {
- dst = ip6_dst_lookup_flow(sk, fl6, final_p);
+ dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p);
if (!IS_ERR(dst))
ip6_dst_store(sk, dst, NULL, NULL);
@@ -133,7 +133,7 @@ int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl_unused
fl6.daddr = sk->sk_v6_daddr;
res = ip6_xmit(sk, skb, &fl6, sk->sk_mark, rcu_dereference(np->opt),
- np->tclass);
+ np->tclass, sk->sk_priority);
rcu_read_unlock();
return res;
}
@@ -146,7 +146,7 @@ struct dst_entry *inet6_csk_update_pmtu(struct sock *sk, u32 mtu)
if (IS_ERR(dst))
return NULL;
- dst->ops->update_pmtu(dst, sk, NULL, mtu);
+ dst->ops->update_pmtu(dst, sk, NULL, mtu, true);
dst = inet6_csk_route_socket(sk, &fl6);
return IS_ERR(dst) ? NULL : dst;
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index cf60fae9533b..fbe9d4295eac 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -105,7 +105,7 @@ static inline int compute_score(struct sock *sk, struct net *net,
return -1;
score = 1;
- if (sk->sk_incoming_cpu == raw_smp_processor_id())
+ if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id())
score++;
}
return score;
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 87f47bc55c5e..58fbde244381 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -318,7 +318,7 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
if (rt->dst.error == -EAGAIN) {
ip6_rt_put_flags(rt, flags);
rt = net->ipv6.ip6_null_entry;
- if (!(flags | RT6_LOOKUP_F_DST_NOREF))
+ if (!(flags & RT6_LOOKUP_F_DST_NOREF))
dst_hold(&rt->dst);
}
@@ -357,15 +357,32 @@ unsigned int fib6_tables_seq_read(struct net *net)
return fib_seq;
}
-static int call_fib6_entry_notifier(struct notifier_block *nb, struct net *net,
+static int call_fib6_entry_notifier(struct notifier_block *nb,
enum fib_event_type event_type,
- struct fib6_info *rt)
+ struct fib6_info *rt,
+ struct netlink_ext_ack *extack)
+{
+ struct fib6_entry_notifier_info info = {
+ .info.extack = extack,
+ .rt = rt,
+ };
+
+ return call_fib6_notifier(nb, event_type, &info.info);
+}
+
+static int call_fib6_multipath_entry_notifier(struct notifier_block *nb,
+ enum fib_event_type event_type,
+ struct fib6_info *rt,
+ unsigned int nsiblings,
+ struct netlink_ext_ack *extack)
{
struct fib6_entry_notifier_info info = {
+ .info.extack = extack,
.rt = rt,
+ .nsiblings = nsiblings,
};
- return call_fib6_notifier(nb, net, event_type, &info.info);
+ return call_fib6_notifier(nb, event_type, &info.info);
}
int call_fib6_entry_notifiers(struct net *net,
@@ -398,43 +415,72 @@ int call_fib6_multipath_entry_notifiers(struct net *net,
return call_fib6_notifiers(net, event_type, &info.info);
}
+int call_fib6_entry_notifiers_replace(struct net *net, struct fib6_info *rt)
+{
+ struct fib6_entry_notifier_info info = {
+ .rt = rt,
+ .nsiblings = rt->fib6_nsiblings,
+ };
+
+ rt->fib6_table->fib_seq++;
+ return call_fib6_notifiers(net, FIB_EVENT_ENTRY_REPLACE, &info.info);
+}
+
struct fib6_dump_arg {
struct net *net;
struct notifier_block *nb;
+ struct netlink_ext_ack *extack;
};
-static void fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg)
+static int fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg)
{
- if (rt == arg->net->ipv6.fib6_null_entry)
- return;
- call_fib6_entry_notifier(arg->nb, arg->net, FIB_EVENT_ENTRY_ADD, rt);
+ enum fib_event_type fib_event = FIB_EVENT_ENTRY_REPLACE;
+ int err;
+
+ if (!rt || rt == arg->net->ipv6.fib6_null_entry)
+ return 0;
+
+ if (rt->fib6_nsiblings)
+ err = call_fib6_multipath_entry_notifier(arg->nb, fib_event,
+ rt,
+ rt->fib6_nsiblings,
+ arg->extack);
+ else
+ err = call_fib6_entry_notifier(arg->nb, fib_event, rt,
+ arg->extack);
+
+ return err;
}
static int fib6_node_dump(struct fib6_walker *w)
{
- struct fib6_info *rt;
+ int err;
- for_each_fib6_walker_rt(w)
- fib6_rt_dump(rt, w->args);
+ err = fib6_rt_dump(w->leaf, w->args);
w->leaf = NULL;
- return 0;
+ return err;
}
-static void fib6_table_dump(struct net *net, struct fib6_table *tb,
- struct fib6_walker *w)
+static int fib6_table_dump(struct net *net, struct fib6_table *tb,
+ struct fib6_walker *w)
{
+ int err;
+
w->root = &tb->tb6_root;
spin_lock_bh(&tb->tb6_lock);
- fib6_walk(net, w);
+ err = fib6_walk(net, w);
spin_unlock_bh(&tb->tb6_lock);
+ return err;
}
/* Called with rcu_read_lock() */
-int fib6_tables_dump(struct net *net, struct notifier_block *nb)
+int fib6_tables_dump(struct net *net, struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
{
struct fib6_dump_arg arg;
struct fib6_walker *w;
unsigned int h;
+ int err = 0;
w = kzalloc(sizeof(*w), GFP_ATOMIC);
if (!w)
@@ -443,19 +489,24 @@ int fib6_tables_dump(struct net *net, struct notifier_block *nb)
w->func = fib6_node_dump;
arg.net = net;
arg.nb = nb;
+ arg.extack = extack;
w->args = &arg;
for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
struct hlist_head *head = &net->ipv6.fib_table_hash[h];
struct fib6_table *tb;
- hlist_for_each_entry_rcu(tb, head, tb6_hlist)
- fib6_table_dump(net, tb, w);
+ hlist_for_each_entry_rcu(tb, head, tb6_hlist) {
+ err = fib6_table_dump(net, tb, w);
+ if (err < 0)
+ goto out;
+ }
}
+out:
kfree(w);
- return 0;
+ return err;
}
static int fib6_dump_node(struct fib6_walker *w)
@@ -1021,6 +1072,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
(info->nlh->nlmsg_flags & NLM_F_CREATE));
int found = 0;
bool rt_can_ecmp = rt6_qualify_for_ecmp(rt);
+ bool notify_sibling_rt = false;
u16 nlflags = NLM_F_EXCL;
int err;
@@ -1112,6 +1164,7 @@ next_iter:
/* Find the first route that have the same metric */
sibling = leaf;
+ notify_sibling_rt = true;
while (sibling) {
if (sibling->fib6_metric == rt->fib6_metric &&
rt6_qualify_for_ecmp(sibling)) {
@@ -1121,6 +1174,7 @@ next_iter:
}
sibling = rcu_dereference_protected(sibling->fib6_next,
lockdep_is_held(&rt->fib6_table->tb6_lock));
+ notify_sibling_rt = false;
}
/* For each sibling in the list, increment the counter of
* siblings. BUG() if counters does not match, list of siblings
@@ -1147,10 +1201,21 @@ next_iter:
add:
nlflags |= NLM_F_CREATE;
- if (!info->skip_notify_kernel) {
+ /* The route should only be notified if it is the first
+ * route in the node or if it is added as a sibling
+ * route to the first route in the node.
+ */
+ if (!info->skip_notify_kernel &&
+ (notify_sibling_rt || ins == &fn->leaf)) {
+ enum fib_event_type fib_event;
+
+ if (notify_sibling_rt)
+ fib_event = FIB_EVENT_ENTRY_APPEND;
+ else
+ fib_event = FIB_EVENT_ENTRY_REPLACE;
err = call_fib6_entry_notifiers(info->nl_net,
- FIB_EVENT_ENTRY_ADD,
- rt, extack);
+ fib_event, rt,
+ extack);
if (err) {
struct fib6_info *sibling, *next_sibling;
@@ -1194,7 +1259,7 @@ add:
return -ENOENT;
}
- if (!info->skip_notify_kernel) {
+ if (!info->skip_notify_kernel && ins == &fn->leaf) {
err = call_fib6_entry_notifiers(info->nl_net,
FIB_EVENT_ENTRY_REPLACE,
rt, extack);
@@ -1443,6 +1508,8 @@ out:
}
#endif
goto failure;
+ } else if (fib6_requires_src(rt)) {
+ fib6_routes_require_src_inc(info->nl_net);
}
return err;
@@ -1825,13 +1892,29 @@ static struct fib6_node *fib6_repair_tree(struct net *net,
static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
struct fib6_info __rcu **rtp, struct nl_info *info)
{
+ struct fib6_info *leaf, *replace_rt = NULL;
struct fib6_walker *w;
struct fib6_info *rt = rcu_dereference_protected(*rtp,
lockdep_is_held(&table->tb6_lock));
struct net *net = info->nl_net;
+ bool notify_del = false;
RT6_TRACE("fib6_del_route\n");
+ /* If the deleted route is the first in the node and it is not part of
+ * a multipath route, then we need to replace it with the next route
+ * in the node, if exists.
+ */
+ leaf = rcu_dereference_protected(fn->leaf,
+ lockdep_is_held(&table->tb6_lock));
+ if (leaf == rt && !rt->fib6_nsiblings) {
+ if (rcu_access_pointer(rt->fib6_next))
+ replace_rt = rcu_dereference_protected(rt->fib6_next,
+ lockdep_is_held(&table->tb6_lock));
+ else
+ notify_del = true;
+ }
+
/* Unlink it */
*rtp = rt->fib6_next;
rt->fib6_node = NULL;
@@ -1849,6 +1932,14 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
if (rt->fib6_nsiblings) {
struct fib6_info *sibling, *next_sibling;
+ /* The route is deleted from a multipath route. If this
+ * multipath route is the first route in the node, then we need
+ * to emit a delete notification. Otherwise, we need to skip
+ * the notification.
+ */
+ if (rt->fib6_metric == leaf->fib6_metric &&
+ rt6_qualify_for_ecmp(leaf))
+ notify_del = true;
list_for_each_entry_safe(sibling, next_sibling,
&rt->fib6_siblings, fib6_siblings)
sibling->fib6_nsiblings--;
@@ -1884,8 +1975,13 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
fib6_purge_rt(rt, fn, net);
- if (!info->skip_notify_kernel)
- call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL);
+ if (!info->skip_notify_kernel) {
+ if (notify_del)
+ call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL,
+ rt, NULL);
+ else if (replace_rt)
+ call_fib6_entry_notifiers_replace(net, replace_rt);
+ }
if (!info->skip_notify)
inet6_rt_notify(RTM_DELROUTE, rt, info, 0);
@@ -1915,6 +2011,8 @@ int fib6_del(struct fib6_info *rt, struct nl_info *info)
struct fib6_info *cur = rcu_dereference_protected(*rtp,
lockdep_is_held(&table->tb6_lock));
if (rt == cur) {
+ if (fib6_requires_src(cur))
+ fib6_routes_require_src_dec(info->nl_net);
fib6_del_route(table, fn, rtp, info);
return 0;
}
@@ -2473,14 +2571,13 @@ static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
struct net *net = seq_file_net(seq);
struct ipv6_route_iter *iter = seq->private;
+ ++(*pos);
if (!v)
goto iter_table;
n = rcu_dereference_bh(((struct fib6_info *)v)->fib6_next);
- if (n) {
- ++*pos;
+ if (n)
return n;
- }
iter_table:
ipv6_route_check_sernum(iter);
@@ -2488,8 +2585,6 @@ iter_table:
r = fib6_walk_continue(&iter->w);
spin_unlock_bh(&iter->tbl->tb6_lock);
if (r > 0) {
- if (v)
- ++*pos;
return iter->w.leaf;
} else if (r < 0) {
fib6_walker_unlink(net, &iter->w);
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index dd2d0b963260..55bfc5149d0c 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -968,7 +968,7 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
if (unlikely(!tun_info ||
!(tun_info->mode & IP_TUNNEL_INFO_TX) ||
ip_tunnel_info_af(tun_info) != AF_INET6))
- return -EINVAL;
+ goto tx_err;
key = &tun_info->key;
memset(&fl6, 0, sizeof(fl6));
@@ -980,9 +980,9 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
dsfield = key->tos;
if (!(tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT))
goto tx_err;
- md = ip_tunnel_info_opts(tun_info);
- if (!md)
+ if (tun_info->options_len < sizeof(*md))
goto tx_err;
+ md = ip_tunnel_info_opts(tun_info);
tun_id = tunnel_id_to_key32(key->tun_id);
if (md->version == 1) {
@@ -1040,7 +1040,7 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
/* TooBig packet may have updated dst->dev's mtu */
if (!t->parms.collect_md && dst && dst_mtu(dst) > dst->dev->mtu)
- dst->ops->update_pmtu(dst, NULL, skb, dst->dev->mtu);
+ dst->ops->update_pmtu(dst, NULL, skb, dst->dev->mtu, false);
err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu,
NEXTHDR_GRE);
@@ -1466,7 +1466,6 @@ static int ip6gre_tunnel_init_common(struct net_device *dev)
dev->mtu -= 8;
if (tunnel->parms.collect_md) {
- dev->features |= NETIF_F_NETNS_LOCAL;
netif_keep_dst(dev);
}
ip6gre_tnl_init_features(dev);
@@ -1894,7 +1893,6 @@ static void ip6gre_tap_setup(struct net_device *dev)
dev->needs_free_netdev = true;
dev->priv_destructor = ip6gre_dev_free;
- dev->features |= NETIF_F_NETNS_LOCAL;
dev->priv_flags &= ~IFF_TX_SKB_SHARING;
dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
netif_keep_dst(dev);
@@ -2170,8 +2168,8 @@ static const struct nla_policy ip6gre_policy[IFLA_GRE_MAX + 1] = {
[IFLA_GRE_OFLAGS] = { .type = NLA_U16 },
[IFLA_GRE_IKEY] = { .type = NLA_U32 },
[IFLA_GRE_OKEY] = { .type = NLA_U32 },
- [IFLA_GRE_LOCAL] = { .len = FIELD_SIZEOF(struct ipv6hdr, saddr) },
- [IFLA_GRE_REMOTE] = { .len = FIELD_SIZEOF(struct ipv6hdr, daddr) },
+ [IFLA_GRE_LOCAL] = { .len = sizeof_field(struct ipv6hdr, saddr) },
+ [IFLA_GRE_REMOTE] = { .len = sizeof_field(struct ipv6hdr, daddr) },
[IFLA_GRE_TTL] = { .type = NLA_U8 },
[IFLA_GRE_ENCAP_LIMIT] = { .type = NLA_U8 },
[IFLA_GRE_FLOWINFO] = { .type = NLA_U32 },
@@ -2192,11 +2190,11 @@ static void ip6erspan_tap_setup(struct net_device *dev)
{
ether_setup(dev);
+ dev->max_mtu = 0;
dev->netdev_ops = &ip6erspan_netdev_ops;
dev->needs_free_netdev = true;
dev->priv_destructor = ip6gre_dev_free;
- dev->features |= NETIF_F_NETNS_LOCAL;
dev->priv_flags &= ~IFF_TX_SKB_SHARING;
dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
netif_keep_dst(dev);
diff --git a/net/ipv6/ip6_icmp.c b/net/ipv6/ip6_icmp.c
index 02045494c24c..e0086758b6ee 100644
--- a/net/ipv6/ip6_icmp.c
+++ b/net/ipv6/ip6_icmp.c
@@ -45,4 +45,38 @@ out:
rcu_read_unlock();
}
EXPORT_SYMBOL(icmpv6_send);
+
+#if IS_ENABLED(CONFIG_NF_NAT)
+#include <net/netfilter/nf_conntrack.h>
+void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info)
+{
+ struct sk_buff *cloned_skb = NULL;
+ enum ip_conntrack_info ctinfo;
+ struct in6_addr orig_ip;
+ struct nf_conn *ct;
+
+ ct = nf_ct_get(skb_in, &ctinfo);
+ if (!ct || !(ct->status & IPS_SRC_NAT)) {
+ icmpv6_send(skb_in, type, code, info);
+ return;
+ }
+
+ if (skb_shared(skb_in))
+ skb_in = cloned_skb = skb_clone(skb_in, GFP_ATOMIC);
+
+ if (unlikely(!skb_in || skb_network_header(skb_in) < skb_in->head ||
+ (skb_network_header(skb_in) + sizeof(struct ipv6hdr)) >
+ skb_tail_pointer(skb_in) || skb_ensure_writable(skb_in,
+ skb_network_offset(skb_in) + sizeof(struct ipv6hdr))))
+ goto out;
+
+ orig_ip = ipv6_hdr(skb_in)->saddr;
+ ipv6_hdr(skb_in)->saddr = ct->tuplehash[0].tuple.src.u3.in6;
+ icmpv6_send(skb_in, type, code, info);
+ ipv6_hdr(skb_in)->saddr = orig_ip;
+out:
+ consume_skb(cloned_skb);
+}
+EXPORT_SYMBOL(icmpv6_ndo_send);
+#endif
#endif
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index fa014d5f1732..7b089d0ac8cd 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -80,15 +80,33 @@ static void ip6_sublist_rcv_finish(struct list_head *head)
{
struct sk_buff *skb, *next;
- list_for_each_entry_safe(skb, next, head, list)
+ list_for_each_entry_safe(skb, next, head, list) {
+ skb_list_del_init(skb);
dst_input(skb);
+ }
+}
+
+static bool ip6_can_use_hint(const struct sk_buff *skb,
+ const struct sk_buff *hint)
+{
+ return hint && !skb_dst(skb) &&
+ ipv6_addr_equal(&ipv6_hdr(hint)->daddr, &ipv6_hdr(skb)->daddr);
+}
+
+static struct sk_buff *ip6_extract_route_hint(const struct net *net,
+ struct sk_buff *skb)
+{
+ if (fib6_routes_require_src(net) || fib6_has_custom_rules(net))
+ return NULL;
+
+ return skb;
}
static void ip6_list_rcv_finish(struct net *net, struct sock *sk,
struct list_head *head)
{
+ struct sk_buff *skb, *next, *hint = NULL;
struct dst_entry *curr_dst = NULL;
- struct sk_buff *skb, *next;
struct list_head sublist;
INIT_LIST_HEAD(&sublist);
@@ -102,9 +120,15 @@ static void ip6_list_rcv_finish(struct net *net, struct sock *sk,
skb = l3mdev_ip6_rcv(skb);
if (!skb)
continue;
- ip6_rcv_finish_core(net, sk, skb);
+
+ if (ip6_can_use_hint(skb, hint))
+ skb_dst_copy(skb, hint);
+ else
+ ip6_rcv_finish_core(net, sk, skb);
dst = skb_dst(skb);
if (curr_dst != dst) {
+ hint = ip6_extract_route_hint(net, skb);
+
/* dispatch old sublist */
if (!list_empty(&sublist))
ip6_sublist_rcv_finish(&sublist);
@@ -221,6 +245,16 @@ static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev,
if (ipv6_addr_is_multicast(&hdr->saddr))
goto err;
+ /* While RFC4291 is not explicit about v4mapped addresses
+ * in IPv6 headers, it seems clear linux dual-stack
+ * model can not deal properly with these.
+ * Security models could be fooled by ::ffff:127.0.0.1 for example.
+ *
+ * https://tools.ietf.org/html/draft-itojun-v6ops-v4mapped-harmful-02
+ */
+ if (ipv6_addr_v4mapped(&hdr->saddr))
+ goto err;
+
skb->transport_header = skb->network_header + sizeof(*hdr);
IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr);
@@ -313,7 +347,8 @@ void ipv6_list_rcv(struct list_head *head, struct packet_type *pt,
list_add_tail(&skb->list, &sublist);
}
/* dispatch final sublist */
- ip6_sublist_rcv(&sublist, curr_dev, curr_net);
+ if (!list_empty(&sublist))
+ ip6_sublist_rcv(&sublist, curr_dev, curr_net);
}
INDIRECT_CALLABLE_DECLARE(int udpv6_rcv(struct sk_buff *));
@@ -369,7 +404,7 @@ resubmit_final:
/* Free reference early: we don't need it any more,
and it may hold ip_conntrack module loaded
indefinitely. */
- nf_reset(skb);
+ nf_reset_ct(skb);
skb_postpull_rcsum(skb, skb_network_header(skb),
skb_network_header_len(skb));
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 8e49fd62eea9..087304427bbb 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -160,7 +160,7 @@ static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *s
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- struct net_device *dev = skb_dst(skb)->dev;
+ struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
skb->protocol = htons(ETH_P_IPV6);
@@ -173,7 +173,7 @@ int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
}
return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
- net, sk, skb, NULL, dev,
+ net, sk, skb, indev, dev,
ip6_finish_output,
!(IP6CB(skb)->flags & IP6SKB_REROUTED));
}
@@ -193,7 +193,7 @@ bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
* which are using proper atomic operations or spinlocks.
*/
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
- __u32 mark, struct ipv6_txoptions *opt, int tclass)
+ __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
{
struct net *net = sock_net(sk);
const struct ipv6_pinfo *np = inet6_sk(sk);
@@ -258,7 +258,7 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
hdr->daddr = *first_hop;
skb->protocol = htons(ETH_P_IPV6);
- skb->priority = sk->sk_priority;
+ skb->priority = priority;
skb->mark = mark;
mtu = dst_mtu(dst);
@@ -768,6 +768,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
inet6_sk(skb->sk) : NULL;
struct ip6_frag_state state;
unsigned int mtu, hlen, nexthdr_offset;
+ ktime_t tstamp = skb->tstamp;
int hroom, err = 0;
__be32 frag_id;
u8 *prevhdr, nexthdr = 0;
@@ -855,6 +856,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
if (iter.frag)
ip6_fraglist_prepare(skb, &iter);
+ skb->tstamp = tstamp;
err = output(net, sk, skb);
if (!err)
IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
@@ -913,6 +915,7 @@ slow_path:
/*
* Put this fragment into the sending queue.
*/
+ frag->tstamp = tstamp;
err = output(net, sk, frag);
if (err)
goto fail;
@@ -1141,19 +1144,19 @@ EXPORT_SYMBOL_GPL(ip6_dst_lookup);
* It returns a valid dst pointer on success, or a pointer encoded
* error code.
*/
-struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
+struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
const struct in6_addr *final_dst)
{
struct dst_entry *dst = NULL;
int err;
- err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
+ err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
if (err)
return ERR_PTR(err);
if (final_dst)
fl6->daddr = *final_dst;
- return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
+ return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
@@ -1185,7 +1188,7 @@ struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
if (dst)
return dst;
- dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
+ dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
if (connected && !IS_ERR(dst))
ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
@@ -1294,6 +1297,7 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
cork->base.fragsize = mtu;
cork->base.gso_size = ipc6->gso_size;
cork->base.tx_flags = 0;
+ cork->base.mark = ipc6->sockc.mark;
sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
if (dst_allfrag(xfrm_dst_path(&rt->dst)))
@@ -1764,7 +1768,7 @@ struct sk_buff *__ip6_make_skb(struct sock *sk,
hdr->daddr = *final_dst;
skb->priority = sk->sk_priority;
- skb->mark = sk->sk_mark;
+ skb->mark = cork->base.mark;
skb->tstamp = cork->base.transmit_time;
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 754a484d35df..5d65436ad5ad 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -121,6 +121,7 @@ static struct net_device_stats *ip6_get_stats(struct net_device *dev)
/**
* ip6_tnl_lookup - fetch tunnel matching the end-point addresses
+ * @link: ifindex of underlying interface
* @remote: the address of the tunnel exit-point
* @local: the address of the tunnel entry-point
*
@@ -134,37 +135,56 @@ static struct net_device_stats *ip6_get_stats(struct net_device *dev)
for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
static struct ip6_tnl *
-ip6_tnl_lookup(struct net *net, const struct in6_addr *remote, const struct in6_addr *local)
+ip6_tnl_lookup(struct net *net, int link,
+ const struct in6_addr *remote, const struct in6_addr *local)
{
unsigned int hash = HASH(remote, local);
- struct ip6_tnl *t;
+ struct ip6_tnl *t, *cand = NULL;
struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
struct in6_addr any;
for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) {
- if (ipv6_addr_equal(local, &t->parms.laddr) &&
- ipv6_addr_equal(remote, &t->parms.raddr) &&
- (t->dev->flags & IFF_UP))
+ if (!ipv6_addr_equal(local, &t->parms.laddr) ||
+ !ipv6_addr_equal(remote, &t->parms.raddr) ||
+ !(t->dev->flags & IFF_UP))
+ continue;
+
+ if (link == t->parms.link)
return t;
+ else
+ cand = t;
}
memset(&any, 0, sizeof(any));
hash = HASH(&any, local);
for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) {
- if (ipv6_addr_equal(local, &t->parms.laddr) &&
- ipv6_addr_any(&t->parms.raddr) &&
- (t->dev->flags & IFF_UP))
+ if (!ipv6_addr_equal(local, &t->parms.laddr) ||
+ !ipv6_addr_any(&t->parms.raddr) ||
+ !(t->dev->flags & IFF_UP))
+ continue;
+
+ if (link == t->parms.link)
return t;
+ else if (!cand)
+ cand = t;
}
hash = HASH(remote, &any);
for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) {
- if (ipv6_addr_equal(remote, &t->parms.raddr) &&
- ipv6_addr_any(&t->parms.laddr) &&
- (t->dev->flags & IFF_UP))
+ if (!ipv6_addr_equal(remote, &t->parms.raddr) ||
+ !ipv6_addr_any(&t->parms.laddr) ||
+ !(t->dev->flags & IFF_UP))
+ continue;
+
+ if (link == t->parms.link)
return t;
+ else if (!cand)
+ cand = t;
}
+ if (cand)
+ return cand;
+
t = rcu_dereference(ip6n->collect_md_tun);
if (t && t->dev->flags & IFF_UP)
return t;
@@ -351,7 +371,8 @@ static struct ip6_tnl *ip6_tnl_locate(struct net *net,
(t = rtnl_dereference(*tp)) != NULL;
tp = &t->next) {
if (ipv6_addr_equal(local, &t->parms.laddr) &&
- ipv6_addr_equal(remote, &t->parms.raddr)) {
+ ipv6_addr_equal(remote, &t->parms.raddr) &&
+ p->link == t->parms.link) {
if (create)
return ERR_PTR(-EEXIST);
@@ -485,7 +506,7 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt,
processing of the error. */
rcu_read_lock();
- t = ip6_tnl_lookup(dev_net(skb->dev), &ipv6h->daddr, &ipv6h->saddr);
+ t = ip6_tnl_lookup(dev_net(skb->dev), skb->dev->ifindex, &ipv6h->daddr, &ipv6h->saddr);
if (!t)
goto out;
@@ -640,7 +661,7 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
if (rel_info > dst_mtu(skb_dst(skb2)))
goto out;
- skb_dst_update_pmtu(skb2, rel_info);
+ skb_dst_update_pmtu_no_confirm(skb2, rel_info);
}
icmp_send(skb2, rel_type, rel_code, htonl(rel_info));
@@ -887,7 +908,7 @@ static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto,
int ret = -1;
rcu_read_lock();
- t = ip6_tnl_lookup(dev_net(skb->dev), &ipv6h->saddr, &ipv6h->daddr);
+ t = ip6_tnl_lookup(dev_net(skb->dev), skb->dev->ifindex, &ipv6h->saddr, &ipv6h->daddr);
if (t) {
u8 tproto = READ_ONCE(t->parms.proto);
@@ -1132,7 +1153,7 @@ route_lookup:
mtu = max(mtu, skb->protocol == htons(ETH_P_IPV6) ?
IPV6_MIN_MTU : IPV4_MIN_MTU);
- skb_dst_update_pmtu(skb, mtu);
+ skb_dst_update_pmtu_no_confirm(skb, mtu);
if (skb->len - t->tun_hlen - eth_hlen > mtu && !skb_is_gso(skb)) {
*pmtu = mtu;
err = -EMSGSIZE;
@@ -1420,8 +1441,10 @@ tx_err:
static void ip6_tnl_link_config(struct ip6_tnl *t)
{
struct net_device *dev = t->dev;
+ struct net_device *tdev = NULL;
struct __ip6_tnl_parm *p = &t->parms;
struct flowi6 *fl6 = &t->fl.u.ip6;
+ unsigned int mtu;
int t_hlen;
memcpy(dev->dev_addr, &p->laddr, sizeof(struct in6_addr));
@@ -1457,22 +1480,25 @@ static void ip6_tnl_link_config(struct ip6_tnl *t)
struct rt6_info *rt = rt6_lookup(t->net,
&p->raddr, &p->laddr,
p->link, NULL, strict);
+ if (rt) {
+ tdev = rt->dst.dev;
+ ip6_rt_put(rt);
+ }
- if (!rt)
- return;
+ if (!tdev && p->link)
+ tdev = __dev_get_by_index(t->net, p->link);
- if (rt->dst.dev) {
- dev->hard_header_len = rt->dst.dev->hard_header_len +
- t_hlen;
+ if (tdev) {
+ dev->hard_header_len = tdev->hard_header_len + t_hlen;
+ mtu = min_t(unsigned int, tdev->mtu, IP6_MAX_MTU);
- dev->mtu = rt->dst.dev->mtu - t_hlen;
+ dev->mtu = mtu - t_hlen;
if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
dev->mtu -= 8;
if (dev->mtu < IPV6_MIN_MTU)
dev->mtu = IPV6_MIN_MTU;
}
- ip6_rt_put(rt);
}
}
@@ -1877,10 +1903,8 @@ static int ip6_tnl_dev_init(struct net_device *dev)
if (err)
return err;
ip6_tnl_link_config(t);
- if (t->parms.collect_md) {
- dev->features |= NETIF_F_NETNS_LOCAL;
+ if (t->parms.collect_md)
netif_keep_dst(dev);
- }
return 0;
}
diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index 024db17386d2..524006aa0d78 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -449,8 +449,17 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl)
int err = -1;
int mtu;
- if (!dst)
- goto tx_err_link_failure;
+ if (!dst) {
+ fl->u.ip6.flowi6_oif = dev->ifindex;
+ fl->u.ip6.flowi6_flags |= FLOWI_FLAG_ANYSRC;
+ dst = ip6_route_output(dev_net(dev), NULL, &fl->u.ip6);
+ if (dst->error) {
+ dst_release(dst);
+ dst = NULL;
+ goto tx_err_link_failure;
+ }
+ skb_dst_set(skb, dst);
+ }
dst_hold(dst);
dst = xfrm_lookup(t->net, dst, fl, NULL, 0);
@@ -479,7 +488,7 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl)
mtu = dst_mtu(dst);
if (skb->len > mtu) {
- skb_dst_update_pmtu(skb, mtu);
+ skb_dst_update_pmtu_no_confirm(skb, mtu);
if (skb->protocol == htons(ETH_P_IPV6)) {
if (mtu < IPV6_MIN_MTU)
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index e80d36c5073d..bfa49ff70531 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -265,9 +265,10 @@ static void __net_exit ip6mr_rules_exit(struct net *net)
rtnl_unlock();
}
-static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb)
+static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
{
- return fib_rules_dump(net, nb, RTNL_FAMILY_IP6MR);
+ return fib_rules_dump(net, nb, RTNL_FAMILY_IP6MR, extack);
}
static unsigned int ip6mr_rules_seq_read(struct net *net)
@@ -324,7 +325,8 @@ static void __net_exit ip6mr_rules_exit(struct net *net)
rtnl_unlock();
}
-static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb)
+static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
{
return 0;
}
@@ -1148,8 +1150,8 @@ static int ip6mr_cache_unresolved(struct mr_table *mrt, mifi_t mifi,
* Create a new entry if allowable
*/
- if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 ||
- (c = ip6mr_cache_alloc_unres()) == NULL) {
+ c = ip6mr_cache_alloc_unres();
+ if (!c) {
spin_unlock_bh(&mfc_unres_lock);
kfree_skb(skb);
@@ -1256,10 +1258,11 @@ static unsigned int ip6mr_seq_read(struct net *net)
return net->ipv6.ipmr_seq + ip6mr_rules_seq_read(net);
}
-static int ip6mr_dump(struct net *net, struct notifier_block *nb)
+static int ip6mr_dump(struct net *net, struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
{
return mr_dump(net, nb, RTNL_FAMILY_IP6MR, ip6mr_rules_dump,
- ip6mr_mr_table_iter, &mrt_lock);
+ ip6mr_mr_table_iter, &mrt_lock, extack);
}
static struct notifier_block ip6_mr_notifier = {
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 264c292e7dcc..79fc012dd2ca 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -363,8 +363,8 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
break;
case IPV6_TRANSPARENT:
- if (valbool && !ns_capable(net->user_ns, CAP_NET_ADMIN) &&
- !ns_capable(net->user_ns, CAP_NET_RAW)) {
+ if (valbool && !ns_capable(net->user_ns, CAP_NET_RAW) &&
+ !ns_capable(net->user_ns, CAP_NET_ADMIN)) {
retv = -EPERM;
break;
}
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 7f3f13c37916..eaa4c2cc2fbb 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -787,14 +787,15 @@ static void mld_del_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im)
if (pmc) {
im->idev = pmc->idev;
if (im->mca_sfmode == MCAST_INCLUDE) {
- im->mca_tomb = pmc->mca_tomb;
- im->mca_sources = pmc->mca_sources;
+ swap(im->mca_tomb, pmc->mca_tomb);
+ swap(im->mca_sources, pmc->mca_sources);
for (psf = im->mca_sources; psf; psf = psf->sf_next)
psf->sf_crcount = idev->mc_qrv;
} else {
im->mca_crcount = idev->mc_qrv;
}
in6_dev_put(pmc->idev);
+ ip6_mc_clear_src(pmc);
kfree(pmc);
}
spin_unlock_bh(&im->mca_lock);
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 083cc1c94cd3..53caf59c591e 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -196,6 +196,7 @@ static inline int ndisc_is_useropt(const struct net_device *dev,
{
return opt->nd_opt_type == ND_OPT_RDNSS ||
opt->nd_opt_type == ND_OPT_DNSSL ||
+ opt->nd_opt_type == ND_OPT_CAPTIVE_PORTAL ||
ndisc_ops_is_useropt(dev, opt->nd_opt_type);
}
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index 61819ed858b1..409e79b84a83 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -113,12 +113,13 @@ int __nf_ip6_route(struct net *net, struct dst_entry **dst,
EXPORT_SYMBOL_GPL(__nf_ip6_route);
int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
- struct nf_ct_bridge_frag_data *data,
+ struct nf_bridge_frag_data *data,
int (*output)(struct net *, struct sock *sk,
- const struct nf_ct_bridge_frag_data *data,
+ const struct nf_bridge_frag_data *data,
struct sk_buff *))
{
int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size;
+ ktime_t tstamp = skb->tstamp;
struct ip6_frag_state state;
u8 *prevhdr, nexthdr = 0;
unsigned int mtu, hlen;
@@ -183,6 +184,7 @@ int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
if (iter.frag)
ip6_fraglist_prepare(skb, &iter);
+ skb->tstamp = tstamp;
err = output(net, sk, data, skb);
if (err || !iter.frag)
break;
@@ -215,6 +217,7 @@ slow_path:
goto blackhole;
}
+ skb2->tstamp = tstamp;
err = output(net, sk, data, skb2);
if (err)
goto blackhole;
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index 6120a7800975..0594131fa46d 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -128,9 +128,9 @@ config IP6_NF_MATCH_HL
depends on NETFILTER_ADVANCED
select NETFILTER_XT_MATCH_HL
---help---
- This is a backwards-compat option for the user's convenience
- (e.g. when running oldconfig). It selects
- CONFIG_NETFILTER_XT_MATCH_HL.
+ This is a backwards-compat option for the user's convenience
+ (e.g. when running oldconfig). It selects
+ CONFIG_NETFILTER_XT_MATCH_HL.
config IP6_NF_MATCH_IPV6HEADER
tristate '"ipv6header" IPv6 Extension Headers Match'
@@ -170,13 +170,13 @@ config IP6_NF_MATCH_RT
To compile it as a module, choose M here. If unsure, say N.
config IP6_NF_MATCH_SRH
- tristate '"srh" Segment Routing header match support'
- depends on NETFILTER_ADVANCED
- help
- srh matching allows you to match packets based on the segment
+ tristate '"srh" Segment Routing header match support'
+ depends on NETFILTER_ADVANCED
+ help
+ srh matching allows you to match packets based on the segment
routing header of the packet.
- To compile it as a module, choose M here. If unsure, say N.
+ To compile it as a module, choose M here. If unsure, say N.
# The targets
config IP6_NF_TARGET_HL
@@ -184,9 +184,9 @@ config IP6_NF_TARGET_HL
depends on NETFILTER_ADVANCED && IP6_NF_MANGLE
select NETFILTER_XT_TARGET_HL
---help---
- This is a backwards-compatible option for the user's convenience
- (e.g. when running oldconfig). It selects
- CONFIG_NETFILTER_XT_TARGET_HL.
+ This is a backwards-compatible option for the user's convenience
+ (e.g. when running oldconfig). It selects
+ CONFIG_NETFILTER_XT_TARGET_HL.
config IP6_NF_FILTER
tristate "Packet filtering"
@@ -245,14 +245,14 @@ config IP6_NF_RAW
# security table for MAC policy
config IP6_NF_SECURITY
- tristate "Security table"
- depends on SECURITY
- depends on NETFILTER_ADVANCED
- help
- This option adds a `security' table to iptables, for use
- with Mandatory Access Control (MAC) policy.
-
- If unsure, say N.
+ tristate "Security table"
+ depends on SECURITY
+ depends on NETFILTER_ADVANCED
+ help
+ This option adds a `security' table to iptables, for use
+ with Mandatory Access Control (MAC) policy.
+
+ If unsure, say N.
config IP6_NF_NAT
tristate "ip6tables NAT support"
diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c
index 5cdb4a69d277..fd1f52a21bf1 100644
--- a/net/ipv6/netfilter/ip6t_SYNPROXY.c
+++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c
@@ -36,8 +36,8 @@ synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par)
opts.options |= XT_SYNPROXY_OPT_ECN;
opts.options &= info->options;
- opts.mss_encode = opts.mss;
- opts.mss = info->mss;
+ opts.mss_encode = opts.mss_option;
+ opts.mss_option = info->mss;
if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
synproxy_init_timestamp_cookie(info, &opts);
else
diff --git a/net/ipv6/netfilter/ip6t_ipv6header.c b/net/ipv6/netfilter/ip6t_ipv6header.c
index 0fc6326ef499..c52ff929c93b 100644
--- a/net/ipv6/netfilter/ip6t_ipv6header.c
+++ b/net/ipv6/netfilter/ip6t_ipv6header.c
@@ -16,7 +16,7 @@
#include <net/ipv6.h>
#include <linux/netfilter/x_tables.h>
-#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/netfilter_ipv6.h>
#include <linux/netfilter_ipv6/ip6t_ipv6header.h>
MODULE_LICENSE("GPL");
@@ -42,7 +42,7 @@ ipv6header_mt6(const struct sk_buff *skb, struct xt_action_param *par)
len = skb->len - ptr;
temp = 0;
- while (ip6t_ext_hdr(nexthdr)) {
+ while (nf_ip6_ext_hdr(nexthdr)) {
const struct ipv6_opt_hdr *hp;
struct ipv6_opt_hdr _hdr;
int hdrlen;
diff --git a/net/ipv6/netfilter/nf_dup_ipv6.c b/net/ipv6/netfilter/nf_dup_ipv6.c
index e6c9da9866b1..a0a2de30be3e 100644
--- a/net/ipv6/netfilter/nf_dup_ipv6.c
+++ b/net/ipv6/netfilter/nf_dup_ipv6.c
@@ -54,7 +54,7 @@ void nf_dup_ipv6(struct net *net, struct sk_buff *skb, unsigned int hooknum,
return;
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
- nf_reset(skb);
+ nf_reset_ct(skb);
nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
#endif
if (hooknum == NF_INET_PRE_ROUTING ||
diff --git a/net/ipv6/netfilter/nf_flow_table_ipv6.c b/net/ipv6/netfilter/nf_flow_table_ipv6.c
index f6d9a48c7a2a..a8566ee12e83 100644
--- a/net/ipv6/netfilter/nf_flow_table_ipv6.c
+++ b/net/ipv6/netfilter/nf_flow_table_ipv6.c
@@ -10,6 +10,8 @@
static struct nf_flowtable_type flowtable_ipv6 = {
.family = NFPROTO_IPV6,
.init = nf_flow_table_init,
+ .setup = nf_flow_table_offload_setup,
+ .action = nf_flow_rule_route_ipv6,
.free = nf_flow_table_free,
.hook = nf_flow_offload_ipv6_hook,
.owner = THIS_MODULE,
diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c
index f53bd8f01219..22b80db6d882 100644
--- a/net/ipv6/netfilter/nf_log_ipv6.c
+++ b/net/ipv6/netfilter/nf_log_ipv6.c
@@ -18,7 +18,7 @@
#include <net/route.h>
#include <linux/netfilter.h>
-#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/netfilter_ipv6.h>
#include <linux/netfilter/xt_LOG.h>
#include <net/netfilter/nf_log.h>
@@ -70,7 +70,7 @@ static void dump_ipv6_packet(struct net *net, struct nf_log_buf *m,
fragment = 0;
ptr = ip6hoff + sizeof(struct ipv6hdr);
currenthdr = ih->nexthdr;
- while (currenthdr != NEXTHDR_NONE && ip6t_ext_hdr(currenthdr)) {
+ while (currenthdr != NEXTHDR_NONE && nf_ip6_ext_hdr(currenthdr)) {
struct ipv6_opt_hdr _hdr;
const struct ipv6_opt_hdr *hp;
diff --git a/net/ipv6/netfilter/nf_socket_ipv6.c b/net/ipv6/netfilter/nf_socket_ipv6.c
index 437d95545c31..b9df879c48d3 100644
--- a/net/ipv6/netfilter/nf_socket_ipv6.c
+++ b/net/ipv6/netfilter/nf_socket_ipv6.c
@@ -12,7 +12,6 @@
#include <net/sock.h>
#include <net/inet_sock.h>
#include <net/inet6_hashtables.h>
-#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
#include <net/netfilter/nf_socket.h>
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
#include <net/netfilter/nf_conntrack.h>
diff --git a/net/ipv6/netfilter/nf_tproxy_ipv6.c b/net/ipv6/netfilter/nf_tproxy_ipv6.c
index 34d51cd426b0..6bac68fb27a3 100644
--- a/net/ipv6/netfilter/nf_tproxy_ipv6.c
+++ b/net/ipv6/netfilter/nf_tproxy_ipv6.c
@@ -150,4 +150,4 @@ EXPORT_SYMBOL_GPL(nf_tproxy_get_sock_v6);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Balazs Scheidler, Krisztian Kovacs");
-MODULE_DESCRIPTION("Netfilter IPv4 transparent proxy support");
+MODULE_DESCRIPTION("Netfilter IPv6 transparent proxy support");
diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
index 87d2d8c1db7c..98ac32b49d8c 100644
--- a/net/ipv6/ping.c
+++ b/net/ipv6/ping.c
@@ -223,7 +223,7 @@ static int __net_init ping_v6_proc_init_net(struct net *net)
return 0;
}
-static void __net_init ping_v6_proc_exit_net(struct net *net)
+static void __net_exit ping_v6_proc_exit_net(struct net *net)
{
remove_proc_entry("icmp6", net->proc_net);
}
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 8a6131991e38..dfe5e603ffe1 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -215,7 +215,7 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
/* Not releasing hash table! */
if (clone) {
- nf_reset(clone);
+ nf_reset_ct(clone);
rawv6_rcv(sk, clone);
}
}
@@ -646,7 +646,7 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length,
skb->protocol = htons(ETH_P_IPV6);
skb->priority = sk->sk_priority;
- skb->mark = sk->sk_mark;
+ skb->mark = sockc->mark;
skb->tstamp = sockc->transmit_time;
skb_put(skb, length);
@@ -810,6 +810,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
ipcm6_init(&ipc6);
ipc6.sockc.tsflags = sk->sk_tsflags;
+ ipc6.sockc.mark = sk->sk_mark;
if (sin6) {
if (addr_len < SIN6_LEN_RFC2133)
@@ -891,6 +892,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
opt = ipv6_fixup_options(&opt_space, opt);
fl6.flowi6_proto = proto;
+ fl6.flowi6_mark = ipc6.sockc.mark;
if (!hdrincl) {
rfv.msg = msg;
@@ -923,7 +925,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel);
- dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
+ dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p);
if (IS_ERR(dst)) {
err = PTR_ERR(dst);
goto out;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index fd059e08785a..4fbdc60b4e07 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -95,7 +95,8 @@ static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void ip6_link_failure(struct sk_buff *skb);
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
- struct sk_buff *skb, u32 mtu);
+ struct sk_buff *skb, u32 mtu,
+ bool confirm_neigh);
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
struct sk_buff *skb);
static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
@@ -227,7 +228,7 @@ static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
struct net_device *dev = dst->dev;
struct rt6_info *rt = (struct rt6_info *)dst;
- daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
+ daddr = choose_neigh_daddr(rt6_nexthop(rt, &in6addr_any), NULL, daddr);
if (!daddr)
return;
if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
@@ -264,7 +265,8 @@ static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
}
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
- struct sk_buff *skb, u32 mtu)
+ struct sk_buff *skb, u32 mtu,
+ bool confirm_neigh)
{
}
@@ -621,6 +623,7 @@ static void rt6_probe(struct fib6_nh *fib6_nh)
{
struct __rt6_probe_work *work = NULL;
const struct in6_addr *nh_gw;
+ unsigned long last_probe;
struct neighbour *neigh;
struct net_device *dev;
struct inet6_dev *idev;
@@ -633,12 +636,13 @@ static void rt6_probe(struct fib6_nh *fib6_nh)
* Router Reachability Probe MUST be rate-limited
* to no more than one per minute.
*/
- if (fib6_nh->fib_nh_gw_family)
+ if (!fib6_nh->fib_nh_gw_family)
return;
nh_gw = &fib6_nh->fib_nh_gw6;
dev = fib6_nh->fib_nh_dev;
rcu_read_lock_bh();
+ last_probe = READ_ONCE(fib6_nh->last_probe);
idev = __in6_dev_get(dev);
neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
if (neigh) {
@@ -654,13 +658,15 @@ static void rt6_probe(struct fib6_nh *fib6_nh)
__neigh_set_probe_once(neigh);
}
write_unlock(&neigh->lock);
- } else if (time_after(jiffies, fib6_nh->last_probe +
+ } else if (time_after(jiffies, last_probe +
idev->cnf.rtr_probe_interval)) {
work = kmalloc(sizeof(*work), GFP_ATOMIC);
}
- if (work) {
- fib6_nh->last_probe = jiffies;
+ if (!work || cmpxchg(&fib6_nh->last_probe,
+ last_probe, jiffies) != last_probe) {
+ kfree(work);
+ } else {
INIT_WORK(&work->work, rt6_probe_deferred);
work->target = *nh_gw;
dev_hold(dev);
@@ -1475,11 +1481,11 @@ static u32 rt6_exception_hash(const struct in6_addr *dst,
u32 val;
net_get_random_once(&seed, sizeof(seed));
- val = jhash(dst, sizeof(*dst), seed);
+ val = jhash2((const u32 *)dst, sizeof(*dst)/sizeof(u32), seed);
#ifdef CONFIG_IPV6_SUBTREES
if (src)
- val = jhash(src, sizeof(*src), val);
+ val = jhash2((const u32 *)src, sizeof(*src)/sizeof(u32), val);
#endif
return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}
@@ -2291,10 +2297,7 @@ static void ip6_multipath_l3_keys(const struct sk_buff *skb,
if (!icmph)
goto out;
- if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
- icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
- icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
- icmph->icmp6_type != ICMPV6_PARAMPROB)
+ if (!icmpv6_is_err(icmph->icmp6_type))
goto out;
inner_iph = skb_header_pointer(skb,
@@ -2691,7 +2694,8 @@ static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
}
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
- const struct ipv6hdr *iph, u32 mtu)
+ const struct ipv6hdr *iph, u32 mtu,
+ bool confirm_neigh)
{
const struct in6_addr *daddr, *saddr;
struct rt6_info *rt6 = (struct rt6_info *)dst;
@@ -2709,7 +2713,10 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
daddr = NULL;
saddr = NULL;
}
- dst_confirm_neigh(dst, daddr);
+
+ if (confirm_neigh)
+ dst_confirm_neigh(dst, daddr);
+
mtu = max_t(u32, mtu, IPV6_MIN_MTU);
if (mtu >= dst_mtu(dst))
return;
@@ -2725,10 +2732,9 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
rcu_read_lock();
res.f6i = rcu_dereference(rt6->from);
- if (!res.f6i) {
- rcu_read_unlock();
- return;
- }
+ if (!res.f6i)
+ goto out_unlock;
+
res.fib6_flags = res.f6i->fib6_flags;
res.fib6_type = res.f6i->fib6_type;
@@ -2744,10 +2750,8 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
/* fib6_info uses a nexthop that does not have fib6_nh
* using the dst->dev + gw. Should be impossible.
*/
- if (!arg.match) {
- rcu_read_unlock();
- return;
- }
+ if (!arg.match)
+ goto out_unlock;
res.nh = arg.match;
} else {
@@ -2760,14 +2764,17 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
if (rt6_insert_exception(nrt6, &res))
dst_release_immediate(&nrt6->dst);
}
+out_unlock:
rcu_read_unlock();
}
}
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
- struct sk_buff *skb, u32 mtu)
+ struct sk_buff *skb, u32 mtu,
+ bool confirm_neigh)
{
- __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
+ __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu,
+ confirm_neigh);
}
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
@@ -2786,7 +2793,7 @@ void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
dst = ip6_route_output(net, NULL, &fl6);
if (!dst->error)
- __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
+ __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu), true);
dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_update_pmtu);
@@ -3385,6 +3392,9 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
int err;
fib6_nh->fib_nh_family = AF_INET6;
+#ifdef CONFIG_IPV6_ROUTER_PREF
+ fib6_nh->last_probe = jiffies;
+#endif
err = -ENODEV;
if (cfg->fc_ifindex) {
@@ -3747,6 +3757,7 @@ static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
struct fib6_info *sibling, *next_sibling;
+ struct fib6_node *fn;
/* prefer to send a single notification with all hops */
skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
@@ -3762,12 +3773,32 @@ static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
info->skip_notify = 1;
}
+ /* 'rt' points to the first sibling route. If it is not the
+ * leaf, then we do not need to send a notification. Otherwise,
+ * we need to check if the last sibling has a next route or not
+ * and emit a replace or delete notification, respectively.
+ */
info->skip_notify_kernel = 1;
- call_fib6_multipath_entry_notifiers(net,
- FIB_EVENT_ENTRY_DEL,
- rt,
- rt->fib6_nsiblings,
- NULL);
+ fn = rcu_dereference_protected(rt->fib6_node,
+ lockdep_is_held(&table->tb6_lock));
+ if (rcu_access_pointer(fn->leaf) == rt) {
+ struct fib6_info *last_sibling, *replace_rt;
+
+ last_sibling = list_last_entry(&rt->fib6_siblings,
+ struct fib6_info,
+ fib6_siblings);
+ replace_rt = rcu_dereference_protected(
+ last_sibling->fib6_next,
+ lockdep_is_held(&table->tb6_lock));
+ if (replace_rt)
+ call_fib6_entry_notifiers_replace(net,
+ replace_rt);
+ else
+ call_fib6_multipath_entry_notifiers(net,
+ FIB_EVENT_ENTRY_DEL,
+ rt, rt->fib6_nsiblings,
+ NULL);
+ }
list_for_each_entry_safe(sibling, next_sibling,
&rt->fib6_siblings,
fib6_siblings) {
@@ -4388,13 +4419,14 @@ struct fib6_info *addrconf_f6i_alloc(struct net *net,
struct fib6_config cfg = {
.fc_table = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL,
.fc_ifindex = idev->dev->ifindex,
- .fc_flags = RTF_UP | RTF_ADDRCONF | RTF_NONEXTHOP,
+ .fc_flags = RTF_UP | RTF_NONEXTHOP,
.fc_dst = *addr,
.fc_dst_len = 128,
.fc_protocol = RTPROT_KERNEL,
.fc_nlinfo.nl_net = net,
.fc_ignore_dev_down = true,
};
+ struct fib6_info *f6i;
if (anycast) {
cfg.fc_type = RTN_ANYCAST;
@@ -4404,7 +4436,10 @@ struct fib6_info *addrconf_f6i_alloc(struct net *net,
cfg.fc_flags |= RTF_LOCAL;
}
- return ip6_route_info_create(&cfg, gfp_flags, NULL);
+ f6i = ip6_route_info_create(&cfg, gfp_flags, NULL);
+ if (!IS_ERR(f6i))
+ f6i->dst_nocount = true;
+ return f6i;
}
/* remove deleted ip from prefsrc entries */
@@ -5011,12 +5046,37 @@ static void ip6_route_mpath_notify(struct fib6_info *rt,
inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
}
+static bool ip6_route_mpath_should_notify(const struct fib6_info *rt)
+{
+ bool rt_can_ecmp = rt6_qualify_for_ecmp(rt);
+ bool should_notify = false;
+ struct fib6_info *leaf;
+ struct fib6_node *fn;
+
+ rcu_read_lock();
+ fn = rcu_dereference(rt->fib6_node);
+ if (!fn)
+ goto out;
+
+ leaf = rcu_dereference(fn->leaf);
+ if (!leaf)
+ goto out;
+
+ if (rt == leaf ||
+ (rt_can_ecmp && rt->fib6_metric == leaf->fib6_metric &&
+ rt6_qualify_for_ecmp(leaf)))
+ should_notify = true;
+out:
+ rcu_read_unlock();
+
+ return should_notify;
+}
+
static int ip6_route_multipath_add(struct fib6_config *cfg,
struct netlink_ext_ack *extack)
{
struct fib6_info *rt_notif = NULL, *rt_last = NULL;
struct nl_info *info = &cfg->fc_nlinfo;
- enum fib_event_type event_type;
struct fib6_config r_cfg;
struct rtnexthop *rtnh;
struct fib6_info *rt;
@@ -5141,13 +5201,27 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
nhn++;
}
- event_type = replace ? FIB_EVENT_ENTRY_REPLACE : FIB_EVENT_ENTRY_ADD;
- err = call_fib6_multipath_entry_notifiers(info->nl_net, event_type,
- rt_notif, nhn - 1, extack);
- if (err) {
- /* Delete all the siblings that were just added */
- err_nh = NULL;
- goto add_errout;
+ /* An in-kernel notification should only be sent in case the new
+ * multipath route is added as the first route in the node, or if
+ * it was appended to it. We pass 'rt_notif' since it is the first
+ * sibling and might allow us to skip some checks in the replace case.
+ */
+ if (ip6_route_mpath_should_notify(rt_notif)) {
+ enum fib_event_type fib_event;
+
+ if (rt_notif->fib6_nsiblings != nhn - 1)
+ fib_event = FIB_EVENT_ENTRY_APPEND;
+ else
+ fib_event = FIB_EVENT_ENTRY_REPLACE;
+
+ err = call_fib6_multipath_entry_notifiers(info->nl_net,
+ fib_event, rt_notif,
+ nhn - 1, extack);
+ if (err) {
+ /* Delete all the siblings that were just added */
+ err_nh = NULL;
+ goto add_errout;
+ }
}
/* success ... tell user about new route */
@@ -5325,11 +5399,11 @@ static int rt6_fill_node_nexthop(struct sk_buff *skb, struct nexthop *nh,
if (nexthop_is_multipath(nh)) {
struct nlattr *mp;
- mp = nla_nest_start(skb, RTA_MULTIPATH);
+ mp = nla_nest_start_noflag(skb, RTA_MULTIPATH);
if (!mp)
goto nla_put_failure;
- if (nexthop_mpath_fill_node(skb, nh))
+ if (nexthop_mpath_fill_node(skb, nh, AF_INET6))
goto nla_put_failure;
nla_nest_end(skb, mp);
@@ -5337,7 +5411,7 @@ static int rt6_fill_node_nexthop(struct sk_buff *skb, struct nexthop *nh,
struct fib6_nh *fib6_nh;
fib6_nh = nexthop_fib6_nh(nh);
- if (fib_nexthop_info(skb, &fib6_nh->nh_common,
+ if (fib_nexthop_info(skb, &fib6_nh->nh_common, AF_INET6,
flags, false) < 0)
goto nla_put_failure;
}
@@ -5466,13 +5540,14 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
goto nla_put_failure;
if (fib_add_nexthop(skb, &rt->fib6_nh->nh_common,
- rt->fib6_nh->fib_nh_weight) < 0)
+ rt->fib6_nh->fib_nh_weight, AF_INET6) < 0)
goto nla_put_failure;
list_for_each_entry_safe(sibling, next_sibling,
&rt->fib6_siblings, fib6_siblings) {
if (fib_add_nexthop(skb, &sibling->fib6_nh->nh_common,
- sibling->fib6_nh->fib_nh_weight) < 0)
+ sibling->fib6_nh->fib_nh_weight,
+ AF_INET6) < 0)
goto nla_put_failure;
}
@@ -5489,7 +5564,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
rtm->rtm_flags |= nh_flags;
} else {
- if (fib_nexthop_info(skb, &rt->fib6_nh->nh_common,
+ if (fib_nexthop_info(skb, &rt->fib6_nh->nh_common, AF_INET6,
&nh_flags, false) < 0)
goto nla_put_failure;
@@ -5501,6 +5576,13 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
expires -= jiffies;
}
+ if (!dst) {
+ if (rt->offload)
+ rtm->rtm_flags |= RTM_F_OFFLOAD;
+ if (rt->trap)
+ rtm->rtm_flags |= RTM_F_TRAP;
+ }
+
if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
goto nla_put_failure;
@@ -6192,6 +6274,9 @@ static int __net_init ip6_route_net_init(struct net *net)
dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
ip6_template_metrics, true);
INIT_LIST_HEAD(&net->ipv6.ip6_blk_hole_entry->rt6i_uncached);
+#ifdef CONFIG_IPV6_SUBTREES
+ net->ipv6.fib6_routes_require_src = 0;
+#endif
#endif
net->ipv6.sysctl.flush_delay = 0;
diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c
index 9d4f75e0d33a..7cbc19731997 100644
--- a/net/ipv6/seg6_local.c
+++ b/net/ipv6/seg6_local.c
@@ -23,6 +23,7 @@
#include <net/addrconf.h>
#include <net/ip6_route.h>
#include <net/dst_cache.h>
+#include <net/ip_tunnels.h>
#ifdef CONFIG_IPV6_SEG6_HMAC
#include <net/seg6_hmac.h>
#endif
@@ -81,6 +82,11 @@ static struct ipv6_sr_hdr *get_srh(struct sk_buff *skb)
if (!pskb_may_pull(skb, srhoff + len))
return NULL;
+ /* note that pskb_may_pull may change pointers in header;
+ * for this reason it is necessary to reload them when needed.
+ */
+ srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
+
if (!seg6_validate_srh(srh, len))
return NULL;
@@ -130,7 +136,8 @@ static bool decap_and_validate(struct sk_buff *skb, int proto)
skb_reset_network_header(skb);
skb_reset_transport_header(skb);
- skb->encapsulation = 0;
+ if (iptunnel_pull_offloads(skb))
+ return false;
return true;
}
@@ -144,8 +151,9 @@ static void advance_nextseg(struct ipv6_sr_hdr *srh, struct in6_addr *daddr)
*daddr = *addr;
}
-int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr,
- u32 tbl_id)
+static int
+seg6_lookup_any_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr,
+ u32 tbl_id, bool local_delivery)
{
struct net *net = dev_net(skb->dev);
struct ipv6hdr *hdr = ipv6_hdr(skb);
@@ -153,6 +161,7 @@ int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr,
struct dst_entry *dst = NULL;
struct rt6_info *rt;
struct flowi6 fl6;
+ int dev_flags = 0;
fl6.flowi6_iif = skb->dev->ifindex;
fl6.daddr = nhaddr ? *nhaddr : hdr->daddr;
@@ -177,7 +186,13 @@ int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr,
dst = &rt->dst;
}
- if (dst && dst->dev->flags & IFF_LOOPBACK && !dst->error) {
+ /* we want to discard traffic destined for local packet processing,
+ * if @local_delivery is set to false.
+ */
+ if (!local_delivery)
+ dev_flags |= IFF_LOOPBACK;
+
+ if (dst && (dst->dev->flags & dev_flags) && !dst->error) {
dst_release(dst);
dst = NULL;
}
@@ -194,6 +209,12 @@ out:
return dst->error;
}
+int seg6_lookup_nexthop(struct sk_buff *skb,
+ struct in6_addr *nhaddr, u32 tbl_id)
+{
+ return seg6_lookup_any_nexthop(skb, nhaddr, tbl_id, false);
+}
+
/* regular endpoint function */
static int input_action_end(struct sk_buff *skb, struct seg6_local_lwt *slwt)
{
@@ -336,6 +357,8 @@ static int input_action_end_dx6(struct sk_buff *skb,
if (!ipv6_addr_any(&slwt->nh6))
nhaddr = &slwt->nh6;
+ skb_set_transport_header(skb, sizeof(struct ipv6hdr));
+
seg6_lookup_nexthop(skb, nhaddr, 0);
return dst_input(skb);
@@ -365,6 +388,8 @@ static int input_action_end_dx4(struct sk_buff *skb,
skb_dst_drop(skb);
+ skb_set_transport_header(skb, sizeof(struct iphdr));
+
err = ip_route_input(skb, nhaddr, iph->saddr, 0, skb->dev);
if (err)
goto drop;
@@ -385,7 +410,9 @@ static int input_action_end_dt6(struct sk_buff *skb,
if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
goto drop;
- seg6_lookup_nexthop(skb, NULL, slwt->table);
+ skb_set_transport_header(skb, sizeof(struct ipv6hdr));
+
+ seg6_lookup_any_nexthop(skb, NULL, slwt->table, true);
return dst_input(skb);
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index b2ccbc473127..98954830c40b 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -944,7 +944,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
}
if (tunnel->parms.iph.daddr)
- skb_dst_update_pmtu(skb, mtu);
+ skb_dst_update_pmtu_no_confirm(skb, mtu);
if (skb->len > mtu && !skb_is_gso(skb)) {
icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index 16632e02e9b0..13235a012388 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -178,6 +178,9 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
treq = tcp_rsk(req);
treq->tfo_listener = false;
+ if (IS_ENABLED(CONFIG_MPTCP))
+ treq->is_mptcp = 0;
+
if (security_inet_conn_request(sk, skb, req))
goto out_free;
@@ -235,7 +238,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
fl6.flowi6_uid = sk->sk_uid;
security_req_classify_flow(req, flowi6_to_flowi(&fl6));
- dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
+ dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p);
if (IS_ERR(dst))
goto out_free;
}
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 5da069e91cac..eaf09e6b7844 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -75,13 +75,14 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);
static const struct inet_connection_sock_af_ops ipv6_mapped;
-static const struct inet_connection_sock_af_ops ipv6_specific;
+const struct inet_connection_sock_af_ops ipv6_specific;
#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv6_specific;
static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific;
#else
static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(const struct sock *sk,
- const struct in6_addr *addr)
+ const struct in6_addr *addr,
+ int l3index)
{
return NULL;
}
@@ -215,7 +216,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
!ipv6_addr_equal(&sk->sk_v6_daddr, &usin->sin6_addr)) {
tp->rx_opt.ts_recent = 0;
tp->rx_opt.ts_recent_stamp = 0;
- tp->write_seq = 0;
+ WRITE_ONCE(tp->write_seq, 0);
}
sk->sk_v6_daddr = usin->sin6_addr;
@@ -237,6 +238,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3];
icsk->icsk_af_ops = &ipv6_mapped;
+ if (sk_is_mptcp(sk))
+ mptcpv6_handle_mapped(sk, true);
sk->sk_backlog_rcv = tcp_v4_do_rcv;
#ifdef CONFIG_TCP_MD5SIG
tp->af_specific = &tcp_sock_ipv6_mapped_specific;
@@ -247,6 +250,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
if (err) {
icsk->icsk_ext_hdr_len = exthdrlen;
icsk->icsk_af_ops = &ipv6_specific;
+ if (sk_is_mptcp(sk))
+ mptcpv6_handle_mapped(sk, false);
sk->sk_backlog_rcv = tcp_v6_do_rcv;
#ifdef CONFIG_TCP_MD5SIG
tp->af_specific = &tcp_sock_ipv6_specific;
@@ -275,7 +280,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
- dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
+ dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p);
if (IS_ERR(dst)) {
err = PTR_ERR(dst);
goto failure;
@@ -311,10 +316,11 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
if (likely(!tp->repair)) {
if (!tp->write_seq)
- tp->write_seq = secure_tcpv6_seq(np->saddr.s6_addr32,
- sk->sk_v6_daddr.s6_addr32,
- inet->inet_sport,
- inet->inet_dport);
+ WRITE_ONCE(tp->write_seq,
+ secure_tcpv6_seq(np->saddr.s6_addr32,
+ sk->sk_v6_daddr.s6_addr32,
+ inet->inet_sport,
+ inet->inet_dport));
tp->tsoffset = secure_tcpv6_ts_off(sock_net(sk),
np->saddr.s6_addr32,
sk->sk_v6_daddr.s6_addr32);
@@ -406,7 +412,7 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
tp = tcp_sk(sk);
/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
- fastopen = tp->fastopen_rsk;
+ fastopen = rcu_dereference(tp->fastopen_rsk);
snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
if (sk->sk_state != TCP_LISTEN &&
!between(seq, snd_una, tp->snd_nxt)) {
@@ -512,7 +518,8 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
opt = ireq->ipv6_opt;
if (!opt)
opt = rcu_dereference(np->opt);
- err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt, np->tclass);
+ err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt, np->tclass,
+ sk->sk_priority);
rcu_read_unlock();
err = net_xmit_eval(err);
}
@@ -530,15 +537,22 @@ static void tcp_v6_reqsk_destructor(struct request_sock *req)
#ifdef CONFIG_TCP_MD5SIG
static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(const struct sock *sk,
- const struct in6_addr *addr)
+ const struct in6_addr *addr,
+ int l3index)
{
- return tcp_md5_do_lookup(sk, (union tcp_md5_addr *)addr, AF_INET6);
+ return tcp_md5_do_lookup(sk, l3index,
+ (union tcp_md5_addr *)addr, AF_INET6);
}
static struct tcp_md5sig_key *tcp_v6_md5_lookup(const struct sock *sk,
const struct sock *addr_sk)
{
- return tcp_v6_md5_do_lookup(sk, &addr_sk->sk_v6_daddr);
+ int l3index;
+
+ l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
+ addr_sk->sk_bound_dev_if);
+ return tcp_v6_md5_do_lookup(sk, &addr_sk->sk_v6_daddr,
+ l3index);
}
static int tcp_v6_parse_md5_keys(struct sock *sk, int optname,
@@ -546,6 +560,7 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, int optname,
{
struct tcp_md5sig cmd;
struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&cmd.tcpm_addr;
+ int l3index = 0;
u8 prefixlen;
if (optlen < sizeof(cmd))
@@ -567,12 +582,30 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, int optname,
prefixlen = ipv6_addr_v4mapped(&sin6->sin6_addr) ? 32 : 128;
}
+ if (optname == TCP_MD5SIG_EXT &&
+ cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
+ struct net_device *dev;
+
+ rcu_read_lock();
+ dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
+ if (dev && netif_is_l3_master(dev))
+ l3index = dev->ifindex;
+ rcu_read_unlock();
+
+ /* ok to reference set/not set outside of rcu;
+ * right now device MUST be an L3 master
+ */
+ if (!dev || !l3index)
+ return -EINVAL;
+ }
+
if (!cmd.tcpm_keylen) {
if (ipv6_addr_v4mapped(&sin6->sin6_addr))
return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin6->sin6_addr.s6_addr32[3],
- AF_INET, prefixlen);
+ AF_INET, prefixlen,
+ l3index);
return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin6->sin6_addr,
- AF_INET6, prefixlen);
+ AF_INET6, prefixlen, l3index);
}
if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
@@ -580,12 +613,13 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, int optname,
if (ipv6_addr_v4mapped(&sin6->sin6_addr))
return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin6->sin6_addr.s6_addr32[3],
- AF_INET, prefixlen, cmd.tcpm_key,
- cmd.tcpm_keylen, GFP_KERNEL);
+ AF_INET, prefixlen, l3index,
+ cmd.tcpm_key, cmd.tcpm_keylen,
+ GFP_KERNEL);
return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin6->sin6_addr,
- AF_INET6, prefixlen, cmd.tcpm_key,
- cmd.tcpm_keylen, GFP_KERNEL);
+ AF_INET6, prefixlen, l3index,
+ cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
}
static int tcp_v6_md5_hash_headers(struct tcp_md5sig_pool *hp,
@@ -696,17 +730,23 @@ clear_hash_noput:
#endif
static bool tcp_v6_inbound_md5_hash(const struct sock *sk,
- const struct sk_buff *skb)
+ const struct sk_buff *skb,
+ int dif, int sdif)
{
#ifdef CONFIG_TCP_MD5SIG
const __u8 *hash_location = NULL;
struct tcp_md5sig_key *hash_expected;
const struct ipv6hdr *ip6h = ipv6_hdr(skb);
const struct tcphdr *th = tcp_hdr(skb);
- int genhash;
+ int genhash, l3index;
u8 newhash[16];
- hash_expected = tcp_v6_md5_do_lookup(sk, &ip6h->saddr);
+ /* sdif set, means packet ingressed via a device
+ * in an L3 domain and dif is set to the l3mdev
+ */
+ l3index = sdif ? dif : 0;
+
+ hash_expected = tcp_v6_md5_do_lookup(sk, &ip6h->saddr, l3index);
hash_location = tcp_parse_md5sig_option(th);
/* We've parsed the options - do we have a hash? */
@@ -730,10 +770,10 @@ static bool tcp_v6_inbound_md5_hash(const struct sock *sk,
if (genhash || memcmp(hash_location, newhash, 16) != 0) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
- net_info_ratelimited("MD5 Hash %s for [%pI6c]:%u->[%pI6c]:%u\n",
+ net_info_ratelimited("MD5 Hash %s for [%pI6c]:%u->[%pI6c]:%u L3 index %d\n",
genhash ? "failed" : "mismatch",
&ip6h->saddr, ntohs(th->source),
- &ip6h->daddr, ntohs(th->dest));
+ &ip6h->daddr, ntohs(th->dest), l3index);
return true;
}
#endif
@@ -783,7 +823,7 @@ struct request_sock_ops tcp6_request_sock_ops __read_mostly = {
.syn_ack_timeout = tcp_syn_ack_timeout,
};
-static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
+const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) -
sizeof(struct ipv6hdr),
#ifdef CONFIG_TCP_MD5SIG
@@ -803,7 +843,7 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 seq,
u32 ack, u32 win, u32 tsval, u32 tsecr,
int oif, struct tcp_md5sig_key *key, int rst,
- u8 tclass, __be32 label)
+ u8 tclass, __be32 label, u32 priority)
{
const struct tcphdr *th = tcp_hdr(skb);
struct tcphdr *t1;
@@ -904,10 +944,11 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
* Underlying function will use this to retrieve the network
* namespace
*/
- dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL);
+ dst = ip6_dst_lookup_flow(sock_net(ctl_sk), ctl_sk, &fl6, NULL);
if (!IS_ERR(dst)) {
skb_dst_set(buff, dst);
- ip6_xmit(ctl_sk, buff, &fl6, fl6.flowi6_mark, NULL, tclass);
+ ip6_xmit(ctl_sk, buff, &fl6, fl6.flowi6_mark, NULL, tclass,
+ priority);
TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
if (rst)
TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
@@ -930,6 +971,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
struct sock *sk1 = NULL;
#endif
__be32 label = 0;
+ u32 priority = 0;
struct net *net;
int oif = 0;
@@ -947,8 +989,18 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
rcu_read_lock();
hash_location = tcp_parse_md5sig_option(th);
if (sk && sk_fullsock(sk)) {
- key = tcp_v6_md5_do_lookup(sk, &ipv6h->saddr);
+ int l3index;
+
+ /* sdif set, means packet ingressed via a device
+ * in an L3 domain and inet_iif is set to it.
+ */
+ l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0;
+ key = tcp_v6_md5_do_lookup(sk, &ipv6h->saddr, l3index);
} else if (hash_location) {
+ int dif = tcp_v6_iif_l3_slave(skb);
+ int sdif = tcp_v6_sdif(skb);
+ int l3index;
+
/*
* active side is lost. Try to find listening socket through
* source port, and then find md5 key through listening socket.
@@ -960,13 +1012,16 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
&tcp_hashinfo, NULL, 0,
&ipv6h->saddr,
th->source, &ipv6h->daddr,
- ntohs(th->source),
- tcp_v6_iif_l3_slave(skb),
- tcp_v6_sdif(skb));
+ ntohs(th->source), dif, sdif);
if (!sk1)
goto out;
- key = tcp_v6_md5_do_lookup(sk1, &ipv6h->saddr);
+ /* sdif set, means packet ingressed via a device
+ * in an L3 domain and dif is set to it.
+ */
+ l3index = tcp_v6_sdif(skb) ? dif : 0;
+
+ key = tcp_v6_md5_do_lookup(sk1, &ipv6h->saddr, l3index);
if (!key)
goto out;
@@ -990,16 +1045,19 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
trace_tcp_send_reset(sk, skb);
if (np->repflow)
label = ip6_flowlabel(ipv6h);
+ priority = sk->sk_priority;
}
- if (sk->sk_state == TCP_TIME_WAIT)
+ if (sk->sk_state == TCP_TIME_WAIT) {
label = cpu_to_be32(inet_twsk(sk)->tw_flowlabel);
+ priority = inet_twsk(sk)->tw_priority;
+ }
} else {
if (net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_TCP_RESET)
label = ip6_flowlabel(ipv6h);
}
tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0,
- label);
+ label, priority);
#ifdef CONFIG_TCP_MD5SIG
out:
@@ -1010,10 +1068,10 @@ out:
static void tcp_v6_send_ack(const struct sock *sk, struct sk_buff *skb, u32 seq,
u32 ack, u32 win, u32 tsval, u32 tsecr, int oif,
struct tcp_md5sig_key *key, u8 tclass,
- __be32 label)
+ __be32 label, u32 priority)
{
tcp_v6_send_response(sk, skb, seq, ack, win, tsval, tsecr, oif, key, 0,
- tclass, label);
+ tclass, label, priority);
}
static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb)
@@ -1025,7 +1083,7 @@ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb)
tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
tcp_time_stamp_raw() + tcptw->tw_ts_offset,
tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw),
- tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel));
+ tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel), tw->tw_priority);
inet_twsk_put(tw);
}
@@ -1033,6 +1091,10 @@ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb)
static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
struct request_sock *req)
{
+ int l3index;
+
+ l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0;
+
/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
* sk->sk_state == TCP_SYN_RECV -> for Fast Open.
*/
@@ -1047,8 +1109,8 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
req->ts_recent, sk->sk_bound_dev_if,
- tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr),
- 0, 0);
+ tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr, l3index),
+ 0, 0, sk->sk_priority);
}
@@ -1063,6 +1125,21 @@ static struct sock *tcp_v6_cookie_check(struct sock *sk, struct sk_buff *skb)
return sk;
}
+u16 tcp_v6_get_syncookie(struct sock *sk, struct ipv6hdr *iph,
+ struct tcphdr *th, u32 *cookie)
+{
+ u16 mss = 0;
+#ifdef CONFIG_SYN_COOKIES
+ mss = tcp_get_syncookie_mss(&tcp6_request_sock_ops,
+ &tcp_request_sock_ipv6_ops, sk, th);
+ if (mss) {
+ *cookie = __cookie_v6_init_sequence(iph, th, &mss);
+ tcp_synq_overflow(sk);
+ }
+#endif
+ return mss;
+}
+
static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
{
if (skb->protocol == htons(ETH_P_IP))
@@ -1104,6 +1181,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
struct tcp_md5sig_key *key;
+ int l3index;
#endif
struct flowi6 fl6;
@@ -1129,6 +1207,8 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
newnp->saddr = newsk->sk_v6_rcv_saddr;
inet_csk(newsk)->icsk_af_ops = &ipv6_mapped;
+ if (sk_is_mptcp(newsk))
+ mptcpv6_handle_mapped(newsk, true);
newsk->sk_backlog_rcv = tcp_v4_do_rcv;
#ifdef CONFIG_TCP_MD5SIG
newtp->af_specific = &tcp_sock_ipv6_mapped_specific;
@@ -1247,8 +1327,10 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
newinet->inet_rcv_saddr = LOOPBACK4_IPV6;
#ifdef CONFIG_TCP_MD5SIG
+ l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
+
/* Copy over the MD5 key from the original socket */
- key = tcp_v6_md5_do_lookup(sk, &newsk->sk_v6_daddr);
+ key = tcp_v6_md5_do_lookup(sk, &newsk->sk_v6_daddr, l3index);
if (key) {
/* We're using one, so create a matching key
* on the newsk structure. If we fail to get
@@ -1256,7 +1338,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
* across. Shucks.
*/
tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newsk->sk_v6_daddr,
- AF_INET6, 128, key->key, key->keylen,
+ AF_INET6, 128, l3index, key->key, key->keylen,
sk_gfp_mask(sk, GFP_ATOMIC));
}
#endif
@@ -1458,6 +1540,7 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)
{
struct sk_buff *skb_to_free;
int sdif = inet6_sdif(skb);
+ int dif = inet6_iif(skb);
const struct tcphdr *th;
const struct ipv6hdr *hdr;
bool refcounted;
@@ -1506,7 +1589,7 @@ process:
struct sock *nsk;
sk = req->rsk_listener;
- if (tcp_v6_inbound_md5_hash(sk, skb)) {
+ if (tcp_v6_inbound_md5_hash(sk, skb, dif, sdif)) {
sk_drops_add(sk, skb);
reqsk_put(req);
goto discard_it;
@@ -1561,7 +1644,7 @@ process:
if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb))
goto discard_and_relse;
- if (tcp_v6_inbound_md5_hash(sk, skb))
+ if (tcp_v6_inbound_md5_hash(sk, skb, dif, sdif))
goto discard_and_relse;
if (tcp_filter(sk, skb))
@@ -1717,7 +1800,7 @@ static struct timewait_sock_ops tcp6_timewait_sock_ops = {
.twsk_destructor = tcp_twsk_destructor,
};
-static const struct inet_connection_sock_af_ops ipv6_specific = {
+const struct inet_connection_sock_af_ops ipv6_specific = {
.queue_xmit = inet6_csk_xmit,
.send_check = tcp_v6_send_check,
.rebuild_header = inet6_sk_rebuild_header,
@@ -1869,12 +1952,13 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
state = inet_sk_state_load(sp);
if (state == TCP_LISTEN)
- rx_queue = sp->sk_ack_backlog;
+ rx_queue = READ_ONCE(sp->sk_ack_backlog);
else
/* Because we don't lock the socket,
* we might find a transient negative value.
*/
- rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
+ rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
+ READ_ONCE(tp->copied_seq), 0);
seq_printf(seq,
"%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X "
@@ -1885,7 +1969,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
dest->s6_addr32[0], dest->s6_addr32[1],
dest->s6_addr32[2], dest->s6_addr32[3], destp,
state,
- tp->write_seq - tp->snd_una,
+ READ_ONCE(tp->write_seq) - tp->snd_una,
rx_queue,
timer_active,
jiffies_delta_to_clock_t(timer_expires - jiffies),
@@ -2085,9 +2169,16 @@ int __init tcpv6_init(void)
ret = register_pernet_subsys(&tcpv6_net_ops);
if (ret)
goto out_tcpv6_protosw;
+
+ ret = mptcpv6_init();
+ if (ret)
+ goto out_tcpv6_pernet_subsys;
+
out:
return ret;
+out_tcpv6_pernet_subsys:
+ unregister_pernet_subsys(&tcpv6_net_ops);
out_tcpv6_protosw:
inet6_unregister_protosw(&tcpv6_protosw);
out_tcpv6_protocol:
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 827fe7385078..5dc439a391fe 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -135,7 +135,7 @@ static int compute_score(struct sock *sk, struct net *net,
return -1;
score++;
- if (sk->sk_incoming_cpu == raw_smp_processor_id())
+ if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id())
score++;
return score;
@@ -158,13 +158,14 @@ static struct sock *udp6_lib_lookup2(struct net *net,
score = compute_score(sk, net, saddr, sport,
daddr, hnum, dif, sdif);
if (score > badness) {
- if (sk->sk_reuseport) {
+ if (sk->sk_reuseport &&
+ sk->sk_state != TCP_ESTABLISHED) {
hash = udp6_ehashfn(net, daddr, hnum,
saddr, sport);
result = reuseport_select_sock(sk, hash, skb,
sizeof(struct udphdr));
- if (result)
+ if (result && !reuseport_has_conns(sk, false))
return result;
}
result = sk;
@@ -689,8 +690,7 @@ static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
__skb_push(skb, -skb_mac_offset(skb));
segs = udp_rcv_segment(sk, skb, false);
- for (skb = segs; skb; skb = next) {
- next = skb->next;
+ skb_list_walk_safe(segs, skb, next) {
__skb_pull(skb, skb_transport_offset(skb));
ret = udpv6_queue_rcv_one_skb(sk, skb);
@@ -1108,6 +1108,7 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6,
__wsum csum = 0;
int offset = skb_transport_offset(skb);
int len = skb->len - offset;
+ int datalen = len - sizeof(*uh);
/*
* Create a UDP header
@@ -1140,8 +1141,12 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6,
return -EIO;
}
- skb_shinfo(skb)->gso_size = cork->gso_size;
- skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
+ if (datalen > cork->gso_size) {
+ skb_shinfo(skb)->gso_size = cork->gso_size;
+ skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
+ skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(datalen,
+ cork->gso_size);
+ }
goto csum_partial;
}
@@ -1230,6 +1235,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
ipcm6_init(&ipc6);
ipc6.gso_size = up->gso_size;
ipc6.sockc.tsflags = sk->sk_tsflags;
+ ipc6.sockc.mark = sk->sk_mark;
/* destination address check */
if (sin6) {
@@ -1352,7 +1358,7 @@ do_udp_sendmsg:
if (!fl6.flowi6_oif)
fl6.flowi6_oif = np->sticky_pktinfo.ipi6_ifindex;
- fl6.flowi6_mark = sk->sk_mark;
+ fl6.flowi6_mark = ipc6.sockc.mark;
fl6.flowi6_uid = sk->sk_uid;
if (msg->msg_controllen) {
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 64b8f05d6735..584157a07759 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -115,8 +115,10 @@ INDIRECT_CALLABLE_SCOPE
struct sk_buff *udp6_gro_receive(struct list_head *head, struct sk_buff *skb)
{
struct udphdr *uh = udp_gro_udphdr(skb);
+ struct sk_buff *pp;
+ struct sock *sk;
- if (unlikely(!uh) || !static_branch_unlikely(&udpv6_encap_needed_key))
+ if (unlikely(!uh))
goto flush;
/* Don't bother verifying checksum if we're going to flush anyway. */
@@ -127,12 +129,16 @@ struct sk_buff *udp6_gro_receive(struct list_head *head, struct sk_buff *skb)
ip6_gro_compute_pseudo))
goto flush;
else if (uh->check)
- skb_gro_checksum_try_convert(skb, IPPROTO_UDP, uh->check,
+ skb_gro_checksum_try_convert(skb, IPPROTO_UDP,
ip6_gro_compute_pseudo);
skip:
NAPI_GRO_CB(skb)->is_ipv6 = 1;
- return udp_gro_receive(head, skb, uh, udp6_lib_lookup_skb);
+ rcu_read_lock();
+ sk = static_branch_unlikely(&udpv6_encap_needed_key) ? udp6_lib_lookup_skb(skb, uh->source, uh->dest) : NULL;
+ pp = udp_gro_receive(head, skb, uh, sk);
+ rcu_read_unlock();
+ return pp;
flush:
NAPI_GRO_CB(skb)->flush = 1;
@@ -144,6 +150,23 @@ INDIRECT_CALLABLE_SCOPE int udp6_gro_complete(struct sk_buff *skb, int nhoff)
const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
+ if (NAPI_GRO_CB(skb)->is_flist) {
+ uh->len = htons(skb->len - nhoff);
+
+ skb_shinfo(skb)->gso_type |= (SKB_GSO_FRAGLIST|SKB_GSO_UDP_L4);
+ skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
+
+ if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
+ if (skb->csum_level < SKB_MAX_CSUM_LEVEL)
+ skb->csum_level++;
+ } else {
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ skb->csum_level = 0;
+ }
+
+ return 0;
+ }
+
if (uh->check)
uh->check = ~udp_v6_check(skb->len - nhoff, &ipv6h->saddr,
&ipv6h->daddr, 0);
diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c
index eecac1b7148e..fbe51d40bd7e 100644
--- a/net/ipv6/xfrm6_output.c
+++ b/net/ipv6/xfrm6_output.c
@@ -187,7 +187,7 @@ skip_frag:
int xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
- net, sk, skb, NULL, skb_dst(skb)->dev,
+ net, sk, skb, skb->dev, skb_dst(skb)->dev,
__xfrm6_output,
!(IP6CB(skb)->flags & IP6SKB_REROUTED));
}
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index 699e0730ce8e..af7a4b8b1e9c 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -98,12 +98,13 @@ static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
}
static void xfrm6_update_pmtu(struct dst_entry *dst, struct sock *sk,
- struct sk_buff *skb, u32 mtu)
+ struct sk_buff *skb, u32 mtu,
+ bool confirm_neigh)
{
struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
struct dst_entry *path = xdst->route;
- path->ops->update_pmtu(path, sk, skb, mtu);
+ path->ops->update_pmtu(path, sk, skb, mtu, confirm_neigh);
}
static void xfrm6_redirect(struct dst_entry *dst, struct sock *sk,
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index ebb62a4ebe30..c4bdcbc84b07 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -50,7 +50,7 @@ static struct iucv_interface *pr_iucv;
static const u8 iprm_shutdown[8] =
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01};
-#define TRGCLS_SIZE FIELD_SIZEOF(struct iucv_message, class)
+#define TRGCLS_SIZE sizeof_field(struct iucv_message, class)
#define __iucv_sock_wait(sk, condition, timeo, ret) \
do { \
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index 5dbc0c48f8cb..ea9e73428ed9 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -378,8 +378,12 @@ static int kcm_parse_func_strparser(struct strparser *strp, struct sk_buff *skb)
{
struct kcm_psock *psock = container_of(strp, struct kcm_psock, strp);
struct bpf_prog *prog = psock->bpf_prog;
+ int res;
- return (*prog->bpf_func)(skb, prog->insnsi);
+ preempt_disable();
+ res = BPF_PROG_RUN(prog, skb);
+ preempt_enable();
+ return res;
}
static int kcm_read_sock_done(struct strparser *strp, int err)
@@ -635,15 +639,15 @@ do_frag_list:
frag_offset = 0;
do_frag:
frag = &skb_shinfo(skb)->frags[fragidx];
- if (WARN_ON(!frag->size)) {
+ if (WARN_ON(!skb_frag_size(frag))) {
ret = -EINVAL;
goto out;
}
ret = kernel_sendpage(psock->sk->sk_socket,
- frag->page.p,
- frag->page_offset + frag_offset,
- frag->size - frag_offset,
+ skb_frag_page(frag),
+ skb_frag_off(frag) + frag_offset,
+ skb_frag_size(frag) - frag_offset,
MSG_DONTWAIT);
if (ret <= 0) {
if (ret == -EAGAIN) {
@@ -678,7 +682,7 @@ do_frag:
sent += ret;
frag_offset += ret;
KCM_STATS_ADD(psock->stats.tx_bytes, ret);
- if (frag_offset < frag->size) {
+ if (frag_offset < skb_frag_size(frag)) {
/* Not finished with this frag */
goto do_frag;
}
diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index 105e5a7092e7..fcb53ed1c4fb 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -122,8 +122,6 @@ static inline struct l2tp_tunnel *l2tp_tunnel(struct sock *sk)
static inline struct l2tp_net *l2tp_pernet(const struct net *net)
{
- BUG_ON(!net);
-
return net_generic(net, l2tp_net_id);
}
@@ -322,8 +320,13 @@ int l2tp_session_register(struct l2tp_session *session,
spin_lock_bh(&pn->l2tp_session_hlist_lock);
+ /* IP encap expects session IDs to be globally unique, while
+ * UDP encap doesn't.
+ */
hlist_for_each_entry(session_walk, g_head, global_hlist)
- if (session_walk->session_id == session->session_id) {
+ if (session_walk->session_id == session->session_id &&
+ (session_walk->tunnel->encap == L2TP_ENCAPTYPE_IP ||
+ tunnel->encap == L2TP_ENCAPTYPE_IP)) {
err = -EEXIST;
goto err_tlock_pnlock;
}
@@ -1078,7 +1081,7 @@ int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len
memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
IPSKB_REROUTED);
- nf_reset(skb);
+ nf_reset_ct(skb);
bh_lock_sock(sk);
if (sock_owned_by_user(sk)) {
diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c
index bd3f39349d40..d3b520b9b2c9 100644
--- a/net/l2tp/l2tp_eth.c
+++ b/net/l2tp/l2tp_eth.c
@@ -56,7 +56,6 @@ static int l2tp_eth_dev_init(struct net_device *dev)
{
eth_hw_addr_random(dev);
eth_broadcast_addr(dev->broadcast);
- netdev_lockdep_set_classes(dev);
return 0;
}
@@ -151,7 +150,7 @@ static void l2tp_eth_dev_recv(struct l2tp_session *session, struct sk_buff *skb,
skb->ip_summed = CHECKSUM_NONE;
skb_dst_drop(skb);
- nf_reset(skb);
+ nf_reset_ct(skb);
rcu_read_lock();
dev = rcu_dereference(spriv->dev);
diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
index 622833317dcb..0d7c887a2b75 100644
--- a/net/l2tp/l2tp_ip.c
+++ b/net/l2tp/l2tp_ip.c
@@ -193,7 +193,7 @@ pass_up:
if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
goto discard_put;
- nf_reset(skb);
+ nf_reset_ct(skb);
return sk_receive_skb(sk, skb, 1);
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index 687e23a8b326..d148766f40d1 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -206,7 +206,7 @@ pass_up:
if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb))
goto discard_put;
- nf_reset(skb);
+ nf_reset_ct(skb);
return sk_receive_skb(sk, skb, 1);
@@ -615,7 +615,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel);
- dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
+ dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p);
if (IS_ERR(dst)) {
err = PTR_ERR(dst);
goto out;
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 2017b7d780f5..2922d4150d88 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -113,22 +113,26 @@ static inline u8 llc_ui_header_len(struct sock *sk, struct sockaddr_llc *addr)
*
* Send data via reliable llc2 connection.
* Returns 0 upon success, non-zero if action did not succeed.
+ *
+ * This function always consumes a reference to the skb.
*/
static int llc_ui_send_data(struct sock* sk, struct sk_buff *skb, int noblock)
{
struct llc_sock* llc = llc_sk(sk);
- int rc = 0;
if (unlikely(llc_data_accept_state(llc->state) ||
llc->remote_busy_flag ||
llc->p_flag)) {
long timeout = sock_sndtimeo(sk, noblock);
+ int rc;
rc = llc_ui_wait_for_busy_core(sk, timeout);
+ if (rc) {
+ kfree_skb(skb);
+ return rc;
+ }
}
- if (unlikely(!rc))
- rc = llc_build_and_send_pkt(sk, skb);
- return rc;
+ return llc_build_and_send_pkt(sk, skb);
}
static void llc_ui_sk_init(struct socket *sock, struct sock *sk)
@@ -701,7 +705,7 @@ static int llc_ui_accept(struct socket *sock, struct socket *newsock, int flags,
/* put original socket back into a clean listen state. */
sk->sk_state = TCP_LISTEN;
- sk->sk_ack_backlog--;
+ sk_acceptq_removed(sk);
dprintk("%s: ok success on %02X, client on %02X\n", __func__,
llc_sk(sk)->addr.sllc_sap, newllc->daddr.lsap);
frees:
@@ -776,7 +780,7 @@ static int llc_ui_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
}
/* Well, if we have backlog, try to process it now yet. */
- if (copied >= target && !sk->sk_backlog.tail)
+ if (copied >= target && !READ_ONCE(sk->sk_backlog.tail))
break;
if (copied) {
@@ -899,7 +903,7 @@ static int llc_ui_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
DECLARE_SOCKADDR(struct sockaddr_llc *, addr, msg->msg_name);
int flags = msg->msg_flags;
int noblock = flags & MSG_DONTWAIT;
- struct sk_buff *skb;
+ struct sk_buff *skb = NULL;
size_t size = 0;
int rc = -EINVAL, copied = 0, hdrlen;
@@ -908,10 +912,10 @@ static int llc_ui_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
lock_sock(sk);
if (addr) {
if (msg->msg_namelen < sizeof(*addr))
- goto release;
+ goto out;
} else {
if (llc_ui_addr_null(&llc->addr))
- goto release;
+ goto out;
addr = &llc->addr;
}
/* must bind connection to sap if user hasn't done it. */
@@ -919,7 +923,7 @@ static int llc_ui_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
/* bind to sap with null dev, exclusive. */
rc = llc_ui_autobind(sock, addr);
if (rc)
- goto release;
+ goto out;
}
hdrlen = llc->dev->hard_header_len + llc_ui_header_len(sk, addr);
size = hdrlen + len;
@@ -928,12 +932,12 @@ static int llc_ui_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
copied = size - hdrlen;
rc = -EINVAL;
if (copied < 0)
- goto release;
+ goto out;
release_sock(sk);
skb = sock_alloc_send_skb(sk, size, noblock, &rc);
lock_sock(sk);
if (!skb)
- goto release;
+ goto out;
skb->dev = llc->dev;
skb->protocol = llc_proto_type(addr->sllc_arphrd);
skb_reserve(skb, hdrlen);
@@ -943,29 +947,31 @@ static int llc_ui_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
if (sk->sk_type == SOCK_DGRAM || addr->sllc_ua) {
llc_build_and_send_ui_pkt(llc->sap, skb, addr->sllc_mac,
addr->sllc_sap);
+ skb = NULL;
goto out;
}
if (addr->sllc_test) {
llc_build_and_send_test_pkt(llc->sap, skb, addr->sllc_mac,
addr->sllc_sap);
+ skb = NULL;
goto out;
}
if (addr->sllc_xid) {
llc_build_and_send_xid_pkt(llc->sap, skb, addr->sllc_mac,
addr->sllc_sap);
+ skb = NULL;
goto out;
}
rc = -ENOPROTOOPT;
if (!(sk->sk_type == SOCK_STREAM && !addr->sllc_ua))
goto out;
rc = llc_ui_send_data(sk, skb, noblock);
+ skb = NULL;
out:
- if (rc) {
- kfree_skb(skb);
-release:
+ kfree_skb(skb);
+ if (rc)
dprintk("%s: failed sending from %02X to %02X: %d\n",
__func__, llc->laddr.lsap, llc->daddr.lsap, rc);
- }
release_sock(sk);
return rc ? : copied;
}
diff --git a/net/llc/llc_c_ac.c b/net/llc/llc_c_ac.c
index 4d78375f9872..647c0554d04c 100644
--- a/net/llc/llc_c_ac.c
+++ b/net/llc/llc_c_ac.c
@@ -372,6 +372,7 @@ int llc_conn_ac_send_i_cmd_p_set_1(struct sock *sk, struct sk_buff *skb)
llc_pdu_init_as_i_cmd(skb, 1, llc->vS, llc->vR);
rc = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac);
if (likely(!rc)) {
+ skb_get(skb);
llc_conn_send_pdu(sk, skb);
llc_conn_ac_inc_vs_by_1(sk, skb);
}
@@ -389,7 +390,8 @@ static int llc_conn_ac_send_i_cmd_p_set_0(struct sock *sk, struct sk_buff *skb)
llc_pdu_init_as_i_cmd(skb, 0, llc->vS, llc->vR);
rc = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac);
if (likely(!rc)) {
- rc = llc_conn_send_pdu(sk, skb);
+ skb_get(skb);
+ llc_conn_send_pdu(sk, skb);
llc_conn_ac_inc_vs_by_1(sk, skb);
}
return rc;
@@ -406,6 +408,7 @@ int llc_conn_ac_send_i_xxx_x_set_0(struct sock *sk, struct sk_buff *skb)
llc_pdu_init_as_i_cmd(skb, 0, llc->vS, llc->vR);
rc = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac);
if (likely(!rc)) {
+ skb_get(skb);
llc_conn_send_pdu(sk, skb);
llc_conn_ac_inc_vs_by_1(sk, skb);
}
@@ -916,7 +919,8 @@ static int llc_conn_ac_send_i_rsp_f_set_ackpf(struct sock *sk,
llc_pdu_init_as_i_cmd(skb, llc->ack_pf, llc->vS, llc->vR);
rc = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac);
if (likely(!rc)) {
- rc = llc_conn_send_pdu(sk, skb);
+ skb_get(skb);
+ llc_conn_send_pdu(sk, skb);
llc_conn_ac_inc_vs_by_1(sk, skb);
}
return rc;
diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c
index 4ff89cb7c86f..7b620acaca9e 100644
--- a/net/llc/llc_conn.c
+++ b/net/llc/llc_conn.c
@@ -30,7 +30,7 @@
#endif
static int llc_find_offset(int state, int ev_type);
-static int llc_conn_send_pdus(struct sock *sk, struct sk_buff *skb);
+static void llc_conn_send_pdus(struct sock *sk);
static int llc_conn_service(struct sock *sk, struct sk_buff *skb);
static int llc_exec_conn_trans_actions(struct sock *sk,
struct llc_conn_state_trans *trans,
@@ -55,6 +55,8 @@ int sysctl_llc2_busy_timeout = LLC2_BUSY_TIME * HZ;
* (executing it's actions and changing state), upper layer will be
* indicated or confirmed, if needed. Returns 0 for success, 1 for
* failure. The socket lock has to be held before calling this function.
+ *
+ * This function always consumes a reference to the skb.
*/
int llc_conn_state_process(struct sock *sk, struct sk_buff *skb)
{
@@ -62,12 +64,6 @@ int llc_conn_state_process(struct sock *sk, struct sk_buff *skb)
struct llc_sock *llc = llc_sk(skb->sk);
struct llc_conn_state_ev *ev = llc_conn_ev(skb);
- /*
- * We have to hold the skb, because llc_conn_service will kfree it in
- * the sending path and we need to look at the skb->cb, where we encode
- * llc_conn_state_ev.
- */
- skb_get(skb);
ev->ind_prim = ev->cfm_prim = 0;
/*
* Send event to state machine
@@ -75,21 +71,12 @@ int llc_conn_state_process(struct sock *sk, struct sk_buff *skb)
rc = llc_conn_service(skb->sk, skb);
if (unlikely(rc != 0)) {
printk(KERN_ERR "%s: llc_conn_service failed\n", __func__);
- goto out_kfree_skb;
- }
-
- if (unlikely(!ev->ind_prim && !ev->cfm_prim)) {
- /* indicate or confirm not required */
- if (!skb->next)
- goto out_kfree_skb;
goto out_skb_put;
}
- if (unlikely(ev->ind_prim && ev->cfm_prim)) /* Paranoia */
- skb_get(skb);
-
switch (ev->ind_prim) {
case LLC_DATA_PRIM:
+ skb_get(skb);
llc_save_primitive(sk, skb, LLC_DATA_PRIM);
if (unlikely(sock_queue_rcv_skb(sk, skb))) {
/*
@@ -106,6 +93,7 @@ int llc_conn_state_process(struct sock *sk, struct sk_buff *skb)
* skb->sk pointing to the newly created struct sock in
* llc_conn_handler. -acme
*/
+ skb_get(skb);
skb_queue_tail(&sk->sk_receive_queue, skb);
sk->sk_state_change(sk);
break;
@@ -121,7 +109,6 @@ int llc_conn_state_process(struct sock *sk, struct sk_buff *skb)
sk->sk_state_change(sk);
}
}
- kfree_skb(skb);
sock_put(sk);
break;
case LLC_RESET_PRIM:
@@ -130,14 +117,11 @@ int llc_conn_state_process(struct sock *sk, struct sk_buff *skb)
* RESET is not being notified to upper layers for now
*/
printk(KERN_INFO "%s: received a reset ind!\n", __func__);
- kfree_skb(skb);
break;
default:
- if (ev->ind_prim) {
+ if (ev->ind_prim)
printk(KERN_INFO "%s: received unknown %d prim!\n",
__func__, ev->ind_prim);
- kfree_skb(skb);
- }
/* No indication */
break;
}
@@ -179,25 +163,22 @@ int llc_conn_state_process(struct sock *sk, struct sk_buff *skb)
printk(KERN_INFO "%s: received a reset conf!\n", __func__);
break;
default:
- if (ev->cfm_prim) {
+ if (ev->cfm_prim)
printk(KERN_INFO "%s: received unknown %d prim!\n",
__func__, ev->cfm_prim);
- break;
- }
- goto out_skb_put; /* No confirmation */
+ /* No confirmation */
+ break;
}
-out_kfree_skb:
- kfree_skb(skb);
out_skb_put:
kfree_skb(skb);
return rc;
}
-int llc_conn_send_pdu(struct sock *sk, struct sk_buff *skb)
+void llc_conn_send_pdu(struct sock *sk, struct sk_buff *skb)
{
/* queue PDU to send to MAC layer */
skb_queue_tail(&sk->sk_write_queue, skb);
- return llc_conn_send_pdus(sk, skb);
+ llc_conn_send_pdus(sk);
}
/**
@@ -255,7 +236,7 @@ void llc_conn_resend_i_pdu_as_cmd(struct sock *sk, u8 nr, u8 first_p_bit)
if (howmany_resend > 0)
llc->vS = (llc->vS + 1) % LLC_2_SEQ_NBR_MODULO;
/* any PDUs to re-send are queued up; start sending to MAC */
- llc_conn_send_pdus(sk, NULL);
+ llc_conn_send_pdus(sk);
out:;
}
@@ -296,7 +277,7 @@ void llc_conn_resend_i_pdu_as_rsp(struct sock *sk, u8 nr, u8 first_f_bit)
if (howmany_resend > 0)
llc->vS = (llc->vS + 1) % LLC_2_SEQ_NBR_MODULO;
/* any PDUs to re-send are queued up; start sending to MAC */
- llc_conn_send_pdus(sk, NULL);
+ llc_conn_send_pdus(sk);
out:;
}
@@ -340,16 +321,12 @@ out:
/**
* llc_conn_send_pdus - Sends queued PDUs
* @sk: active connection
- * @hold_skb: the skb held by caller, or NULL if does not care
*
- * Sends queued pdus to MAC layer for transmission. When @hold_skb is
- * NULL, always return 0. Otherwise, return 0 if @hold_skb is sent
- * successfully, or 1 for failure.
+ * Sends queued pdus to MAC layer for transmission.
*/
-static int llc_conn_send_pdus(struct sock *sk, struct sk_buff *hold_skb)
+static void llc_conn_send_pdus(struct sock *sk)
{
struct sk_buff *skb;
- int ret = 0;
while ((skb = skb_dequeue(&sk->sk_write_queue)) != NULL) {
struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
@@ -361,20 +338,10 @@ static int llc_conn_send_pdus(struct sock *sk, struct sk_buff *hold_skb)
skb_queue_tail(&llc_sk(sk)->pdu_unack_q, skb);
if (!skb2)
break;
- dev_queue_xmit(skb2);
- } else {
- bool is_target = skb == hold_skb;
- int rc;
-
- if (is_target)
- skb_get(skb);
- rc = dev_queue_xmit(skb);
- if (is_target)
- ret = rc;
+ skb = skb2;
}
+ dev_queue_xmit(skb);
}
-
- return ret;
}
/**
@@ -846,7 +813,7 @@ void llc_conn_handler(struct llc_sap *sap, struct sk_buff *skb)
else {
dprintk("%s: adding to backlog...\n", __func__);
llc_set_backlog_type(skb, LLC_PACKET);
- if (sk_add_backlog(sk, skb, sk->sk_rcvbuf))
+ if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf)))
goto drop_unlock;
}
out:
diff --git a/net/llc/llc_if.c b/net/llc/llc_if.c
index 8db03c2d5440..ad6547736c21 100644
--- a/net/llc/llc_if.c
+++ b/net/llc/llc_if.c
@@ -38,6 +38,8 @@
* closed and -EBUSY when sending data is not permitted in this state or
* LLC has send an I pdu with p bit set to 1 and is waiting for it's
* response.
+ *
+ * This function always consumes a reference to the skb.
*/
int llc_build_and_send_pkt(struct sock *sk, struct sk_buff *skb)
{
@@ -46,20 +48,22 @@ int llc_build_and_send_pkt(struct sock *sk, struct sk_buff *skb)
struct llc_sock *llc = llc_sk(sk);
if (unlikely(llc->state == LLC_CONN_STATE_ADM))
- goto out;
+ goto out_free;
rc = -EBUSY;
if (unlikely(llc_data_accept_state(llc->state) || /* data_conn_refuse */
llc->p_flag)) {
llc->failed_data_req = 1;
- goto out;
+ goto out_free;
}
ev = llc_conn_ev(skb);
ev->type = LLC_CONN_EV_TYPE_PRIM;
ev->prim = LLC_DATA_PRIM;
ev->prim_type = LLC_PRIM_TYPE_REQ;
skb->dev = llc->dev;
- rc = llc_conn_state_process(sk, skb);
-out:
+ return llc_conn_state_process(sk, skb);
+
+out_free:
+ kfree_skb(skb);
return rc;
}
diff --git a/net/llc/llc_s_ac.c b/net/llc/llc_s_ac.c
index a94bd56bcac6..7ae4cc684d3a 100644
--- a/net/llc/llc_s_ac.c
+++ b/net/llc/llc_s_ac.c
@@ -58,8 +58,10 @@ int llc_sap_action_send_ui(struct llc_sap *sap, struct sk_buff *skb)
ev->daddr.lsap, LLC_PDU_CMD);
llc_pdu_init_as_ui_cmd(skb);
rc = llc_mac_hdr_init(skb, ev->saddr.mac, ev->daddr.mac);
- if (likely(!rc))
+ if (likely(!rc)) {
+ skb_get(skb);
rc = dev_queue_xmit(skb);
+ }
return rc;
}
@@ -81,8 +83,10 @@ int llc_sap_action_send_xid_c(struct llc_sap *sap, struct sk_buff *skb)
ev->daddr.lsap, LLC_PDU_CMD);
llc_pdu_init_as_xid_cmd(skb, LLC_XID_NULL_CLASS_2, 0);
rc = llc_mac_hdr_init(skb, ev->saddr.mac, ev->daddr.mac);
- if (likely(!rc))
+ if (likely(!rc)) {
+ skb_get(skb);
rc = dev_queue_xmit(skb);
+ }
return rc;
}
@@ -135,8 +139,10 @@ int llc_sap_action_send_test_c(struct llc_sap *sap, struct sk_buff *skb)
ev->daddr.lsap, LLC_PDU_CMD);
llc_pdu_init_as_test_cmd(skb);
rc = llc_mac_hdr_init(skb, ev->saddr.mac, ev->daddr.mac);
- if (likely(!rc))
+ if (likely(!rc)) {
+ skb_get(skb);
rc = dev_queue_xmit(skb);
+ }
return rc;
}
diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c
index a7f7b8ff4729..be419062e19a 100644
--- a/net/llc/llc_sap.c
+++ b/net/llc/llc_sap.c
@@ -197,29 +197,22 @@ out:
* After executing actions of the event, upper layer will be indicated
* if needed(on receiving an UI frame). sk can be null for the
* datalink_proto case.
+ *
+ * This function always consumes a reference to the skb.
*/
static void llc_sap_state_process(struct llc_sap *sap, struct sk_buff *skb)
{
struct llc_sap_state_ev *ev = llc_sap_ev(skb);
- /*
- * We have to hold the skb, because llc_sap_next_state
- * will kfree it in the sending path and we need to
- * look at the skb->cb, where we encode llc_sap_state_ev.
- */
- skb_get(skb);
ev->ind_cfm_flag = 0;
llc_sap_next_state(sap, skb);
- if (ev->ind_cfm_flag == LLC_IND) {
- if (skb->sk->sk_state == TCP_LISTEN)
- kfree_skb(skb);
- else {
- llc_save_primitive(skb->sk, skb, ev->prim);
- /* queue skb to the user. */
- if (sock_queue_rcv_skb(skb->sk, skb))
- kfree_skb(skb);
- }
+ if (ev->ind_cfm_flag == LLC_IND && skb->sk->sk_state != TCP_LISTEN) {
+ llc_save_primitive(skb->sk, skb, ev->prim);
+
+ /* queue skb to the user. */
+ if (sock_queue_rcv_skb(skb->sk, skb) == 0)
+ return;
}
kfree_skb(skb);
}
diff --git a/net/llc/llc_station.c b/net/llc/llc_station.c
index 204a8351efff..c29170e767a8 100644
--- a/net/llc/llc_station.c
+++ b/net/llc/llc_station.c
@@ -32,7 +32,7 @@ static int llc_stat_ev_rx_null_dsap_xid_c(struct sk_buff *skb)
return LLC_PDU_IS_CMD(pdu) && /* command PDU */
LLC_PDU_TYPE_IS_U(pdu) && /* U type PDU */
LLC_U_PDU_CMD(pdu) == LLC_1_PDU_CMD_XID &&
- !pdu->dsap ? 0 : 1; /* NULL DSAP value */
+ !pdu->dsap; /* NULL DSAP value */
}
static int llc_stat_ev_rx_null_dsap_test_c(struct sk_buff *skb)
@@ -42,7 +42,7 @@ static int llc_stat_ev_rx_null_dsap_test_c(struct sk_buff *skb)
return LLC_PDU_IS_CMD(pdu) && /* command PDU */
LLC_PDU_TYPE_IS_U(pdu) && /* U type PDU */
LLC_U_PDU_CMD(pdu) == LLC_1_PDU_CMD_TEST &&
- !pdu->dsap ? 0 : 1; /* NULL DSAP */
+ !pdu->dsap; /* NULL DSAP */
}
static int llc_station_ac_send_xid_r(struct sk_buff *skb)
diff --git a/net/mac80211/Makefile b/net/mac80211/Makefile
index 4f03ebe732fa..6cbb1286d6c0 100644
--- a/net/mac80211/Makefile
+++ b/net/mac80211/Makefile
@@ -32,7 +32,8 @@ mac80211-y := \
chan.o \
trace.o mlme.o \
tdls.o \
- ocb.o
+ ocb.o \
+ airtime.o
mac80211-$(CONFIG_MAC80211_LEDS) += led.o
mac80211-$(CONFIG_MAC80211_DEBUGFS) += \
diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c
index 01b0dad24500..4d1c335e06e5 100644
--- a/net/mac80211/agg-rx.c
+++ b/net/mac80211/agg-rx.c
@@ -178,17 +178,54 @@ static void sta_rx_agg_reorder_timer_expired(struct timer_list *t)
rcu_read_unlock();
}
-static void ieee80211_send_addba_resp(struct ieee80211_sub_if_data *sdata, u8 *da, u16 tid,
+static void ieee80211_add_addbaext(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb,
+ const struct ieee80211_addba_ext_ie *req)
+{
+ struct ieee80211_supported_band *sband;
+ struct ieee80211_addba_ext_ie *resp;
+ const struct ieee80211_sta_he_cap *he_cap;
+ u8 frag_level, cap_frag_level;
+ u8 *pos;
+
+ sband = ieee80211_get_sband(sdata);
+ if (!sband)
+ return;
+ he_cap = ieee80211_get_he_iftype_cap(sband, sdata->vif.type);
+ if (!he_cap)
+ return;
+
+ pos = skb_put_zero(skb, 2 + sizeof(struct ieee80211_addba_ext_ie));
+ *pos++ = WLAN_EID_ADDBA_EXT;
+ *pos++ = sizeof(struct ieee80211_addba_ext_ie);
+ resp = (struct ieee80211_addba_ext_ie *)pos;
+ resp->data = req->data & IEEE80211_ADDBA_EXT_NO_FRAG;
+
+ frag_level = u32_get_bits(req->data,
+ IEEE80211_ADDBA_EXT_FRAG_LEVEL_MASK);
+ cap_frag_level = u32_get_bits(he_cap->he_cap_elem.mac_cap_info[0],
+ IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_MASK);
+ if (frag_level > cap_frag_level)
+ frag_level = cap_frag_level;
+ resp->data |= u8_encode_bits(frag_level,
+ IEEE80211_ADDBA_EXT_FRAG_LEVEL_MASK);
+}
+
+static void ieee80211_send_addba_resp(struct sta_info *sta, u8 *da, u16 tid,
u8 dialog_token, u16 status, u16 policy,
- u16 buf_size, u16 timeout)
+ u16 buf_size, u16 timeout,
+ const struct ieee80211_addba_ext_ie *addbaext)
{
+ struct ieee80211_sub_if_data *sdata = sta->sdata;
struct ieee80211_local *local = sdata->local;
struct sk_buff *skb;
struct ieee80211_mgmt *mgmt;
bool amsdu = ieee80211_hw_check(&local->hw, SUPPORTS_AMSDU_IN_AMPDU);
u16 capab;
- skb = dev_alloc_skb(sizeof(*mgmt) + local->hw.extra_tx_headroom);
+ skb = dev_alloc_skb(sizeof(*mgmt) +
+ 2 + sizeof(struct ieee80211_addba_ext_ie) +
+ local->hw.extra_tx_headroom);
if (!skb)
return;
@@ -222,13 +259,17 @@ static void ieee80211_send_addba_resp(struct ieee80211_sub_if_data *sdata, u8 *d
mgmt->u.action.u.addba_resp.timeout = cpu_to_le16(timeout);
mgmt->u.action.u.addba_resp.status = cpu_to_le16(status);
+ if (sta->sta.he_cap.has_he && addbaext)
+ ieee80211_add_addbaext(sdata, skb, addbaext);
+
ieee80211_tx_skb(sdata, skb);
}
void ___ieee80211_start_rx_ba_session(struct sta_info *sta,
u8 dialog_token, u16 timeout,
u16 start_seq_num, u16 ba_policy, u16 tid,
- u16 buf_size, bool tx, bool auto_seq)
+ u16 buf_size, bool tx, bool auto_seq,
+ const struct ieee80211_addba_ext_ie *addbaext)
{
struct ieee80211_local *local = sta->sdata->local;
struct tid_ampdu_rx *tid_agg_rx;
@@ -410,21 +451,22 @@ end:
}
if (tx)
- ieee80211_send_addba_resp(sta->sdata, sta->sta.addr, tid,
+ ieee80211_send_addba_resp(sta, sta->sta.addr, tid,
dialog_token, status, 1, buf_size,
- timeout);
+ timeout, addbaext);
}
static void __ieee80211_start_rx_ba_session(struct sta_info *sta,
u8 dialog_token, u16 timeout,
u16 start_seq_num, u16 ba_policy,
u16 tid, u16 buf_size, bool tx,
- bool auto_seq)
+ bool auto_seq,
+ const struct ieee80211_addba_ext_ie *addbaext)
{
mutex_lock(&sta->ampdu_mlme.mtx);
___ieee80211_start_rx_ba_session(sta, dialog_token, timeout,
start_seq_num, ba_policy, tid,
- buf_size, tx, auto_seq);
+ buf_size, tx, auto_seq, addbaext);
mutex_unlock(&sta->ampdu_mlme.mtx);
}
@@ -434,7 +476,9 @@ void ieee80211_process_addba_request(struct ieee80211_local *local,
size_t len)
{
u16 capab, tid, timeout, ba_policy, buf_size, start_seq_num;
+ struct ieee802_11_elems elems = { 0 };
u8 dialog_token;
+ int ies_len;
/* extract session parameters from addba request frame */
dialog_token = mgmt->u.action.u.addba_req.dialog_token;
@@ -447,9 +491,19 @@ void ieee80211_process_addba_request(struct ieee80211_local *local,
tid = (capab & IEEE80211_ADDBA_PARAM_TID_MASK) >> 2;
buf_size = (capab & IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK) >> 6;
+ ies_len = len - offsetof(struct ieee80211_mgmt,
+ u.action.u.addba_req.variable);
+ if (ies_len) {
+ ieee802_11_parse_elems(mgmt->u.action.u.addba_req.variable,
+ ies_len, true, &elems, mgmt->bssid, NULL);
+ if (elems.parse_error)
+ return;
+ }
+
__ieee80211_start_rx_ba_session(sta, dialog_token, timeout,
start_seq_num, ba_policy, tid,
- buf_size, true, false);
+ buf_size, true, false,
+ elems.addba_ext_ie);
}
void ieee80211_manage_rx_ba_offl(struct ieee80211_vif *vif,
diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c
index b11883d26875..33da6f738c99 100644
--- a/net/mac80211/agg-tx.c
+++ b/net/mac80211/agg-tx.c
@@ -485,7 +485,14 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid)
params.ssn = sta->tid_seq[tid] >> 4;
ret = drv_ampdu_action(local, sdata, &params);
- if (ret) {
+ if (ret == IEEE80211_AMPDU_TX_START_IMMEDIATE) {
+ /*
+ * We didn't send the request yet, so don't need to check
+ * here if we already got a response, just mark as driver
+ * ready immediately.
+ */
+ set_bit(HT_AGG_STATE_DRV_READY, &tid_tx->state);
+ } else if (ret) {
ht_dbg(sdata,
"BA request denied - HW unavailable for %pM tid %d\n",
sta->sta.addr, tid);
diff --git a/net/mac80211/airtime.c b/net/mac80211/airtime.c
new file mode 100644
index 000000000000..9fc2968856c0
--- /dev/null
+++ b/net/mac80211/airtime.c
@@ -0,0 +1,597 @@
+// SPDX-License-Identifier: ISC
+/*
+ * Copyright (C) 2019 Felix Fietkau <nbd@nbd.name>
+ */
+
+#include <net/mac80211.h>
+#include "ieee80211_i.h"
+#include "sta_info.h"
+
+#define AVG_PKT_SIZE 1024
+
+/* Number of bits for an average sized packet */
+#define MCS_NBITS (AVG_PKT_SIZE << 3)
+
+/* Number of kilo-symbols (symbols * 1024) for a packet with (bps) bits per
+ * symbol. We use k-symbols to avoid rounding in the _TIME macros below.
+ */
+#define MCS_N_KSYMS(bps) DIV_ROUND_UP(MCS_NBITS << 10, (bps))
+
+/* Transmission time (in 1024 * usec) for a packet containing (ksyms) * 1024
+ * symbols.
+ */
+#define MCS_SYMBOL_TIME(sgi, ksyms) \
+ (sgi ? \
+ ((ksyms) * 4 * 18) / 20 : /* 3.6 us per sym */ \
+ ((ksyms) * 4) /* 4.0 us per sym */ \
+ )
+
+/* Transmit duration for the raw data part of an average sized packet */
+#define MCS_DURATION(streams, sgi, bps) \
+ ((u32)MCS_SYMBOL_TIME(sgi, MCS_N_KSYMS((streams) * (bps))))
+
+#define MCS_DURATION_S(shift, streams, sgi, bps) \
+ ((u16)((MCS_DURATION(streams, sgi, bps) >> shift)))
+
+/* These should match the values in enum nl80211_he_gi */
+#define HE_GI_08 0
+#define HE_GI_16 1
+#define HE_GI_32 2
+
+/* Transmission time (1024 usec) for a packet containing (ksyms) * k-symbols */
+#define HE_SYMBOL_TIME(gi, ksyms) \
+ (gi == HE_GI_08 ? \
+ ((ksyms) * 16 * 17) / 20 : /* 13.6 us per sym */ \
+ (gi == HE_GI_16 ? \
+ ((ksyms) * 16 * 18) / 20 : /* 14.4 us per sym */ \
+ ((ksyms) * 16) /* 16.0 us per sym */ \
+ ))
+
+/* Transmit duration for the raw data part of an average sized packet */
+#define HE_DURATION(streams, gi, bps) \
+ ((u32)HE_SYMBOL_TIME(gi, MCS_N_KSYMS((streams) * (bps))))
+
+#define HE_DURATION_S(shift, streams, gi, bps) \
+ (HE_DURATION(streams, gi, bps) >> shift)
+
+#define BW_20 0
+#define BW_40 1
+#define BW_80 2
+#define BW_160 3
+
+/*
+ * Define group sort order: HT40 -> SGI -> #streams
+ */
+#define IEEE80211_MAX_STREAMS 4
+#define IEEE80211_HT_STREAM_GROUPS 4 /* BW(=2) * SGI(=2) */
+#define IEEE80211_VHT_STREAM_GROUPS 8 /* BW(=4) * SGI(=2) */
+
+#define IEEE80211_HE_MAX_STREAMS 8
+#define IEEE80211_HE_STREAM_GROUPS 12 /* BW(=4) * GI(=3) */
+
+#define IEEE80211_HT_GROUPS_NB (IEEE80211_MAX_STREAMS * \
+ IEEE80211_HT_STREAM_GROUPS)
+#define IEEE80211_VHT_GROUPS_NB (IEEE80211_MAX_STREAMS * \
+ IEEE80211_VHT_STREAM_GROUPS)
+#define IEEE80211_HE_GROUPS_NB (IEEE80211_HE_MAX_STREAMS * \
+ IEEE80211_HE_STREAM_GROUPS)
+#define IEEE80211_GROUPS_NB (IEEE80211_HT_GROUPS_NB + \
+ IEEE80211_VHT_GROUPS_NB + \
+ IEEE80211_HE_GROUPS_NB)
+
+#define IEEE80211_HT_GROUP_0 0
+#define IEEE80211_VHT_GROUP_0 (IEEE80211_HT_GROUP_0 + IEEE80211_HT_GROUPS_NB)
+#define IEEE80211_HE_GROUP_0 (IEEE80211_VHT_GROUP_0 + IEEE80211_VHT_GROUPS_NB)
+
+#define MCS_GROUP_RATES 12
+
+#define HT_GROUP_IDX(_streams, _sgi, _ht40) \
+ IEEE80211_HT_GROUP_0 + \
+ IEEE80211_MAX_STREAMS * 2 * _ht40 + \
+ IEEE80211_MAX_STREAMS * _sgi + \
+ _streams - 1
+
+#define _MAX(a, b) (((a)>(b))?(a):(b))
+
+#define GROUP_SHIFT(duration) \
+ _MAX(0, 16 - __builtin_clz(duration))
+
+/* MCS rate information for an MCS group */
+#define __MCS_GROUP(_streams, _sgi, _ht40, _s) \
+ [HT_GROUP_IDX(_streams, _sgi, _ht40)] = { \
+ .shift = _s, \
+ .duration = { \
+ MCS_DURATION_S(_s, _streams, _sgi, _ht40 ? 54 : 26), \
+ MCS_DURATION_S(_s, _streams, _sgi, _ht40 ? 108 : 52), \
+ MCS_DURATION_S(_s, _streams, _sgi, _ht40 ? 162 : 78), \
+ MCS_DURATION_S(_s, _streams, _sgi, _ht40 ? 216 : 104), \
+ MCS_DURATION_S(_s, _streams, _sgi, _ht40 ? 324 : 156), \
+ MCS_DURATION_S(_s, _streams, _sgi, _ht40 ? 432 : 208), \
+ MCS_DURATION_S(_s, _streams, _sgi, _ht40 ? 486 : 234), \
+ MCS_DURATION_S(_s, _streams, _sgi, _ht40 ? 540 : 260) \
+ } \
+}
+
+#define MCS_GROUP_SHIFT(_streams, _sgi, _ht40) \
+ GROUP_SHIFT(MCS_DURATION(_streams, _sgi, _ht40 ? 54 : 26))
+
+#define MCS_GROUP(_streams, _sgi, _ht40) \
+ __MCS_GROUP(_streams, _sgi, _ht40, \
+ MCS_GROUP_SHIFT(_streams, _sgi, _ht40))
+
+#define VHT_GROUP_IDX(_streams, _sgi, _bw) \
+ (IEEE80211_VHT_GROUP_0 + \
+ IEEE80211_MAX_STREAMS * 2 * (_bw) + \
+ IEEE80211_MAX_STREAMS * (_sgi) + \
+ (_streams) - 1)
+
+#define BW2VBPS(_bw, r4, r3, r2, r1) \
+ (_bw == BW_160 ? r4 : _bw == BW_80 ? r3 : _bw == BW_40 ? r2 : r1)
+
+#define __VHT_GROUP(_streams, _sgi, _bw, _s) \
+ [VHT_GROUP_IDX(_streams, _sgi, _bw)] = { \
+ .shift = _s, \
+ .duration = { \
+ MCS_DURATION_S(_s, _streams, _sgi, \
+ BW2VBPS(_bw, 234, 117, 54, 26)), \
+ MCS_DURATION_S(_s, _streams, _sgi, \
+ BW2VBPS(_bw, 468, 234, 108, 52)), \
+ MCS_DURATION_S(_s, _streams, _sgi, \
+ BW2VBPS(_bw, 702, 351, 162, 78)), \
+ MCS_DURATION_S(_s, _streams, _sgi, \
+ BW2VBPS(_bw, 936, 468, 216, 104)), \
+ MCS_DURATION_S(_s, _streams, _sgi, \
+ BW2VBPS(_bw, 1404, 702, 324, 156)), \
+ MCS_DURATION_S(_s, _streams, _sgi, \
+ BW2VBPS(_bw, 1872, 936, 432, 208)), \
+ MCS_DURATION_S(_s, _streams, _sgi, \
+ BW2VBPS(_bw, 2106, 1053, 486, 234)), \
+ MCS_DURATION_S(_s, _streams, _sgi, \
+ BW2VBPS(_bw, 2340, 1170, 540, 260)), \
+ MCS_DURATION_S(_s, _streams, _sgi, \
+ BW2VBPS(_bw, 2808, 1404, 648, 312)), \
+ MCS_DURATION_S(_s, _streams, _sgi, \
+ BW2VBPS(_bw, 3120, 1560, 720, 346)) \
+ } \
+}
+
+#define VHT_GROUP_SHIFT(_streams, _sgi, _bw) \
+ GROUP_SHIFT(MCS_DURATION(_streams, _sgi, \
+ BW2VBPS(_bw, 243, 117, 54, 26)))
+
+#define VHT_GROUP(_streams, _sgi, _bw) \
+ __VHT_GROUP(_streams, _sgi, _bw, \
+ VHT_GROUP_SHIFT(_streams, _sgi, _bw))
+
+
+#define HE_GROUP_IDX(_streams, _gi, _bw) \
+ (IEEE80211_HE_GROUP_0 + \
+ IEEE80211_HE_MAX_STREAMS * 3 * (_bw) + \
+ IEEE80211_HE_MAX_STREAMS * (_gi) + \
+ (_streams) - 1)
+
+#define __HE_GROUP(_streams, _gi, _bw, _s) \
+ [HE_GROUP_IDX(_streams, _gi, _bw)] = { \
+ .shift = _s, \
+ .duration = { \
+ HE_DURATION_S(_s, _streams, _gi, \
+ BW2VBPS(_bw, 979, 489, 230, 115)), \
+ HE_DURATION_S(_s, _streams, _gi, \
+ BW2VBPS(_bw, 1958, 979, 475, 230)), \
+ HE_DURATION_S(_s, _streams, _gi, \
+ BW2VBPS(_bw, 2937, 1468, 705, 345)), \
+ HE_DURATION_S(_s, _streams, _gi, \
+ BW2VBPS(_bw, 3916, 1958, 936, 475)), \
+ HE_DURATION_S(_s, _streams, _gi, \
+ BW2VBPS(_bw, 5875, 2937, 1411, 705)), \
+ HE_DURATION_S(_s, _streams, _gi, \
+ BW2VBPS(_bw, 7833, 3916, 1872, 936)), \
+ HE_DURATION_S(_s, _streams, _gi, \
+ BW2VBPS(_bw, 8827, 4406, 2102, 1051)), \
+ HE_DURATION_S(_s, _streams, _gi, \
+ BW2VBPS(_bw, 9806, 4896, 2347, 1166)), \
+ HE_DURATION_S(_s, _streams, _gi, \
+ BW2VBPS(_bw, 11764, 5875, 2808, 1411)), \
+ HE_DURATION_S(_s, _streams, _gi, \
+ BW2VBPS(_bw, 13060, 6523, 3124, 1555)), \
+ HE_DURATION_S(_s, _streams, _gi, \
+ BW2VBPS(_bw, 14702, 7344, 3513, 1756)), \
+ HE_DURATION_S(_s, _streams, _gi, \
+ BW2VBPS(_bw, 16329, 8164, 3902, 1944)) \
+ } \
+}
+
+#define HE_GROUP_SHIFT(_streams, _gi, _bw) \
+ GROUP_SHIFT(HE_DURATION(_streams, _gi, \
+ BW2VBPS(_bw, 979, 489, 230, 115)))
+
+#define HE_GROUP(_streams, _gi, _bw) \
+ __HE_GROUP(_streams, _gi, _bw, \
+ HE_GROUP_SHIFT(_streams, _gi, _bw))
+struct mcs_group {
+ u8 shift;
+ u16 duration[MCS_GROUP_RATES];
+};
+
+static const struct mcs_group airtime_mcs_groups[] = {
+ MCS_GROUP(1, 0, BW_20),
+ MCS_GROUP(2, 0, BW_20),
+ MCS_GROUP(3, 0, BW_20),
+ MCS_GROUP(4, 0, BW_20),
+
+ MCS_GROUP(1, 1, BW_20),
+ MCS_GROUP(2, 1, BW_20),
+ MCS_GROUP(3, 1, BW_20),
+ MCS_GROUP(4, 1, BW_20),
+
+ MCS_GROUP(1, 0, BW_40),
+ MCS_GROUP(2, 0, BW_40),
+ MCS_GROUP(3, 0, BW_40),
+ MCS_GROUP(4, 0, BW_40),
+
+ MCS_GROUP(1, 1, BW_40),
+ MCS_GROUP(2, 1, BW_40),
+ MCS_GROUP(3, 1, BW_40),
+ MCS_GROUP(4, 1, BW_40),
+
+ VHT_GROUP(1, 0, BW_20),
+ VHT_GROUP(2, 0, BW_20),
+ VHT_GROUP(3, 0, BW_20),
+ VHT_GROUP(4, 0, BW_20),
+
+ VHT_GROUP(1, 1, BW_20),
+ VHT_GROUP(2, 1, BW_20),
+ VHT_GROUP(3, 1, BW_20),
+ VHT_GROUP(4, 1, BW_20),
+
+ VHT_GROUP(1, 0, BW_40),
+ VHT_GROUP(2, 0, BW_40),
+ VHT_GROUP(3, 0, BW_40),
+ VHT_GROUP(4, 0, BW_40),
+
+ VHT_GROUP(1, 1, BW_40),
+ VHT_GROUP(2, 1, BW_40),
+ VHT_GROUP(3, 1, BW_40),
+ VHT_GROUP(4, 1, BW_40),
+
+ VHT_GROUP(1, 0, BW_80),
+ VHT_GROUP(2, 0, BW_80),
+ VHT_GROUP(3, 0, BW_80),
+ VHT_GROUP(4, 0, BW_80),
+
+ VHT_GROUP(1, 1, BW_80),
+ VHT_GROUP(2, 1, BW_80),
+ VHT_GROUP(3, 1, BW_80),
+ VHT_GROUP(4, 1, BW_80),
+
+ VHT_GROUP(1, 0, BW_160),
+ VHT_GROUP(2, 0, BW_160),
+ VHT_GROUP(3, 0, BW_160),
+ VHT_GROUP(4, 0, BW_160),
+
+ VHT_GROUP(1, 1, BW_160),
+ VHT_GROUP(2, 1, BW_160),
+ VHT_GROUP(3, 1, BW_160),
+ VHT_GROUP(4, 1, BW_160),
+
+ HE_GROUP(1, HE_GI_08, BW_20),
+ HE_GROUP(2, HE_GI_08, BW_20),
+ HE_GROUP(3, HE_GI_08, BW_20),
+ HE_GROUP(4, HE_GI_08, BW_20),
+ HE_GROUP(5, HE_GI_08, BW_20),
+ HE_GROUP(6, HE_GI_08, BW_20),
+ HE_GROUP(7, HE_GI_08, BW_20),
+ HE_GROUP(8, HE_GI_08, BW_20),
+
+ HE_GROUP(1, HE_GI_16, BW_20),
+ HE_GROUP(2, HE_GI_16, BW_20),
+ HE_GROUP(3, HE_GI_16, BW_20),
+ HE_GROUP(4, HE_GI_16, BW_20),
+ HE_GROUP(5, HE_GI_16, BW_20),
+ HE_GROUP(6, HE_GI_16, BW_20),
+ HE_GROUP(7, HE_GI_16, BW_20),
+ HE_GROUP(8, HE_GI_16, BW_20),
+
+ HE_GROUP(1, HE_GI_32, BW_20),
+ HE_GROUP(2, HE_GI_32, BW_20),
+ HE_GROUP(3, HE_GI_32, BW_20),
+ HE_GROUP(4, HE_GI_32, BW_20),
+ HE_GROUP(5, HE_GI_32, BW_20),
+ HE_GROUP(6, HE_GI_32, BW_20),
+ HE_GROUP(7, HE_GI_32, BW_20),
+ HE_GROUP(8, HE_GI_32, BW_20),
+
+ HE_GROUP(1, HE_GI_08, BW_40),
+ HE_GROUP(2, HE_GI_08, BW_40),
+ HE_GROUP(3, HE_GI_08, BW_40),
+ HE_GROUP(4, HE_GI_08, BW_40),
+ HE_GROUP(5, HE_GI_08, BW_40),
+ HE_GROUP(6, HE_GI_08, BW_40),
+ HE_GROUP(7, HE_GI_08, BW_40),
+ HE_GROUP(8, HE_GI_08, BW_40),
+
+ HE_GROUP(1, HE_GI_16, BW_40),
+ HE_GROUP(2, HE_GI_16, BW_40),
+ HE_GROUP(3, HE_GI_16, BW_40),
+ HE_GROUP(4, HE_GI_16, BW_40),
+ HE_GROUP(5, HE_GI_16, BW_40),
+ HE_GROUP(6, HE_GI_16, BW_40),
+ HE_GROUP(7, HE_GI_16, BW_40),
+ HE_GROUP(8, HE_GI_16, BW_40),
+
+ HE_GROUP(1, HE_GI_32, BW_40),
+ HE_GROUP(2, HE_GI_32, BW_40),
+ HE_GROUP(3, HE_GI_32, BW_40),
+ HE_GROUP(4, HE_GI_32, BW_40),
+ HE_GROUP(5, HE_GI_32, BW_40),
+ HE_GROUP(6, HE_GI_32, BW_40),
+ HE_GROUP(7, HE_GI_32, BW_40),
+ HE_GROUP(8, HE_GI_32, BW_40),
+
+ HE_GROUP(1, HE_GI_08, BW_80),
+ HE_GROUP(2, HE_GI_08, BW_80),
+ HE_GROUP(3, HE_GI_08, BW_80),
+ HE_GROUP(4, HE_GI_08, BW_80),
+ HE_GROUP(5, HE_GI_08, BW_80),
+ HE_GROUP(6, HE_GI_08, BW_80),
+ HE_GROUP(7, HE_GI_08, BW_80),
+ HE_GROUP(8, HE_GI_08, BW_80),
+
+ HE_GROUP(1, HE_GI_16, BW_80),
+ HE_GROUP(2, HE_GI_16, BW_80),
+ HE_GROUP(3, HE_GI_16, BW_80),
+ HE_GROUP(4, HE_GI_16, BW_80),
+ HE_GROUP(5, HE_GI_16, BW_80),
+ HE_GROUP(6, HE_GI_16, BW_80),
+ HE_GROUP(7, HE_GI_16, BW_80),
+ HE_GROUP(8, HE_GI_16, BW_80),
+
+ HE_GROUP(1, HE_GI_32, BW_80),
+ HE_GROUP(2, HE_GI_32, BW_80),
+ HE_GROUP(3, HE_GI_32, BW_80),
+ HE_GROUP(4, HE_GI_32, BW_80),
+ HE_GROUP(5, HE_GI_32, BW_80),
+ HE_GROUP(6, HE_GI_32, BW_80),
+ HE_GROUP(7, HE_GI_32, BW_80),
+ HE_GROUP(8, HE_GI_32, BW_80),
+
+ HE_GROUP(1, HE_GI_08, BW_160),
+ HE_GROUP(2, HE_GI_08, BW_160),
+ HE_GROUP(3, HE_GI_08, BW_160),
+ HE_GROUP(4, HE_GI_08, BW_160),
+ HE_GROUP(5, HE_GI_08, BW_160),
+ HE_GROUP(6, HE_GI_08, BW_160),
+ HE_GROUP(7, HE_GI_08, BW_160),
+ HE_GROUP(8, HE_GI_08, BW_160),
+
+ HE_GROUP(1, HE_GI_16, BW_160),
+ HE_GROUP(2, HE_GI_16, BW_160),
+ HE_GROUP(3, HE_GI_16, BW_160),
+ HE_GROUP(4, HE_GI_16, BW_160),
+ HE_GROUP(5, HE_GI_16, BW_160),
+ HE_GROUP(6, HE_GI_16, BW_160),
+ HE_GROUP(7, HE_GI_16, BW_160),
+ HE_GROUP(8, HE_GI_16, BW_160),
+
+ HE_GROUP(1, HE_GI_32, BW_160),
+ HE_GROUP(2, HE_GI_32, BW_160),
+ HE_GROUP(3, HE_GI_32, BW_160),
+ HE_GROUP(4, HE_GI_32, BW_160),
+ HE_GROUP(5, HE_GI_32, BW_160),
+ HE_GROUP(6, HE_GI_32, BW_160),
+ HE_GROUP(7, HE_GI_32, BW_160),
+ HE_GROUP(8, HE_GI_32, BW_160),
+};
+
+static u32
+ieee80211_calc_legacy_rate_duration(u16 bitrate, bool short_pre,
+ bool cck, int len)
+{
+ u32 duration;
+
+ if (cck) {
+ duration = 144 + 48; /* preamble + PLCP */
+ if (short_pre)
+ duration >>= 1;
+
+ duration += 10; /* SIFS */
+ } else {
+ duration = 20 + 16; /* premable + SIFS */
+ }
+
+ len <<= 3;
+ duration += (len * 10) / bitrate;
+
+ return duration;
+}
+
+u32 ieee80211_calc_rx_airtime(struct ieee80211_hw *hw,
+ struct ieee80211_rx_status *status,
+ int len)
+{
+ struct ieee80211_supported_band *sband;
+ const struct ieee80211_rate *rate;
+ bool sgi = status->enc_flags & RX_ENC_FLAG_SHORT_GI;
+ bool sp = status->enc_flags & RX_ENC_FLAG_SHORTPRE;
+ int bw, streams;
+ int group, idx;
+ u32 duration;
+ bool cck;
+
+ switch (status->bw) {
+ case RATE_INFO_BW_20:
+ bw = BW_20;
+ break;
+ case RATE_INFO_BW_40:
+ bw = BW_40;
+ break;
+ case RATE_INFO_BW_80:
+ bw = BW_80;
+ break;
+ case RATE_INFO_BW_160:
+ bw = BW_160;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return 0;
+ }
+
+ switch (status->encoding) {
+ case RX_ENC_LEGACY:
+ if (WARN_ON_ONCE(status->band > NL80211_BAND_5GHZ))
+ return 0;
+
+ sband = hw->wiphy->bands[status->band];
+ if (!sband || status->rate_idx >= sband->n_bitrates)
+ return 0;
+
+ rate = &sband->bitrates[status->rate_idx];
+ cck = rate->flags & IEEE80211_RATE_MANDATORY_B;
+
+ return ieee80211_calc_legacy_rate_duration(rate->bitrate, sp,
+ cck, len);
+
+ case RX_ENC_VHT:
+ streams = status->nss;
+ idx = status->rate_idx;
+ group = VHT_GROUP_IDX(streams, sgi, bw);
+ break;
+ case RX_ENC_HT:
+ streams = ((status->rate_idx >> 3) & 3) + 1;
+ idx = status->rate_idx & 7;
+ group = HT_GROUP_IDX(streams, sgi, bw);
+ break;
+ case RX_ENC_HE:
+ streams = status->nss;
+ idx = status->rate_idx;
+ group = HE_GROUP_IDX(streams, status->he_gi, bw);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return 0;
+ }
+
+ if (WARN_ON_ONCE((status->encoding != RX_ENC_HE && streams > 4) ||
+ (status->encoding == RX_ENC_HE && streams > 8)))
+ return 0;
+
+ duration = airtime_mcs_groups[group].duration[idx];
+ duration <<= airtime_mcs_groups[group].shift;
+ duration *= len;
+ duration /= AVG_PKT_SIZE;
+ duration /= 1024;
+
+ duration += 36 + (streams << 2);
+
+ return duration;
+}
+EXPORT_SYMBOL_GPL(ieee80211_calc_rx_airtime);
+
+static u32 ieee80211_calc_tx_airtime_rate(struct ieee80211_hw *hw,
+ struct ieee80211_tx_rate *rate,
+ u8 band, int len)
+{
+ struct ieee80211_rx_status stat = {
+ .band = band,
+ };
+
+ if (rate->idx < 0 || !rate->count)
+ return 0;
+
+ if (rate->flags & IEEE80211_TX_RC_80_MHZ_WIDTH)
+ stat.bw = RATE_INFO_BW_80;
+ else if (rate->flags & IEEE80211_TX_RC_40_MHZ_WIDTH)
+ stat.bw = RATE_INFO_BW_40;
+ else
+ stat.bw = RATE_INFO_BW_20;
+
+ stat.enc_flags = 0;
+ if (rate->flags & IEEE80211_TX_RC_USE_SHORT_PREAMBLE)
+ stat.enc_flags |= RX_ENC_FLAG_SHORTPRE;
+ if (rate->flags & IEEE80211_TX_RC_SHORT_GI)
+ stat.enc_flags |= RX_ENC_FLAG_SHORT_GI;
+
+ stat.rate_idx = rate->idx;
+ if (rate->flags & IEEE80211_TX_RC_VHT_MCS) {
+ stat.encoding = RX_ENC_VHT;
+ stat.rate_idx = ieee80211_rate_get_vht_mcs(rate);
+ stat.nss = ieee80211_rate_get_vht_nss(rate);
+ } else if (rate->flags & IEEE80211_TX_RC_MCS) {
+ stat.encoding = RX_ENC_HT;
+ } else {
+ stat.encoding = RX_ENC_LEGACY;
+ }
+
+ return ieee80211_calc_rx_airtime(hw, &stat, len);
+}
+
+u32 ieee80211_calc_tx_airtime(struct ieee80211_hw *hw,
+ struct ieee80211_tx_info *info,
+ int len)
+{
+ u32 duration = 0;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(info->status.rates); i++) {
+ struct ieee80211_tx_rate *rate = &info->status.rates[i];
+ u32 cur_duration;
+
+ cur_duration = ieee80211_calc_tx_airtime_rate(hw, rate,
+ info->band, len);
+ if (!cur_duration)
+ break;
+
+ duration += cur_duration * rate->count;
+ }
+
+ return duration;
+}
+EXPORT_SYMBOL_GPL(ieee80211_calc_tx_airtime);
+
+u32 ieee80211_calc_expected_tx_airtime(struct ieee80211_hw *hw,
+ struct ieee80211_vif *vif,
+ struct ieee80211_sta *pubsta,
+ int len)
+{
+ struct ieee80211_supported_band *sband;
+ struct ieee80211_chanctx_conf *conf;
+ int rateidx, shift = 0;
+ bool cck, short_pream;
+ u32 basic_rates;
+ u8 band = 0;
+ u16 rate;
+
+ len += 38; /* Ethernet header length */
+
+ conf = rcu_dereference(vif->chanctx_conf);
+ if (conf) {
+ band = conf->def.chan->band;
+ shift = ieee80211_chandef_get_shift(&conf->def);
+ }
+
+ if (pubsta) {
+ struct sta_info *sta = container_of(pubsta, struct sta_info,
+ sta);
+
+ return ieee80211_calc_tx_airtime_rate(hw,
+ &sta->tx_stats.last_rate,
+ band, len);
+ }
+
+ if (!conf)
+ return 0;
+
+ /* No station to get latest rate from, so calculate the worst-case
+ * duration using the lowest configured basic rate.
+ */
+ sband = hw->wiphy->bands[band];
+
+ basic_rates = vif->bss_conf.basic_rates;
+ short_pream = vif->bss_conf.use_short_preamble;
+
+ rateidx = basic_rates ? ffs(basic_rates) - 1 : 0;
+ rate = sband->bitrates[rateidx].bitrate << shift;
+ cck = sband->bitrates[rateidx].flags & IEEE80211_RATE_MANDATORY_B;
+
+ return ieee80211_calc_legacy_rate_duration(rate, short_pream, cck, len);
+}
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 4d458067d80d..6aee699deb28 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -980,7 +980,8 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
BSS_CHANGED_SSID |
BSS_CHANGED_P2P_PS |
BSS_CHANGED_TXPOWER |
- BSS_CHANGED_TWT;
+ BSS_CHANGED_TWT |
+ BSS_CHANGED_HE_OBSS_PD;
int err;
int prev_beacon_int;
@@ -1051,6 +1052,8 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
sdata->vif.bss_conf.enable_beacon = true;
sdata->vif.bss_conf.allow_p2p_go_ps = sdata->vif.p2p;
sdata->vif.bss_conf.twt_responder = params->twt_responder;
+ memcpy(&sdata->vif.bss_conf.he_obss_pd, &params->he_obss_pd,
+ sizeof(struct ieee80211_he_obss_pd));
sdata->vif.bss_conf.ssid_len = params->ssid_len;
if (params->ssid_len)
@@ -1529,7 +1532,6 @@ static int ieee80211_add_station(struct wiphy *wiphy, struct net_device *dev,
struct sta_info *sta;
struct ieee80211_sub_if_data *sdata;
int err;
- int layer2_update;
if (params->vlan) {
sdata = IEEE80211_DEV_TO_SUB_IF(params->vlan);
@@ -1543,7 +1545,12 @@ static int ieee80211_add_station(struct wiphy *wiphy, struct net_device *dev,
if (ether_addr_equal(mac, sdata->vif.addr))
return -EINVAL;
- if (is_multicast_ether_addr(mac))
+ if (!is_valid_ether_addr(mac))
+ return -EINVAL;
+
+ if (params->sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER) &&
+ sdata->vif.type == NL80211_IFTYPE_STATION &&
+ !sdata->u.mgd.associated)
return -EINVAL;
sta = sta_info_alloc(sdata, mac, GFP_KERNEL);
@@ -1553,10 +1560,6 @@ static int ieee80211_add_station(struct wiphy *wiphy, struct net_device *dev,
if (params->sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER))
sta->sta.tdls = true;
- if (sta->sta.tdls && sdata->vif.type == NL80211_IFTYPE_STATION &&
- !sdata->u.mgd.associated)
- return -EINVAL;
-
err = sta_apply_parameters(local, sta, params);
if (err) {
sta_info_free(local, sta);
@@ -1572,18 +1575,12 @@ static int ieee80211_add_station(struct wiphy *wiphy, struct net_device *dev,
test_sta_flag(sta, WLAN_STA_ASSOC))
rate_control_rate_init(sta);
- layer2_update = sdata->vif.type == NL80211_IFTYPE_AP_VLAN ||
- sdata->vif.type == NL80211_IFTYPE_AP;
-
err = sta_info_insert_rcu(sta);
if (err) {
rcu_read_unlock();
return err;
}
- if (layer2_update)
- cfg80211_send_layer2_update(sta->sdata->dev, sta->sta.addr);
-
rcu_read_unlock();
return 0;
@@ -1681,10 +1678,11 @@ static int ieee80211_change_station(struct wiphy *wiphy,
sta->sdata = vlansdata;
ieee80211_check_fast_xmit(sta);
- if (test_sta_flag(sta, WLAN_STA_AUTHORIZED))
+ if (test_sta_flag(sta, WLAN_STA_AUTHORIZED)) {
ieee80211_vif_inc_num_mcast(sta->sdata);
-
- cfg80211_send_layer2_update(sta->sdata->dev, sta->sta.addr);
+ cfg80211_send_layer2_update(sta->sdata->dev,
+ sta->sta.addr);
+ }
}
err = sta_apply_parameters(local, sta, params);
@@ -2956,6 +2954,28 @@ static int ieee80211_start_radar_detection(struct wiphy *wiphy,
return err;
}
+static void ieee80211_end_cac(struct wiphy *wiphy,
+ struct net_device *dev)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct ieee80211_local *local = sdata->local;
+
+ mutex_lock(&local->mtx);
+ list_for_each_entry(sdata, &local->interfaces, list) {
+ /* it might be waiting for the local->mtx, but then
+ * by the time it gets it, sdata->wdev.cac_started
+ * will no longer be true
+ */
+ cancel_delayed_work(&sdata->dfs_cac_timer_work);
+
+ if (sdata->wdev.cac_started) {
+ ieee80211_vif_release_channel(sdata);
+ sdata->wdev.cac_started = false;
+ }
+ }
+ mutex_unlock(&local->mtx);
+}
+
static struct cfg80211_beacon_data *
cfg80211_beacon_dup(struct cfg80211_beacon_data *beacon)
{
@@ -3430,7 +3450,7 @@ int ieee80211_attach_ack_skb(struct ieee80211_local *local, struct sk_buff *skb,
spin_lock_irqsave(&local->ack_status_lock, spin_flags);
id = idr_alloc(&local->ack_status_frames, ack_skb,
- 1, 0x10000, GFP_ATOMIC);
+ 1, 0x2000, GFP_ATOMIC);
spin_unlock_irqrestore(&local->ack_status_lock, spin_flags);
if (id < 0) {
@@ -4025,6 +4045,7 @@ const struct cfg80211_ops mac80211_config_ops = {
#endif
.get_channel = ieee80211_cfg_get_channel,
.start_radar_detection = ieee80211_start_radar_detection,
+ .end_cac = ieee80211_end_cac,
.channel_switch = ieee80211_channel_switch,
.set_qos_map = ieee80211_set_qos_map,
.set_ap_chanwidth = ieee80211_set_ap_chanwidth,
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index 2e7f75938c51..ad41d74530c6 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -59,6 +59,8 @@ static const struct file_operations name## _ops = { \
debugfs_create_file(#name, mode, phyd, local, &name## _ops);
+DEBUGFS_READONLY_FILE(hw_conf, "%x",
+ local->hw.conf.flags);
DEBUGFS_READONLY_FILE(user_power, "%d",
local->user_power_level);
DEBUGFS_READONLY_FILE(power, "%d",
@@ -148,6 +150,87 @@ static const struct file_operations aqm_ops = {
.llseek = default_llseek,
};
+static ssize_t aql_txq_limit_read(struct file *file,
+ char __user *user_buf,
+ size_t count,
+ loff_t *ppos)
+{
+ struct ieee80211_local *local = file->private_data;
+ char buf[400];
+ int len = 0;
+
+ len = scnprintf(buf, sizeof(buf),
+ "AC AQL limit low AQL limit high\n"
+ "VO %u %u\n"
+ "VI %u %u\n"
+ "BE %u %u\n"
+ "BK %u %u\n",
+ local->aql_txq_limit_low[IEEE80211_AC_VO],
+ local->aql_txq_limit_high[IEEE80211_AC_VO],
+ local->aql_txq_limit_low[IEEE80211_AC_VI],
+ local->aql_txq_limit_high[IEEE80211_AC_VI],
+ local->aql_txq_limit_low[IEEE80211_AC_BE],
+ local->aql_txq_limit_high[IEEE80211_AC_BE],
+ local->aql_txq_limit_low[IEEE80211_AC_BK],
+ local->aql_txq_limit_high[IEEE80211_AC_BK]);
+ return simple_read_from_buffer(user_buf, count, ppos,
+ buf, len);
+}
+
+static ssize_t aql_txq_limit_write(struct file *file,
+ const char __user *user_buf,
+ size_t count,
+ loff_t *ppos)
+{
+ struct ieee80211_local *local = file->private_data;
+ char buf[100];
+ size_t len;
+ u32 ac, q_limit_low, q_limit_high, q_limit_low_old, q_limit_high_old;
+ struct sta_info *sta;
+
+ if (count > sizeof(buf))
+ return -EINVAL;
+
+ if (copy_from_user(buf, user_buf, count))
+ return -EFAULT;
+
+ buf[sizeof(buf) - 1] = 0;
+ len = strlen(buf);
+ if (len > 0 && buf[len - 1] == '\n')
+ buf[len - 1] = 0;
+
+ if (sscanf(buf, "%u %u %u", &ac, &q_limit_low, &q_limit_high) != 3)
+ return -EINVAL;
+
+ if (ac >= IEEE80211_NUM_ACS)
+ return -EINVAL;
+
+ q_limit_low_old = local->aql_txq_limit_low[ac];
+ q_limit_high_old = local->aql_txq_limit_high[ac];
+
+ local->aql_txq_limit_low[ac] = q_limit_low;
+ local->aql_txq_limit_high[ac] = q_limit_high;
+
+ mutex_lock(&local->sta_mtx);
+ list_for_each_entry(sta, &local->sta_list, list) {
+ /* If a sta has customized queue limits, keep it */
+ if (sta->airtime[ac].aql_limit_low == q_limit_low_old &&
+ sta->airtime[ac].aql_limit_high == q_limit_high_old) {
+ sta->airtime[ac].aql_limit_low = q_limit_low;
+ sta->airtime[ac].aql_limit_high = q_limit_high;
+ }
+ }
+ mutex_unlock(&local->sta_mtx);
+ return count;
+}
+
+static const struct file_operations aql_txq_limit_ops = {
+ .write = aql_txq_limit_write,
+ .read = aql_txq_limit_read,
+ .open = simple_open,
+ .llseek = default_llseek,
+};
+
static ssize_t force_tx_status_read(struct file *file,
char __user *user_buf,
size_t count,
@@ -271,8 +354,7 @@ static const char *hw_flag_names[] = {
FLAG(TX_STATUS_NO_AMPDU_LEN),
FLAG(SUPPORTS_MULTI_BSSID),
FLAG(SUPPORTS_ONLY_HE_MULTI_BSSID),
- FLAG(EXT_KEY_ID_NATIVE),
- FLAG(NO_AMPDU_KEYBORDER_SUPPORT),
+ FLAG(AMPDU_KEYBORDER_SUPPORT),
#undef FLAG
};
@@ -434,6 +516,7 @@ void debugfs_hw_add(struct ieee80211_local *local)
DEBUGFS_ADD(hwflags);
DEBUGFS_ADD(user_power);
DEBUGFS_ADD(power);
+ DEBUGFS_ADD(hw_conf);
DEBUGFS_ADD_MODE(force_tx_status, 0600);
if (local->ops->wake_tx_queue)
@@ -442,6 +525,10 @@ void debugfs_hw_add(struct ieee80211_local *local)
debugfs_create_u16("airtime_flags", 0600,
phyd, &local->airtime_flags);
+ DEBUGFS_ADD(aql_txq_limit);
+ debugfs_create_u32("aql_threshold", 0600,
+ phyd, &local->aql_threshold);
+
statsd = debugfs_create_dir("statistics", phyd);
/* if the dir failed, don't put all the other things into the root! */
diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c
index b1438fd4d876..64b544ae9966 100644
--- a/net/mac80211/debugfs_netdev.c
+++ b/net/mac80211/debugfs_netdev.c
@@ -487,9 +487,14 @@ static ssize_t ieee80211_if_fmt_aqm(
const struct ieee80211_sub_if_data *sdata, char *buf, int buflen)
{
struct ieee80211_local *local = sdata->local;
- struct txq_info *txqi = to_txq_info(sdata->vif.txq);
+ struct txq_info *txqi;
int len;
+ if (!sdata->vif.txq)
+ return 0;
+
+ txqi = to_txq_info(sdata->vif.txq);
+
spin_lock_bh(&local->fq.lock);
rcu_read_lock();
@@ -658,7 +663,9 @@ static void add_common_files(struct ieee80211_sub_if_data *sdata)
DEBUGFS_ADD(rc_rateidx_vht_mcs_mask_5ghz);
DEBUGFS_ADD(hw_queues);
- if (sdata->local->ops->wake_tx_queue)
+ if (sdata->local->ops->wake_tx_queue &&
+ sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE &&
+ sdata->vif.type != NL80211_IFTYPE_NAN)
DEBUGFS_ADD(aqm);
}
diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c
index c8ad20c28c43..c80b1e163ea4 100644
--- a/net/mac80211/debugfs_sta.c
+++ b/net/mac80211/debugfs_sta.c
@@ -197,7 +197,7 @@ static ssize_t sta_airtime_read(struct file *file, char __user *userbuf,
{
struct sta_info *sta = file->private_data;
struct ieee80211_local *local = sta->sdata->local;
- size_t bufsz = 200;
+ size_t bufsz = 400;
char *buf = kzalloc(bufsz, GFP_KERNEL), *p = buf;
u64 rx_airtime = 0, tx_airtime = 0;
s64 deficit[IEEE80211_NUM_ACS];
@@ -218,13 +218,8 @@ static ssize_t sta_airtime_read(struct file *file, char __user *userbuf,
p += scnprintf(p, bufsz + buf - p,
"RX: %llu us\nTX: %llu us\nWeight: %u\n"
"Deficit: VO: %lld us VI: %lld us BE: %lld us BK: %lld us\n",
- rx_airtime,
- tx_airtime,
- sta->airtime_weight,
- deficit[0],
- deficit[1],
- deficit[2],
- deficit[3]);
+ rx_airtime, tx_airtime, sta->airtime_weight,
+ deficit[0], deficit[1], deficit[2], deficit[3]);
rv = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf);
kfree(buf);
@@ -250,6 +245,70 @@ static ssize_t sta_airtime_write(struct file *file, const char __user *userbuf,
}
STA_OPS_RW(airtime);
+static ssize_t sta_aql_read(struct file *file, char __user *userbuf,
+ size_t count, loff_t *ppos)
+{
+ struct sta_info *sta = file->private_data;
+ struct ieee80211_local *local = sta->sdata->local;
+ size_t bufsz = 400;
+ char *buf = kzalloc(bufsz, GFP_KERNEL), *p = buf;
+ u32 q_depth[IEEE80211_NUM_ACS];
+ u32 q_limit_l[IEEE80211_NUM_ACS], q_limit_h[IEEE80211_NUM_ACS];
+ ssize_t rv;
+ int ac;
+
+ if (!buf)
+ return -ENOMEM;
+
+ for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) {
+ spin_lock_bh(&local->active_txq_lock[ac]);
+ q_limit_l[ac] = sta->airtime[ac].aql_limit_low;
+ q_limit_h[ac] = sta->airtime[ac].aql_limit_high;
+ spin_unlock_bh(&local->active_txq_lock[ac]);
+ q_depth[ac] = atomic_read(&sta->airtime[ac].aql_tx_pending);
+ }
+
+ p += scnprintf(p, bufsz + buf - p,
+ "Q depth: VO: %u us VI: %u us BE: %u us BK: %u us\n"
+ "Q limit[low/high]: VO: %u/%u VI: %u/%u BE: %u/%u BK: %u/%u\n",
+ q_depth[0], q_depth[1], q_depth[2], q_depth[3],
+ q_limit_l[0], q_limit_h[0], q_limit_l[1], q_limit_h[1],
+ q_limit_l[2], q_limit_h[2], q_limit_l[3], q_limit_h[3]),
+
+ rv = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf);
+ kfree(buf);
+ return rv;
+}
+
+static ssize_t sta_aql_write(struct file *file, const char __user *userbuf,
+ size_t count, loff_t *ppos)
+{
+ struct sta_info *sta = file->private_data;
+ u32 ac, q_limit_l, q_limit_h;
+ char _buf[100] = {}, *buf = _buf;
+
+ if (count > sizeof(_buf))
+ return -EINVAL;
+
+ if (copy_from_user(buf, userbuf, count))
+ return -EFAULT;
+
+ buf[sizeof(_buf) - 1] = '\0';
+ if (sscanf(buf, "limit %u %u %u", &ac, &q_limit_l, &q_limit_h)
+ != 3)
+ return -EINVAL;
+
+ if (ac >= IEEE80211_NUM_ACS)
+ return -EINVAL;
+
+ sta->airtime[ac].aql_limit_low = q_limit_l;
+ sta->airtime[ac].aql_limit_high = q_limit_h;
+
+ return count;
+}
+STA_OPS_RW(aql);
+
+
static ssize_t sta_agg_status_read(struct file *file, char __user *userbuf,
size_t count, loff_t *ppos)
{
@@ -928,12 +987,7 @@ STA_OPS(he_capa);
sta->debugfs_dir, sta, &sta_ ##name## _ops);
#define DEBUGFS_ADD_COUNTER(name, field) \
- if (sizeof(sta->field) == sizeof(u32)) \
- debugfs_create_u32(#name, 0400, sta->debugfs_dir, \
- (u32 *) &sta->field); \
- else \
- debugfs_create_u64(#name, 0400, sta->debugfs_dir, \
- (u64 *) &sta->field);
+ debugfs_create_ulong(#name, 0400, sta->debugfs_dir, &sta->field);
void ieee80211_sta_debugfs_add(struct sta_info *sta)
{
@@ -978,14 +1032,12 @@ void ieee80211_sta_debugfs_add(struct sta_info *sta)
NL80211_EXT_FEATURE_AIRTIME_FAIRNESS))
DEBUGFS_ADD(airtime);
- if (sizeof(sta->driver_buffered_tids) == sizeof(u32))
- debugfs_create_x32("driver_buffered_tids", 0400,
- sta->debugfs_dir,
- (u32 *)&sta->driver_buffered_tids);
- else
- debugfs_create_x64("driver_buffered_tids", 0400,
- sta->debugfs_dir,
- (u64 *)&sta->driver_buffered_tids);
+ if (wiphy_ext_feature_isset(local->hw.wiphy,
+ NL80211_EXT_FEATURE_AQL))
+ DEBUGFS_ADD(aql);
+
+ debugfs_create_xul("driver_buffered_tids", 0400, sta->debugfs_dir,
+ &sta->driver_buffered_tids);
drv_sta_add_debugfs(local, sdata, &sta->sta, sta->debugfs_dir);
}
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index c2d8b5451a5e..2c9b3eb8b652 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -692,14 +692,16 @@ static inline int drv_remain_on_channel(struct ieee80211_local *local,
return ret;
}
-static inline int drv_cancel_remain_on_channel(struct ieee80211_local *local)
+static inline int
+drv_cancel_remain_on_channel(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata)
{
int ret;
might_sleep();
- trace_drv_cancel_remain_on_channel(local);
- ret = local->ops->cancel_remain_on_channel(&local->hw);
+ trace_drv_cancel_remain_on_channel(local, sdata);
+ ret = local->ops->cancel_remain_on_channel(&local->hw, &sdata->vif);
trace_drv_return_int(local, ret);
return ret;
diff --git a/net/mac80211/he.c b/net/mac80211/he.c
index 219650591c79..736da0035135 100644
--- a/net/mac80211/he.c
+++ b/net/mac80211/he.c
@@ -50,3 +50,43 @@ ieee80211_he_cap_ie_to_sta_he_cap(struct ieee80211_sub_if_data *sdata,
he_cap->has_he = true;
}
+
+void
+ieee80211_he_op_ie_to_bss_conf(struct ieee80211_vif *vif,
+ const struct ieee80211_he_operation *he_op_ie_elem)
+{
+ struct ieee80211_he_operation *he_operation =
+ &vif->bss_conf.he_operation;
+
+ if (!he_op_ie_elem) {
+ memset(he_operation, 0, sizeof(*he_operation));
+ return;
+ }
+
+ vif->bss_conf.he_operation = *he_op_ie_elem;
+}
+
+void
+ieee80211_he_spr_ie_to_bss_conf(struct ieee80211_vif *vif,
+ const struct ieee80211_he_spr *he_spr_ie_elem)
+{
+ struct ieee80211_he_obss_pd *he_obss_pd =
+ &vif->bss_conf.he_obss_pd;
+ const u8 *data;
+
+ memset(he_obss_pd, 0, sizeof(*he_obss_pd));
+
+ if (!he_spr_ie_elem)
+ return;
+ data = he_spr_ie_elem->optional;
+
+ if (he_spr_ie_elem->he_sr_control &
+ IEEE80211_HE_SPR_NON_SRG_OFFSET_PRESENT)
+ data++;
+ if (he_spr_ie_elem->he_sr_control &
+ IEEE80211_HE_SPR_SRG_INFORMATION_PRESENT) {
+ he_obss_pd->max_offset = *data++;
+ he_obss_pd->min_offset = *data++;
+ he_obss_pd->enable = true;
+ }
+}
diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c
index d5a500b2a448..a2e4d6b8fd98 100644
--- a/net/mac80211/ht.c
+++ b/net/mac80211/ht.c
@@ -359,7 +359,7 @@ void ieee80211_ba_session_work(struct work_struct *work)
sta->ampdu_mlme.tid_rx_manage_offl))
___ieee80211_start_rx_ba_session(sta, 0, 0, 0, 1, tid,
IEEE80211_MAX_AMPDU_BUF_HT,
- false, true);
+ false, true, NULL);
if (test_and_clear_bit(tid + IEEE80211_NUM_TIDS,
sta->ampdu_mlme.tid_rx_manage_offl))
diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c
index f00dca056295..d40744903fa9 100644
--- a/net/mac80211/ibss.c
+++ b/net/mac80211/ibss.c
@@ -538,7 +538,6 @@ int ieee80211_ibss_finish_csa(struct ieee80211_sub_if_data *sdata)
{
struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
struct cfg80211_bss *cbss;
- int err, changed = 0;
sdata_assert_lock(sdata);
@@ -560,13 +559,7 @@ int ieee80211_ibss_finish_csa(struct ieee80211_sub_if_data *sdata)
ifibss->chandef = sdata->csa_chandef;
/* generate the beacon */
- err = ieee80211_ibss_csa_beacon(sdata, NULL);
- if (err < 0)
- return err;
-
- changed |= err;
-
- return changed;
+ return ieee80211_ibss_csa_beacon(sdata, NULL);
}
void ieee80211_ibss_stop(struct ieee80211_sub_if_data *sdata)
@@ -1252,6 +1245,7 @@ void ieee80211_ibss_rx_no_sta(struct ieee80211_sub_if_data *sdata,
static void ieee80211_ibss_sta_expire(struct ieee80211_sub_if_data *sdata)
{
+ struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
struct ieee80211_local *local = sdata->local;
struct sta_info *sta, *tmp;
unsigned long exp_time = IEEE80211_IBSS_INACTIVITY_LIMIT;
@@ -1268,10 +1262,17 @@ static void ieee80211_ibss_sta_expire(struct ieee80211_sub_if_data *sdata)
if (time_is_before_jiffies(last_active + exp_time) ||
(time_is_before_jiffies(last_active + exp_rsn) &&
sta->sta_state != IEEE80211_STA_AUTHORIZED)) {
+ u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN];
+
sta_dbg(sta->sdata, "expiring inactive %sSTA %pM\n",
sta->sta_state != IEEE80211_STA_AUTHORIZED ?
"not authorized " : "", sta->sta.addr);
+ ieee80211_send_deauth_disassoc(sdata, sta->sta.addr,
+ ifibss->bssid,
+ IEEE80211_STYPE_DEAUTH,
+ WLAN_REASON_DEAUTH_LEAVING,
+ true, frame_buf);
WARN_ON(__sta_info_destroy(sta));
}
}
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 004e2e3adb88..ad15b3be8bb3 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -1142,6 +1142,10 @@ struct ieee80211_local {
u16 schedule_round[IEEE80211_NUM_ACS];
u16 airtime_flags;
+ u32 aql_txq_limit_low[IEEE80211_NUM_ACS];
+ u32 aql_txq_limit_high[IEEE80211_NUM_ACS];
+ u32 aql_threshold;
+ atomic_t aql_total_pending_airtime;
const struct ieee80211_ops *ops;
@@ -1480,6 +1484,7 @@ struct ieee802_11_elems {
const struct ieee80211_meshconf_ie *mesh_config;
const u8 *he_cap;
const struct ieee80211_he_operation *he_operation;
+ const struct ieee80211_he_spr *he_spr;
const struct ieee80211_mu_edca_param_set *mu_edca_param_set;
const u8 *uora_element;
const u8 *mesh_id;
@@ -1506,6 +1511,7 @@ struct ieee802_11_elems {
u8 max_bssid_indicator;
u8 dtim_count;
u8 dtim_period;
+ const struct ieee80211_addba_ext_ie *addba_ext_ie;
/* length of them, respectively */
u8 ext_capab_len;
@@ -1767,7 +1773,8 @@ ieee80211_build_data_template(struct ieee80211_sub_if_data *sdata,
struct sk_buff *skb, u32 info_flags);
void ieee80211_tx_monitor(struct ieee80211_local *local, struct sk_buff *skb,
struct ieee80211_supported_band *sband,
- int retry_count, int shift, bool send_to_cooked);
+ int retry_count, int shift, bool send_to_cooked,
+ struct ieee80211_tx_status *status);
void ieee80211_check_fast_xmit(struct sta_info *sta);
void ieee80211_check_fast_xmit_all(struct ieee80211_local *local);
@@ -1804,7 +1811,8 @@ void __ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid,
void ___ieee80211_start_rx_ba_session(struct sta_info *sta,
u8 dialog_token, u16 timeout,
u16 start_seq_num, u16 ba_policy, u16 tid,
- u16 buf_size, bool tx, bool auto_seq);
+ u16 buf_size, bool tx, bool auto_seq,
+ const struct ieee80211_addba_ext_ie *addbaext);
void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta,
enum ieee80211_agg_stop_reason reason);
void ieee80211_process_delba(struct ieee80211_sub_if_data *sdata,
@@ -1869,6 +1877,13 @@ ieee80211_he_cap_ie_to_sta_he_cap(struct ieee80211_sub_if_data *sdata,
struct ieee80211_supported_band *sband,
const u8 *he_cap_ie, u8 he_cap_len,
struct sta_info *sta);
+void
+ieee80211_he_spr_ie_to_bss_conf(struct ieee80211_vif *vif,
+ const struct ieee80211_he_spr *he_spr_ie_elem);
+
+void
+ieee80211_he_op_ie_to_bss_conf(struct ieee80211_vif *vif,
+ const struct ieee80211_he_operation *he_op_ie_elem);
/* Spectrum management */
void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata,
@@ -2088,7 +2103,8 @@ void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata,
const u8 *da, const u8 *key, u8 key_len, u8 key_idx,
u32 tx_flags);
void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata,
- const u8 *bssid, u16 stype, u16 reason,
+ const u8 *da, const u8 *bssid,
+ u16 stype, u16 reason,
bool send_frame, u8 *frame_buf);
enum {
@@ -2133,9 +2149,11 @@ u8 *ieee80211_ie_build_vht_cap(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap,
u32 cap);
u8 *ieee80211_ie_build_vht_oper(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap,
const struct cfg80211_chan_def *chandef);
+u8 ieee80211_ie_len_he_cap(struct ieee80211_sub_if_data *sdata, u8 iftype);
u8 *ieee80211_ie_build_he_cap(u8 *pos,
const struct ieee80211_sta_he_cap *he_cap,
u8 *end);
+u8 *ieee80211_ie_build_he_oper(u8 *pos);
int ieee80211_parse_bitrates(struct cfg80211_chan_def *chandef,
const struct ieee80211_supported_band *sband,
const u8 *srates, int srates_len, u32 *rates);
@@ -2235,6 +2253,10 @@ const char *ieee80211_get_reason_code_string(u16 reason_code);
extern const struct ethtool_ops ieee80211_ethtool_ops;
+u32 ieee80211_calc_expected_tx_airtime(struct ieee80211_hw *hw,
+ struct ieee80211_vif *vif,
+ struct ieee80211_sta *pubsta,
+ int len);
#ifdef CONFIG_MAC80211_NOINLINE
#define debug_noinline noinline
#else
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 8dc6580e1787..af8b09214786 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -1876,7 +1876,7 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name,
/* MTU range: 256 - 2304 */
ndev->min_mtu = 256;
- ndev->max_mtu = IEEE80211_MAX_DATA_LEN;
+ ndev->max_mtu = local->hw.max_mtu;
ret = register_netdevice(ndev);
if (ret) {
diff --git a/net/mac80211/key.c b/net/mac80211/key.c
index dd60f6428049..0f889b919b06 100644
--- a/net/mac80211/key.c
+++ b/net/mac80211/key.c
@@ -6,6 +6,7 @@
* Copyright 2007-2008 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
* Copyright 2015-2017 Intel Deutschland GmbH
+ * Copyright 2018-2019 Intel Corporation
*/
#include <linux/if_ether.h>
@@ -270,7 +271,7 @@ int ieee80211_set_tx_key(struct ieee80211_key *key)
sta->ptk_idx = key->conf.keyidx;
- if (ieee80211_hw_check(&local->hw, NO_AMPDU_KEYBORDER_SUPPORT))
+ if (!ieee80211_hw_check(&local->hw, AMPDU_KEYBORDER_SUPPORT))
clear_sta_flag(sta, WLAN_STA_BLOCK_BA);
ieee80211_check_fast_xmit(sta);
@@ -290,15 +291,15 @@ static void ieee80211_pairwise_rekey(struct ieee80211_key *old,
/* Extended Key ID key install, initial one or rekey */
if (sta->ptk_idx != INVALID_PTK_KEYIDX &&
- ieee80211_hw_check(&local->hw,
- NO_AMPDU_KEYBORDER_SUPPORT)) {
+ !ieee80211_hw_check(&local->hw, AMPDU_KEYBORDER_SUPPORT)) {
/* Aggregation Sessions with Extended Key ID must not
* mix MPDUs with different keyIDs within one A-MPDU.
- * Tear down any running Tx aggregation and all new
- * Rx/Tx aggregation request during rekey if the driver
- * asks us to do so. (Blocking Tx only would be
- * sufficient but WLAN_STA_BLOCK_BA gets the job done
- * for the few ms we need it.)
+ * Tear down running Tx aggregation sessions and block
+ * new Rx/Tx aggregation requests during rekey to
+ * ensure there are no A-MPDUs when the driver is not
+ * supporting A-MPDU key borders. (Blocking Tx only
+ * would be sufficient but WLAN_STA_BLOCK_BA gets the
+ * job done for the few ms we need it.)
*/
set_sta_flag(sta, WLAN_STA_BLOCK_BA);
mutex_lock(&sta->ampdu_mlme.mtx);
@@ -781,9 +782,8 @@ int ieee80211_key_link(struct ieee80211_key *key,
/* The rekey code assumes that the old and new key are using
* the same cipher. Enforce the assumption for pairwise keys.
*/
- if (key &&
- ((alt_key && alt_key->conf.cipher != key->conf.cipher) ||
- (old_key && old_key->conf.cipher != key->conf.cipher)))
+ if ((alt_key && alt_key->conf.cipher != key->conf.cipher) ||
+ (old_key && old_key->conf.cipher != key->conf.cipher))
goto out;
} else if (sta) {
old_key = key_mtx_dereference(sdata->local, sta->gtk[idx]);
@@ -793,7 +793,7 @@ int ieee80211_key_link(struct ieee80211_key *key,
/* Non-pairwise keys must also not switch the cipher on rekey */
if (!pairwise) {
- if (key && old_key && old_key->conf.cipher != key->conf.cipher)
+ if (old_key && old_key->conf.cipher != key->conf.cipher)
goto out;
}
@@ -843,46 +843,30 @@ void ieee80211_key_free(struct ieee80211_key *key, bool delay_tailroom)
ieee80211_key_destroy(key, delay_tailroom);
}
-void ieee80211_enable_keys(struct ieee80211_sub_if_data *sdata)
+void ieee80211_reenable_keys(struct ieee80211_sub_if_data *sdata)
{
struct ieee80211_key *key;
struct ieee80211_sub_if_data *vlan;
ASSERT_RTNL();
- if (WARN_ON(!ieee80211_sdata_running(sdata)))
- return;
-
- mutex_lock(&sdata->local->key_mtx);
-
- WARN_ON_ONCE(sdata->crypto_tx_tailroom_needed_cnt ||
- sdata->crypto_tx_tailroom_pending_dec);
-
- if (sdata->vif.type == NL80211_IFTYPE_AP) {
- list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list)
- WARN_ON_ONCE(vlan->crypto_tx_tailroom_needed_cnt ||
- vlan->crypto_tx_tailroom_pending_dec);
- }
-
- list_for_each_entry(key, &sdata->key_list, list) {
- increment_tailroom_need_count(sdata);
- ieee80211_key_enable_hw_accel(key);
- }
-
- mutex_unlock(&sdata->local->key_mtx);
-}
-
-void ieee80211_reset_crypto_tx_tailroom(struct ieee80211_sub_if_data *sdata)
-{
- struct ieee80211_sub_if_data *vlan;
-
mutex_lock(&sdata->local->key_mtx);
sdata->crypto_tx_tailroom_needed_cnt = 0;
+ sdata->crypto_tx_tailroom_pending_dec = 0;
if (sdata->vif.type == NL80211_IFTYPE_AP) {
- list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list)
+ list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) {
vlan->crypto_tx_tailroom_needed_cnt = 0;
+ vlan->crypto_tx_tailroom_pending_dec = 0;
+ }
+ }
+
+ if (ieee80211_sdata_running(sdata)) {
+ list_for_each_entry(key, &sdata->key_list, list) {
+ increment_tailroom_need_count(sdata);
+ ieee80211_key_enable_hw_accel(key);
+ }
}
mutex_unlock(&sdata->local->key_mtx);
diff --git a/net/mac80211/key.h b/net/mac80211/key.h
index b8b9cd743bf4..d6d6e89cf7dd 100644
--- a/net/mac80211/key.h
+++ b/net/mac80211/key.h
@@ -2,6 +2,7 @@
/*
* Copyright 2002-2004, Instant802 Networks, Inc.
* Copyright 2005, Devicescape Software, Inc.
+ * Copyright (C) 2019 Intel Corporation
*/
#ifndef IEEE80211_KEY_H
@@ -156,8 +157,7 @@ void ieee80211_free_keys(struct ieee80211_sub_if_data *sdata,
bool force_synchronize);
void ieee80211_free_sta_keys(struct ieee80211_local *local,
struct sta_info *sta);
-void ieee80211_enable_keys(struct ieee80211_sub_if_data *sdata);
-void ieee80211_reset_crypto_tx_tailroom(struct ieee80211_sub_if_data *sdata);
+void ieee80211_reenable_keys(struct ieee80211_sub_if_data *sdata);
#define key_mtx_dereference(local, ref) \
rcu_dereference_protected(ref, lockdep_is_held(&((local)->key_mtx)))
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 4c2702f128f3..4c2b5ba3ac09 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -639,6 +639,7 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len,
IEEE80211_RADIOTAP_VHT_KNOWN_BANDWIDTH;
local->hw.uapsd_queues = IEEE80211_DEFAULT_UAPSD_QUEUES;
local->hw.uapsd_max_sp_len = IEEE80211_DEFAULT_MAX_SP_LEN;
+ local->hw.max_mtu = IEEE80211_MAX_DATA_LEN;
local->user_power_level = IEEE80211_UNSET_POWER_LEVEL;
wiphy->ht_capa_mod_mask = &mac80211_ht_capa_mod_mask;
wiphy->vht_capa_mod_mask = &mac80211_vht_capa_mod_mask;
@@ -666,8 +667,14 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len,
for (i = 0; i < IEEE80211_NUM_ACS; i++) {
INIT_LIST_HEAD(&local->active_txqs[i]);
spin_lock_init(&local->active_txq_lock[i]);
+ local->aql_txq_limit_low[i] = IEEE80211_DEFAULT_AQL_TXQ_LIMIT_L;
+ local->aql_txq_limit_high[i] =
+ IEEE80211_DEFAULT_AQL_TXQ_LIMIT_H;
}
+
local->airtime_flags = AIRTIME_USE_TX | AIRTIME_USE_RX;
+ local->aql_threshold = IEEE80211_AQL_THRESHOLD;
+ atomic_set(&local->aql_total_pending_airtime, 0);
INIT_LIST_HEAD(&local->chanctx_list);
mutex_init(&local->chanctx_mtx);
@@ -1048,21 +1055,15 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
}
}
- /* Enable Extended Key IDs when driver allowed it, or when it
- * supports neither HW crypto nor A-MPDUs
- */
- if ((!local->ops->set_key &&
- !ieee80211_hw_check(hw, AMPDU_AGGREGATION)) ||
- ieee80211_hw_check(&local->hw, EXT_KEY_ID_NATIVE))
- wiphy_ext_feature_set(local->hw.wiphy,
- NL80211_EXT_FEATURE_EXT_KEY_ID);
-
- /* Mac80211 and therefore all cards only using SW crypto are able to
- * handle PTK rekeys correctly
+ /* Mac80211 and therefore all drivers using SW crypto only
+ * are able to handle PTK rekeys and Extended Key ID.
*/
- if (!local->ops->set_key)
+ if (!local->ops->set_key) {
wiphy_ext_feature_set(local->hw.wiphy,
NL80211_EXT_FEATURE_CAN_REPLACE_PTK0);
+ wiphy_ext_feature_set(local->hw.wiphy,
+ NL80211_EXT_FEATURE_EXT_KEY_ID);
+ }
/*
* Calculate scan IE length -- we need this to alloc
@@ -1297,8 +1298,8 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
ieee80211_remove_interfaces(local);
fail_rate:
rtnl_unlock();
- ieee80211_led_exit(local);
fail_flows:
+ ieee80211_led_exit(local);
destroy_workqueue(local->workqueue);
fail_workqueue:
wiphy_unregister(local->hw.wiphy);
diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
index 2e7fa743c892..d09b3c789314 100644
--- a/net/mac80211/mesh.c
+++ b/net/mac80211/mesh.c
@@ -532,6 +532,61 @@ int mesh_add_vht_oper_ie(struct ieee80211_sub_if_data *sdata,
return 0;
}
+int mesh_add_he_cap_ie(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb, u8 ie_len)
+{
+ const struct ieee80211_sta_he_cap *he_cap;
+ struct ieee80211_supported_band *sband;
+ u8 *pos;
+
+ sband = ieee80211_get_sband(sdata);
+ if (!sband)
+ return -EINVAL;
+
+ he_cap = ieee80211_get_he_iftype_cap(sband, NL80211_IFTYPE_MESH_POINT);
+
+ if (!he_cap ||
+ sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT ||
+ sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_5 ||
+ sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_10)
+ return 0;
+
+ if (skb_tailroom(skb) < ie_len)
+ return -ENOMEM;
+
+ pos = skb_put(skb, ie_len);
+ ieee80211_ie_build_he_cap(pos, he_cap, pos + ie_len);
+
+ return 0;
+}
+
+int mesh_add_he_oper_ie(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb)
+{
+ const struct ieee80211_sta_he_cap *he_cap;
+ struct ieee80211_supported_band *sband;
+ u8 *pos;
+
+ sband = ieee80211_get_sband(sdata);
+ if (!sband)
+ return -EINVAL;
+
+ he_cap = ieee80211_get_he_iftype_cap(sband, NL80211_IFTYPE_MESH_POINT);
+ if (!he_cap ||
+ sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT ||
+ sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_5 ||
+ sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_10)
+ return 0;
+
+ if (skb_tailroom(skb) < 2 + 1 + sizeof(struct ieee80211_he_operation))
+ return -ENOMEM;
+
+ pos = skb_put(skb, 2 + 1 + sizeof(struct ieee80211_he_operation));
+ ieee80211_ie_build_he_oper(pos);
+
+ return 0;
+}
+
static void ieee80211_mesh_path_timer(struct timer_list *t)
{
struct ieee80211_sub_if_data *sdata =
@@ -677,6 +732,7 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh)
struct ieee80211_chanctx_conf *chanctx_conf;
struct mesh_csa_settings *csa;
enum nl80211_band band;
+ u8 ie_len_he_cap;
u8 *pos;
struct ieee80211_sub_if_data *sdata;
int hdr_len = offsetofend(struct ieee80211_mgmt, u.beacon);
@@ -687,6 +743,8 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh)
band = chanctx_conf->def.chan->band;
rcu_read_unlock();
+ ie_len_he_cap = ieee80211_ie_len_he_cap(sdata,
+ NL80211_IFTYPE_MESH_POINT);
head_len = hdr_len +
2 + /* NULL SSID */
/* Channel Switch Announcement */
@@ -706,6 +764,8 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh)
2 + sizeof(__le16) + /* awake window */
2 + sizeof(struct ieee80211_vht_cap) +
2 + sizeof(struct ieee80211_vht_operation) +
+ ie_len_he_cap +
+ 2 + 1 + sizeof(struct ieee80211_he_operation) +
ifmsh->ie_len;
bcn = kzalloc(sizeof(*bcn) + head_len + tail_len, GFP_KERNEL);
@@ -823,6 +883,8 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh)
mesh_add_awake_window_ie(sdata, skb) ||
mesh_add_vht_cap_ie(sdata, skb) ||
mesh_add_vht_oper_ie(sdata, skb) ||
+ mesh_add_he_cap_ie(sdata, skb, ie_len_he_cap) ||
+ mesh_add_he_oper_ie(sdata, skb) ||
mesh_add_vendor_ies(sdata, skb))
goto out_free;
diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h
index 94d57cce70da..953f720754e8 100644
--- a/net/mac80211/mesh.h
+++ b/net/mac80211/mesh.h
@@ -218,6 +218,10 @@ int mesh_add_vht_cap_ie(struct ieee80211_sub_if_data *sdata,
struct sk_buff *skb);
int mesh_add_vht_oper_ie(struct ieee80211_sub_if_data *sdata,
struct sk_buff *skb);
+int mesh_add_he_cap_ie(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb, u8 ie_len);
+int mesh_add_he_oper_ie(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb);
void mesh_rmc_free(struct ieee80211_sub_if_data *sdata);
int mesh_rmc_init(struct ieee80211_sub_if_data *sdata);
void ieee80211s_init(void);
diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c
index 68af62306385..d69983370381 100644
--- a/net/mac80211/mesh_hwmp.c
+++ b/net/mac80211/mesh_hwmp.c
@@ -328,6 +328,9 @@ u32 airtime_link_metric_get(struct ieee80211_local *local,
unsigned long fail_avg =
ewma_mesh_fail_avg_read(&sta->mesh->fail_avg);
+ if (sta->mesh->plink_state != NL80211_PLINK_ESTAB)
+ return MAX_METRIC;
+
/* Try to get rate based on HW/SW RC algorithm.
* Rate is returned in units of Kbps, correct this
* to comply with airtime calculation units
diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c
index dd3aefd052a9..737c5f4dbf52 100644
--- a/net/mac80211/mesh_plink.c
+++ b/net/mac80211/mesh_plink.c
@@ -218,9 +218,12 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata,
bool include_plid = false;
u16 peering_proto = 0;
u8 *pos, ie_len = 4;
+ u8 ie_len_he_cap;
int hdr_len = offsetofend(struct ieee80211_mgmt, u.action.u.self_prot);
int err = -ENOMEM;
+ ie_len_he_cap = ieee80211_ie_len_he_cap(sdata,
+ NL80211_IFTYPE_MESH_POINT);
skb = dev_alloc_skb(local->tx_headroom +
hdr_len +
2 + /* capability info */
@@ -233,6 +236,8 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata,
2 + sizeof(struct ieee80211_ht_operation) +
2 + sizeof(struct ieee80211_vht_cap) +
2 + sizeof(struct ieee80211_vht_operation) +
+ ie_len_he_cap +
+ 2 + 1 + sizeof(struct ieee80211_he_operation) +
2 + 8 + /* peering IE */
sdata->u.mesh.ie_len);
if (!skb)
@@ -321,7 +326,9 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata,
if (mesh_add_ht_cap_ie(sdata, skb) ||
mesh_add_ht_oper_ie(sdata, skb) ||
mesh_add_vht_cap_ie(sdata, skb) ||
- mesh_add_vht_oper_ie(sdata, skb))
+ mesh_add_vht_oper_ie(sdata, skb) ||
+ mesh_add_he_cap_ie(sdata, skb, ie_len_he_cap) ||
+ mesh_add_he_oper_ie(sdata, skb))
goto free;
}
@@ -433,6 +440,9 @@ static void mesh_sta_info_init(struct ieee80211_sub_if_data *sdata,
ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband,
elems->vht_cap_elem, sta);
+ ieee80211_he_cap_ie_to_sta_he_cap(sdata, sband, elems->he_cap,
+ elems->he_cap_len, sta);
+
if (bw != sta->sta.bandwidth)
changed |= IEEE80211_RC_BW_CHANGED;
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 4c888dc9bd81..e041af2f021a 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -8,7 +8,7 @@
* Copyright 2007, Michael Wu <flamingice@sourmilk.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
* Copyright (C) 2015 - 2017 Intel Deutschland GmbH
- * Copyright (C) 2018 - 2019 Intel Corporation
+ * Copyright (C) 2018 - 2020 Intel Corporation
*/
#include <linux/delay.h>
@@ -158,10 +158,10 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata,
memcpy(&sta_ht_cap, &sband->ht_cap, sizeof(sta_ht_cap));
ieee80211_apply_htcap_overrides(sdata, &sta_ht_cap);
+ memset(chandef, 0, sizeof(struct cfg80211_chan_def));
chandef->chan = channel;
chandef->width = NL80211_CHAN_WIDTH_20_NOHT;
chandef->center_freq1 = channel->center_freq;
- chandef->center_freq2 = 0;
if (!ht_oper || !sta_ht_cap.ht_supported) {
ret = IEEE80211_STA_DISABLE_HT | IEEE80211_STA_DISABLE_VHT;
@@ -1311,7 +1311,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata,
if (!res) {
ch_switch.timestamp = timestamp;
ch_switch.device_timestamp = device_timestamp;
- ch_switch.block_tx = beacon ? csa_ie.mode : 0;
+ ch_switch.block_tx = csa_ie.mode;
ch_switch.chandef = csa_ie.chandef;
ch_switch.count = csa_ie.count;
ch_switch.delay = csa_ie.max_switch_time;
@@ -1404,7 +1404,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata,
sdata->vif.csa_active = true;
sdata->csa_chandef = csa_ie.chandef;
- sdata->csa_block_tx = ch_switch.block_tx;
+ sdata->csa_block_tx = csa_ie.mode;
ifmgd->csa_ignored_same_chan = false;
if (sdata->csa_block_tx)
@@ -1438,7 +1438,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata,
* reset when the disconnection worker runs.
*/
sdata->vif.csa_active = true;
- sdata->csa_block_tx = ch_switch.block_tx;
+ sdata->csa_block_tx = csa_ie.mode;
ieee80211_queue_work(&local->hw, &ifmgd->csa_connection_drop_work);
mutex_unlock(&local->chanctx_mtx);
@@ -2278,8 +2278,9 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
!ifmgd->have_beacon)
drv_mgd_prepare_tx(sdata->local, sdata, 0);
- ieee80211_send_deauth_disassoc(sdata, ifmgd->bssid, stype,
- reason, tx, frame_buf);
+ ieee80211_send_deauth_disassoc(sdata, ifmgd->bssid,
+ ifmgd->bssid, stype, reason,
+ tx, frame_buf);
}
/* flush out frame - make sure the deauth was actually sent */
@@ -2522,7 +2523,10 @@ static void ieee80211_mgd_probe_ap_send(struct ieee80211_sub_if_data *sdata)
if (ieee80211_hw_check(&sdata->local->hw, REPORTS_TX_ACK_STATUS)) {
ifmgd->nullfunc_failed = false;
- ieee80211_send_nullfunc(sdata->local, sdata, false);
+ if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE))
+ ifmgd->probe_send_count--;
+ else
+ ieee80211_send_nullfunc(sdata->local, sdata, false);
} else {
int ssid_len;
@@ -2629,7 +2633,8 @@ struct sk_buff *ieee80211_ap_probereq_get(struct ieee80211_hw *hw,
rcu_read_lock();
ssid = ieee80211_bss_get_ie(cbss, WLAN_EID_SSID);
- if (WARN_ON_ONCE(ssid == NULL))
+ if (WARN_ONCE(!ssid || ssid[1] > IEEE80211_MAX_SSID_LEN,
+ "invalid SSID element (len=%d)", ssid ? ssid[1] : -1))
ssid_len = 0;
else
ssid_len = ssid[1];
@@ -3181,15 +3186,14 @@ static int ieee80211_recalc_twt_req(struct ieee80211_sub_if_data *sdata,
static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
struct cfg80211_bss *cbss,
- struct ieee80211_mgmt *mgmt, size_t len)
+ struct ieee80211_mgmt *mgmt, size_t len,
+ struct ieee802_11_elems *elems)
{
struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
struct ieee80211_local *local = sdata->local;
struct ieee80211_supported_band *sband;
struct sta_info *sta;
- u8 *pos;
u16 capab_info, aid;
- struct ieee802_11_elems elems;
struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf;
const struct cfg80211_bss_ies *bss_ies = NULL;
struct ieee80211_mgd_assoc_data *assoc_data = ifmgd->assoc_data;
@@ -3217,19 +3221,15 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
ifmgd->broken_ap = true;
}
- pos = mgmt->u.assoc_resp.variable;
- ieee802_11_parse_elems(pos, len - (pos - (u8 *)mgmt), false, &elems,
- mgmt->bssid, assoc_data->bss->bssid);
-
- if (!elems.supp_rates) {
+ if (!elems->supp_rates) {
sdata_info(sdata, "no SuppRates element in AssocResp\n");
return false;
}
ifmgd->aid = aid;
ifmgd->tdls_chan_switch_prohibited =
- elems.ext_capab && elems.ext_capab_len >= 5 &&
- (elems.ext_capab[4] & WLAN_EXT_CAPA5_TDLS_CH_SW_PROHIBITED);
+ elems->ext_capab && elems->ext_capab_len >= 5 &&
+ (elems->ext_capab[4] & WLAN_EXT_CAPA5_TDLS_CH_SW_PROHIBITED);
/*
* Some APs are erroneously not including some information in their
@@ -3238,11 +3238,11 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
* 2G/3G/4G wifi routers, reported models include the "Onda PN51T",
* "Vodafone PocketWiFi 2", "ZTE MF60" and a similar T-Mobile device.
*/
- if ((assoc_data->wmm && !elems.wmm_param) ||
+ if ((assoc_data->wmm && !elems->wmm_param) ||
(!(ifmgd->flags & IEEE80211_STA_DISABLE_HT) &&
- (!elems.ht_cap_elem || !elems.ht_operation)) ||
+ (!elems->ht_cap_elem || !elems->ht_operation)) ||
(!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT) &&
- (!elems.vht_cap_elem || !elems.vht_operation))) {
+ (!elems->vht_cap_elem || !elems->vht_operation))) {
const struct cfg80211_bss_ies *ies;
struct ieee802_11_elems bss_elems;
@@ -3260,8 +3260,8 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
mgmt->bssid,
assoc_data->bss->bssid);
if (assoc_data->wmm &&
- !elems.wmm_param && bss_elems.wmm_param) {
- elems.wmm_param = bss_elems.wmm_param;
+ !elems->wmm_param && bss_elems.wmm_param) {
+ elems->wmm_param = bss_elems.wmm_param;
sdata_info(sdata,
"AP bug: WMM param missing from AssocResp\n");
}
@@ -3270,27 +3270,27 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
* Also check if we requested HT/VHT, otherwise the AP doesn't
* have to include the IEs in the (re)association response.
*/
- if (!elems.ht_cap_elem && bss_elems.ht_cap_elem &&
+ if (!elems->ht_cap_elem && bss_elems.ht_cap_elem &&
!(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) {
- elems.ht_cap_elem = bss_elems.ht_cap_elem;
+ elems->ht_cap_elem = bss_elems.ht_cap_elem;
sdata_info(sdata,
"AP bug: HT capability missing from AssocResp\n");
}
- if (!elems.ht_operation && bss_elems.ht_operation &&
+ if (!elems->ht_operation && bss_elems.ht_operation &&
!(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) {
- elems.ht_operation = bss_elems.ht_operation;
+ elems->ht_operation = bss_elems.ht_operation;
sdata_info(sdata,
"AP bug: HT operation missing from AssocResp\n");
}
- if (!elems.vht_cap_elem && bss_elems.vht_cap_elem &&
+ if (!elems->vht_cap_elem && bss_elems.vht_cap_elem &&
!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) {
- elems.vht_cap_elem = bss_elems.vht_cap_elem;
+ elems->vht_cap_elem = bss_elems.vht_cap_elem;
sdata_info(sdata,
"AP bug: VHT capa missing from AssocResp\n");
}
- if (!elems.vht_operation && bss_elems.vht_operation &&
+ if (!elems->vht_operation && bss_elems.vht_operation &&
!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) {
- elems.vht_operation = bss_elems.vht_operation;
+ elems->vht_operation = bss_elems.vht_operation;
sdata_info(sdata,
"AP bug: VHT operation missing from AssocResp\n");
}
@@ -3301,7 +3301,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
* they should be present here. This is just a safety net.
*/
if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT) &&
- (!elems.wmm_param || !elems.ht_cap_elem || !elems.ht_operation)) {
+ (!elems->wmm_param || !elems->ht_cap_elem || !elems->ht_operation)) {
sdata_info(sdata,
"HT AP is missing WMM params or HT capability/operation\n");
ret = false;
@@ -3309,7 +3309,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
}
if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT) &&
- (!elems.vht_cap_elem || !elems.vht_operation)) {
+ (!elems->vht_cap_elem || !elems->vht_operation)) {
sdata_info(sdata,
"VHT AP is missing VHT capability/operation\n");
ret = false;
@@ -3336,7 +3336,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
}
if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE) &&
- (!elems.he_cap || !elems.he_operation)) {
+ (!elems->he_cap || !elems->he_operation)) {
mutex_unlock(&sdata->local->sta_mtx);
sdata_info(sdata,
"HE AP is missing HE capability/operation\n");
@@ -3345,23 +3345,23 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
}
/* Set up internal HT/VHT capabilities */
- if (elems.ht_cap_elem && !(ifmgd->flags & IEEE80211_STA_DISABLE_HT))
+ if (elems->ht_cap_elem && !(ifmgd->flags & IEEE80211_STA_DISABLE_HT))
ieee80211_ht_cap_ie_to_sta_ht_cap(sdata, sband,
- elems.ht_cap_elem, sta);
+ elems->ht_cap_elem, sta);
- if (elems.vht_cap_elem && !(ifmgd->flags & IEEE80211_STA_DISABLE_VHT))
+ if (elems->vht_cap_elem && !(ifmgd->flags & IEEE80211_STA_DISABLE_VHT))
ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband,
- elems.vht_cap_elem, sta);
+ elems->vht_cap_elem, sta);
- if (elems.he_operation && !(ifmgd->flags & IEEE80211_STA_DISABLE_HE) &&
- elems.he_cap) {
+ if (elems->he_operation && !(ifmgd->flags & IEEE80211_STA_DISABLE_HE) &&
+ elems->he_cap) {
ieee80211_he_cap_ie_to_sta_he_cap(sdata, sband,
- elems.he_cap,
- elems.he_cap_len,
+ elems->he_cap,
+ elems->he_cap_len,
sta);
bss_conf->he_support = sta->sta.he_cap.has_he;
- changed |= ieee80211_recalc_twt_req(sdata, sta, &elems);
+ changed |= ieee80211_recalc_twt_req(sdata, sta, elems);
} else {
bss_conf->he_support = false;
bss_conf->twt_requester = false;
@@ -3369,14 +3369,14 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
if (bss_conf->he_support) {
bss_conf->bss_color =
- le32_get_bits(elems.he_operation->he_oper_params,
+ le32_get_bits(elems->he_operation->he_oper_params,
IEEE80211_HE_OPERATION_BSS_COLOR_MASK);
bss_conf->htc_trig_based_pkt_ext =
- le32_get_bits(elems.he_operation->he_oper_params,
+ le32_get_bits(elems->he_operation->he_oper_params,
IEEE80211_HE_OPERATION_DFLT_PE_DURATION_MASK);
bss_conf->frame_time_rts_th =
- le32_get_bits(elems.he_operation->he_oper_params,
+ le32_get_bits(elems->he_operation->he_oper_params,
IEEE80211_HE_OPERATION_RTS_THRESHOLD_MASK);
bss_conf->multi_sta_back_32bit =
@@ -3387,10 +3387,12 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
sta->sta.he_cap.he_cap_elem.mac_cap_info[2] &
IEEE80211_HE_MAC_CAP2_ACK_EN;
- bss_conf->uora_exists = !!elems.uora_element;
- if (elems.uora_element)
- bss_conf->uora_ocw_range = elems.uora_element[0];
+ bss_conf->uora_exists = !!elems->uora_element;
+ if (elems->uora_element)
+ bss_conf->uora_ocw_range = elems->uora_element[0];
+ ieee80211_he_op_ie_to_bss_conf(&sdata->vif, elems->he_operation);
+ ieee80211_he_spr_ie_to_bss_conf(&sdata->vif, elems->he_spr);
/* TODO: OPEN: what happens if BSS color disable is set? */
}
@@ -3414,11 +3416,11 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
* NSS calculation (that would be done in rate_control_rate_init())
* and use the # of streams from that element.
*/
- if (elems.opmode_notif &&
- !(*elems.opmode_notif & IEEE80211_OPMODE_NOTIF_RX_NSS_TYPE_BF)) {
+ if (elems->opmode_notif &&
+ !(*elems->opmode_notif & IEEE80211_OPMODE_NOTIF_RX_NSS_TYPE_BF)) {
u8 nss;
- nss = *elems.opmode_notif & IEEE80211_OPMODE_NOTIF_RX_NSS_MASK;
+ nss = *elems->opmode_notif & IEEE80211_OPMODE_NOTIF_RX_NSS_MASK;
nss >>= IEEE80211_OPMODE_NOTIF_RX_NSS_SHIFT;
nss += 1;
sta->sta.rx_nss = nss;
@@ -3433,7 +3435,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
sta->sta.mfp = false;
}
- sta->sta.wme = elems.wmm_param && local->hw.queues >= IEEE80211_NUM_ACS;
+ sta->sta.wme = elems->wmm_param && local->hw.queues >= IEEE80211_NUM_ACS;
err = sta_info_move_state(sta, IEEE80211_STA_ASSOC);
if (!err && !(ifmgd->flags & IEEE80211_STA_CONTROL_PORT))
@@ -3461,9 +3463,9 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
if (ifmgd->flags & IEEE80211_STA_DISABLE_WMM) {
ieee80211_set_wmm_default(sdata, false, false);
- } else if (!ieee80211_sta_wmm_params(local, sdata, elems.wmm_param,
- elems.wmm_param_len,
- elems.mu_edca_param_set)) {
+ } else if (!ieee80211_sta_wmm_params(local, sdata, elems->wmm_param,
+ elems->wmm_param_len,
+ elems->mu_edca_param_set)) {
/* still enable QoS since we might have HT/VHT */
ieee80211_set_wmm_default(sdata, false, true);
/* set the disable-WMM flag in this case to disable
@@ -3477,11 +3479,11 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
}
changed |= BSS_CHANGED_QOS;
- if (elems.max_idle_period_ie) {
+ if (elems->max_idle_period_ie) {
bss_conf->max_idle_period =
- le16_to_cpu(elems.max_idle_period_ie->max_idle_period);
+ le16_to_cpu(elems->max_idle_period_ie->max_idle_period);
bss_conf->protected_keep_alive =
- !!(elems.max_idle_period_ie->idle_options &
+ !!(elems->max_idle_period_ie->idle_options &
WLAN_IDLE_OPTIONS_PROTECTED_KEEP_ALIVE);
changed |= BSS_CHANGED_KEEP_ALIVE;
} else {
@@ -3591,7 +3593,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
event.u.mlme.reason = status_code;
drv_event_callback(sdata->local, sdata, &event);
} else {
- if (!ieee80211_assoc_success(sdata, bss, mgmt, len)) {
+ if (!ieee80211_assoc_success(sdata, bss, mgmt, len, &elems)) {
/* oops -- internal error -- send timeout for now */
ieee80211_destroy_assoc_data(sdata, false, false);
cfg80211_assoc_timeout(sdata->dev, bss);
@@ -4504,7 +4506,7 @@ void ieee80211_mgd_quiesce(struct ieee80211_sub_if_data *sdata)
* cfg80211 won't know and won't actually abort those attempts,
* thus we need to do that ourselves.
*/
- ieee80211_send_deauth_disassoc(sdata, bssid,
+ ieee80211_send_deauth_disassoc(sdata, bssid, bssid,
IEEE80211_STYPE_DEAUTH,
WLAN_REASON_DEAUTH_LEAVING,
false, frame_buf);
@@ -5227,7 +5229,7 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata,
rcu_read_lock();
ssidie = ieee80211_bss_get_ie(req->bss, WLAN_EID_SSID);
- if (!ssidie) {
+ if (!ssidie || ssidie[1] > sizeof(assoc_data->ssid)) {
rcu_read_unlock();
kfree(assoc_data);
return -EINVAL;
@@ -5291,7 +5293,7 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata,
ifmgd->flags |= IEEE80211_STA_DISABLE_VHT;
ifmgd->flags |= IEEE80211_STA_DISABLE_HE;
netdev_info(sdata->dev,
- "disabling HE/HT/VHT due to WEP/TKIP use\n");
+ "disabling HT/VHT/HE due to WEP/TKIP use\n");
}
}
@@ -5545,7 +5547,7 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata,
ieee80211_get_reason_code_string(req->reason_code));
drv_mgd_prepare_tx(sdata->local, sdata, 0);
- ieee80211_send_deauth_disassoc(sdata, req->bssid,
+ ieee80211_send_deauth_disassoc(sdata, req->bssid, req->bssid,
IEEE80211_STYPE_DEAUTH,
req->reason_code, tx,
frame_buf);
@@ -5565,7 +5567,7 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata,
ieee80211_get_reason_code_string(req->reason_code));
drv_mgd_prepare_tx(sdata->local, sdata, 0);
- ieee80211_send_deauth_disassoc(sdata, req->bssid,
+ ieee80211_send_deauth_disassoc(sdata, req->bssid, req->bssid,
IEEE80211_STYPE_DEAUTH,
req->reason_code, tx,
frame_buf);
diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c
index 60ef8972b254..c710504ccf1a 100644
--- a/net/mac80211/offchannel.c
+++ b/net/mac80211/offchannel.c
@@ -8,6 +8,7 @@
* Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
* Copyright 2007, Michael Wu <flamingice@sourmilk.net>
* Copyright 2009 Johannes Berg <johannes@sipsolutions.net>
+ * Copyright (C) 2019 Intel Corporation
*/
#include <linux/export.h>
#include <net/mac80211.h>
@@ -732,7 +733,7 @@ static int ieee80211_cancel_roc(struct ieee80211_local *local,
}
if (local->ops->remain_on_channel) {
- ret = drv_cancel_remain_on_channel(local);
+ ret = drv_cancel_remain_on_channel(local, roc->sdata);
if (WARN_ON_ONCE(ret)) {
mutex_unlock(&local->mtx);
return ret;
@@ -991,7 +992,7 @@ void ieee80211_roc_purge(struct ieee80211_local *local,
if (roc->started) {
if (local->ops->remain_on_channel) {
/* can race, so ignore return value */
- drv_cancel_remain_on_channel(local);
+ drv_cancel_remain_on_channel(local, sdata);
ieee80211_roc_notify_destroy(roc);
} else {
roc->abort = true;
diff --git a/net/mac80211/rate.h b/net/mac80211/rate.h
index 5d5348bc41ec..5397c6dad056 100644
--- a/net/mac80211/rate.h
+++ b/net/mac80211/rate.h
@@ -60,15 +60,6 @@ static inline void rate_control_add_sta_debugfs(struct sta_info *sta)
#endif
}
-static inline void rate_control_remove_sta_debugfs(struct sta_info *sta)
-{
-#ifdef CONFIG_MAC80211_DEBUGFS
- struct rate_control_ref *ref = sta->rate_ctrl;
- if (ref && ref->ops->remove_sta_debugfs)
- ref->ops->remove_sta_debugfs(ref->priv, sta->rate_ctrl_priv);
-#endif
-}
-
void ieee80211_check_rate_mask(struct ieee80211_sub_if_data *sdata);
/* Get a reference to the rate control algorithm. If `name' is NULL, get the
diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c
index ee86c3333999..86bc469a28bc 100644
--- a/net/mac80211/rc80211_minstrel.c
+++ b/net/mac80211/rc80211_minstrel.c
@@ -70,7 +70,7 @@ rix_to_ndx(struct minstrel_sta_info *mi, int rix)
}
/* return current EMWA throughput */
-int minstrel_get_tp_avg(struct minstrel_rate *mr, int prob_ewma)
+int minstrel_get_tp_avg(struct minstrel_rate *mr, int prob_avg)
{
int usecs;
@@ -79,13 +79,13 @@ int minstrel_get_tp_avg(struct minstrel_rate *mr, int prob_ewma)
usecs = 1000000;
/* reset thr. below 10% success */
- if (mr->stats.prob_ewma < MINSTREL_FRAC(10, 100))
+ if (mr->stats.prob_avg < MINSTREL_FRAC(10, 100))
return 0;
- if (prob_ewma > MINSTREL_FRAC(90, 100))
+ if (prob_avg > MINSTREL_FRAC(90, 100))
return MINSTREL_TRUNC(100000 * (MINSTREL_FRAC(90, 100) / usecs));
else
- return MINSTREL_TRUNC(100000 * (prob_ewma / usecs));
+ return MINSTREL_TRUNC(100000 * (prob_avg / usecs));
}
/* find & sort topmost throughput rates */
@@ -98,8 +98,8 @@ minstrel_sort_best_tp_rates(struct minstrel_sta_info *mi, int i, u8 *tp_list)
for (j = MAX_THR_RATES; j > 0; --j) {
tmp_mrs = &mi->r[tp_list[j - 1]].stats;
- if (minstrel_get_tp_avg(&mi->r[i], cur_mrs->prob_ewma) <=
- minstrel_get_tp_avg(&mi->r[tp_list[j - 1]], tmp_mrs->prob_ewma))
+ if (minstrel_get_tp_avg(&mi->r[i], cur_mrs->prob_avg) <=
+ minstrel_get_tp_avg(&mi->r[tp_list[j - 1]], tmp_mrs->prob_avg))
break;
}
@@ -157,20 +157,24 @@ minstrel_update_rates(struct minstrel_priv *mp, struct minstrel_sta_info *mi)
* Recalculate statistics and counters of a given rate
*/
void
-minstrel_calc_rate_stats(struct minstrel_rate_stats *mrs)
+minstrel_calc_rate_stats(struct minstrel_priv *mp,
+ struct minstrel_rate_stats *mrs)
{
unsigned int cur_prob;
if (unlikely(mrs->attempts > 0)) {
mrs->sample_skipped = 0;
cur_prob = MINSTREL_FRAC(mrs->success, mrs->attempts);
- if (unlikely(!mrs->att_hist)) {
- mrs->prob_ewma = cur_prob;
+ if (mp->new_avg) {
+ minstrel_filter_avg_add(&mrs->prob_avg,
+ &mrs->prob_avg_1, cur_prob);
+ } else if (unlikely(!mrs->att_hist)) {
+ mrs->prob_avg = cur_prob;
} else {
/*update exponential weighted moving avarage */
- mrs->prob_ewma = minstrel_ewma(mrs->prob_ewma,
- cur_prob,
- EWMA_LEVEL);
+ mrs->prob_avg = minstrel_ewma(mrs->prob_avg,
+ cur_prob,
+ EWMA_LEVEL);
}
mrs->att_hist += mrs->attempts;
mrs->succ_hist += mrs->success;
@@ -200,12 +204,12 @@ minstrel_update_stats(struct minstrel_priv *mp, struct minstrel_sta_info *mi)
struct minstrel_rate_stats *tmp_mrs = &mi->r[tmp_prob_rate].stats;
/* Update statistics of success probability per rate */
- minstrel_calc_rate_stats(mrs);
+ minstrel_calc_rate_stats(mp, mrs);
/* Sample less often below the 10% chance of success.
* Sample less often above the 95% chance of success. */
- if (mrs->prob_ewma > MINSTREL_FRAC(95, 100) ||
- mrs->prob_ewma < MINSTREL_FRAC(10, 100)) {
+ if (mrs->prob_avg > MINSTREL_FRAC(95, 100) ||
+ mrs->prob_avg < MINSTREL_FRAC(10, 100)) {
mr->adjusted_retry_count = mrs->retry_count >> 1;
if (mr->adjusted_retry_count > 2)
mr->adjusted_retry_count = 2;
@@ -225,14 +229,14 @@ minstrel_update_stats(struct minstrel_priv *mp, struct minstrel_sta_info *mi)
* choose the maximum throughput rate as max_prob_rate
* (2) if all success probabilities < 95%, the rate with
* highest success probability is chosen as max_prob_rate */
- if (mrs->prob_ewma >= MINSTREL_FRAC(95, 100)) {
- tmp_cur_tp = minstrel_get_tp_avg(mr, mrs->prob_ewma);
+ if (mrs->prob_avg >= MINSTREL_FRAC(95, 100)) {
+ tmp_cur_tp = minstrel_get_tp_avg(mr, mrs->prob_avg);
tmp_prob_tp = minstrel_get_tp_avg(&mi->r[tmp_prob_rate],
- tmp_mrs->prob_ewma);
+ tmp_mrs->prob_avg);
if (tmp_cur_tp >= tmp_prob_tp)
tmp_prob_rate = i;
} else {
- if (mrs->prob_ewma >= tmp_mrs->prob_ewma)
+ if (mrs->prob_avg >= tmp_mrs->prob_avg)
tmp_prob_rate = i;
}
}
@@ -290,7 +294,7 @@ minstrel_tx_status(void *priv, struct ieee80211_supported_band *sband,
mi->sample_deferred--;
if (time_after(jiffies, mi->last_stats_update +
- (mp->update_interval * HZ) / 1000))
+ mp->update_interval / (mp->new_avg ? 2 : 1)))
minstrel_update_stats(mp, mi);
}
@@ -422,7 +426,7 @@ minstrel_get_rate(void *priv, struct ieee80211_sta *sta,
* has a probability of >95%, we shouldn't be attempting
* to use it, as this only wastes precious airtime */
if (!mrr_capable &&
- (mi->r[ndx].stats.prob_ewma > MINSTREL_FRAC(95, 100)))
+ (mi->r[ndx].stats.prob_avg > MINSTREL_FRAC(95, 100)))
return;
mi->prev_sample = true;
@@ -573,7 +577,7 @@ static u32 minstrel_get_expected_throughput(void *priv_sta)
* computing cur_tp
*/
tmp_mrs = &mi->r[idx].stats;
- tmp_cur_tp = minstrel_get_tp_avg(&mi->r[idx], tmp_mrs->prob_ewma) * 10;
+ tmp_cur_tp = minstrel_get_tp_avg(&mi->r[idx], tmp_mrs->prob_avg) * 10;
tmp_cur_tp = tmp_cur_tp * 1200 * 8 / 1024;
return tmp_cur_tp;
diff --git a/net/mac80211/rc80211_minstrel.h b/net/mac80211/rc80211_minstrel.h
index 3c96a853adbd..dbb43bcd3c45 100644
--- a/net/mac80211/rc80211_minstrel.h
+++ b/net/mac80211/rc80211_minstrel.h
@@ -19,6 +19,21 @@
#define MAX_THR_RATES 4
/*
+ * Coefficients for moving average with noise filter (period=16),
+ * scaled by 10 bits
+ *
+ * a1 = exp(-pi * sqrt(2) / period)
+ * coeff2 = 2 * a1 * cos(sqrt(2) * 2 * pi / period)
+ * coeff3 = -sqr(a1)
+ * coeff1 = 1 - coeff2 - coeff3
+ */
+#define MINSTREL_AVG_COEFF1 (MINSTREL_FRAC(1, 1) - \
+ MINSTREL_AVG_COEFF2 - \
+ MINSTREL_AVG_COEFF3)
+#define MINSTREL_AVG_COEFF2 0x00001499
+#define MINSTREL_AVG_COEFF3 -0x0000092e
+
+/*
* Perform EWMA (Exponentially Weighted Moving Average) calculation
*/
static inline int
@@ -32,6 +47,37 @@ minstrel_ewma(int old, int new, int weight)
return old + incr;
}
+static inline int minstrel_filter_avg_add(u16 *prev_1, u16 *prev_2, s32 in)
+{
+ s32 out_1 = *prev_1;
+ s32 out_2 = *prev_2;
+ s32 val;
+
+ if (!in)
+ in += 1;
+
+ if (!out_1) {
+ val = out_1 = in;
+ goto out;
+ }
+
+ val = MINSTREL_AVG_COEFF1 * in;
+ val += MINSTREL_AVG_COEFF2 * out_1;
+ val += MINSTREL_AVG_COEFF3 * out_2;
+ val >>= MINSTREL_SCALE;
+
+ if (val > 1 << MINSTREL_SCALE)
+ val = 1 << MINSTREL_SCALE;
+ if (val < 0)
+ val = 1;
+
+out:
+ *prev_2 = out_1;
+ *prev_1 = val;
+
+ return val;
+}
+
struct minstrel_rate_stats {
/* current / last sampling period attempts/success counters */
u16 attempts, last_attempts;
@@ -40,8 +86,9 @@ struct minstrel_rate_stats {
/* total attempts/success counters */
u32 att_hist, succ_hist;
- /* prob_ewma - exponential weighted moving average of prob */
- u16 prob_ewma;
+ /* prob_avg - moving average of prob */
+ u16 prob_avg;
+ u16 prob_avg_1;
/* maximum retry counts */
u8 retry_count;
@@ -95,6 +142,8 @@ struct minstrel_sta_info {
struct minstrel_priv {
struct ieee80211_hw *hw;
bool has_mrr;
+ bool new_avg;
+ u32 sample_switch;
unsigned int cw_min;
unsigned int cw_max;
unsigned int max_retry;
@@ -125,8 +174,9 @@ extern const struct rate_control_ops mac80211_minstrel;
void minstrel_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir);
/* Recalculate success probabilities and counters for a given rate using EWMA */
-void minstrel_calc_rate_stats(struct minstrel_rate_stats *mrs);
-int minstrel_get_tp_avg(struct minstrel_rate *mr, int prob_ewma);
+void minstrel_calc_rate_stats(struct minstrel_priv *mp,
+ struct minstrel_rate_stats *mrs);
+int minstrel_get_tp_avg(struct minstrel_rate *mr, int prob_avg);
/* debugfs */
int minstrel_stats_open(struct inode *inode, struct file *file);
diff --git a/net/mac80211/rc80211_minstrel_debugfs.c b/net/mac80211/rc80211_minstrel_debugfs.c
index c8afd85b51a0..9b8e0daeb7bb 100644
--- a/net/mac80211/rc80211_minstrel_debugfs.c
+++ b/net/mac80211/rc80211_minstrel_debugfs.c
@@ -90,8 +90,8 @@ minstrel_stats_open(struct inode *inode, struct file *file)
p += sprintf(p, "%6u ", mr->perfect_tx_time);
tp_max = minstrel_get_tp_avg(mr, MINSTREL_FRAC(100,100));
- tp_avg = minstrel_get_tp_avg(mr, mrs->prob_ewma);
- eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000);
+ tp_avg = minstrel_get_tp_avg(mr, mrs->prob_avg);
+ eprob = MINSTREL_TRUNC(mrs->prob_avg * 1000);
p += sprintf(p, "%4u.%1u %4u.%1u %3u.%1u"
" %3u %3u %-3u "
@@ -147,8 +147,8 @@ minstrel_stats_csv_open(struct inode *inode, struct file *file)
p += sprintf(p, "%u,",mr->perfect_tx_time);
tp_max = minstrel_get_tp_avg(mr, MINSTREL_FRAC(100,100));
- tp_avg = minstrel_get_tp_avg(mr, mrs->prob_ewma);
- eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000);
+ tp_avg = minstrel_get_tp_avg(mr, mrs->prob_avg);
+ eprob = MINSTREL_TRUNC(mrs->prob_avg * 1000);
p += sprintf(p, "%u.%u,%u.%u,%u.%u,%u,%u,%u,"
"%llu,%llu,%d,%d\n",
diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c
index 5a882da82f0e..694a31978a04 100644
--- a/net/mac80211/rc80211_minstrel_ht.c
+++ b/net/mac80211/rc80211_minstrel_ht.c
@@ -18,6 +18,8 @@
#define AVG_AMPDU_SIZE 16
#define AVG_PKT_SIZE 1200
+#define SAMPLE_SWITCH_THR 100
+
/* Number of bits for an average sized packet */
#define MCS_NBITS ((AVG_PKT_SIZE * AVG_AMPDU_SIZE) << 3)
@@ -58,6 +60,7 @@
[GROUP_IDX(_streams, _sgi, _ht40)] = { \
.streams = _streams, \
.shift = _s, \
+ .bw = _ht40, \
.flags = \
IEEE80211_TX_RC_MCS | \
(_sgi ? IEEE80211_TX_RC_SHORT_GI : 0) | \
@@ -94,6 +97,7 @@
[VHT_GROUP_IDX(_streams, _sgi, _bw)] = { \
.streams = _streams, \
.shift = _s, \
+ .bw = _bw, \
.flags = \
IEEE80211_TX_RC_VHT_MCS | \
(_sgi ? IEEE80211_TX_RC_SHORT_GI : 0) | \
@@ -342,12 +346,12 @@ minstrel_ht_avg_ampdu_len(struct minstrel_ht_sta *mi)
*/
int
minstrel_ht_get_tp_avg(struct minstrel_ht_sta *mi, int group, int rate,
- int prob_ewma)
+ int prob_avg)
{
unsigned int nsecs = 0;
/* do not account throughput if sucess prob is below 10% */
- if (prob_ewma < MINSTREL_FRAC(10, 100))
+ if (prob_avg < MINSTREL_FRAC(10, 100))
return 0;
if (group != MINSTREL_CCK_GROUP)
@@ -361,11 +365,11 @@ minstrel_ht_get_tp_avg(struct minstrel_ht_sta *mi, int group, int rate,
* account for collision related packet error rate fluctuation
* (prob is scaled - see MINSTREL_FRAC above)
*/
- if (prob_ewma > MINSTREL_FRAC(90, 100))
+ if (prob_avg > MINSTREL_FRAC(90, 100))
return MINSTREL_TRUNC(100000 * ((MINSTREL_FRAC(90, 100) * 1000)
/ nsecs));
else
- return MINSTREL_TRUNC(100000 * ((prob_ewma * 1000) / nsecs));
+ return MINSTREL_TRUNC(100000 * ((prob_avg * 1000) / nsecs));
}
/*
@@ -385,13 +389,13 @@ minstrel_ht_sort_best_tp_rates(struct minstrel_ht_sta *mi, u16 index,
cur_group = index / MCS_GROUP_RATES;
cur_idx = index % MCS_GROUP_RATES;
- cur_prob = mi->groups[cur_group].rates[cur_idx].prob_ewma;
+ cur_prob = mi->groups[cur_group].rates[cur_idx].prob_avg;
cur_tp_avg = minstrel_ht_get_tp_avg(mi, cur_group, cur_idx, cur_prob);
do {
tmp_group = tp_list[j - 1] / MCS_GROUP_RATES;
tmp_idx = tp_list[j - 1] % MCS_GROUP_RATES;
- tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_ewma;
+ tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_avg;
tmp_tp_avg = minstrel_ht_get_tp_avg(mi, tmp_group, tmp_idx,
tmp_prob);
if (cur_tp_avg < tmp_tp_avg ||
@@ -428,7 +432,7 @@ minstrel_ht_set_best_prob_rate(struct minstrel_ht_sta *mi, u16 index)
tmp_group = mi->max_prob_rate / MCS_GROUP_RATES;
tmp_idx = mi->max_prob_rate % MCS_GROUP_RATES;
- tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_ewma;
+ tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_avg;
tmp_tp_avg = minstrel_ht_get_tp_avg(mi, tmp_group, tmp_idx, tmp_prob);
/* if max_tp_rate[0] is from MCS_GROUP max_prob_rate get selected from
@@ -440,11 +444,11 @@ minstrel_ht_set_best_prob_rate(struct minstrel_ht_sta *mi, u16 index)
max_gpr_group = mg->max_group_prob_rate / MCS_GROUP_RATES;
max_gpr_idx = mg->max_group_prob_rate % MCS_GROUP_RATES;
- max_gpr_prob = mi->groups[max_gpr_group].rates[max_gpr_idx].prob_ewma;
+ max_gpr_prob = mi->groups[max_gpr_group].rates[max_gpr_idx].prob_avg;
- if (mrs->prob_ewma > MINSTREL_FRAC(75, 100)) {
+ if (mrs->prob_avg > MINSTREL_FRAC(75, 100)) {
cur_tp_avg = minstrel_ht_get_tp_avg(mi, cur_group, cur_idx,
- mrs->prob_ewma);
+ mrs->prob_avg);
if (cur_tp_avg > tmp_tp_avg)
mi->max_prob_rate = index;
@@ -454,9 +458,9 @@ minstrel_ht_set_best_prob_rate(struct minstrel_ht_sta *mi, u16 index)
if (cur_tp_avg > max_gpr_tp_avg)
mg->max_group_prob_rate = index;
} else {
- if (mrs->prob_ewma > tmp_prob)
+ if (mrs->prob_avg > tmp_prob)
mi->max_prob_rate = index;
- if (mrs->prob_ewma > max_gpr_prob)
+ if (mrs->prob_avg > max_gpr_prob)
mg->max_group_prob_rate = index;
}
}
@@ -478,15 +482,15 @@ minstrel_ht_assign_best_tp_rates(struct minstrel_ht_sta *mi,
tmp_group = tmp_cck_tp_rate[0] / MCS_GROUP_RATES;
tmp_idx = tmp_cck_tp_rate[0] % MCS_GROUP_RATES;
- tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_ewma;
+ tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_avg;
tmp_cck_tp = minstrel_ht_get_tp_avg(mi, tmp_group, tmp_idx, tmp_prob);
tmp_group = tmp_mcs_tp_rate[0] / MCS_GROUP_RATES;
tmp_idx = tmp_mcs_tp_rate[0] % MCS_GROUP_RATES;
- tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_ewma;
+ tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_avg;
tmp_mcs_tp = minstrel_ht_get_tp_avg(mi, tmp_group, tmp_idx, tmp_prob);
- if (tmp_cck_tp > tmp_mcs_tp) {
+ if (tmp_cck_tp_rate && tmp_cck_tp > tmp_mcs_tp) {
for(i = 0; i < MAX_THR_RATES; i++) {
minstrel_ht_sort_best_tp_rates(mi, tmp_cck_tp_rate[i],
tmp_mcs_tp_rate);
@@ -514,7 +518,7 @@ minstrel_ht_prob_rate_reduce_streams(struct minstrel_ht_sta *mi)
continue;
tmp_idx = mg->max_group_prob_rate % MCS_GROUP_RATES;
- tmp_prob = mi->groups[group].rates[tmp_idx].prob_ewma;
+ tmp_prob = mi->groups[group].rates[tmp_idx].prob_avg;
if (tmp_tp < minstrel_ht_get_tp_avg(mi, group, tmp_idx, tmp_prob) &&
(minstrel_mcs_groups[group].streams < tmp_max_streams)) {
@@ -526,6 +530,133 @@ minstrel_ht_prob_rate_reduce_streams(struct minstrel_ht_sta *mi)
}
}
+static inline int
+minstrel_get_duration(int index)
+{
+ const struct mcs_group *group = &minstrel_mcs_groups[index / MCS_GROUP_RATES];
+ unsigned int duration = group->duration[index % MCS_GROUP_RATES];
+ return duration << group->shift;
+}
+
+static bool
+minstrel_ht_probe_group(struct minstrel_ht_sta *mi, const struct mcs_group *tp_group,
+ int tp_idx, const struct mcs_group *group)
+{
+ if (group->bw < tp_group->bw)
+ return false;
+
+ if (group->streams == tp_group->streams)
+ return true;
+
+ if (tp_idx < 4 && group->streams == tp_group->streams - 1)
+ return true;
+
+ return group->streams == tp_group->streams + 1;
+}
+
+static void
+minstrel_ht_find_probe_rates(struct minstrel_ht_sta *mi, u16 *rates, int *n_rates,
+ bool faster_rate)
+{
+ const struct mcs_group *group, *tp_group;
+ int i, g, max_dur;
+ int tp_idx;
+
+ tp_group = &minstrel_mcs_groups[mi->max_tp_rate[0] / MCS_GROUP_RATES];
+ tp_idx = mi->max_tp_rate[0] % MCS_GROUP_RATES;
+
+ max_dur = minstrel_get_duration(mi->max_tp_rate[0]);
+ if (faster_rate)
+ max_dur -= max_dur / 16;
+
+ for (g = 0; g < MINSTREL_GROUPS_NB; g++) {
+ u16 supported = mi->supported[g];
+
+ if (!supported)
+ continue;
+
+ group = &minstrel_mcs_groups[g];
+ if (!minstrel_ht_probe_group(mi, tp_group, tp_idx, group))
+ continue;
+
+ for (i = 0; supported; supported >>= 1, i++) {
+ int idx;
+
+ if (!(supported & 1))
+ continue;
+
+ if ((group->duration[i] << group->shift) > max_dur)
+ continue;
+
+ idx = g * MCS_GROUP_RATES + i;
+ if (idx == mi->max_tp_rate[0])
+ continue;
+
+ rates[(*n_rates)++] = idx;
+ break;
+ }
+ }
+}
+
+static void
+minstrel_ht_rate_sample_switch(struct minstrel_priv *mp,
+ struct minstrel_ht_sta *mi)
+{
+ struct minstrel_rate_stats *mrs;
+ u16 rates[MINSTREL_GROUPS_NB];
+ int n_rates = 0;
+ int probe_rate = 0;
+ bool faster_rate;
+ int i;
+ u8 random;
+
+ /*
+ * Use rate switching instead of probing packets for devices with
+ * little control over retry fallback behavior
+ */
+ if (mp->hw->max_rates > 1)
+ return;
+
+ /*
+ * If the current EWMA prob is >75%, look for a rate that's 6.25%
+ * faster than the max tp rate.
+ * If that fails, look again for a rate that is at least as fast
+ */
+ mrs = minstrel_get_ratestats(mi, mi->max_tp_rate[0]);
+ faster_rate = mrs->prob_avg > MINSTREL_FRAC(75, 100);
+ minstrel_ht_find_probe_rates(mi, rates, &n_rates, faster_rate);
+ if (!n_rates && faster_rate)
+ minstrel_ht_find_probe_rates(mi, rates, &n_rates, false);
+
+ /* If no suitable rate was found, try to pick the next one in the group */
+ if (!n_rates) {
+ int g_idx = mi->max_tp_rate[0] / MCS_GROUP_RATES;
+ u16 supported = mi->supported[g_idx];
+
+ supported >>= mi->max_tp_rate[0] % MCS_GROUP_RATES;
+ for (i = 0; supported; supported >>= 1, i++) {
+ if (!(supported & 1))
+ continue;
+
+ probe_rate = mi->max_tp_rate[0] + i;
+ goto out;
+ }
+
+ return;
+ }
+
+ i = 0;
+ if (n_rates > 1) {
+ random = prandom_u32();
+ i = random % n_rates;
+ }
+ probe_rate = rates[i];
+
+out:
+ mi->sample_rate = probe_rate;
+ mi->sample_mode = MINSTREL_SAMPLE_ACTIVE;
+}
+
/*
* Update rate statistics and select new primary rates
*
@@ -536,7 +667,8 @@ minstrel_ht_prob_rate_reduce_streams(struct minstrel_ht_sta *mi)
* higher throughput rates, even if the probablity is a bit lower
*/
static void
-minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi)
+minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi,
+ bool sample)
{
struct minstrel_mcs_group_data *mg;
struct minstrel_rate_stats *mrs;
@@ -544,6 +676,18 @@ minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi)
u16 tmp_mcs_tp_rate[MAX_THR_RATES], tmp_group_tp_rate[MAX_THR_RATES];
u16 tmp_cck_tp_rate[MAX_THR_RATES], index;
+ mi->sample_mode = MINSTREL_SAMPLE_IDLE;
+
+ if (sample) {
+ mi->total_packets_cur = mi->total_packets -
+ mi->total_packets_last;
+ mi->total_packets_last = mi->total_packets;
+ }
+ if (!mp->sample_switch)
+ sample = false;
+ if (mi->total_packets_cur < SAMPLE_SWITCH_THR && mp->sample_switch != 1)
+ sample = false;
+
if (mi->ampdu_packets > 0) {
if (!ieee80211_hw_check(mp->hw, TX_STATUS_NO_AMPDU_LEN))
mi->avg_ampdu_len = minstrel_ewma(mi->avg_ampdu_len,
@@ -558,11 +702,19 @@ minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi)
mi->sample_slow = 0;
mi->sample_count = 0;
- /* Initialize global rate indexes */
- for(j = 0; j < MAX_THR_RATES; j++){
- tmp_mcs_tp_rate[j] = 0;
- tmp_cck_tp_rate[j] = 0;
- }
+ memset(tmp_mcs_tp_rate, 0, sizeof(tmp_mcs_tp_rate));
+ memset(tmp_cck_tp_rate, 0, sizeof(tmp_cck_tp_rate));
+ if (mi->supported[MINSTREL_CCK_GROUP])
+ for (j = 0; j < ARRAY_SIZE(tmp_cck_tp_rate); j++)
+ tmp_cck_tp_rate[j] = MINSTREL_CCK_GROUP * MCS_GROUP_RATES;
+
+ if (mi->supported[MINSTREL_VHT_GROUP_0])
+ index = MINSTREL_VHT_GROUP_0 * MCS_GROUP_RATES;
+ else
+ index = MINSTREL_HT_GROUP_0 * MCS_GROUP_RATES;
+
+ for (j = 0; j < ARRAY_SIZE(tmp_mcs_tp_rate); j++)
+ tmp_mcs_tp_rate[j] = index;
/* Find best rate sets within all MCS groups*/
for (group = 0; group < ARRAY_SIZE(minstrel_mcs_groups); group++) {
@@ -575,7 +727,7 @@ minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi)
/* (re)Initialize group rate indexes */
for(j = 0; j < MAX_THR_RATES; j++)
- tmp_group_tp_rate[j] = group;
+ tmp_group_tp_rate[j] = MCS_GROUP_RATES * group;
for (i = 0; i < MCS_GROUP_RATES; i++) {
if (!(mi->supported[group] & BIT(i)))
@@ -585,8 +737,8 @@ minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi)
mrs = &mg->rates[i];
mrs->retry_updated = false;
- minstrel_calc_rate_stats(mrs);
- cur_prob = mrs->prob_ewma;
+ minstrel_calc_rate_stats(mp, mrs);
+ cur_prob = mrs->prob_avg;
if (minstrel_ht_get_tp_avg(mi, group, i, cur_prob) == 0)
continue;
@@ -621,6 +773,11 @@ minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi)
/* try to sample all available rates during each interval */
mi->sample_count *= 8;
+ if (mp->new_avg)
+ mi->sample_count /= 2;
+
+ if (sample)
+ minstrel_ht_rate_sample_switch(mp, mi);
#ifdef CONFIG_MAC80211_DEBUGFS
/* use fixed index if set */
@@ -628,6 +785,7 @@ minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi)
for (i = 0; i < 4; i++)
mi->max_tp_rate[i] = mp->fixed_rate_idx;
mi->max_prob_rate = mp->fixed_rate_idx;
+ mi->sample_mode = MINSTREL_SAMPLE_IDLE;
}
#endif
@@ -731,15 +889,18 @@ minstrel_ht_tx_status(void *priv, struct ieee80211_supported_band *sband,
struct minstrel_ht_sta_priv *msp = priv_sta;
struct minstrel_ht_sta *mi = &msp->ht;
struct ieee80211_tx_rate *ar = info->status.rates;
- struct minstrel_rate_stats *rate, *rate2;
+ struct minstrel_rate_stats *rate, *rate2, *rate_sample = NULL;
struct minstrel_priv *mp = priv;
+ u32 update_interval = mp->update_interval / 2;
bool last, update = false;
+ bool sample_status = false;
int i;
if (!msp->is_ht)
return mac80211_minstrel.tx_status_ext(priv, sband,
&msp->legacy, st);
+
/* This packet was aggregated but doesn't carry status info */
if ((info->flags & IEEE80211_TX_CTL_AMPDU) &&
!(info->flags & IEEE80211_TX_STAT_AMPDU))
@@ -765,12 +926,17 @@ minstrel_ht_tx_status(void *priv, struct ieee80211_supported_band *sband,
if (info->flags & IEEE80211_TX_CTL_RATE_CTRL_PROBE)
mi->sample_packets += info->status.ampdu_len;
+ if (mi->sample_mode != MINSTREL_SAMPLE_IDLE)
+ rate_sample = minstrel_get_ratestats(mi, mi->sample_rate);
+
last = !minstrel_ht_txstat_valid(mp, &ar[0]);
for (i = 0; !last; i++) {
last = (i == IEEE80211_TX_MAX_RATES - 1) ||
!minstrel_ht_txstat_valid(mp, &ar[i + 1]);
rate = minstrel_ht_get_stats(mp, mi, &ar[i]);
+ if (rate == rate_sample)
+ sample_status = true;
if (last)
rate->success += info->status.ampdu_ack_len;
@@ -778,44 +944,61 @@ minstrel_ht_tx_status(void *priv, struct ieee80211_supported_band *sband,
rate->attempts += ar[i].count * info->status.ampdu_len;
}
- /*
- * check for sudden death of spatial multiplexing,
- * downgrade to a lower number of streams if necessary.
- */
- rate = minstrel_get_ratestats(mi, mi->max_tp_rate[0]);
- if (rate->attempts > 30 &&
- MINSTREL_FRAC(rate->success, rate->attempts) <
- MINSTREL_FRAC(20, 100)) {
- minstrel_downgrade_rate(mi, &mi->max_tp_rate[0], true);
+ switch (mi->sample_mode) {
+ case MINSTREL_SAMPLE_IDLE:
+ if (mp->new_avg &&
+ (mp->hw->max_rates > 1 ||
+ mi->total_packets_cur < SAMPLE_SWITCH_THR))
+ update_interval /= 2;
+ break;
+
+ case MINSTREL_SAMPLE_ACTIVE:
+ if (!sample_status)
+ break;
+
+ mi->sample_mode = MINSTREL_SAMPLE_PENDING;
update = true;
- }
+ break;
+
+ case MINSTREL_SAMPLE_PENDING:
+ if (sample_status)
+ break;
- rate2 = minstrel_get_ratestats(mi, mi->max_tp_rate[1]);
- if (rate2->attempts > 30 &&
- MINSTREL_FRAC(rate2->success, rate2->attempts) <
- MINSTREL_FRAC(20, 100)) {
- minstrel_downgrade_rate(mi, &mi->max_tp_rate[1], false);
update = true;
+ minstrel_ht_update_stats(mp, mi, false);
+ break;
+ }
+
+
+ if (mp->hw->max_rates > 1) {
+ /*
+ * check for sudden death of spatial multiplexing,
+ * downgrade to a lower number of streams if necessary.
+ */
+ rate = minstrel_get_ratestats(mi, mi->max_tp_rate[0]);
+ if (rate->attempts > 30 &&
+ rate->success < rate->attempts / 4) {
+ minstrel_downgrade_rate(mi, &mi->max_tp_rate[0], true);
+ update = true;
+ }
+
+ rate2 = minstrel_get_ratestats(mi, mi->max_tp_rate[1]);
+ if (rate2->attempts > 30 &&
+ rate2->success < rate2->attempts / 4) {
+ minstrel_downgrade_rate(mi, &mi->max_tp_rate[1], false);
+ update = true;
+ }
}
- if (time_after(jiffies, mi->last_stats_update +
- (mp->update_interval / 2 * HZ) / 1000)) {
+ if (time_after(jiffies, mi->last_stats_update + update_interval)) {
update = true;
- minstrel_ht_update_stats(mp, mi);
+ minstrel_ht_update_stats(mp, mi, true);
}
if (update)
minstrel_ht_update_rates(mp, mi);
}
-static inline int
-minstrel_get_duration(int index)
-{
- const struct mcs_group *group = &minstrel_mcs_groups[index / MCS_GROUP_RATES];
- unsigned int duration = group->duration[index % MCS_GROUP_RATES];
- return duration << group->shift;
-}
-
static void
minstrel_calc_retransmit(struct minstrel_priv *mp, struct minstrel_ht_sta *mi,
int index)
@@ -829,7 +1012,7 @@ minstrel_calc_retransmit(struct minstrel_priv *mp, struct minstrel_ht_sta *mi,
unsigned int overhead = 0, overhead_rtscts = 0;
mrs = minstrel_get_ratestats(mi, index);
- if (mrs->prob_ewma < MINSTREL_FRAC(1, 10)) {
+ if (mrs->prob_avg < MINSTREL_FRAC(1, 10)) {
mrs->retry_count = 1;
mrs->retry_count_rtscts = 1;
return;
@@ -886,7 +1069,7 @@ minstrel_ht_set_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi,
if (!mrs->retry_updated)
minstrel_calc_retransmit(mp, mi, index);
- if (mrs->prob_ewma < MINSTREL_FRAC(20, 100) || !mrs->retry_count) {
+ if (mrs->prob_avg < MINSTREL_FRAC(20, 100) || !mrs->retry_count) {
ratetbl->rate[offset].count = 2;
ratetbl->rate[offset].count_rts = 2;
ratetbl->rate[offset].count_cts = 2;
@@ -920,11 +1103,11 @@ minstrel_ht_set_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi,
}
static inline int
-minstrel_ht_get_prob_ewma(struct minstrel_ht_sta *mi, int rate)
+minstrel_ht_get_prob_avg(struct minstrel_ht_sta *mi, int rate)
{
int group = rate / MCS_GROUP_RATES;
rate %= MCS_GROUP_RATES;
- return mi->groups[group].rates[rate].prob_ewma;
+ return mi->groups[group].rates[rate].prob_avg;
}
static int
@@ -936,7 +1119,7 @@ minstrel_ht_get_max_amsdu_len(struct minstrel_ht_sta *mi)
unsigned int duration;
/* Disable A-MSDU if max_prob_rate is bad */
- if (mi->groups[group].rates[rate].prob_ewma < MINSTREL_FRAC(50, 100))
+ if (mi->groups[group].rates[rate].prob_avg < MINSTREL_FRAC(50, 100))
return 1;
duration = g->duration[rate];
@@ -959,7 +1142,7 @@ minstrel_ht_get_max_amsdu_len(struct minstrel_ht_sta *mi)
* data packet size
*/
if (duration > MCS_DURATION(1, 0, 260) ||
- (minstrel_ht_get_prob_ewma(mi, mi->max_tp_rate[0]) <
+ (minstrel_ht_get_prob_avg(mi, mi->max_tp_rate[0]) <
MINSTREL_FRAC(75, 100)))
return 3200;
@@ -980,14 +1163,18 @@ static void
minstrel_ht_update_rates(struct minstrel_priv *mp, struct minstrel_ht_sta *mi)
{
struct ieee80211_sta_rates *rates;
+ u16 first_rate = mi->max_tp_rate[0];
int i = 0;
+ if (mi->sample_mode == MINSTREL_SAMPLE_ACTIVE)
+ first_rate = mi->sample_rate;
+
rates = kzalloc(sizeof(*rates), GFP_ATOMIC);
if (!rates)
return;
/* Start with max_tp_rate[0] */
- minstrel_ht_set_rate(mp, mi, rates, i++, mi->max_tp_rate[0]);
+ minstrel_ht_set_rate(mp, mi, rates, i++, first_rate);
if (mp->hw->max_rates >= 3) {
/* At least 3 tx rates supported, use max_tp_rate[1] next */
@@ -1012,6 +1199,11 @@ minstrel_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi)
int tp_rate1, tp_rate2;
int sample_idx = 0;
+ if (mp->hw->max_rates == 1 && mp->sample_switch &&
+ (mi->total_packets_cur >= SAMPLE_SWITCH_THR ||
+ mp->sample_switch == 1))
+ return -1;
+
if (mi->sample_wait > 0) {
mi->sample_wait--;
return -1;
@@ -1055,10 +1247,25 @@ minstrel_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi)
* rate, to avoid wasting airtime.
*/
sample_dur = minstrel_get_duration(sample_idx);
- if (mrs->prob_ewma > MINSTREL_FRAC(95, 100) ||
+ if (mrs->prob_avg > MINSTREL_FRAC(95, 100) ||
minstrel_get_duration(mi->max_prob_rate) * 3 < sample_dur)
return -1;
+
+ /*
+ * For devices with no configurable multi-rate retry, skip sampling
+ * below the per-group max throughput rate, and only use one sampling
+ * attempt per rate
+ */
+ if (mp->hw->max_rates == 1 &&
+ (minstrel_get_duration(mg->max_group_tp_rate[0]) < sample_dur ||
+ mrs->attempts))
+ return -1;
+
+ /* Skip already sampled slow rates */
+ if (sample_dur >= minstrel_get_duration(tp_rate1) && mrs->attempts)
+ return -1;
+
/*
* Make sure that lower rates get sampled only occasionally,
* if the link is working perfectly.
@@ -1318,7 +1525,7 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband,
mi->supported[MINSTREL_CCK_GROUP] |= mi->cck_supported_short << 4;
/* create an initial rate table with the lowest supported rates */
- minstrel_ht_update_stats(mp, mi);
+ minstrel_ht_update_stats(mp, mi, true);
minstrel_ht_update_rates(mp, mi);
return;
@@ -1436,6 +1643,8 @@ minstrel_ht_alloc(struct ieee80211_hw *hw, struct dentry *debugfsdir)
if (!mp)
return NULL;
+ mp->sample_switch = -1;
+
/* contention window settings
* Just an approximation. Using the per-queue values would complicate
* the calculations and is probably unnecessary */
@@ -1461,12 +1670,17 @@ minstrel_ht_alloc(struct ieee80211_hw *hw, struct dentry *debugfsdir)
mp->has_mrr = true;
mp->hw = hw;
- mp->update_interval = 100;
+ mp->update_interval = HZ / 10;
+ mp->new_avg = true;
#ifdef CONFIG_MAC80211_DEBUGFS
mp->fixed_rate_idx = (u32) -1;
debugfs_create_u32("fixed_rate_idx", S_IRUGO | S_IWUGO, debugfsdir,
&mp->fixed_rate_idx);
+ debugfs_create_u32("sample_switch", S_IRUGO | S_IWUSR, debugfsdir,
+ &mp->sample_switch);
+ debugfs_create_bool("new_avg", S_IRUGO | S_IWUSR, debugfsdir,
+ &mp->new_avg);
#endif
minstrel_ht_init_cck_rates(mp);
@@ -1491,7 +1705,7 @@ static u32 minstrel_ht_get_expected_throughput(void *priv_sta)
i = mi->max_tp_rate[0] / MCS_GROUP_RATES;
j = mi->max_tp_rate[0] % MCS_GROUP_RATES;
- prob = mi->groups[i].rates[j].prob_ewma;
+ prob = mi->groups[i].rates[j].prob_avg;
/* convert tp_avg from pkt per second in kbps */
tp_avg = minstrel_ht_get_tp_avg(mi, i, j, prob) * 10;
diff --git a/net/mac80211/rc80211_minstrel_ht.h b/net/mac80211/rc80211_minstrel_ht.h
index 80296268c778..53ea3c29debf 100644
--- a/net/mac80211/rc80211_minstrel_ht.h
+++ b/net/mac80211/rc80211_minstrel_ht.h
@@ -33,6 +33,7 @@ struct mcs_group {
u16 flags;
u8 streams;
u8 shift;
+ u8 bw;
u16 duration[MCS_GROUP_RATES];
};
@@ -50,6 +51,12 @@ struct minstrel_mcs_group_data {
struct minstrel_rate_stats rates[MCS_GROUP_RATES];
};
+enum minstrel_sample_mode {
+ MINSTREL_SAMPLE_IDLE,
+ MINSTREL_SAMPLE_ACTIVE,
+ MINSTREL_SAMPLE_PENDING,
+};
+
struct minstrel_ht_sta {
struct ieee80211_sta *sta;
@@ -71,6 +78,8 @@ struct minstrel_ht_sta {
unsigned int overhead;
unsigned int overhead_rtscts;
+ unsigned int total_packets_last;
+ unsigned int total_packets_cur;
unsigned int total_packets;
unsigned int sample_packets;
@@ -82,6 +91,9 @@ struct minstrel_ht_sta {
u8 sample_count;
u8 sample_slow;
+ enum minstrel_sample_mode sample_mode;
+ u16 sample_rate;
+
/* current MCS group to be sampled */
u8 sample_group;
@@ -107,6 +119,6 @@ struct minstrel_ht_sta_priv {
void minstrel_ht_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir);
int minstrel_ht_get_tp_avg(struct minstrel_ht_sta *mi, int group, int rate,
- int prob_ewma);
+ int prob_avg);
#endif
diff --git a/net/mac80211/rc80211_minstrel_ht_debugfs.c b/net/mac80211/rc80211_minstrel_ht_debugfs.c
index 5a6e9f3edc04..bebb71917742 100644
--- a/net/mac80211/rc80211_minstrel_ht_debugfs.c
+++ b/net/mac80211/rc80211_minstrel_ht_debugfs.c
@@ -98,8 +98,8 @@ minstrel_ht_stats_dump(struct minstrel_ht_sta *mi, int i, char *p)
p += sprintf(p, "%6u ", tx_time);
tp_max = minstrel_ht_get_tp_avg(mi, i, j, MINSTREL_FRAC(100, 100));
- tp_avg = minstrel_ht_get_tp_avg(mi, i, j, mrs->prob_ewma);
- eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000);
+ tp_avg = minstrel_ht_get_tp_avg(mi, i, j, mrs->prob_avg);
+ eprob = MINSTREL_TRUNC(mrs->prob_avg * 1000);
p += sprintf(p, "%4u.%1u %4u.%1u %3u.%1u"
" %3u %3u %-3u "
@@ -243,8 +243,8 @@ minstrel_ht_stats_csv_dump(struct minstrel_ht_sta *mi, int i, char *p)
p += sprintf(p, "%u,", tx_time);
tp_max = minstrel_ht_get_tp_avg(mi, i, j, MINSTREL_FRAC(100, 100));
- tp_avg = minstrel_ht_get_tp_avg(mi, i, j, mrs->prob_ewma);
- eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000);
+ tp_avg = minstrel_ht_get_tp_avg(mi, i, j, mrs->prob_avg);
+ eprob = MINSTREL_TRUNC(mrs->prob_avg * 1000);
p += sprintf(p, "%u.%u,%u.%u,%u.%u,%u,%u,"
"%u,%llu,%llu,",
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 3c1ab870fefe..0e05ff037672 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -2447,11 +2447,13 @@ static void ieee80211_deliver_skb_to_local_stack(struct sk_buff *skb,
skb->protocol == cpu_to_be16(ETH_P_PREAUTH)) &&
sdata->control_port_over_nl80211)) {
struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
- bool noencrypt = status->flag & RX_FLAG_DECRYPTED;
+ bool noencrypt = !(status->flag & RX_FLAG_DECRYPTED);
cfg80211_rx_control_port(dev, skb, noencrypt);
dev_kfree_skb(skb);
} else {
+ memset(skb->cb, 0, sizeof(skb->cb));
+
/* deliver to local stack */
if (rx->napi)
napi_gro_receive(rx->napi, skb);
@@ -2546,8 +2548,6 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx)
if (skb) {
skb->protocol = eth_type_trans(skb, dev);
- memset(skb->cb, 0, sizeof(skb->cb));
-
ieee80211_deliver_skb_to_local_stack(skb, rx);
}
@@ -3467,9 +3467,18 @@ ieee80211_rx_h_mgmt(struct ieee80211_rx_data *rx)
case cpu_to_le16(IEEE80211_STYPE_PROBE_RESP):
/* process for all: mesh, mlme, ibss */
break;
+ case cpu_to_le16(IEEE80211_STYPE_DEAUTH):
+ if (is_multicast_ether_addr(mgmt->da) &&
+ !is_broadcast_ether_addr(mgmt->da))
+ return RX_DROP_MONITOR;
+
+ /* process only for station/IBSS */
+ if (sdata->vif.type != NL80211_IFTYPE_STATION &&
+ sdata->vif.type != NL80211_IFTYPE_ADHOC)
+ return RX_DROP_MONITOR;
+ break;
case cpu_to_le16(IEEE80211_STYPE_ASSOC_RESP):
case cpu_to_le16(IEEE80211_STYPE_REASSOC_RESP):
- case cpu_to_le16(IEEE80211_STYPE_DEAUTH):
case cpu_to_le16(IEEE80211_STYPE_DISASSOC):
if (is_multicast_ether_addr(mgmt->da) &&
!is_broadcast_ether_addr(mgmt->da))
diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
index adf94ba1ed77..4d31d9688dc2 100644
--- a/net/mac80211/scan.c
+++ b/net/mac80211/scan.c
@@ -520,10 +520,33 @@ static int ieee80211_start_sw_scan(struct ieee80211_local *local,
return 0;
}
+static bool __ieee80211_can_leave_ch(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_sub_if_data *sdata_iter;
+
+ if (!ieee80211_is_radar_required(local))
+ return true;
+
+ if (!regulatory_pre_cac_allowed(local->hw.wiphy))
+ return false;
+
+ mutex_lock(&local->iflist_mtx);
+ list_for_each_entry(sdata_iter, &local->interfaces, list) {
+ if (sdata_iter->wdev.cac_started) {
+ mutex_unlock(&local->iflist_mtx);
+ return false;
+ }
+ }
+ mutex_unlock(&local->iflist_mtx);
+
+ return true;
+}
+
static bool ieee80211_can_scan(struct ieee80211_local *local,
struct ieee80211_sub_if_data *sdata)
{
- if (ieee80211_is_radar_required(local))
+ if (!__ieee80211_can_leave_ch(sdata))
return false;
if (!list_empty(&local->roc_list))
@@ -630,7 +653,10 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata,
lockdep_assert_held(&local->mtx);
- if (local->scan_req || ieee80211_is_radar_required(local))
+ if (local->scan_req)
+ return -EBUSY;
+
+ if (!__ieee80211_can_leave_ch(sdata))
return -EBUSY;
if (!ieee80211_can_scan(local, sdata)) {
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 95eb8220e2e4..0f5f40678885 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -210,6 +210,20 @@ struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata,
return NULL;
}
+struct sta_info *sta_info_get_by_addrs(struct ieee80211_local *local,
+ const u8 *sta_addr, const u8 *vif_addr)
+{
+ struct rhlist_head *tmp;
+ struct sta_info *sta;
+
+ for_each_sta_info(local, sta_addr, sta, tmp) {
+ if (ether_addr_equal(vif_addr, sta->sdata->vif.addr))
+ return sta;
+ }
+
+ return NULL;
+}
+
struct sta_info *sta_info_get_by_idx(struct ieee80211_sub_if_data *sdata,
int idx)
{
@@ -396,6 +410,9 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata,
skb_queue_head_init(&sta->ps_tx_buf[i]);
skb_queue_head_init(&sta->tx_filtered[i]);
sta->airtime[i].deficit = sta->airtime_weight;
+ atomic_set(&sta->airtime[i].aql_tx_pending, 0);
+ sta->airtime[i].aql_limit_low = local->aql_txq_limit_low[i];
+ sta->airtime[i].aql_limit_high = local->aql_txq_limit_high[i];
}
for (i = 0; i < IEEE80211_NUM_TIDS; i++)
@@ -1065,7 +1082,6 @@ static void __sta_info_destroy_part2(struct sta_info *sta)
cfg80211_del_sta_sinfo(sdata->dev, sta->sta.addr, sinfo, GFP_KERNEL);
kfree(sinfo);
- rate_control_remove_sta_debugfs(sta);
ieee80211_sta_debugfs_remove(sta);
cleanup_single_sta(sta);
@@ -1894,6 +1910,44 @@ void ieee80211_sta_register_airtime(struct ieee80211_sta *pubsta, u8 tid,
}
EXPORT_SYMBOL(ieee80211_sta_register_airtime);
+void ieee80211_sta_update_pending_airtime(struct ieee80211_local *local,
+ struct sta_info *sta, u8 ac,
+ u16 tx_airtime, bool tx_completed)
+{
+ int tx_pending;
+
+ if (!wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_AQL))
+ return;
+
+ if (!tx_completed) {
+ if (sta)
+ atomic_add(tx_airtime,
+ &sta->airtime[ac].aql_tx_pending);
+
+ atomic_add(tx_airtime, &local->aql_total_pending_airtime);
+ return;
+ }
+
+ if (sta) {
+ tx_pending = atomic_sub_return(tx_airtime,
+ &sta->airtime[ac].aql_tx_pending);
+ if (WARN_ONCE(tx_pending < 0,
+ "STA %pM AC %d txq pending airtime underflow: %u, %u",
+ sta->addr, ac, tx_pending, tx_airtime))
+ atomic_cmpxchg(&sta->airtime[ac].aql_tx_pending,
+ tx_pending, 0);
+ }
+
+ tx_pending = atomic_sub_return(tx_airtime,
+ &local->aql_total_pending_airtime);
+ if (WARN_ONCE(tx_pending < 0,
+ "Device %s AC %d pending airtime underflow: %u, %u",
+ wiphy_name(local->hw.wiphy), ac, tx_pending,
+ tx_airtime))
+ atomic_cmpxchg(&local->aql_total_pending_airtime,
+ tx_pending, 0);
+}
+
int sta_info_move_state(struct sta_info *sta,
enum ieee80211_sta_state new_state)
{
@@ -1962,6 +2016,7 @@ int sta_info_move_state(struct sta_info *sta,
case IEEE80211_STA_ASSOC:
if (sta->sta_state == IEEE80211_STA_AUTH) {
set_bit(WLAN_STA_ASSOC, &sta->_flags);
+ sta->assoc_at = ktime_get_boottime_ns();
ieee80211_recalc_min_chandef(sta->sdata);
if (!sta->sta.support_p2p_ps)
ieee80211_recalc_p2p_go_ps_allowed(sta->sdata);
@@ -1979,6 +2034,10 @@ int sta_info_move_state(struct sta_info *sta,
ieee80211_check_fast_xmit(sta);
ieee80211_check_fast_rx(sta);
}
+ if (sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN ||
+ sta->sdata->vif.type == NL80211_IFTYPE_AP)
+ cfg80211_send_layer2_update(sta->sdata->dev,
+ sta->sta.addr);
break;
default:
break;
@@ -2191,6 +2250,7 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo,
BIT_ULL(NL80211_STA_INFO_STA_FLAGS) |
BIT_ULL(NL80211_STA_INFO_BSS_PARAM) |
BIT_ULL(NL80211_STA_INFO_CONNECTED_TIME) |
+ BIT_ULL(NL80211_STA_INFO_ASSOC_AT_BOOTTIME) |
BIT_ULL(NL80211_STA_INFO_RX_DROP_MISC);
if (sdata->vif.type == NL80211_IFTYPE_STATION) {
@@ -2199,6 +2259,7 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo,
}
sinfo->connected_time = ktime_get_seconds() - sta->last_connected;
+ sinfo->assoc_at = sta->assoc_at;
sinfo->inactive_time =
jiffies_to_msecs(jiffies - ieee80211_sta_last_active(sta));
@@ -2451,7 +2512,8 @@ unsigned long ieee80211_sta_last_active(struct sta_info *sta)
{
struct ieee80211_sta_rx_stats *stats = sta_get_last_rx_stats(sta);
- if (time_after(stats->last_rx, sta->status_stats.last_ack))
+ if (!sta->status_stats.last_ack ||
+ time_after(stats->last_rx, sta->status_stats.last_ack))
return stats->last_rx;
return sta->status_stats.last_ack;
}
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index 3260d4234920..c00e28585f9d 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -132,8 +132,15 @@ struct airtime_info {
u64 rx_airtime;
u64 tx_airtime;
s64 deficit;
+ atomic_t aql_tx_pending; /* Estimated airtime for frames pending */
+ u32 aql_limit_low;
+ u32 aql_limit_high;
};
+void ieee80211_sta_update_pending_airtime(struct ieee80211_local *local,
+ struct sta_info *sta, u8 ac,
+ u16 tx_airtime, bool tx_completed);
+
struct sta_info;
/**
@@ -466,6 +473,7 @@ struct ieee80211_sta_rx_stats {
* the station when it leaves powersave or polls for frames
* @driver_buffered_tids: bitmap of TIDs the driver has data buffered on
* @txq_buffered_tids: bitmap of TIDs that mac80211 has txq data buffered on
+ * @assoc_at: clock boottime (in ns) of last association
* @last_connected: time (in seconds) when a station got connected
* @last_seq_ctrl: last received seq/frag number from this STA (per TID
* plus one for non-QoS frames)
@@ -562,6 +570,7 @@ struct sta_info {
unsigned long driver_buffered_tids;
unsigned long txq_buffered_tids;
+ u64 assoc_at;
long last_connected;
/* Updated from RX path only, no locking requirements */
@@ -723,6 +732,10 @@ struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata,
struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata,
const u8 *addr);
+/* user must hold sta_mtx or be in RCU critical section */
+struct sta_info *sta_info_get_by_addrs(struct ieee80211_local *local,
+ const u8 *sta_addr, const u8 *vif_addr);
+
#define for_each_sta_info(local, _addr, _sta, _tmp) \
rhl_for_each_entry_rcu(_sta, _tmp, \
sta_info_hash_lookup(local, _addr), hash_node)
diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index a88e3bf17e9d..b720feaf9a74 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -254,14 +254,22 @@ static void ieee80211_set_bar_pending(struct sta_info *sta, u8 tid, u16 ssn)
tid_tx->bar_pending = true;
}
-static int ieee80211_tx_radiotap_len(struct ieee80211_tx_info *info)
+static int ieee80211_tx_radiotap_len(struct ieee80211_tx_info *info,
+ struct ieee80211_tx_status *status)
{
int len = sizeof(struct ieee80211_radiotap_header);
/* IEEE80211_RADIOTAP_RATE rate */
- if (info->status.rates[0].idx >= 0 &&
- !(info->status.rates[0].flags & (IEEE80211_TX_RC_MCS |
- IEEE80211_TX_RC_VHT_MCS)))
+ if (status && status->rate && !(status->rate->flags &
+ (RATE_INFO_FLAGS_MCS |
+ RATE_INFO_FLAGS_DMG |
+ RATE_INFO_FLAGS_EDMG |
+ RATE_INFO_FLAGS_VHT_MCS |
+ RATE_INFO_FLAGS_HE_MCS)))
+ len += 2;
+ else if (info->status.rates[0].idx >= 0 &&
+ !(info->status.rates[0].flags &
+ (IEEE80211_TX_RC_MCS | IEEE80211_TX_RC_VHT_MCS)))
len += 2;
/* IEEE80211_RADIOTAP_TX_FLAGS */
@@ -272,7 +280,14 @@ static int ieee80211_tx_radiotap_len(struct ieee80211_tx_info *info)
/* IEEE80211_RADIOTAP_MCS
* IEEE80211_RADIOTAP_VHT */
- if (info->status.rates[0].idx >= 0) {
+ if (status && status->rate) {
+ if (status->rate->flags & RATE_INFO_FLAGS_MCS)
+ len += 3;
+ else if (status->rate->flags & RATE_INFO_FLAGS_VHT_MCS)
+ len = ALIGN(len, 2) + 12;
+ else if (status->rate->flags & RATE_INFO_FLAGS_HE_MCS)
+ len = ALIGN(len, 2) + 12;
+ } else if (info->status.rates[0].idx >= 0) {
if (info->status.rates[0].flags & IEEE80211_TX_RC_MCS)
len += 3;
else if (info->status.rates[0].flags & IEEE80211_TX_RC_VHT_MCS)
@@ -286,12 +301,14 @@ static void
ieee80211_add_tx_radiotap_header(struct ieee80211_local *local,
struct ieee80211_supported_band *sband,
struct sk_buff *skb, int retry_count,
- int rtap_len, int shift)
+ int rtap_len, int shift,
+ struct ieee80211_tx_status *status)
{
struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
struct ieee80211_radiotap_header *rthdr;
unsigned char *pos;
+ u16 legacy_rate = 0;
u16 txflags;
rthdr = skb_push(skb, rtap_len);
@@ -310,14 +327,23 @@ ieee80211_add_tx_radiotap_header(struct ieee80211_local *local,
*/
/* IEEE80211_RADIOTAP_RATE */
- if (info->status.rates[0].idx >= 0 &&
- !(info->status.rates[0].flags & (IEEE80211_TX_RC_MCS |
- IEEE80211_TX_RC_VHT_MCS))) {
- u16 rate;
+ if (status && status->rate) {
+ if (!(status->rate->flags & (RATE_INFO_FLAGS_MCS |
+ RATE_INFO_FLAGS_DMG |
+ RATE_INFO_FLAGS_EDMG |
+ RATE_INFO_FLAGS_VHT_MCS |
+ RATE_INFO_FLAGS_HE_MCS)))
+ legacy_rate = status->rate->legacy;
+ } else if (info->status.rates[0].idx >= 0 &&
+ !(info->status.rates[0].flags & (IEEE80211_TX_RC_MCS |
+ IEEE80211_TX_RC_VHT_MCS)))
+ legacy_rate =
+ sband->bitrates[info->status.rates[0].idx].bitrate;
+
+ if (legacy_rate) {
rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_RATE);
- rate = sband->bitrates[info->status.rates[0].idx].bitrate;
- *pos = DIV_ROUND_UP(rate, 5 * (1 << shift));
+ *pos = DIV_ROUND_UP(legacy_rate, 5 * (1 << shift));
/* padding for tx flags */
pos += 2;
}
@@ -341,7 +367,140 @@ ieee80211_add_tx_radiotap_header(struct ieee80211_local *local,
*pos = retry_count;
pos++;
- if (info->status.rates[0].idx < 0)
+ if (status && status->rate &&
+ (status->rate->flags & RATE_INFO_FLAGS_MCS)) {
+ rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_MCS);
+ pos[0] = IEEE80211_RADIOTAP_MCS_HAVE_MCS |
+ IEEE80211_RADIOTAP_MCS_HAVE_GI |
+ IEEE80211_RADIOTAP_MCS_HAVE_BW;
+ if (status->rate->flags & RATE_INFO_FLAGS_SHORT_GI)
+ pos[1] |= IEEE80211_RADIOTAP_MCS_SGI;
+ if (status->rate->bw == RATE_INFO_BW_40)
+ pos[1] |= IEEE80211_RADIOTAP_MCS_BW_40;
+ pos[2] = status->rate->mcs;
+ pos += 3;
+ } else if (status && status->rate &&
+ (status->rate->flags & RATE_INFO_FLAGS_VHT_MCS)) {
+ u16 known = local->hw.radiotap_vht_details &
+ (IEEE80211_RADIOTAP_VHT_KNOWN_GI |
+ IEEE80211_RADIOTAP_VHT_KNOWN_BANDWIDTH);
+
+ rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_VHT);
+
+ /* required alignment from rthdr */
+ pos = (u8 *)rthdr + ALIGN(pos - (u8 *)rthdr, 2);
+
+ /* u16 known - IEEE80211_RADIOTAP_VHT_KNOWN_* */
+ put_unaligned_le16(known, pos);
+ pos += 2;
+
+ /* u8 flags - IEEE80211_RADIOTAP_VHT_FLAG_* */
+ if (status->rate->flags & RATE_INFO_FLAGS_SHORT_GI)
+ *pos |= IEEE80211_RADIOTAP_VHT_FLAG_SGI;
+ pos++;
+
+ /* u8 bandwidth */
+ switch (status->rate->bw) {
+ case RATE_INFO_BW_160:
+ *pos = 11;
+ break;
+ case RATE_INFO_BW_80:
+ *pos = 4;
+ break;
+ case RATE_INFO_BW_40:
+ *pos = 1;
+ break;
+ default:
+ *pos = 0;
+ break;
+ }
+ pos++;
+
+ /* u8 mcs_nss[4] */
+ *pos = (status->rate->mcs << 4) | status->rate->nss;
+ pos += 4;
+
+ /* u8 coding */
+ pos++;
+ /* u8 group_id */
+ pos++;
+ /* u16 partial_aid */
+ pos += 2;
+ } else if (status && status->rate &&
+ (status->rate->flags & RATE_INFO_FLAGS_HE_MCS)) {
+ struct ieee80211_radiotap_he *he;
+
+ rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_HE);
+
+ /* required alignment from rthdr */
+ pos = (u8 *)rthdr + ALIGN(pos - (u8 *)rthdr, 2);
+ he = (struct ieee80211_radiotap_he *)pos;
+
+ he->data1 = cpu_to_le16(IEEE80211_RADIOTAP_HE_DATA1_FORMAT_SU |
+ IEEE80211_RADIOTAP_HE_DATA1_DATA_MCS_KNOWN |
+ IEEE80211_RADIOTAP_HE_DATA1_DATA_DCM_KNOWN |
+ IEEE80211_RADIOTAP_HE_DATA1_BW_RU_ALLOC_KNOWN);
+
+ he->data2 = cpu_to_le16(IEEE80211_RADIOTAP_HE_DATA2_GI_KNOWN);
+
+#define HE_PREP(f, val) le16_encode_bits(val, IEEE80211_RADIOTAP_HE_##f)
+
+ he->data6 |= HE_PREP(DATA6_NSTS, status->rate->nss);
+
+#define CHECK_GI(s) \
+ BUILD_BUG_ON(IEEE80211_RADIOTAP_HE_DATA5_GI_##s != \
+ (int)NL80211_RATE_INFO_HE_GI_##s)
+
+ CHECK_GI(0_8);
+ CHECK_GI(1_6);
+ CHECK_GI(3_2);
+
+ he->data3 |= HE_PREP(DATA3_DATA_MCS, status->rate->mcs);
+ he->data3 |= HE_PREP(DATA3_DATA_DCM, status->rate->he_dcm);
+
+ he->data5 |= HE_PREP(DATA5_GI, status->rate->he_gi);
+
+ switch (status->rate->bw) {
+ case RATE_INFO_BW_20:
+ he->data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC,
+ IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_20MHZ);
+ break;
+ case RATE_INFO_BW_40:
+ he->data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC,
+ IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_40MHZ);
+ break;
+ case RATE_INFO_BW_80:
+ he->data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC,
+ IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_80MHZ);
+ break;
+ case RATE_INFO_BW_160:
+ he->data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC,
+ IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_160MHZ);
+ break;
+ case RATE_INFO_BW_HE_RU:
+#define CHECK_RU_ALLOC(s) \
+ BUILD_BUG_ON(IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_##s##T != \
+ NL80211_RATE_INFO_HE_RU_ALLOC_##s + 4)
+
+ CHECK_RU_ALLOC(26);
+ CHECK_RU_ALLOC(52);
+ CHECK_RU_ALLOC(106);
+ CHECK_RU_ALLOC(242);
+ CHECK_RU_ALLOC(484);
+ CHECK_RU_ALLOC(996);
+ CHECK_RU_ALLOC(2x996);
+
+ he->data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC,
+ status->rate->he_ru_alloc + 4);
+ break;
+ default:
+ WARN_ONCE(1, "Invalid SU BW %d\n", status->rate->bw);
+ }
+
+ pos += sizeof(struct ieee80211_radiotap_he);
+ }
+
+ if ((status && status->rate) || info->status.rates[0].idx < 0)
return;
/* IEEE80211_RADIOTAP_MCS
@@ -511,12 +670,26 @@ static void ieee80211_report_used_skb(struct ieee80211_local *local,
struct sk_buff *skb, bool dropped)
{
struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+ u16 tx_time_est = ieee80211_info_get_tx_time_est(info);
struct ieee80211_hdr *hdr = (void *)skb->data;
bool acked = info->flags & IEEE80211_TX_STAT_ACK;
if (dropped)
acked = false;
+ if (tx_time_est) {
+ struct sta_info *sta;
+
+ rcu_read_lock();
+
+ sta = sta_info_get_by_addrs(local, hdr->addr1, hdr->addr2);
+ ieee80211_sta_update_pending_airtime(local, sta,
+ skb_get_queue_mapping(skb),
+ tx_time_est,
+ true);
+ rcu_read_unlock();
+ }
+
if (info->flags & IEEE80211_TX_INTFL_MLME_CONN_TX) {
struct ieee80211_sub_if_data *sdata;
@@ -645,7 +818,8 @@ static int ieee80211_tx_get_rates(struct ieee80211_hw *hw,
void ieee80211_tx_monitor(struct ieee80211_local *local, struct sk_buff *skb,
struct ieee80211_supported_band *sband,
- int retry_count, int shift, bool send_to_cooked)
+ int retry_count, int shift, bool send_to_cooked,
+ struct ieee80211_tx_status *status)
{
struct sk_buff *skb2;
struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
@@ -654,14 +828,14 @@ void ieee80211_tx_monitor(struct ieee80211_local *local, struct sk_buff *skb,
int rtap_len;
/* send frame to monitor interfaces now */
- rtap_len = ieee80211_tx_radiotap_len(info);
+ rtap_len = ieee80211_tx_radiotap_len(info, status);
if (WARN_ON_ONCE(skb_headroom(skb) < rtap_len)) {
pr_err("ieee80211_tx_status: headroom too small\n");
dev_kfree_skb(skb);
return;
}
ieee80211_add_tx_radiotap_header(local, sband, skb, retry_count,
- rtap_len, shift);
+ rtap_len, shift, status);
/* XXX: is this sufficient for BPF? */
skb_reset_mac_header(skb);
@@ -717,6 +891,7 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw,
struct ieee80211_bar *bar;
int shift = 0;
int tid = IEEE80211_NUM_TIDS;
+ u16 tx_time_est;
rates_idx = ieee80211_tx_get_rates(hw, info, &retry_count);
@@ -826,6 +1001,17 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw,
ieee80211_sta_register_airtime(&sta->sta, tid,
info->status.tx_time, 0);
+ if ((tx_time_est = ieee80211_info_get_tx_time_est(info)) > 0) {
+ /* Do this here to avoid the expensive lookup of the sta
+ * in ieee80211_report_used_skb().
+ */
+ ieee80211_sta_update_pending_airtime(local, sta,
+ skb_get_queue_mapping(skb),
+ tx_time_est,
+ true);
+ ieee80211_info_set_tx_time_est(info, 0);
+ }
+
if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) {
if (info->flags & IEEE80211_TX_STAT_ACK) {
if (sta->status_stats.lost_packets)
@@ -870,7 +1056,8 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw,
I802_DEBUG_INC(local->dot11FailedCount);
}
- if (ieee80211_is_nullfunc(fc) && ieee80211_has_pm(fc) &&
+ if ((ieee80211_is_nullfunc(fc) || ieee80211_is_qos_nullfunc(fc)) &&
+ ieee80211_has_pm(fc) &&
ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS) &&
!(info->flags & IEEE80211_TX_CTL_INJECTED) &&
local->ps_sdata && !(local->scanning)) {
@@ -901,7 +1088,8 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw,
}
/* send to monitor interfaces */
- ieee80211_tx_monitor(local, skb, sband, retry_count, shift, send_to_cooked);
+ ieee80211_tx_monitor(local, skb, sband, retry_count, shift,
+ send_to_cooked, status);
}
void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
@@ -912,19 +1100,13 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
.skb = skb,
.info = IEEE80211_SKB_CB(skb),
};
- struct rhlist_head *tmp;
struct sta_info *sta;
rcu_read_lock();
- for_each_sta_info(local, hdr->addr1, sta, tmp) {
- /* skip wrong virtual interface */
- if (!ether_addr_equal(hdr->addr2, sta->sdata->vif.addr))
- continue;
-
+ sta = sta_info_get_by_addrs(local, hdr->addr1, hdr->addr2);
+ if (sta)
status.sta = &sta->sta;
- break;
- }
__ieee80211_tx_status(hw, &status);
rcu_read_unlock();
diff --git a/net/mac80211/tkip.c b/net/mac80211/tkip.c
index 727dc9f3f3b3..e7f57bb18f6e 100644
--- a/net/mac80211/tkip.c
+++ b/net/mac80211/tkip.c
@@ -263,9 +263,21 @@ int ieee80211_tkip_decrypt_data(struct arc4_ctx *ctx,
if ((keyid >> 6) != key->conf.keyidx)
return TKIP_DECRYPT_INVALID_KEYIDX;
- if (rx_ctx->ctx.state != TKIP_STATE_NOT_INIT &&
- (iv32 < rx_ctx->iv32 ||
- (iv32 == rx_ctx->iv32 && iv16 <= rx_ctx->iv16)))
+ /* Reject replays if the received TSC is smaller than or equal to the
+ * last received value in a valid message, but with an exception for
+ * the case where a new key has been set and no valid frame using that
+ * key has yet received and the local RSC was initialized to 0. This
+ * exception allows the very first frame sent by the transmitter to be
+ * accepted even if that transmitter were to use TSC 0 (IEEE 802.11
+ * described TSC to be initialized to 1 whenever a new key is taken into
+ * use).
+ */
+ if (iv32 < rx_ctx->iv32 ||
+ (iv32 == rx_ctx->iv32 &&
+ (iv16 < rx_ctx->iv16 ||
+ (iv16 == rx_ctx->iv16 &&
+ (rx_ctx->iv32 || rx_ctx->iv16 ||
+ rx_ctx->ctx.state != TKIP_STATE_NOT_INIT)))))
return TKIP_DECRYPT_REPLAY;
if (only_iv) {
diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h
index 3bb4459b52c7..427f51a0a994 100644
--- a/net/mac80211/trace.h
+++ b/net/mac80211/trace.h
@@ -408,20 +408,20 @@ TRACE_EVENT(drv_bss_info_changed,
__field(u32, basic_rates)
__array(int, mcast_rate, NUM_NL80211_BANDS)
__field(u16, ht_operation_mode)
- __field(s32, cqm_rssi_thold);
- __field(s32, cqm_rssi_hyst);
- __field(u32, channel_width);
- __field(u32, channel_cfreq1);
+ __field(s32, cqm_rssi_thold)
+ __field(s32, cqm_rssi_hyst)
+ __field(u32, channel_width)
+ __field(u32, channel_cfreq1)
__dynamic_array(u32, arp_addr_list,
info->arp_addr_cnt > IEEE80211_BSS_ARP_ADDR_LIST_LEN ?
IEEE80211_BSS_ARP_ADDR_LIST_LEN :
- info->arp_addr_cnt);
- __field(int, arp_addr_cnt);
- __field(bool, qos);
- __field(bool, idle);
- __field(bool, ps);
- __dynamic_array(u8, ssid, info->ssid_len);
- __field(bool, hidden_ssid);
+ info->arp_addr_cnt)
+ __field(int, arp_addr_cnt)
+ __field(bool, qos)
+ __field(bool, idle)
+ __field(bool, ps)
+ __dynamic_array(u8, ssid, info->ssid_len)
+ __field(bool, hidden_ssid)
__field(int, txpower)
__field(u8, p2p_oppps_ctwindow)
),
@@ -1242,9 +1242,10 @@ TRACE_EVENT(drv_remain_on_channel,
)
);
-DEFINE_EVENT(local_only_evt, drv_cancel_remain_on_channel,
- TP_PROTO(struct ieee80211_local *local),
- TP_ARGS(local)
+DEFINE_EVENT(local_sdata_evt, drv_cancel_remain_on_channel,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata),
+ TP_ARGS(local, sdata)
);
TRACE_EVENT(drv_set_ringparam,
@@ -1671,8 +1672,8 @@ TRACE_EVENT(drv_start_ap,
VIF_ENTRY
__field(u8, dtimper)
__field(u16, bcnint)
- __dynamic_array(u8, ssid, info->ssid_len);
- __field(bool, hidden_ssid);
+ __dynamic_array(u8, ssid, info->ssid_len)
+ __field(bool, hidden_ssid)
),
TP_fast_assign(
@@ -1738,7 +1739,7 @@ TRACE_EVENT(drv_join_ibss,
VIF_ENTRY
__field(u8, dtimper)
__field(u16, bcnint)
- __dynamic_array(u8, ssid, info->ssid_len);
+ __dynamic_array(u8, ssid, info->ssid_len)
),
TP_fast_assign(
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index f13eb2f61ccf..87def9cb91ff 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -162,6 +162,7 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx,
break;
}
case NL80211_BAND_5GHZ:
+ case NL80211_BAND_6GHZ:
if (r->flags & IEEE80211_RATE_MANDATORY_A)
mrate = r->bitrate;
break;
@@ -1616,7 +1617,7 @@ static bool ieee80211_queue_skb(struct ieee80211_local *local,
static bool ieee80211_tx_frags(struct ieee80211_local *local,
struct ieee80211_vif *vif,
- struct ieee80211_sta *sta,
+ struct sta_info *sta,
struct sk_buff_head *skbs,
bool txpending)
{
@@ -1678,7 +1679,7 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local,
spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
info->control.vif = vif;
- control.sta = sta;
+ control.sta = sta ? &sta->sta : NULL;
__skb_unlink(skb, skbs);
drv_tx(local, &control, skb);
@@ -1697,7 +1698,6 @@ static bool __ieee80211_tx(struct ieee80211_local *local,
struct ieee80211_tx_info *info;
struct ieee80211_sub_if_data *sdata;
struct ieee80211_vif *vif;
- struct ieee80211_sta *pubsta;
struct sk_buff *skb;
bool result = true;
__le16 fc;
@@ -1712,11 +1712,6 @@ static bool __ieee80211_tx(struct ieee80211_local *local,
if (sta && !sta->uploaded)
sta = NULL;
- if (sta)
- pubsta = &sta->sta;
- else
- pubsta = NULL;
-
switch (sdata->vif.type) {
case NL80211_IFTYPE_MONITOR:
if (sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) {
@@ -1743,8 +1738,7 @@ static bool __ieee80211_tx(struct ieee80211_local *local,
break;
}
- result = ieee80211_tx_frags(local, vif, pubsta, skbs,
- txpending);
+ result = ieee80211_tx_frags(local, vif, sta, skbs, txpending);
ieee80211_tpt_led_trig_tx(local, fc, led_len);
@@ -2262,6 +2256,15 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb,
payload[7]);
}
+ /*
+ * Initialize skb->priority for QoS frames. This is put in the TID field
+ * of the frame before passing it to the driver.
+ */
+ if (ieee80211_is_data_qos(hdr->frame_control)) {
+ u8 *p = ieee80211_get_qos_ctl(hdr);
+ skb->priority = *p & IEEE80211_QOS_CTL_TAG1D_MASK;
+ }
+
memset(info, 0, sizeof(*info));
info->flags = IEEE80211_TX_CTL_REQ_TX_STATUS |
@@ -2276,6 +2279,9 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb,
* isn't always enough to find the interface to use; for proper
* VLAN/WDS support we will need a different mechanism (which
* likely isn't going to be monitor interfaces).
+ *
+ * This is necessary, for example, for old hostapd versions that
+ * don't use nl80211-based management TX/RX.
*/
sdata = IEEE80211_DEV_TO_SUB_IF(dev);
@@ -2423,6 +2429,33 @@ static int ieee80211_lookup_ra_sta(struct ieee80211_sub_if_data *sdata,
return 0;
}
+static int ieee80211_store_ack_skb(struct ieee80211_local *local,
+ struct sk_buff *skb,
+ u32 *info_flags)
+{
+ struct sk_buff *ack_skb = skb_clone_sk(skb);
+ u16 info_id = 0;
+
+ if (ack_skb) {
+ unsigned long flags;
+ int id;
+
+ spin_lock_irqsave(&local->ack_status_lock, flags);
+ id = idr_alloc(&local->ack_status_frames, ack_skb,
+ 1, 0x2000, GFP_ATOMIC);
+ spin_unlock_irqrestore(&local->ack_status_lock, flags);
+
+ if (id >= 0) {
+ info_id = id;
+ *info_flags |= IEEE80211_TX_CTL_REQ_TX_STATUS;
+ } else {
+ kfree_skb(ack_skb);
+ }
+ }
+
+ return info_id;
+}
+
/**
* ieee80211_build_hdr - build 802.11 header in the given frame
* @sdata: virtual interface to build the header for
@@ -2716,26 +2749,8 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata,
}
if (unlikely(!multicast && skb->sk &&
- skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS)) {
- struct sk_buff *ack_skb = skb_clone_sk(skb);
-
- if (ack_skb) {
- unsigned long flags;
- int id;
-
- spin_lock_irqsave(&local->ack_status_lock, flags);
- id = idr_alloc(&local->ack_status_frames, ack_skb,
- 1, 0x10000, GFP_ATOMIC);
- spin_unlock_irqrestore(&local->ack_status_lock, flags);
-
- if (id >= 0) {
- info_id = id;
- info_flags |= IEEE80211_TX_CTL_REQ_TX_STATUS;
- } else {
- kfree_skb(ack_skb);
- }
- }
- }
+ skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS))
+ info_id = ieee80211_store_ack_skb(local, skb, &info_flags);
/*
* If the skb is shared we need to obtain our own copy.
@@ -3528,7 +3543,7 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
struct ieee80211_sub_if_data, u.ap);
__skb_queue_tail(&tx.skbs, skb);
- ieee80211_tx_frags(local, &sdata->vif, &sta->sta, &tx.skbs, false);
+ ieee80211_tx_frags(local, &sdata->vif, sta, &tx.skbs, false);
return true;
}
@@ -3546,6 +3561,11 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
ieee80211_tx_result r;
struct ieee80211_vif *vif = txq->vif;
+ WARN_ON_ONCE(softirq_count() == 0);
+
+ if (!ieee80211_txq_airtime_check(hw, txq))
+ return NULL;
+
begin:
spin_lock_bh(&fq->lock);
@@ -3656,6 +3676,21 @@ begin:
}
IEEE80211_SKB_CB(skb)->control.vif = vif;
+
+ if (wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_AQL)) {
+ u32 airtime;
+
+ airtime = ieee80211_calc_expected_tx_airtime(hw, vif, txq->sta,
+ skb->len);
+ if (airtime) {
+ airtime = ieee80211_info_set_tx_time_est(info, airtime);
+ ieee80211_sta_update_pending_airtime(local, tx.sta,
+ txq->ac,
+ airtime,
+ false);
+ }
+ }
+
return skb;
out:
@@ -3669,7 +3704,8 @@ struct ieee80211_txq *ieee80211_next_txq(struct ieee80211_hw *hw, u8 ac)
{
struct ieee80211_local *local = hw_to_local(hw);
struct ieee80211_txq *ret = NULL;
- struct txq_info *txqi = NULL;
+ struct txq_info *txqi = NULL, *head = NULL;
+ bool found_eligible_txq = false;
spin_lock_bh(&local->active_txq_lock[ac]);
@@ -3680,13 +3716,30 @@ struct ieee80211_txq *ieee80211_next_txq(struct ieee80211_hw *hw, u8 ac)
if (!txqi)
goto out;
+ if (txqi == head) {
+ if (!found_eligible_txq)
+ goto out;
+ else
+ found_eligible_txq = false;
+ }
+
+ if (!head)
+ head = txqi;
+
if (txqi->txq.sta) {
struct sta_info *sta = container_of(txqi->txq.sta,
- struct sta_info, sta);
+ struct sta_info, sta);
+ bool aql_check = ieee80211_txq_airtime_check(hw, &txqi->txq);
+ s64 deficit = sta->airtime[txqi->txq.ac].deficit;
- if (sta->airtime[txqi->txq.ac].deficit < 0) {
+ if (aql_check)
+ found_eligible_txq = true;
+
+ if (deficit < 0)
sta->airtime[txqi->txq.ac].deficit +=
sta->airtime_weight;
+
+ if (deficit < 0 || !aql_check) {
list_move_tail(&txqi->schedule_order,
&local->active_txqs[txqi->txq.ac]);
goto begin;
@@ -3740,6 +3793,33 @@ void __ieee80211_schedule_txq(struct ieee80211_hw *hw,
}
EXPORT_SYMBOL(__ieee80211_schedule_txq);
+bool ieee80211_txq_airtime_check(struct ieee80211_hw *hw,
+ struct ieee80211_txq *txq)
+{
+ struct sta_info *sta;
+ struct ieee80211_local *local = hw_to_local(hw);
+
+ if (!wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_AQL))
+ return true;
+
+ if (!txq->sta)
+ return true;
+
+ sta = container_of(txq->sta, struct sta_info, sta);
+ if (atomic_read(&sta->airtime[txq->ac].aql_tx_pending) <
+ sta->airtime[txq->ac].aql_limit_low)
+ return true;
+
+ if (atomic_read(&local->aql_total_pending_airtime) <
+ local->aql_threshold &&
+ atomic_read(&sta->airtime[txq->ac].aql_tx_pending) <
+ sta->airtime[txq->ac].aql_limit_high)
+ return true;
+
+ return false;
+}
+EXPORT_SYMBOL(ieee80211_txq_airtime_check);
+
bool ieee80211_txq_may_transmit(struct ieee80211_hw *hw,
struct ieee80211_txq *txq)
{
@@ -3869,18 +3949,15 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb,
}
}
- next = skb;
- while (next) {
- skb = next;
- next = skb->next;
-
- skb->prev = NULL;
- skb->next = NULL;
+ skb_list_walk_safe(skb, skb, next) {
+ skb_mark_not_on_list(skb);
skb = ieee80211_build_hdr(sdata, skb, info_flags,
sta, ctrl_flags);
- if (IS_ERR(skb))
+ if (IS_ERR(skb)) {
+ kfree_skb_list(next);
goto out;
+ }
ieee80211_tx_stats(dev, skb->len);
@@ -4647,7 +4724,8 @@ struct sk_buff *ieee80211_beacon_get_tim(struct ieee80211_hw *hw,
if (!sband)
return bcn;
- ieee80211_tx_monitor(hw_to_local(hw), copy, sband, 1, shift, false);
+ ieee80211_tx_monitor(hw_to_local(hw), copy, sband, 1, shift, false,
+ NULL);
return bcn;
}
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index ad1e58184c4e..decd46b38393 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -247,7 +247,8 @@ static void __ieee80211_wake_txqs(struct ieee80211_sub_if_data *sdata, int ac)
struct sta_info *sta;
int i;
- spin_lock_bh(&fq->lock);
+ local_bh_disable();
+ spin_lock(&fq->lock);
if (sdata->vif.type == NL80211_IFTYPE_AP)
ps = &sdata->bss->ps;
@@ -273,9 +274,9 @@ static void __ieee80211_wake_txqs(struct ieee80211_sub_if_data *sdata, int ac)
&txqi->flags))
continue;
- spin_unlock_bh(&fq->lock);
+ spin_unlock(&fq->lock);
drv_wake_tx_queue(local, txqi);
- spin_lock_bh(&fq->lock);
+ spin_lock(&fq->lock);
}
}
@@ -288,12 +289,14 @@ static void __ieee80211_wake_txqs(struct ieee80211_sub_if_data *sdata, int ac)
(ps && atomic_read(&ps->num_sta_ps)) || ac != vif->txq->ac)
goto out;
- spin_unlock_bh(&fq->lock);
+ spin_unlock(&fq->lock);
drv_wake_tx_queue(local, txqi);
+ local_bh_enable();
return;
out:
- spin_unlock_bh(&fq->lock);
+ spin_unlock(&fq->lock);
+ local_bh_enable();
}
static void
@@ -1060,16 +1063,22 @@ _ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action,
elem_parse_failed = true;
break;
case WLAN_EID_VHT_OPERATION:
- if (elen >= sizeof(struct ieee80211_vht_operation))
+ if (elen >= sizeof(struct ieee80211_vht_operation)) {
elems->vht_operation = (void *)pos;
- else
- elem_parse_failed = true;
+ if (calc_crc)
+ crc = crc32_be(crc, pos - 2, elen + 2);
+ break;
+ }
+ elem_parse_failed = true;
break;
case WLAN_EID_OPMODE_NOTIF:
- if (elen > 0)
+ if (elen > 0) {
elems->opmode_notif = pos;
- else
- elem_parse_failed = true;
+ if (calc_crc)
+ crc = crc32_be(crc, pos - 2, elen + 2);
+ break;
+ }
+ elem_parse_failed = true;
break;
case WLAN_EID_MESH_ID:
elems->mesh_id = pos;
@@ -1200,6 +1209,13 @@ _ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action,
elems->cisco_dtpc_elem = pos;
break;
+ case WLAN_EID_ADDBA_EXT:
+ if (elen != sizeof(struct ieee80211_addba_ext_ie)) {
+ elem_parse_failed = true;
+ break;
+ }
+ elems->addba_ext_ie = (void *)pos;
+ break;
case WLAN_EID_TIMEOUT_INTERVAL:
if (elen >= sizeof(struct ieee80211_timeout_interval_ie))
elems->timeout_int = (void *)pos;
@@ -1233,6 +1249,10 @@ _ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action,
WLAN_EID_EXT_MULTIPLE_BSSID_CONFIGURATION &&
elen == 3) {
elems->mbssid_config_ie = (void *)&pos[1];
+ } else if (pos[0] == WLAN_EID_EXT_HE_SPR &&
+ elen >= sizeof(*elems->he_spr) &&
+ elen >= ieee80211_he_spr_size(&pos[1])) {
+ elems->he_spr = (void *)&pos[1];
}
break;
default:
@@ -1572,7 +1592,8 @@ void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata,
}
void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata,
- const u8 *bssid, u16 stype, u16 reason,
+ const u8 *da, const u8 *bssid,
+ u16 stype, u16 reason,
bool send_frame, u8 *frame_buf)
{
struct ieee80211_local *local = sdata->local;
@@ -1583,7 +1604,7 @@ void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata,
mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | stype);
mgmt->duration = 0; /* initialize only */
mgmt->seq_ctrl = 0; /* initialize only */
- memcpy(mgmt->da, bssid, ETH_ALEN);
+ memcpy(mgmt->da, da, ETH_ALEN);
memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN);
memcpy(mgmt->bssid, bssid, ETH_ALEN);
/* u.deauth.reason_code == u.disassoc.reason_code */
@@ -2409,11 +2430,7 @@ int ieee80211_reconfig(struct ieee80211_local *local)
/* add back keys */
list_for_each_entry(sdata, &local->interfaces, list)
- ieee80211_reset_crypto_tx_tailroom(sdata);
-
- list_for_each_entry(sdata, &local->interfaces, list)
- if (ieee80211_sdata_running(sdata))
- ieee80211_enable_keys(sdata);
+ ieee80211_reenable_keys(sdata);
/* Reconfigure sched scan if it was interrupted by FW restart */
mutex_lock(&local->mtx);
@@ -2702,6 +2719,27 @@ u8 *ieee80211_ie_build_vht_cap(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap,
return pos;
}
+u8 ieee80211_ie_len_he_cap(struct ieee80211_sub_if_data *sdata, u8 iftype)
+{
+ const struct ieee80211_sta_he_cap *he_cap;
+ struct ieee80211_supported_band *sband;
+ u8 n;
+
+ sband = ieee80211_get_sband(sdata);
+ if (!sband)
+ return 0;
+
+ he_cap = ieee80211_get_he_iftype_cap(sband, iftype);
+ if (!he_cap)
+ return 0;
+
+ n = ieee80211_he_mcs_nss_size(&he_cap->he_cap_elem);
+ return 2 + 1 +
+ sizeof(he_cap->he_cap_elem) + n +
+ ieee80211_he_ppe_size(he_cap->ppe_thres[0],
+ he_cap->he_cap_elem.phy_cap_info);
+}
+
u8 *ieee80211_ie_build_he_cap(u8 *pos,
const struct ieee80211_sta_he_cap *he_cap,
u8 *end)
@@ -2891,6 +2929,34 @@ u8 *ieee80211_ie_build_vht_oper(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap,
return pos + sizeof(struct ieee80211_vht_operation);
}
+u8 *ieee80211_ie_build_he_oper(u8 *pos)
+{
+ struct ieee80211_he_operation *he_oper;
+ u32 he_oper_params;
+
+ *pos++ = WLAN_EID_EXTENSION;
+ *pos++ = 1 + sizeof(struct ieee80211_he_operation);
+ *pos++ = WLAN_EID_EXT_HE_OPERATION;
+
+ he_oper_params = 0;
+ he_oper_params |= u32_encode_bits(1023, /* disabled */
+ IEEE80211_HE_OPERATION_RTS_THRESHOLD_MASK);
+ he_oper_params |= u32_encode_bits(1,
+ IEEE80211_HE_OPERATION_ER_SU_DISABLE);
+ he_oper_params |= u32_encode_bits(1,
+ IEEE80211_HE_OPERATION_BSS_COLOR_DISABLED);
+
+ he_oper = (struct ieee80211_he_operation *)pos;
+ he_oper->he_oper_params = cpu_to_le32(he_oper_params);
+
+ /* don't require special HE peer rates */
+ he_oper->he_mcs_nss_set = cpu_to_le16(0xffff);
+
+ /* TODO add VHT operational and 6GHz operational subelement? */
+
+ return pos + sizeof(struct ieee80211_vht_operation);
+}
+
bool ieee80211_chandef_ht_oper(const struct ieee80211_ht_operation *ht_oper,
struct cfg80211_chan_def *chandef)
{
@@ -2927,10 +2993,22 @@ bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw,
int cf0, cf1;
int ccfs0, ccfs1, ccfs2;
int ccf0, ccf1;
+ u32 vht_cap;
+ bool support_80_80 = false;
+ bool support_160 = false;
if (!oper || !htop)
return false;
+ vht_cap = hw->wiphy->bands[chandef->chan->band]->vht_cap.cap;
+ support_160 = (vht_cap & (IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK |
+ IEEE80211_VHT_CAP_EXT_NSS_BW_MASK));
+ support_80_80 = ((vht_cap &
+ IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ) ||
+ (vht_cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ &&
+ vht_cap & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK) ||
+ ((vht_cap & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK) >>
+ IEEE80211_VHT_CAP_EXT_NSS_BW_SHIFT > 1));
ccfs0 = oper->center_freq_seg0_idx;
ccfs1 = oper->center_freq_seg1_idx;
ccfs2 = (le16_to_cpu(htop->operation_mode) &
@@ -2958,10 +3036,10 @@ bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw,
unsigned int diff;
diff = abs(ccf1 - ccf0);
- if (diff == 8) {
+ if ((diff == 8) && support_160) {
new.width = NL80211_CHAN_WIDTH_160;
new.center_freq1 = cf1;
- } else if (diff > 8) {
+ } else if ((diff > 8) && support_80_80) {
new.width = NL80211_CHAN_WIDTH_80P80;
new.center_freq2 = cf1;
}
diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c
index b20ff28d9f30..ccdcb9ad9ac7 100644
--- a/net/mac80211/vht.c
+++ b/net/mac80211/vht.c
@@ -4,7 +4,7 @@
*
* Portions of this file
* Copyright(c) 2015 - 2016 Intel Deutschland GmbH
- * Copyright (C) 2018 Intel Corporation
+ * Copyright (C) 2018 - 2019 Intel Corporation
*/
#include <linux/ieee80211.h>
@@ -349,6 +349,14 @@ enum ieee80211_sta_rx_bandwidth ieee80211_sta_cap_rx_bw(struct sta_info *sta)
cap_width == IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ)
return IEEE80211_STA_RX_BW_160;
+ /*
+ * If this is non-zero, then it does support 160 MHz after all,
+ * in one form or the other. We don't distinguish here (or even
+ * above) between 160 and 80+80 yet.
+ */
+ if (vht_cap->cap & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK)
+ return IEEE80211_STA_RX_BW_160;
+
return IEEE80211_STA_RX_BW_80;
}
diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c
index ee72779729e5..91bf32af55e9 100644
--- a/net/mac80211/wpa.c
+++ b/net/mac80211/wpa.c
@@ -946,7 +946,8 @@ ieee80211_crypto_aes_cmac_encrypt(struct ieee80211_tx_data *tx)
info = IEEE80211_SKB_CB(skb);
- if (info->control.hw_key)
+ if (info->control.hw_key &&
+ !(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIE))
return TX_CONTINUE;
if (WARN_ON(skb_tailroom(skb) < sizeof(*mmie)))
@@ -962,6 +963,9 @@ ieee80211_crypto_aes_cmac_encrypt(struct ieee80211_tx_data *tx)
bip_ipn_set64(mmie->sequence_number, pn64);
+ if (info->control.hw_key)
+ return TX_CONTINUE;
+
bip_aad(skb, aad);
/*
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index c312741df2ce..4701edffb1f7 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -617,16 +617,15 @@ static struct net_device *inet6_fib_lookup_dev(struct net *net,
struct net_device *dev;
struct dst_entry *dst;
struct flowi6 fl6;
- int err;
if (!ipv6_stub)
return ERR_PTR(-EAFNOSUPPORT);
memset(&fl6, 0, sizeof(fl6));
memcpy(&fl6.daddr, addr, sizeof(struct in6_addr));
- err = ipv6_stub->ipv6_dst_lookup(net, NULL, &dst, &fl6);
- if (err)
- return ERR_PTR(err);
+ dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL);
+ if (IS_ERR(dst))
+ return ERR_CAST(dst);
dev = dst->dev;
dev_hold(dev);
diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c
index d25e91d7bdc1..44b675016393 100644
--- a/net/mpls/mpls_iptunnel.c
+++ b/net/mpls/mpls_iptunnel.c
@@ -133,12 +133,12 @@ static int mpls_xmit(struct sk_buff *skb)
mpls_stats_inc_outucastpkts(out_dev, skb);
if (rt) {
- if (rt->rt_gw_family == AF_INET)
- err = neigh_xmit(NEIGH_ARP_TABLE, out_dev, &rt->rt_gw4,
- skb);
- else if (rt->rt_gw_family == AF_INET6)
+ if (rt->rt_gw_family == AF_INET6)
err = neigh_xmit(NEIGH_ND_TABLE, out_dev, &rt->rt_gw6,
skb);
+ else
+ err = neigh_xmit(NEIGH_ARP_TABLE, out_dev, &rt->rt_gw4,
+ skb);
} else if (rt6) {
if (ipv6_addr_v4mapped(&rt6->rt6i_gateway)) {
/* 6PE (RFC 4798) */
diff --git a/net/mptcp/Kconfig b/net/mptcp/Kconfig
new file mode 100644
index 000000000000..49f6054e7f4e
--- /dev/null
+++ b/net/mptcp/Kconfig
@@ -0,0 +1,28 @@
+
+config MPTCP
+ bool "MPTCP: Multipath TCP"
+ depends on INET
+ select SKB_EXTENSIONS
+ select CRYPTO_LIB_SHA256
+ help
+ Multipath TCP (MPTCP) connections send and receive data over multiple
+ subflows in order to utilize multiple network paths. Each subflow
+ uses the TCP protocol, and TCP options carry header information for
+ MPTCP.
+
+if MPTCP
+
+config MPTCP_IPV6
+ bool "MPTCP: IPv6 support for Multipath TCP"
+ select IPV6
+ default y
+
+config MPTCP_HMAC_TEST
+ bool "Tests for MPTCP HMAC implementation"
+ help
+ This option enable boot time self-test for the HMAC implementation
+ used by the MPTCP code
+
+ Say N if you are unsure.
+
+endif
diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile
new file mode 100644
index 000000000000..4e98d9edfd0a
--- /dev/null
+++ b/net/mptcp/Makefile
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_MPTCP) += mptcp.o
+
+mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o
diff --git a/net/mptcp/crypto.c b/net/mptcp/crypto.c
new file mode 100644
index 000000000000..40d1bb18fd60
--- /dev/null
+++ b/net/mptcp/crypto.c
@@ -0,0 +1,152 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Multipath TCP cryptographic functions
+ * Copyright (c) 2017 - 2019, Intel Corporation.
+ *
+ * Note: This code is based on mptcp_ctrl.c, mptcp_ipv4.c, and
+ * mptcp_ipv6 from multipath-tcp.org, authored by:
+ *
+ * Sébastien Barré <sebastien.barre@uclouvain.be>
+ * Christoph Paasch <christoph.paasch@uclouvain.be>
+ * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
+ * Gregory Detal <gregory.detal@uclouvain.be>
+ * Fabien DuchĂŞne <fabien.duchene@uclouvain.be>
+ * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
+ * Lavkesh Lahngir <lavkesh51@gmail.com>
+ * Andreas Ripke <ripke@neclab.eu>
+ * Vlad Dogaru <vlad.dogaru@intel.com>
+ * Octavian Purdila <octavian.purdila@intel.com>
+ * John Ronan <jronan@tssg.org>
+ * Catalin Nicutar <catalin.nicutar@gmail.com>
+ * Brandon Heller <brandonh@stanford.edu>
+ */
+
+#include <linux/kernel.h>
+#include <crypto/sha.h>
+#include <asm/unaligned.h>
+
+#include "protocol.h"
+
+#define SHA256_DIGEST_WORDS (SHA256_DIGEST_SIZE / 4)
+
+void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn)
+{
+ __be32 mptcp_hashed_key[SHA256_DIGEST_WORDS];
+ __be64 input = cpu_to_be64(key);
+ struct sha256_state state;
+
+ sha256_init(&state);
+ sha256_update(&state, (__force u8 *)&input, sizeof(input));
+ sha256_final(&state, (u8 *)mptcp_hashed_key);
+
+ if (token)
+ *token = be32_to_cpu(mptcp_hashed_key[0]);
+ if (idsn)
+ *idsn = be64_to_cpu(*((__be64 *)&mptcp_hashed_key[6]));
+}
+
+void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u32 nonce1, u32 nonce2,
+ void *hmac)
+{
+ u8 input[SHA256_BLOCK_SIZE + SHA256_DIGEST_SIZE];
+ __be32 mptcp_hashed_key[SHA256_DIGEST_WORDS];
+ __be32 *hash_out = (__force __be32 *)hmac;
+ struct sha256_state state;
+ u8 key1be[8];
+ u8 key2be[8];
+ int i;
+
+ put_unaligned_be64(key1, key1be);
+ put_unaligned_be64(key2, key2be);
+
+ /* Generate key xored with ipad */
+ memset(input, 0x36, SHA_MESSAGE_BYTES);
+ for (i = 0; i < 8; i++)
+ input[i] ^= key1be[i];
+ for (i = 0; i < 8; i++)
+ input[i + 8] ^= key2be[i];
+
+ put_unaligned_be32(nonce1, &input[SHA256_BLOCK_SIZE]);
+ put_unaligned_be32(nonce2, &input[SHA256_BLOCK_SIZE + 4]);
+
+ sha256_init(&state);
+ sha256_update(&state, input, SHA256_BLOCK_SIZE + 8);
+
+ /* emit sha256(K1 || msg) on the second input block, so we can
+ * reuse 'input' for the last hashing
+ */
+ sha256_final(&state, &input[SHA256_BLOCK_SIZE]);
+
+ /* Prepare second part of hmac */
+ memset(input, 0x5C, SHA_MESSAGE_BYTES);
+ for (i = 0; i < 8; i++)
+ input[i] ^= key1be[i];
+ for (i = 0; i < 8; i++)
+ input[i + 8] ^= key2be[i];
+
+ sha256_init(&state);
+ sha256_update(&state, input, SHA256_BLOCK_SIZE + SHA256_DIGEST_SIZE);
+ sha256_final(&state, (u8 *)mptcp_hashed_key);
+
+ /* takes only first 160 bits */
+ for (i = 0; i < 5; i++)
+ hash_out[i] = mptcp_hashed_key[i];
+}
+
+#ifdef CONFIG_MPTCP_HMAC_TEST
+struct test_cast {
+ char *key;
+ char *msg;
+ char *result;
+};
+
+/* we can't reuse RFC 4231 test vectors, as we have constraint on the
+ * input and key size, and we truncate the output.
+ */
+static struct test_cast tests[] = {
+ {
+ .key = "0b0b0b0b0b0b0b0b",
+ .msg = "48692054",
+ .result = "8385e24fb4235ac37556b6b886db106284a1da67",
+ },
+ {
+ .key = "aaaaaaaaaaaaaaaa",
+ .msg = "dddddddd",
+ .result = "2c5e219164ff1dca1c4a92318d847bb6b9d44492",
+ },
+ {
+ .key = "0102030405060708",
+ .msg = "cdcdcdcd",
+ .result = "e73b9ba9969969cefb04aa0d6df18ec2fcc075b6",
+ },
+};
+
+static int __init test_mptcp_crypto(void)
+{
+ char hmac[20], hmac_hex[41];
+ u32 nonce1, nonce2;
+ u64 key1, key2;
+ int i, j;
+
+ for (i = 0; i < ARRAY_SIZE(tests); ++i) {
+ /* mptcp hmap will convert to be before computing the hmac */
+ key1 = be64_to_cpu(*((__be64 *)&tests[i].key[0]));
+ key2 = be64_to_cpu(*((__be64 *)&tests[i].key[8]));
+ nonce1 = be32_to_cpu(*((__be32 *)&tests[i].msg[0]));
+ nonce2 = be32_to_cpu(*((__be32 *)&tests[i].msg[4]));
+
+ mptcp_crypto_hmac_sha(key1, key2, nonce1, nonce2, hmac);
+ for (j = 0; j < 20; ++j)
+ sprintf(&hmac_hex[j << 1], "%02x", hmac[j] & 0xff);
+ hmac_hex[40] = 0;
+
+ if (memcmp(hmac_hex, tests[i].result, 40))
+ pr_err("test %d failed, got %s expected %s", i,
+ hmac_hex, tests[i].result);
+ else
+ pr_info("test %d [ ok ]", i);
+ }
+ return 0;
+}
+
+late_initcall(test_mptcp_crypto);
+#endif
diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c
new file mode 100644
index 000000000000..8e39585d37f3
--- /dev/null
+++ b/net/mptcp/ctrl.c
@@ -0,0 +1,130 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Multipath TCP
+ *
+ * Copyright (c) 2019, Tessares SA.
+ */
+
+#include <linux/sysctl.h>
+
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+#include "protocol.h"
+
+#define MPTCP_SYSCTL_PATH "net/mptcp"
+
+static int mptcp_pernet_id;
+struct mptcp_pernet {
+ struct ctl_table_header *ctl_table_hdr;
+
+ int mptcp_enabled;
+};
+
+static struct mptcp_pernet *mptcp_get_pernet(struct net *net)
+{
+ return net_generic(net, mptcp_pernet_id);
+}
+
+int mptcp_is_enabled(struct net *net)
+{
+ return mptcp_get_pernet(net)->mptcp_enabled;
+}
+
+static struct ctl_table mptcp_sysctl_table[] = {
+ {
+ .procname = "enabled",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ /* users with CAP_NET_ADMIN or root (not and) can change this
+ * value, same as other sysctl or the 'net' tree.
+ */
+ .proc_handler = proc_dointvec,
+ },
+ {}
+};
+
+static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet)
+{
+ pernet->mptcp_enabled = 1;
+}
+
+static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet)
+{
+ struct ctl_table_header *hdr;
+ struct ctl_table *table;
+
+ table = mptcp_sysctl_table;
+ if (!net_eq(net, &init_net)) {
+ table = kmemdup(table, sizeof(mptcp_sysctl_table), GFP_KERNEL);
+ if (!table)
+ goto err_alloc;
+ }
+
+ table[0].data = &pernet->mptcp_enabled;
+
+ hdr = register_net_sysctl(net, MPTCP_SYSCTL_PATH, table);
+ if (!hdr)
+ goto err_reg;
+
+ pernet->ctl_table_hdr = hdr;
+
+ return 0;
+
+err_reg:
+ if (!net_eq(net, &init_net))
+ kfree(table);
+err_alloc:
+ return -ENOMEM;
+}
+
+static void mptcp_pernet_del_table(struct mptcp_pernet *pernet)
+{
+ struct ctl_table *table = pernet->ctl_table_hdr->ctl_table_arg;
+
+ unregister_net_sysctl_table(pernet->ctl_table_hdr);
+
+ kfree(table);
+}
+
+static int __net_init mptcp_net_init(struct net *net)
+{
+ struct mptcp_pernet *pernet = mptcp_get_pernet(net);
+
+ mptcp_pernet_set_defaults(pernet);
+
+ return mptcp_pernet_new_table(net, pernet);
+}
+
+/* Note: the callback will only be called per extra netns */
+static void __net_exit mptcp_net_exit(struct net *net)
+{
+ struct mptcp_pernet *pernet = mptcp_get_pernet(net);
+
+ mptcp_pernet_del_table(pernet);
+}
+
+static struct pernet_operations mptcp_pernet_ops = {
+ .init = mptcp_net_init,
+ .exit = mptcp_net_exit,
+ .id = &mptcp_pernet_id,
+ .size = sizeof(struct mptcp_pernet),
+};
+
+void __init mptcp_init(void)
+{
+ mptcp_proto_init();
+
+ if (register_pernet_subsys(&mptcp_pernet_ops) < 0)
+ panic("Failed to register MPTCP pernet subsystem.\n");
+}
+
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+int __init mptcpv6_init(void)
+{
+ int err;
+
+ err = mptcp_proto_v6_init();
+
+ return err;
+}
+#endif
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
new file mode 100644
index 000000000000..45acd877bef3
--- /dev/null
+++ b/net/mptcp/options.c
@@ -0,0 +1,586 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Multipath TCP
+ *
+ * Copyright (c) 2017 - 2019, Intel Corporation.
+ */
+
+#include <linux/kernel.h>
+#include <net/tcp.h>
+#include <net/mptcp.h>
+#include "protocol.h"
+
+static bool mptcp_cap_flag_sha256(u8 flags)
+{
+ return (flags & MPTCP_CAP_FLAG_MASK) == MPTCP_CAP_HMAC_SHA256;
+}
+
+void mptcp_parse_option(const struct sk_buff *skb, const unsigned char *ptr,
+ int opsize, struct tcp_options_received *opt_rx)
+{
+ struct mptcp_options_received *mp_opt = &opt_rx->mptcp;
+ u8 subtype = *ptr >> 4;
+ int expected_opsize;
+ u8 version;
+ u8 flags;
+
+ switch (subtype) {
+ case MPTCPOPT_MP_CAPABLE:
+ /* strict size checking */
+ if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
+ if (skb->len > tcp_hdr(skb)->doff << 2)
+ expected_opsize = TCPOLEN_MPTCP_MPC_ACK_DATA;
+ else
+ expected_opsize = TCPOLEN_MPTCP_MPC_ACK;
+ } else {
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)
+ expected_opsize = TCPOLEN_MPTCP_MPC_SYNACK;
+ else
+ expected_opsize = TCPOLEN_MPTCP_MPC_SYN;
+ }
+ if (opsize != expected_opsize)
+ break;
+
+ /* try to be gentle vs future versions on the initial syn */
+ version = *ptr++ & MPTCP_VERSION_MASK;
+ if (opsize != TCPOLEN_MPTCP_MPC_SYN) {
+ if (version != MPTCP_SUPPORTED_VERSION)
+ break;
+ } else if (version < MPTCP_SUPPORTED_VERSION) {
+ break;
+ }
+
+ flags = *ptr++;
+ if (!mptcp_cap_flag_sha256(flags) ||
+ (flags & MPTCP_CAP_EXTENSIBILITY))
+ break;
+
+ /* RFC 6824, Section 3.1:
+ * "For the Checksum Required bit (labeled "A"), if either
+ * host requires the use of checksums, checksums MUST be used.
+ * In other words, the only way for checksums not to be used
+ * is if both hosts in their SYNs set A=0."
+ *
+ * Section 3.3.0:
+ * "If a checksum is not present when its use has been
+ * negotiated, the receiver MUST close the subflow with a RST as
+ * it is considered broken."
+ *
+ * We don't implement DSS checksum - fall back to TCP.
+ */
+ if (flags & MPTCP_CAP_CHECKSUM_REQD)
+ break;
+
+ mp_opt->mp_capable = 1;
+ if (opsize >= TCPOLEN_MPTCP_MPC_SYNACK) {
+ mp_opt->sndr_key = get_unaligned_be64(ptr);
+ ptr += 8;
+ }
+ if (opsize >= TCPOLEN_MPTCP_MPC_ACK) {
+ mp_opt->rcvr_key = get_unaligned_be64(ptr);
+ ptr += 8;
+ }
+ if (opsize == TCPOLEN_MPTCP_MPC_ACK_DATA) {
+ /* Section 3.1.:
+ * "the data parameters in a MP_CAPABLE are semantically
+ * equivalent to those in a DSS option and can be used
+ * interchangeably."
+ */
+ mp_opt->dss = 1;
+ mp_opt->use_map = 1;
+ mp_opt->mpc_map = 1;
+ mp_opt->data_len = get_unaligned_be16(ptr);
+ ptr += 2;
+ }
+ pr_debug("MP_CAPABLE version=%x, flags=%x, optlen=%d sndr=%llu, rcvr=%llu len=%d",
+ version, flags, opsize, mp_opt->sndr_key,
+ mp_opt->rcvr_key, mp_opt->data_len);
+ break;
+
+ case MPTCPOPT_DSS:
+ pr_debug("DSS");
+ ptr++;
+
+ /* we must clear 'mpc_map' be able to detect MP_CAPABLE
+ * map vs DSS map in mptcp_incoming_options(), and reconstruct
+ * map info accordingly
+ */
+ mp_opt->mpc_map = 0;
+ flags = (*ptr++) & MPTCP_DSS_FLAG_MASK;
+ mp_opt->data_fin = (flags & MPTCP_DSS_DATA_FIN) != 0;
+ mp_opt->dsn64 = (flags & MPTCP_DSS_DSN64) != 0;
+ mp_opt->use_map = (flags & MPTCP_DSS_HAS_MAP) != 0;
+ mp_opt->ack64 = (flags & MPTCP_DSS_ACK64) != 0;
+ mp_opt->use_ack = (flags & MPTCP_DSS_HAS_ACK);
+
+ pr_debug("data_fin=%d dsn64=%d use_map=%d ack64=%d use_ack=%d",
+ mp_opt->data_fin, mp_opt->dsn64,
+ mp_opt->use_map, mp_opt->ack64,
+ mp_opt->use_ack);
+
+ expected_opsize = TCPOLEN_MPTCP_DSS_BASE;
+
+ if (mp_opt->use_ack) {
+ if (mp_opt->ack64)
+ expected_opsize += TCPOLEN_MPTCP_DSS_ACK64;
+ else
+ expected_opsize += TCPOLEN_MPTCP_DSS_ACK32;
+ }
+
+ if (mp_opt->use_map) {
+ if (mp_opt->dsn64)
+ expected_opsize += TCPOLEN_MPTCP_DSS_MAP64;
+ else
+ expected_opsize += TCPOLEN_MPTCP_DSS_MAP32;
+ }
+
+ /* RFC 6824, Section 3.3:
+ * If a checksum is present, but its use had
+ * not been negotiated in the MP_CAPABLE handshake,
+ * the checksum field MUST be ignored.
+ */
+ if (opsize != expected_opsize &&
+ opsize != expected_opsize + TCPOLEN_MPTCP_DSS_CHECKSUM)
+ break;
+
+ mp_opt->dss = 1;
+
+ if (mp_opt->use_ack) {
+ if (mp_opt->ack64) {
+ mp_opt->data_ack = get_unaligned_be64(ptr);
+ ptr += 8;
+ } else {
+ mp_opt->data_ack = get_unaligned_be32(ptr);
+ ptr += 4;
+ }
+
+ pr_debug("data_ack=%llu", mp_opt->data_ack);
+ }
+
+ if (mp_opt->use_map) {
+ if (mp_opt->dsn64) {
+ mp_opt->data_seq = get_unaligned_be64(ptr);
+ ptr += 8;
+ } else {
+ mp_opt->data_seq = get_unaligned_be32(ptr);
+ ptr += 4;
+ }
+
+ mp_opt->subflow_seq = get_unaligned_be32(ptr);
+ ptr += 4;
+
+ mp_opt->data_len = get_unaligned_be16(ptr);
+ ptr += 2;
+
+ pr_debug("data_seq=%llu subflow_seq=%u data_len=%u",
+ mp_opt->data_seq, mp_opt->subflow_seq,
+ mp_opt->data_len);
+ }
+
+ break;
+
+ default:
+ break;
+ }
+}
+
+void mptcp_get_options(const struct sk_buff *skb,
+ struct tcp_options_received *opt_rx)
+{
+ const unsigned char *ptr;
+ const struct tcphdr *th = tcp_hdr(skb);
+ int length = (th->doff * 4) - sizeof(struct tcphdr);
+
+ ptr = (const unsigned char *)(th + 1);
+
+ while (length > 0) {
+ int opcode = *ptr++;
+ int opsize;
+
+ switch (opcode) {
+ case TCPOPT_EOL:
+ return;
+ case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
+ length--;
+ continue;
+ default:
+ opsize = *ptr++;
+ if (opsize < 2) /* "silly options" */
+ return;
+ if (opsize > length)
+ return; /* don't parse partial options */
+ if (opcode == TCPOPT_MPTCP)
+ mptcp_parse_option(skb, ptr, opsize, opt_rx);
+ ptr += opsize - 2;
+ length -= opsize;
+ }
+ }
+}
+
+bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb,
+ unsigned int *size, struct mptcp_out_options *opts)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+
+ /* we will use snd_isn to detect first pkt [re]transmission
+ * in mptcp_established_options_mp()
+ */
+ subflow->snd_isn = TCP_SKB_CB(skb)->end_seq;
+ if (subflow->request_mptcp) {
+ pr_debug("local_key=%llu", subflow->local_key);
+ opts->suboptions = OPTION_MPTCP_MPC_SYN;
+ opts->sndr_key = subflow->local_key;
+ *size = TCPOLEN_MPTCP_MPC_SYN;
+ return true;
+ }
+ return false;
+}
+
+void mptcp_rcv_synsent(struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ pr_debug("subflow=%p", subflow);
+ if (subflow->request_mptcp && tp->rx_opt.mptcp.mp_capable) {
+ subflow->mp_capable = 1;
+ subflow->can_ack = 1;
+ subflow->remote_key = tp->rx_opt.mptcp.sndr_key;
+ } else {
+ tcp_sk(sk)->is_mptcp = 0;
+ }
+}
+
+static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb,
+ unsigned int *size,
+ unsigned int remaining,
+ struct mptcp_out_options *opts)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct mptcp_ext *mpext;
+ unsigned int data_len;
+
+ pr_debug("subflow=%p fourth_ack=%d seq=%x:%x remaining=%d", subflow,
+ subflow->fourth_ack, subflow->snd_isn,
+ skb ? TCP_SKB_CB(skb)->seq : 0, remaining);
+
+ if (subflow->mp_capable && !subflow->fourth_ack && skb &&
+ subflow->snd_isn == TCP_SKB_CB(skb)->seq) {
+ /* When skb is not available, we better over-estimate the
+ * emitted options len. A full DSS option is longer than
+ * TCPOLEN_MPTCP_MPC_ACK_DATA, so let's the caller try to fit
+ * that.
+ */
+ mpext = mptcp_get_ext(skb);
+ data_len = mpext ? mpext->data_len : 0;
+
+ /* we will check ext_copy.data_len in mptcp_write_options() to
+ * discriminate between TCPOLEN_MPTCP_MPC_ACK_DATA and
+ * TCPOLEN_MPTCP_MPC_ACK
+ */
+ opts->ext_copy.data_len = data_len;
+ opts->suboptions = OPTION_MPTCP_MPC_ACK;
+ opts->sndr_key = subflow->local_key;
+ opts->rcvr_key = subflow->remote_key;
+
+ /* Section 3.1.
+ * The MP_CAPABLE option is carried on the SYN, SYN/ACK, and ACK
+ * packets that start the first subflow of an MPTCP connection,
+ * as well as the first packet that carries data
+ */
+ if (data_len > 0)
+ *size = ALIGN(TCPOLEN_MPTCP_MPC_ACK_DATA, 4);
+ else
+ *size = TCPOLEN_MPTCP_MPC_ACK;
+
+ pr_debug("subflow=%p, local_key=%llu, remote_key=%llu map_len=%d",
+ subflow, subflow->local_key, subflow->remote_key,
+ data_len);
+
+ return true;
+ }
+ return false;
+}
+
+static void mptcp_write_data_fin(struct mptcp_subflow_context *subflow,
+ struct mptcp_ext *ext)
+{
+ ext->data_fin = 1;
+
+ if (!ext->use_map) {
+ /* RFC6824 requires a DSS mapping with specific values
+ * if DATA_FIN is set but no data payload is mapped
+ */
+ ext->use_map = 1;
+ ext->dsn64 = 1;
+ ext->data_seq = mptcp_sk(subflow->conn)->write_seq;
+ ext->subflow_seq = 0;
+ ext->data_len = 1;
+ } else {
+ /* If there's an existing DSS mapping, DATA_FIN consumes
+ * 1 additional byte of mapping space.
+ */
+ ext->data_len++;
+ }
+}
+
+static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
+ unsigned int *size,
+ unsigned int remaining,
+ struct mptcp_out_options *opts)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ unsigned int dss_size = 0;
+ struct mptcp_ext *mpext;
+ struct mptcp_sock *msk;
+ unsigned int ack_size;
+ bool ret = false;
+ u8 tcp_fin;
+
+ if (skb) {
+ mpext = mptcp_get_ext(skb);
+ tcp_fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
+ } else {
+ mpext = NULL;
+ tcp_fin = 0;
+ }
+
+ if (!skb || (mpext && mpext->use_map) || tcp_fin) {
+ unsigned int map_size;
+
+ map_size = TCPOLEN_MPTCP_DSS_BASE + TCPOLEN_MPTCP_DSS_MAP64;
+
+ remaining -= map_size;
+ dss_size = map_size;
+ if (mpext)
+ opts->ext_copy = *mpext;
+
+ if (skb && tcp_fin &&
+ subflow->conn->sk_state != TCP_ESTABLISHED)
+ mptcp_write_data_fin(subflow, &opts->ext_copy);
+ ret = true;
+ }
+
+ opts->ext_copy.use_ack = 0;
+ msk = mptcp_sk(subflow->conn);
+ if (!msk || !READ_ONCE(msk->can_ack)) {
+ *size = ALIGN(dss_size, 4);
+ return ret;
+ }
+
+ ack_size = TCPOLEN_MPTCP_DSS_ACK64;
+
+ /* Add kind/length/subtype/flag overhead if mapping is not populated */
+ if (dss_size == 0)
+ ack_size += TCPOLEN_MPTCP_DSS_BASE;
+
+ dss_size += ack_size;
+
+ opts->ext_copy.data_ack = msk->ack_seq;
+ opts->ext_copy.ack64 = 1;
+ opts->ext_copy.use_ack = 1;
+
+ *size = ALIGN(dss_size, 4);
+ return true;
+}
+
+bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
+ unsigned int *size, unsigned int remaining,
+ struct mptcp_out_options *opts)
+{
+ unsigned int opt_size = 0;
+ bool ret = false;
+
+ if (mptcp_established_options_mp(sk, skb, &opt_size, remaining, opts))
+ ret = true;
+ else if (mptcp_established_options_dss(sk, skb, &opt_size, remaining,
+ opts))
+ ret = true;
+
+ /* we reserved enough space for the above options, and exceeding the
+ * TCP option space would be fatal
+ */
+ if (WARN_ON_ONCE(opt_size > remaining))
+ return false;
+
+ *size += opt_size;
+ remaining -= opt_size;
+
+ return ret;
+}
+
+bool mptcp_synack_options(const struct request_sock *req, unsigned int *size,
+ struct mptcp_out_options *opts)
+{
+ struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
+
+ if (subflow_req->mp_capable) {
+ opts->suboptions = OPTION_MPTCP_MPC_SYNACK;
+ opts->sndr_key = subflow_req->local_key;
+ *size = TCPOLEN_MPTCP_MPC_SYNACK;
+ pr_debug("subflow_req=%p, local_key=%llu",
+ subflow_req, subflow_req->local_key);
+ return true;
+ }
+ return false;
+}
+
+static bool check_fourth_ack(struct mptcp_subflow_context *subflow,
+ struct sk_buff *skb,
+ struct mptcp_options_received *mp_opt)
+{
+ /* here we can process OoO, in-window pkts, only in-sequence 4th ack
+ * are relevant
+ */
+ if (likely(subflow->fourth_ack ||
+ TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1))
+ return true;
+
+ if (mp_opt->use_ack)
+ subflow->fourth_ack = 1;
+
+ if (subflow->can_ack)
+ return true;
+
+ /* If the first established packet does not contain MP_CAPABLE + data
+ * then fallback to TCP
+ */
+ if (!mp_opt->mp_capable) {
+ subflow->mp_capable = 0;
+ tcp_sk(mptcp_subflow_tcp_sock(subflow))->is_mptcp = 0;
+ return false;
+ }
+ subflow->remote_key = mp_opt->sndr_key;
+ subflow->can_ack = 1;
+ return true;
+}
+
+void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb,
+ struct tcp_options_received *opt_rx)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct mptcp_options_received *mp_opt;
+ struct mptcp_ext *mpext;
+
+ mp_opt = &opt_rx->mptcp;
+ if (!check_fourth_ack(subflow, skb, mp_opt))
+ return;
+
+ if (!mp_opt->dss)
+ return;
+
+ mpext = skb_ext_add(skb, SKB_EXT_MPTCP);
+ if (!mpext)
+ return;
+
+ memset(mpext, 0, sizeof(*mpext));
+
+ if (mp_opt->use_map) {
+ if (mp_opt->mpc_map) {
+ /* this is an MP_CAPABLE carrying MPTCP data
+ * we know this map the first chunk of data
+ */
+ mptcp_crypto_key_sha(subflow->remote_key, NULL,
+ &mpext->data_seq);
+ mpext->data_seq++;
+ mpext->subflow_seq = 1;
+ mpext->dsn64 = 1;
+ mpext->mpc_map = 1;
+ } else {
+ mpext->data_seq = mp_opt->data_seq;
+ mpext->subflow_seq = mp_opt->subflow_seq;
+ mpext->dsn64 = mp_opt->dsn64;
+ }
+ mpext->data_len = mp_opt->data_len;
+ mpext->use_map = 1;
+ }
+
+ if (mp_opt->use_ack) {
+ mpext->data_ack = mp_opt->data_ack;
+ mpext->use_ack = 1;
+ mpext->ack64 = mp_opt->ack64;
+ }
+
+ mpext->data_fin = mp_opt->data_fin;
+}
+
+void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts)
+{
+ if ((OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_SYNACK |
+ OPTION_MPTCP_MPC_ACK) & opts->suboptions) {
+ u8 len;
+
+ if (OPTION_MPTCP_MPC_SYN & opts->suboptions)
+ len = TCPOLEN_MPTCP_MPC_SYN;
+ else if (OPTION_MPTCP_MPC_SYNACK & opts->suboptions)
+ len = TCPOLEN_MPTCP_MPC_SYNACK;
+ else if (opts->ext_copy.data_len)
+ len = TCPOLEN_MPTCP_MPC_ACK_DATA;
+ else
+ len = TCPOLEN_MPTCP_MPC_ACK;
+
+ *ptr++ = htonl((TCPOPT_MPTCP << 24) | (len << 16) |
+ (MPTCPOPT_MP_CAPABLE << 12) |
+ (MPTCP_SUPPORTED_VERSION << 8) |
+ MPTCP_CAP_HMAC_SHA256);
+
+ if (!((OPTION_MPTCP_MPC_SYNACK | OPTION_MPTCP_MPC_ACK) &
+ opts->suboptions))
+ goto mp_capable_done;
+
+ put_unaligned_be64(opts->sndr_key, ptr);
+ ptr += 2;
+ if (!((OPTION_MPTCP_MPC_ACK) & opts->suboptions))
+ goto mp_capable_done;
+
+ put_unaligned_be64(opts->rcvr_key, ptr);
+ ptr += 2;
+ if (!opts->ext_copy.data_len)
+ goto mp_capable_done;
+
+ put_unaligned_be32(opts->ext_copy.data_len << 16 |
+ TCPOPT_NOP << 8 | TCPOPT_NOP, ptr);
+ ptr += 1;
+ }
+
+mp_capable_done:
+ if (opts->ext_copy.use_ack || opts->ext_copy.use_map) {
+ struct mptcp_ext *mpext = &opts->ext_copy;
+ u8 len = TCPOLEN_MPTCP_DSS_BASE;
+ u8 flags = 0;
+
+ if (mpext->use_ack) {
+ len += TCPOLEN_MPTCP_DSS_ACK64;
+ flags = MPTCP_DSS_HAS_ACK | MPTCP_DSS_ACK64;
+ }
+
+ if (mpext->use_map) {
+ len += TCPOLEN_MPTCP_DSS_MAP64;
+
+ /* Use only 64-bit mapping flags for now, add
+ * support for optional 32-bit mappings later.
+ */
+ flags |= MPTCP_DSS_HAS_MAP | MPTCP_DSS_DSN64;
+ if (mpext->data_fin)
+ flags |= MPTCP_DSS_DATA_FIN;
+ }
+
+ *ptr++ = htonl((TCPOPT_MPTCP << 24) |
+ (len << 16) |
+ (MPTCPOPT_DSS << 12) |
+ (flags));
+
+ if (mpext->use_ack) {
+ put_unaligned_be64(mpext->data_ack, ptr);
+ ptr += 2;
+ }
+
+ if (mpext->use_map) {
+ put_unaligned_be64(mpext->data_seq, ptr);
+ ptr += 2;
+ put_unaligned_be32(mpext->subflow_seq, ptr);
+ ptr += 1;
+ put_unaligned_be32(mpext->data_len << 16 |
+ TCPOPT_NOP << 8 | TCPOPT_NOP, ptr);
+ }
+ }
+}
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
new file mode 100644
index 000000000000..030dee668e0a
--- /dev/null
+++ b/net/mptcp/protocol.c
@@ -0,0 +1,1252 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Multipath TCP
+ *
+ * Copyright (c) 2017 - 2019, Intel Corporation.
+ */
+
+#define pr_fmt(fmt) "MPTCP: " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/sched/signal.h>
+#include <linux/atomic.h>
+#include <net/sock.h>
+#include <net/inet_common.h>
+#include <net/inet_hashtables.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+#include <net/transp_v6.h>
+#endif
+#include <net/mptcp.h>
+#include "protocol.h"
+
+#define MPTCP_SAME_STATE TCP_MAX_STATES
+
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+struct mptcp6_sock {
+ struct mptcp_sock msk;
+ struct ipv6_pinfo np;
+};
+#endif
+
+/* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not
+ * completed yet or has failed, return the subflow socket.
+ * Otherwise return NULL.
+ */
+static struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk)
+{
+ if (!msk->subflow || READ_ONCE(msk->can_ack))
+ return NULL;
+
+ return msk->subflow;
+}
+
+static bool __mptcp_needs_tcp_fallback(const struct mptcp_sock *msk)
+{
+ return msk->first && !sk_is_mptcp(msk->first);
+}
+
+static struct socket *__mptcp_tcp_fallback(struct mptcp_sock *msk)
+{
+ sock_owned_by_me((const struct sock *)msk);
+
+ if (likely(!__mptcp_needs_tcp_fallback(msk)))
+ return NULL;
+
+ if (msk->subflow) {
+ release_sock((struct sock *)msk);
+ return msk->subflow;
+ }
+
+ return NULL;
+}
+
+static bool __mptcp_can_create_subflow(const struct mptcp_sock *msk)
+{
+ return !msk->first;
+}
+
+static struct socket *__mptcp_socket_create(struct mptcp_sock *msk, int state)
+{
+ struct mptcp_subflow_context *subflow;
+ struct sock *sk = (struct sock *)msk;
+ struct socket *ssock;
+ int err;
+
+ ssock = __mptcp_nmpc_socket(msk);
+ if (ssock)
+ goto set_state;
+
+ if (!__mptcp_can_create_subflow(msk))
+ return ERR_PTR(-EINVAL);
+
+ err = mptcp_subflow_create_socket(sk, &ssock);
+ if (err)
+ return ERR_PTR(err);
+
+ msk->first = ssock->sk;
+ msk->subflow = ssock;
+ subflow = mptcp_subflow_ctx(ssock->sk);
+ list_add(&subflow->node, &msk->conn_list);
+ subflow->request_mptcp = 1;
+
+set_state:
+ if (state != MPTCP_SAME_STATE)
+ inet_sk_state_store(sk, state);
+ return ssock;
+}
+
+static struct sock *mptcp_subflow_get(const struct mptcp_sock *msk)
+{
+ struct mptcp_subflow_context *subflow;
+
+ sock_owned_by_me((const struct sock *)msk);
+
+ mptcp_for_each_subflow(msk, subflow) {
+ return mptcp_subflow_tcp_sock(subflow);
+ }
+
+ return NULL;
+}
+
+static bool mptcp_ext_cache_refill(struct mptcp_sock *msk)
+{
+ if (!msk->cached_ext)
+ msk->cached_ext = __skb_ext_alloc();
+
+ return !!msk->cached_ext;
+}
+
+static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk)
+{
+ struct mptcp_subflow_context *subflow;
+ struct sock *sk = (struct sock *)msk;
+
+ sock_owned_by_me(sk);
+
+ mptcp_for_each_subflow(msk, subflow) {
+ if (subflow->data_avail)
+ return mptcp_subflow_tcp_sock(subflow);
+ }
+
+ return NULL;
+}
+
+static inline bool mptcp_skb_can_collapse_to(const struct mptcp_sock *msk,
+ const struct sk_buff *skb,
+ const struct mptcp_ext *mpext)
+{
+ if (!tcp_skb_can_collapse_to(skb))
+ return false;
+
+ /* can collapse only if MPTCP level sequence is in order */
+ return mpext && mpext->data_seq + mpext->data_len == msk->write_seq;
+}
+
+static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
+ struct msghdr *msg, long *timeo, int *pmss_now,
+ int *ps_goal)
+{
+ int mss_now, avail_size, size_goal, ret;
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct mptcp_ext *mpext = NULL;
+ struct sk_buff *skb, *tail;
+ bool can_collapse = false;
+ struct page_frag *pfrag;
+ size_t psize;
+
+ /* use the mptcp page cache so that we can easily move the data
+ * from one substream to another, but do per subflow memory accounting
+ */
+ pfrag = sk_page_frag(sk);
+ while (!sk_page_frag_refill(ssk, pfrag) ||
+ !mptcp_ext_cache_refill(msk)) {
+ ret = sk_stream_wait_memory(ssk, timeo);
+ if (ret)
+ return ret;
+ if (unlikely(__mptcp_needs_tcp_fallback(msk)))
+ return 0;
+ }
+
+ /* compute copy limit */
+ mss_now = tcp_send_mss(ssk, &size_goal, msg->msg_flags);
+ *pmss_now = mss_now;
+ *ps_goal = size_goal;
+ avail_size = size_goal;
+ skb = tcp_write_queue_tail(ssk);
+ if (skb) {
+ mpext = skb_ext_find(skb, SKB_EXT_MPTCP);
+
+ /* Limit the write to the size available in the
+ * current skb, if any, so that we create at most a new skb.
+ * Explicitly tells TCP internals to avoid collapsing on later
+ * queue management operation, to avoid breaking the ext <->
+ * SSN association set here
+ */
+ can_collapse = (size_goal - skb->len > 0) &&
+ mptcp_skb_can_collapse_to(msk, skb, mpext);
+ if (!can_collapse)
+ TCP_SKB_CB(skb)->eor = 1;
+ else
+ avail_size = size_goal - skb->len;
+ }
+ psize = min_t(size_t, pfrag->size - pfrag->offset, avail_size);
+
+ /* Copy to page */
+ pr_debug("left=%zu", msg_data_left(msg));
+ psize = copy_page_from_iter(pfrag->page, pfrag->offset,
+ min_t(size_t, msg_data_left(msg), psize),
+ &msg->msg_iter);
+ pr_debug("left=%zu", msg_data_left(msg));
+ if (!psize)
+ return -EINVAL;
+
+ /* tell the TCP stack to delay the push so that we can safely
+ * access the skb after the sendpages call
+ */
+ ret = do_tcp_sendpages(ssk, pfrag->page, pfrag->offset, psize,
+ msg->msg_flags | MSG_SENDPAGE_NOTLAST);
+ if (ret <= 0)
+ return ret;
+ if (unlikely(ret < psize))
+ iov_iter_revert(&msg->msg_iter, psize - ret);
+
+ /* if the tail skb extension is still the cached one, collapsing
+ * really happened. Note: we can't check for 'same skb' as the sk_buff
+ * hdr on tail can be transmitted, freed and re-allocated by the
+ * do_tcp_sendpages() call
+ */
+ tail = tcp_write_queue_tail(ssk);
+ if (mpext && tail && mpext == skb_ext_find(tail, SKB_EXT_MPTCP)) {
+ WARN_ON_ONCE(!can_collapse);
+ mpext->data_len += ret;
+ goto out;
+ }
+
+ skb = tcp_write_queue_tail(ssk);
+ mpext = __skb_ext_set(skb, SKB_EXT_MPTCP, msk->cached_ext);
+ msk->cached_ext = NULL;
+
+ memset(mpext, 0, sizeof(*mpext));
+ mpext->data_seq = msk->write_seq;
+ mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq;
+ mpext->data_len = ret;
+ mpext->use_map = 1;
+ mpext->dsn64 = 1;
+
+ pr_debug("data_seq=%llu subflow_seq=%u data_len=%u dsn64=%d",
+ mpext->data_seq, mpext->subflow_seq, mpext->data_len,
+ mpext->dsn64);
+
+out:
+ pfrag->offset += ret;
+ msk->write_seq += ret;
+ mptcp_subflow_ctx(ssk)->rel_write_seq += ret;
+
+ return ret;
+}
+
+static void ssk_check_wmem(struct mptcp_sock *msk, struct sock *ssk)
+{
+ struct socket *sock;
+
+ if (likely(sk_stream_is_writeable(ssk)))
+ return;
+
+ sock = READ_ONCE(ssk->sk_socket);
+
+ if (sock) {
+ clear_bit(MPTCP_SEND_SPACE, &msk->flags);
+ smp_mb__after_atomic();
+ /* set NOSPACE only after clearing SEND_SPACE flag */
+ set_bit(SOCK_NOSPACE, &sock->flags);
+ }
+}
+
+static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
+{
+ int mss_now = 0, size_goal = 0, ret = 0;
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct socket *ssock;
+ size_t copied = 0;
+ struct sock *ssk;
+ long timeo;
+
+ if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL))
+ return -EOPNOTSUPP;
+
+ lock_sock(sk);
+ ssock = __mptcp_tcp_fallback(msk);
+ if (unlikely(ssock)) {
+fallback:
+ pr_debug("fallback passthrough");
+ ret = sock_sendmsg(ssock, msg);
+ return ret >= 0 ? ret + copied : (copied ? copied : ret);
+ }
+
+ timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
+
+ ssk = mptcp_subflow_get(msk);
+ if (!ssk) {
+ release_sock(sk);
+ return -ENOTCONN;
+ }
+
+ pr_debug("conn_list->subflow=%p", ssk);
+
+ lock_sock(ssk);
+ while (msg_data_left(msg)) {
+ ret = mptcp_sendmsg_frag(sk, ssk, msg, &timeo, &mss_now,
+ &size_goal);
+ if (ret < 0)
+ break;
+ if (ret == 0 && unlikely(__mptcp_needs_tcp_fallback(msk))) {
+ release_sock(ssk);
+ ssock = __mptcp_tcp_fallback(msk);
+ goto fallback;
+ }
+
+ copied += ret;
+ }
+
+ if (copied) {
+ ret = copied;
+ tcp_push(ssk, msg->msg_flags, mss_now, tcp_sk(ssk)->nonagle,
+ size_goal);
+ }
+
+ ssk_check_wmem(msk, ssk);
+ release_sock(ssk);
+ release_sock(sk);
+ return ret;
+}
+
+int mptcp_read_actor(read_descriptor_t *desc, struct sk_buff *skb,
+ unsigned int offset, size_t len)
+{
+ struct mptcp_read_arg *arg = desc->arg.data;
+ size_t copy_len;
+
+ copy_len = min(desc->count, len);
+
+ if (likely(arg->msg)) {
+ int err;
+
+ err = skb_copy_datagram_msg(skb, offset, arg->msg, copy_len);
+ if (err) {
+ pr_debug("error path");
+ desc->error = err;
+ return err;
+ }
+ } else {
+ pr_debug("Flushing skb payload");
+ }
+
+ desc->count -= copy_len;
+
+ pr_debug("consumed %zu bytes, %zu left", copy_len, desc->count);
+ return copy_len;
+}
+
+static void mptcp_wait_data(struct sock *sk, long *timeo)
+{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ add_wait_queue(sk_sleep(sk), &wait);
+ sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+
+ sk_wait_event(sk, timeo,
+ test_and_clear_bit(MPTCP_DATA_READY, &msk->flags), &wait);
+
+ sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ remove_wait_queue(sk_sleep(sk), &wait);
+}
+
+static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+ int nonblock, int flags, int *addr_len)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct mptcp_subflow_context *subflow;
+ bool more_data_avail = false;
+ struct mptcp_read_arg arg;
+ read_descriptor_t desc;
+ bool wait_data = false;
+ struct socket *ssock;
+ struct tcp_sock *tp;
+ bool done = false;
+ struct sock *ssk;
+ int copied = 0;
+ int target;
+ long timeo;
+
+ if (msg->msg_flags & ~(MSG_WAITALL | MSG_DONTWAIT))
+ return -EOPNOTSUPP;
+
+ lock_sock(sk);
+ ssock = __mptcp_tcp_fallback(msk);
+ if (unlikely(ssock)) {
+fallback:
+ pr_debug("fallback-read subflow=%p",
+ mptcp_subflow_ctx(ssock->sk));
+ copied = sock_recvmsg(ssock, msg, flags);
+ return copied;
+ }
+
+ arg.msg = msg;
+ desc.arg.data = &arg;
+ desc.error = 0;
+
+ timeo = sock_rcvtimeo(sk, nonblock);
+
+ len = min_t(size_t, len, INT_MAX);
+ target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
+
+ while (!done) {
+ u32 map_remaining;
+ int bytes_read;
+
+ ssk = mptcp_subflow_recv_lookup(msk);
+ pr_debug("msk=%p ssk=%p", msk, ssk);
+ if (!ssk)
+ goto wait_for_data;
+
+ subflow = mptcp_subflow_ctx(ssk);
+ tp = tcp_sk(ssk);
+
+ lock_sock(ssk);
+ do {
+ /* try to read as much data as available */
+ map_remaining = subflow->map_data_len -
+ mptcp_subflow_get_map_offset(subflow);
+ desc.count = min_t(size_t, len - copied, map_remaining);
+ pr_debug("reading %zu bytes, copied %d", desc.count,
+ copied);
+ bytes_read = tcp_read_sock(ssk, &desc,
+ mptcp_read_actor);
+ if (bytes_read < 0) {
+ if (!copied)
+ copied = bytes_read;
+ done = true;
+ goto next;
+ }
+
+ pr_debug("msk ack_seq=%llx -> %llx", msk->ack_seq,
+ msk->ack_seq + bytes_read);
+ msk->ack_seq += bytes_read;
+ copied += bytes_read;
+ if (copied >= len) {
+ done = true;
+ goto next;
+ }
+ if (tp->urg_data && tp->urg_seq == tp->copied_seq) {
+ pr_err("Urgent data present, cannot proceed");
+ done = true;
+ goto next;
+ }
+next:
+ more_data_avail = mptcp_subflow_data_available(ssk);
+ } while (more_data_avail && !done);
+ release_sock(ssk);
+ continue;
+
+wait_for_data:
+ more_data_avail = false;
+
+ /* only the master socket status is relevant here. The exit
+ * conditions mirror closely tcp_recvmsg()
+ */
+ if (copied >= target)
+ break;
+
+ if (copied) {
+ if (sk->sk_err ||
+ sk->sk_state == TCP_CLOSE ||
+ (sk->sk_shutdown & RCV_SHUTDOWN) ||
+ !timeo ||
+ signal_pending(current))
+ break;
+ } else {
+ if (sk->sk_err) {
+ copied = sock_error(sk);
+ break;
+ }
+
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ break;
+
+ if (sk->sk_state == TCP_CLOSE) {
+ copied = -ENOTCONN;
+ break;
+ }
+
+ if (!timeo) {
+ copied = -EAGAIN;
+ break;
+ }
+
+ if (signal_pending(current)) {
+ copied = sock_intr_errno(timeo);
+ break;
+ }
+ }
+
+ pr_debug("block timeout %ld", timeo);
+ wait_data = true;
+ mptcp_wait_data(sk, &timeo);
+ if (unlikely(__mptcp_tcp_fallback(msk)))
+ goto fallback;
+ }
+
+ if (more_data_avail) {
+ if (!test_bit(MPTCP_DATA_READY, &msk->flags))
+ set_bit(MPTCP_DATA_READY, &msk->flags);
+ } else if (!wait_data) {
+ clear_bit(MPTCP_DATA_READY, &msk->flags);
+
+ /* .. race-breaker: ssk might get new data after last
+ * data_available() returns false.
+ */
+ ssk = mptcp_subflow_recv_lookup(msk);
+ if (unlikely(ssk))
+ set_bit(MPTCP_DATA_READY, &msk->flags);
+ }
+
+ release_sock(sk);
+ return copied;
+}
+
+/* subflow sockets can be either outgoing (connect) or incoming
+ * (accept).
+ *
+ * Outgoing subflows use in-kernel sockets.
+ * Incoming subflows do not have their own 'struct socket' allocated,
+ * so we need to use tcp_close() after detaching them from the mptcp
+ * parent socket.
+ */
+static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
+ struct mptcp_subflow_context *subflow,
+ long timeout)
+{
+ struct socket *sock = READ_ONCE(ssk->sk_socket);
+
+ list_del(&subflow->node);
+
+ if (sock && sock != sk->sk_socket) {
+ /* outgoing subflow */
+ sock_release(sock);
+ } else {
+ /* incoming subflow */
+ tcp_close(ssk, timeout);
+ }
+}
+
+static int __mptcp_init_sock(struct sock *sk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ INIT_LIST_HEAD(&msk->conn_list);
+ __set_bit(MPTCP_SEND_SPACE, &msk->flags);
+
+ msk->first = NULL;
+
+ return 0;
+}
+
+static int mptcp_init_sock(struct sock *sk)
+{
+ if (!mptcp_is_enabled(sock_net(sk)))
+ return -ENOPROTOOPT;
+
+ return __mptcp_init_sock(sk);
+}
+
+static void mptcp_subflow_shutdown(struct sock *ssk, int how)
+{
+ lock_sock(ssk);
+
+ switch (ssk->sk_state) {
+ case TCP_LISTEN:
+ if (!(how & RCV_SHUTDOWN))
+ break;
+ /* fall through */
+ case TCP_SYN_SENT:
+ tcp_disconnect(ssk, O_NONBLOCK);
+ break;
+ default:
+ ssk->sk_shutdown |= how;
+ tcp_shutdown(ssk, how);
+ break;
+ }
+
+ /* Wake up anyone sleeping in poll. */
+ ssk->sk_state_change(ssk);
+ release_sock(ssk);
+}
+
+/* Called with msk lock held, releases such lock before returning */
+static void mptcp_close(struct sock *sk, long timeout)
+{
+ struct mptcp_subflow_context *subflow, *tmp;
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ LIST_HEAD(conn_list);
+
+ lock_sock(sk);
+
+ mptcp_token_destroy(msk->token);
+ inet_sk_state_store(sk, TCP_CLOSE);
+
+ list_splice_init(&msk->conn_list, &conn_list);
+
+ release_sock(sk);
+
+ list_for_each_entry_safe(subflow, tmp, &conn_list, node) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ __mptcp_close_ssk(sk, ssk, subflow, timeout);
+ }
+
+ sk_common_release(sk);
+}
+
+static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk)
+{
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ const struct ipv6_pinfo *ssk6 = inet6_sk(ssk);
+ struct ipv6_pinfo *msk6 = inet6_sk(msk);
+
+ msk->sk_v6_daddr = ssk->sk_v6_daddr;
+ msk->sk_v6_rcv_saddr = ssk->sk_v6_rcv_saddr;
+
+ if (msk6 && ssk6) {
+ msk6->saddr = ssk6->saddr;
+ msk6->flow_label = ssk6->flow_label;
+ }
+#endif
+
+ inet_sk(msk)->inet_num = inet_sk(ssk)->inet_num;
+ inet_sk(msk)->inet_dport = inet_sk(ssk)->inet_dport;
+ inet_sk(msk)->inet_sport = inet_sk(ssk)->inet_sport;
+ inet_sk(msk)->inet_daddr = inet_sk(ssk)->inet_daddr;
+ inet_sk(msk)->inet_saddr = inet_sk(ssk)->inet_saddr;
+ inet_sk(msk)->inet_rcv_saddr = inet_sk(ssk)->inet_rcv_saddr;
+}
+
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+static struct ipv6_pinfo *mptcp_inet6_sk(const struct sock *sk)
+{
+ unsigned int offset = sizeof(struct mptcp6_sock) - sizeof(struct ipv6_pinfo);
+
+ return (struct ipv6_pinfo *)(((u8 *)sk) + offset);
+}
+#endif
+
+static struct sock *mptcp_sk_clone_lock(const struct sock *sk)
+{
+ struct sock *nsk = sk_clone_lock(sk, GFP_ATOMIC);
+
+ if (!nsk)
+ return NULL;
+
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ if (nsk->sk_family == AF_INET6)
+ inet_sk(nsk)->pinet6 = mptcp_inet6_sk(nsk);
+#endif
+
+ return nsk;
+}
+
+static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
+ bool kern)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct socket *listener;
+ struct sock *newsk;
+
+ listener = __mptcp_nmpc_socket(msk);
+ if (WARN_ON_ONCE(!listener)) {
+ *err = -EINVAL;
+ return NULL;
+ }
+
+ pr_debug("msk=%p, listener=%p", msk, mptcp_subflow_ctx(listener->sk));
+ newsk = inet_csk_accept(listener->sk, flags, err, kern);
+ if (!newsk)
+ return NULL;
+
+ pr_debug("msk=%p, subflow is mptcp=%d", msk, sk_is_mptcp(newsk));
+
+ if (sk_is_mptcp(newsk)) {
+ struct mptcp_subflow_context *subflow;
+ struct sock *new_mptcp_sock;
+ struct sock *ssk = newsk;
+ u64 ack_seq;
+
+ subflow = mptcp_subflow_ctx(newsk);
+ lock_sock(sk);
+
+ local_bh_disable();
+ new_mptcp_sock = mptcp_sk_clone_lock(sk);
+ if (!new_mptcp_sock) {
+ *err = -ENOBUFS;
+ local_bh_enable();
+ release_sock(sk);
+ mptcp_subflow_shutdown(newsk, SHUT_RDWR + 1);
+ tcp_close(newsk, 0);
+ return NULL;
+ }
+
+ __mptcp_init_sock(new_mptcp_sock);
+
+ msk = mptcp_sk(new_mptcp_sock);
+ msk->local_key = subflow->local_key;
+ msk->token = subflow->token;
+ msk->subflow = NULL;
+ msk->first = newsk;
+
+ mptcp_token_update_accept(newsk, new_mptcp_sock);
+
+ msk->write_seq = subflow->idsn + 1;
+ if (subflow->can_ack) {
+ msk->can_ack = true;
+ msk->remote_key = subflow->remote_key;
+ mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq);
+ ack_seq++;
+ msk->ack_seq = ack_seq;
+ }
+ newsk = new_mptcp_sock;
+ mptcp_copy_inaddrs(newsk, ssk);
+ list_add(&subflow->node, &msk->conn_list);
+
+ /* will be fully established at mptcp_stream_accept()
+ * completion.
+ */
+ inet_sk_state_store(new_mptcp_sock, TCP_SYN_RECV);
+ bh_unlock_sock(new_mptcp_sock);
+ local_bh_enable();
+ release_sock(sk);
+
+ /* the subflow can already receive packet, avoid racing with
+ * the receive path and process the pending ones
+ */
+ lock_sock(ssk);
+ subflow->rel_write_seq = 1;
+ subflow->tcp_sock = ssk;
+ subflow->conn = new_mptcp_sock;
+ if (unlikely(!skb_queue_empty(&ssk->sk_receive_queue)))
+ mptcp_subflow_data_available(ssk);
+ release_sock(ssk);
+ }
+
+ return newsk;
+}
+
+static void mptcp_destroy(struct sock *sk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ if (msk->cached_ext)
+ __skb_ext_put(msk->cached_ext);
+}
+
+static int mptcp_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ int ret = -EOPNOTSUPP;
+ struct socket *ssock;
+ struct sock *ssk;
+
+ pr_debug("msk=%p", msk);
+
+ /* @@ the meaning of setsockopt() when the socket is connected and
+ * there are multiple subflows is not defined.
+ */
+ lock_sock(sk);
+ ssock = __mptcp_socket_create(msk, MPTCP_SAME_STATE);
+ if (IS_ERR(ssock)) {
+ release_sock(sk);
+ return ret;
+ }
+
+ ssk = ssock->sk;
+ sock_hold(ssk);
+ release_sock(sk);
+
+ ret = tcp_setsockopt(ssk, level, optname, optval, optlen);
+ sock_put(ssk);
+
+ return ret;
+}
+
+static int mptcp_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *option)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ int ret = -EOPNOTSUPP;
+ struct socket *ssock;
+ struct sock *ssk;
+
+ pr_debug("msk=%p", msk);
+
+ /* @@ the meaning of getsockopt() when the socket is connected and
+ * there are multiple subflows is not defined.
+ */
+ lock_sock(sk);
+ ssock = __mptcp_socket_create(msk, MPTCP_SAME_STATE);
+ if (IS_ERR(ssock)) {
+ release_sock(sk);
+ return ret;
+ }
+
+ ssk = ssock->sk;
+ sock_hold(ssk);
+ release_sock(sk);
+
+ ret = tcp_getsockopt(ssk, level, optname, optval, option);
+ sock_put(ssk);
+
+ return ret;
+}
+
+static int mptcp_get_port(struct sock *sk, unsigned short snum)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct socket *ssock;
+
+ ssock = __mptcp_nmpc_socket(msk);
+ pr_debug("msk=%p, subflow=%p", msk, ssock);
+ if (WARN_ON_ONCE(!ssock))
+ return -EINVAL;
+
+ return inet_csk_get_port(ssock->sk, snum);
+}
+
+void mptcp_finish_connect(struct sock *ssk)
+{
+ struct mptcp_subflow_context *subflow;
+ struct mptcp_sock *msk;
+ struct sock *sk;
+ u64 ack_seq;
+
+ subflow = mptcp_subflow_ctx(ssk);
+
+ if (!subflow->mp_capable)
+ return;
+
+ sk = subflow->conn;
+ msk = mptcp_sk(sk);
+
+ pr_debug("msk=%p, token=%u", sk, subflow->token);
+
+ mptcp_crypto_key_sha(subflow->remote_key, NULL, &ack_seq);
+ ack_seq++;
+ subflow->map_seq = ack_seq;
+ subflow->map_subflow_seq = 1;
+ subflow->rel_write_seq = 1;
+
+ /* the socket is not connected yet, no msk/subflow ops can access/race
+ * accessing the field below
+ */
+ WRITE_ONCE(msk->remote_key, subflow->remote_key);
+ WRITE_ONCE(msk->local_key, subflow->local_key);
+ WRITE_ONCE(msk->token, subflow->token);
+ WRITE_ONCE(msk->write_seq, subflow->idsn + 1);
+ WRITE_ONCE(msk->ack_seq, ack_seq);
+ WRITE_ONCE(msk->can_ack, 1);
+}
+
+static void mptcp_sock_graft(struct sock *sk, struct socket *parent)
+{
+ write_lock_bh(&sk->sk_callback_lock);
+ rcu_assign_pointer(sk->sk_wq, &parent->wq);
+ sk_set_socket(sk, parent);
+ sk->sk_uid = SOCK_INODE(parent)->i_uid;
+ write_unlock_bh(&sk->sk_callback_lock);
+}
+
+static bool mptcp_memory_free(const struct sock *sk, int wake)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ return wake ? test_bit(MPTCP_SEND_SPACE, &msk->flags) : true;
+}
+
+static struct proto mptcp_prot = {
+ .name = "MPTCP",
+ .owner = THIS_MODULE,
+ .init = mptcp_init_sock,
+ .close = mptcp_close,
+ .accept = mptcp_accept,
+ .setsockopt = mptcp_setsockopt,
+ .getsockopt = mptcp_getsockopt,
+ .shutdown = tcp_shutdown,
+ .destroy = mptcp_destroy,
+ .sendmsg = mptcp_sendmsg,
+ .recvmsg = mptcp_recvmsg,
+ .hash = inet_hash,
+ .unhash = inet_unhash,
+ .get_port = mptcp_get_port,
+ .stream_memory_free = mptcp_memory_free,
+ .obj_size = sizeof(struct mptcp_sock),
+ .no_autobind = true,
+};
+
+static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+{
+ struct mptcp_sock *msk = mptcp_sk(sock->sk);
+ struct socket *ssock;
+ int err;
+
+ lock_sock(sock->sk);
+ ssock = __mptcp_socket_create(msk, MPTCP_SAME_STATE);
+ if (IS_ERR(ssock)) {
+ err = PTR_ERR(ssock);
+ goto unlock;
+ }
+
+ err = ssock->ops->bind(ssock, uaddr, addr_len);
+ if (!err)
+ mptcp_copy_inaddrs(sock->sk, ssock->sk);
+
+unlock:
+ release_sock(sock->sk);
+ return err;
+}
+
+static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+ int addr_len, int flags)
+{
+ struct mptcp_sock *msk = mptcp_sk(sock->sk);
+ struct socket *ssock;
+ int err;
+
+ lock_sock(sock->sk);
+ ssock = __mptcp_socket_create(msk, TCP_SYN_SENT);
+ if (IS_ERR(ssock)) {
+ err = PTR_ERR(ssock);
+ goto unlock;
+ }
+
+#ifdef CONFIG_TCP_MD5SIG
+ /* no MPTCP if MD5SIG is enabled on this socket or we may run out of
+ * TCP option space.
+ */
+ if (rcu_access_pointer(tcp_sk(ssock->sk)->md5sig_info))
+ mptcp_subflow_ctx(ssock->sk)->request_mptcp = 0;
+#endif
+
+ err = ssock->ops->connect(ssock, uaddr, addr_len, flags);
+ inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk));
+ mptcp_copy_inaddrs(sock->sk, ssock->sk);
+
+unlock:
+ release_sock(sock->sk);
+ return err;
+}
+
+static int mptcp_v4_getname(struct socket *sock, struct sockaddr *uaddr,
+ int peer)
+{
+ if (sock->sk->sk_prot == &tcp_prot) {
+ /* we are being invoked from __sys_accept4, after
+ * mptcp_accept() has just accepted a non-mp-capable
+ * flow: sk is a tcp_sk, not an mptcp one.
+ *
+ * Hand the socket over to tcp so all further socket ops
+ * bypass mptcp.
+ */
+ sock->ops = &inet_stream_ops;
+ }
+
+ return inet_getname(sock, uaddr, peer);
+}
+
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+static int mptcp_v6_getname(struct socket *sock, struct sockaddr *uaddr,
+ int peer)
+{
+ if (sock->sk->sk_prot == &tcpv6_prot) {
+ /* we are being invoked from __sys_accept4 after
+ * mptcp_accept() has accepted a non-mp-capable
+ * subflow: sk is a tcp_sk, not mptcp.
+ *
+ * Hand the socket over to tcp so all further
+ * socket ops bypass mptcp.
+ */
+ sock->ops = &inet6_stream_ops;
+ }
+
+ return inet6_getname(sock, uaddr, peer);
+}
+#endif
+
+static int mptcp_listen(struct socket *sock, int backlog)
+{
+ struct mptcp_sock *msk = mptcp_sk(sock->sk);
+ struct socket *ssock;
+ int err;
+
+ pr_debug("msk=%p", msk);
+
+ lock_sock(sock->sk);
+ ssock = __mptcp_socket_create(msk, TCP_LISTEN);
+ if (IS_ERR(ssock)) {
+ err = PTR_ERR(ssock);
+ goto unlock;
+ }
+
+ err = ssock->ops->listen(ssock, backlog);
+ inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk));
+ if (!err)
+ mptcp_copy_inaddrs(sock->sk, ssock->sk);
+
+unlock:
+ release_sock(sock->sk);
+ return err;
+}
+
+static bool is_tcp_proto(const struct proto *p)
+{
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ return p == &tcp_prot || p == &tcpv6_prot;
+#else
+ return p == &tcp_prot;
+#endif
+}
+
+static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
+ int flags, bool kern)
+{
+ struct mptcp_sock *msk = mptcp_sk(sock->sk);
+ struct socket *ssock;
+ int err;
+
+ pr_debug("msk=%p", msk);
+
+ lock_sock(sock->sk);
+ if (sock->sk->sk_state != TCP_LISTEN)
+ goto unlock_fail;
+
+ ssock = __mptcp_nmpc_socket(msk);
+ if (!ssock)
+ goto unlock_fail;
+
+ sock_hold(ssock->sk);
+ release_sock(sock->sk);
+
+ err = ssock->ops->accept(sock, newsock, flags, kern);
+ if (err == 0 && !is_tcp_proto(newsock->sk->sk_prot)) {
+ struct mptcp_sock *msk = mptcp_sk(newsock->sk);
+ struct mptcp_subflow_context *subflow;
+
+ /* set ssk->sk_socket of accept()ed flows to mptcp socket.
+ * This is needed so NOSPACE flag can be set from tcp stack.
+ */
+ list_for_each_entry(subflow, &msk->conn_list, node) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ if (!ssk->sk_socket)
+ mptcp_sock_graft(ssk, newsock);
+ }
+
+ inet_sk_state_store(newsock->sk, TCP_ESTABLISHED);
+ }
+
+ sock_put(ssock->sk);
+ return err;
+
+unlock_fail:
+ release_sock(sock->sk);
+ return -EINVAL;
+}
+
+static __poll_t mptcp_poll(struct file *file, struct socket *sock,
+ struct poll_table_struct *wait)
+{
+ struct sock *sk = sock->sk;
+ struct mptcp_sock *msk;
+ struct socket *ssock;
+ __poll_t mask = 0;
+
+ msk = mptcp_sk(sk);
+ lock_sock(sk);
+ ssock = __mptcp_nmpc_socket(msk);
+ if (ssock) {
+ mask = ssock->ops->poll(file, ssock, wait);
+ release_sock(sk);
+ return mask;
+ }
+
+ release_sock(sk);
+ sock_poll_wait(file, sock, wait);
+ lock_sock(sk);
+ ssock = __mptcp_tcp_fallback(msk);
+ if (unlikely(ssock))
+ return ssock->ops->poll(file, ssock, NULL);
+
+ if (test_bit(MPTCP_DATA_READY, &msk->flags))
+ mask = EPOLLIN | EPOLLRDNORM;
+ if (sk_stream_is_writeable(sk) &&
+ test_bit(MPTCP_SEND_SPACE, &msk->flags))
+ mask |= EPOLLOUT | EPOLLWRNORM;
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
+
+ release_sock(sk);
+
+ return mask;
+}
+
+static int mptcp_shutdown(struct socket *sock, int how)
+{
+ struct mptcp_sock *msk = mptcp_sk(sock->sk);
+ struct mptcp_subflow_context *subflow;
+ int ret = 0;
+
+ pr_debug("sk=%p, how=%d", msk, how);
+
+ lock_sock(sock->sk);
+
+ if (how == SHUT_WR || how == SHUT_RDWR)
+ inet_sk_state_store(sock->sk, TCP_FIN_WAIT1);
+
+ how++;
+
+ if ((how & ~SHUTDOWN_MASK) || !how) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ if (sock->state == SS_CONNECTING) {
+ if ((1 << sock->sk->sk_state) &
+ (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE))
+ sock->state = SS_DISCONNECTING;
+ else
+ sock->state = SS_CONNECTED;
+ }
+
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);
+
+ mptcp_subflow_shutdown(tcp_sk, how);
+ }
+
+out_unlock:
+ release_sock(sock->sk);
+
+ return ret;
+}
+
+static const struct proto_ops mptcp_stream_ops = {
+ .family = PF_INET,
+ .owner = THIS_MODULE,
+ .release = inet_release,
+ .bind = mptcp_bind,
+ .connect = mptcp_stream_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = mptcp_stream_accept,
+ .getname = mptcp_v4_getname,
+ .poll = mptcp_poll,
+ .ioctl = inet_ioctl,
+ .gettstamp = sock_gettstamp,
+ .listen = mptcp_listen,
+ .shutdown = mptcp_shutdown,
+ .setsockopt = sock_common_setsockopt,
+ .getsockopt = sock_common_getsockopt,
+ .sendmsg = inet_sendmsg,
+ .recvmsg = inet_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = inet_sendpage,
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_sock_common_setsockopt,
+ .compat_getsockopt = compat_sock_common_getsockopt,
+#endif
+};
+
+static struct inet_protosw mptcp_protosw = {
+ .type = SOCK_STREAM,
+ .protocol = IPPROTO_MPTCP,
+ .prot = &mptcp_prot,
+ .ops = &mptcp_stream_ops,
+ .flags = INET_PROTOSW_ICSK,
+};
+
+void mptcp_proto_init(void)
+{
+ mptcp_prot.h.hashinfo = tcp_prot.h.hashinfo;
+
+ mptcp_subflow_init();
+
+ if (proto_register(&mptcp_prot, 1) != 0)
+ panic("Failed to register MPTCP proto.\n");
+
+ inet_register_protosw(&mptcp_protosw);
+}
+
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+static const struct proto_ops mptcp_v6_stream_ops = {
+ .family = PF_INET6,
+ .owner = THIS_MODULE,
+ .release = inet6_release,
+ .bind = mptcp_bind,
+ .connect = mptcp_stream_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = mptcp_stream_accept,
+ .getname = mptcp_v6_getname,
+ .poll = mptcp_poll,
+ .ioctl = inet6_ioctl,
+ .gettstamp = sock_gettstamp,
+ .listen = mptcp_listen,
+ .shutdown = mptcp_shutdown,
+ .setsockopt = sock_common_setsockopt,
+ .getsockopt = sock_common_getsockopt,
+ .sendmsg = inet6_sendmsg,
+ .recvmsg = inet6_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = inet_sendpage,
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_sock_common_setsockopt,
+ .compat_getsockopt = compat_sock_common_getsockopt,
+#endif
+};
+
+static struct proto mptcp_v6_prot;
+
+static void mptcp_v6_destroy(struct sock *sk)
+{
+ mptcp_destroy(sk);
+ inet6_destroy_sock(sk);
+}
+
+static struct inet_protosw mptcp_v6_protosw = {
+ .type = SOCK_STREAM,
+ .protocol = IPPROTO_MPTCP,
+ .prot = &mptcp_v6_prot,
+ .ops = &mptcp_v6_stream_ops,
+ .flags = INET_PROTOSW_ICSK,
+};
+
+int mptcp_proto_v6_init(void)
+{
+ int err;
+
+ mptcp_v6_prot = mptcp_prot;
+ strcpy(mptcp_v6_prot.name, "MPTCPv6");
+ mptcp_v6_prot.slab = NULL;
+ mptcp_v6_prot.destroy = mptcp_v6_destroy;
+ mptcp_v6_prot.obj_size = sizeof(struct mptcp6_sock);
+
+ err = proto_register(&mptcp_v6_prot, 1);
+ if (err)
+ return err;
+
+ err = inet6_register_protosw(&mptcp_v6_protosw);
+ if (err)
+ proto_unregister(&mptcp_v6_prot);
+
+ return err;
+}
+#endif
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
new file mode 100644
index 000000000000..8a99a2930284
--- /dev/null
+++ b/net/mptcp/protocol.h
@@ -0,0 +1,240 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Multipath TCP
+ *
+ * Copyright (c) 2017 - 2019, Intel Corporation.
+ */
+
+#ifndef __MPTCP_PROTOCOL_H
+#define __MPTCP_PROTOCOL_H
+
+#include <linux/random.h>
+#include <net/tcp.h>
+#include <net/inet_connection_sock.h>
+
+#define MPTCP_SUPPORTED_VERSION 1
+
+/* MPTCP option bits */
+#define OPTION_MPTCP_MPC_SYN BIT(0)
+#define OPTION_MPTCP_MPC_SYNACK BIT(1)
+#define OPTION_MPTCP_MPC_ACK BIT(2)
+
+/* MPTCP option subtypes */
+#define MPTCPOPT_MP_CAPABLE 0
+#define MPTCPOPT_MP_JOIN 1
+#define MPTCPOPT_DSS 2
+#define MPTCPOPT_ADD_ADDR 3
+#define MPTCPOPT_RM_ADDR 4
+#define MPTCPOPT_MP_PRIO 5
+#define MPTCPOPT_MP_FAIL 6
+#define MPTCPOPT_MP_FASTCLOSE 7
+
+/* MPTCP suboption lengths */
+#define TCPOLEN_MPTCP_MPC_SYN 4
+#define TCPOLEN_MPTCP_MPC_SYNACK 12
+#define TCPOLEN_MPTCP_MPC_ACK 20
+#define TCPOLEN_MPTCP_MPC_ACK_DATA 22
+#define TCPOLEN_MPTCP_DSS_BASE 4
+#define TCPOLEN_MPTCP_DSS_ACK32 4
+#define TCPOLEN_MPTCP_DSS_ACK64 8
+#define TCPOLEN_MPTCP_DSS_MAP32 10
+#define TCPOLEN_MPTCP_DSS_MAP64 14
+#define TCPOLEN_MPTCP_DSS_CHECKSUM 2
+
+/* MPTCP MP_CAPABLE flags */
+#define MPTCP_VERSION_MASK (0x0F)
+#define MPTCP_CAP_CHECKSUM_REQD BIT(7)
+#define MPTCP_CAP_EXTENSIBILITY BIT(6)
+#define MPTCP_CAP_HMAC_SHA256 BIT(0)
+#define MPTCP_CAP_FLAG_MASK (0x3F)
+
+/* MPTCP DSS flags */
+#define MPTCP_DSS_DATA_FIN BIT(4)
+#define MPTCP_DSS_DSN64 BIT(3)
+#define MPTCP_DSS_HAS_MAP BIT(2)
+#define MPTCP_DSS_ACK64 BIT(1)
+#define MPTCP_DSS_HAS_ACK BIT(0)
+#define MPTCP_DSS_FLAG_MASK (0x1F)
+
+/* MPTCP socket flags */
+#define MPTCP_DATA_READY BIT(0)
+#define MPTCP_SEND_SPACE BIT(1)
+
+/* MPTCP connection sock */
+struct mptcp_sock {
+ /* inet_connection_sock must be the first member */
+ struct inet_connection_sock sk;
+ u64 local_key;
+ u64 remote_key;
+ u64 write_seq;
+ u64 ack_seq;
+ u32 token;
+ unsigned long flags;
+ bool can_ack;
+ struct list_head conn_list;
+ struct skb_ext *cached_ext; /* for the next sendmsg */
+ struct socket *subflow; /* outgoing connect/listener/!mp_capable */
+ struct sock *first;
+};
+
+#define mptcp_for_each_subflow(__msk, __subflow) \
+ list_for_each_entry(__subflow, &((__msk)->conn_list), node)
+
+static inline struct mptcp_sock *mptcp_sk(const struct sock *sk)
+{
+ return (struct mptcp_sock *)sk;
+}
+
+struct mptcp_subflow_request_sock {
+ struct tcp_request_sock sk;
+ u16 mp_capable : 1,
+ mp_join : 1,
+ backup : 1,
+ remote_key_valid : 1;
+ u64 local_key;
+ u64 remote_key;
+ u64 idsn;
+ u32 token;
+ u32 ssn_offset;
+};
+
+static inline struct mptcp_subflow_request_sock *
+mptcp_subflow_rsk(const struct request_sock *rsk)
+{
+ return (struct mptcp_subflow_request_sock *)rsk;
+}
+
+/* MPTCP subflow context */
+struct mptcp_subflow_context {
+ struct list_head node;/* conn_list of subflows */
+ u64 local_key;
+ u64 remote_key;
+ u64 idsn;
+ u64 map_seq;
+ u32 snd_isn;
+ u32 token;
+ u32 rel_write_seq;
+ u32 map_subflow_seq;
+ u32 ssn_offset;
+ u32 map_data_len;
+ u32 request_mptcp : 1, /* send MP_CAPABLE */
+ mp_capable : 1, /* remote is MPTCP capable */
+ fourth_ack : 1, /* send initial DSS */
+ conn_finished : 1,
+ map_valid : 1,
+ mpc_map : 1,
+ data_avail : 1,
+ rx_eof : 1,
+ can_ack : 1; /* only after processing the remote a key */
+
+ struct sock *tcp_sock; /* tcp sk backpointer */
+ struct sock *conn; /* parent mptcp_sock */
+ const struct inet_connection_sock_af_ops *icsk_af_ops;
+ void (*tcp_data_ready)(struct sock *sk);
+ void (*tcp_state_change)(struct sock *sk);
+ void (*tcp_write_space)(struct sock *sk);
+
+ struct rcu_head rcu;
+};
+
+static inline struct mptcp_subflow_context *
+mptcp_subflow_ctx(const struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ /* Use RCU on icsk_ulp_data only for sock diag code */
+ return (__force struct mptcp_subflow_context *)icsk->icsk_ulp_data;
+}
+
+static inline struct sock *
+mptcp_subflow_tcp_sock(const struct mptcp_subflow_context *subflow)
+{
+ return subflow->tcp_sock;
+}
+
+static inline u64
+mptcp_subflow_get_map_offset(const struct mptcp_subflow_context *subflow)
+{
+ return tcp_sk(mptcp_subflow_tcp_sock(subflow))->copied_seq -
+ subflow->ssn_offset -
+ subflow->map_subflow_seq;
+}
+
+static inline u64
+mptcp_subflow_get_mapped_dsn(const struct mptcp_subflow_context *subflow)
+{
+ return subflow->map_seq + mptcp_subflow_get_map_offset(subflow);
+}
+
+int mptcp_is_enabled(struct net *net);
+bool mptcp_subflow_data_available(struct sock *sk);
+void mptcp_subflow_init(void);
+int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock);
+
+static inline void mptcp_subflow_tcp_fallback(struct sock *sk,
+ struct mptcp_subflow_context *ctx)
+{
+ sk->sk_data_ready = ctx->tcp_data_ready;
+ sk->sk_state_change = ctx->tcp_state_change;
+ sk->sk_write_space = ctx->tcp_write_space;
+
+ inet_csk(sk)->icsk_af_ops = ctx->icsk_af_ops;
+}
+
+extern const struct inet_connection_sock_af_ops ipv4_specific;
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+extern const struct inet_connection_sock_af_ops ipv6_specific;
+#endif
+
+void mptcp_proto_init(void);
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+int mptcp_proto_v6_init(void);
+#endif
+
+struct mptcp_read_arg {
+ struct msghdr *msg;
+};
+
+int mptcp_read_actor(read_descriptor_t *desc, struct sk_buff *skb,
+ unsigned int offset, size_t len);
+
+void mptcp_get_options(const struct sk_buff *skb,
+ struct tcp_options_received *opt_rx);
+
+void mptcp_finish_connect(struct sock *sk);
+
+int mptcp_token_new_request(struct request_sock *req);
+void mptcp_token_destroy_request(u32 token);
+int mptcp_token_new_connect(struct sock *sk);
+int mptcp_token_new_accept(u32 token);
+void mptcp_token_update_accept(struct sock *sk, struct sock *conn);
+void mptcp_token_destroy(u32 token);
+
+void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn);
+static inline void mptcp_crypto_key_gen_sha(u64 *key, u32 *token, u64 *idsn)
+{
+ /* we might consider a faster version that computes the key as a
+ * hash of some information available in the MPTCP socket. Use
+ * random data at the moment, as it's probably the safest option
+ * in case multiple sockets are opened in different namespaces at
+ * the same time.
+ */
+ get_random_bytes(key, sizeof(u64));
+ mptcp_crypto_key_sha(*key, token, idsn);
+}
+
+void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u32 nonce1, u32 nonce2,
+ void *hash_out);
+
+static inline struct mptcp_ext *mptcp_get_ext(struct sk_buff *skb)
+{
+ return (struct mptcp_ext *)skb_ext_find(skb, SKB_EXT_MPTCP);
+}
+
+static inline bool before64(__u64 seq1, __u64 seq2)
+{
+ return (__s64)(seq1 - seq2) < 0;
+}
+
+#define after64(seq2, seq1) before64(seq1, seq2)
+
+#endif /* __MPTCP_PROTOCOL_H */
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
new file mode 100644
index 000000000000..65122edf60aa
--- /dev/null
+++ b/net/mptcp/subflow.c
@@ -0,0 +1,865 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Multipath TCP
+ *
+ * Copyright (c) 2017 - 2019, Intel Corporation.
+ */
+
+#define pr_fmt(fmt) "MPTCP: " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <net/sock.h>
+#include <net/inet_common.h>
+#include <net/inet_hashtables.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+#include <net/ip6_route.h>
+#endif
+#include <net/mptcp.h>
+#include "protocol.h"
+
+static int subflow_rebuild_header(struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ int err = 0;
+
+ if (subflow->request_mptcp && !subflow->token) {
+ pr_debug("subflow=%p", sk);
+ err = mptcp_token_new_connect(sk);
+ }
+
+ if (err)
+ return err;
+
+ return subflow->icsk_af_ops->rebuild_header(sk);
+}
+
+static void subflow_req_destructor(struct request_sock *req)
+{
+ struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
+
+ pr_debug("subflow_req=%p", subflow_req);
+
+ if (subflow_req->mp_capable)
+ mptcp_token_destroy_request(subflow_req->token);
+ tcp_request_sock_ops.destructor(req);
+}
+
+static void subflow_init_req(struct request_sock *req,
+ const struct sock *sk_listener,
+ struct sk_buff *skb)
+{
+ struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);
+ struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
+ struct tcp_options_received rx_opt;
+
+ pr_debug("subflow_req=%p, listener=%p", subflow_req, listener);
+
+ memset(&rx_opt.mptcp, 0, sizeof(rx_opt.mptcp));
+ mptcp_get_options(skb, &rx_opt);
+
+ subflow_req->mp_capable = 0;
+ subflow_req->remote_key_valid = 0;
+
+#ifdef CONFIG_TCP_MD5SIG
+ /* no MPTCP if MD5SIG is enabled on this socket or we may run out of
+ * TCP option space.
+ */
+ if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info))
+ return;
+#endif
+
+ if (rx_opt.mptcp.mp_capable && listener->request_mptcp) {
+ int err;
+
+ err = mptcp_token_new_request(req);
+ if (err == 0)
+ subflow_req->mp_capable = 1;
+
+ subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
+ }
+}
+
+static void subflow_v4_init_req(struct request_sock *req,
+ const struct sock *sk_listener,
+ struct sk_buff *skb)
+{
+ tcp_rsk(req)->is_mptcp = 1;
+
+ tcp_request_sock_ipv4_ops.init_req(req, sk_listener, skb);
+
+ subflow_init_req(req, sk_listener, skb);
+}
+
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+static void subflow_v6_init_req(struct request_sock *req,
+ const struct sock *sk_listener,
+ struct sk_buff *skb)
+{
+ tcp_rsk(req)->is_mptcp = 1;
+
+ tcp_request_sock_ipv6_ops.init_req(req, sk_listener, skb);
+
+ subflow_init_req(req, sk_listener, skb);
+}
+#endif
+
+static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+
+ subflow->icsk_af_ops->sk_rx_dst_set(sk, skb);
+
+ if (subflow->conn && !subflow->conn_finished) {
+ pr_debug("subflow=%p, remote_key=%llu", mptcp_subflow_ctx(sk),
+ subflow->remote_key);
+ mptcp_finish_connect(sk);
+ subflow->conn_finished = 1;
+
+ if (skb) {
+ pr_debug("synack seq=%u", TCP_SKB_CB(skb)->seq);
+ subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
+ }
+ }
+}
+
+static struct request_sock_ops subflow_request_sock_ops;
+static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops;
+
+static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+
+ pr_debug("subflow=%p", subflow);
+
+ /* Never answer to SYNs sent to broadcast or multicast */
+ if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
+ goto drop;
+
+ return tcp_conn_request(&subflow_request_sock_ops,
+ &subflow_request_sock_ipv4_ops,
+ sk, skb);
+drop:
+ tcp_listendrop(sk);
+ return 0;
+}
+
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops;
+static struct inet_connection_sock_af_ops subflow_v6_specific;
+static struct inet_connection_sock_af_ops subflow_v6m_specific;
+
+static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+
+ pr_debug("subflow=%p", subflow);
+
+ if (skb->protocol == htons(ETH_P_IP))
+ return subflow_v4_conn_request(sk, skb);
+
+ if (!ipv6_unicast_destination(skb))
+ goto drop;
+
+ return tcp_conn_request(&subflow_request_sock_ops,
+ &subflow_request_sock_ipv6_ops, sk, skb);
+
+drop:
+ tcp_listendrop(sk);
+ return 0; /* don't send reset */
+}
+#endif
+
+static struct sock *subflow_syn_recv_sock(const struct sock *sk,
+ struct sk_buff *skb,
+ struct request_sock *req,
+ struct dst_entry *dst,
+ struct request_sock *req_unhash,
+ bool *own_req)
+{
+ struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk);
+ struct mptcp_subflow_request_sock *subflow_req;
+ struct tcp_options_received opt_rx;
+ struct sock *child;
+
+ pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn);
+
+ if (tcp_rsk(req)->is_mptcp == 0)
+ goto create_child;
+
+ /* if the sk is MP_CAPABLE, we try to fetch the client key */
+ subflow_req = mptcp_subflow_rsk(req);
+ if (subflow_req->mp_capable) {
+ if (TCP_SKB_CB(skb)->seq != subflow_req->ssn_offset + 1) {
+ /* here we can receive and accept an in-window,
+ * out-of-order pkt, which will not carry the MP_CAPABLE
+ * opt even on mptcp enabled paths
+ */
+ goto create_child;
+ }
+
+ opt_rx.mptcp.mp_capable = 0;
+ mptcp_get_options(skb, &opt_rx);
+ if (opt_rx.mptcp.mp_capable) {
+ subflow_req->remote_key = opt_rx.mptcp.sndr_key;
+ subflow_req->remote_key_valid = 1;
+ } else {
+ subflow_req->mp_capable = 0;
+ }
+ }
+
+create_child:
+ child = listener->icsk_af_ops->syn_recv_sock(sk, skb, req, dst,
+ req_unhash, own_req);
+
+ if (child && *own_req) {
+ struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child);
+
+ /* we have null ctx on TCP fallback, not fatal on MPC
+ * handshake
+ */
+ if (!ctx)
+ return child;
+
+ if (ctx->mp_capable) {
+ if (mptcp_token_new_accept(ctx->token))
+ goto close_child;
+ }
+ }
+
+ return child;
+
+close_child:
+ pr_debug("closing child socket");
+ tcp_send_active_reset(child, GFP_ATOMIC);
+ inet_csk_prepare_forced_close(child);
+ tcp_done(child);
+ return NULL;
+}
+
+static struct inet_connection_sock_af_ops subflow_specific;
+
+enum mapping_status {
+ MAPPING_OK,
+ MAPPING_INVALID,
+ MAPPING_EMPTY,
+ MAPPING_DATA_FIN
+};
+
+static u64 expand_seq(u64 old_seq, u16 old_data_len, u64 seq)
+{
+ if ((u32)seq == (u32)old_seq)
+ return old_seq;
+
+ /* Assume map covers data not mapped yet. */
+ return seq | ((old_seq + old_data_len + 1) & GENMASK_ULL(63, 32));
+}
+
+static void warn_bad_map(struct mptcp_subflow_context *subflow, u32 ssn)
+{
+ WARN_ONCE(1, "Bad mapping: ssn=%d map_seq=%d map_data_len=%d",
+ ssn, subflow->map_subflow_seq, subflow->map_data_len);
+}
+
+static bool skb_is_fully_mapped(struct sock *ssk, struct sk_buff *skb)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ unsigned int skb_consumed;
+
+ skb_consumed = tcp_sk(ssk)->copied_seq - TCP_SKB_CB(skb)->seq;
+ if (WARN_ON_ONCE(skb_consumed >= skb->len))
+ return true;
+
+ return skb->len - skb_consumed <= subflow->map_data_len -
+ mptcp_subflow_get_map_offset(subflow);
+}
+
+static bool validate_mapping(struct sock *ssk, struct sk_buff *skb)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ u32 ssn = tcp_sk(ssk)->copied_seq - subflow->ssn_offset;
+
+ if (unlikely(before(ssn, subflow->map_subflow_seq))) {
+ /* Mapping covers data later in the subflow stream,
+ * currently unsupported.
+ */
+ warn_bad_map(subflow, ssn);
+ return false;
+ }
+ if (unlikely(!before(ssn, subflow->map_subflow_seq +
+ subflow->map_data_len))) {
+ /* Mapping does covers past subflow data, invalid */
+ warn_bad_map(subflow, ssn + skb->len);
+ return false;
+ }
+ return true;
+}
+
+static enum mapping_status get_mapping_status(struct sock *ssk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ struct mptcp_ext *mpext;
+ struct sk_buff *skb;
+ u16 data_len;
+ u64 map_seq;
+
+ skb = skb_peek(&ssk->sk_receive_queue);
+ if (!skb)
+ return MAPPING_EMPTY;
+
+ mpext = mptcp_get_ext(skb);
+ if (!mpext || !mpext->use_map) {
+ if (!subflow->map_valid && !skb->len) {
+ /* the TCP stack deliver 0 len FIN pkt to the receive
+ * queue, that is the only 0len pkts ever expected here,
+ * and we can admit no mapping only for 0 len pkts
+ */
+ if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
+ WARN_ONCE(1, "0len seq %d:%d flags %x",
+ TCP_SKB_CB(skb)->seq,
+ TCP_SKB_CB(skb)->end_seq,
+ TCP_SKB_CB(skb)->tcp_flags);
+ sk_eat_skb(ssk, skb);
+ return MAPPING_EMPTY;
+ }
+
+ if (!subflow->map_valid)
+ return MAPPING_INVALID;
+
+ goto validate_seq;
+ }
+
+ pr_debug("seq=%llu is64=%d ssn=%u data_len=%u data_fin=%d",
+ mpext->data_seq, mpext->dsn64, mpext->subflow_seq,
+ mpext->data_len, mpext->data_fin);
+
+ data_len = mpext->data_len;
+ if (data_len == 0) {
+ pr_err("Infinite mapping not handled");
+ return MAPPING_INVALID;
+ }
+
+ if (mpext->data_fin == 1) {
+ if (data_len == 1) {
+ pr_debug("DATA_FIN with no payload");
+ if (subflow->map_valid) {
+ /* A DATA_FIN might arrive in a DSS
+ * option before the previous mapping
+ * has been fully consumed. Continue
+ * handling the existing mapping.
+ */
+ skb_ext_del(skb, SKB_EXT_MPTCP);
+ return MAPPING_OK;
+ } else {
+ return MAPPING_DATA_FIN;
+ }
+ }
+
+ /* Adjust for DATA_FIN using 1 byte of sequence space */
+ data_len--;
+ }
+
+ if (!mpext->dsn64) {
+ map_seq = expand_seq(subflow->map_seq, subflow->map_data_len,
+ mpext->data_seq);
+ pr_debug("expanded seq=%llu", subflow->map_seq);
+ } else {
+ map_seq = mpext->data_seq;
+ }
+
+ if (subflow->map_valid) {
+ /* Allow replacing only with an identical map */
+ if (subflow->map_seq == map_seq &&
+ subflow->map_subflow_seq == mpext->subflow_seq &&
+ subflow->map_data_len == data_len) {
+ skb_ext_del(skb, SKB_EXT_MPTCP);
+ return MAPPING_OK;
+ }
+
+ /* If this skb data are fully covered by the current mapping,
+ * the new map would need caching, which is not supported
+ */
+ if (skb_is_fully_mapped(ssk, skb))
+ return MAPPING_INVALID;
+
+ /* will validate the next map after consuming the current one */
+ return MAPPING_OK;
+ }
+
+ subflow->map_seq = map_seq;
+ subflow->map_subflow_seq = mpext->subflow_seq;
+ subflow->map_data_len = data_len;
+ subflow->map_valid = 1;
+ subflow->mpc_map = mpext->mpc_map;
+ pr_debug("new map seq=%llu subflow_seq=%u data_len=%u",
+ subflow->map_seq, subflow->map_subflow_seq,
+ subflow->map_data_len);
+
+validate_seq:
+ /* we revalidate valid mapping on new skb, because we must ensure
+ * the current skb is completely covered by the available mapping
+ */
+ if (!validate_mapping(ssk, skb))
+ return MAPPING_INVALID;
+
+ skb_ext_del(skb, SKB_EXT_MPTCP);
+ return MAPPING_OK;
+}
+
+static bool subflow_check_data_avail(struct sock *ssk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ enum mapping_status status;
+ struct mptcp_sock *msk;
+ struct sk_buff *skb;
+
+ pr_debug("msk=%p ssk=%p data_avail=%d skb=%p", subflow->conn, ssk,
+ subflow->data_avail, skb_peek(&ssk->sk_receive_queue));
+ if (subflow->data_avail)
+ return true;
+
+ if (!subflow->conn)
+ return false;
+
+ msk = mptcp_sk(subflow->conn);
+ for (;;) {
+ u32 map_remaining;
+ size_t delta;
+ u64 ack_seq;
+ u64 old_ack;
+
+ status = get_mapping_status(ssk);
+ pr_debug("msk=%p ssk=%p status=%d", msk, ssk, status);
+ if (status == MAPPING_INVALID) {
+ ssk->sk_err = EBADMSG;
+ goto fatal;
+ }
+
+ if (status != MAPPING_OK)
+ return false;
+
+ skb = skb_peek(&ssk->sk_receive_queue);
+ if (WARN_ON_ONCE(!skb))
+ return false;
+
+ /* if msk lacks the remote key, this subflow must provide an
+ * MP_CAPABLE-based mapping
+ */
+ if (unlikely(!READ_ONCE(msk->can_ack))) {
+ if (!subflow->mpc_map) {
+ ssk->sk_err = EBADMSG;
+ goto fatal;
+ }
+ WRITE_ONCE(msk->remote_key, subflow->remote_key);
+ WRITE_ONCE(msk->ack_seq, subflow->map_seq);
+ WRITE_ONCE(msk->can_ack, true);
+ }
+
+ old_ack = READ_ONCE(msk->ack_seq);
+ ack_seq = mptcp_subflow_get_mapped_dsn(subflow);
+ pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack,
+ ack_seq);
+ if (ack_seq == old_ack)
+ break;
+
+ /* only accept in-sequence mapping. Old values are spurious
+ * retransmission; we can hit "future" values on active backup
+ * subflow switch, we relay on retransmissions to get
+ * in-sequence data.
+ * Cuncurrent subflows support will require subflow data
+ * reordering
+ */
+ map_remaining = subflow->map_data_len -
+ mptcp_subflow_get_map_offset(subflow);
+ if (before64(ack_seq, old_ack))
+ delta = min_t(size_t, old_ack - ack_seq, map_remaining);
+ else
+ delta = min_t(size_t, ack_seq - old_ack, map_remaining);
+
+ /* discard mapped data */
+ pr_debug("discarding %zu bytes, current map len=%d", delta,
+ map_remaining);
+ if (delta) {
+ struct mptcp_read_arg arg = {
+ .msg = NULL,
+ };
+ read_descriptor_t desc = {
+ .count = delta,
+ .arg.data = &arg,
+ };
+ int ret;
+
+ ret = tcp_read_sock(ssk, &desc, mptcp_read_actor);
+ if (ret < 0) {
+ ssk->sk_err = -ret;
+ goto fatal;
+ }
+ if (ret < delta)
+ return false;
+ if (delta == map_remaining)
+ subflow->map_valid = 0;
+ }
+ }
+ return true;
+
+fatal:
+ /* fatal protocol error, close the socket */
+ /* This barrier is coupled with smp_rmb() in tcp_poll() */
+ smp_wmb();
+ ssk->sk_error_report(ssk);
+ tcp_set_state(ssk, TCP_CLOSE);
+ tcp_send_active_reset(ssk, GFP_ATOMIC);
+ return false;
+}
+
+bool mptcp_subflow_data_available(struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct sk_buff *skb;
+
+ /* check if current mapping is still valid */
+ if (subflow->map_valid &&
+ mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) {
+ subflow->map_valid = 0;
+ subflow->data_avail = 0;
+
+ pr_debug("Done with mapping: seq=%u data_len=%u",
+ subflow->map_subflow_seq,
+ subflow->map_data_len);
+ }
+
+ if (!subflow_check_data_avail(sk)) {
+ subflow->data_avail = 0;
+ return false;
+ }
+
+ skb = skb_peek(&sk->sk_receive_queue);
+ subflow->data_avail = skb &&
+ before(tcp_sk(sk)->copied_seq, TCP_SKB_CB(skb)->end_seq);
+ return subflow->data_avail;
+}
+
+static void subflow_data_ready(struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct sock *parent = subflow->conn;
+
+ if (!parent || !subflow->mp_capable) {
+ subflow->tcp_data_ready(sk);
+
+ if (parent)
+ parent->sk_data_ready(parent);
+ return;
+ }
+
+ if (mptcp_subflow_data_available(sk)) {
+ set_bit(MPTCP_DATA_READY, &mptcp_sk(parent)->flags);
+
+ parent->sk_data_ready(parent);
+ }
+}
+
+static void subflow_write_space(struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct sock *parent = subflow->conn;
+
+ sk_stream_write_space(sk);
+ if (parent && sk_stream_is_writeable(sk)) {
+ set_bit(MPTCP_SEND_SPACE, &mptcp_sk(parent)->flags);
+ smp_mb__after_atomic();
+ /* set SEND_SPACE before sk_stream_write_space clears NOSPACE */
+ sk_stream_write_space(parent);
+ }
+}
+
+static struct inet_connection_sock_af_ops *
+subflow_default_af_ops(struct sock *sk)
+{
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ if (sk->sk_family == AF_INET6)
+ return &subflow_v6_specific;
+#endif
+ return &subflow_specific;
+}
+
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+void mptcpv6_handle_mapped(struct sock *sk, bool mapped)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct inet_connection_sock_af_ops *target;
+
+ target = mapped ? &subflow_v6m_specific : subflow_default_af_ops(sk);
+
+ pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d",
+ subflow, sk->sk_family, icsk->icsk_af_ops, target, mapped);
+
+ if (likely(icsk->icsk_af_ops == target))
+ return;
+
+ subflow->icsk_af_ops = icsk->icsk_af_ops;
+ icsk->icsk_af_ops = target;
+}
+#endif
+
+int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)
+{
+ struct mptcp_subflow_context *subflow;
+ struct net *net = sock_net(sk);
+ struct socket *sf;
+ int err;
+
+ err = sock_create_kern(net, sk->sk_family, SOCK_STREAM, IPPROTO_TCP,
+ &sf);
+ if (err)
+ return err;
+
+ lock_sock(sf->sk);
+
+ /* kernel sockets do not by default acquire net ref, but TCP timer
+ * needs it.
+ */
+ sf->sk->sk_net_refcnt = 1;
+ get_net(net);
+#ifdef CONFIG_PROC_FS
+ this_cpu_add(*net->core.sock_inuse, 1);
+#endif
+ err = tcp_set_ulp(sf->sk, "mptcp");
+ release_sock(sf->sk);
+
+ if (err)
+ return err;
+
+ subflow = mptcp_subflow_ctx(sf->sk);
+ pr_debug("subflow=%p", subflow);
+
+ *new_sock = sf;
+ sock_hold(sk);
+ subflow->conn = sk;
+
+ return 0;
+}
+
+static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk,
+ gfp_t priority)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct mptcp_subflow_context *ctx;
+
+ ctx = kzalloc(sizeof(*ctx), priority);
+ if (!ctx)
+ return NULL;
+
+ rcu_assign_pointer(icsk->icsk_ulp_data, ctx);
+ INIT_LIST_HEAD(&ctx->node);
+
+ pr_debug("subflow=%p", ctx);
+
+ ctx->tcp_sock = sk;
+
+ return ctx;
+}
+
+static void __subflow_state_change(struct sock *sk)
+{
+ struct socket_wq *wq;
+
+ rcu_read_lock();
+ wq = rcu_dereference(sk->sk_wq);
+ if (skwq_has_sleeper(wq))
+ wake_up_interruptible_all(&wq->wait);
+ rcu_read_unlock();
+}
+
+static bool subflow_is_done(const struct sock *sk)
+{
+ return sk->sk_shutdown & RCV_SHUTDOWN || sk->sk_state == TCP_CLOSE;
+}
+
+static void subflow_state_change(struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct sock *parent = READ_ONCE(subflow->conn);
+
+ __subflow_state_change(sk);
+
+ /* as recvmsg() does not acquire the subflow socket for ssk selection
+ * a fin packet carrying a DSS can be unnoticed if we don't trigger
+ * the data available machinery here.
+ */
+ if (parent && subflow->mp_capable && mptcp_subflow_data_available(sk)) {
+ set_bit(MPTCP_DATA_READY, &mptcp_sk(parent)->flags);
+
+ parent->sk_data_ready(parent);
+ }
+
+ if (parent && !(parent->sk_shutdown & RCV_SHUTDOWN) &&
+ !subflow->rx_eof && subflow_is_done(sk)) {
+ subflow->rx_eof = 1;
+ parent->sk_shutdown |= RCV_SHUTDOWN;
+ __subflow_state_change(parent);
+ }
+}
+
+static int subflow_ulp_init(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct mptcp_subflow_context *ctx;
+ struct tcp_sock *tp = tcp_sk(sk);
+ int err = 0;
+
+ /* disallow attaching ULP to a socket unless it has been
+ * created with sock_create_kern()
+ */
+ if (!sk->sk_kern_sock) {
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+
+ ctx = subflow_create_ctx(sk, GFP_KERNEL);
+ if (!ctx) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ pr_debug("subflow=%p, family=%d", ctx, sk->sk_family);
+
+ tp->is_mptcp = 1;
+ ctx->icsk_af_ops = icsk->icsk_af_ops;
+ icsk->icsk_af_ops = subflow_default_af_ops(sk);
+ ctx->tcp_data_ready = sk->sk_data_ready;
+ ctx->tcp_state_change = sk->sk_state_change;
+ ctx->tcp_write_space = sk->sk_write_space;
+ sk->sk_data_ready = subflow_data_ready;
+ sk->sk_write_space = subflow_write_space;
+ sk->sk_state_change = subflow_state_change;
+out:
+ return err;
+}
+
+static void subflow_ulp_release(struct sock *sk)
+{
+ struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(sk);
+
+ if (!ctx)
+ return;
+
+ if (ctx->conn)
+ sock_put(ctx->conn);
+
+ kfree_rcu(ctx, rcu);
+}
+
+static void subflow_ulp_fallback(struct sock *sk,
+ struct mptcp_subflow_context *old_ctx)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ mptcp_subflow_tcp_fallback(sk, old_ctx);
+ icsk->icsk_ulp_ops = NULL;
+ rcu_assign_pointer(icsk->icsk_ulp_data, NULL);
+ tcp_sk(sk)->is_mptcp = 0;
+}
+
+static void subflow_ulp_clone(const struct request_sock *req,
+ struct sock *newsk,
+ const gfp_t priority)
+{
+ struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
+ struct mptcp_subflow_context *old_ctx = mptcp_subflow_ctx(newsk);
+ struct mptcp_subflow_context *new_ctx;
+
+ if (!tcp_rsk(req)->is_mptcp || !subflow_req->mp_capable) {
+ subflow_ulp_fallback(newsk, old_ctx);
+ return;
+ }
+
+ new_ctx = subflow_create_ctx(newsk, priority);
+ if (!new_ctx) {
+ subflow_ulp_fallback(newsk, old_ctx);
+ return;
+ }
+
+ /* see comments in subflow_syn_recv_sock(), MPTCP connection is fully
+ * established only after we receive the remote key
+ */
+ new_ctx->conn_finished = 1;
+ new_ctx->icsk_af_ops = old_ctx->icsk_af_ops;
+ new_ctx->tcp_data_ready = old_ctx->tcp_data_ready;
+ new_ctx->tcp_state_change = old_ctx->tcp_state_change;
+ new_ctx->tcp_write_space = old_ctx->tcp_write_space;
+ new_ctx->mp_capable = 1;
+ new_ctx->fourth_ack = subflow_req->remote_key_valid;
+ new_ctx->can_ack = subflow_req->remote_key_valid;
+ new_ctx->remote_key = subflow_req->remote_key;
+ new_ctx->local_key = subflow_req->local_key;
+ new_ctx->token = subflow_req->token;
+ new_ctx->ssn_offset = subflow_req->ssn_offset;
+ new_ctx->idsn = subflow_req->idsn;
+}
+
+static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = {
+ .name = "mptcp",
+ .owner = THIS_MODULE,
+ .init = subflow_ulp_init,
+ .release = subflow_ulp_release,
+ .clone = subflow_ulp_clone,
+};
+
+static int subflow_ops_init(struct request_sock_ops *subflow_ops)
+{
+ subflow_ops->obj_size = sizeof(struct mptcp_subflow_request_sock);
+ subflow_ops->slab_name = "request_sock_subflow";
+
+ subflow_ops->slab = kmem_cache_create(subflow_ops->slab_name,
+ subflow_ops->obj_size, 0,
+ SLAB_ACCOUNT |
+ SLAB_TYPESAFE_BY_RCU,
+ NULL);
+ if (!subflow_ops->slab)
+ return -ENOMEM;
+
+ subflow_ops->destructor = subflow_req_destructor;
+
+ return 0;
+}
+
+void mptcp_subflow_init(void)
+{
+ subflow_request_sock_ops = tcp_request_sock_ops;
+ if (subflow_ops_init(&subflow_request_sock_ops) != 0)
+ panic("MPTCP: failed to init subflow request sock ops\n");
+
+ subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops;
+ subflow_request_sock_ipv4_ops.init_req = subflow_v4_init_req;
+
+ subflow_specific = ipv4_specific;
+ subflow_specific.conn_request = subflow_v4_conn_request;
+ subflow_specific.syn_recv_sock = subflow_syn_recv_sock;
+ subflow_specific.sk_rx_dst_set = subflow_finish_connect;
+ subflow_specific.rebuild_header = subflow_rebuild_header;
+
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
+ subflow_request_sock_ipv6_ops.init_req = subflow_v6_init_req;
+
+ subflow_v6_specific = ipv6_specific;
+ subflow_v6_specific.conn_request = subflow_v6_conn_request;
+ subflow_v6_specific.syn_recv_sock = subflow_syn_recv_sock;
+ subflow_v6_specific.sk_rx_dst_set = subflow_finish_connect;
+ subflow_v6_specific.rebuild_header = subflow_rebuild_header;
+
+ subflow_v6m_specific = subflow_v6_specific;
+ subflow_v6m_specific.queue_xmit = ipv4_specific.queue_xmit;
+ subflow_v6m_specific.send_check = ipv4_specific.send_check;
+ subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len;
+ subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced;
+ subflow_v6m_specific.net_frag_header_len = 0;
+#endif
+
+ if (tcp_register_ulp(&subflow_ulp_ops) != 0)
+ panic("MPTCP: failed to register subflows to ULP\n");
+}
diff --git a/net/mptcp/token.c b/net/mptcp/token.c
new file mode 100644
index 000000000000..84d887806090
--- /dev/null
+++ b/net/mptcp/token.c
@@ -0,0 +1,195 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Multipath TCP token management
+ * Copyright (c) 2017 - 2019, Intel Corporation.
+ *
+ * Note: This code is based on mptcp_ctrl.c from multipath-tcp.org,
+ * authored by:
+ *
+ * Sébastien Barré <sebastien.barre@uclouvain.be>
+ * Christoph Paasch <christoph.paasch@uclouvain.be>
+ * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
+ * Gregory Detal <gregory.detal@uclouvain.be>
+ * Fabien DuchĂŞne <fabien.duchene@uclouvain.be>
+ * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
+ * Lavkesh Lahngir <lavkesh51@gmail.com>
+ * Andreas Ripke <ripke@neclab.eu>
+ * Vlad Dogaru <vlad.dogaru@intel.com>
+ * Octavian Purdila <octavian.purdila@intel.com>
+ * John Ronan <jronan@tssg.org>
+ * Catalin Nicutar <catalin.nicutar@gmail.com>
+ * Brandon Heller <brandonh@stanford.edu>
+ */
+
+#define pr_fmt(fmt) "MPTCP: " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/radix-tree.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <net/sock.h>
+#include <net/inet_common.h>
+#include <net/protocol.h>
+#include <net/mptcp.h>
+#include "protocol.h"
+
+static RADIX_TREE(token_tree, GFP_ATOMIC);
+static RADIX_TREE(token_req_tree, GFP_ATOMIC);
+static DEFINE_SPINLOCK(token_tree_lock);
+static int token_used __read_mostly;
+
+/**
+ * mptcp_token_new_request - create new key/idsn/token for subflow_request
+ * @req - the request socket
+ *
+ * This function is called when a new mptcp connection is coming in.
+ *
+ * It creates a unique token to identify the new mptcp connection,
+ * a secret local key and the initial data sequence number (idsn).
+ *
+ * Returns 0 on success.
+ */
+int mptcp_token_new_request(struct request_sock *req)
+{
+ struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
+ int err;
+
+ while (1) {
+ u32 token;
+
+ mptcp_crypto_key_gen_sha(&subflow_req->local_key,
+ &subflow_req->token,
+ &subflow_req->idsn);
+ pr_debug("req=%p local_key=%llu, token=%u, idsn=%llu\n",
+ req, subflow_req->local_key, subflow_req->token,
+ subflow_req->idsn);
+
+ token = subflow_req->token;
+ spin_lock_bh(&token_tree_lock);
+ if (!radix_tree_lookup(&token_req_tree, token) &&
+ !radix_tree_lookup(&token_tree, token))
+ break;
+ spin_unlock_bh(&token_tree_lock);
+ }
+
+ err = radix_tree_insert(&token_req_tree,
+ subflow_req->token, &token_used);
+ spin_unlock_bh(&token_tree_lock);
+ return err;
+}
+
+/**
+ * mptcp_token_new_connect - create new key/idsn/token for subflow
+ * @sk - the socket that will initiate a connection
+ *
+ * This function is called when a new outgoing mptcp connection is
+ * initiated.
+ *
+ * It creates a unique token to identify the new mptcp connection,
+ * a secret local key and the initial data sequence number (idsn).
+ *
+ * On success, the mptcp connection can be found again using
+ * the computed token at a later time, this is needed to process
+ * join requests.
+ *
+ * returns 0 on success.
+ */
+int mptcp_token_new_connect(struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct sock *mptcp_sock = subflow->conn;
+ int err;
+
+ while (1) {
+ u32 token;
+
+ mptcp_crypto_key_gen_sha(&subflow->local_key, &subflow->token,
+ &subflow->idsn);
+
+ pr_debug("ssk=%p, local_key=%llu, token=%u, idsn=%llu\n",
+ sk, subflow->local_key, subflow->token, subflow->idsn);
+
+ token = subflow->token;
+ spin_lock_bh(&token_tree_lock);
+ if (!radix_tree_lookup(&token_req_tree, token) &&
+ !radix_tree_lookup(&token_tree, token))
+ break;
+ spin_unlock_bh(&token_tree_lock);
+ }
+ err = radix_tree_insert(&token_tree, subflow->token, mptcp_sock);
+ spin_unlock_bh(&token_tree_lock);
+
+ return err;
+}
+
+/**
+ * mptcp_token_new_accept - insert token for later processing
+ * @token: the token to insert to the tree
+ *
+ * Called when a SYN packet creates a new logical connection, i.e.
+ * is not a join request.
+ *
+ * We don't have an mptcp socket yet at that point.
+ * This is paired with mptcp_token_update_accept, called on accept().
+ */
+int mptcp_token_new_accept(u32 token)
+{
+ int err;
+
+ spin_lock_bh(&token_tree_lock);
+ err = radix_tree_insert(&token_tree, token, &token_used);
+ spin_unlock_bh(&token_tree_lock);
+
+ return err;
+}
+
+/**
+ * mptcp_token_update_accept - update token to map to mptcp socket
+ * @conn: the new struct mptcp_sock
+ * @sk: the initial subflow for this mptcp socket
+ *
+ * Called when the first mptcp socket is created on accept to
+ * refresh the dummy mapping (done to reserve the token) with
+ * the mptcp_socket structure that wasn't allocated before.
+ */
+void mptcp_token_update_accept(struct sock *sk, struct sock *conn)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ void __rcu **slot;
+
+ spin_lock_bh(&token_tree_lock);
+ slot = radix_tree_lookup_slot(&token_tree, subflow->token);
+ WARN_ON_ONCE(!slot);
+ if (slot) {
+ WARN_ON_ONCE(rcu_access_pointer(*slot) != &token_used);
+ radix_tree_replace_slot(&token_tree, slot, conn);
+ }
+ spin_unlock_bh(&token_tree_lock);
+}
+
+/**
+ * mptcp_token_destroy_request - remove mptcp connection/token
+ * @token - token of mptcp connection to remove
+ *
+ * Remove not-yet-fully-established incoming connection identified
+ * by @token.
+ */
+void mptcp_token_destroy_request(u32 token)
+{
+ spin_lock_bh(&token_tree_lock);
+ radix_tree_delete(&token_req_tree, token);
+ spin_unlock_bh(&token_tree_lock);
+}
+
+/**
+ * mptcp_token_destroy - remove mptcp connection/token
+ * @token - token of mptcp connection to remove
+ *
+ * Remove the connection identified by @token.
+ */
+void mptcp_token_destroy(u32 token)
+{
+ spin_lock_bh(&token_tree_lock);
+ radix_tree_delete(&token_tree, token);
+ spin_unlock_bh(&token_tree_lock);
+}
diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h
index 0b3f0673e1a2..e37102546be6 100644
--- a/net/ncsi/internal.h
+++ b/net/ncsi/internal.h
@@ -64,6 +64,17 @@ enum {
NCSI_MODE_MAX
};
+/* Supported media status bits for Mellanox Mac affinity command.
+ * Bit (0-2) for different protocol support; Bit 1 for RBT support,
+ * bit 1 for SMBUS support and bit 2 for PCIE support. Bit (3-5)
+ * for different protocol availability. Bit 4 for RBT, bit 4 for
+ * SMBUS and bit 5 for PCIE.
+ */
+enum {
+ MLX_MC_RBT_SUPPORT = 0x01, /* MC supports RBT */
+ MLX_MC_RBT_AVL = 0x08, /* RBT medium is available */
+};
+
/* OEM Vendor Manufacture ID */
#define NCSI_OEM_MFR_MLX_ID 0x8119
#define NCSI_OEM_MFR_BCM_ID 0x113d
@@ -72,9 +83,15 @@ enum {
/* Mellanox specific OEM Command */
#define NCSI_OEM_MLX_CMD_GMA 0x00 /* CMD ID for Get MAC */
#define NCSI_OEM_MLX_CMD_GMA_PARAM 0x1b /* Parameter for GMA */
+#define NCSI_OEM_MLX_CMD_SMAF 0x01 /* CMD ID for Set MC Affinity */
+#define NCSI_OEM_MLX_CMD_SMAF_PARAM 0x07 /* Parameter for SMAF */
/* OEM Command payload lengths*/
#define NCSI_OEM_BCM_CMD_GMA_LEN 12
#define NCSI_OEM_MLX_CMD_GMA_LEN 8
+#define NCSI_OEM_MLX_CMD_SMAF_LEN 60
+/* Offset in OEM request */
+#define MLX_SMAF_MAC_ADDR_OFFSET 8 /* Offset for MAC in SMAF */
+#define MLX_SMAF_MED_SUPPORT_OFFSET 14 /* Offset for medium in SMAF */
/* Mac address offset in OEM response */
#define BCM_MAC_ADDR_OFFSET 28
#define MLX_MAC_ADDR_OFFSET 8
@@ -251,6 +268,8 @@ enum {
ncsi_dev_state_probe_deselect = 0x0201,
ncsi_dev_state_probe_package,
ncsi_dev_state_probe_channel,
+ ncsi_dev_state_probe_mlx_gma,
+ ncsi_dev_state_probe_mlx_smaf,
ncsi_dev_state_probe_cis,
ncsi_dev_state_probe_gvi,
ncsi_dev_state_probe_gc,
@@ -264,9 +283,7 @@ enum {
ncsi_dev_state_config_ev,
ncsi_dev_state_config_sma,
ncsi_dev_state_config_ebf,
-#if IS_ENABLED(CONFIG_IPV6)
- ncsi_dev_state_config_egmf,
-#endif
+ ncsi_dev_state_config_dgmf,
ncsi_dev_state_config_ecnt,
ncsi_dev_state_config_ec,
ncsi_dev_state_config_ae,
@@ -295,9 +312,6 @@ struct ncsi_dev_priv {
#define NCSI_DEV_RESET 8 /* Reset state of NC */
unsigned int gma_flag; /* OEM GMA flag */
spinlock_t lock; /* Protect the NCSI device */
-#if IS_ENABLED(CONFIG_IPV6)
- unsigned int inet6_addr_num; /* Number of IPv6 addresses */
-#endif
unsigned int package_probe_id;/* Current ID during probe */
unsigned int package_num; /* Number of packages */
struct list_head packages; /* List of packages */
@@ -316,6 +330,7 @@ struct ncsi_dev_priv {
struct list_head vlan_vids; /* List of active VLAN IDs */
bool multi_package; /* Enable multiple packages */
+ bool mlx_multi_host; /* Enable multi host Mellanox */
u32 package_whitelist; /* Packages to configure */
};
diff --git a/net/ncsi/ncsi-cmd.c b/net/ncsi/ncsi-cmd.c
index 5c3fad8cba57..ba9ae482141b 100644
--- a/net/ncsi/ncsi-cmd.c
+++ b/net/ncsi/ncsi-cmd.c
@@ -54,7 +54,7 @@ static void ncsi_cmd_build_header(struct ncsi_pkt_hdr *h,
checksum = ncsi_calculate_checksum((unsigned char *)h,
sizeof(*h) + nca->payload);
pchecksum = (__be32 *)((void *)h + sizeof(struct ncsi_pkt_hdr) +
- nca->payload);
+ ALIGN(nca->payload, 4));
*pchecksum = htonl(checksum);
}
@@ -309,14 +309,21 @@ static struct ncsi_request *ncsi_alloc_command(struct ncsi_cmd_arg *nca)
int ncsi_xmit_cmd(struct ncsi_cmd_arg *nca)
{
+ struct ncsi_cmd_handler *nch = NULL;
struct ncsi_request *nr;
+ unsigned char type;
struct ethhdr *eh;
- struct ncsi_cmd_handler *nch = NULL;
int i, ret;
+ /* Use OEM generic handler for Netlink request */
+ if (nca->req_flags == NCSI_REQ_FLAG_NETLINK_DRIVEN)
+ type = NCSI_PKT_CMD_OEM;
+ else
+ type = nca->type;
+
/* Search for the handler */
for (i = 0; i < ARRAY_SIZE(ncsi_cmd_handlers); i++) {
- if (ncsi_cmd_handlers[i].type == nca->type) {
+ if (ncsi_cmd_handlers[i].type == type) {
if (ncsi_cmd_handlers[i].handler)
nch = &ncsi_cmd_handlers[i];
else
@@ -362,7 +369,15 @@ int ncsi_xmit_cmd(struct ncsi_cmd_arg *nca)
eh = skb_push(nr->cmd, sizeof(*eh));
eh->h_proto = htons(ETH_P_NCSI);
eth_broadcast_addr(eh->h_dest);
- eth_broadcast_addr(eh->h_source);
+
+ /* If mac address received from device then use it for
+ * source address as unicast address else use broadcast
+ * address as source address
+ */
+ if (nca->ndp->gma_flag == 1)
+ memcpy(eh->h_source, nca->ndp->ndev.dev->dev_addr, ETH_ALEN);
+ else
+ eth_broadcast_addr(eh->h_source);
/* Start the timer for the request that might not have
* corresponding response. Given NCSI is an internal
diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c
index 755aab66dcab..1f387be7827b 100644
--- a/net/ncsi/ncsi-manage.c
+++ b/net/ncsi/ncsi-manage.c
@@ -8,13 +8,14 @@
#include <linux/init.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
#include <net/ncsi.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/addrconf.h>
#include <net/ipv6.h>
-#include <net/if_inet6.h>
#include <net/genetlink.h>
#include "internal.h"
@@ -731,6 +732,34 @@ static int ncsi_oem_gma_handler_mlx(struct ncsi_cmd_arg *nca)
return ret;
}
+static int ncsi_oem_smaf_mlx(struct ncsi_cmd_arg *nca)
+{
+ union {
+ u8 data_u8[NCSI_OEM_MLX_CMD_SMAF_LEN];
+ u32 data_u32[NCSI_OEM_MLX_CMD_SMAF_LEN / sizeof(u32)];
+ } u;
+ int ret = 0;
+
+ memset(&u, 0, sizeof(u));
+ u.data_u32[0] = ntohl(NCSI_OEM_MFR_MLX_ID);
+ u.data_u8[5] = NCSI_OEM_MLX_CMD_SMAF;
+ u.data_u8[6] = NCSI_OEM_MLX_CMD_SMAF_PARAM;
+ memcpy(&u.data_u8[MLX_SMAF_MAC_ADDR_OFFSET],
+ nca->ndp->ndev.dev->dev_addr, ETH_ALEN);
+ u.data_u8[MLX_SMAF_MED_SUPPORT_OFFSET] =
+ (MLX_MC_RBT_AVL | MLX_MC_RBT_SUPPORT);
+
+ nca->payload = NCSI_OEM_MLX_CMD_SMAF_LEN;
+ nca->data = u.data_u8;
+
+ ret = ncsi_xmit_cmd(nca);
+ if (ret)
+ netdev_err(nca->ndp->ndev.dev,
+ "NCSI: Failed to transmit cmd 0x%x during probe\n",
+ nca->type);
+ return ret;
+}
+
/* OEM Command handlers initialization */
static struct ncsi_oem_gma_handler {
unsigned int mfr_id;
@@ -765,9 +794,6 @@ static int ncsi_gma_handler(struct ncsi_cmd_arg *nca, unsigned int mf_id)
return -1;
}
- /* Set the flag for GMA command which should only be called once */
- nca->ndp->gma_flag = 1;
-
/* Get Mac address from NCSI device */
return nch->handler(nca);
}
@@ -978,9 +1004,7 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp)
case ncsi_dev_state_config_ev:
case ncsi_dev_state_config_sma:
case ncsi_dev_state_config_ebf:
-#if IS_ENABLED(CONFIG_IPV6)
- case ncsi_dev_state_config_egmf:
-#endif
+ case ncsi_dev_state_config_dgmf:
case ncsi_dev_state_config_ecnt:
case ncsi_dev_state_config_ec:
case ncsi_dev_state_config_ae:
@@ -1033,23 +1057,23 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp)
} else if (nd->state == ncsi_dev_state_config_ebf) {
nca.type = NCSI_PKT_CMD_EBF;
nca.dwords[0] = nc->caps[NCSI_CAP_BC].cap;
- if (ncsi_channel_is_tx(ndp, nc))
+ /* if multicast global filtering is supported then
+ * disable it so that all multicast packet will be
+ * forwarded to management controller
+ */
+ if (nc->caps[NCSI_CAP_GENERIC].cap &
+ NCSI_CAP_GENERIC_MC)
+ nd->state = ncsi_dev_state_config_dgmf;
+ else if (ncsi_channel_is_tx(ndp, nc))
nd->state = ncsi_dev_state_config_ecnt;
else
nd->state = ncsi_dev_state_config_ec;
-#if IS_ENABLED(CONFIG_IPV6)
- if (ndp->inet6_addr_num > 0 &&
- (nc->caps[NCSI_CAP_GENERIC].cap &
- NCSI_CAP_GENERIC_MC))
- nd->state = ncsi_dev_state_config_egmf;
- } else if (nd->state == ncsi_dev_state_config_egmf) {
- nca.type = NCSI_PKT_CMD_EGMF;
- nca.dwords[0] = nc->caps[NCSI_CAP_MC].cap;
+ } else if (nd->state == ncsi_dev_state_config_dgmf) {
+ nca.type = NCSI_PKT_CMD_DGMF;
if (ncsi_channel_is_tx(ndp, nc))
nd->state = ncsi_dev_state_config_ecnt;
else
nd->state = ncsi_dev_state_config_ec;
-#endif /* CONFIG_IPV6 */
} else if (nd->state == ncsi_dev_state_config_ecnt) {
if (np->preferred_channel &&
nc != np->preferred_channel)
@@ -1316,8 +1340,38 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp)
break;
}
nd->state = ncsi_dev_state_probe_cis;
+ if (IS_ENABLED(CONFIG_NCSI_OEM_CMD_GET_MAC) &&
+ ndp->mlx_multi_host)
+ nd->state = ncsi_dev_state_probe_mlx_gma;
+
schedule_work(&ndp->work);
break;
+#if IS_ENABLED(CONFIG_NCSI_OEM_CMD_GET_MAC)
+ case ncsi_dev_state_probe_mlx_gma:
+ ndp->pending_req_num = 1;
+
+ nca.type = NCSI_PKT_CMD_OEM;
+ nca.package = ndp->active_package->id;
+ nca.channel = 0;
+ ret = ncsi_oem_gma_handler_mlx(&nca);
+ if (ret)
+ goto error;
+
+ nd->state = ncsi_dev_state_probe_mlx_smaf;
+ break;
+ case ncsi_dev_state_probe_mlx_smaf:
+ ndp->pending_req_num = 1;
+
+ nca.type = NCSI_PKT_CMD_OEM;
+ nca.package = ndp->active_package->id;
+ nca.channel = 0;
+ ret = ncsi_oem_smaf_mlx(&nca);
+ if (ret)
+ goto error;
+
+ nd->state = ncsi_dev_state_probe_cis;
+ break;
+#endif /* CONFIG_NCSI_OEM_CMD_GET_MAC */
case ncsi_dev_state_probe_cis:
ndp->pending_req_num = NCSI_RESERVED_CHANNEL;
@@ -1483,70 +1537,6 @@ out:
return -ENODEV;
}
-#if IS_ENABLED(CONFIG_IPV6)
-static int ncsi_inet6addr_event(struct notifier_block *this,
- unsigned long event, void *data)
-{
- struct inet6_ifaddr *ifa = data;
- struct net_device *dev = ifa->idev->dev;
- struct ncsi_dev *nd = ncsi_find_dev(dev);
- struct ncsi_dev_priv *ndp = nd ? TO_NCSI_DEV_PRIV(nd) : NULL;
- struct ncsi_package *np;
- struct ncsi_channel *nc;
- struct ncsi_cmd_arg nca;
- bool action;
- int ret;
-
- if (!ndp || (ipv6_addr_type(&ifa->addr) &
- (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK)))
- return NOTIFY_OK;
-
- switch (event) {
- case NETDEV_UP:
- action = (++ndp->inet6_addr_num) == 1;
- nca.type = NCSI_PKT_CMD_EGMF;
- break;
- case NETDEV_DOWN:
- action = (--ndp->inet6_addr_num == 0);
- nca.type = NCSI_PKT_CMD_DGMF;
- break;
- default:
- return NOTIFY_OK;
- }
-
- /* We might not have active channel or packages. The IPv6
- * required multicast will be enabled when active channel
- * or packages are chosen.
- */
- np = ndp->active_package;
- nc = ndp->active_channel;
- if (!action || !np || !nc)
- return NOTIFY_OK;
-
- /* We needn't enable or disable it if the function isn't supported */
- if (!(nc->caps[NCSI_CAP_GENERIC].cap & NCSI_CAP_GENERIC_MC))
- return NOTIFY_OK;
-
- nca.ndp = ndp;
- nca.req_flags = 0;
- nca.package = np->id;
- nca.channel = nc->id;
- nca.dwords[0] = nc->caps[NCSI_CAP_MC].cap;
- ret = ncsi_xmit_cmd(&nca);
- if (ret) {
- netdev_warn(dev, "Fail to %s global multicast filter (%d)\n",
- (event == NETDEV_UP) ? "enable" : "disable", ret);
- return NOTIFY_DONE;
- }
-
- return NOTIFY_OK;
-}
-
-static struct notifier_block ncsi_inet6addr_notifier = {
- .notifier_call = ncsi_inet6addr_event,
-};
-#endif /* CONFIG_IPV6 */
-
static int ncsi_kick_channels(struct ncsi_dev_priv *ndp)
{
struct ncsi_dev *nd = &ndp->ndev;
@@ -1691,6 +1681,8 @@ struct ncsi_dev *ncsi_register_dev(struct net_device *dev,
{
struct ncsi_dev_priv *ndp;
struct ncsi_dev *nd;
+ struct platform_device *pdev;
+ struct device_node *np;
unsigned long flags;
int i;
@@ -1725,11 +1717,6 @@ struct ncsi_dev *ncsi_register_dev(struct net_device *dev,
}
spin_lock_irqsave(&ncsi_dev_lock, flags);
-#if IS_ENABLED(CONFIG_IPV6)
- ndp->inet6_addr_num = 0;
- if (list_empty(&ncsi_dev_list))
- register_inet6addr_notifier(&ncsi_inet6addr_notifier);
-#endif
list_add_tail_rcu(&ndp->node, &ncsi_dev_list);
spin_unlock_irqrestore(&ncsi_dev_lock, flags);
@@ -1742,6 +1729,13 @@ struct ncsi_dev *ncsi_register_dev(struct net_device *dev,
/* Set up generic netlink interface */
ncsi_init_netlink(dev);
+ pdev = to_platform_device(dev->dev.parent);
+ if (pdev) {
+ np = pdev->dev.of_node;
+ if (np && of_get_property(np, "mlx,multi-host", NULL))
+ ndp->mlx_multi_host = true;
+ }
+
return nd;
}
EXPORT_SYMBOL_GPL(ncsi_register_dev);
@@ -1896,10 +1890,6 @@ void ncsi_unregister_dev(struct ncsi_dev *nd)
spin_lock_irqsave(&ncsi_dev_lock, flags);
list_del_rcu(&ndp->node);
-#if IS_ENABLED(CONFIG_IPV6)
- if (list_empty(&ncsi_dev_list))
- unregister_inet6addr_notifier(&ncsi_inet6addr_notifier);
-#endif
spin_unlock_irqrestore(&ncsi_dev_lock, flags);
ncsi_unregister_netlink(nd->dev);
diff --git a/net/ncsi/ncsi-pkt.h b/net/ncsi/ncsi-pkt.h
index a8e9def593f2..80938b338fee 100644
--- a/net/ncsi/ncsi-pkt.h
+++ b/net/ncsi/ncsi-pkt.h
@@ -387,6 +387,9 @@ struct ncsi_aen_hncdsc_pkt {
#define NCSI_PKT_CMD_OEM 0x50 /* OEM */
#define NCSI_PKT_CMD_PLDM 0x51 /* PLDM request over NCSI over RBT */
#define NCSI_PKT_CMD_GPUUID 0x52 /* Get package UUID */
+#define NCSI_PKT_CMD_QPNPR 0x56 /* Query Pending NC PLDM request */
+#define NCSI_PKT_CMD_SNPR 0x57 /* Send NC PLDM Reply */
+
/* NCSI packet responses */
#define NCSI_PKT_RSP_CIS (NCSI_PKT_CMD_CIS + 0x80)
@@ -419,6 +422,8 @@ struct ncsi_aen_hncdsc_pkt {
#define NCSI_PKT_RSP_OEM (NCSI_PKT_CMD_OEM + 0x80)
#define NCSI_PKT_RSP_PLDM (NCSI_PKT_CMD_PLDM + 0x80)
#define NCSI_PKT_RSP_GPUUID (NCSI_PKT_CMD_GPUUID + 0x80)
+#define NCSI_PKT_RSP_QPNPR (NCSI_PKT_CMD_QPNPR + 0x80)
+#define NCSI_PKT_RSP_SNPR (NCSI_PKT_CMD_SNPR + 0x80)
/* NCSI response code/reason */
#define NCSI_PKT_RSP_C_COMPLETED 0x0000 /* Command Completed */
diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c
index 7581bf919885..a94bb59793f0 100644
--- a/net/ncsi/ncsi-rsp.c
+++ b/net/ncsi/ncsi-rsp.c
@@ -47,7 +47,8 @@ static int ncsi_validate_rsp_pkt(struct ncsi_request *nr,
if (ntohs(h->code) != NCSI_PKT_RSP_C_COMPLETED ||
ntohs(h->reason) != NCSI_PKT_RSP_R_NO_ERROR) {
netdev_dbg(nr->ndp->ndev.dev,
- "NCSI: non zero response/reason code\n");
+ "NCSI: non zero response/reason code %04xh, %04xh\n",
+ ntohs(h->code), ntohs(h->reason));
return -EPERM;
}
@@ -55,7 +56,7 @@ static int ncsi_validate_rsp_pkt(struct ncsi_request *nr,
* sender doesn't support checksum according to NCSI
* specification.
*/
- pchecksum = (__be32 *)((void *)(h + 1) + payload - 4);
+ pchecksum = (__be32 *)((void *)(h + 1) + ALIGN(payload, 4) - 4);
if (ntohl(*pchecksum) == 0)
return 0;
@@ -63,7 +64,9 @@ static int ncsi_validate_rsp_pkt(struct ncsi_request *nr,
sizeof(*h) + payload - 4);
if (*pchecksum != htonl(checksum)) {
- netdev_dbg(nr->ndp->ndev.dev, "NCSI: checksum mismatched\n");
+ netdev_dbg(nr->ndp->ndev.dev,
+ "NCSI: checksum mismatched; recd: %08x calc: %08x\n",
+ *pchecksum, htonl(checksum));
return -EINVAL;
}
@@ -624,6 +627,9 @@ static int ncsi_rsp_handler_oem_mlx_gma(struct ncsi_request *nr)
saddr.sa_family = ndev->type;
ndev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
memcpy(saddr.sa_data, &rsp->data[MLX_MAC_ADDR_OFFSET], ETH_ALEN);
+ /* Set the flag for GMA command which should only be called once */
+ ndp->gma_flag = 1;
+
ret = ops->ndo_set_mac_address(ndev, &saddr);
if (ret < 0)
netdev_warn(ndev, "NCSI: 'Writing mac address to device failed\n");
@@ -668,6 +674,9 @@ static int ncsi_rsp_handler_oem_bcm_gma(struct ncsi_request *nr)
if (!is_valid_ether_addr((const u8 *)saddr.sa_data))
return -ENXIO;
+ /* Set the flag for GMA command which should only be called once */
+ ndp->gma_flag = 1;
+
ret = ops->ndo_set_mac_address(ndev, &saddr);
if (ret < 0)
netdev_warn(ndev, "NCSI: 'Writing mac address to device failed\n");
@@ -1035,6 +1044,11 @@ static int ncsi_rsp_handler_gpuuid(struct ncsi_request *nr)
return 0;
}
+static int ncsi_rsp_handler_pldm(struct ncsi_request *nr)
+{
+ return 0;
+}
+
static int ncsi_rsp_handler_netlink(struct ncsi_request *nr)
{
struct ncsi_dev_priv *ndp = nr->ndp;
@@ -1083,13 +1097,15 @@ static struct ncsi_rsp_handler {
{ NCSI_PKT_RSP_GVI, 40, ncsi_rsp_handler_gvi },
{ NCSI_PKT_RSP_GC, 32, ncsi_rsp_handler_gc },
{ NCSI_PKT_RSP_GP, -1, ncsi_rsp_handler_gp },
- { NCSI_PKT_RSP_GCPS, 172, ncsi_rsp_handler_gcps },
- { NCSI_PKT_RSP_GNS, 172, ncsi_rsp_handler_gns },
- { NCSI_PKT_RSP_GNPTS, 172, ncsi_rsp_handler_gnpts },
+ { NCSI_PKT_RSP_GCPS, 204, ncsi_rsp_handler_gcps },
+ { NCSI_PKT_RSP_GNS, 32, ncsi_rsp_handler_gns },
+ { NCSI_PKT_RSP_GNPTS, 48, ncsi_rsp_handler_gnpts },
{ NCSI_PKT_RSP_GPS, 8, ncsi_rsp_handler_gps },
{ NCSI_PKT_RSP_OEM, -1, ncsi_rsp_handler_oem },
- { NCSI_PKT_RSP_PLDM, 0, NULL },
- { NCSI_PKT_RSP_GPUUID, 20, ncsi_rsp_handler_gpuuid }
+ { NCSI_PKT_RSP_PLDM, -1, ncsi_rsp_handler_pldm },
+ { NCSI_PKT_RSP_GPUUID, 20, ncsi_rsp_handler_gpuuid },
+ { NCSI_PKT_RSP_QPNPR, -1, ncsi_rsp_handler_pldm },
+ { NCSI_PKT_RSP_SNPR, -1, ncsi_rsp_handler_pldm }
};
int ncsi_rcv_rsp(struct sk_buff *skb, struct net_device *dev,
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 0d65f4d39494..91efae88e8c2 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -20,7 +20,7 @@ config NETFILTER_FAMILY_ARP
bool
config NETFILTER_NETLINK_ACCT
-tristate "Netfilter NFACCT over NFNETLINK interface"
+ tristate "Netfilter NFACCT over NFNETLINK interface"
depends on NETFILTER_ADVANCED
select NETFILTER_NETLINK
help
@@ -34,7 +34,7 @@ config NETFILTER_NETLINK_QUEUE
help
If this option is enabled, the kernel will include support
for queueing packets via NFNETLINK.
-
+
config NETFILTER_NETLINK_LOG
tristate "Netfilter LOG over NFNETLINK interface"
default m if NETFILTER_ADVANCED=n
@@ -697,7 +697,7 @@ config NF_FLOW_TABLE_INET
tristate "Netfilter flow table mixed IPv4/IPv6 module"
depends on NF_FLOW_TABLE
help
- This option adds the flow table mixed IPv4/IPv6 support.
+ This option adds the flow table mixed IPv4/IPv6 support.
To compile it as a module, choose M here.
@@ -1502,7 +1502,7 @@ config NETFILTER_XT_MATCH_REALM
This option adds a `realm' match, which allows you to use the realm
key from the routing subsystem inside iptables.
- This match pretty much resembles the CONFIG_NET_CLS_ROUTE4 option
+ This match pretty much resembles the CONFIG_NET_CLS_ROUTE4 option
in tc world.
If you want to compile it as a module, say M here and read
@@ -1523,7 +1523,7 @@ config NETFILTER_XT_MATCH_SCTP
depends on NETFILTER_ADVANCED
default IP_SCTP
help
- With this option enabled, you will be able to use the
+ With this option enabled, you will be able to use the
`sctp' match in order to match on SCTP source/destination ports
and SCTP chunk types.
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 9270a7fae484..3f572e5a975e 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -81,7 +81,8 @@ nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \
nft_chain_route.o nf_tables_offload.o
nf_tables_set-objs := nf_tables_set_core.o \
- nft_set_hash.o nft_set_bitmap.o nft_set_rbtree.o
+ nft_set_hash.o nft_set_bitmap.o nft_set_rbtree.o \
+ nft_set_pipapo.o
obj-$(CONFIG_NF_TABLES) += nf_tables.o
obj-$(CONFIG_NF_TABLES_SET) += nf_tables_set.o
@@ -120,11 +121,12 @@ obj-$(CONFIG_NFT_FWD_NETDEV) += nft_fwd_netdev.o
# flow table infrastructure
obj-$(CONFIG_NF_FLOW_TABLE) += nf_flow_table.o
-nf_flow_table-objs := nf_flow_table_core.o nf_flow_table_ip.o
+nf_flow_table-objs := nf_flow_table_core.o nf_flow_table_ip.o \
+ nf_flow_table_offload.o
obj-$(CONFIG_NF_FLOW_TABLE_INET) += nf_flow_table_inet.o
-# generic X tables
+# generic X tables
obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o
# combos
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 5d5bdf450091..78f046ec506f 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -536,6 +536,26 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state,
}
EXPORT_SYMBOL(nf_hook_slow);
+void nf_hook_slow_list(struct list_head *head, struct nf_hook_state *state,
+ const struct nf_hook_entries *e)
+{
+ struct sk_buff *skb, *next;
+ struct list_head sublist;
+ int ret;
+
+ INIT_LIST_HEAD(&sublist);
+
+ list_for_each_entry_safe(skb, next, head, list) {
+ skb_list_del_init(skb);
+ ret = nf_hook_slow(skb, state, e, 0);
+ if (ret == 1)
+ list_add_tail(&skb->list, &sublist);
+ }
+ /* Put passed packets back on main list */
+ list_splice(&sublist, head);
+}
+EXPORT_SYMBOL(nf_hook_slow_list);
+
/* This needs to be compiled in any case to avoid dependencies between the
* nfnetlink_queue code and nf_conntrack.
*/
diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h
index 063df74b4647..26ab0e9612d8 100644
--- a/net/netfilter/ipset/ip_set_bitmap_gen.h
+++ b/net/netfilter/ipset/ip_set_bitmap_gen.h
@@ -60,9 +60,9 @@ mtype_destroy(struct ip_set *set)
if (SET_WITH_TIMEOUT(set))
del_timer_sync(&map->gc);
- ip_set_free(map->members);
if (set->dsize && set->extensions & IPSET_EXT_DESTROY)
mtype_ext_cleanup(set);
+ ip_set_free(map->members);
ip_set_free(map);
set->data = NULL;
@@ -75,7 +75,7 @@ mtype_flush(struct ip_set *set)
if (set->extensions & IPSET_EXT_DESTROY)
mtype_ext_cleanup(set);
- memset(map->members, 0, map->memsize);
+ bitmap_zero(map->members, map->elements);
set->elements = 0;
set->ext_size = 0;
}
@@ -192,7 +192,7 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
}
#ifndef IP_SET_BITMAP_STORED_TIMEOUT
-static inline bool
+static bool
mtype_is_filled(const struct mtype_elem *x)
{
return true;
diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c
index 11ff9d4a7006..0a2196f59106 100644
--- a/net/netfilter/ipset/ip_set_bitmap_ip.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ip.c
@@ -37,7 +37,7 @@ MODULE_ALIAS("ip_set_bitmap:ip");
/* Type structure */
struct bitmap_ip {
- void *members; /* the set members */
+ unsigned long *members; /* the set members */
u32 first_ip; /* host byte order, included in range */
u32 last_ip; /* host byte order, included in range */
u32 elements; /* number of max elements in the set */
@@ -55,7 +55,7 @@ struct bitmap_ip_adt_elem {
u16 id;
};
-static inline u32
+static u32
ip_to_id(const struct bitmap_ip *m, u32 ip)
{
return ((ip & ip_set_hostmask(m->netmask)) - m->first_ip) / m->hosts;
@@ -63,33 +63,33 @@ ip_to_id(const struct bitmap_ip *m, u32 ip)
/* Common functions */
-static inline int
+static int
bitmap_ip_do_test(const struct bitmap_ip_adt_elem *e,
struct bitmap_ip *map, size_t dsize)
{
return !!test_bit(e->id, map->members);
}
-static inline int
+static int
bitmap_ip_gc_test(u16 id, const struct bitmap_ip *map, size_t dsize)
{
return !!test_bit(id, map->members);
}
-static inline int
+static int
bitmap_ip_do_add(const struct bitmap_ip_adt_elem *e, struct bitmap_ip *map,
u32 flags, size_t dsize)
{
return !!test_bit(e->id, map->members);
}
-static inline int
+static int
bitmap_ip_do_del(const struct bitmap_ip_adt_elem *e, struct bitmap_ip *map)
{
return !test_and_clear_bit(e->id, map->members);
}
-static inline int
+static int
bitmap_ip_do_list(struct sk_buff *skb, const struct bitmap_ip *map, u32 id,
size_t dsize)
{
@@ -97,7 +97,7 @@ bitmap_ip_do_list(struct sk_buff *skb, const struct bitmap_ip *map, u32 id,
htonl(map->first_ip + id * map->hosts));
}
-static inline int
+static int
bitmap_ip_do_head(struct sk_buff *skb, const struct bitmap_ip *map)
{
return nla_put_ipaddr4(skb, IPSET_ATTR_IP, htonl(map->first_ip)) ||
@@ -220,7 +220,7 @@ init_map_ip(struct ip_set *set, struct bitmap_ip *map,
u32 first_ip, u32 last_ip,
u32 elements, u32 hosts, u8 netmask)
{
- map->members = ip_set_alloc(map->memsize);
+ map->members = bitmap_zalloc(elements, GFP_KERNEL | __GFP_NOWARN);
if (!map->members)
return false;
map->first_ip = first_ip;
@@ -237,6 +237,18 @@ init_map_ip(struct ip_set *set, struct bitmap_ip *map,
return true;
}
+static u32
+range_to_mask(u32 from, u32 to, u8 *bits)
+{
+ u32 mask = 0xFFFFFFFE;
+
+ *bits = 32;
+ while (--(*bits) > 0 && mask && (to & mask) != from)
+ mask <<= 1;
+
+ return mask;
+}
+
static int
bitmap_ip_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
u32 flags)
@@ -310,7 +322,7 @@ bitmap_ip_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
if (!map)
return -ENOMEM;
- map->memsize = bitmap_bytes(0, elements - 1);
+ map->memsize = BITS_TO_LONGS(elements) * sizeof(unsigned long);
set->variant = &bitmap_ip;
if (!init_map_ip(set, map, first_ip, last_ip,
elements, hosts, netmask)) {
diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
index 1d4e63326e68..739e343efaf6 100644
--- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
@@ -42,7 +42,7 @@ enum {
/* Type structure */
struct bitmap_ipmac {
- void *members; /* the set members */
+ unsigned long *members; /* the set members */
u32 first_ip; /* host byte order, included in range */
u32 last_ip; /* host byte order, included in range */
u32 elements; /* number of max elements in the set */
@@ -65,7 +65,7 @@ struct bitmap_ipmac_elem {
unsigned char filled;
} __aligned(__alignof__(u64));
-static inline u32
+static u32
ip_to_id(const struct bitmap_ipmac *m, u32 ip)
{
return ip - m->first_ip;
@@ -79,7 +79,7 @@ ip_to_id(const struct bitmap_ipmac *m, u32 ip)
/* Common functions */
-static inline int
+static int
bitmap_ipmac_do_test(const struct bitmap_ipmac_adt_elem *e,
const struct bitmap_ipmac *map, size_t dsize)
{
@@ -94,7 +94,7 @@ bitmap_ipmac_do_test(const struct bitmap_ipmac_adt_elem *e,
return -EAGAIN;
}
-static inline int
+static int
bitmap_ipmac_gc_test(u16 id, const struct bitmap_ipmac *map, size_t dsize)
{
const struct bitmap_ipmac_elem *elem;
@@ -106,13 +106,13 @@ bitmap_ipmac_gc_test(u16 id, const struct bitmap_ipmac *map, size_t dsize)
return elem->filled == MAC_FILLED;
}
-static inline int
+static int
bitmap_ipmac_is_filled(const struct bitmap_ipmac_elem *elem)
{
return elem->filled == MAC_FILLED;
}
-static inline int
+static int
bitmap_ipmac_add_timeout(unsigned long *timeout,
const struct bitmap_ipmac_adt_elem *e,
const struct ip_set_ext *ext, struct ip_set *set,
@@ -139,7 +139,7 @@ bitmap_ipmac_add_timeout(unsigned long *timeout,
return 0;
}
-static inline int
+static int
bitmap_ipmac_do_add(const struct bitmap_ipmac_adt_elem *e,
struct bitmap_ipmac *map, u32 flags, size_t dsize)
{
@@ -177,14 +177,14 @@ bitmap_ipmac_do_add(const struct bitmap_ipmac_adt_elem *e,
return IPSET_ADD_STORE_PLAIN_TIMEOUT;
}
-static inline int
+static int
bitmap_ipmac_do_del(const struct bitmap_ipmac_adt_elem *e,
struct bitmap_ipmac *map)
{
return !test_and_clear_bit(e->id, map->members);
}
-static inline int
+static int
bitmap_ipmac_do_list(struct sk_buff *skb, const struct bitmap_ipmac *map,
u32 id, size_t dsize)
{
@@ -197,7 +197,7 @@ bitmap_ipmac_do_list(struct sk_buff *skb, const struct bitmap_ipmac *map,
nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, elem->ether));
}
-static inline int
+static int
bitmap_ipmac_do_head(struct sk_buff *skb, const struct bitmap_ipmac *map)
{
return nla_put_ipaddr4(skb, IPSET_ATTR_IP, htonl(map->first_ip)) ||
@@ -299,7 +299,7 @@ static bool
init_map_ipmac(struct ip_set *set, struct bitmap_ipmac *map,
u32 first_ip, u32 last_ip, u32 elements)
{
- map->members = ip_set_alloc(map->memsize);
+ map->members = bitmap_zalloc(elements, GFP_KERNEL | __GFP_NOWARN);
if (!map->members)
return false;
map->first_ip = first_ip;
@@ -360,7 +360,7 @@ bitmap_ipmac_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
if (!map)
return -ENOMEM;
- map->memsize = bitmap_bytes(0, elements - 1);
+ map->memsize = BITS_TO_LONGS(elements) * sizeof(unsigned long);
set->variant = &bitmap_ipmac;
if (!init_map_ipmac(set, map, first_ip, last_ip, elements)) {
kfree(map);
diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c
index 704a0dda1609..b49978dd810d 100644
--- a/net/netfilter/ipset/ip_set_bitmap_port.c
+++ b/net/netfilter/ipset/ip_set_bitmap_port.c
@@ -30,7 +30,7 @@ MODULE_ALIAS("ip_set_bitmap:port");
/* Type structure */
struct bitmap_port {
- void *members; /* the set members */
+ unsigned long *members; /* the set members */
u16 first_port; /* host byte order, included in range */
u16 last_port; /* host byte order, included in range */
u32 elements; /* number of max elements in the set */
@@ -46,7 +46,7 @@ struct bitmap_port_adt_elem {
u16 id;
};
-static inline u16
+static u16
port_to_id(const struct bitmap_port *m, u16 port)
{
return port - m->first_port;
@@ -54,34 +54,34 @@ port_to_id(const struct bitmap_port *m, u16 port)
/* Common functions */
-static inline int
+static int
bitmap_port_do_test(const struct bitmap_port_adt_elem *e,
const struct bitmap_port *map, size_t dsize)
{
return !!test_bit(e->id, map->members);
}
-static inline int
+static int
bitmap_port_gc_test(u16 id, const struct bitmap_port *map, size_t dsize)
{
return !!test_bit(id, map->members);
}
-static inline int
+static int
bitmap_port_do_add(const struct bitmap_port_adt_elem *e,
struct bitmap_port *map, u32 flags, size_t dsize)
{
return !!test_bit(e->id, map->members);
}
-static inline int
+static int
bitmap_port_do_del(const struct bitmap_port_adt_elem *e,
struct bitmap_port *map)
{
return !test_and_clear_bit(e->id, map->members);
}
-static inline int
+static int
bitmap_port_do_list(struct sk_buff *skb, const struct bitmap_port *map, u32 id,
size_t dsize)
{
@@ -89,13 +89,40 @@ bitmap_port_do_list(struct sk_buff *skb, const struct bitmap_port *map, u32 id,
htons(map->first_port + id));
}
-static inline int
+static int
bitmap_port_do_head(struct sk_buff *skb, const struct bitmap_port *map)
{
return nla_put_net16(skb, IPSET_ATTR_PORT, htons(map->first_port)) ||
nla_put_net16(skb, IPSET_ATTR_PORT_TO, htons(map->last_port));
}
+static bool
+ip_set_get_ip_port(const struct sk_buff *skb, u8 pf, bool src, __be16 *port)
+{
+ bool ret;
+ u8 proto;
+
+ switch (pf) {
+ case NFPROTO_IPV4:
+ ret = ip_set_get_ip4_port(skb, src, port, &proto);
+ break;
+ case NFPROTO_IPV6:
+ ret = ip_set_get_ip6_port(skb, src, port, &proto);
+ break;
+ default:
+ return false;
+ }
+ if (!ret)
+ return ret;
+ switch (proto) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ return true;
+ default:
+ return false;
+ }
+}
+
static int
bitmap_port_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct xt_action_param *par,
@@ -204,7 +231,7 @@ static bool
init_map_port(struct ip_set *set, struct bitmap_port *map,
u16 first_port, u16 last_port)
{
- map->members = ip_set_alloc(map->memsize);
+ map->members = bitmap_zalloc(map->elements, GFP_KERNEL | __GFP_NOWARN);
if (!map->members)
return false;
map->first_port = first_port;
@@ -244,7 +271,7 @@ bitmap_port_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
return -ENOMEM;
map->elements = elements;
- map->memsize = bitmap_bytes(0, map->elements);
+ map->memsize = BITS_TO_LONGS(elements) * sizeof(unsigned long);
set->variant = &bitmap_port;
if (!init_map_port(set, map, first_port, last_port)) {
kfree(map);
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index e64d5f9a89dd..69c107f9ba8d 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -35,7 +35,7 @@ struct ip_set_net {
static unsigned int ip_set_net_id __read_mostly;
-static inline struct ip_set_net *ip_set_pernet(struct net *net)
+static struct ip_set_net *ip_set_pernet(struct net *net)
{
return net_generic(net, ip_set_net_id);
}
@@ -67,13 +67,13 @@ MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET);
* serialized by ip_set_type_mutex.
*/
-static inline void
+static void
ip_set_type_lock(void)
{
mutex_lock(&ip_set_type_mutex);
}
-static inline void
+static void
ip_set_type_unlock(void)
{
mutex_unlock(&ip_set_type_mutex);
@@ -277,7 +277,7 @@ ip_set_free(void *members)
}
EXPORT_SYMBOL_GPL(ip_set_free);
-static inline bool
+static bool
flag_nested(const struct nlattr *nla)
{
return nla->nla_type & NLA_F_NESTED;
@@ -296,7 +296,8 @@ ip_set_get_ipaddr4(struct nlattr *nla, __be32 *ipaddr)
if (unlikely(!flag_nested(nla)))
return -IPSET_ERR_PROTOCOL;
- if (nla_parse_nested_deprecated(tb, IPSET_ATTR_IPADDR_MAX, nla, ipaddr_policy, NULL))
+ if (nla_parse_nested(tb, IPSET_ATTR_IPADDR_MAX, nla,
+ ipaddr_policy, NULL))
return -IPSET_ERR_PROTOCOL;
if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_IPADDR_IPV4)))
return -IPSET_ERR_PROTOCOL;
@@ -314,7 +315,8 @@ ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr)
if (unlikely(!flag_nested(nla)))
return -IPSET_ERR_PROTOCOL;
- if (nla_parse_nested_deprecated(tb, IPSET_ATTR_IPADDR_MAX, nla, ipaddr_policy, NULL))
+ if (nla_parse_nested(tb, IPSET_ATTR_IPADDR_MAX, nla,
+ ipaddr_policy, NULL))
return -IPSET_ERR_PROTOCOL;
if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_IPADDR_IPV6)))
return -IPSET_ERR_PROTOCOL;
@@ -325,6 +327,83 @@ ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr)
}
EXPORT_SYMBOL_GPL(ip_set_get_ipaddr6);
+static u32
+ip_set_timeout_get(const unsigned long *timeout)
+{
+ u32 t;
+
+ if (*timeout == IPSET_ELEM_PERMANENT)
+ return 0;
+
+ t = jiffies_to_msecs(*timeout - jiffies) / MSEC_PER_SEC;
+ /* Zero value in userspace means no timeout */
+ return t == 0 ? 1 : t;
+}
+
+static char *
+ip_set_comment_uget(struct nlattr *tb)
+{
+ return nla_data(tb);
+}
+
+/* Called from uadd only, protected by the set spinlock.
+ * The kadt functions don't use the comment extensions in any way.
+ */
+void
+ip_set_init_comment(struct ip_set *set, struct ip_set_comment *comment,
+ const struct ip_set_ext *ext)
+{
+ struct ip_set_comment_rcu *c = rcu_dereference_protected(comment->c, 1);
+ size_t len = ext->comment ? strlen(ext->comment) : 0;
+
+ if (unlikely(c)) {
+ set->ext_size -= sizeof(*c) + strlen(c->str) + 1;
+ kfree_rcu(c, rcu);
+ rcu_assign_pointer(comment->c, NULL);
+ }
+ if (!len)
+ return;
+ if (unlikely(len > IPSET_MAX_COMMENT_SIZE))
+ len = IPSET_MAX_COMMENT_SIZE;
+ c = kmalloc(sizeof(*c) + len + 1, GFP_ATOMIC);
+ if (unlikely(!c))
+ return;
+ strlcpy(c->str, ext->comment, len + 1);
+ set->ext_size += sizeof(*c) + strlen(c->str) + 1;
+ rcu_assign_pointer(comment->c, c);
+}
+EXPORT_SYMBOL_GPL(ip_set_init_comment);
+
+/* Used only when dumping a set, protected by rcu_read_lock() */
+static int
+ip_set_put_comment(struct sk_buff *skb, const struct ip_set_comment *comment)
+{
+ struct ip_set_comment_rcu *c = rcu_dereference(comment->c);
+
+ if (!c)
+ return 0;
+ return nla_put_string(skb, IPSET_ATTR_COMMENT, c->str);
+}
+
+/* Called from uadd/udel, flush or the garbage collectors protected
+ * by the set spinlock.
+ * Called when the set is destroyed and when there can't be any user
+ * of the set data anymore.
+ */
+static void
+ip_set_comment_free(struct ip_set *set, void *ptr)
+{
+ struct ip_set_comment *comment = ptr;
+ struct ip_set_comment_rcu *c;
+
+ c = rcu_dereference_protected(comment->c, 1);
+ if (unlikely(!c))
+ return;
+ set->ext_size -= sizeof(*c) + strlen(c->str) + 1;
+ kfree_rcu(c, rcu);
+ rcu_assign_pointer(comment->c, NULL);
+}
+
typedef void (*destroyer)(struct ip_set *, void *);
/* ipset data extension types, in size order */
@@ -351,12 +430,12 @@ const struct ip_set_ext_type ip_set_extensions[] = {
.flag = IPSET_FLAG_WITH_COMMENT,
.len = sizeof(struct ip_set_comment),
.align = __alignof__(struct ip_set_comment),
- .destroy = (destroyer) ip_set_comment_free,
+ .destroy = ip_set_comment_free,
},
};
EXPORT_SYMBOL_GPL(ip_set_extensions);
-static inline bool
+static bool
add_extension(enum ip_set_ext_id id, u32 flags, struct nlattr *tb[])
{
return ip_set_extensions[id].flag ?
@@ -446,6 +525,46 @@ ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[],
}
EXPORT_SYMBOL_GPL(ip_set_get_extensions);
+static u64
+ip_set_get_bytes(const struct ip_set_counter *counter)
+{
+ return (u64)atomic64_read(&(counter)->bytes);
+}
+
+static u64
+ip_set_get_packets(const struct ip_set_counter *counter)
+{
+ return (u64)atomic64_read(&(counter)->packets);
+}
+
+static bool
+ip_set_put_counter(struct sk_buff *skb, const struct ip_set_counter *counter)
+{
+ return nla_put_net64(skb, IPSET_ATTR_BYTES,
+ cpu_to_be64(ip_set_get_bytes(counter)),
+ IPSET_ATTR_PAD) ||
+ nla_put_net64(skb, IPSET_ATTR_PACKETS,
+ cpu_to_be64(ip_set_get_packets(counter)),
+ IPSET_ATTR_PAD);
+}
+
+static bool
+ip_set_put_skbinfo(struct sk_buff *skb, const struct ip_set_skbinfo *skbinfo)
+{
+ /* Send nonzero parameters only */
+ return ((skbinfo->skbmark || skbinfo->skbmarkmask) &&
+ nla_put_net64(skb, IPSET_ATTR_SKBMARK,
+ cpu_to_be64((u64)skbinfo->skbmark << 32 |
+ skbinfo->skbmarkmask),
+ IPSET_ATTR_PAD)) ||
+ (skbinfo->skbprio &&
+ nla_put_net32(skb, IPSET_ATTR_SKBPRIO,
+ cpu_to_be32(skbinfo->skbprio))) ||
+ (skbinfo->skbqueue &&
+ nla_put_net16(skb, IPSET_ATTR_SKBQUEUE,
+ cpu_to_be16(skbinfo->skbqueue)));
+}
+
int
ip_set_put_extensions(struct sk_buff *skb, const struct ip_set *set,
const void *e, bool active)
@@ -471,6 +590,55 @@ ip_set_put_extensions(struct sk_buff *skb, const struct ip_set *set,
}
EXPORT_SYMBOL_GPL(ip_set_put_extensions);
+static bool
+ip_set_match_counter(u64 counter, u64 match, u8 op)
+{
+ switch (op) {
+ case IPSET_COUNTER_NONE:
+ return true;
+ case IPSET_COUNTER_EQ:
+ return counter == match;
+ case IPSET_COUNTER_NE:
+ return counter != match;
+ case IPSET_COUNTER_LT:
+ return counter < match;
+ case IPSET_COUNTER_GT:
+ return counter > match;
+ }
+ return false;
+}
+
+static void
+ip_set_add_bytes(u64 bytes, struct ip_set_counter *counter)
+{
+ atomic64_add((long long)bytes, &(counter)->bytes);
+}
+
+static void
+ip_set_add_packets(u64 packets, struct ip_set_counter *counter)
+{
+ atomic64_add((long long)packets, &(counter)->packets);
+}
+
+static void
+ip_set_update_counter(struct ip_set_counter *counter,
+ const struct ip_set_ext *ext, u32 flags)
+{
+ if (ext->packets != ULLONG_MAX &&
+ !(flags & IPSET_FLAG_SKIP_COUNTER_UPDATE)) {
+ ip_set_add_bytes(ext->bytes, counter);
+ ip_set_add_packets(ext->packets, counter);
+ }
+}
+
+static void
+ip_set_get_skbinfo(struct ip_set_skbinfo *skbinfo,
+ const struct ip_set_ext *ext,
+ struct ip_set_ext *mext, u32 flags)
+{
+ mext->skbinfo = *skbinfo;
+}
+
bool
ip_set_match_extensions(struct ip_set *set, const struct ip_set_ext *ext,
struct ip_set_ext *mext, u32 flags, void *data)
@@ -506,7 +674,7 @@ EXPORT_SYMBOL_GPL(ip_set_match_extensions);
* The set behind an index may change by swapping only, from userspace.
*/
-static inline void
+static void
__ip_set_get(struct ip_set *set)
{
write_lock_bh(&ip_set_ref_lock);
@@ -514,7 +682,7 @@ __ip_set_get(struct ip_set *set)
write_unlock_bh(&ip_set_ref_lock);
}
-static inline void
+static void
__ip_set_put(struct ip_set *set)
{
write_lock_bh(&ip_set_ref_lock);
@@ -526,7 +694,7 @@ __ip_set_put(struct ip_set *set)
/* set->ref can be swapped out by ip_set_swap, netlink events (like dump) need
* a separate reference counter
*/
-static inline void
+static void
__ip_set_put_netlink(struct ip_set *set)
{
write_lock_bh(&ip_set_ref_lock);
@@ -541,7 +709,7 @@ __ip_set_put_netlink(struct ip_set *set)
* so it can't be destroyed (or changed) under our foot.
*/
-static inline struct ip_set *
+static struct ip_set *
ip_set_rcu_get(struct net *net, ip_set_id_t index)
{
struct ip_set *set;
@@ -670,7 +838,7 @@ EXPORT_SYMBOL_GPL(ip_set_get_byname);
*
*/
-static inline void
+static void
__ip_set_put_byindex(struct ip_set_net *inst, ip_set_id_t index)
{
struct ip_set *set;
@@ -934,7 +1102,8 @@ static int ip_set_create(struct net *net, struct sock *ctnl,
/* Without holding any locks, create private part. */
if (attr[IPSET_ATTR_DATA] &&
- nla_parse_nested_deprecated(tb, IPSET_ATTR_CREATE_MAX, attr[IPSET_ATTR_DATA], set->type->create_policy, NULL)) {
+ nla_parse_nested(tb, IPSET_ATTR_CREATE_MAX, attr[IPSET_ATTR_DATA],
+ set->type->create_policy, NULL)) {
ret = -IPSET_ERR_PROTOCOL;
goto put_out;
}
@@ -1252,6 +1421,30 @@ static int ip_set_swap(struct net *net, struct sock *ctnl, struct sk_buff *skb,
#define DUMP_TYPE(arg) (((u32)(arg)) & 0x0000FFFF)
#define DUMP_FLAGS(arg) (((u32)(arg)) >> 16)
+int
+ip_set_put_flags(struct sk_buff *skb, struct ip_set *set)
+{
+ u32 cadt_flags = 0;
+
+ if (SET_WITH_TIMEOUT(set))
+ if (unlikely(nla_put_net32(skb, IPSET_ATTR_TIMEOUT,
+ htonl(set->timeout))))
+ return -EMSGSIZE;
+ if (SET_WITH_COUNTER(set))
+ cadt_flags |= IPSET_FLAG_WITH_COUNTERS;
+ if (SET_WITH_COMMENT(set))
+ cadt_flags |= IPSET_FLAG_WITH_COMMENT;
+ if (SET_WITH_SKBINFO(set))
+ cadt_flags |= IPSET_FLAG_WITH_SKBINFO;
+ if (SET_WITH_FORCEADD(set))
+ cadt_flags |= IPSET_FLAG_WITH_FORCEADD;
+
+ if (!cadt_flags)
+ return 0;
+ return nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(cadt_flags));
+}
+EXPORT_SYMBOL_GPL(ip_set_put_flags);
+
static int
ip_set_dump_done(struct netlink_callback *cb)
{
@@ -1281,32 +1474,43 @@ dump_attrs(struct nlmsghdr *nlh)
}
}
+static const struct nla_policy
+ip_set_dump_policy[IPSET_ATTR_CMD_MAX + 1] = {
+ [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 },
+ [IPSET_ATTR_SETNAME] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAXNAMELEN - 1 },
+ [IPSET_ATTR_FLAGS] = { .type = NLA_U32 },
+};
+
static int
-dump_init(struct netlink_callback *cb, struct ip_set_net *inst)
+ip_set_dump_start(struct netlink_callback *cb)
{
struct nlmsghdr *nlh = nlmsg_hdr(cb->skb);
int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
struct nlattr *cda[IPSET_ATTR_CMD_MAX + 1];
struct nlattr *attr = (void *)nlh + min_len;
+ struct sk_buff *skb = cb->skb;
+ struct ip_set_net *inst = ip_set_pernet(sock_net(skb->sk));
u32 dump_type;
- ip_set_id_t index;
int ret;
- ret = nla_parse_deprecated(cda, IPSET_ATTR_CMD_MAX, attr,
- nlh->nlmsg_len - min_len,
- ip_set_setname_policy, NULL);
+ ret = nla_parse(cda, IPSET_ATTR_CMD_MAX, attr,
+ nlh->nlmsg_len - min_len,
+ ip_set_dump_policy, NULL);
if (ret)
- return ret;
+ goto error;
cb->args[IPSET_CB_PROTO] = nla_get_u8(cda[IPSET_ATTR_PROTOCOL]);
if (cda[IPSET_ATTR_SETNAME]) {
+ ip_set_id_t index;
struct ip_set *set;
set = find_set_and_id(inst, nla_data(cda[IPSET_ATTR_SETNAME]),
&index);
- if (!set)
- return -ENOENT;
-
+ if (!set) {
+ ret = -ENOENT;
+ goto error;
+ }
dump_type = DUMP_ONE;
cb->args[IPSET_CB_INDEX] = index;
} else {
@@ -1322,10 +1526,17 @@ dump_init(struct netlink_callback *cb, struct ip_set_net *inst)
cb->args[IPSET_CB_DUMP] = dump_type;
return 0;
+
+error:
+ /* We have to create and send the error message manually :-( */
+ if (nlh->nlmsg_flags & NLM_F_ACK) {
+ netlink_ack(cb->skb, nlh, ret, NULL);
+ }
+ return ret;
}
static int
-ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb)
+ip_set_dump_do(struct sk_buff *skb, struct netlink_callback *cb)
{
ip_set_id_t index = IPSET_INVALID_ID, max;
struct ip_set *set = NULL;
@@ -1336,18 +1547,8 @@ ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb)
bool is_destroyed;
int ret = 0;
- if (!cb->args[IPSET_CB_DUMP]) {
- ret = dump_init(cb, inst);
- if (ret < 0) {
- nlh = nlmsg_hdr(cb->skb);
- /* We have to create and send the error message
- * manually :-(
- */
- if (nlh->nlmsg_flags & NLM_F_ACK)
- netlink_ack(cb->skb, nlh, ret, NULL);
- return ret;
- }
- }
+ if (!cb->args[IPSET_CB_DUMP])
+ return -EINVAL;
if (cb->args[IPSET_CB_INDEX] >= inst->ip_set_max)
goto out;
@@ -1483,7 +1684,8 @@ static int ip_set_dump(struct net *net, struct sock *ctnl, struct sk_buff *skb,
{
struct netlink_dump_control c = {
- .dump = ip_set_dump_start,
+ .start = ip_set_dump_start,
+ .dump = ip_set_dump_do,
.done = ip_set_dump_done,
};
return netlink_dump_start(ctnl, skb, nlh, &c);
@@ -1543,9 +1745,9 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set,
memcpy(&errmsg->msg, nlh, nlh->nlmsg_len);
cmdattr = (void *)&errmsg->msg + min_len;
- ret = nla_parse_deprecated(cda, IPSET_ATTR_CMD_MAX, cmdattr,
- nlh->nlmsg_len - min_len,
- ip_set_adt_policy, NULL);
+ ret = nla_parse(cda, IPSET_ATTR_CMD_MAX, cmdattr,
+ nlh->nlmsg_len - min_len, ip_set_adt_policy,
+ NULL);
if (ret) {
nlmsg_free(skb2);
@@ -1596,7 +1798,9 @@ static int ip_set_ad(struct net *net, struct sock *ctnl,
use_lineno = !!attr[IPSET_ATTR_LINENO];
if (attr[IPSET_ATTR_DATA]) {
- if (nla_parse_nested_deprecated(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA], set->type->adt_policy, NULL))
+ if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX,
+ attr[IPSET_ATTR_DATA],
+ set->type->adt_policy, NULL))
return -IPSET_ERR_PROTOCOL;
ret = call_ad(ctnl, skb, set, tb, adt, flags,
use_lineno);
@@ -1606,7 +1810,8 @@ static int ip_set_ad(struct net *net, struct sock *ctnl,
nla_for_each_nested(nla, attr[IPSET_ATTR_ADT], nla_rem) {
if (nla_type(nla) != IPSET_ATTR_DATA ||
!flag_nested(nla) ||
- nla_parse_nested_deprecated(tb, IPSET_ATTR_ADT_MAX, nla, set->type->adt_policy, NULL))
+ nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, nla,
+ set->type->adt_policy, NULL))
return -IPSET_ERR_PROTOCOL;
ret = call_ad(ctnl, skb, set, tb, adt,
flags, use_lineno);
@@ -1644,6 +1849,7 @@ static int ip_set_utest(struct net *net, struct sock *ctnl, struct sk_buff *skb,
struct ip_set *set;
struct nlattr *tb[IPSET_ATTR_ADT_MAX + 1] = {};
int ret = 0;
+ u32 lineno;
if (unlikely(protocol_min_failed(attr) ||
!attr[IPSET_ATTR_SETNAME] ||
@@ -1655,11 +1861,12 @@ static int ip_set_utest(struct net *net, struct sock *ctnl, struct sk_buff *skb,
if (!set)
return -ENOENT;
- if (nla_parse_nested_deprecated(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA], set->type->adt_policy, NULL))
+ if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA],
+ set->type->adt_policy, NULL))
return -IPSET_ERR_PROTOCOL;
rcu_read_lock_bh();
- ret = set->variant->uadt(set, tb, IPSET_TEST, NULL, 0, 0);
+ ret = set->variant->uadt(set, tb, IPSET_TEST, &lineno, 0, 0);
rcu_read_unlock_bh();
/* Userspace can't trigger element to be re-added */
if (ret == -EAGAIN)
@@ -1961,7 +2168,7 @@ static const struct nfnl_callback ip_set_netlink_subsys_cb[IPSET_MSG_MAX] = {
[IPSET_CMD_LIST] = {
.call = ip_set_dump,
.attr_count = IPSET_ATTR_CMD_MAX,
- .policy = ip_set_setname_policy,
+ .policy = ip_set_dump_policy,
},
[IPSET_CMD_SAVE] = {
.call = ip_set_dump,
@@ -2069,8 +2276,9 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len)
}
req_version->version = IPSET_PROTOCOL;
- ret = copy_to_user(user, req_version,
- sizeof(struct ip_set_req_version));
+ if (copy_to_user(user, req_version,
+ sizeof(struct ip_set_req_version)))
+ ret = -EFAULT;
goto done;
}
case IP_SET_OP_GET_BYNAME: {
@@ -2129,7 +2337,8 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len)
} /* end of switch(op) */
copy:
- ret = copy_to_user(user, data, copylen);
+ if (copy_to_user(user, data, copylen))
+ ret = -EFAULT;
done:
vfree(data);
diff --git a/net/netfilter/ipset/ip_set_getport.c b/net/netfilter/ipset/ip_set_getport.c
index 2b8f959574b4..36615eb3eae1 100644
--- a/net/netfilter/ipset/ip_set_getport.c
+++ b/net/netfilter/ipset/ip_set_getport.c
@@ -148,31 +148,3 @@ ip_set_get_ip6_port(const struct sk_buff *skb, bool src,
}
EXPORT_SYMBOL_GPL(ip_set_get_ip6_port);
#endif
-
-bool
-ip_set_get_ip_port(const struct sk_buff *skb, u8 pf, bool src, __be16 *port)
-{
- bool ret;
- u8 proto;
-
- switch (pf) {
- case NFPROTO_IPV4:
- ret = ip_set_get_ip4_port(skb, src, port, &proto);
- break;
- case NFPROTO_IPV6:
- ret = ip_set_get_ip6_port(skb, src, port, &proto);
- break;
- default:
- return false;
- }
- if (!ret)
- return ret;
- switch (proto) {
- case IPPROTO_TCP:
- case IPPROTO_UDP:
- return true;
- default:
- return false;
- }
-}
-EXPORT_SYMBOL_GPL(ip_set_get_ip_port);
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index 0feb77fa9edc..7480ce55b5c8 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -7,7 +7,7 @@
#include <linux/rcupdate.h>
#include <linux/jhash.h>
#include <linux/types.h>
-#include <linux/netfilter/ipset/ip_set_timeout.h>
+#include <linux/netfilter/ipset/ip_set.h>
#define __ipset_dereference_protected(p, c) rcu_dereference_protected(p, c)
#define ipset_dereference_protected(p, set) \
@@ -39,7 +39,7 @@
#ifdef IP_SET_HASH_WITH_MULTI
#define AHASH_MAX(h) ((h)->ahash_max)
-static inline u8
+static u8
tune_ahash_max(u8 curr, u32 multi)
{
u32 n;
@@ -909,7 +909,7 @@ out:
return ret;
}
-static inline int
+static int
mtype_data_match(struct mtype_elem *data, const struct ip_set_ext *ext,
struct ip_set_ext *mext, struct ip_set *set, u32 flags)
{
@@ -953,7 +953,7 @@ mtype_test_cidrs(struct ip_set *set, struct mtype_elem *d,
mtype_data_netmask(d, NCIDR_GET(h->nets[j].cidr[0]));
#endif
key = HKEY(d, h->initval, t->htable_bits);
- n = rcu_dereference_bh(hbucket(t, key));
+ n = rcu_dereference_bh(hbucket(t, key));
if (!n)
continue;
for (i = 0; i < n->pos; i++) {
diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c
index f4432d9fcad0..5d6d68eaf6a9 100644
--- a/net/netfilter/ipset/ip_set_hash_ip.c
+++ b/net/netfilter/ipset/ip_set_hash_ip.c
@@ -44,7 +44,7 @@ struct hash_ip4_elem {
/* Common functions */
-static inline bool
+static bool
hash_ip4_data_equal(const struct hash_ip4_elem *e1,
const struct hash_ip4_elem *e2,
u32 *multi)
@@ -63,7 +63,7 @@ nla_put_failure:
return true;
}
-static inline void
+static void
hash_ip4_data_next(struct hash_ip4_elem *next, const struct hash_ip4_elem *e)
{
next->ip = e->ip;
@@ -171,7 +171,7 @@ struct hash_ip6_elem {
/* Common functions */
-static inline bool
+static bool
hash_ip6_data_equal(const struct hash_ip6_elem *ip1,
const struct hash_ip6_elem *ip2,
u32 *multi)
@@ -179,7 +179,7 @@ hash_ip6_data_equal(const struct hash_ip6_elem *ip1,
return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6);
}
-static inline void
+static void
hash_ip6_netmask(union nf_inet_addr *ip, u8 prefix)
{
ip6_netmask(ip, prefix);
@@ -196,7 +196,7 @@ nla_put_failure:
return true;
}
-static inline void
+static void
hash_ip6_data_next(struct hash_ip6_elem *next, const struct hash_ip6_elem *e)
{
}
diff --git a/net/netfilter/ipset/ip_set_hash_ipmac.c b/net/netfilter/ipset/ip_set_hash_ipmac.c
index 24d8f4df4230..eceb7bc4a93a 100644
--- a/net/netfilter/ipset/ip_set_hash_ipmac.c
+++ b/net/netfilter/ipset/ip_set_hash_ipmac.c
@@ -47,7 +47,7 @@ struct hash_ipmac4_elem {
/* Common functions */
-static inline bool
+static bool
hash_ipmac4_data_equal(const struct hash_ipmac4_elem *e1,
const struct hash_ipmac4_elem *e2,
u32 *multi)
@@ -67,7 +67,7 @@ nla_put_failure:
return true;
}
-static inline void
+static void
hash_ipmac4_data_next(struct hash_ipmac4_elem *next,
const struct hash_ipmac4_elem *e)
{
@@ -154,7 +154,7 @@ struct hash_ipmac6_elem {
/* Common functions */
-static inline bool
+static bool
hash_ipmac6_data_equal(const struct hash_ipmac6_elem *e1,
const struct hash_ipmac6_elem *e2,
u32 *multi)
@@ -175,7 +175,7 @@ nla_put_failure:
return true;
}
-static inline void
+static void
hash_ipmac6_data_next(struct hash_ipmac6_elem *next,
const struct hash_ipmac6_elem *e)
{
@@ -209,7 +209,7 @@ hash_ipmac6_kadt(struct ip_set *set, const struct sk_buff *skb,
(skb_mac_header(skb) + ETH_HLEN) > skb->data)
return -EINVAL;
- if (opt->flags & IPSET_DIM_ONE_SRC)
+ if (opt->flags & IPSET_DIM_TWO_SRC)
ether_addr_copy(e.ether, eth_hdr(skb)->h_source);
else
ether_addr_copy(e.ether, eth_hdr(skb)->h_dest);
diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c
index 7a1734aad0c5..aba1df617d6e 100644
--- a/net/netfilter/ipset/ip_set_hash_ipmark.c
+++ b/net/netfilter/ipset/ip_set_hash_ipmark.c
@@ -42,7 +42,7 @@ struct hash_ipmark4_elem {
/* Common functions */
-static inline bool
+static bool
hash_ipmark4_data_equal(const struct hash_ipmark4_elem *ip1,
const struct hash_ipmark4_elem *ip2,
u32 *multi)
@@ -64,7 +64,7 @@ nla_put_failure:
return true;
}
-static inline void
+static void
hash_ipmark4_data_next(struct hash_ipmark4_elem *next,
const struct hash_ipmark4_elem *d)
{
@@ -165,7 +165,7 @@ struct hash_ipmark6_elem {
/* Common functions */
-static inline bool
+static bool
hash_ipmark6_data_equal(const struct hash_ipmark6_elem *ip1,
const struct hash_ipmark6_elem *ip2,
u32 *multi)
@@ -187,7 +187,7 @@ nla_put_failure:
return true;
}
-static inline void
+static void
hash_ipmark6_data_next(struct hash_ipmark6_elem *next,
const struct hash_ipmark6_elem *d)
{
diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c
index 32e240658334..1ff228717e29 100644
--- a/net/netfilter/ipset/ip_set_hash_ipport.c
+++ b/net/netfilter/ipset/ip_set_hash_ipport.c
@@ -47,7 +47,7 @@ struct hash_ipport4_elem {
/* Common functions */
-static inline bool
+static bool
hash_ipport4_data_equal(const struct hash_ipport4_elem *ip1,
const struct hash_ipport4_elem *ip2,
u32 *multi)
@@ -71,7 +71,7 @@ nla_put_failure:
return true;
}
-static inline void
+static void
hash_ipport4_data_next(struct hash_ipport4_elem *next,
const struct hash_ipport4_elem *d)
{
@@ -202,7 +202,7 @@ struct hash_ipport6_elem {
/* Common functions */
-static inline bool
+static bool
hash_ipport6_data_equal(const struct hash_ipport6_elem *ip1,
const struct hash_ipport6_elem *ip2,
u32 *multi)
@@ -226,7 +226,7 @@ nla_put_failure:
return true;
}
-static inline void
+static void
hash_ipport6_data_next(struct hash_ipport6_elem *next,
const struct hash_ipport6_elem *d)
{
diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c
index 15d419353179..fa88afd812fa 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportip.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportip.c
@@ -46,7 +46,7 @@ struct hash_ipportip4_elem {
u8 padding;
};
-static inline bool
+static bool
hash_ipportip4_data_equal(const struct hash_ipportip4_elem *ip1,
const struct hash_ipportip4_elem *ip2,
u32 *multi)
@@ -72,7 +72,7 @@ nla_put_failure:
return true;
}
-static inline void
+static void
hash_ipportip4_data_next(struct hash_ipportip4_elem *next,
const struct hash_ipportip4_elem *d)
{
@@ -210,7 +210,7 @@ struct hash_ipportip6_elem {
/* Common functions */
-static inline bool
+static bool
hash_ipportip6_data_equal(const struct hash_ipportip6_elem *ip1,
const struct hash_ipportip6_elem *ip2,
u32 *multi)
@@ -236,7 +236,7 @@ nla_put_failure:
return true;
}
-static inline void
+static void
hash_ipportip6_data_next(struct hash_ipportip6_elem *next,
const struct hash_ipportip6_elem *d)
{
diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c
index 7a4d7afd4121..eef6ecfcb409 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportnet.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c
@@ -59,7 +59,7 @@ struct hash_ipportnet4_elem {
/* Common functions */
-static inline bool
+static bool
hash_ipportnet4_data_equal(const struct hash_ipportnet4_elem *ip1,
const struct hash_ipportnet4_elem *ip2,
u32 *multi)
@@ -71,25 +71,25 @@ hash_ipportnet4_data_equal(const struct hash_ipportnet4_elem *ip1,
ip1->proto == ip2->proto;
}
-static inline int
+static int
hash_ipportnet4_do_data_match(const struct hash_ipportnet4_elem *elem)
{
return elem->nomatch ? -ENOTEMPTY : 1;
}
-static inline void
+static void
hash_ipportnet4_data_set_flags(struct hash_ipportnet4_elem *elem, u32 flags)
{
elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH);
}
-static inline void
+static void
hash_ipportnet4_data_reset_flags(struct hash_ipportnet4_elem *elem, u8 *flags)
{
swap(*flags, elem->nomatch);
}
-static inline void
+static void
hash_ipportnet4_data_netmask(struct hash_ipportnet4_elem *elem, u8 cidr)
{
elem->ip2 &= ip_set_netmask(cidr);
@@ -116,7 +116,7 @@ nla_put_failure:
return true;
}
-static inline void
+static void
hash_ipportnet4_data_next(struct hash_ipportnet4_elem *next,
const struct hash_ipportnet4_elem *d)
{
@@ -308,7 +308,7 @@ struct hash_ipportnet6_elem {
/* Common functions */
-static inline bool
+static bool
hash_ipportnet6_data_equal(const struct hash_ipportnet6_elem *ip1,
const struct hash_ipportnet6_elem *ip2,
u32 *multi)
@@ -320,25 +320,25 @@ hash_ipportnet6_data_equal(const struct hash_ipportnet6_elem *ip1,
ip1->proto == ip2->proto;
}
-static inline int
+static int
hash_ipportnet6_do_data_match(const struct hash_ipportnet6_elem *elem)
{
return elem->nomatch ? -ENOTEMPTY : 1;
}
-static inline void
+static void
hash_ipportnet6_data_set_flags(struct hash_ipportnet6_elem *elem, u32 flags)
{
elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH);
}
-static inline void
+static void
hash_ipportnet6_data_reset_flags(struct hash_ipportnet6_elem *elem, u8 *flags)
{
swap(*flags, elem->nomatch);
}
-static inline void
+static void
hash_ipportnet6_data_netmask(struct hash_ipportnet6_elem *elem, u8 cidr)
{
ip6_netmask(&elem->ip2, cidr);
@@ -365,7 +365,7 @@ nla_put_failure:
return true;
}
-static inline void
+static void
hash_ipportnet6_data_next(struct hash_ipportnet6_elem *next,
const struct hash_ipportnet6_elem *d)
{
diff --git a/net/netfilter/ipset/ip_set_hash_mac.c b/net/netfilter/ipset/ip_set_hash_mac.c
index d94c585d33c5..0b61593165ef 100644
--- a/net/netfilter/ipset/ip_set_hash_mac.c
+++ b/net/netfilter/ipset/ip_set_hash_mac.c
@@ -37,7 +37,7 @@ struct hash_mac4_elem {
/* Common functions */
-static inline bool
+static bool
hash_mac4_data_equal(const struct hash_mac4_elem *e1,
const struct hash_mac4_elem *e2,
u32 *multi)
@@ -45,7 +45,7 @@ hash_mac4_data_equal(const struct hash_mac4_elem *e1,
return ether_addr_equal(e1->ether, e2->ether);
}
-static inline bool
+static bool
hash_mac4_data_list(struct sk_buff *skb, const struct hash_mac4_elem *e)
{
if (nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, e->ether))
@@ -56,7 +56,7 @@ nla_put_failure:
return true;
}
-static inline void
+static void
hash_mac4_data_next(struct hash_mac4_elem *next,
const struct hash_mac4_elem *e)
{
diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c
index c259cbc3ef45..136cf0781d3a 100644
--- a/net/netfilter/ipset/ip_set_hash_net.c
+++ b/net/netfilter/ipset/ip_set_hash_net.c
@@ -47,7 +47,7 @@ struct hash_net4_elem {
/* Common functions */
-static inline bool
+static bool
hash_net4_data_equal(const struct hash_net4_elem *ip1,
const struct hash_net4_elem *ip2,
u32 *multi)
@@ -56,25 +56,25 @@ hash_net4_data_equal(const struct hash_net4_elem *ip1,
ip1->cidr == ip2->cidr;
}
-static inline int
+static int
hash_net4_do_data_match(const struct hash_net4_elem *elem)
{
return elem->nomatch ? -ENOTEMPTY : 1;
}
-static inline void
+static void
hash_net4_data_set_flags(struct hash_net4_elem *elem, u32 flags)
{
elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH;
}
-static inline void
+static void
hash_net4_data_reset_flags(struct hash_net4_elem *elem, u8 *flags)
{
swap(*flags, elem->nomatch);
}
-static inline void
+static void
hash_net4_data_netmask(struct hash_net4_elem *elem, u8 cidr)
{
elem->ip &= ip_set_netmask(cidr);
@@ -97,7 +97,7 @@ nla_put_failure:
return true;
}
-static inline void
+static void
hash_net4_data_next(struct hash_net4_elem *next,
const struct hash_net4_elem *d)
{
@@ -212,7 +212,7 @@ struct hash_net6_elem {
/* Common functions */
-static inline bool
+static bool
hash_net6_data_equal(const struct hash_net6_elem *ip1,
const struct hash_net6_elem *ip2,
u32 *multi)
@@ -221,25 +221,25 @@ hash_net6_data_equal(const struct hash_net6_elem *ip1,
ip1->cidr == ip2->cidr;
}
-static inline int
+static int
hash_net6_do_data_match(const struct hash_net6_elem *elem)
{
return elem->nomatch ? -ENOTEMPTY : 1;
}
-static inline void
+static void
hash_net6_data_set_flags(struct hash_net6_elem *elem, u32 flags)
{
elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH;
}
-static inline void
+static void
hash_net6_data_reset_flags(struct hash_net6_elem *elem, u8 *flags)
{
swap(*flags, elem->nomatch);
}
-static inline void
+static void
hash_net6_data_netmask(struct hash_net6_elem *elem, u8 cidr)
{
ip6_netmask(&elem->ip, cidr);
@@ -262,7 +262,7 @@ nla_put_failure:
return true;
}
-static inline void
+static void
hash_net6_data_next(struct hash_net6_elem *next,
const struct hash_net6_elem *d)
{
@@ -368,6 +368,7 @@ static struct ip_set_type hash_net_type __read_mostly = {
[IPSET_ATTR_IP_TO] = { .type = NLA_NESTED },
[IPSET_ATTR_CIDR] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
[IPSET_ATTR_BYTES] = { .type = NLA_U64 },
[IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c
index 87b29f971226..be5e95a0d876 100644
--- a/net/netfilter/ipset/ip_set_hash_netiface.c
+++ b/net/netfilter/ipset/ip_set_hash_netiface.c
@@ -25,7 +25,8 @@
/* 3 Counters support added */
/* 4 Comments support added */
/* 5 Forceadd support added */
-#define IPSET_TYPE_REV_MAX 6 /* skbinfo support added */
+/* 6 skbinfo support added */
+#define IPSET_TYPE_REV_MAX 7 /* interface wildcard support added */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
@@ -57,12 +58,13 @@ struct hash_netiface4_elem {
u8 cidr;
u8 nomatch;
u8 elem;
+ u8 wildcard;
char iface[IFNAMSIZ];
};
/* Common functions */
-static inline bool
+static bool
hash_netiface4_data_equal(const struct hash_netiface4_elem *ip1,
const struct hash_netiface4_elem *ip2,
u32 *multi)
@@ -71,28 +73,30 @@ hash_netiface4_data_equal(const struct hash_netiface4_elem *ip1,
ip1->cidr == ip2->cidr &&
(++*multi) &&
ip1->physdev == ip2->physdev &&
- strcmp(ip1->iface, ip2->iface) == 0;
+ (ip1->wildcard ?
+ strncmp(ip1->iface, ip2->iface, strlen(ip1->iface)) == 0 :
+ strcmp(ip1->iface, ip2->iface) == 0);
}
-static inline int
+static int
hash_netiface4_do_data_match(const struct hash_netiface4_elem *elem)
{
return elem->nomatch ? -ENOTEMPTY : 1;
}
-static inline void
+static void
hash_netiface4_data_set_flags(struct hash_netiface4_elem *elem, u32 flags)
{
elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH;
}
-static inline void
+static void
hash_netiface4_data_reset_flags(struct hash_netiface4_elem *elem, u8 *flags)
{
swap(*flags, elem->nomatch);
}
-static inline void
+static void
hash_netiface4_data_netmask(struct hash_netiface4_elem *elem, u8 cidr)
{
elem->ip &= ip_set_netmask(cidr);
@@ -103,7 +107,8 @@ static bool
hash_netiface4_data_list(struct sk_buff *skb,
const struct hash_netiface4_elem *data)
{
- u32 flags = data->physdev ? IPSET_FLAG_PHYSDEV : 0;
+ u32 flags = (data->physdev ? IPSET_FLAG_PHYSDEV : 0) |
+ (data->wildcard ? IPSET_FLAG_IFACE_WILDCARD : 0);
if (data->nomatch)
flags |= IPSET_FLAG_NOMATCH;
@@ -119,7 +124,7 @@ nla_put_failure:
return true;
}
-static inline void
+static void
hash_netiface4_data_next(struct hash_netiface4_elem *next,
const struct hash_netiface4_elem *d)
{
@@ -229,6 +234,8 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[],
e.physdev = 1;
if (cadt_flags & IPSET_FLAG_NOMATCH)
flags |= (IPSET_FLAG_NOMATCH << 16);
+ if (cadt_flags & IPSET_FLAG_IFACE_WILDCARD)
+ e.wildcard = 1;
}
if (adt == IPSET_TEST || !tb[IPSET_ATTR_IP_TO]) {
e.ip = htonl(ip & ip_set_hostmask(e.cidr));
@@ -280,12 +287,13 @@ struct hash_netiface6_elem {
u8 cidr;
u8 nomatch;
u8 elem;
+ u8 wildcard;
char iface[IFNAMSIZ];
};
/* Common functions */
-static inline bool
+static bool
hash_netiface6_data_equal(const struct hash_netiface6_elem *ip1,
const struct hash_netiface6_elem *ip2,
u32 *multi)
@@ -294,28 +302,30 @@ hash_netiface6_data_equal(const struct hash_netiface6_elem *ip1,
ip1->cidr == ip2->cidr &&
(++*multi) &&
ip1->physdev == ip2->physdev &&
- strcmp(ip1->iface, ip2->iface) == 0;
+ (ip1->wildcard ?
+ strncmp(ip1->iface, ip2->iface, strlen(ip1->iface)) == 0 :
+ strcmp(ip1->iface, ip2->iface) == 0);
}
-static inline int
+static int
hash_netiface6_do_data_match(const struct hash_netiface6_elem *elem)
{
return elem->nomatch ? -ENOTEMPTY : 1;
}
-static inline void
+static void
hash_netiface6_data_set_flags(struct hash_netiface6_elem *elem, u32 flags)
{
elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH;
}
-static inline void
+static void
hash_netiface6_data_reset_flags(struct hash_netiface6_elem *elem, u8 *flags)
{
swap(*flags, elem->nomatch);
}
-static inline void
+static void
hash_netiface6_data_netmask(struct hash_netiface6_elem *elem, u8 cidr)
{
ip6_netmask(&elem->ip, cidr);
@@ -326,7 +336,8 @@ static bool
hash_netiface6_data_list(struct sk_buff *skb,
const struct hash_netiface6_elem *data)
{
- u32 flags = data->physdev ? IPSET_FLAG_PHYSDEV : 0;
+ u32 flags = (data->physdev ? IPSET_FLAG_PHYSDEV : 0) |
+ (data->wildcard ? IPSET_FLAG_IFACE_WILDCARD : 0);
if (data->nomatch)
flags |= IPSET_FLAG_NOMATCH;
@@ -342,7 +353,7 @@ nla_put_failure:
return true;
}
-static inline void
+static void
hash_netiface6_data_next(struct hash_netiface6_elem *next,
const struct hash_netiface6_elem *d)
{
@@ -440,6 +451,8 @@ hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[],
e.physdev = 1;
if (cadt_flags & IPSET_FLAG_NOMATCH)
flags |= (IPSET_FLAG_NOMATCH << 16);
+ if (cadt_flags & IPSET_FLAG_IFACE_WILDCARD)
+ e.wildcard = 1;
}
ret = adtfn(set, &e, &ext, &ext, flags);
diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c
index a3ae69bfee66..da4ef910b12d 100644
--- a/net/netfilter/ipset/ip_set_hash_netnet.c
+++ b/net/netfilter/ipset/ip_set_hash_netnet.c
@@ -52,7 +52,7 @@ struct hash_netnet4_elem {
/* Common functions */
-static inline bool
+static bool
hash_netnet4_data_equal(const struct hash_netnet4_elem *ip1,
const struct hash_netnet4_elem *ip2,
u32 *multi)
@@ -61,32 +61,32 @@ hash_netnet4_data_equal(const struct hash_netnet4_elem *ip1,
ip1->ccmp == ip2->ccmp;
}
-static inline int
+static int
hash_netnet4_do_data_match(const struct hash_netnet4_elem *elem)
{
return elem->nomatch ? -ENOTEMPTY : 1;
}
-static inline void
+static void
hash_netnet4_data_set_flags(struct hash_netnet4_elem *elem, u32 flags)
{
elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH;
}
-static inline void
+static void
hash_netnet4_data_reset_flags(struct hash_netnet4_elem *elem, u8 *flags)
{
swap(*flags, elem->nomatch);
}
-static inline void
+static void
hash_netnet4_data_reset_elem(struct hash_netnet4_elem *elem,
struct hash_netnet4_elem *orig)
{
elem->ip[1] = orig->ip[1];
}
-static inline void
+static void
hash_netnet4_data_netmask(struct hash_netnet4_elem *elem, u8 cidr, bool inner)
{
if (inner) {
@@ -117,7 +117,7 @@ nla_put_failure:
return true;
}
-static inline void
+static void
hash_netnet4_data_next(struct hash_netnet4_elem *next,
const struct hash_netnet4_elem *d)
{
@@ -282,7 +282,7 @@ struct hash_netnet6_elem {
/* Common functions */
-static inline bool
+static bool
hash_netnet6_data_equal(const struct hash_netnet6_elem *ip1,
const struct hash_netnet6_elem *ip2,
u32 *multi)
@@ -292,32 +292,32 @@ hash_netnet6_data_equal(const struct hash_netnet6_elem *ip1,
ip1->ccmp == ip2->ccmp;
}
-static inline int
+static int
hash_netnet6_do_data_match(const struct hash_netnet6_elem *elem)
{
return elem->nomatch ? -ENOTEMPTY : 1;
}
-static inline void
+static void
hash_netnet6_data_set_flags(struct hash_netnet6_elem *elem, u32 flags)
{
elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH;
}
-static inline void
+static void
hash_netnet6_data_reset_flags(struct hash_netnet6_elem *elem, u8 *flags)
{
swap(*flags, elem->nomatch);
}
-static inline void
+static void
hash_netnet6_data_reset_elem(struct hash_netnet6_elem *elem,
struct hash_netnet6_elem *orig)
{
elem->ip[1] = orig->ip[1];
}
-static inline void
+static void
hash_netnet6_data_netmask(struct hash_netnet6_elem *elem, u8 cidr, bool inner)
{
if (inner) {
@@ -348,7 +348,7 @@ nla_put_failure:
return true;
}
-static inline void
+static void
hash_netnet6_data_next(struct hash_netnet6_elem *next,
const struct hash_netnet6_elem *d)
{
@@ -476,6 +476,7 @@ static struct ip_set_type hash_netnet_type __read_mostly = {
[IPSET_ATTR_CIDR] = { .type = NLA_U8 },
[IPSET_ATTR_CIDR2] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
[IPSET_ATTR_BYTES] = { .type = NLA_U64 },
[IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c
index 799f2272cc65..34448df80fb9 100644
--- a/net/netfilter/ipset/ip_set_hash_netport.c
+++ b/net/netfilter/ipset/ip_set_hash_netport.c
@@ -57,7 +57,7 @@ struct hash_netport4_elem {
/* Common functions */
-static inline bool
+static bool
hash_netport4_data_equal(const struct hash_netport4_elem *ip1,
const struct hash_netport4_elem *ip2,
u32 *multi)
@@ -68,25 +68,25 @@ hash_netport4_data_equal(const struct hash_netport4_elem *ip1,
ip1->cidr == ip2->cidr;
}
-static inline int
+static int
hash_netport4_do_data_match(const struct hash_netport4_elem *elem)
{
return elem->nomatch ? -ENOTEMPTY : 1;
}
-static inline void
+static void
hash_netport4_data_set_flags(struct hash_netport4_elem *elem, u32 flags)
{
elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH);
}
-static inline void
+static void
hash_netport4_data_reset_flags(struct hash_netport4_elem *elem, u8 *flags)
{
swap(*flags, elem->nomatch);
}
-static inline void
+static void
hash_netport4_data_netmask(struct hash_netport4_elem *elem, u8 cidr)
{
elem->ip &= ip_set_netmask(cidr);
@@ -112,7 +112,7 @@ nla_put_failure:
return true;
}
-static inline void
+static void
hash_netport4_data_next(struct hash_netport4_elem *next,
const struct hash_netport4_elem *d)
{
@@ -270,7 +270,7 @@ struct hash_netport6_elem {
/* Common functions */
-static inline bool
+static bool
hash_netport6_data_equal(const struct hash_netport6_elem *ip1,
const struct hash_netport6_elem *ip2,
u32 *multi)
@@ -281,25 +281,25 @@ hash_netport6_data_equal(const struct hash_netport6_elem *ip1,
ip1->cidr == ip2->cidr;
}
-static inline int
+static int
hash_netport6_do_data_match(const struct hash_netport6_elem *elem)
{
return elem->nomatch ? -ENOTEMPTY : 1;
}
-static inline void
+static void
hash_netport6_data_set_flags(struct hash_netport6_elem *elem, u32 flags)
{
elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH);
}
-static inline void
+static void
hash_netport6_data_reset_flags(struct hash_netport6_elem *elem, u8 *flags)
{
swap(*flags, elem->nomatch);
}
-static inline void
+static void
hash_netport6_data_netmask(struct hash_netport6_elem *elem, u8 cidr)
{
ip6_netmask(&elem->ip, cidr);
@@ -325,7 +325,7 @@ nla_put_failure:
return true;
}
-static inline void
+static void
hash_netport6_data_next(struct hash_netport6_elem *next,
const struct hash_netport6_elem *d)
{
diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c
index a82b70e8b9a6..934c1712cba8 100644
--- a/net/netfilter/ipset/ip_set_hash_netportnet.c
+++ b/net/netfilter/ipset/ip_set_hash_netportnet.c
@@ -56,7 +56,7 @@ struct hash_netportnet4_elem {
/* Common functions */
-static inline bool
+static bool
hash_netportnet4_data_equal(const struct hash_netportnet4_elem *ip1,
const struct hash_netportnet4_elem *ip2,
u32 *multi)
@@ -67,32 +67,32 @@ hash_netportnet4_data_equal(const struct hash_netportnet4_elem *ip1,
ip1->proto == ip2->proto;
}
-static inline int
+static int
hash_netportnet4_do_data_match(const struct hash_netportnet4_elem *elem)
{
return elem->nomatch ? -ENOTEMPTY : 1;
}
-static inline void
+static void
hash_netportnet4_data_set_flags(struct hash_netportnet4_elem *elem, u32 flags)
{
elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH);
}
-static inline void
+static void
hash_netportnet4_data_reset_flags(struct hash_netportnet4_elem *elem, u8 *flags)
{
swap(*flags, elem->nomatch);
}
-static inline void
+static void
hash_netportnet4_data_reset_elem(struct hash_netportnet4_elem *elem,
struct hash_netportnet4_elem *orig)
{
elem->ip[1] = orig->ip[1];
}
-static inline void
+static void
hash_netportnet4_data_netmask(struct hash_netportnet4_elem *elem,
u8 cidr, bool inner)
{
@@ -126,7 +126,7 @@ nla_put_failure:
return true;
}
-static inline void
+static void
hash_netportnet4_data_next(struct hash_netportnet4_elem *next,
const struct hash_netportnet4_elem *d)
{
@@ -331,7 +331,7 @@ struct hash_netportnet6_elem {
/* Common functions */
-static inline bool
+static bool
hash_netportnet6_data_equal(const struct hash_netportnet6_elem *ip1,
const struct hash_netportnet6_elem *ip2,
u32 *multi)
@@ -343,32 +343,32 @@ hash_netportnet6_data_equal(const struct hash_netportnet6_elem *ip1,
ip1->proto == ip2->proto;
}
-static inline int
+static int
hash_netportnet6_do_data_match(const struct hash_netportnet6_elem *elem)
{
return elem->nomatch ? -ENOTEMPTY : 1;
}
-static inline void
+static void
hash_netportnet6_data_set_flags(struct hash_netportnet6_elem *elem, u32 flags)
{
elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH);
}
-static inline void
+static void
hash_netportnet6_data_reset_flags(struct hash_netportnet6_elem *elem, u8 *flags)
{
swap(*flags, elem->nomatch);
}
-static inline void
+static void
hash_netportnet6_data_reset_elem(struct hash_netportnet6_elem *elem,
struct hash_netportnet6_elem *orig)
{
elem->ip[1] = orig->ip[1];
}
-static inline void
+static void
hash_netportnet6_data_netmask(struct hash_netportnet6_elem *elem,
u8 cidr, bool inner)
{
@@ -402,7 +402,7 @@ nla_put_failure:
return true;
}
-static inline void
+static void
hash_netportnet6_data_next(struct hash_netportnet6_elem *next,
const struct hash_netportnet6_elem *d)
{
diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c
index 6f9ead6319e0..cd747c0962fd 100644
--- a/net/netfilter/ipset/ip_set_list_set.c
+++ b/net/netfilter/ipset/ip_set_list_set.c
@@ -149,7 +149,7 @@ __list_set_del_rcu(struct rcu_head * rcu)
kfree(e);
}
-static inline void
+static void
list_set_del(struct ip_set *set, struct set_elem *e)
{
struct list_set *map = set->data;
@@ -160,7 +160,7 @@ list_set_del(struct ip_set *set, struct set_elem *e)
call_rcu(&e->rcu, __list_set_del_rcu);
}
-static inline void
+static void
list_set_replace(struct ip_set *set, struct set_elem *e, struct set_elem *old)
{
struct list_set *map = set->data;
@@ -288,7 +288,7 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext,
if (n &&
!(SET_WITH_TIMEOUT(set) &&
ip_set_timeout_expired(ext_timeout(n, set))))
- n = NULL;
+ n = NULL;
e = kzalloc(set->dsize, GFP_ATOMIC);
if (!e)
diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig
index f6f1a0d5c47d..5b672e05d758 100644
--- a/net/netfilter/ipvs/Kconfig
+++ b/net/netfilter/ipvs/Kconfig
@@ -135,7 +135,7 @@ config IP_VS_WRR
module, choose M here. If unsure, say N.
config IP_VS_LC
- tristate "least-connection scheduling"
+ tristate "least-connection scheduling"
---help---
The least-connection scheduling algorithm directs network
connections to the server with the least number of active
@@ -145,7 +145,7 @@ config IP_VS_LC
module, choose M here. If unsure, say N.
config IP_VS_WLC
- tristate "weighted least-connection scheduling"
+ tristate "weighted least-connection scheduling"
---help---
The weighted least-connection scheduling algorithm directs network
connections to the server with the least active connections
@@ -333,7 +333,7 @@ config IP_VS_NFCT
config IP_VS_PE_SIP
tristate "SIP persistence engine"
- depends on IP_VS_PROTO_UDP
+ depends on IP_VS_PROTO_UDP
depends on NF_CONNTRACK_SIP
---help---
Allow persistence based on the SIP Call-ID
diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c
index 4515056ef1c2..f9b16f2b2219 100644
--- a/net/netfilter/ipvs/ip_vs_app.c
+++ b/net/netfilter/ipvs/ip_vs_app.c
@@ -193,21 +193,29 @@ struct ip_vs_app *register_ip_vs_app(struct netns_ipvs *ipvs, struct ip_vs_app *
mutex_lock(&__ip_vs_app_mutex);
+ /* increase the module use count */
+ if (!ip_vs_use_count_inc()) {
+ err = -ENOENT;
+ goto out_unlock;
+ }
+
list_for_each_entry(a, &ipvs->app_list, a_list) {
if (!strcmp(app->name, a->name)) {
err = -EEXIST;
+ /* decrease the module use count */
+ ip_vs_use_count_dec();
goto out_unlock;
}
}
a = kmemdup(app, sizeof(*app), GFP_KERNEL);
if (!a) {
err = -ENOMEM;
+ /* decrease the module use count */
+ ip_vs_use_count_dec();
goto out_unlock;
}
INIT_LIST_HEAD(&a->incs_list);
list_add(&a->a_list, &ipvs->app_list);
- /* increase the module use count */
- ip_vs_use_count_inc();
out_unlock:
mutex_unlock(&__ip_vs_app_mutex);
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 46f06f92ab8f..512259f579d7 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -617,7 +617,7 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET &&
iph->protocol == IPPROTO_UDP) ?
IP_VS_CONN_F_ONE_PACKET : 0;
- union nf_inet_addr daddr = { .all = { 0, 0, 0, 0 } };
+ union nf_inet_addr daddr = { .all = { 0, 0, 0, 0 } };
/* create a new connection entry */
IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__);
@@ -2402,18 +2402,22 @@ estimator_fail:
return -ENOMEM;
}
-static void __net_exit __ip_vs_cleanup(struct net *net)
+static void __net_exit __ip_vs_cleanup_batch(struct list_head *net_list)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
-
- ip_vs_service_net_cleanup(ipvs); /* ip_vs_flush() with locks */
- ip_vs_conn_net_cleanup(ipvs);
- ip_vs_app_net_cleanup(ipvs);
- ip_vs_protocol_net_cleanup(ipvs);
- ip_vs_control_net_cleanup(ipvs);
- ip_vs_estimator_net_cleanup(ipvs);
- IP_VS_DBG(2, "ipvs netns %d released\n", ipvs->gen);
- net->ipvs = NULL;
+ struct netns_ipvs *ipvs;
+ struct net *net;
+
+ ip_vs_service_nets_cleanup(net_list); /* ip_vs_flush() with locks */
+ list_for_each_entry(net, net_list, exit_list) {
+ ipvs = net_ipvs(net);
+ ip_vs_conn_net_cleanup(ipvs);
+ ip_vs_app_net_cleanup(ipvs);
+ ip_vs_protocol_net_cleanup(ipvs);
+ ip_vs_control_net_cleanup(ipvs);
+ ip_vs_estimator_net_cleanup(ipvs);
+ IP_VS_DBG(2, "ipvs netns %d released\n", ipvs->gen);
+ net->ipvs = NULL;
+ }
}
static int __net_init __ip_vs_dev_init(struct net *net)
@@ -2429,27 +2433,32 @@ hook_fail:
return ret;
}
-static void __net_exit __ip_vs_dev_cleanup(struct net *net)
+static void __net_exit __ip_vs_dev_cleanup_batch(struct list_head *net_list)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
+ struct netns_ipvs *ipvs;
+ struct net *net;
+
EnterFunction(2);
- nf_unregister_net_hooks(net, ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
- ipvs->enable = 0; /* Disable packet reception */
- smp_wmb();
- ip_vs_sync_net_cleanup(ipvs);
+ list_for_each_entry(net, net_list, exit_list) {
+ ipvs = net_ipvs(net);
+ nf_unregister_net_hooks(net, ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
+ ipvs->enable = 0; /* Disable packet reception */
+ smp_wmb();
+ ip_vs_sync_net_cleanup(ipvs);
+ }
LeaveFunction(2);
}
static struct pernet_operations ipvs_core_ops = {
.init = __ip_vs_init,
- .exit = __ip_vs_cleanup,
+ .exit_batch = __ip_vs_cleanup_batch,
.id = &ip_vs_net_id,
.size = sizeof(struct netns_ipvs),
};
static struct pernet_operations ipvs_core_dev_ops = {
.init = __ip_vs_dev_init,
- .exit = __ip_vs_dev_cleanup,
+ .exit_batch = __ip_vs_dev_cleanup_batch,
};
/*
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 060565e7d227..8d14a1acbc37 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -93,7 +93,6 @@ static bool __ip_vs_addr_is_local_v6(struct net *net,
static void update_defense_level(struct netns_ipvs *ipvs)
{
struct sysinfo i;
- static int old_secure_tcp = 0;
int availmem;
int nomem;
int to_change = -1;
@@ -174,35 +173,35 @@ static void update_defense_level(struct netns_ipvs *ipvs)
spin_lock(&ipvs->securetcp_lock);
switch (ipvs->sysctl_secure_tcp) {
case 0:
- if (old_secure_tcp >= 2)
+ if (ipvs->old_secure_tcp >= 2)
to_change = 0;
break;
case 1:
if (nomem) {
- if (old_secure_tcp < 2)
+ if (ipvs->old_secure_tcp < 2)
to_change = 1;
ipvs->sysctl_secure_tcp = 2;
} else {
- if (old_secure_tcp >= 2)
+ if (ipvs->old_secure_tcp >= 2)
to_change = 0;
}
break;
case 2:
if (nomem) {
- if (old_secure_tcp < 2)
+ if (ipvs->old_secure_tcp < 2)
to_change = 1;
} else {
- if (old_secure_tcp >= 2)
+ if (ipvs->old_secure_tcp >= 2)
to_change = 0;
ipvs->sysctl_secure_tcp = 1;
}
break;
case 3:
- if (old_secure_tcp < 2)
+ if (ipvs->old_secure_tcp < 2)
to_change = 1;
break;
}
- old_secure_tcp = ipvs->sysctl_secure_tcp;
+ ipvs->old_secure_tcp = ipvs->sysctl_secure_tcp;
if (to_change >= 0)
ip_vs_protocol_timeout_change(ipvs,
ipvs->sysctl_secure_tcp > 1);
@@ -262,7 +261,7 @@ static inline unsigned int
ip_vs_svc_hashkey(struct netns_ipvs *ipvs, int af, unsigned int proto,
const union nf_inet_addr *addr, __be16 port)
{
- register unsigned int porth = ntohs(port);
+ unsigned int porth = ntohs(port);
__be32 addr_fold = addr->ip;
__u32 ahash;
@@ -424,7 +423,7 @@ ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u32 fwmark, __u16 protocol
if (!svc && protocol == IPPROTO_TCP &&
atomic_read(&ipvs->ftpsvc_counter) &&
- (vport == FTPDATA || ntohs(vport) >= inet_prot_sock(ipvs->net))) {
+ (vport == FTPDATA || !inet_port_requires_bind_service(ipvs->net, ntohs(vport)))) {
/*
* Check if ftp service entry exists, the packet
* might belong to FTP data connections.
@@ -493,7 +492,7 @@ static inline unsigned int ip_vs_rs_hashkey(int af,
const union nf_inet_addr *addr,
__be16 port)
{
- register unsigned int porth = ntohs(port);
+ unsigned int porth = ntohs(port);
__be32 addr_fold = addr->ip;
#ifdef CONFIG_IP_VS_IPV6
@@ -1275,7 +1274,8 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
struct ip_vs_service *svc = NULL;
/* increase the module use count */
- ip_vs_use_count_inc();
+ if (!ip_vs_use_count_inc())
+ return -ENOPROTOOPT;
/* Lookup the scheduler by 'u->sched_name' */
if (strcmp(u->sched_name, "none")) {
@@ -1607,14 +1607,20 @@ static int ip_vs_flush(struct netns_ipvs *ipvs, bool cleanup)
/*
* Delete service by {netns} in the service table.
- * Called by __ip_vs_cleanup()
+ * Called by __ip_vs_batch_cleanup()
*/
-void ip_vs_service_net_cleanup(struct netns_ipvs *ipvs)
+void ip_vs_service_nets_cleanup(struct list_head *net_list)
{
+ struct netns_ipvs *ipvs;
+ struct net *net;
+
EnterFunction(2);
/* Check for "full" addressed entries */
mutex_lock(&__ip_vs_mutex);
- ip_vs_flush(ipvs, true);
+ list_for_each_entry(net, net_list, exit_list) {
+ ipvs = net_ipvs(net);
+ ip_vs_flush(ipvs, true);
+ }
mutex_unlock(&__ip_vs_mutex);
LeaveFunction(2);
}
@@ -1737,12 +1743,18 @@ proc_do_defense_mode(struct ctl_table *table, int write,
int val = *valp;
int rc;
- rc = proc_dointvec(table, write, buffer, lenp, ppos);
+ struct ctl_table tmp = {
+ .data = &val,
+ .maxlen = sizeof(int),
+ .mode = table->mode,
+ };
+
+ rc = proc_dointvec(&tmp, write, buffer, lenp, ppos);
if (write && (*valp != val)) {
- if ((*valp < 0) || (*valp > 3)) {
- /* Restore the correct value */
- *valp = val;
+ if (val < 0 || val > 3) {
+ rc = -EINVAL;
} else {
+ *valp = val;
update_defense_level(ipvs);
}
}
@@ -1756,33 +1768,20 @@ proc_do_sync_threshold(struct ctl_table *table, int write,
int *valp = table->data;
int val[2];
int rc;
+ struct ctl_table tmp = {
+ .data = &val,
+ .maxlen = table->maxlen,
+ .mode = table->mode,
+ };
- /* backup the value first */
memcpy(val, valp, sizeof(val));
-
- rc = proc_dointvec(table, write, buffer, lenp, ppos);
- if (write && (valp[0] < 0 || valp[1] < 0 ||
- (valp[0] >= valp[1] && valp[1]))) {
- /* Restore the correct value */
- memcpy(valp, val, sizeof(val));
- }
- return rc;
-}
-
-static int
-proc_do_sync_mode(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- int *valp = table->data;
- int val = *valp;
- int rc;
-
- rc = proc_dointvec(table, write, buffer, lenp, ppos);
- if (write && (*valp != val)) {
- if ((*valp < 0) || (*valp > 1)) {
- /* Restore the correct value */
- *valp = val;
- }
+ rc = proc_dointvec(&tmp, write, buffer, lenp, ppos);
+ if (write) {
+ if (val[0] < 0 || val[1] < 0 ||
+ (val[0] >= val[1] && val[1]))
+ rc = -EINVAL;
+ else
+ memcpy(valp, val, sizeof(val));
}
return rc;
}
@@ -1795,12 +1794,18 @@ proc_do_sync_ports(struct ctl_table *table, int write,
int val = *valp;
int rc;
- rc = proc_dointvec(table, write, buffer, lenp, ppos);
+ struct ctl_table tmp = {
+ .data = &val,
+ .maxlen = sizeof(int),
+ .mode = table->mode,
+ };
+
+ rc = proc_dointvec(&tmp, write, buffer, lenp, ppos);
if (write && (*valp != val)) {
- if (*valp < 1 || !is_power_of_2(*valp)) {
- /* Restore the correct value */
+ if (val < 1 || !is_power_of_2(val))
+ rc = -EINVAL;
+ else
*valp = val;
- }
}
return rc;
}
@@ -1860,7 +1865,9 @@ static struct ctl_table vs_vars[] = {
.procname = "sync_version",
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_do_sync_mode,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
{
.procname = "sync_ports",
@@ -2434,9 +2441,6 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
if (copy_from_user(arg, user, len) != 0)
return -EFAULT;
- /* increase the module use count */
- ip_vs_use_count_inc();
-
/* Handle daemons since they have another lock */
if (cmd == IP_VS_SO_SET_STARTDAEMON ||
cmd == IP_VS_SO_SET_STOPDAEMON) {
@@ -2449,13 +2453,13 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
ret = -EINVAL;
if (strscpy(cfg.mcast_ifn, dm->mcast_ifn,
sizeof(cfg.mcast_ifn)) <= 0)
- goto out_dec;
+ return ret;
cfg.syncid = dm->syncid;
ret = start_sync_thread(ipvs, &cfg, dm->state);
} else {
ret = stop_sync_thread(ipvs, dm->state);
}
- goto out_dec;
+ return ret;
}
mutex_lock(&__ip_vs_mutex);
@@ -2550,10 +2554,6 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
out_unlock:
mutex_unlock(&__ip_vs_mutex);
- out_dec:
- /* decrease the module use count */
- ip_vs_use_count_dec();
-
return ret;
}
diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c
index c8b5a504476c..77c323c36a88 100644
--- a/net/netfilter/ipvs/ip_vs_lblcr.c
+++ b/net/netfilter/ipvs/ip_vs_lblcr.c
@@ -160,7 +160,7 @@ static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set)
/* get weighted least-connection node in the destination set */
static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
{
- register struct ip_vs_dest_set_elem *e;
+ struct ip_vs_dest_set_elem *e;
struct ip_vs_dest *dest, *least;
int loh, doh;
@@ -209,7 +209,7 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
/* get weighted most-connection node in the destination set */
static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
{
- register struct ip_vs_dest_set_elem *e;
+ struct ip_vs_dest_set_elem *e;
struct ip_vs_dest *dest, *most;
int moh, doh;
diff --git a/net/netfilter/ipvs/ip_vs_mh.c b/net/netfilter/ipvs/ip_vs_mh.c
index 94d9d349ebb0..da0280cec506 100644
--- a/net/netfilter/ipvs/ip_vs_mh.c
+++ b/net/netfilter/ipvs/ip_vs_mh.c
@@ -174,8 +174,8 @@ static int ip_vs_mh_populate(struct ip_vs_mh_state *s,
return 0;
}
- table = kcalloc(BITS_TO_LONGS(IP_VS_MH_TAB_SIZE),
- sizeof(unsigned long), GFP_KERNEL);
+ table = kcalloc(BITS_TO_LONGS(IP_VS_MH_TAB_SIZE),
+ sizeof(unsigned long), GFP_KERNEL);
if (!table)
return -ENOMEM;
diff --git a/net/netfilter/ipvs/ip_vs_ovf.c b/net/netfilter/ipvs/ip_vs_ovf.c
index 78b074cd5464..c03066fdd5ca 100644
--- a/net/netfilter/ipvs/ip_vs_ovf.c
+++ b/net/netfilter/ipvs/ip_vs_ovf.c
@@ -5,7 +5,7 @@
* Authors: Raducu Deaconu <rhadoo_io@yahoo.com>
*
* Scheduler implements "overflow" loadbalancing according to number of active
- * connections , will keep all conections to the node with the highest weight
+ * connections , will keep all connections to the node with the highest weight
* and overflow to the next node if the number of connections exceeds the node's
* weight.
* Note that this scheduler might not be suitable for UDP because it only uses
diff --git a/net/netfilter/ipvs/ip_vs_pe.c b/net/netfilter/ipvs/ip_vs_pe.c
index 8e104dff7abc..166c669f0763 100644
--- a/net/netfilter/ipvs/ip_vs_pe.c
+++ b/net/netfilter/ipvs/ip_vs_pe.c
@@ -68,7 +68,8 @@ int register_ip_vs_pe(struct ip_vs_pe *pe)
struct ip_vs_pe *tmp;
/* increase the module use count */
- ip_vs_use_count_inc();
+ if (!ip_vs_use_count_inc())
+ return -ENOENT;
mutex_lock(&ip_vs_pe_mutex);
/* Make sure that the pe with this name doesn't exist
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index 000d961b97e4..32b028853a7c 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -710,7 +710,7 @@ static int __ip_vs_tcp_init(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd
sizeof(tcp_timeouts));
if (!pd->timeout_table)
return -ENOMEM;
- pd->tcp_state_table = tcp_states;
+ pd->tcp_state_table = tcp_states;
return 0;
}
diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c
index 2f9d5cd5daee..d4903723be7e 100644
--- a/net/netfilter/ipvs/ip_vs_sched.c
+++ b/net/netfilter/ipvs/ip_vs_sched.c
@@ -179,7 +179,8 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
}
/* increase the module use count */
- ip_vs_use_count_inc();
+ if (!ip_vs_use_count_inc())
+ return -ENOENT;
mutex_lock(&ip_vs_sched_mutex);
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index a4a78c4b06de..605e0f68f8bd 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -1239,7 +1239,7 @@ static void ip_vs_process_message(struct netns_ipvs *ipvs, __u8 *buffer,
p = msg_end;
if (p + sizeof(s->v4) > buffer+buflen) {
- IP_VS_ERR_RL("BACKUP, Dropping buffer, to small\n");
+ IP_VS_ERR_RL("BACKUP, Dropping buffer, too small\n");
return;
}
s = (union ip_vs_sync_conn *)p;
@@ -1762,6 +1762,10 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c,
IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %zd bytes\n",
sizeof(struct ip_vs_sync_conn_v0));
+ /* increase the module use count */
+ if (!ip_vs_use_count_inc())
+ return -ENOPROTOOPT;
+
/* Do not hold one mutex and then to block on another */
for (;;) {
rtnl_lock();
@@ -1892,9 +1896,6 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c,
mutex_unlock(&ipvs->sync_mutex);
rtnl_unlock();
- /* increase the module use count */
- ip_vs_use_count_inc();
-
return 0;
out:
@@ -1924,11 +1925,17 @@ out:
}
kfree(ti);
}
+
+ /* decrease the module use count */
+ ip_vs_use_count_dec();
return result;
out_early:
mutex_unlock(&ipvs->sync_mutex);
rtnl_unlock();
+
+ /* decrease the module use count */
+ ip_vs_use_count_dec();
return result;
}
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 9c464d24beec..b00866d777fe 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -208,7 +208,7 @@ static inline void maybe_update_pmtu(int skb_af, struct sk_buff *skb, int mtu)
struct rtable *ort = skb_rtable(skb);
if (!skb->dev && sk && sk_fullsock(sk))
- ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu);
+ ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu, true);
}
static inline bool ensure_mtu_is_adequate(struct netns_ipvs *ipvs, int skb_af,
@@ -407,12 +407,9 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
goto err_put;
skb_dst_drop(skb);
- if (noref) {
- if (!local)
- skb_dst_set_noref(skb, &rt->dst);
- else
- skb_dst_set(skb, dst_clone(&rt->dst));
- } else
+ if (noref)
+ skb_dst_set_noref(skb, &rt->dst);
+ else
skb_dst_set(skb, &rt->dst);
return local;
@@ -574,12 +571,9 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
goto err_put;
skb_dst_drop(skb);
- if (noref) {
- if (!local)
- skb_dst_set_noref(skb, &rt->dst);
- else
- skb_dst_set(skb, dst_clone(&rt->dst));
- } else
+ if (noref)
+ skb_dst_set_noref(skb, &rt->dst);
+ else
skb_dst_set(skb, &rt->dst);
return local;
@@ -613,7 +607,7 @@ static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb,
if (unlikely(cp->flags & IP_VS_CONN_F_NFCT))
ret = ip_vs_confirm_conntrack(skb);
if (ret == NF_ACCEPT) {
- nf_reset(skb);
+ nf_reset_ct(skb);
skb_forward_csum(skb);
}
return ret;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 81a8ef42b88d..d1305423640f 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -73,8 +73,7 @@ struct conntrack_gc_work {
};
static __read_mostly struct kmem_cache *nf_conntrack_cachep;
-static __read_mostly spinlock_t nf_conntrack_locks_all_lock;
-static __read_mostly DEFINE_SPINLOCK(nf_conntrack_locks_all_lock);
+static DEFINE_SPINLOCK(nf_conntrack_locks_all_lock);
static __read_mostly bool nf_conntrack_locks_all;
/* every gc cycle scans at most 1/GC_MAX_BUCKETS_DIV part of table */
@@ -574,7 +573,6 @@ EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc);
void nf_ct_tmpl_free(struct nf_conn *tmpl)
{
nf_ct_ext_destroy(tmpl);
- nf_ct_ext_free(tmpl);
if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK)
kfree((char *)tmpl - tmpl->proto.tmpl_padto);
@@ -897,9 +895,10 @@ static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
}
/* Resolve race on insertion if this protocol allows this. */
-static int nf_ct_resolve_clash(struct net *net, struct sk_buff *skb,
- enum ip_conntrack_info ctinfo,
- struct nf_conntrack_tuple_hash *h)
+static __cold noinline int
+nf_ct_resolve_clash(struct net *net, struct sk_buff *skb,
+ enum ip_conntrack_info ctinfo,
+ struct nf_conntrack_tuple_hash *h)
{
/* This is the conntrack entry already in hashes that won race. */
struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
@@ -1418,7 +1417,6 @@ void nf_conntrack_free(struct nf_conn *ct)
WARN_ON(atomic_read(&ct->ct_general.use) != 0);
nf_ct_ext_destroy(ct);
- nf_ct_ext_free(ct);
kmem_cache_free(nf_conntrack_cachep, ct);
smp_mb__before_atomic();
atomic_dec(&net->ct.count);
@@ -1793,8 +1791,8 @@ void __nf_ct_refresh_acct(struct nf_conn *ct,
if (nf_ct_is_confirmed(ct))
extra_jiffies += nfct_time_stamp;
- if (ct->timeout != extra_jiffies)
- ct->timeout = extra_jiffies;
+ if (READ_ONCE(ct->timeout) != extra_jiffies)
+ WRITE_ONCE(ct->timeout, extra_jiffies);
acct:
if (do_acct)
nf_ct_acct_update(ct, ctinfo, skb->len);
@@ -2250,8 +2248,7 @@ void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls)
BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head));
nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head));
- hash = kvmalloc_array(nr_slots, sizeof(struct hlist_nulls_head),
- GFP_KERNEL | __GFP_ZERO);
+ hash = kvcalloc(nr_slots, sizeof(struct hlist_nulls_head), GFP_KERNEL);
if (hash && nulls)
for (i = 0; i < nr_slots; i++)
@@ -2336,7 +2333,6 @@ int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp)
return nf_conntrack_hash_resize(hashsize);
}
-EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize);
static __always_inline unsigned int total_extension_size(void)
{
diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
index 5e2812ee2149..7956c9f19899 100644
--- a/net/netfilter/nf_conntrack_ecache.c
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -24,11 +24,13 @@
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_conntrack_extend.h>
static DEFINE_MUTEX(nf_ct_ecache_mutex);
#define ECACHE_RETRY_WAIT (HZ/10)
+#define ECACHE_STACK_ALLOC (256 / sizeof(void *))
enum retry_state {
STATE_CONGESTED,
@@ -38,11 +40,11 @@ enum retry_state {
static enum retry_state ecache_work_evict_list(struct ct_pcpu *pcpu)
{
- struct nf_conn *refs[16];
+ struct nf_conn *refs[ECACHE_STACK_ALLOC];
+ enum retry_state ret = STATE_DONE;
struct nf_conntrack_tuple_hash *h;
struct hlist_nulls_node *n;
unsigned int evicted = 0;
- enum retry_state ret = STATE_DONE;
spin_lock(&pcpu->lock);
@@ -53,10 +55,22 @@ static enum retry_state ecache_work_evict_list(struct ct_pcpu *pcpu)
if (!nf_ct_is_confirmed(ct))
continue;
+ /* This ecache access is safe because the ct is on the
+ * pcpu dying list and we hold the spinlock -- the entry
+ * cannot be free'd until after the lock is released.
+ *
+ * This is true even if ct has a refcount of 0: the
+ * cpu that is about to free the entry must remove it
+ * from the dying list and needs the lock to do so.
+ */
e = nf_ct_ecache_find(ct);
if (!e || e->state != NFCT_ECACHE_DESTROY_FAIL)
continue;
+ /* ct is in NFCT_ECACHE_DESTROY_FAIL state, this means
+ * the worker owns this entry: the ct will remain valid
+ * until the worker puts its ct reference.
+ */
if (nf_conntrack_event(IPCT_DESTROY, ct)) {
ret = STATE_CONGESTED;
break;
@@ -188,15 +202,15 @@ void nf_ct_deliver_cached_events(struct nf_conn *ct)
if (notify == NULL)
goto out_unlock;
+ if (!nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct))
+ goto out_unlock;
+
e = nf_ct_ecache_find(ct);
if (e == NULL)
goto out_unlock;
events = xchg(&e->cache, 0);
- if (!nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct))
- goto out_unlock;
-
/* We make a copy of the missed event cache without taking
* the lock, thus we may send missed events twice. However,
* this does not harm and it happens very rarely. */
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index 65364de915d1..42557d2b6a90 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -25,8 +25,10 @@
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_conntrack_expect.h>
#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_tuple.h>
#include <net/netfilter/nf_conntrack_zones.h>
diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c
index d4ed1e197921..3dbe2329c3f1 100644
--- a/net/netfilter/nf_conntrack_extend.c
+++ b/net/netfilter/nf_conntrack_extend.c
@@ -34,21 +34,23 @@ void nf_ct_ext_destroy(struct nf_conn *ct)
t->destroy(ct);
rcu_read_unlock();
}
+
+ kfree(ct->ext);
}
-EXPORT_SYMBOL(nf_ct_ext_destroy);
void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp)
{
unsigned int newlen, newoff, oldlen, alloc;
- struct nf_ct_ext *old, *new;
struct nf_ct_ext_type *t;
+ struct nf_ct_ext *new;
/* Conntrack must not be confirmed to avoid races on reallocation. */
WARN_ON(nf_ct_is_confirmed(ct));
- old = ct->ext;
- if (old) {
+ if (ct->ext) {
+ const struct nf_ct_ext *old = ct->ext;
+
if (__nf_ct_ext_exist(old, id))
return NULL;
oldlen = old->len;
@@ -68,22 +70,18 @@ void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp)
rcu_read_unlock();
alloc = max(newlen, NF_CT_EXT_PREALLOC);
- kmemleak_not_leak(old);
- new = __krealloc(old, alloc, gfp);
+ new = krealloc(ct->ext, alloc, gfp);
if (!new)
return NULL;
- if (!old) {
+ if (!ct->ext)
memset(new->offset, 0, sizeof(new->offset));
- ct->ext = new;
- } else if (new != old) {
- kfree_rcu(old, rcu);
- rcu_assign_pointer(ct->ext, new);
- }
new->offset[id] = newoff;
new->len = newlen;
memset((void *)new + newoff, 0, newlen - newoff);
+
+ ct->ext = new;
return (void *)new + newoff;
}
EXPORT_SYMBOL(nf_ct_ext_add);
diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c
index 0ecb3e289ef2..9eca90414bb7 100644
--- a/net/netfilter/nf_conntrack_ftp.c
+++ b/net/netfilter/nf_conntrack_ftp.c
@@ -162,7 +162,7 @@ static int try_rfc959(const char *data, size_t dlen,
if (length == 0)
return 0;
- cmd->u3.ip = htonl((array[0] << 24) | (array[1] << 16) |
+ cmd->u3.ip = htonl((array[0] << 24) | (array[1] << 16) |
(array[2] << 8) | array[3]);
cmd->u.tcp.port = htons((array[4] << 8) | array[5]);
return length;
@@ -322,7 +322,7 @@ static int find_pattern(const char *data, size_t dlen,
i++;
}
- pr_debug("Skipped up to `%c'!\n", skip);
+ pr_debug("Skipped up to 0x%hhx delimiter!\n", skip);
*numoff = i;
*numlen = getnum(data + i, dlen - i, cmd, term, numoff);
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index 8d729e7c36ff..118f415928ae 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -21,10 +21,11 @@
#include <linux/rtnetlink.h>
#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_conntrack_l4proto.h>
-#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_conntrack_extend.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_log.h>
static DEFINE_MUTEX(nf_ct_helper_mutex);
diff --git a/net/netfilter/nf_conntrack_labels.c b/net/netfilter/nf_conntrack_labels.c
index 74b8113f7aeb..522792556632 100644
--- a/net/netfilter/nf_conntrack_labels.c
+++ b/net/netfilter/nf_conntrack_labels.c
@@ -11,7 +11,7 @@
#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_conntrack_labels.h>
-static spinlock_t nf_connlabels_lock;
+static DEFINE_SPINLOCK(nf_connlabels_lock);
static int replace_u32(u32 *address, u32 mask, u32 new)
{
@@ -89,7 +89,6 @@ int nf_conntrack_labels_init(void)
{
BUILD_BUG_ON(NF_CT_LABELS_MAX_SIZE / sizeof(long) >= U8_MAX);
- spin_lock_init(&nf_connlabels_lock);
return nf_ct_extend_register(&labels_extend);
}
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 6aa01eb6fe99..6a1c8f1f6171 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -506,9 +506,45 @@ nla_put_failure:
return -1;
}
+/* all these functions access ct->ext. Caller must either hold a reference
+ * on ct or prevent its deletion by holding either the bucket spinlock or
+ * pcpu dying list lock.
+ */
+static int ctnetlink_dump_extinfo(struct sk_buff *skb,
+ struct nf_conn *ct, u32 type)
+{
+ if (ctnetlink_dump_acct(skb, ct, type) < 0 ||
+ ctnetlink_dump_timestamp(skb, ct) < 0 ||
+ ctnetlink_dump_helpinfo(skb, ct) < 0 ||
+ ctnetlink_dump_labels(skb, ct) < 0 ||
+ ctnetlink_dump_ct_seq_adj(skb, ct) < 0 ||
+ ctnetlink_dump_ct_synproxy(skb, ct) < 0)
+ return -1;
+
+ return 0;
+}
+
+static int ctnetlink_dump_info(struct sk_buff *skb, struct nf_conn *ct)
+{
+ if (ctnetlink_dump_status(skb, ct) < 0 ||
+ ctnetlink_dump_mark(skb, ct) < 0 ||
+ ctnetlink_dump_secctx(skb, ct) < 0 ||
+ ctnetlink_dump_id(skb, ct) < 0 ||
+ ctnetlink_dump_use(skb, ct) < 0 ||
+ ctnetlink_dump_master(skb, ct) < 0)
+ return -1;
+
+ if (!test_bit(IPS_OFFLOAD_BIT, &ct->status) &&
+ (ctnetlink_dump_timeout(skb, ct) < 0 ||
+ ctnetlink_dump_protoinfo(skb, ct) < 0))
+ return -1;
+
+ return 0;
+}
+
static int
ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
- struct nf_conn *ct)
+ struct nf_conn *ct, bool extinfo)
{
const struct nf_conntrack_zone *zone;
struct nlmsghdr *nlh;
@@ -552,20 +588,9 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
NF_CT_DEFAULT_ZONE_DIR) < 0)
goto nla_put_failure;
- if (ctnetlink_dump_status(skb, ct) < 0 ||
- ctnetlink_dump_timeout(skb, ct) < 0 ||
- ctnetlink_dump_acct(skb, ct, type) < 0 ||
- ctnetlink_dump_timestamp(skb, ct) < 0 ||
- ctnetlink_dump_protoinfo(skb, ct) < 0 ||
- ctnetlink_dump_helpinfo(skb, ct) < 0 ||
- ctnetlink_dump_mark(skb, ct) < 0 ||
- ctnetlink_dump_secctx(skb, ct) < 0 ||
- ctnetlink_dump_labels(skb, ct) < 0 ||
- ctnetlink_dump_id(skb, ct) < 0 ||
- ctnetlink_dump_use(skb, ct) < 0 ||
- ctnetlink_dump_master(skb, ct) < 0 ||
- ctnetlink_dump_ct_seq_adj(skb, ct) < 0 ||
- ctnetlink_dump_ct_synproxy(skb, ct) < 0)
+ if (ctnetlink_dump_info(skb, ct) < 0)
+ goto nla_put_failure;
+ if (extinfo && ctnetlink_dump_extinfo(skb, ct, type) < 0)
goto nla_put_failure;
nlmsg_end(skb, nlh);
@@ -950,13 +975,11 @@ restart:
if (!ctnetlink_filter_match(ct, cb->data))
continue;
- rcu_read_lock();
res =
ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
- ct);
- rcu_read_unlock();
+ ct, true);
if (res < 0) {
nf_conntrack_get(&ct->ct_general);
cb->args[1] = (unsigned long)ct;
@@ -1361,10 +1384,8 @@ static int ctnetlink_get_conntrack(struct net *net, struct sock *ctnl,
return -ENOMEM;
}
- rcu_read_lock();
err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
- NFNL_MSG_TYPE(nlh->nlmsg_type), ct);
- rcu_read_unlock();
+ NFNL_MSG_TYPE(nlh->nlmsg_type), ct, true);
nf_ct_put(ct);
if (err <= 0)
goto free;
@@ -1426,12 +1447,18 @@ restart:
continue;
cb->args[1] = 0;
}
- rcu_read_lock();
+
+ /* We can't dump extension info for the unconfirmed
+ * list because unconfirmed conntracks can have
+ * ct->ext reallocated (and thus freed).
+ *
+ * In the dying list case ct->ext can't be free'd
+ * until after we drop pcpu->lock.
+ */
res = ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
- ct);
- rcu_read_unlock();
+ ct, dying ? true : false);
if (res < 0) {
if (!atomic_inc_not_zero(&ct->ct_general.use))
continue;
@@ -3599,6 +3626,9 @@ static void __net_exit ctnetlink_net_exit_batch(struct list_head *net_exit_list)
list_for_each_entry(net, net_exit_list, exit_list)
ctnetlink_net_exit(net);
+
+ /* wait for other cpus until they are done with ctnl_notifiers */
+ synchronize_rcu();
}
static struct pernet_operations ctnetlink_net_ops = {
diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
index b6b14db3955b..b3f4a334f9d7 100644
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -677,6 +677,9 @@ static int dccp_timeout_nlattr_to_obj(struct nlattr *tb[],
unsigned int *timeouts = data;
int i;
+ if (!timeouts)
+ timeouts = dn->dccp_timeout;
+
/* set default DCCP timeouts. */
for (i=0; i<CT_DCCP_MAX; i++)
timeouts[i] = dn->dccp_timeout[i];
diff --git a/net/netfilter/nf_conntrack_proto_icmp.c b/net/netfilter/nf_conntrack_proto_icmp.c
index 097deba7441a..c2e3dff773bc 100644
--- a/net/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/netfilter/nf_conntrack_proto_icmp.c
@@ -235,11 +235,7 @@ int nf_conntrack_icmpv4_error(struct nf_conn *tmpl,
}
/* Need to track icmp error message? */
- if (icmph->type != ICMP_DEST_UNREACH &&
- icmph->type != ICMP_SOURCE_QUENCH &&
- icmph->type != ICMP_TIME_EXCEEDED &&
- icmph->type != ICMP_PARAMETERPROB &&
- icmph->type != ICMP_REDIRECT)
+ if (!icmp_is_err(icmph->type))
return NF_ACCEPT;
memset(&outer_daddr, 0, sizeof(outer_daddr));
diff --git a/net/netfilter/nf_conntrack_proto_icmpv6.c b/net/netfilter/nf_conntrack_proto_icmpv6.c
index 7e317e6698ba..6f9144e1f1c1 100644
--- a/net/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/netfilter/nf_conntrack_proto_icmpv6.c
@@ -22,7 +22,6 @@
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_timeout.h>
#include <net/netfilter/nf_conntrack_zones.h>
-#include <net/netfilter/ipv6/nf_conntrack_icmpv6.h>
#include <net/netfilter/nf_log.h>
static const unsigned int nf_ct_icmpv6_timeout = 30*HZ;
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index fce3d93f1541..4f897b14b606 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -114,7 +114,7 @@ static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = {
{
/* ORIGINAL */
/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA */
-/* init */ {sCW, sCW, sCW, sCE, sES, sSS, sSR, sSA, sCW, sHA},
+/* init */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCW, sHA},
/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA},
/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA, sCL, sSS},
@@ -130,7 +130,7 @@ static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = {
/* REPLY */
/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA */
/* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA},/* INIT in sCL Big TODO */
-/* init_ack */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA},
+/* init_ack */ {sIV, sCW, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA},
/* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV, sCL},
/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA, sIV, sSR},
/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA, sIV, sHA},
@@ -316,7 +316,7 @@ sctp_new(struct nf_conn *ct, const struct sk_buff *skb,
ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = sh->vtag;
}
- ct->proto.sctp.state = new_state;
+ ct->proto.sctp.state = SCTP_CONNTRACK_NONE;
}
return true;
@@ -594,6 +594,9 @@ static int sctp_timeout_nlattr_to_obj(struct nlattr *tb[],
struct nf_sctp_net *sn = nf_sctp_pernet(net);
int i;
+ if (!timeouts)
+ timeouts = sn->timeouts;
+
/* set default SCTP timeouts. */
for (i=0; i<SCTP_CONNTRACK_MAX; i++)
timeouts[i] = sn->timeouts[i];
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 85c1f8c213b0..1926fd56df56 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -1227,7 +1227,7 @@ static const struct nla_policy tcp_nla_policy[CTA_PROTOINFO_TCP_MAX+1] = {
[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL] = { .type = NLA_U8 },
[CTA_PROTOINFO_TCP_WSCALE_REPLY] = { .type = NLA_U8 },
[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL] = { .len = sizeof(struct nf_ct_tcp_flags) },
- [CTA_PROTOINFO_TCP_FLAGS_REPLY] = { .len = sizeof(struct nf_ct_tcp_flags) },
+ [CTA_PROTOINFO_TCP_FLAGS_REPLY] = { .len = sizeof(struct nf_ct_tcp_flags) },
};
#define TCP_NLATTR_SIZE ( \
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index e0d392cb3075..410809c669e1 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -511,8 +511,6 @@ static void nf_conntrack_standalone_fini_proc(struct net *net)
/* Log invalid packets of a given protocol */
static int log_invalid_proto_min __read_mostly;
static int log_invalid_proto_max __read_mostly = 255;
-static int zero;
-static int one = 1;
/* size the user *wants to set */
static unsigned int nf_conntrack_htable_size_user __read_mostly;
@@ -629,8 +627,8 @@ static struct ctl_table nf_ct_sysctl_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
[NF_SYSCTL_CT_LOG_INVALID] = {
.procname = "nf_conntrack_log_invalid",
@@ -654,8 +652,8 @@ static struct ctl_table nf_ct_sysctl_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
[NF_SYSCTL_CT_HELPER] = {
.procname = "nf_conntrack_helper",
@@ -663,8 +661,8 @@ static struct ctl_table nf_ct_sysctl_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
#ifdef CONFIG_NF_CONNTRACK_EVENTS
[NF_SYSCTL_CT_EVENTS] = {
@@ -673,8 +671,8 @@ static struct ctl_table nf_ct_sysctl_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
#endif
#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
@@ -684,8 +682,8 @@ static struct ctl_table nf_ct_sysctl_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
#endif
[NF_SYSCTL_CT_PROTO_TIMEOUT_GENERIC] = {
@@ -759,16 +757,16 @@ static struct ctl_table nf_ct_sysctl_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
[NF_SYSCTL_CT_PROTO_TCP_LIBERAL] = {
.procname = "nf_conntrack_tcp_be_liberal",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
[NF_SYSCTL_CT_PROTO_TCP_MAX_RETRANS] = {
.procname = "nf_conntrack_tcp_max_retrans",
@@ -904,8 +902,8 @@ static struct ctl_table nf_ct_sysctl_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
#endif
#ifdef CONFIG_NF_CT_PROTO_GRE
@@ -1037,9 +1035,14 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
table[NF_SYSCTL_CT_COUNT].data = &net->ct.count;
table[NF_SYSCTL_CT_CHECKSUM].data = &net->ct.sysctl_checksum;
table[NF_SYSCTL_CT_LOG_INVALID].data = &net->ct.sysctl_log_invalid;
+ table[NF_SYSCTL_CT_ACCT].data = &net->ct.sysctl_acct;
+ table[NF_SYSCTL_CT_HELPER].data = &net->ct.sysctl_auto_assign_helper;
#ifdef CONFIG_NF_CONNTRACK_EVENTS
table[NF_SYSCTL_CT_EVENTS].data = &net->ct.sysctl_events;
#endif
+#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
+ table[NF_SYSCTL_CT_TIMESTAMP].data = &net->ct.sysctl_tstamp;
+#endif
table[NF_SYSCTL_CT_PROTO_TIMEOUT_GENERIC].data = &nf_generic_pernet(net)->timeout;
table[NF_SYSCTL_CT_PROTO_TIMEOUT_ICMP].data = &nf_icmp_pernet(net)->timeout;
table[NF_SYSCTL_CT_PROTO_TIMEOUT_ICMPV6].data = &nf_icmpv6_pernet(net)->timeout;
@@ -1164,7 +1167,6 @@ static int __init nf_conntrack_standalone_init(void)
if (ret < 0)
goto out_start;
- BUILD_BUG_ON(SKB_NFCT_PTRMASK != NFCT_PTRMASK);
BUILD_BUG_ON(NFCT_INFOMASK <= IP_CT_NUMBER);
#ifdef CONFIG_SYSCTL
diff --git a/net/netfilter/nf_conntrack_timeout.c b/net/netfilter/nf_conntrack_timeout.c
index 13d0f4a92647..14387e0b8008 100644
--- a/net/netfilter/nf_conntrack_timeout.c
+++ b/net/netfilter/nf_conntrack_timeout.c
@@ -19,6 +19,7 @@
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_extend.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_timeout.h>
struct nf_ct_timeout *
diff --git a/net/netfilter/nf_dup_netdev.c b/net/netfilter/nf_dup_netdev.c
index 5a35ef08c3cb..f108a76925dd 100644
--- a/net/netfilter/nf_dup_netdev.c
+++ b/net/netfilter/nf_dup_netdev.c
@@ -10,6 +10,7 @@
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_offload.h>
#include <net/netfilter/nf_dup_netdev.h>
static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev)
@@ -50,5 +51,25 @@ void nf_dup_netdev_egress(const struct nft_pktinfo *pkt, int oif)
}
EXPORT_SYMBOL_GPL(nf_dup_netdev_egress);
+int nft_fwd_dup_netdev_offload(struct nft_offload_ctx *ctx,
+ struct nft_flow_rule *flow,
+ enum flow_action_id id, int oif)
+{
+ struct flow_action_entry *entry;
+ struct net_device *dev;
+
+ /* nft_flow_rule_destroy() releases the reference on this device. */
+ dev = dev_get_by_index(ctx->net, oif);
+ if (!dev)
+ return -EOPNOTSUPP;
+
+ entry = &flow->rule->action.entries[ctx->num_actions++];
+ entry->id = id;
+ entry->dev = dev;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nft_fwd_dup_netdev_offload);
+
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index 80a8f9ae4c93..8af28e10b4e6 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -11,26 +11,18 @@
#include <net/netfilter/nf_flow_table.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_tuple.h>
-struct flow_offload_entry {
- struct flow_offload flow;
- struct nf_conn *ct;
- struct rcu_head rcu_head;
-};
-
static DEFINE_MUTEX(flowtable_lock);
static LIST_HEAD(flowtables);
static void
-flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct,
- struct nf_flow_route *route,
+flow_offload_fill_dir(struct flow_offload *flow,
enum flow_offload_tuple_dir dir)
{
struct flow_offload_tuple *ft = &flow->tuplehash[dir].tuple;
- struct nf_conntrack_tuple *ctt = &ct->tuplehash[dir].tuple;
- struct dst_entry *other_dst = route->tuple[!dir].dst;
- struct dst_entry *dst = route->tuple[dir].dst;
+ struct nf_conntrack_tuple *ctt = &flow->ct->tuplehash[dir].tuple;
ft->dir = dir;
@@ -38,12 +30,10 @@ flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct,
case NFPROTO_IPV4:
ft->src_v4 = ctt->src.u3.in;
ft->dst_v4 = ctt->dst.u3.in;
- ft->mtu = ip_dst_mtu_maybe_forward(dst, true);
break;
case NFPROTO_IPV6:
ft->src_v6 = ctt->src.u3.in6;
ft->dst_v6 = ctt->dst.u3.in6;
- ft->mtu = ip6_dst_mtu_forward(dst);
break;
}
@@ -51,49 +41,32 @@ flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct,
ft->l4proto = ctt->dst.protonum;
ft->src_port = ctt->src.u.tcp.port;
ft->dst_port = ctt->dst.u.tcp.port;
-
- ft->iifidx = other_dst->dev->ifindex;
- ft->dst_cache = dst;
}
-struct flow_offload *
-flow_offload_alloc(struct nf_conn *ct, struct nf_flow_route *route)
+struct flow_offload *flow_offload_alloc(struct nf_conn *ct)
{
- struct flow_offload_entry *entry;
struct flow_offload *flow;
if (unlikely(nf_ct_is_dying(ct) ||
!atomic_inc_not_zero(&ct->ct_general.use)))
return NULL;
- entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
- if (!entry)
+ flow = kzalloc(sizeof(*flow), GFP_ATOMIC);
+ if (!flow)
goto err_ct_refcnt;
- flow = &entry->flow;
+ flow->ct = ct;
- if (!dst_hold_safe(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst))
- goto err_dst_cache_original;
-
- if (!dst_hold_safe(route->tuple[FLOW_OFFLOAD_DIR_REPLY].dst))
- goto err_dst_cache_reply;
-
- entry->ct = ct;
-
- flow_offload_fill_dir(flow, ct, route, FLOW_OFFLOAD_DIR_ORIGINAL);
- flow_offload_fill_dir(flow, ct, route, FLOW_OFFLOAD_DIR_REPLY);
+ flow_offload_fill_dir(flow, FLOW_OFFLOAD_DIR_ORIGINAL);
+ flow_offload_fill_dir(flow, FLOW_OFFLOAD_DIR_REPLY);
if (ct->status & IPS_SRC_NAT)
- flow->flags |= FLOW_OFFLOAD_SNAT;
+ __set_bit(NF_FLOW_SNAT, &flow->flags);
if (ct->status & IPS_DST_NAT)
- flow->flags |= FLOW_OFFLOAD_DNAT;
+ __set_bit(NF_FLOW_DNAT, &flow->flags);
return flow;
-err_dst_cache_reply:
- dst_release(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst);
-err_dst_cache_original:
- kfree(entry);
err_ct_refcnt:
nf_ct_put(ct);
@@ -101,6 +74,56 @@ err_ct_refcnt:
}
EXPORT_SYMBOL_GPL(flow_offload_alloc);
+static int flow_offload_fill_route(struct flow_offload *flow,
+ const struct nf_flow_route *route,
+ enum flow_offload_tuple_dir dir)
+{
+ struct flow_offload_tuple *flow_tuple = &flow->tuplehash[dir].tuple;
+ struct dst_entry *other_dst = route->tuple[!dir].dst;
+ struct dst_entry *dst = route->tuple[dir].dst;
+
+ if (!dst_hold_safe(route->tuple[dir].dst))
+ return -1;
+
+ switch (flow_tuple->l3proto) {
+ case NFPROTO_IPV4:
+ flow_tuple->mtu = ip_dst_mtu_maybe_forward(dst, true);
+ break;
+ case NFPROTO_IPV6:
+ flow_tuple->mtu = ip6_dst_mtu_forward(dst);
+ break;
+ }
+
+ flow_tuple->iifidx = other_dst->dev->ifindex;
+ flow_tuple->dst_cache = dst;
+
+ return 0;
+}
+
+int flow_offload_route_init(struct flow_offload *flow,
+ const struct nf_flow_route *route)
+{
+ int err;
+
+ err = flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_ORIGINAL);
+ if (err < 0)
+ return err;
+
+ err = flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_REPLY);
+ if (err < 0)
+ goto err_route_reply;
+
+ flow->type = NF_FLOW_OFFLOAD_ROUTE;
+
+ return 0;
+
+err_route_reply:
+ dst_release(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(flow_offload_route_init);
+
static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp)
{
tcp->state = TCP_CONNTRACK_ESTABLISHED;
@@ -111,11 +134,6 @@ static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp)
#define NF_FLOWTABLE_TCP_PICKUP_TIMEOUT (120 * HZ)
#define NF_FLOWTABLE_UDP_PICKUP_TIMEOUT (30 * HZ)
-static inline __s32 nf_flow_timeout_delta(unsigned int timeout)
-{
- return (__s32)(timeout - (u32)jiffies);
-}
-
static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
{
const struct nf_conntrack_l4proto *l4proto;
@@ -149,17 +167,23 @@ static void flow_offload_fixup_ct(struct nf_conn *ct)
flow_offload_fixup_ct_timeout(ct);
}
-void flow_offload_free(struct flow_offload *flow)
+static void flow_offload_route_release(struct flow_offload *flow)
{
- struct flow_offload_entry *e;
-
dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache);
dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache);
- e = container_of(flow, struct flow_offload_entry, flow);
- if (flow->flags & FLOW_OFFLOAD_DYING)
- nf_ct_delete(e->ct, 0, 0);
- nf_ct_put(e->ct);
- kfree_rcu(e, rcu_head);
+}
+
+void flow_offload_free(struct flow_offload *flow)
+{
+ switch (flow->type) {
+ case NF_FLOW_OFFLOAD_ROUTE:
+ flow_offload_route_release(flow);
+ break;
+ default:
+ break;
+ }
+ nf_ct_put(flow->ct);
+ kfree_rcu(flow, rcu_head);
}
EXPORT_SYMBOL_GPL(flow_offload_free);
@@ -201,6 +225,8 @@ int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow)
{
int err;
+ flow->timeout = nf_flowtable_time_stamp + NF_FLOW_TIMEOUT;
+
err = rhashtable_insert_fast(&flow_table->rhashtable,
&flow->tuplehash[0].node,
nf_flow_offload_rhash_params);
@@ -217,7 +243,11 @@ int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow)
return err;
}
- flow->timeout = (u32)jiffies;
+ if (nf_flowtable_hw_offload(flow_table)) {
+ __set_bit(NF_FLOW_HW, &flow->flags);
+ nf_flow_offload_add(flow_table, flow);
+ }
+
return 0;
}
EXPORT_SYMBOL_GPL(flow_offload_add);
@@ -230,8 +260,6 @@ static inline bool nf_flow_has_expired(const struct flow_offload *flow)
static void flow_offload_del(struct nf_flowtable *flow_table,
struct flow_offload *flow)
{
- struct flow_offload_entry *e;
-
rhashtable_remove_fast(&flow_table->rhashtable,
&flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
nf_flow_offload_rhash_params);
@@ -239,25 +267,21 @@ static void flow_offload_del(struct nf_flowtable *flow_table,
&flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
nf_flow_offload_rhash_params);
- e = container_of(flow, struct flow_offload_entry, flow);
- clear_bit(IPS_OFFLOAD_BIT, &e->ct->status);
+ clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status);
if (nf_flow_has_expired(flow))
- flow_offload_fixup_ct(e->ct);
- else if (flow->flags & FLOW_OFFLOAD_TEARDOWN)
- flow_offload_fixup_ct_timeout(e->ct);
+ flow_offload_fixup_ct(flow->ct);
+ else if (test_bit(NF_FLOW_TEARDOWN, &flow->flags))
+ flow_offload_fixup_ct_timeout(flow->ct);
flow_offload_free(flow);
}
void flow_offload_teardown(struct flow_offload *flow)
{
- struct flow_offload_entry *e;
-
- flow->flags |= FLOW_OFFLOAD_TEARDOWN;
+ set_bit(NF_FLOW_TEARDOWN, &flow->flags);
- e = container_of(flow, struct flow_offload_entry, flow);
- flow_offload_fixup_ct_state(e->ct);
+ flow_offload_fixup_ct_state(flow->ct);
}
EXPORT_SYMBOL_GPL(flow_offload_teardown);
@@ -267,7 +291,6 @@ flow_offload_lookup(struct nf_flowtable *flow_table,
{
struct flow_offload_tuple_rhash *tuplehash;
struct flow_offload *flow;
- struct flow_offload_entry *e;
int dir;
tuplehash = rhashtable_lookup(&flow_table->rhashtable, tuple,
@@ -277,11 +300,10 @@ flow_offload_lookup(struct nf_flowtable *flow_table,
dir = tuplehash->tuple.dir;
flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
- if (flow->flags & (FLOW_OFFLOAD_DYING | FLOW_OFFLOAD_TEARDOWN))
+ if (test_bit(NF_FLOW_TEARDOWN, &flow->flags))
return NULL;
- e = container_of(flow, struct flow_offload_entry, flow);
- if (unlikely(nf_ct_is_dying(e->ct)))
+ if (unlikely(nf_ct_is_dying(flow->ct)))
return NULL;
return tuplehash;
@@ -325,12 +347,20 @@ nf_flow_table_iterate(struct nf_flowtable *flow_table,
static void nf_flow_offload_gc_step(struct flow_offload *flow, void *data)
{
struct nf_flowtable *flow_table = data;
- struct flow_offload_entry *e;
- e = container_of(flow, struct flow_offload_entry, flow);
- if (nf_flow_has_expired(flow) || nf_ct_is_dying(e->ct) ||
- (flow->flags & (FLOW_OFFLOAD_DYING | FLOW_OFFLOAD_TEARDOWN)))
- flow_offload_del(flow_table, flow);
+ if (nf_flow_has_expired(flow) || nf_ct_is_dying(flow->ct) ||
+ test_bit(NF_FLOW_TEARDOWN, &flow->flags)) {
+ if (test_bit(NF_FLOW_HW, &flow->flags)) {
+ if (!test_bit(NF_FLOW_HW_DYING, &flow->flags))
+ nf_flow_offload_del(flow_table, flow);
+ else if (test_bit(NF_FLOW_HW_DEAD, &flow->flags))
+ flow_offload_del(flow_table, flow);
+ } else {
+ flow_offload_del(flow_table, flow);
+ }
+ } else if (test_bit(NF_FLOW_HW, &flow->flags)) {
+ nf_flow_offload_stats(flow_table, flow);
+ }
}
static void nf_flow_offload_work_gc(struct work_struct *work)
@@ -463,6 +493,7 @@ int nf_flow_table_init(struct nf_flowtable *flowtable)
int err;
INIT_DEFERRABLE_WORK(&flowtable->gc_work, nf_flow_offload_work_gc);
+ flow_block_init(&flowtable->flow_block);
err = rhashtable_init(&flowtable->rhashtable,
&nf_flow_offload_rhash_params);
@@ -483,18 +514,16 @@ EXPORT_SYMBOL_GPL(nf_flow_table_init);
static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data)
{
struct net_device *dev = data;
- struct flow_offload_entry *e;
-
- e = container_of(flow, struct flow_offload_entry, flow);
if (!dev) {
flow_offload_teardown(flow);
return;
}
- if (net_eq(nf_ct_net(e->ct), dev_net(dev)) &&
+
+ if (net_eq(nf_ct_net(flow->ct), dev_net(dev)) &&
(flow->tuplehash[0].tuple.iifidx == dev->ifindex ||
flow->tuplehash[1].tuple.iifidx == dev->ifindex))
- flow_offload_dead(flow);
+ flow_offload_teardown(flow);
}
static void nf_flow_table_iterate_cleanup(struct nf_flowtable *flowtable,
@@ -502,6 +531,7 @@ static void nf_flow_table_iterate_cleanup(struct nf_flowtable *flowtable,
{
nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, dev);
flush_delayed_work(&flowtable->gc_work);
+ nf_flow_table_offload_flush(flowtable);
}
void nf_flow_table_cleanup(struct net_device *dev)
@@ -523,9 +553,23 @@ void nf_flow_table_free(struct nf_flowtable *flow_table)
cancel_delayed_work_sync(&flow_table->gc_work);
nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL);
nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, flow_table);
+ nf_flow_table_offload_flush(flow_table);
rhashtable_destroy(&flow_table->rhashtable);
}
EXPORT_SYMBOL_GPL(nf_flow_table_free);
+static int __init nf_flow_table_module_init(void)
+{
+ return nf_flow_table_offload_init();
+}
+
+static void __exit nf_flow_table_module_exit(void)
+{
+ nf_flow_table_offload_exit();
+}
+
+module_init(nf_flow_table_module_init);
+module_exit(nf_flow_table_module_exit);
+
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
diff --git a/net/netfilter/nf_flow_table_inet.c b/net/netfilter/nf_flow_table_inet.c
index 593357aedb36..88bedf1ff1ae 100644
--- a/net/netfilter/nf_flow_table_inet.c
+++ b/net/netfilter/nf_flow_table_inet.c
@@ -21,9 +21,34 @@ nf_flow_offload_inet_hook(void *priv, struct sk_buff *skb,
return NF_ACCEPT;
}
+static int nf_flow_rule_route_inet(struct net *net,
+ const struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir,
+ struct nf_flow_rule *flow_rule)
+{
+ const struct flow_offload_tuple *flow_tuple = &flow->tuplehash[dir].tuple;
+ int err;
+
+ switch (flow_tuple->l3proto) {
+ case NFPROTO_IPV4:
+ err = nf_flow_rule_route_ipv4(net, flow, dir, flow_rule);
+ break;
+ case NFPROTO_IPV6:
+ err = nf_flow_rule_route_ipv6(net, flow, dir, flow_rule);
+ break;
+ default:
+ err = -1;
+ break;
+ }
+
+ return err;
+}
+
static struct nf_flowtable_type flowtable_inet = {
.family = NFPROTO_INET,
.init = nf_flow_table_init,
+ .setup = nf_flow_table_offload_setup,
+ .action = nf_flow_rule_route_inet,
.free = nf_flow_table_free,
.hook = nf_flow_offload_inet_hook,
.owner = THIS_MODULE,
diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
index d68c801dd614..9e563fd3da0f 100644
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -144,11 +144,11 @@ static int nf_flow_nat_ip(const struct flow_offload *flow, struct sk_buff *skb,
{
struct iphdr *iph = ip_hdr(skb);
- if (flow->flags & FLOW_OFFLOAD_SNAT &&
+ if (test_bit(NF_FLOW_SNAT, &flow->flags) &&
(nf_flow_snat_port(flow, skb, thoff, iph->protocol, dir) < 0 ||
nf_flow_snat_ip(flow, skb, iph, thoff, dir) < 0))
return -1;
- if (flow->flags & FLOW_OFFLOAD_DNAT &&
+ if (test_bit(NF_FLOW_DNAT, &flow->flags) &&
(nf_flow_dnat_port(flow, skb, thoff, iph->protocol, dir) < 0 ||
nf_flow_dnat_ip(flow, skb, iph, thoff, dir) < 0))
return -1;
@@ -228,11 +228,17 @@ static unsigned int nf_flow_xmit_xfrm(struct sk_buff *skb,
{
skb_orphan(skb);
skb_dst_set_noref(skb, dst);
- skb->tstamp = 0;
dst_output(state->net, state->sk, skb);
return NF_STOLEN;
}
+static bool nf_flow_offload_refresh(struct nf_flowtable *flow_table,
+ struct flow_offload *flow)
+{
+ return nf_flowtable_hw_offload(flow_table) &&
+ test_and_clear_bit(NF_FLOW_HW_REFRESH, &flow->flags);
+}
+
unsigned int
nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
@@ -273,6 +279,9 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
if (nf_flow_state_check(flow, ip_hdr(skb)->protocol, skb, thoff))
return NF_ACCEPT;
+ if (unlikely(nf_flow_offload_refresh(flow_table, flow)))
+ nf_flow_offload_add(flow_table, flow);
+
if (nf_flow_offload_dst_check(&rt->dst)) {
flow_offload_teardown(flow);
return NF_ACCEPT;
@@ -281,9 +290,10 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
if (nf_flow_nat_ip(flow, skb, thoff, dir) < 0)
return NF_DROP;
- flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT;
+ flow->timeout = nf_flowtable_time_stamp + NF_FLOW_TIMEOUT;
iph = ip_hdr(skb);
ip_decrease_ttl(iph);
+ skb->tstamp = 0;
if (unlikely(dst_xfrm(&rt->dst))) {
memset(skb->cb, 0, sizeof(struct inet_skb_parm));
@@ -414,11 +424,11 @@ static int nf_flow_nat_ipv6(const struct flow_offload *flow,
struct ipv6hdr *ip6h = ipv6_hdr(skb);
unsigned int thoff = sizeof(*ip6h);
- if (flow->flags & FLOW_OFFLOAD_SNAT &&
+ if (test_bit(NF_FLOW_SNAT, &flow->flags) &&
(nf_flow_snat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 ||
nf_flow_snat_ipv6(flow, skb, ip6h, thoff, dir) < 0))
return -1;
- if (flow->flags & FLOW_OFFLOAD_DNAT &&
+ if (test_bit(NF_FLOW_DNAT, &flow->flags) &&
(nf_flow_dnat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 ||
nf_flow_dnat_ipv6(flow, skb, ip6h, thoff, dir) < 0))
return -1;
@@ -498,6 +508,9 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
sizeof(*ip6h)))
return NF_ACCEPT;
+ if (unlikely(nf_flow_offload_refresh(flow_table, flow)))
+ nf_flow_offload_add(flow_table, flow);
+
if (nf_flow_offload_dst_check(&rt->dst)) {
flow_offload_teardown(flow);
return NF_ACCEPT;
@@ -509,9 +522,10 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
if (nf_flow_nat_ipv6(flow, skb, dir) < 0)
return NF_DROP;
- flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT;
+ flow->timeout = nf_flowtable_time_stamp + NF_FLOW_TIMEOUT;
ip6h = ipv6_hdr(skb);
ip6h->hop_limit--;
+ skb->tstamp = 0;
if (unlikely(dst_xfrm(&rt->dst))) {
memset(skb->cb, 0, sizeof(struct inet6_skb_parm));
diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c
new file mode 100644
index 000000000000..83e1db37c3b0
--- /dev/null
+++ b/net/netfilter/nf_flow_table_offload.c
@@ -0,0 +1,905 @@
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/rhashtable.h>
+#include <linux/netdevice.h>
+#include <linux/tc_act/tc_csum.h>
+#include <net/flow_offload.h>
+#include <net/netfilter/nf_flow_table.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_tuple.h>
+
+static struct work_struct nf_flow_offload_work;
+static DEFINE_SPINLOCK(flow_offload_pending_list_lock);
+static LIST_HEAD(flow_offload_pending_list);
+
+struct flow_offload_work {
+ struct list_head list;
+ enum flow_cls_command cmd;
+ int priority;
+ struct nf_flowtable *flowtable;
+ struct flow_offload *flow;
+};
+
+struct nf_flow_key {
+ struct flow_dissector_key_meta meta;
+ struct flow_dissector_key_control control;
+ struct flow_dissector_key_basic basic;
+ union {
+ struct flow_dissector_key_ipv4_addrs ipv4;
+ struct flow_dissector_key_ipv6_addrs ipv6;
+ };
+ struct flow_dissector_key_tcp tcp;
+ struct flow_dissector_key_ports tp;
+} __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */
+
+struct nf_flow_match {
+ struct flow_dissector dissector;
+ struct nf_flow_key key;
+ struct nf_flow_key mask;
+};
+
+struct nf_flow_rule {
+ struct nf_flow_match match;
+ struct flow_rule *rule;
+};
+
+#define NF_FLOW_DISSECTOR(__match, __type, __field) \
+ (__match)->dissector.offset[__type] = \
+ offsetof(struct nf_flow_key, __field)
+
+static int nf_flow_rule_match(struct nf_flow_match *match,
+ const struct flow_offload_tuple *tuple)
+{
+ struct nf_flow_key *mask = &match->mask;
+ struct nf_flow_key *key = &match->key;
+
+ NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_META, meta);
+ NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_CONTROL, control);
+ NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_BASIC, basic);
+ NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4);
+ NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6);
+ NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_TCP, tcp);
+ NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_PORTS, tp);
+
+ key->meta.ingress_ifindex = tuple->iifidx;
+ mask->meta.ingress_ifindex = 0xffffffff;
+
+ switch (tuple->l3proto) {
+ case AF_INET:
+ key->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ key->basic.n_proto = htons(ETH_P_IP);
+ key->ipv4.src = tuple->src_v4.s_addr;
+ mask->ipv4.src = 0xffffffff;
+ key->ipv4.dst = tuple->dst_v4.s_addr;
+ mask->ipv4.dst = 0xffffffff;
+ break;
+ case AF_INET6:
+ key->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ key->basic.n_proto = htons(ETH_P_IPV6);
+ key->ipv6.src = tuple->src_v6;
+ memset(&mask->ipv6.src, 0xff, sizeof(mask->ipv6.src));
+ key->ipv6.dst = tuple->dst_v6;
+ memset(&mask->ipv6.dst, 0xff, sizeof(mask->ipv6.dst));
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+ match->dissector.used_keys |= BIT(key->control.addr_type);
+ mask->basic.n_proto = 0xffff;
+
+ switch (tuple->l4proto) {
+ case IPPROTO_TCP:
+ key->tcp.flags = 0;
+ mask->tcp.flags = cpu_to_be16(be32_to_cpu(TCP_FLAG_RST | TCP_FLAG_FIN) >> 16);
+ match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_TCP);
+ break;
+ case IPPROTO_UDP:
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ key->basic.ip_proto = tuple->l4proto;
+ mask->basic.ip_proto = 0xff;
+
+ key->tp.src = tuple->src_port;
+ mask->tp.src = 0xffff;
+ key->tp.dst = tuple->dst_port;
+ mask->tp.dst = 0xffff;
+
+ match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_META) |
+ BIT(FLOW_DISSECTOR_KEY_CONTROL) |
+ BIT(FLOW_DISSECTOR_KEY_BASIC) |
+ BIT(FLOW_DISSECTOR_KEY_PORTS);
+ return 0;
+}
+
+static void flow_offload_mangle(struct flow_action_entry *entry,
+ enum flow_action_mangle_base htype, u32 offset,
+ const __be32 *value, const __be32 *mask)
+{
+ entry->id = FLOW_ACTION_MANGLE;
+ entry->mangle.htype = htype;
+ entry->mangle.offset = offset;
+ memcpy(&entry->mangle.mask, mask, sizeof(u32));
+ memcpy(&entry->mangle.val, value, sizeof(u32));
+}
+
+static inline struct flow_action_entry *
+flow_action_entry_next(struct nf_flow_rule *flow_rule)
+{
+ int i = flow_rule->rule->action.num_entries++;
+
+ return &flow_rule->rule->action.entries[i];
+}
+
+static int flow_offload_eth_src(struct net *net,
+ const struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir,
+ struct nf_flow_rule *flow_rule)
+{
+ const struct flow_offload_tuple *tuple = &flow->tuplehash[!dir].tuple;
+ struct flow_action_entry *entry0 = flow_action_entry_next(flow_rule);
+ struct flow_action_entry *entry1 = flow_action_entry_next(flow_rule);
+ struct net_device *dev;
+ u32 mask, val;
+ u16 val16;
+
+ dev = dev_get_by_index(net, tuple->iifidx);
+ if (!dev)
+ return -ENOENT;
+
+ mask = ~0xffff0000;
+ memcpy(&val16, dev->dev_addr, 2);
+ val = val16 << 16;
+ flow_offload_mangle(entry0, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 4,
+ &val, &mask);
+
+ mask = ~0xffffffff;
+ memcpy(&val, dev->dev_addr + 2, 4);
+ flow_offload_mangle(entry1, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 8,
+ &val, &mask);
+ dev_put(dev);
+
+ return 0;
+}
+
+static int flow_offload_eth_dst(struct net *net,
+ const struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir,
+ struct nf_flow_rule *flow_rule)
+{
+ struct flow_action_entry *entry0 = flow_action_entry_next(flow_rule);
+ struct flow_action_entry *entry1 = flow_action_entry_next(flow_rule);
+ const void *daddr = &flow->tuplehash[!dir].tuple.src_v4;
+ const struct dst_entry *dst_cache;
+ unsigned char ha[ETH_ALEN];
+ struct neighbour *n;
+ u32 mask, val;
+ u8 nud_state;
+ u16 val16;
+
+ dst_cache = flow->tuplehash[dir].tuple.dst_cache;
+ n = dst_neigh_lookup(dst_cache, daddr);
+ if (!n)
+ return -ENOENT;
+
+ read_lock_bh(&n->lock);
+ nud_state = n->nud_state;
+ ether_addr_copy(ha, n->ha);
+ read_unlock_bh(&n->lock);
+
+ if (!(nud_state & NUD_VALID)) {
+ neigh_release(n);
+ return -ENOENT;
+ }
+
+ mask = ~0xffffffff;
+ memcpy(&val, ha, 4);
+ flow_offload_mangle(entry0, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 0,
+ &val, &mask);
+
+ mask = ~0x0000ffff;
+ memcpy(&val16, ha + 4, 2);
+ val = val16;
+ flow_offload_mangle(entry1, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 4,
+ &val, &mask);
+ neigh_release(n);
+
+ return 0;
+}
+
+static void flow_offload_ipv4_snat(struct net *net,
+ const struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir,
+ struct nf_flow_rule *flow_rule)
+{
+ struct flow_action_entry *entry = flow_action_entry_next(flow_rule);
+ u32 mask = ~htonl(0xffffffff);
+ __be32 addr;
+ u32 offset;
+
+ switch (dir) {
+ case FLOW_OFFLOAD_DIR_ORIGINAL:
+ addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v4.s_addr;
+ offset = offsetof(struct iphdr, saddr);
+ break;
+ case FLOW_OFFLOAD_DIR_REPLY:
+ addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4.s_addr;
+ offset = offsetof(struct iphdr, daddr);
+ break;
+ default:
+ return;
+ }
+
+ flow_offload_mangle(entry, FLOW_ACT_MANGLE_HDR_TYPE_IP4, offset,
+ &addr, &mask);
+}
+
+static void flow_offload_ipv4_dnat(struct net *net,
+ const struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir,
+ struct nf_flow_rule *flow_rule)
+{
+ struct flow_action_entry *entry = flow_action_entry_next(flow_rule);
+ u32 mask = ~htonl(0xffffffff);
+ __be32 addr;
+ u32 offset;
+
+ switch (dir) {
+ case FLOW_OFFLOAD_DIR_ORIGINAL:
+ addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v4.s_addr;
+ offset = offsetof(struct iphdr, daddr);
+ break;
+ case FLOW_OFFLOAD_DIR_REPLY:
+ addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4.s_addr;
+ offset = offsetof(struct iphdr, saddr);
+ break;
+ default:
+ return;
+ }
+
+ flow_offload_mangle(entry, FLOW_ACT_MANGLE_HDR_TYPE_IP4, offset,
+ &addr, &mask);
+}
+
+static void flow_offload_ipv6_mangle(struct nf_flow_rule *flow_rule,
+ unsigned int offset,
+ const __be32 *addr, const __be32 *mask)
+{
+ struct flow_action_entry *entry;
+ int i;
+
+ for (i = 0; i < sizeof(struct in6_addr) / sizeof(u32); i += sizeof(u32)) {
+ entry = flow_action_entry_next(flow_rule);
+ flow_offload_mangle(entry, FLOW_ACT_MANGLE_HDR_TYPE_IP6,
+ offset + i, &addr[i], mask);
+ }
+}
+
+static void flow_offload_ipv6_snat(struct net *net,
+ const struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir,
+ struct nf_flow_rule *flow_rule)
+{
+ u32 mask = ~htonl(0xffffffff);
+ const __be32 *addr;
+ u32 offset;
+
+ switch (dir) {
+ case FLOW_OFFLOAD_DIR_ORIGINAL:
+ addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v6.s6_addr32;
+ offset = offsetof(struct ipv6hdr, saddr);
+ break;
+ case FLOW_OFFLOAD_DIR_REPLY:
+ addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v6.s6_addr32;
+ offset = offsetof(struct ipv6hdr, daddr);
+ break;
+ default:
+ return;
+ }
+
+ flow_offload_ipv6_mangle(flow_rule, offset, addr, &mask);
+}
+
+static void flow_offload_ipv6_dnat(struct net *net,
+ const struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir,
+ struct nf_flow_rule *flow_rule)
+{
+ u32 mask = ~htonl(0xffffffff);
+ const __be32 *addr;
+ u32 offset;
+
+ switch (dir) {
+ case FLOW_OFFLOAD_DIR_ORIGINAL:
+ addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v6.s6_addr32;
+ offset = offsetof(struct ipv6hdr, daddr);
+ break;
+ case FLOW_OFFLOAD_DIR_REPLY:
+ addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v6.s6_addr32;
+ offset = offsetof(struct ipv6hdr, saddr);
+ break;
+ default:
+ return;
+ }
+
+ flow_offload_ipv6_mangle(flow_rule, offset, addr, &mask);
+}
+
+static int flow_offload_l4proto(const struct flow_offload *flow)
+{
+ u8 protonum = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l4proto;
+ u8 type = 0;
+
+ switch (protonum) {
+ case IPPROTO_TCP:
+ type = FLOW_ACT_MANGLE_HDR_TYPE_TCP;
+ break;
+ case IPPROTO_UDP:
+ type = FLOW_ACT_MANGLE_HDR_TYPE_UDP;
+ break;
+ default:
+ break;
+ }
+
+ return type;
+}
+
+static void flow_offload_port_snat(struct net *net,
+ const struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir,
+ struct nf_flow_rule *flow_rule)
+{
+ struct flow_action_entry *entry = flow_action_entry_next(flow_rule);
+ u32 mask, port;
+ u32 offset;
+
+ switch (dir) {
+ case FLOW_OFFLOAD_DIR_ORIGINAL:
+ port = ntohs(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port);
+ offset = 0; /* offsetof(struct tcphdr, source); */
+ port = htonl(port << 16);
+ mask = ~htonl(0xffff0000);
+ break;
+ case FLOW_OFFLOAD_DIR_REPLY:
+ port = ntohs(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port);
+ offset = 0; /* offsetof(struct tcphdr, dest); */
+ port = htonl(port);
+ mask = ~htonl(0xffff);
+ break;
+ default:
+ return;
+ }
+
+ flow_offload_mangle(entry, flow_offload_l4proto(flow), offset,
+ &port, &mask);
+}
+
+static void flow_offload_port_dnat(struct net *net,
+ const struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir,
+ struct nf_flow_rule *flow_rule)
+{
+ struct flow_action_entry *entry = flow_action_entry_next(flow_rule);
+ u32 mask, port;
+ u32 offset;
+
+ switch (dir) {
+ case FLOW_OFFLOAD_DIR_ORIGINAL:
+ port = ntohs(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port);
+ offset = 0; /* offsetof(struct tcphdr, dest); */
+ port = htonl(port);
+ mask = ~htonl(0xffff);
+ break;
+ case FLOW_OFFLOAD_DIR_REPLY:
+ port = ntohs(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port);
+ offset = 0; /* offsetof(struct tcphdr, source); */
+ port = htonl(port << 16);
+ mask = ~htonl(0xffff0000);
+ break;
+ default:
+ return;
+ }
+
+ flow_offload_mangle(entry, flow_offload_l4proto(flow), offset,
+ &port, &mask);
+}
+
+static void flow_offload_ipv4_checksum(struct net *net,
+ const struct flow_offload *flow,
+ struct nf_flow_rule *flow_rule)
+{
+ u8 protonum = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l4proto;
+ struct flow_action_entry *entry = flow_action_entry_next(flow_rule);
+
+ entry->id = FLOW_ACTION_CSUM;
+ entry->csum_flags = TCA_CSUM_UPDATE_FLAG_IPV4HDR;
+
+ switch (protonum) {
+ case IPPROTO_TCP:
+ entry->csum_flags |= TCA_CSUM_UPDATE_FLAG_TCP;
+ break;
+ case IPPROTO_UDP:
+ entry->csum_flags |= TCA_CSUM_UPDATE_FLAG_UDP;
+ break;
+ }
+}
+
+static void flow_offload_redirect(const struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir,
+ struct nf_flow_rule *flow_rule)
+{
+ struct flow_action_entry *entry = flow_action_entry_next(flow_rule);
+ struct rtable *rt;
+
+ rt = (struct rtable *)flow->tuplehash[dir].tuple.dst_cache;
+ entry->id = FLOW_ACTION_REDIRECT;
+ entry->dev = rt->dst.dev;
+ dev_hold(rt->dst.dev);
+}
+
+int nf_flow_rule_route_ipv4(struct net *net, const struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir,
+ struct nf_flow_rule *flow_rule)
+{
+ if (flow_offload_eth_src(net, flow, dir, flow_rule) < 0 ||
+ flow_offload_eth_dst(net, flow, dir, flow_rule) < 0)
+ return -1;
+
+ if (test_bit(NF_FLOW_SNAT, &flow->flags)) {
+ flow_offload_ipv4_snat(net, flow, dir, flow_rule);
+ flow_offload_port_snat(net, flow, dir, flow_rule);
+ }
+ if (test_bit(NF_FLOW_DNAT, &flow->flags)) {
+ flow_offload_ipv4_dnat(net, flow, dir, flow_rule);
+ flow_offload_port_dnat(net, flow, dir, flow_rule);
+ }
+ if (test_bit(NF_FLOW_SNAT, &flow->flags) ||
+ test_bit(NF_FLOW_DNAT, &flow->flags))
+ flow_offload_ipv4_checksum(net, flow, flow_rule);
+
+ flow_offload_redirect(flow, dir, flow_rule);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_flow_rule_route_ipv4);
+
+int nf_flow_rule_route_ipv6(struct net *net, const struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir,
+ struct nf_flow_rule *flow_rule)
+{
+ if (flow_offload_eth_src(net, flow, dir, flow_rule) < 0 ||
+ flow_offload_eth_dst(net, flow, dir, flow_rule) < 0)
+ return -1;
+
+ if (test_bit(NF_FLOW_SNAT, &flow->flags)) {
+ flow_offload_ipv6_snat(net, flow, dir, flow_rule);
+ flow_offload_port_snat(net, flow, dir, flow_rule);
+ }
+ if (test_bit(NF_FLOW_DNAT, &flow->flags)) {
+ flow_offload_ipv6_dnat(net, flow, dir, flow_rule);
+ flow_offload_port_dnat(net, flow, dir, flow_rule);
+ }
+
+ flow_offload_redirect(flow, dir, flow_rule);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_flow_rule_route_ipv6);
+
+#define NF_FLOW_RULE_ACTION_MAX 16
+
+static struct nf_flow_rule *
+nf_flow_offload_rule_alloc(struct net *net,
+ const struct flow_offload_work *offload,
+ enum flow_offload_tuple_dir dir)
+{
+ const struct nf_flowtable *flowtable = offload->flowtable;
+ const struct flow_offload *flow = offload->flow;
+ const struct flow_offload_tuple *tuple;
+ struct nf_flow_rule *flow_rule;
+ int err = -ENOMEM;
+
+ flow_rule = kzalloc(sizeof(*flow_rule), GFP_KERNEL);
+ if (!flow_rule)
+ goto err_flow;
+
+ flow_rule->rule = flow_rule_alloc(NF_FLOW_RULE_ACTION_MAX);
+ if (!flow_rule->rule)
+ goto err_flow_rule;
+
+ flow_rule->rule->match.dissector = &flow_rule->match.dissector;
+ flow_rule->rule->match.mask = &flow_rule->match.mask;
+ flow_rule->rule->match.key = &flow_rule->match.key;
+
+ tuple = &flow->tuplehash[dir].tuple;
+ err = nf_flow_rule_match(&flow_rule->match, tuple);
+ if (err < 0)
+ goto err_flow_match;
+
+ flow_rule->rule->action.num_entries = 0;
+ if (flowtable->type->action(net, flow, dir, flow_rule) < 0)
+ goto err_flow_match;
+
+ return flow_rule;
+
+err_flow_match:
+ kfree(flow_rule->rule);
+err_flow_rule:
+ kfree(flow_rule);
+err_flow:
+ return NULL;
+}
+
+static void __nf_flow_offload_destroy(struct nf_flow_rule *flow_rule)
+{
+ struct flow_action_entry *entry;
+ int i;
+
+ for (i = 0; i < flow_rule->rule->action.num_entries; i++) {
+ entry = &flow_rule->rule->action.entries[i];
+ if (entry->id != FLOW_ACTION_REDIRECT)
+ continue;
+
+ dev_put(entry->dev);
+ }
+ kfree(flow_rule->rule);
+ kfree(flow_rule);
+}
+
+static void nf_flow_offload_destroy(struct nf_flow_rule *flow_rule[])
+{
+ int i;
+
+ for (i = 0; i < FLOW_OFFLOAD_DIR_MAX; i++)
+ __nf_flow_offload_destroy(flow_rule[i]);
+}
+
+static int nf_flow_offload_alloc(const struct flow_offload_work *offload,
+ struct nf_flow_rule *flow_rule[])
+{
+ struct net *net = read_pnet(&offload->flowtable->net);
+
+ flow_rule[0] = nf_flow_offload_rule_alloc(net, offload,
+ FLOW_OFFLOAD_DIR_ORIGINAL);
+ if (!flow_rule[0])
+ return -ENOMEM;
+
+ flow_rule[1] = nf_flow_offload_rule_alloc(net, offload,
+ FLOW_OFFLOAD_DIR_REPLY);
+ if (!flow_rule[1]) {
+ __nf_flow_offload_destroy(flow_rule[0]);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void nf_flow_offload_init(struct flow_cls_offload *cls_flow,
+ __be16 proto, int priority,
+ enum flow_cls_command cmd,
+ const struct flow_offload_tuple *tuple,
+ struct netlink_ext_ack *extack)
+{
+ cls_flow->common.protocol = proto;
+ cls_flow->common.prio = priority;
+ cls_flow->common.extack = extack;
+ cls_flow->command = cmd;
+ cls_flow->cookie = (unsigned long)tuple;
+}
+
+static int nf_flow_offload_tuple(struct nf_flowtable *flowtable,
+ struct flow_offload *flow,
+ struct nf_flow_rule *flow_rule,
+ enum flow_offload_tuple_dir dir,
+ int priority, int cmd,
+ struct list_head *block_cb_list)
+{
+ struct flow_cls_offload cls_flow = {};
+ struct flow_block_cb *block_cb;
+ struct netlink_ext_ack extack;
+ __be16 proto = ETH_P_ALL;
+ int err, i = 0;
+
+ nf_flow_offload_init(&cls_flow, proto, priority, cmd,
+ &flow->tuplehash[dir].tuple, &extack);
+ if (cmd == FLOW_CLS_REPLACE)
+ cls_flow.rule = flow_rule->rule;
+
+ list_for_each_entry(block_cb, block_cb_list, list) {
+ err = block_cb->cb(TC_SETUP_CLSFLOWER, &cls_flow,
+ block_cb->cb_priv);
+ if (err < 0)
+ continue;
+
+ i++;
+ }
+
+ return i;
+}
+
+static int flow_offload_tuple_add(struct flow_offload_work *offload,
+ struct nf_flow_rule *flow_rule,
+ enum flow_offload_tuple_dir dir)
+{
+ return nf_flow_offload_tuple(offload->flowtable, offload->flow,
+ flow_rule, dir, offload->priority,
+ FLOW_CLS_REPLACE,
+ &offload->flowtable->flow_block.cb_list);
+}
+
+static void flow_offload_tuple_del(struct flow_offload_work *offload,
+ enum flow_offload_tuple_dir dir)
+{
+ nf_flow_offload_tuple(offload->flowtable, offload->flow, NULL, dir,
+ offload->priority, FLOW_CLS_DESTROY,
+ &offload->flowtable->flow_block.cb_list);
+}
+
+static int flow_offload_rule_add(struct flow_offload_work *offload,
+ struct nf_flow_rule *flow_rule[])
+{
+ int ok_count = 0;
+
+ ok_count += flow_offload_tuple_add(offload, flow_rule[0],
+ FLOW_OFFLOAD_DIR_ORIGINAL);
+ ok_count += flow_offload_tuple_add(offload, flow_rule[1],
+ FLOW_OFFLOAD_DIR_REPLY);
+ if (ok_count == 0)
+ return -ENOENT;
+
+ return 0;
+}
+
+static void flow_offload_work_add(struct flow_offload_work *offload)
+{
+ struct nf_flow_rule *flow_rule[FLOW_OFFLOAD_DIR_MAX];
+ int err;
+
+ err = nf_flow_offload_alloc(offload, flow_rule);
+ if (err < 0)
+ return;
+
+ err = flow_offload_rule_add(offload, flow_rule);
+ if (err < 0)
+ set_bit(NF_FLOW_HW_REFRESH, &offload->flow->flags);
+
+ nf_flow_offload_destroy(flow_rule);
+}
+
+static void flow_offload_work_del(struct flow_offload_work *offload)
+{
+ flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_ORIGINAL);
+ flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_REPLY);
+ set_bit(NF_FLOW_HW_DEAD, &offload->flow->flags);
+}
+
+static void flow_offload_tuple_stats(struct flow_offload_work *offload,
+ enum flow_offload_tuple_dir dir,
+ struct flow_stats *stats)
+{
+ struct nf_flowtable *flowtable = offload->flowtable;
+ struct flow_cls_offload cls_flow = {};
+ struct flow_block_cb *block_cb;
+ struct netlink_ext_ack extack;
+ __be16 proto = ETH_P_ALL;
+
+ nf_flow_offload_init(&cls_flow, proto, offload->priority,
+ FLOW_CLS_STATS,
+ &offload->flow->tuplehash[dir].tuple, &extack);
+
+ list_for_each_entry(block_cb, &flowtable->flow_block.cb_list, list)
+ block_cb->cb(TC_SETUP_CLSFLOWER, &cls_flow, block_cb->cb_priv);
+ memcpy(stats, &cls_flow.stats, sizeof(*stats));
+}
+
+static void flow_offload_work_stats(struct flow_offload_work *offload)
+{
+ struct flow_stats stats[FLOW_OFFLOAD_DIR_MAX] = {};
+ u64 lastused;
+
+ flow_offload_tuple_stats(offload, FLOW_OFFLOAD_DIR_ORIGINAL, &stats[0]);
+ flow_offload_tuple_stats(offload, FLOW_OFFLOAD_DIR_REPLY, &stats[1]);
+
+ lastused = max_t(u64, stats[0].lastused, stats[1].lastused);
+ offload->flow->timeout = max_t(u64, offload->flow->timeout,
+ lastused + NF_FLOW_TIMEOUT);
+}
+
+static void flow_offload_work_handler(struct work_struct *work)
+{
+ struct flow_offload_work *offload, *next;
+ LIST_HEAD(offload_pending_list);
+
+ spin_lock_bh(&flow_offload_pending_list_lock);
+ list_replace_init(&flow_offload_pending_list, &offload_pending_list);
+ spin_unlock_bh(&flow_offload_pending_list_lock);
+
+ list_for_each_entry_safe(offload, next, &offload_pending_list, list) {
+ switch (offload->cmd) {
+ case FLOW_CLS_REPLACE:
+ flow_offload_work_add(offload);
+ break;
+ case FLOW_CLS_DESTROY:
+ flow_offload_work_del(offload);
+ break;
+ case FLOW_CLS_STATS:
+ flow_offload_work_stats(offload);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ }
+ list_del(&offload->list);
+ kfree(offload);
+ }
+}
+
+static void flow_offload_queue_work(struct flow_offload_work *offload)
+{
+ spin_lock_bh(&flow_offload_pending_list_lock);
+ list_add_tail(&offload->list, &flow_offload_pending_list);
+ spin_unlock_bh(&flow_offload_pending_list_lock);
+
+ schedule_work(&nf_flow_offload_work);
+}
+
+static struct flow_offload_work *
+nf_flow_offload_work_alloc(struct nf_flowtable *flowtable,
+ struct flow_offload *flow, unsigned int cmd)
+{
+ struct flow_offload_work *offload;
+
+ offload = kmalloc(sizeof(struct flow_offload_work), GFP_ATOMIC);
+ if (!offload)
+ return NULL;
+
+ offload->cmd = cmd;
+ offload->flow = flow;
+ offload->priority = flowtable->priority;
+ offload->flowtable = flowtable;
+
+ return offload;
+}
+
+
+void nf_flow_offload_add(struct nf_flowtable *flowtable,
+ struct flow_offload *flow)
+{
+ struct flow_offload_work *offload;
+
+ offload = nf_flow_offload_work_alloc(flowtable, flow, FLOW_CLS_REPLACE);
+ if (!offload)
+ return;
+
+ flow_offload_queue_work(offload);
+}
+
+void nf_flow_offload_del(struct nf_flowtable *flowtable,
+ struct flow_offload *flow)
+{
+ struct flow_offload_work *offload;
+
+ offload = nf_flow_offload_work_alloc(flowtable, flow, FLOW_CLS_DESTROY);
+ if (!offload)
+ return;
+
+ set_bit(NF_FLOW_HW_DYING, &flow->flags);
+ flow_offload_queue_work(offload);
+}
+
+void nf_flow_offload_stats(struct nf_flowtable *flowtable,
+ struct flow_offload *flow)
+{
+ struct flow_offload_work *offload;
+ __s32 delta;
+
+ delta = nf_flow_timeout_delta(flow->timeout);
+ if ((delta >= (9 * NF_FLOW_TIMEOUT) / 10))
+ return;
+
+ offload = nf_flow_offload_work_alloc(flowtable, flow, FLOW_CLS_STATS);
+ if (!offload)
+ return;
+
+ flow_offload_queue_work(offload);
+}
+
+void nf_flow_table_offload_flush(struct nf_flowtable *flowtable)
+{
+ if (nf_flowtable_hw_offload(flowtable))
+ flush_work(&nf_flow_offload_work);
+}
+
+static int nf_flow_table_block_setup(struct nf_flowtable *flowtable,
+ struct flow_block_offload *bo,
+ enum flow_block_command cmd)
+{
+ struct flow_block_cb *block_cb, *next;
+ int err = 0;
+
+ switch (cmd) {
+ case FLOW_BLOCK_BIND:
+ list_splice(&bo->cb_list, &flowtable->flow_block.cb_list);
+ break;
+ case FLOW_BLOCK_UNBIND:
+ list_for_each_entry_safe(block_cb, next, &bo->cb_list, list) {
+ list_del(&block_cb->list);
+ flow_block_cb_free(block_cb);
+ }
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ err = -EOPNOTSUPP;
+ }
+
+ return err;
+}
+
+static int nf_flow_table_offload_cmd(struct flow_block_offload *bo,
+ struct nf_flowtable *flowtable,
+ struct net_device *dev,
+ enum flow_block_command cmd,
+ struct netlink_ext_ack *extack)
+{
+ int err;
+
+ if (!nf_flowtable_hw_offload(flowtable))
+ return 0;
+
+ if (!dev->netdev_ops->ndo_setup_tc)
+ return -EOPNOTSUPP;
+
+ memset(bo, 0, sizeof(*bo));
+ bo->net = dev_net(dev);
+ bo->block = &flowtable->flow_block;
+ bo->command = cmd;
+ bo->binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS;
+ bo->extack = extack;
+ INIT_LIST_HEAD(&bo->cb_list);
+
+ err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_FT, bo);
+ if (err < 0)
+ return err;
+
+ return 0;
+}
+
+int nf_flow_table_offload_setup(struct nf_flowtable *flowtable,
+ struct net_device *dev,
+ enum flow_block_command cmd)
+{
+ struct netlink_ext_ack extack = {};
+ struct flow_block_offload bo;
+ int err;
+
+ err = nf_flow_table_offload_cmd(&bo, flowtable, dev, cmd, &extack);
+ if (err < 0)
+ return err;
+
+ return nf_flow_table_block_setup(flowtable, &bo, cmd);
+}
+EXPORT_SYMBOL_GPL(nf_flow_table_offload_setup);
+
+int nf_flow_table_offload_init(void)
+{
+ INIT_WORK(&nf_flow_offload_work, flow_offload_work_handler);
+
+ return 0;
+}
+
+void nf_flow_table_offload_exit(void)
+{
+ struct flow_offload_work *offload, *next;
+ LIST_HEAD(offload_pending_list);
+
+ cancel_work_sync(&nf_flow_offload_work);
+
+ list_for_each_entry_safe(offload, next, &offload_pending_list, list) {
+ list_del(&offload->list);
+ kfree(offload);
+ }
+}
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 3f6023ed4966..bfc555fcbc72 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -18,12 +18,12 @@
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_helper.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netfilter/nf_conntrack_zones.h>
-#include <linux/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_helper.h>
+#include <uapi/linux/netfilter/nf_nat.h>
#include "nf_internals.h"
diff --git a/net/netfilter/nf_nat_proto.c b/net/netfilter/nf_nat_proto.c
index 7ac733ebd060..64eedc17037a 100644
--- a/net/netfilter/nf_nat_proto.c
+++ b/net/netfilter/nf_nat_proto.c
@@ -233,6 +233,19 @@ icmp_manip_pkt(struct sk_buff *skb,
return false;
hdr = (struct icmphdr *)(skb->data + hdroff);
+ switch (hdr->type) {
+ case ICMP_ECHO:
+ case ICMP_ECHOREPLY:
+ case ICMP_TIMESTAMP:
+ case ICMP_TIMESTAMPREPLY:
+ case ICMP_INFO_REQUEST:
+ case ICMP_INFO_REPLY:
+ case ICMP_ADDRESS:
+ case ICMP_ADDRESSREPLY:
+ break;
+ default:
+ return true;
+ }
inet_proto_csum_replace2(&hdr->checksum, skb,
hdr->un.echo.id, tuple->src.u.icmp.id, false);
hdr->un.echo.id = tuple->src.u.icmp.id;
@@ -722,7 +735,7 @@ nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb,
return ret;
}
-const struct nf_hook_ops nf_nat_ipv4_ops[] = {
+static const struct nf_hook_ops nf_nat_ipv4_ops[] = {
/* Before packet filtering, change destination */
{
.hook = nf_nat_ipv4_in,
@@ -961,7 +974,7 @@ nf_nat_ipv6_local_fn(void *priv, struct sk_buff *skb,
return ret;
}
-const struct nf_hook_ops nf_nat_ipv6_ops[] = {
+static const struct nf_hook_ops nf_nat_ipv6_ops[] = {
/* Before packet filtering, change destination */
{
.hook = nf_nat_ipv6_in,
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index a2b58de82600..f8f52ff99cfb 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -189,7 +189,7 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state,
goto err;
}
- if (!skb_dst_force(skb) && state->hook != NF_INET_PRE_ROUTING) {
+ if (skb_dst(skb) && !skb_dst_force(skb)) {
status = -ENETDOWN;
goto err;
}
diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c
index c769462a839e..b0930d4aba22 100644
--- a/net/netfilter/nf_synproxy_core.c
+++ b/net/netfilter/nf_synproxy_core.c
@@ -56,7 +56,7 @@ synproxy_parse_options(const struct sk_buff *skb, unsigned int doff,
switch (opcode) {
case TCPOPT_MSS:
if (opsize == TCPOLEN_MSS) {
- opts->mss = get_unaligned_be16(ptr);
+ opts->mss_option = get_unaligned_be16(ptr);
opts->options |= NF_SYNPROXY_OPT_MSS;
}
break;
@@ -115,7 +115,7 @@ synproxy_build_options(struct tcphdr *th, const struct synproxy_options *opts)
if (options & NF_SYNPROXY_OPT_MSS)
*ptr++ = htonl((TCPOPT_MSS << 24) |
(TCPOLEN_MSS << 16) |
- opts->mss);
+ opts->mss_option);
if (options & NF_SYNPROXY_OPT_TIMESTAMP) {
if (options & NF_SYNPROXY_OPT_SACK_PERM)
@@ -642,7 +642,7 @@ synproxy_recv_client_ack(struct net *net,
}
this_cpu_inc(snet->stats->cookie_valid);
- opts->mss = mss;
+ opts->mss_option = mss;
opts->options |= NF_SYNPROXY_OPT_MSS;
if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP)
@@ -1060,7 +1060,7 @@ synproxy_recv_client_ack_ipv6(struct net *net,
}
this_cpu_inc(snet->stats->cookie_valid);
- opts->mss = mss;
+ opts->mss_option = mss;
opts->options |= NF_SYNPROXY_OPT_MSS;
if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP)
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index d47469f824a1..d1318bdf49ca 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -22,6 +22,8 @@
#include <net/net_namespace.h>
#include <net/sock.h>
+#define NFT_MODULE_AUTOLOAD_LIMIT (MODULE_NAME_LEN - sizeof("nft-expr-255-"))
+
static LIST_HEAD(nf_tables_expressions);
static LIST_HEAD(nf_tables_objects);
static LIST_HEAD(nf_tables_flowtables);
@@ -151,11 +153,64 @@ static void nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set)
}
}
+static int nft_netdev_register_hooks(struct net *net,
+ struct list_head *hook_list)
+{
+ struct nft_hook *hook;
+ int err, j;
+
+ j = 0;
+ list_for_each_entry(hook, hook_list, list) {
+ err = nf_register_net_hook(net, &hook->ops);
+ if (err < 0)
+ goto err_register;
+
+ j++;
+ }
+ return 0;
+
+err_register:
+ list_for_each_entry(hook, hook_list, list) {
+ if (j-- <= 0)
+ break;
+
+ nf_unregister_net_hook(net, &hook->ops);
+ }
+ return err;
+}
+
+static void nft_netdev_unregister_hooks(struct net *net,
+ struct list_head *hook_list)
+{
+ struct nft_hook *hook;
+
+ list_for_each_entry(hook, hook_list, list)
+ nf_unregister_net_hook(net, &hook->ops);
+}
+
+static int nft_register_basechain_hooks(struct net *net, int family,
+ struct nft_base_chain *basechain)
+{
+ if (family == NFPROTO_NETDEV)
+ return nft_netdev_register_hooks(net, &basechain->hook_list);
+
+ return nf_register_net_hook(net, &basechain->ops);
+}
+
+static void nft_unregister_basechain_hooks(struct net *net, int family,
+ struct nft_base_chain *basechain)
+{
+ if (family == NFPROTO_NETDEV)
+ nft_netdev_unregister_hooks(net, &basechain->hook_list);
+ else
+ nf_unregister_net_hook(net, &basechain->ops);
+}
+
static int nf_tables_register_hook(struct net *net,
const struct nft_table *table,
struct nft_chain *chain)
{
- const struct nft_base_chain *basechain;
+ struct nft_base_chain *basechain;
const struct nf_hook_ops *ops;
if (table->flags & NFT_TABLE_F_DORMANT ||
@@ -168,14 +223,14 @@ static int nf_tables_register_hook(struct net *net,
if (basechain->type->ops_register)
return basechain->type->ops_register(net, ops);
- return nf_register_net_hook(net, ops);
+ return nft_register_basechain_hooks(net, table->family, basechain);
}
static void nf_tables_unregister_hook(struct net *net,
const struct nft_table *table,
struct nft_chain *chain)
{
- const struct nft_base_chain *basechain;
+ struct nft_base_chain *basechain;
const struct nf_hook_ops *ops;
if (table->flags & NFT_TABLE_F_DORMANT ||
@@ -187,7 +242,7 @@ static void nf_tables_unregister_hook(struct net *net,
if (basechain->type->ops_unregister)
return basechain->type->ops_unregister(net, ops);
- nf_unregister_net_hook(net, ops);
+ nft_unregister_basechain_hooks(net, table->family, basechain);
}
static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type)
@@ -308,6 +363,7 @@ static struct nft_trans *nft_trans_rule_add(struct nft_ctx *ctx, int msg_type,
static int nft_delrule(struct nft_ctx *ctx, struct nft_rule *rule)
{
+ struct nft_flow_rule *flow;
struct nft_trans *trans;
int err;
@@ -315,6 +371,16 @@ static int nft_delrule(struct nft_ctx *ctx, struct nft_rule *rule)
if (trans == NULL)
return -ENOMEM;
+ if (ctx->chain->flags & NFT_CHAIN_HW_OFFLOAD) {
+ flow = nft_flow_rule_create(ctx->net, rule);
+ if (IS_ERR(flow)) {
+ nft_trans_destroy(trans);
+ return PTR_ERR(flow);
+ }
+
+ nft_trans_flow_rule(trans) = flow;
+ }
+
err = nf_tables_delrule_deactivate(ctx, rule);
if (err < 0) {
nft_trans_destroy(trans);
@@ -487,46 +553,70 @@ static inline u64 nf_tables_alloc_handle(struct nft_table *table)
static const struct nft_chain_type *chain_type[NFPROTO_NUMPROTO][NFT_CHAIN_T_MAX];
static const struct nft_chain_type *
+__nft_chain_type_get(u8 family, enum nft_chain_types type)
+{
+ if (family >= NFPROTO_NUMPROTO ||
+ type >= NFT_CHAIN_T_MAX)
+ return NULL;
+
+ return chain_type[family][type];
+}
+
+static const struct nft_chain_type *
__nf_tables_chain_type_lookup(const struct nlattr *nla, u8 family)
{
+ const struct nft_chain_type *type;
int i;
for (i = 0; i < NFT_CHAIN_T_MAX; i++) {
- if (chain_type[family][i] != NULL &&
- !nla_strcmp(nla, chain_type[family][i]->name))
- return chain_type[family][i];
+ type = __nft_chain_type_get(family, i);
+ if (!type)
+ continue;
+ if (!nla_strcmp(nla, type->name))
+ return type;
}
return NULL;
}
-/*
- * Loading a module requires dropping mutex that guards the
- * transaction.
- * We first need to abort any pending transactions as once
- * mutex is unlocked a different client could start a new
- * transaction. It must not see any 'future generation'
- * changes * as these changes will never happen.
- */
-#ifdef CONFIG_MODULES
-static int __nf_tables_abort(struct net *net);
+struct nft_module_request {
+ struct list_head list;
+ char module[MODULE_NAME_LEN];
+ bool done;
+};
-static void nft_request_module(struct net *net, const char *fmt, ...)
+#ifdef CONFIG_MODULES
+static int nft_request_module(struct net *net, const char *fmt, ...)
{
char module_name[MODULE_NAME_LEN];
+ struct nft_module_request *req;
va_list args;
int ret;
- __nf_tables_abort(net);
-
va_start(args, fmt);
ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
va_end(args);
- if (WARN(ret >= MODULE_NAME_LEN, "truncated: '%s' (len %d)", module_name, ret))
- return;
+ if (ret >= MODULE_NAME_LEN)
+ return 0;
- mutex_unlock(&net->nft.commit_mutex);
- request_module("%s", module_name);
- mutex_lock(&net->nft.commit_mutex);
+ list_for_each_entry(req, &net->nft.module_list, list) {
+ if (!strcmp(req->module, module_name)) {
+ if (req->done)
+ return 0;
+
+ /* A request to load this module already exists. */
+ return -EAGAIN;
+ }
+ }
+
+ req = kmalloc(sizeof(*req), GFP_KERNEL);
+ if (!req)
+ return -ENOMEM;
+
+ req->done = false;
+ strlcpy(req->module, module_name, MODULE_NAME_LEN);
+ list_add_tail(&req->list, &net->nft.module_list);
+
+ return -EAGAIN;
}
#endif
@@ -550,10 +640,9 @@ nf_tables_chain_type_lookup(struct net *net, const struct nlattr *nla,
lockdep_nfnl_nft_mutex_not_held();
#ifdef CONFIG_MODULES
if (autoload) {
- nft_request_module(net, "nft-chain-%u-%.*s", family,
- nla_len(nla), (const char *)nla_data(nla));
- type = __nf_tables_chain_type_lookup(nla, family);
- if (type != NULL)
+ if (nft_request_module(net, "nft-chain-%u-%.*s", family,
+ nla_len(nla),
+ (const char *)nla_data(nla)) == -EAGAIN)
return ERR_PTR(-EAGAIN);
}
#endif
@@ -742,7 +831,8 @@ static void nft_table_disable(struct net *net, struct nft_table *table, u32 cnt)
if (cnt && i++ == cnt)
break;
- nf_unregister_net_hook(net, &nft_base_chain(chain)->ops);
+ nft_unregister_basechain_hooks(net, table->family,
+ nft_base_chain(chain));
}
}
@@ -757,14 +847,16 @@ static int nf_tables_table_enable(struct net *net, struct nft_table *table)
if (!nft_is_base_chain(chain))
continue;
- err = nf_register_net_hook(net, &nft_base_chain(chain)->ops);
+ err = nft_register_basechain_hooks(net, table->family,
+ nft_base_chain(chain));
if (err < 0)
- goto err;
+ goto err_register_hooks;
i++;
}
return 0;
-err:
+
+err_register_hooks:
if (i)
nft_table_disable(net, table, i);
return err;
@@ -978,12 +1070,18 @@ static int nft_flush_table(struct nft_ctx *ctx)
}
list_for_each_entry_safe(flowtable, nft, &ctx->table->flowtables, list) {
+ if (!nft_is_active_next(ctx->net, flowtable))
+ continue;
+
err = nft_delflowtable(ctx, flowtable);
if (err < 0)
goto out;
}
list_for_each_entry_safe(obj, ne, &ctx->table->objects, list) {
+ if (!nft_is_active_next(ctx->net, obj))
+ continue;
+
err = nft_delobj(ctx, obj);
if (err < 0)
goto out;
@@ -1086,11 +1184,8 @@ static void nf_tables_table_destroy(struct nft_ctx *ctx)
void nft_register_chain_type(const struct nft_chain_type *ctype)
{
- if (WARN_ON(ctype->family >= NFPROTO_NUMPROTO))
- return;
-
nfnl_lock(NFNL_SUBSYS_NFTABLES);
- if (WARN_ON(chain_type[ctype->family][ctype->type] != NULL)) {
+ if (WARN_ON(__nft_chain_type_get(ctype->family, ctype->type))) {
nfnl_unlock(NFNL_SUBSYS_NFTABLES);
return;
}
@@ -1174,7 +1269,8 @@ static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = {
.len = NFT_CHAIN_MAXNAMELEN - 1 },
[NFTA_CHAIN_HOOK] = { .type = NLA_NESTED },
[NFTA_CHAIN_POLICY] = { .type = NLA_U32 },
- [NFTA_CHAIN_TYPE] = { .type = NLA_STRING },
+ [NFTA_CHAIN_TYPE] = { .type = NLA_STRING,
+ .len = NFT_MODULE_AUTOLOAD_LIMIT },
[NFTA_CHAIN_COUNTERS] = { .type = NLA_NESTED },
[NFTA_CHAIN_FLAGS] = { .type = NLA_U32 },
};
@@ -1225,6 +1321,46 @@ nla_put_failure:
return -ENOSPC;
}
+static int nft_dump_basechain_hook(struct sk_buff *skb, int family,
+ const struct nft_base_chain *basechain)
+{
+ const struct nf_hook_ops *ops = &basechain->ops;
+ struct nft_hook *hook, *first = NULL;
+ struct nlattr *nest, *nest_devs;
+ int n = 0;
+
+ nest = nla_nest_start_noflag(skb, NFTA_CHAIN_HOOK);
+ if (nest == NULL)
+ goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_HOOK_HOOKNUM, htonl(ops->hooknum)))
+ goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_HOOK_PRIORITY, htonl(ops->priority)))
+ goto nla_put_failure;
+
+ if (family == NFPROTO_NETDEV) {
+ nest_devs = nla_nest_start_noflag(skb, NFTA_HOOK_DEVS);
+ list_for_each_entry(hook, &basechain->hook_list, list) {
+ if (!first)
+ first = hook;
+
+ if (nla_put_string(skb, NFTA_DEVICE_NAME,
+ hook->ops.dev->name))
+ goto nla_put_failure;
+ n++;
+ }
+ nla_nest_end(skb, nest_devs);
+
+ if (n == 1 &&
+ nla_put_string(skb, NFTA_HOOK_DEV, first->ops.dev->name))
+ goto nla_put_failure;
+ }
+ nla_nest_end(skb, nest);
+
+ return 0;
+nla_put_failure:
+ return -1;
+}
+
static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net,
u32 portid, u32 seq, int event, u32 flags,
int family, const struct nft_table *table,
@@ -1253,21 +1389,10 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net,
if (nft_is_base_chain(chain)) {
const struct nft_base_chain *basechain = nft_base_chain(chain);
- const struct nf_hook_ops *ops = &basechain->ops;
struct nft_stats __percpu *stats;
- struct nlattr *nest;
- nest = nla_nest_start_noflag(skb, NFTA_CHAIN_HOOK);
- if (nest == NULL)
- goto nla_put_failure;
- if (nla_put_be32(skb, NFTA_HOOK_HOOKNUM, htonl(ops->hooknum)))
- goto nla_put_failure;
- if (nla_put_be32(skb, NFTA_HOOK_PRIORITY, htonl(ops->priority)))
+ if (nft_dump_basechain_hook(skb, family, basechain))
goto nla_put_failure;
- if (basechain->dev_name[0] &&
- nla_put_string(skb, NFTA_HOOK_DEV, basechain->dev_name))
- goto nla_put_failure;
- nla_nest_end(skb, nest);
if (nla_put_be32(skb, NFTA_CHAIN_POLICY,
htonl(basechain->policy)))
@@ -1461,8 +1586,9 @@ static void nft_chain_stats_replace(struct nft_trans *trans)
if (!nft_trans_chain_stats(trans))
return;
- rcu_swap_protected(chain->stats, nft_trans_chain_stats(trans),
- lockdep_commit_lock_is_held(trans->ctx.net));
+ nft_trans_chain_stats(trans) =
+ rcu_replace_pointer(chain->stats, nft_trans_chain_stats(trans),
+ lockdep_commit_lock_is_held(trans->ctx.net));
if (!nft_trans_chain_stats(trans))
static_branch_inc(&nft_counters_enabled);
@@ -1485,6 +1611,7 @@ static void nf_tables_chain_free_chain_rules(struct nft_chain *chain)
static void nf_tables_chain_destroy(struct nft_ctx *ctx)
{
struct nft_chain *chain = ctx->chain;
+ struct nft_hook *hook, *next;
if (WARN_ON(chain->use > 0))
return;
@@ -1495,6 +1622,13 @@ static void nf_tables_chain_destroy(struct nft_ctx *ctx)
if (nft_is_base_chain(chain)) {
struct nft_base_chain *basechain = nft_base_chain(chain);
+ if (ctx->family == NFPROTO_NETDEV) {
+ list_for_each_entry_safe(hook, next,
+ &basechain->hook_list, list) {
+ list_del_rcu(&hook->list);
+ kfree_rcu(hook, rcu);
+ }
+ }
module_put(basechain->type->owner);
if (rcu_access_pointer(basechain->stats)) {
static_branch_dec(&nft_counters_enabled);
@@ -1508,13 +1642,126 @@ static void nf_tables_chain_destroy(struct nft_ctx *ctx)
}
}
+static struct nft_hook *nft_netdev_hook_alloc(struct net *net,
+ const struct nlattr *attr)
+{
+ struct net_device *dev;
+ char ifname[IFNAMSIZ];
+ struct nft_hook *hook;
+ int err;
+
+ hook = kmalloc(sizeof(struct nft_hook), GFP_KERNEL);
+ if (!hook) {
+ err = -ENOMEM;
+ goto err_hook_alloc;
+ }
+
+ nla_strlcpy(ifname, attr, IFNAMSIZ);
+ dev = __dev_get_by_name(net, ifname);
+ if (!dev) {
+ err = -ENOENT;
+ goto err_hook_dev;
+ }
+ hook->ops.dev = dev;
+
+ return hook;
+
+err_hook_dev:
+ kfree(hook);
+err_hook_alloc:
+ return ERR_PTR(err);
+}
+
+static bool nft_hook_list_find(struct list_head *hook_list,
+ const struct nft_hook *this)
+{
+ struct nft_hook *hook;
+
+ list_for_each_entry(hook, hook_list, list) {
+ if (this->ops.dev == hook->ops.dev)
+ return true;
+ }
+
+ return false;
+}
+
+static int nf_tables_parse_netdev_hooks(struct net *net,
+ const struct nlattr *attr,
+ struct list_head *hook_list)
+{
+ struct nft_hook *hook, *next;
+ const struct nlattr *tmp;
+ int rem, n = 0, err;
+
+ nla_for_each_nested(tmp, attr, rem) {
+ if (nla_type(tmp) != NFTA_DEVICE_NAME) {
+ err = -EINVAL;
+ goto err_hook;
+ }
+
+ hook = nft_netdev_hook_alloc(net, tmp);
+ if (IS_ERR(hook)) {
+ err = PTR_ERR(hook);
+ goto err_hook;
+ }
+ if (nft_hook_list_find(hook_list, hook)) {
+ kfree(hook);
+ err = -EEXIST;
+ goto err_hook;
+ }
+ list_add_tail(&hook->list, hook_list);
+ n++;
+
+ if (n == NFT_NETDEVICE_MAX) {
+ err = -EFBIG;
+ goto err_hook;
+ }
+ }
+ if (!n)
+ return -EINVAL;
+
+ return 0;
+
+err_hook:
+ list_for_each_entry_safe(hook, next, hook_list, list) {
+ list_del(&hook->list);
+ kfree(hook);
+ }
+ return err;
+}
+
struct nft_chain_hook {
u32 num;
s32 priority;
const struct nft_chain_type *type;
- struct net_device *dev;
+ struct list_head list;
};
+static int nft_chain_parse_netdev(struct net *net,
+ struct nlattr *tb[],
+ struct list_head *hook_list)
+{
+ struct nft_hook *hook;
+ int err;
+
+ if (tb[NFTA_HOOK_DEV]) {
+ hook = nft_netdev_hook_alloc(net, tb[NFTA_HOOK_DEV]);
+ if (IS_ERR(hook))
+ return PTR_ERR(hook);
+
+ list_add_tail(&hook->list, hook_list);
+ } else if (tb[NFTA_HOOK_DEVS]) {
+ err = nf_tables_parse_netdev_hooks(net, tb[NFTA_HOOK_DEVS],
+ hook_list);
+ if (err < 0)
+ return err;
+ } else {
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int nft_chain_parse_hook(struct net *net,
const struct nlattr * const nla[],
struct nft_chain_hook *hook, u8 family,
@@ -1522,7 +1769,6 @@ static int nft_chain_parse_hook(struct net *net,
{
struct nlattr *ha[NFTA_HOOK_MAX + 1];
const struct nft_chain_type *type;
- struct net_device *dev;
int err;
lockdep_assert_held(&net->nft.commit_mutex);
@@ -1541,7 +1787,10 @@ static int nft_chain_parse_hook(struct net *net,
hook->num = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM]));
hook->priority = ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY]));
- type = chain_type[family][NFT_CHAIN_T_DEFAULT];
+ type = __nft_chain_type_get(family, NFT_CHAIN_T_DEFAULT);
+ if (!type)
+ return -EOPNOTSUPP;
+
if (nla[NFTA_CHAIN_TYPE]) {
type = nf_tables_chain_type_lookup(net, nla[NFTA_CHAIN_TYPE],
family, autoload);
@@ -1560,23 +1809,14 @@ static int nft_chain_parse_hook(struct net *net,
hook->type = type;
- hook->dev = NULL;
+ INIT_LIST_HEAD(&hook->list);
if (family == NFPROTO_NETDEV) {
- char ifname[IFNAMSIZ];
-
- if (!ha[NFTA_HOOK_DEV]) {
- module_put(type->owner);
- return -EOPNOTSUPP;
- }
-
- nla_strlcpy(ifname, ha[NFTA_HOOK_DEV], IFNAMSIZ);
- dev = __dev_get_by_name(net, ifname);
- if (!dev) {
+ err = nft_chain_parse_netdev(net, ha, &hook->list);
+ if (err < 0) {
module_put(type->owner);
- return -ENOENT;
+ return err;
}
- hook->dev = dev;
- } else if (ha[NFTA_HOOK_DEV]) {
+ } else if (ha[NFTA_HOOK_DEV] || ha[NFTA_HOOK_DEVS]) {
module_put(type->owner);
return -EOPNOTSUPP;
}
@@ -1586,6 +1826,12 @@ static int nft_chain_parse_hook(struct net *net,
static void nft_chain_release_hook(struct nft_chain_hook *hook)
{
+ struct nft_hook *h, *next;
+
+ list_for_each_entry_safe(h, next, &hook->list, list) {
+ list_del(&h->list);
+ kfree(h);
+ }
module_put(hook->type->owner);
}
@@ -1610,6 +1856,49 @@ static struct nft_rule **nf_tables_chain_alloc_rules(const struct nft_chain *cha
return kvmalloc(alloc, GFP_KERNEL);
}
+static void nft_basechain_hook_init(struct nf_hook_ops *ops, u8 family,
+ const struct nft_chain_hook *hook,
+ struct nft_chain *chain)
+{
+ ops->pf = family;
+ ops->hooknum = hook->num;
+ ops->priority = hook->priority;
+ ops->priv = chain;
+ ops->hook = hook->type->hooks[ops->hooknum];
+}
+
+static int nft_basechain_init(struct nft_base_chain *basechain, u8 family,
+ struct nft_chain_hook *hook, u32 flags)
+{
+ struct nft_chain *chain;
+ struct nft_hook *h;
+
+ basechain->type = hook->type;
+ INIT_LIST_HEAD(&basechain->hook_list);
+ chain = &basechain->chain;
+
+ if (family == NFPROTO_NETDEV) {
+ list_splice_init(&hook->list, &basechain->hook_list);
+ list_for_each_entry(h, &basechain->hook_list, list)
+ nft_basechain_hook_init(&h->ops, family, hook, chain);
+
+ basechain->ops.hooknum = hook->num;
+ basechain->ops.priority = hook->priority;
+ } else {
+ nft_basechain_hook_init(&basechain->ops, family, hook, chain);
+ }
+
+ chain->flags |= NFT_BASE_CHAIN | flags;
+ basechain->policy = NF_ACCEPT;
+ if (chain->flags & NFT_CHAIN_HW_OFFLOAD &&
+ nft_chain_offload_priority(basechain) < 0)
+ return -EOPNOTSUPP;
+
+ flow_block_init(&basechain->flow_block);
+
+ return 0;
+}
+
static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
u8 policy, u32 flags)
{
@@ -1628,7 +1917,6 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
if (nla[NFTA_CHAIN_HOOK]) {
struct nft_chain_hook hook;
- struct nf_hook_ops *ops;
err = nft_chain_parse_hook(net, nla, &hook, family, true);
if (err < 0)
@@ -1639,9 +1927,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
nft_chain_release_hook(&hook);
return -ENOMEM;
}
-
- if (hook.dev != NULL)
- strncpy(basechain->dev_name, hook.dev->name, IFNAMSIZ);
+ chain = &basechain->chain;
if (nla[NFTA_CHAIN_COUNTERS]) {
stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]);
@@ -1654,24 +1940,12 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
static_branch_inc(&nft_counters_enabled);
}
- basechain->type = hook.type;
- chain = &basechain->chain;
-
- ops = &basechain->ops;
- ops->pf = family;
- ops->hooknum = hook.num;
- ops->priority = hook.priority;
- ops->priv = chain;
- ops->hook = hook.type->hooks[ops->hooknum];
- ops->dev = hook.dev;
-
- chain->flags |= NFT_BASE_CHAIN | flags;
- basechain->policy = NF_ACCEPT;
- if (chain->flags & NFT_CHAIN_HW_OFFLOAD &&
- nft_chain_offload_priority(basechain) < 0)
- return -EOPNOTSUPP;
-
- flow_block_init(&basechain->flow_block);
+ err = nft_basechain_init(basechain, family, &hook, flags);
+ if (err < 0) {
+ nft_chain_release_hook(&hook);
+ kfree(basechain);
+ return err;
+ }
} else {
chain = kzalloc(sizeof(*chain), GFP_KERNEL);
if (chain == NULL)
@@ -1715,7 +1989,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
goto err2;
}
- nft_trans_chain_policy(trans) = -1;
+ nft_trans_chain_policy(trans) = NFT_CHAIN_POLICY_UNSET;
if (nft_is_base_chain(chain))
nft_trans_chain_policy(trans) = policy;
@@ -1731,6 +2005,25 @@ err1:
return err;
}
+static bool nft_hook_list_equal(struct list_head *hook_list1,
+ struct list_head *hook_list2)
+{
+ struct nft_hook *hook;
+ int n = 0, m = 0;
+
+ n = 0;
+ list_for_each_entry(hook, hook_list2, list) {
+ if (!nft_hook_list_find(hook_list1, hook))
+ return false;
+
+ n++;
+ }
+ list_for_each_entry(hook, hook_list1, list)
+ m++;
+
+ return n == m;
+}
+
static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
u32 flags)
{
@@ -1762,12 +2055,19 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
return -EBUSY;
}
- ops = &basechain->ops;
- if (ops->hooknum != hook.num ||
- ops->priority != hook.priority ||
- ops->dev != hook.dev) {
- nft_chain_release_hook(&hook);
- return -EBUSY;
+ if (ctx->family == NFPROTO_NETDEV) {
+ if (!nft_hook_list_equal(&basechain->hook_list,
+ &hook.list)) {
+ nft_chain_release_hook(&hook);
+ return -EBUSY;
+ }
+ } else {
+ ops = &basechain->ops;
+ if (ops->hooknum != hook.num ||
+ ops->priority != hook.priority) {
+ nft_chain_release_hook(&hook);
+ return -EBUSY;
+ }
}
nft_chain_release_hook(&hook);
}
@@ -1922,6 +2222,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
if (nlh->nlmsg_flags & NLM_F_REPLACE)
return -EOPNOTSUPP;
+ flags |= chain->flags & NFT_BASE_CHAIN;
return nf_tables_updchain(&ctx, genmask, policy, flags);
}
@@ -2049,9 +2350,8 @@ static const struct nft_expr_type *__nft_expr_type_get(u8 family,
static int nft_expr_type_request_module(struct net *net, u8 family,
struct nlattr *nla)
{
- nft_request_module(net, "nft-expr-%u-%.*s", family,
- nla_len(nla), (char *)nla_data(nla));
- if (__nft_expr_type_get(family, nla))
+ if (nft_request_module(net, "nft-expr-%u-%.*s", family,
+ nla_len(nla), (char *)nla_data(nla)) == -EAGAIN)
return -EAGAIN;
return 0;
@@ -2077,9 +2377,9 @@ static const struct nft_expr_type *nft_expr_type_get(struct net *net,
if (nft_expr_type_request_module(net, family, nla) == -EAGAIN)
return ERR_PTR(-EAGAIN);
- nft_request_module(net, "nft-expr-%.*s",
- nla_len(nla), (char *)nla_data(nla));
- if (__nft_expr_type_get(family, nla))
+ if (nft_request_module(net, "nft-expr-%.*s",
+ nla_len(nla),
+ (char *)nla_data(nla)) == -EAGAIN)
return ERR_PTR(-EAGAIN);
}
#endif
@@ -2087,7 +2387,8 @@ static const struct nft_expr_type *nft_expr_type_get(struct net *net,
}
static const struct nla_policy nft_expr_policy[NFTA_EXPR_MAX + 1] = {
- [NFTA_EXPR_NAME] = { .type = NLA_STRING },
+ [NFTA_EXPR_NAME] = { .type = NLA_STRING,
+ .len = NFT_MODULE_AUTOLOAD_LIMIT },
[NFTA_EXPR_DATA] = { .type = NLA_NESTED },
};
@@ -2169,9 +2470,10 @@ static int nf_tables_expr_parse(const struct nft_ctx *ctx,
err = PTR_ERR(ops);
#ifdef CONFIG_MODULES
if (err == -EAGAIN)
- nft_expr_type_request_module(ctx->net,
- ctx->family,
- tb[NFTA_EXPR_NAME]);
+ if (nft_expr_type_request_module(ctx->net,
+ ctx->family,
+ tb[NFTA_EXPR_NAME]) != -EAGAIN)
+ err = -ENOENT;
#endif
goto err1;
}
@@ -2853,7 +3155,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
return nft_table_validate(net, table);
if (chain->flags & NFT_CHAIN_HW_OFFLOAD) {
- flow = nft_flow_rule_create(rule);
+ flow = nft_flow_rule_create(net, rule);
if (IS_ERR(flow))
return PTR_ERR(flow);
@@ -3008,8 +3310,7 @@ nft_select_set_ops(const struct nft_ctx *ctx,
lockdep_nfnl_nft_mutex_not_held();
#ifdef CONFIG_MODULES
if (list_empty(&nf_tables_set_types)) {
- nft_request_module(ctx->net, "nft-set");
- if (!list_empty(&nf_tables_set_types))
+ if (nft_request_module(ctx->net, "nft-set") == -EAGAIN)
return ERR_PTR(-EAGAIN);
}
#endif
@@ -3090,6 +3391,7 @@ static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = {
static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = {
[NFTA_SET_DESC_SIZE] = { .type = NLA_U32 },
+ [NFTA_SET_DESC_CONCAT] = { .type = NLA_NESTED },
};
static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net,
@@ -3256,6 +3558,33 @@ static __be64 nf_jiffies64_to_msecs(u64 input)
return cpu_to_be64(jiffies64_to_msecs(input));
}
+static int nf_tables_fill_set_concat(struct sk_buff *skb,
+ const struct nft_set *set)
+{
+ struct nlattr *concat, *field;
+ int i;
+
+ concat = nla_nest_start_noflag(skb, NFTA_SET_DESC_CONCAT);
+ if (!concat)
+ return -ENOMEM;
+
+ for (i = 0; i < set->field_count; i++) {
+ field = nla_nest_start_noflag(skb, NFTA_LIST_ELEM);
+ if (!field)
+ return -ENOMEM;
+
+ if (nla_put_be32(skb, NFTA_SET_FIELD_LEN,
+ htonl(set->field_len[i])))
+ return -ENOMEM;
+
+ nla_nest_end(skb, field);
+ }
+
+ nla_nest_end(skb, concat);
+
+ return 0;
+}
+
static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
const struct nft_set *set, u16 event, u16 flags)
{
@@ -3319,11 +3648,17 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
goto nla_put_failure;
desc = nla_nest_start_noflag(skb, NFTA_SET_DESC);
+
if (desc == NULL)
goto nla_put_failure;
if (set->size &&
nla_put_be32(skb, NFTA_SET_DESC_SIZE, htonl(set->size)))
goto nla_put_failure;
+
+ if (set->field_count > 1 &&
+ nf_tables_fill_set_concat(skb, set))
+ goto nla_put_failure;
+
nla_nest_end(skb, desc);
nlmsg_end(skb, nlh);
@@ -3496,6 +3831,53 @@ err:
return err;
}
+static const struct nla_policy nft_concat_policy[NFTA_SET_FIELD_MAX + 1] = {
+ [NFTA_SET_FIELD_LEN] = { .type = NLA_U32 },
+};
+
+static int nft_set_desc_concat_parse(const struct nlattr *attr,
+ struct nft_set_desc *desc)
+{
+ struct nlattr *tb[NFTA_SET_FIELD_MAX + 1];
+ u32 len;
+ int err;
+
+ err = nla_parse_nested_deprecated(tb, NFTA_SET_FIELD_MAX, attr,
+ nft_concat_policy, NULL);
+ if (err < 0)
+ return err;
+
+ if (!tb[NFTA_SET_FIELD_LEN])
+ return -EINVAL;
+
+ len = ntohl(nla_get_be32(tb[NFTA_SET_FIELD_LEN]));
+
+ if (len * BITS_PER_BYTE / 32 > NFT_REG32_COUNT)
+ return -E2BIG;
+
+ desc->field_len[desc->field_count++] = len;
+
+ return 0;
+}
+
+static int nft_set_desc_concat(struct nft_set_desc *desc,
+ const struct nlattr *nla)
+{
+ struct nlattr *attr;
+ int rem, err;
+
+ nla_for_each_nested(attr, nla, rem) {
+ if (nla_type(attr) != NFTA_LIST_ELEM)
+ return -EINVAL;
+
+ err = nft_set_desc_concat_parse(attr, desc);
+ if (err < 0)
+ return err;
+ }
+
+ return 0;
+}
+
static int nf_tables_set_desc_parse(struct nft_set_desc *desc,
const struct nlattr *nla)
{
@@ -3509,8 +3891,10 @@ static int nf_tables_set_desc_parse(struct nft_set_desc *desc,
if (da[NFTA_SET_DESC_SIZE] != NULL)
desc->size = ntohl(nla_get_be32(da[NFTA_SET_DESC_SIZE]));
+ if (da[NFTA_SET_DESC_CONCAT])
+ err = nft_set_desc_concat(desc, da[NFTA_SET_DESC_CONCAT]);
- return 0;
+ return err;
}
static int nf_tables_newset(struct net *net, struct sock *nlsk,
@@ -3533,6 +3917,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
unsigned char *udata;
u16 udlen;
int err;
+ int i;
if (nla[NFTA_SET_TABLE] == NULL ||
nla[NFTA_SET_NAME] == NULL ||
@@ -3562,8 +3947,11 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
NFT_SET_OBJECT))
return -EINVAL;
/* Only one of these operations is supported */
- if ((flags & (NFT_SET_MAP | NFT_SET_EVAL | NFT_SET_OBJECT)) ==
- (NFT_SET_MAP | NFT_SET_EVAL | NFT_SET_OBJECT))
+ if ((flags & (NFT_SET_MAP | NFT_SET_OBJECT)) ==
+ (NFT_SET_MAP | NFT_SET_OBJECT))
+ return -EOPNOTSUPP;
+ if ((flags & (NFT_SET_EVAL | NFT_SET_OBJECT)) ==
+ (NFT_SET_EVAL | NFT_SET_OBJECT))
return -EOPNOTSUPP;
}
@@ -3708,6 +4096,10 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
set->gc_int = gc_int;
set->handle = nf_tables_alloc_handle(table);
+ set->field_count = desc.field_count;
+ for (i = 0; i < desc.field_count; i++)
+ set->field_len[i] = desc.field_len[i];
+
err = ops->init(set, &desc, nla);
if (err < 0)
goto err3;
@@ -3911,6 +4303,9 @@ const struct nft_set_ext_type nft_set_ext_types[] = {
.len = sizeof(struct nft_userdata),
.align = __alignof__(struct nft_userdata),
},
+ [NFT_SET_EXT_KEY_END] = {
+ .align = __alignof__(u32),
+ },
};
EXPORT_SYMBOL_GPL(nft_set_ext_types);
@@ -3927,7 +4322,9 @@ static const struct nla_policy nft_set_elem_policy[NFTA_SET_ELEM_MAX + 1] = {
[NFTA_SET_ELEM_USERDATA] = { .type = NLA_BINARY,
.len = NFT_USERDATA_MAXLEN },
[NFTA_SET_ELEM_EXPR] = { .type = NLA_NESTED },
- [NFTA_SET_ELEM_OBJREF] = { .type = NLA_STRING },
+ [NFTA_SET_ELEM_OBJREF] = { .type = NLA_STRING,
+ .len = NFT_OBJ_MAXNAMELEN - 1 },
+ [NFTA_SET_ELEM_KEY_END] = { .type = NLA_NESTED },
};
static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + 1] = {
@@ -3977,6 +4374,11 @@ static int nf_tables_fill_setelem(struct sk_buff *skb,
NFT_DATA_VALUE, set->klen) < 0)
goto nla_put_failure;
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END) &&
+ nft_data_dump(skb, NFTA_SET_ELEM_KEY_END, nft_set_ext_key_end(ext),
+ NFT_DATA_VALUE, set->klen) < 0)
+ goto nla_put_failure;
+
if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) &&
nft_data_dump(skb, NFTA_SET_ELEM_DATA, nft_set_ext_data(ext),
set->dtype == NFT_DATA_VERDICT ? NFT_DATA_VERDICT : NFT_DATA_VALUE,
@@ -4219,11 +4621,28 @@ static int nft_setelem_parse_flags(const struct nft_set *set,
return 0;
}
+static int nft_setelem_parse_key(struct nft_ctx *ctx, struct nft_set *set,
+ struct nft_data *key, struct nlattr *attr)
+{
+ struct nft_data_desc desc;
+ int err;
+
+ err = nft_data_init(ctx, key, NFT_DATA_VALUE_MAXLEN, &desc, attr);
+ if (err < 0)
+ return err;
+
+ if (desc.type != NFT_DATA_VALUE || desc.len != set->klen) {
+ nft_data_release(key, desc.type);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set,
const struct nlattr *attr)
{
struct nlattr *nla[NFTA_SET_ELEM_MAX + 1];
- struct nft_data_desc desc;
struct nft_set_elem elem;
struct sk_buff *skb;
uint32_t flags = 0;
@@ -4242,14 +4661,17 @@ static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set,
if (err < 0)
return err;
- err = nft_data_init(ctx, &elem.key.val, sizeof(elem.key), &desc,
- nla[NFTA_SET_ELEM_KEY]);
+ err = nft_setelem_parse_key(ctx, set, &elem.key.val,
+ nla[NFTA_SET_ELEM_KEY]);
if (err < 0)
return err;
- err = -EINVAL;
- if (desc.type != NFT_DATA_VALUE || desc.len != set->klen)
- return err;
+ if (nla[NFTA_SET_ELEM_KEY_END]) {
+ err = nft_setelem_parse_key(ctx, set, &elem.key_end.val,
+ nla[NFTA_SET_ELEM_KEY_END]);
+ if (err < 0)
+ return err;
+ }
priv = set->ops->get(ctx->net, set, &elem, flags);
if (IS_ERR(priv))
@@ -4376,8 +4798,8 @@ static struct nft_trans *nft_trans_elem_alloc(struct nft_ctx *ctx,
void *nft_set_elem_init(const struct nft_set *set,
const struct nft_set_ext_tmpl *tmpl,
- const u32 *key, const u32 *data,
- u64 timeout, u64 expiration, gfp_t gfp)
+ const u32 *key, const u32 *key_end,
+ const u32 *data, u64 timeout, u64 expiration, gfp_t gfp)
{
struct nft_set_ext *ext;
void *elem;
@@ -4390,6 +4812,8 @@ void *nft_set_elem_init(const struct nft_set *set,
nft_set_ext_init(ext, tmpl);
memcpy(nft_set_ext_key(ext), key, set->klen);
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END))
+ memcpy(nft_set_ext_key_end(ext), key_end, set->klen);
if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA))
memcpy(nft_set_ext_data(ext), data, set->dlen);
if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) {
@@ -4449,13 +4873,13 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
{
struct nlattr *nla[NFTA_SET_ELEM_MAX + 1];
u8 genmask = nft_genmask_next(ctx->net);
- struct nft_data_desc d1, d2;
struct nft_set_ext_tmpl tmpl;
struct nft_set_ext *ext, *ext2;
struct nft_set_elem elem;
struct nft_set_binding *binding;
struct nft_object *obj = NULL;
struct nft_userdata *udata;
+ struct nft_data_desc desc;
struct nft_data data;
enum nft_registers dreg;
struct nft_trans *trans;
@@ -4485,14 +4909,20 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
if (nla[NFTA_SET_ELEM_DATA] == NULL &&
!(flags & NFT_SET_ELEM_INTERVAL_END))
return -EINVAL;
- if (nla[NFTA_SET_ELEM_DATA] != NULL &&
- flags & NFT_SET_ELEM_INTERVAL_END)
- return -EINVAL;
} else {
if (nla[NFTA_SET_ELEM_DATA] != NULL)
return -EINVAL;
}
+ if ((flags & NFT_SET_ELEM_INTERVAL_END) &&
+ (nla[NFTA_SET_ELEM_DATA] ||
+ nla[NFTA_SET_ELEM_OBJREF] ||
+ nla[NFTA_SET_ELEM_TIMEOUT] ||
+ nla[NFTA_SET_ELEM_EXPIRATION] ||
+ nla[NFTA_SET_ELEM_USERDATA] ||
+ nla[NFTA_SET_ELEM_EXPR]))
+ return -EINVAL;
+
timeout = 0;
if (nla[NFTA_SET_ELEM_TIMEOUT] != NULL) {
if (!(set->flags & NFT_SET_TIMEOUT))
@@ -4515,15 +4945,22 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
return err;
}
- err = nft_data_init(ctx, &elem.key.val, sizeof(elem.key), &d1,
- nla[NFTA_SET_ELEM_KEY]);
+ err = nft_setelem_parse_key(ctx, set, &elem.key.val,
+ nla[NFTA_SET_ELEM_KEY]);
if (err < 0)
- goto err1;
- err = -EINVAL;
- if (d1.type != NFT_DATA_VALUE || d1.len != set->klen)
- goto err2;
+ return err;
+
+ nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen);
+
+ if (nla[NFTA_SET_ELEM_KEY_END]) {
+ err = nft_setelem_parse_key(ctx, set, &elem.key_end.val,
+ nla[NFTA_SET_ELEM_KEY_END]);
+ if (err < 0)
+ goto err_parse_key;
+
+ nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY_END, set->klen);
+ }
- nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, d1.len);
if (timeout > 0) {
nft_set_ext_add(&tmpl, NFT_SET_EXT_EXPIRATION);
if (timeout != set->timeout)
@@ -4533,27 +4970,27 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
if (nla[NFTA_SET_ELEM_OBJREF] != NULL) {
if (!(set->flags & NFT_SET_OBJECT)) {
err = -EINVAL;
- goto err2;
+ goto err_parse_key_end;
}
obj = nft_obj_lookup(ctx->net, ctx->table,
nla[NFTA_SET_ELEM_OBJREF],
set->objtype, genmask);
if (IS_ERR(obj)) {
err = PTR_ERR(obj);
- goto err2;
+ goto err_parse_key_end;
}
nft_set_ext_add(&tmpl, NFT_SET_EXT_OBJREF);
}
if (nla[NFTA_SET_ELEM_DATA] != NULL) {
- err = nft_data_init(ctx, &data, sizeof(data), &d2,
+ err = nft_data_init(ctx, &data, sizeof(data), &desc,
nla[NFTA_SET_ELEM_DATA]);
if (err < 0)
- goto err2;
+ goto err_parse_key_end;
err = -EINVAL;
- if (set->dtype != NFT_DATA_VERDICT && d2.len != set->dlen)
- goto err3;
+ if (set->dtype != NFT_DATA_VERDICT && desc.len != set->dlen)
+ goto err_parse_data;
dreg = nft_type_to_reg(set->dtype);
list_for_each_entry(binding, &set->bindings, list) {
@@ -4569,18 +5006,18 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
err = nft_validate_register_store(&bind_ctx, dreg,
&data,
- d2.type, d2.len);
+ desc.type, desc.len);
if (err < 0)
- goto err3;
+ goto err_parse_data;
- if (d2.type == NFT_DATA_VERDICT &&
+ if (desc.type == NFT_DATA_VERDICT &&
(data.verdict.code == NFT_GOTO ||
data.verdict.code == NFT_JUMP))
nft_validate_state_update(ctx->net,
NFT_VALIDATE_NEED);
}
- nft_set_ext_add_length(&tmpl, NFT_SET_EXT_DATA, d2.len);
+ nft_set_ext_add_length(&tmpl, NFT_SET_EXT_DATA, desc.len);
}
/* The full maximum length of userdata can exceed the maximum
@@ -4596,10 +5033,11 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
}
err = -ENOMEM;
- elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, data.data,
+ elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data,
+ elem.key_end.val.data, data.data,
timeout, expiration, GFP_KERNEL);
if (elem.priv == NULL)
- goto err3;
+ goto err_parse_data;
ext = nft_set_elem_ext(set, elem.priv);
if (flags)
@@ -4616,7 +5054,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
trans = nft_trans_elem_alloc(ctx, NFT_MSG_NEWSETELEM, set);
if (trans == NULL)
- goto err4;
+ goto err_trans;
ext->genmask = nft_genmask_cur(ctx->net) | NFT_SET_ELEM_BUSY_MASK;
err = set->ops->insert(ctx->net, set, &elem, &ext2);
@@ -4627,7 +5065,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) ^
nft_set_ext_exists(ext2, NFT_SET_EXT_OBJREF)) {
err = -EBUSY;
- goto err5;
+ goto err_element_clash;
}
if ((nft_set_ext_exists(ext, NFT_SET_EXT_DATA) &&
nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) &&
@@ -4640,33 +5078,35 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
else if (!(nlmsg_flags & NLM_F_EXCL))
err = 0;
}
- goto err5;
+ goto err_element_clash;
}
if (set->size &&
!atomic_add_unless(&set->nelems, 1, set->size + set->ndeact)) {
err = -ENFILE;
- goto err6;
+ goto err_set_full;
}
nft_trans_elem(trans) = elem;
list_add_tail(&trans->list, &ctx->net->nft.commit_list);
return 0;
-err6:
+err_set_full:
set->ops->remove(ctx->net, set, &elem);
-err5:
+err_element_clash:
kfree(trans);
-err4:
+err_trans:
if (obj)
obj->use--;
kfree(elem.priv);
-err3:
+err_parse_data:
if (nla[NFTA_SET_ELEM_DATA] != NULL)
- nft_data_release(&data, d2.type);
-err2:
- nft_data_release(&elem.key.val, d1.type);
-err1:
+ nft_data_release(&data, desc.type);
+err_parse_key_end:
+ nft_data_release(&elem.key_end.val, NFT_DATA_VALUE);
+err_parse_key:
+ nft_data_release(&elem.key.val, NFT_DATA_VALUE);
+
return err;
}
@@ -4761,7 +5201,6 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
{
struct nlattr *nla[NFTA_SET_ELEM_MAX + 1];
struct nft_set_ext_tmpl tmpl;
- struct nft_data_desc desc;
struct nft_set_elem elem;
struct nft_set_ext *ext;
struct nft_trans *trans;
@@ -4772,11 +5211,10 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
err = nla_parse_nested_deprecated(nla, NFTA_SET_ELEM_MAX, attr,
nft_set_elem_policy, NULL);
if (err < 0)
- goto err1;
+ return err;
- err = -EINVAL;
if (nla[NFTA_SET_ELEM_KEY] == NULL)
- goto err1;
+ return -EINVAL;
nft_set_ext_prepare(&tmpl);
@@ -4786,37 +5224,41 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
if (flags != 0)
nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS);
- err = nft_data_init(ctx, &elem.key.val, sizeof(elem.key), &desc,
- nla[NFTA_SET_ELEM_KEY]);
+ err = nft_setelem_parse_key(ctx, set, &elem.key.val,
+ nla[NFTA_SET_ELEM_KEY]);
if (err < 0)
- goto err1;
+ return err;
- err = -EINVAL;
- if (desc.type != NFT_DATA_VALUE || desc.len != set->klen)
- goto err2;
+ nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen);
- nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, desc.len);
+ if (nla[NFTA_SET_ELEM_KEY_END]) {
+ err = nft_setelem_parse_key(ctx, set, &elem.key_end.val,
+ nla[NFTA_SET_ELEM_KEY_END]);
+ if (err < 0)
+ return err;
+
+ nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY_END, set->klen);
+ }
err = -ENOMEM;
- elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, NULL, 0,
- 0, GFP_KERNEL);
+ elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data,
+ elem.key_end.val.data, NULL, 0, 0,
+ GFP_KERNEL);
if (elem.priv == NULL)
- goto err2;
+ goto fail_elem;
ext = nft_set_elem_ext(set, elem.priv);
if (flags)
*nft_set_ext_flags(ext) = flags;
trans = nft_trans_elem_alloc(ctx, NFT_MSG_DELSETELEM, set);
- if (trans == NULL) {
- err = -ENOMEM;
- goto err3;
- }
+ if (trans == NULL)
+ goto fail_trans;
priv = set->ops->deactivate(ctx->net, set, &elem);
if (priv == NULL) {
err = -ENOENT;
- goto err4;
+ goto fail_ops;
}
kfree(elem.priv);
elem.priv = priv;
@@ -4827,13 +5269,12 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
list_add_tail(&trans->list, &ctx->net->nft.commit_list);
return 0;
-err4:
+fail_ops:
kfree(trans);
-err3:
+fail_trans:
kfree(elem.priv);
-err2:
- nft_data_release(&elem.key.val, desc.type);
-err1:
+fail_elem:
+ nft_data_release(&elem.key.val, NFT_DATA_VALUE);
return err;
}
@@ -5123,14 +5564,45 @@ nft_obj_type_get(struct net *net, u32 objtype)
lockdep_nfnl_nft_mutex_not_held();
#ifdef CONFIG_MODULES
if (type == NULL) {
- nft_request_module(net, "nft-obj-%u", objtype);
- if (__nft_obj_type_get(objtype))
+ if (nft_request_module(net, "nft-obj-%u", objtype) == -EAGAIN)
return ERR_PTR(-EAGAIN);
}
#endif
return ERR_PTR(-ENOENT);
}
+static int nf_tables_updobj(const struct nft_ctx *ctx,
+ const struct nft_object_type *type,
+ const struct nlattr *attr,
+ struct nft_object *obj)
+{
+ struct nft_object *newobj;
+ struct nft_trans *trans;
+ int err;
+
+ trans = nft_trans_alloc(ctx, NFT_MSG_NEWOBJ,
+ sizeof(struct nft_trans_obj));
+ if (!trans)
+ return -ENOMEM;
+
+ newobj = nft_obj_init(ctx, type, attr);
+ if (IS_ERR(newobj)) {
+ err = PTR_ERR(newobj);
+ goto err_free_trans;
+ }
+
+ nft_trans_obj(trans) = obj;
+ nft_trans_obj_update(trans) = true;
+ nft_trans_obj_newobj(trans) = newobj;
+ list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+
+ return 0;
+
+err_free_trans:
+ kfree(trans);
+ return err;
+}
+
static int nf_tables_newobj(struct net *net, struct sock *nlsk,
struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[],
@@ -5170,7 +5642,13 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk,
NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_NAME]);
return -EEXIST;
}
- return 0;
+ if (nlh->nlmsg_flags & NLM_F_REPLACE)
+ return -EOPNOTSUPP;
+
+ type = __nft_obj_type_get(objtype);
+ nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
+
+ return nf_tables_updobj(&ctx, type, nla[NFTA_OBJ_DATA], obj);
}
nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
@@ -5538,6 +6016,7 @@ static const struct nla_policy nft_flowtable_policy[NFTA_FLOWTABLE_MAX + 1] = {
.len = NFT_NAME_MAXLEN - 1 },
[NFTA_FLOWTABLE_HOOK] = { .type = NLA_NESTED },
[NFTA_FLOWTABLE_HANDLE] = { .type = NLA_U64 },
+ [NFTA_FLOWTABLE_FLAGS] = { .type = NLA_U32 },
};
struct nft_flowtable *nft_flowtable_lookup(const struct nft_table *table,
@@ -5554,6 +6033,22 @@ struct nft_flowtable *nft_flowtable_lookup(const struct nft_table *table,
}
EXPORT_SYMBOL_GPL(nft_flowtable_lookup);
+void nf_tables_deactivate_flowtable(const struct nft_ctx *ctx,
+ struct nft_flowtable *flowtable,
+ enum nft_trans_phase phase)
+{
+ switch (phase) {
+ case NFT_TRANS_PREPARE:
+ case NFT_TRANS_ABORT:
+ case NFT_TRANS_RELEASE:
+ flowtable->use--;
+ /* fall through */
+ default:
+ return;
+ }
+}
+EXPORT_SYMBOL_GPL(nf_tables_deactivate_flowtable);
+
static struct nft_flowtable *
nft_flowtable_lookup_byhandle(const struct nft_table *table,
const struct nlattr *nla, u8 genmask)
@@ -5568,43 +6063,6 @@ nft_flowtable_lookup_byhandle(const struct nft_table *table,
return ERR_PTR(-ENOENT);
}
-static int nf_tables_parse_devices(const struct nft_ctx *ctx,
- const struct nlattr *attr,
- struct net_device *dev_array[], int *len)
-{
- const struct nlattr *tmp;
- struct net_device *dev;
- char ifname[IFNAMSIZ];
- int rem, n = 0, err;
-
- nla_for_each_nested(tmp, attr, rem) {
- if (nla_type(tmp) != NFTA_DEVICE_NAME) {
- err = -EINVAL;
- goto err1;
- }
-
- nla_strlcpy(ifname, tmp, IFNAMSIZ);
- dev = __dev_get_by_name(ctx->net, ifname);
- if (!dev) {
- err = -ENOENT;
- goto err1;
- }
-
- dev_array[n++] = dev;
- if (n == NFT_FLOWTABLE_DEVICE_MAX) {
- err = -EFBIG;
- goto err1;
- }
- }
- if (!len)
- return -EINVAL;
-
- err = 0;
-err1:
- *len = n;
- return err;
-}
-
static const struct nla_policy nft_flowtable_hook_policy[NFTA_FLOWTABLE_HOOK_MAX + 1] = {
[NFTA_FLOWTABLE_HOOK_NUM] = { .type = NLA_U32 },
[NFTA_FLOWTABLE_HOOK_PRIORITY] = { .type = NLA_U32 },
@@ -5615,11 +6073,10 @@ static int nf_tables_flowtable_parse_hook(const struct nft_ctx *ctx,
const struct nlattr *attr,
struct nft_flowtable *flowtable)
{
- struct net_device *dev_array[NFT_FLOWTABLE_DEVICE_MAX];
struct nlattr *tb[NFTA_FLOWTABLE_HOOK_MAX + 1];
- struct nf_hook_ops *ops;
+ struct nft_hook *hook;
int hooknum, priority;
- int err, n = 0, i;
+ int err;
err = nla_parse_nested_deprecated(tb, NFTA_FLOWTABLE_HOOK_MAX, attr,
nft_flowtable_hook_policy, NULL);
@@ -5637,27 +6094,21 @@ static int nf_tables_flowtable_parse_hook(const struct nft_ctx *ctx,
priority = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_PRIORITY]));
- err = nf_tables_parse_devices(ctx, tb[NFTA_FLOWTABLE_HOOK_DEVS],
- dev_array, &n);
+ err = nf_tables_parse_netdev_hooks(ctx->net,
+ tb[NFTA_FLOWTABLE_HOOK_DEVS],
+ &flowtable->hook_list);
if (err < 0)
return err;
- ops = kcalloc(n, sizeof(struct nf_hook_ops), GFP_KERNEL);
- if (!ops)
- return -ENOMEM;
-
- flowtable->hooknum = hooknum;
- flowtable->priority = priority;
- flowtable->ops = ops;
- flowtable->ops_len = n;
+ flowtable->hooknum = hooknum;
+ flowtable->data.priority = priority;
- for (i = 0; i < n; i++) {
- flowtable->ops[i].pf = NFPROTO_NETDEV;
- flowtable->ops[i].hooknum = hooknum;
- flowtable->ops[i].priority = priority;
- flowtable->ops[i].priv = &flowtable->data;
- flowtable->ops[i].hook = flowtable->data.type->hook;
- flowtable->ops[i].dev = dev_array[i];
+ list_for_each_entry(hook, &flowtable->hook_list, list) {
+ hook->ops.pf = NFPROTO_NETDEV;
+ hook->ops.hooknum = hooknum;
+ hook->ops.priority = priority;
+ hook->ops.priv = &flowtable->data;
+ hook->ops.hook = flowtable->data.type->hook;
}
return err;
@@ -5686,25 +6137,81 @@ nft_flowtable_type_get(struct net *net, u8 family)
lockdep_nfnl_nft_mutex_not_held();
#ifdef CONFIG_MODULES
if (type == NULL) {
- nft_request_module(net, "nf-flowtable-%u", family);
- if (__nft_flowtable_type_get(family))
+ if (nft_request_module(net, "nf-flowtable-%u", family) == -EAGAIN)
return ERR_PTR(-EAGAIN);
}
#endif
return ERR_PTR(-ENOENT);
}
+/* Only called from error and netdev event paths. */
+static void nft_unregister_flowtable_hook(struct net *net,
+ struct nft_flowtable *flowtable,
+ struct nft_hook *hook)
+{
+ nf_unregister_net_hook(net, &hook->ops);
+ flowtable->data.type->setup(&flowtable->data, hook->ops.dev,
+ FLOW_BLOCK_UNBIND);
+}
+
static void nft_unregister_flowtable_net_hooks(struct net *net,
struct nft_flowtable *flowtable)
{
- int i;
+ struct nft_hook *hook;
- for (i = 0; i < flowtable->ops_len; i++) {
- if (!flowtable->ops[i].dev)
- continue;
+ list_for_each_entry(hook, &flowtable->hook_list, list)
+ nf_unregister_net_hook(net, &hook->ops);
+}
- nf_unregister_net_hook(net, &flowtable->ops[i]);
+static int nft_register_flowtable_net_hooks(struct net *net,
+ struct nft_table *table,
+ struct nft_flowtable *flowtable)
+{
+ struct nft_hook *hook, *hook2, *next;
+ struct nft_flowtable *ft;
+ int err, i = 0;
+
+ list_for_each_entry(hook, &flowtable->hook_list, list) {
+ list_for_each_entry(ft, &table->flowtables, list) {
+ list_for_each_entry(hook2, &ft->hook_list, list) {
+ if (hook->ops.dev == hook2->ops.dev &&
+ hook->ops.pf == hook2->ops.pf) {
+ err = -EBUSY;
+ goto err_unregister_net_hooks;
+ }
+ }
+ }
+
+ err = flowtable->data.type->setup(&flowtable->data,
+ hook->ops.dev,
+ FLOW_BLOCK_BIND);
+ if (err < 0)
+ goto err_unregister_net_hooks;
+
+ err = nf_register_net_hook(net, &hook->ops);
+ if (err < 0) {
+ flowtable->data.type->setup(&flowtable->data,
+ hook->ops.dev,
+ FLOW_BLOCK_UNBIND);
+ goto err_unregister_net_hooks;
+ }
+
+ i++;
}
+
+ return 0;
+
+err_unregister_net_hooks:
+ list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) {
+ if (i-- <= 0)
+ break;
+
+ nft_unregister_flowtable_hook(net, flowtable, hook);
+ list_del_rcu(&hook->list);
+ kfree_rcu(hook, rcu);
+ }
+
+ return err;
}
static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
@@ -5715,12 +6222,13 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
const struct nf_flowtable_type *type;
- struct nft_flowtable *flowtable, *ft;
u8 genmask = nft_genmask_next(net);
int family = nfmsg->nfgen_family;
+ struct nft_flowtable *flowtable;
+ struct nft_hook *hook, *next;
struct nft_table *table;
struct nft_ctx ctx;
- int err, i, k;
+ int err;
if (!nla[NFTA_FLOWTABLE_TABLE] ||
!nla[NFTA_FLOWTABLE_NAME] ||
@@ -5759,6 +6267,7 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
flowtable->table = table;
flowtable->handle = nf_tables_alloc_handle(table);
+ INIT_LIST_HEAD(&flowtable->hook_list);
flowtable->name = nla_strdup(nla[NFTA_FLOWTABLE_NAME], GFP_KERNEL);
if (!flowtable->name) {
@@ -5772,6 +6281,14 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
goto err2;
}
+ if (nla[NFTA_FLOWTABLE_FLAGS]) {
+ flowtable->data.flags =
+ ntohl(nla_get_be32(nla[NFTA_FLOWTABLE_FLAGS]));
+ if (flowtable->data.flags & ~NF_FLOWTABLE_HW_OFFLOAD)
+ goto err3;
+ }
+
+ write_pnet(&flowtable->data.net, net);
flowtable->data.type = type;
err = type->init(&flowtable->data);
if (err < 0)
@@ -5782,43 +6299,24 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
if (err < 0)
goto err4;
- for (i = 0; i < flowtable->ops_len; i++) {
- if (!flowtable->ops[i].dev)
- continue;
-
- list_for_each_entry(ft, &table->flowtables, list) {
- for (k = 0; k < ft->ops_len; k++) {
- if (!ft->ops[k].dev)
- continue;
-
- if (flowtable->ops[i].dev == ft->ops[k].dev &&
- flowtable->ops[i].pf == ft->ops[k].pf) {
- err = -EBUSY;
- goto err5;
- }
- }
- }
-
- err = nf_register_net_hook(net, &flowtable->ops[i]);
- if (err < 0)
- goto err5;
- }
+ err = nft_register_flowtable_net_hooks(ctx.net, table, flowtable);
+ if (err < 0)
+ goto err4;
err = nft_trans_flowtable_add(&ctx, NFT_MSG_NEWFLOWTABLE, flowtable);
if (err < 0)
- goto err6;
+ goto err5;
list_add_tail_rcu(&flowtable->list, &table->flowtables);
table->use++;
return 0;
-err6:
- i = flowtable->ops_len;
err5:
- for (k = i - 1; k >= 0; k--)
- nf_unregister_net_hook(net, &flowtable->ops[k]);
-
- kfree(flowtable->ops);
+ list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) {
+ nft_unregister_flowtable_hook(net, flowtable, hook);
+ list_del_rcu(&hook->list);
+ kfree_rcu(hook, rcu);
+ }
err4:
flowtable->data.type->free(&flowtable->data);
err3:
@@ -5885,8 +6383,8 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net,
{
struct nlattr *nest, *nest_devs;
struct nfgenmsg *nfmsg;
+ struct nft_hook *hook;
struct nlmsghdr *nlh;
- int i;
event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags);
@@ -5902,25 +6400,23 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net,
nla_put_string(skb, NFTA_FLOWTABLE_NAME, flowtable->name) ||
nla_put_be32(skb, NFTA_FLOWTABLE_USE, htonl(flowtable->use)) ||
nla_put_be64(skb, NFTA_FLOWTABLE_HANDLE, cpu_to_be64(flowtable->handle),
- NFTA_FLOWTABLE_PAD))
+ NFTA_FLOWTABLE_PAD) ||
+ nla_put_be32(skb, NFTA_FLOWTABLE_FLAGS, htonl(flowtable->data.flags)))
goto nla_put_failure;
nest = nla_nest_start_noflag(skb, NFTA_FLOWTABLE_HOOK);
if (!nest)
goto nla_put_failure;
if (nla_put_be32(skb, NFTA_FLOWTABLE_HOOK_NUM, htonl(flowtable->hooknum)) ||
- nla_put_be32(skb, NFTA_FLOWTABLE_HOOK_PRIORITY, htonl(flowtable->priority)))
+ nla_put_be32(skb, NFTA_FLOWTABLE_HOOK_PRIORITY, htonl(flowtable->data.priority)))
goto nla_put_failure;
nest_devs = nla_nest_start_noflag(skb, NFTA_FLOWTABLE_HOOK_DEVS);
if (!nest_devs)
goto nla_put_failure;
- for (i = 0; i < flowtable->ops_len; i++) {
- const struct net_device *dev = READ_ONCE(flowtable->ops[i].dev);
-
- if (dev &&
- nla_put_string(skb, NFTA_DEVICE_NAME, dev->name))
+ list_for_each_entry_rcu(hook, &flowtable->hook_list, list) {
+ if (nla_put_string(skb, NFTA_DEVICE_NAME, hook->ops.dev->name))
goto nla_put_failure;
}
nla_nest_end(skb, nest_devs);
@@ -6111,9 +6607,16 @@ err:
static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable)
{
- kfree(flowtable->ops);
- kfree(flowtable->name);
+ struct nft_hook *hook, *next;
+
flowtable->data.type->free(&flowtable->data);
+ list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) {
+ flowtable->data.type->setup(&flowtable->data, hook->ops.dev,
+ FLOW_BLOCK_UNBIND);
+ list_del_rcu(&hook->list);
+ kfree(hook);
+ }
+ kfree(flowtable->name);
module_put(flowtable->data.type->owner);
kfree(flowtable);
}
@@ -6151,14 +6654,16 @@ nla_put_failure:
static void nft_flowtable_event(unsigned long event, struct net_device *dev,
struct nft_flowtable *flowtable)
{
- int i;
+ struct nft_hook *hook;
- for (i = 0; i < flowtable->ops_len; i++) {
- if (flowtable->ops[i].dev != dev)
+ list_for_each_entry(hook, &flowtable->hook_list, list) {
+ if (hook->ops.dev != dev)
continue;
- nf_unregister_net_hook(dev_net(dev), &flowtable->ops[i]);
- flowtable->ops[i].dev = NULL;
+ /* flow_offload_netdev_event() cleans up entries for us. */
+ nft_unregister_flowtable_hook(dev_net(dev), flowtable, hook);
+ list_del_rcu(&hook->list);
+ kfree_rcu(hook, rcu);
break;
}
}
@@ -6431,6 +6936,20 @@ static void nft_chain_commit_update(struct nft_trans *trans)
}
}
+static void nft_obj_commit_update(struct nft_trans *trans)
+{
+ struct nft_object *newobj;
+ struct nft_object *obj;
+
+ obj = nft_trans_obj(trans);
+ newobj = nft_trans_obj_newobj(trans);
+
+ if (obj->ops->update)
+ obj->ops->update(obj, newobj);
+
+ kfree(newobj);
+}
+
static void nft_commit_release(struct nft_trans *trans)
{
switch (trans->msg_type) {
@@ -6620,6 +7139,18 @@ static void nft_chain_del(struct nft_chain *chain)
list_del_rcu(&chain->list);
}
+static void nf_tables_module_autoload_cleanup(struct net *net)
+{
+ struct nft_module_request *req, *next;
+
+ WARN_ON_ONCE(!list_empty(&net->nft.commit_list));
+ list_for_each_entry_safe(req, next, &net->nft.module_list, list) {
+ WARN_ON_ONCE(!req->done);
+ list_del(&req->list);
+ kfree(req);
+ }
+}
+
static void nf_tables_commit_release(struct net *net)
{
struct nft_trans *trans;
@@ -6632,6 +7163,7 @@ static void nf_tables_commit_release(struct net *net)
* to prevent expensive synchronize_rcu() in commit phase.
*/
if (list_empty(&net->nft.commit_list)) {
+ nf_tables_module_autoload_cleanup(net);
mutex_unlock(&net->nft.commit_mutex);
return;
}
@@ -6646,6 +7178,7 @@ static void nf_tables_commit_release(struct net *net)
list_splice_tail_init(&net->nft.commit_list, &nf_tables_destroy_list);
spin_unlock(&nf_tables_destroy_list_lock);
+ nf_tables_module_autoload_cleanup(net);
mutex_unlock(&net->nft.commit_mutex);
schedule_work(&trans_destroy_work);
@@ -6795,10 +7328,18 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
te->set->ndeact--;
break;
case NFT_MSG_NEWOBJ:
- nft_clear(net, nft_trans_obj(trans));
- nf_tables_obj_notify(&trans->ctx, nft_trans_obj(trans),
- NFT_MSG_NEWOBJ);
- nft_trans_destroy(trans);
+ if (nft_trans_obj_update(trans)) {
+ nft_obj_commit_update(trans);
+ nf_tables_obj_notify(&trans->ctx,
+ nft_trans_obj(trans),
+ NFT_MSG_NEWOBJ);
+ } else {
+ nft_clear(net, nft_trans_obj(trans));
+ nf_tables_obj_notify(&trans->ctx,
+ nft_trans_obj(trans),
+ NFT_MSG_NEWOBJ);
+ nft_trans_destroy(trans);
+ }
break;
case NFT_MSG_DELOBJ:
nft_obj_del(nft_trans_obj(trans));
@@ -6829,6 +7370,26 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
return 0;
}
+static void nf_tables_module_autoload(struct net *net)
+{
+ struct nft_module_request *req, *next;
+ LIST_HEAD(module_list);
+
+ list_splice_init(&net->nft.module_list, &module_list);
+ mutex_unlock(&net->nft.commit_mutex);
+ list_for_each_entry_safe(req, next, &module_list, list) {
+ if (req->done) {
+ list_del(&req->list);
+ kfree(req);
+ } else {
+ request_module("%s", req->module);
+ req->done = true;
+ }
+ }
+ mutex_lock(&net->nft.commit_mutex);
+ list_splice(&module_list, &net->nft.module_list);
+}
+
static void nf_tables_abort_release(struct nft_trans *trans)
{
switch (trans->msg_type) {
@@ -6858,7 +7419,7 @@ static void nf_tables_abort_release(struct nft_trans *trans)
kfree(trans);
}
-static int __nf_tables_abort(struct net *net)
+static int __nf_tables_abort(struct net *net, bool autoload)
{
struct nft_trans *trans, *next;
struct nft_trans_elem *te;
@@ -6945,8 +7506,13 @@ static int __nf_tables_abort(struct net *net)
nft_trans_destroy(trans);
break;
case NFT_MSG_NEWOBJ:
- trans->ctx.table->use--;
- nft_obj_del(nft_trans_obj(trans));
+ if (nft_trans_obj_update(trans)) {
+ kfree(nft_trans_obj_newobj(trans));
+ nft_trans_destroy(trans);
+ } else {
+ trans->ctx.table->use--;
+ nft_obj_del(nft_trans_obj(trans));
+ }
break;
case NFT_MSG_DELOBJ:
trans->ctx.table->use++;
@@ -6975,6 +7541,11 @@ static int __nf_tables_abort(struct net *net)
nf_tables_abort_release(trans);
}
+ if (autoload)
+ nf_tables_module_autoload(net);
+ else
+ nf_tables_module_autoload_cleanup(net);
+
return 0;
}
@@ -6983,9 +7554,9 @@ static void nf_tables_cleanup(struct net *net)
nft_validate_state_update(net, NFT_VALIDATE_SKIP);
}
-static int nf_tables_abort(struct net *net, struct sk_buff *skb)
+static int nf_tables_abort(struct net *net, struct sk_buff *skb, bool autoload)
{
- int ret = __nf_tables_abort(net);
+ int ret = __nf_tables_abort(net, autoload);
mutex_unlock(&net->nft.commit_mutex);
@@ -7235,7 +7806,7 @@ int nft_validate_register_load(enum nft_registers reg, unsigned int len)
return -EINVAL;
if (len == 0)
return -EINVAL;
- if (reg * NFT_REG32_SIZE + len > FIELD_SIZEOF(struct nft_regs, data))
+ if (reg * NFT_REG32_SIZE + len > sizeof_field(struct nft_regs, data))
return -ERANGE;
return 0;
@@ -7283,7 +7854,7 @@ int nft_validate_register_store(const struct nft_ctx *ctx,
if (len == 0)
return -EINVAL;
if (reg * NFT_REG32_SIZE + len >
- FIELD_SIZEOF(struct nft_regs, data))
+ sizeof_field(struct nft_regs, data))
return -ERANGE;
if (data != NULL && type != NFT_DATA_VALUE)
@@ -7580,6 +8151,7 @@ static int __net_init nf_tables_init_net(struct net *net)
{
INIT_LIST_HEAD(&net->nft.tables);
INIT_LIST_HEAD(&net->nft.commit_list);
+ INIT_LIST_HEAD(&net->nft.module_list);
mutex_init(&net->nft.commit_mutex);
net->nft.base_seq = 1;
net->nft.validate_state = NFT_VALIDATE_SKIP;
@@ -7591,7 +8163,7 @@ static void __net_exit nf_tables_exit_net(struct net *net)
{
mutex_lock(&net->nft.commit_mutex);
if (!list_empty(&net->nft.commit_list))
- __nf_tables_abort(net);
+ __nf_tables_abort(net, false);
__nft_release_tables(net);
mutex_unlock(&net->nft.commit_mutex);
WARN_ON_ONCE(!list_empty(&net->nft.tables));
@@ -7627,13 +8199,20 @@ static int __init nf_tables_module_init(void)
if (err < 0)
goto err4;
+ err = nft_offload_init();
+ if (err < 0)
+ goto err5;
+
/* must be last */
err = nfnetlink_subsys_register(&nf_tables_subsys);
if (err < 0)
- goto err5;
+ goto err6;
nft_chain_route_init();
+
return err;
+err6:
+ nft_offload_exit();
err5:
rhltable_destroy(&nft_objname_ht);
err4:
@@ -7650,6 +8229,7 @@ err1:
static void __exit nf_tables_module_exit(void)
{
nfnetlink_subsys_unregister(&nf_tables_subsys);
+ nft_offload_exit();
unregister_netdevice_notifier(&nf_tables_flowtable_notifier);
nft_chain_filter_fini();
nft_chain_route_fini();
diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c
index c0d18c1d77ac..2bb28483af22 100644
--- a/net/netfilter/nf_tables_offload.c
+++ b/net/netfilter/nf_tables_offload.c
@@ -28,13 +28,10 @@ static struct nft_flow_rule *nft_flow_rule_alloc(int num_actions)
return flow;
}
-struct nft_flow_rule *nft_flow_rule_create(const struct nft_rule *rule)
+struct nft_flow_rule *nft_flow_rule_create(struct net *net,
+ const struct nft_rule *rule)
{
- struct nft_offload_ctx ctx = {
- .dep = {
- .type = NFT_OFFLOAD_DEP_UNSPEC,
- },
- };
+ struct nft_offload_ctx *ctx;
struct nft_flow_rule *flow;
int num_actions = 0, err;
struct nft_expr *expr;
@@ -47,26 +44,40 @@ struct nft_flow_rule *nft_flow_rule_create(const struct nft_rule *rule)
expr = nft_expr_next(expr);
}
+ if (num_actions == 0)
+ return ERR_PTR(-EOPNOTSUPP);
+
flow = nft_flow_rule_alloc(num_actions);
if (!flow)
return ERR_PTR(-ENOMEM);
expr = nft_expr_first(rule);
+
+ ctx = kzalloc(sizeof(struct nft_offload_ctx), GFP_KERNEL);
+ if (!ctx) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+ ctx->net = net;
+ ctx->dep.type = NFT_OFFLOAD_DEP_UNSPEC;
+
while (expr->ops && expr != nft_expr_last(rule)) {
if (!expr->ops->offload) {
err = -EOPNOTSUPP;
goto err_out;
}
- err = expr->ops->offload(&ctx, flow, expr);
+ err = expr->ops->offload(ctx, flow, expr);
if (err < 0)
goto err_out;
expr = nft_expr_next(expr);
}
- flow->proto = ctx.dep.l3num;
+ flow->proto = ctx->dep.l3num;
+ kfree(ctx);
return flow;
err_out:
+ kfree(ctx);
nft_flow_rule_destroy(flow);
return ERR_PTR(err);
@@ -74,6 +85,19 @@ err_out:
void nft_flow_rule_destroy(struct nft_flow_rule *flow)
{
+ struct flow_action_entry *entry;
+ int i;
+
+ flow_action_for_each(i, entry, &flow->rule->action) {
+ switch (entry->id) {
+ case FLOW_ACTION_REDIRECT:
+ case FLOW_ACTION_MIRRED:
+ dev_put(entry->dev);
+ break;
+ default:
+ break;
+ }
+ }
kfree(flow->rule);
kfree(flow);
}
@@ -111,13 +135,13 @@ static void nft_flow_offload_common_init(struct flow_cls_common_offload *common,
common->extack = extack;
}
-static int nft_setup_cb_call(struct nft_base_chain *basechain,
- enum tc_setup_type type, void *type_data)
+static int nft_setup_cb_call(enum tc_setup_type type, void *type_data,
+ struct list_head *cb_list)
{
struct flow_block_cb *block_cb;
int err;
- list_for_each_entry(block_cb, &basechain->flow_block.cb_list, list) {
+ list_for_each_entry(block_cb, cb_list, list) {
err = block_cb->cb(type, type_data, block_cb->cb_priv);
if (err < 0)
return err;
@@ -134,32 +158,46 @@ int nft_chain_offload_priority(struct nft_base_chain *basechain)
return 0;
}
-static int nft_flow_offload_rule(struct nft_trans *trans,
- enum flow_cls_command command)
+static void nft_flow_cls_offload_setup(struct flow_cls_offload *cls_flow,
+ const struct nft_base_chain *basechain,
+ const struct nft_rule *rule,
+ const struct nft_flow_rule *flow,
+ struct netlink_ext_ack *extack,
+ enum flow_cls_command command)
{
- struct nft_flow_rule *flow = nft_trans_flow_rule(trans);
- struct nft_rule *rule = nft_trans_rule(trans);
- struct flow_cls_offload cls_flow = {};
- struct nft_base_chain *basechain;
- struct netlink_ext_ack extack;
__be16 proto = ETH_P_ALL;
- if (!nft_is_base_chain(trans->ctx.chain))
- return -EOPNOTSUPP;
-
- basechain = nft_base_chain(trans->ctx.chain);
+ memset(cls_flow, 0, sizeof(*cls_flow));
if (flow)
proto = flow->proto;
- nft_flow_offload_common_init(&cls_flow.common, proto,
- basechain->ops.priority, &extack);
- cls_flow.command = command;
- cls_flow.cookie = (unsigned long) rule;
+ nft_flow_offload_common_init(&cls_flow->common, proto,
+ basechain->ops.priority, extack);
+ cls_flow->command = command;
+ cls_flow->cookie = (unsigned long) rule;
if (flow)
- cls_flow.rule = flow->rule;
+ cls_flow->rule = flow->rule;
+}
+
+static int nft_flow_offload_rule(struct nft_chain *chain,
+ struct nft_rule *rule,
+ struct nft_flow_rule *flow,
+ enum flow_cls_command command)
+{
+ struct netlink_ext_ack extack = {};
+ struct flow_cls_offload cls_flow;
+ struct nft_base_chain *basechain;
- return nft_setup_cb_call(basechain, TC_SETUP_CLSFLOWER, &cls_flow);
+ if (!nft_is_base_chain(chain))
+ return -EOPNOTSUPP;
+
+ basechain = nft_base_chain(chain);
+ nft_flow_cls_offload_setup(&cls_flow, basechain, rule, flow, &extack,
+ command);
+
+ return nft_setup_cb_call(TC_SETUP_CLSFLOWER, &cls_flow,
+ &basechain->flow_block.cb_list);
}
static int nft_flow_offload_bind(struct flow_block_offload *bo,
@@ -173,6 +211,18 @@ static int nft_flow_offload_unbind(struct flow_block_offload *bo,
struct nft_base_chain *basechain)
{
struct flow_block_cb *block_cb, *next;
+ struct flow_cls_offload cls_flow;
+ struct netlink_ext_ack extack;
+ struct nft_chain *chain;
+ struct nft_rule *rule;
+
+ chain = &basechain->chain;
+ list_for_each_entry(rule, &chain->rules, list) {
+ memset(&extack, 0, sizeof(extack));
+ nft_flow_cls_offload_setup(&cls_flow, basechain, rule, NULL,
+ &extack, FLOW_CLS_DESTROY);
+ nft_setup_cb_call(TC_SETUP_CLSFLOWER, &cls_flow, &bo->cb_list);
+ }
list_for_each_entry_safe(block_cb, next, &bo->cb_list, list) {
list_del(&block_cb->list);
@@ -182,58 +232,220 @@ static int nft_flow_offload_unbind(struct flow_block_offload *bo,
return 0;
}
+static int nft_block_setup(struct nft_base_chain *basechain,
+ struct flow_block_offload *bo,
+ enum flow_block_command cmd)
+{
+ int err;
+
+ switch (cmd) {
+ case FLOW_BLOCK_BIND:
+ err = nft_flow_offload_bind(bo, basechain);
+ break;
+ case FLOW_BLOCK_UNBIND:
+ err = nft_flow_offload_unbind(bo, basechain);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ err = -EOPNOTSUPP;
+ }
+
+ return err;
+}
+
+static void nft_flow_block_offload_init(struct flow_block_offload *bo,
+ struct net *net,
+ enum flow_block_command cmd,
+ struct nft_base_chain *basechain,
+ struct netlink_ext_ack *extack)
+{
+ memset(bo, 0, sizeof(*bo));
+ bo->net = net;
+ bo->block = &basechain->flow_block;
+ bo->command = cmd;
+ bo->binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS;
+ bo->extack = extack;
+ INIT_LIST_HEAD(&bo->cb_list);
+}
+
+static int nft_block_offload_cmd(struct nft_base_chain *chain,
+ struct net_device *dev,
+ enum flow_block_command cmd)
+{
+ struct netlink_ext_ack extack = {};
+ struct flow_block_offload bo;
+ int err;
+
+ nft_flow_block_offload_init(&bo, dev_net(dev), cmd, chain, &extack);
+
+ err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo);
+ if (err < 0)
+ return err;
+
+ return nft_block_setup(chain, &bo, cmd);
+}
+
+static void nft_indr_block_ing_cmd(struct net_device *dev,
+ struct nft_base_chain *chain,
+ flow_indr_block_bind_cb_t *cb,
+ void *cb_priv,
+ enum flow_block_command cmd)
+{
+ struct netlink_ext_ack extack = {};
+ struct flow_block_offload bo;
+
+ if (!chain)
+ return;
+
+ nft_flow_block_offload_init(&bo, dev_net(dev), cmd, chain, &extack);
+
+ cb(dev, cb_priv, TC_SETUP_BLOCK, &bo);
+
+ nft_block_setup(chain, &bo, cmd);
+}
+
+static int nft_indr_block_offload_cmd(struct nft_base_chain *chain,
+ struct net_device *dev,
+ enum flow_block_command cmd)
+{
+ struct netlink_ext_ack extack = {};
+ struct flow_block_offload bo;
+
+ nft_flow_block_offload_init(&bo, dev_net(dev), cmd, chain, &extack);
+
+ flow_indr_block_call(dev, &bo, cmd);
+
+ if (list_empty(&bo.cb_list))
+ return -EOPNOTSUPP;
+
+ return nft_block_setup(chain, &bo, cmd);
+}
+
#define FLOW_SETUP_BLOCK TC_SETUP_BLOCK
-static int nft_flow_offload_chain(struct nft_trans *trans,
+static int nft_chain_offload_cmd(struct nft_base_chain *basechain,
+ struct net_device *dev,
+ enum flow_block_command cmd)
+{
+ int err;
+
+ if (dev->netdev_ops->ndo_setup_tc)
+ err = nft_block_offload_cmd(basechain, dev, cmd);
+ else
+ err = nft_indr_block_offload_cmd(basechain, dev, cmd);
+
+ return err;
+}
+
+static int nft_flow_block_chain(struct nft_base_chain *basechain,
+ const struct net_device *this_dev,
+ enum flow_block_command cmd)
+{
+ struct net_device *dev;
+ struct nft_hook *hook;
+ int err, i = 0;
+
+ list_for_each_entry(hook, &basechain->hook_list, list) {
+ dev = hook->ops.dev;
+ if (this_dev && this_dev != dev)
+ continue;
+
+ err = nft_chain_offload_cmd(basechain, dev, cmd);
+ if (err < 0 && cmd == FLOW_BLOCK_BIND) {
+ if (!this_dev)
+ goto err_flow_block;
+
+ return err;
+ }
+ i++;
+ }
+
+ return 0;
+
+err_flow_block:
+ list_for_each_entry(hook, &basechain->hook_list, list) {
+ if (i-- <= 0)
+ break;
+
+ dev = hook->ops.dev;
+ nft_chain_offload_cmd(basechain, dev, FLOW_BLOCK_UNBIND);
+ }
+ return err;
+}
+
+static int nft_flow_offload_chain(struct nft_chain *chain, u8 *ppolicy,
enum flow_block_command cmd)
{
- struct nft_chain *chain = trans->ctx.chain;
- struct netlink_ext_ack extack = {};
- struct flow_block_offload bo = {};
struct nft_base_chain *basechain;
- struct net_device *dev;
- int err;
+ u8 policy;
if (!nft_is_base_chain(chain))
return -EOPNOTSUPP;
basechain = nft_base_chain(chain);
- dev = basechain->ops.dev;
- if (!dev || !dev->netdev_ops->ndo_setup_tc)
- return -EOPNOTSUPP;
+ policy = ppolicy ? *ppolicy : basechain->policy;
/* Only default policy to accept is supported for now. */
- if (cmd == FLOW_BLOCK_BIND &&
- nft_trans_chain_policy(trans) != -1 &&
- nft_trans_chain_policy(trans) != NF_ACCEPT)
+ if (cmd == FLOW_BLOCK_BIND && policy == NF_DROP)
return -EOPNOTSUPP;
- bo.command = cmd;
- bo.block = &basechain->flow_block;
- bo.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS;
- bo.extack = &extack;
- INIT_LIST_HEAD(&bo.cb_list);
+ return nft_flow_block_chain(basechain, NULL, cmd);
+}
- err = dev->netdev_ops->ndo_setup_tc(dev, FLOW_SETUP_BLOCK, &bo);
- if (err < 0)
- return err;
+static void nft_flow_rule_offload_abort(struct net *net,
+ struct nft_trans *trans)
+{
+ int err = 0;
- switch (cmd) {
- case FLOW_BLOCK_BIND:
- err = nft_flow_offload_bind(&bo, basechain);
- break;
- case FLOW_BLOCK_UNBIND:
- err = nft_flow_offload_unbind(&bo, basechain);
- break;
- }
+ list_for_each_entry_continue_reverse(trans, &net->nft.commit_list, list) {
+ if (trans->ctx.family != NFPROTO_NETDEV)
+ continue;
- return err;
+ switch (trans->msg_type) {
+ case NFT_MSG_NEWCHAIN:
+ if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD) ||
+ nft_trans_chain_update(trans))
+ continue;
+
+ err = nft_flow_offload_chain(trans->ctx.chain, NULL,
+ FLOW_BLOCK_UNBIND);
+ break;
+ case NFT_MSG_DELCHAIN:
+ if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD))
+ continue;
+
+ err = nft_flow_offload_chain(trans->ctx.chain, NULL,
+ FLOW_BLOCK_BIND);
+ break;
+ case NFT_MSG_NEWRULE:
+ if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD))
+ continue;
+
+ err = nft_flow_offload_rule(trans->ctx.chain,
+ nft_trans_rule(trans),
+ NULL, FLOW_CLS_DESTROY);
+ break;
+ case NFT_MSG_DELRULE:
+ if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD))
+ continue;
+
+ err = nft_flow_offload_rule(trans->ctx.chain,
+ nft_trans_rule(trans),
+ nft_trans_flow_rule(trans),
+ FLOW_CLS_REPLACE);
+ break;
+ }
+
+ if (WARN_ON_ONCE(err))
+ break;
+ }
}
int nft_flow_rule_offload_commit(struct net *net)
{
struct nft_trans *trans;
int err = 0;
+ u8 policy;
list_for_each_entry(trans, &net->nft.commit_list, list) {
if (trans->ctx.family != NFPROTO_NETDEV)
@@ -241,39 +453,171 @@ int nft_flow_rule_offload_commit(struct net *net)
switch (trans->msg_type) {
case NFT_MSG_NEWCHAIN:
- if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD))
+ if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD) ||
+ nft_trans_chain_update(trans))
continue;
- err = nft_flow_offload_chain(trans, FLOW_BLOCK_BIND);
+ policy = nft_trans_chain_policy(trans);
+ err = nft_flow_offload_chain(trans->ctx.chain, &policy,
+ FLOW_BLOCK_BIND);
break;
case NFT_MSG_DELCHAIN:
if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD))
continue;
- err = nft_flow_offload_chain(trans, FLOW_BLOCK_UNBIND);
+ policy = nft_trans_chain_policy(trans);
+ err = nft_flow_offload_chain(trans->ctx.chain, &policy,
+ FLOW_BLOCK_UNBIND);
break;
case NFT_MSG_NEWRULE:
if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD))
continue;
if (trans->ctx.flags & NLM_F_REPLACE ||
- !(trans->ctx.flags & NLM_F_APPEND))
- return -EOPNOTSUPP;
-
- err = nft_flow_offload_rule(trans, FLOW_CLS_REPLACE);
- nft_flow_rule_destroy(nft_trans_flow_rule(trans));
+ !(trans->ctx.flags & NLM_F_APPEND)) {
+ err = -EOPNOTSUPP;
+ break;
+ }
+ err = nft_flow_offload_rule(trans->ctx.chain,
+ nft_trans_rule(trans),
+ nft_trans_flow_rule(trans),
+ FLOW_CLS_REPLACE);
break;
case NFT_MSG_DELRULE:
if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD))
continue;
- err = nft_flow_offload_rule(trans, FLOW_CLS_DESTROY);
+ err = nft_flow_offload_rule(trans->ctx.chain,
+ nft_trans_rule(trans),
+ NULL, FLOW_CLS_DESTROY);
break;
}
- if (err)
- return err;
+ if (err) {
+ nft_flow_rule_offload_abort(net, trans);
+ break;
+ }
+ }
+
+ list_for_each_entry(trans, &net->nft.commit_list, list) {
+ if (trans->ctx.family != NFPROTO_NETDEV)
+ continue;
+
+ switch (trans->msg_type) {
+ case NFT_MSG_NEWRULE:
+ case NFT_MSG_DELRULE:
+ if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD))
+ continue;
+
+ nft_flow_rule_destroy(nft_trans_flow_rule(trans));
+ break;
+ default:
+ break;
+ }
}
return err;
}
+
+static struct nft_chain *__nft_offload_get_chain(struct net_device *dev)
+{
+ struct nft_base_chain *basechain;
+ struct net *net = dev_net(dev);
+ struct nft_hook *hook, *found;
+ const struct nft_table *table;
+ struct nft_chain *chain;
+
+ list_for_each_entry(table, &net->nft.tables, list) {
+ if (table->family != NFPROTO_NETDEV)
+ continue;
+
+ list_for_each_entry(chain, &table->chains, list) {
+ if (!nft_is_base_chain(chain) ||
+ !(chain->flags & NFT_CHAIN_HW_OFFLOAD))
+ continue;
+
+ found = NULL;
+ basechain = nft_base_chain(chain);
+ list_for_each_entry(hook, &basechain->hook_list, list) {
+ if (hook->ops.dev != dev)
+ continue;
+
+ found = hook;
+ break;
+ }
+ if (!found)
+ continue;
+
+ return chain;
+ }
+ }
+
+ return NULL;
+}
+
+static void nft_indr_block_cb(struct net_device *dev,
+ flow_indr_block_bind_cb_t *cb, void *cb_priv,
+ enum flow_block_command cmd)
+{
+ struct net *net = dev_net(dev);
+ struct nft_chain *chain;
+
+ mutex_lock(&net->nft.commit_mutex);
+ chain = __nft_offload_get_chain(dev);
+ if (chain && chain->flags & NFT_CHAIN_HW_OFFLOAD) {
+ struct nft_base_chain *basechain;
+
+ basechain = nft_base_chain(chain);
+ nft_indr_block_ing_cmd(dev, basechain, cb, cb_priv, cmd);
+ }
+ mutex_unlock(&net->nft.commit_mutex);
+}
+
+static int nft_offload_netdev_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct net *net = dev_net(dev);
+ struct nft_chain *chain;
+
+ if (event != NETDEV_UNREGISTER)
+ return NOTIFY_DONE;
+
+ mutex_lock(&net->nft.commit_mutex);
+ chain = __nft_offload_get_chain(dev);
+ if (chain)
+ nft_flow_block_chain(nft_base_chain(chain), dev,
+ FLOW_BLOCK_UNBIND);
+
+ mutex_unlock(&net->nft.commit_mutex);
+
+ return NOTIFY_DONE;
+}
+
+static struct flow_indr_block_entry block_ing_entry = {
+ .cb = nft_indr_block_cb,
+ .list = LIST_HEAD_INIT(block_ing_entry.list),
+};
+
+static struct notifier_block nft_offload_netdev_notifier = {
+ .notifier_call = nft_offload_netdev_event,
+};
+
+int nft_offload_init(void)
+{
+ int err;
+
+ err = register_netdevice_notifier(&nft_offload_netdev_notifier);
+ if (err < 0)
+ return err;
+
+ flow_indr_add_block_cb(&block_ing_entry);
+
+ return 0;
+}
+
+void nft_offload_exit(void)
+{
+ flow_indr_del_block_cb(&block_ing_entry);
+ unregister_netdevice_notifier(&nft_offload_netdev_notifier);
+}
diff --git a/net/netfilter/nf_tables_set_core.c b/net/netfilter/nf_tables_set_core.c
index a9fce8d10051..586b621007eb 100644
--- a/net/netfilter/nf_tables_set_core.c
+++ b/net/netfilter/nf_tables_set_core.c
@@ -9,12 +9,14 @@ static int __init nf_tables_set_module_init(void)
nft_register_set(&nft_set_rhash_type);
nft_register_set(&nft_set_bitmap_type);
nft_register_set(&nft_set_rbtree_type);
+ nft_register_set(&nft_set_pipapo_type);
return 0;
}
static void __exit nf_tables_set_module_exit(void)
{
+ nft_unregister_set(&nft_set_pipapo_type);
nft_unregister_set(&nft_set_rbtree_type);
nft_unregister_set(&nft_set_bitmap_type);
nft_unregister_set(&nft_set_rhash_type);
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 4abbb452cf6c..99127e2d95a8 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -476,7 +476,7 @@ ack:
}
done:
if (status & NFNL_BATCH_REPLAY) {
- ss->abort(net, oskb);
+ ss->abort(net, oskb, true);
nfnl_err_reset(&err_list);
kfree_skb(skb);
module_put(ss->owner);
@@ -487,11 +487,11 @@ done:
status |= NFNL_BATCH_REPLAY;
goto done;
} else if (err) {
- ss->abort(net, oskb);
+ ss->abort(net, oskb, false);
netlink_ack(oskb, nlmsg_hdr(oskb), err, NULL);
}
} else {
- ss->abort(net, oskb);
+ ss->abort(net, oskb, false);
}
if (ss->cleanup)
ss->cleanup(net);
diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c
index 7525063c25f5..de3a9596b7f1 100644
--- a/net/netfilter/nfnetlink_cthelper.c
+++ b/net/netfilter/nfnetlink_cthelper.c
@@ -236,7 +236,7 @@ nfnl_cthelper_create(const struct nlattr * const tb[],
nla_strlcpy(helper->name,
tb[NFCTH_NAME], NF_CT_HELPER_NAME_LEN);
size = ntohl(nla_get_be32(tb[NFCTH_PRIV_DATA_LEN]));
- if (size > FIELD_SIZEOF(struct nf_conn_help, data)) {
+ if (size > sizeof_field(struct nf_conn_help, data)) {
ret = -ENOMEM;
goto err2;
}
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index 6dee4f9a944c..0ba020ca38e6 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -385,6 +385,57 @@ nfulnl_timer(struct timer_list *t)
instance_put(inst);
}
+static u32 nfulnl_get_bridge_size(const struct sk_buff *skb)
+{
+ u32 size = 0;
+
+ if (!skb_mac_header_was_set(skb))
+ return 0;
+
+ if (skb_vlan_tag_present(skb)) {
+ size += nla_total_size(0); /* nested */
+ size += nla_total_size(sizeof(u16)); /* id */
+ size += nla_total_size(sizeof(u16)); /* tag */
+ }
+
+ if (skb->network_header > skb->mac_header)
+ size += nla_total_size(skb->network_header - skb->mac_header);
+
+ return size;
+}
+
+static int nfulnl_put_bridge(struct nfulnl_instance *inst, const struct sk_buff *skb)
+{
+ if (!skb_mac_header_was_set(skb))
+ return 0;
+
+ if (skb_vlan_tag_present(skb)) {
+ struct nlattr *nest;
+
+ nest = nla_nest_start(inst->skb, NFULA_VLAN);
+ if (!nest)
+ goto nla_put_failure;
+
+ if (nla_put_be16(inst->skb, NFULA_VLAN_TCI, htons(skb->vlan_tci)) ||
+ nla_put_be16(inst->skb, NFULA_VLAN_PROTO, skb->vlan_proto))
+ goto nla_put_failure;
+
+ nla_nest_end(inst->skb, nest);
+ }
+
+ if (skb->mac_header < skb->network_header) {
+ int len = (int)(skb->network_header - skb->mac_header);
+
+ if (nla_put(inst->skb, NFULA_L2HDR, len, skb_mac_header(skb)))
+ goto nla_put_failure;
+ }
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
/* This is an inline function, we don't really care about a long
* list of arguments */
static inline int
@@ -580,6 +631,10 @@ __build_packet_message(struct nfnl_log_net *log,
NFULA_CT, NFULA_CT_INFO) < 0)
goto nla_put_failure;
+ if ((pf == NFPROTO_NETDEV || pf == NFPROTO_BRIDGE) &&
+ nfulnl_put_bridge(inst, skb) < 0)
+ goto nla_put_failure;
+
if (data_len) {
struct nlattr *nla;
int size = nla_attr_size(data_len);
@@ -651,7 +706,7 @@ nfulnl_log_packet(struct net *net,
/* FIXME: do we want to make the size calculation conditional based on
* what is actually present? way more branches and checks, but more
* memory efficient... */
- size = nlmsg_total_size(sizeof(struct nfgenmsg))
+ size = nlmsg_total_size(sizeof(struct nfgenmsg))
+ nla_total_size(sizeof(struct nfulnl_msg_packet_hdr))
+ nla_total_size(sizeof(u_int32_t)) /* ifindex */
+ nla_total_size(sizeof(u_int32_t)) /* ifindex */
@@ -668,7 +723,7 @@ nfulnl_log_packet(struct net *net,
+ nla_total_size(sizeof(struct nfgenmsg)); /* NLMSG_DONE */
if (in && skb_mac_header_was_set(skb)) {
- size += nla_total_size(skb->dev->hard_header_len)
+ size += nla_total_size(skb->dev->hard_header_len)
+ nla_total_size(sizeof(u_int16_t)) /* hwtype */
+ nla_total_size(sizeof(u_int16_t)); /* hwlen */
}
@@ -687,6 +742,8 @@ nfulnl_log_packet(struct net *net,
size += nfnl_ct->build_size(ct);
}
}
+ if (pf == NFPROTO_NETDEV || pf == NFPROTO_BRIDGE)
+ size += nfulnl_get_bridge_size(skb);
qthreshold = inst->qthreshold;
/* per-rule qthreshold overrides per-instance */
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index b6a7ce622c72..76535fd9278c 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -394,7 +394,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
char *secdata = NULL;
u32 seclen = 0;
- size = nlmsg_total_size(sizeof(struct nfgenmsg))
+ size = nlmsg_total_size(sizeof(struct nfgenmsg))
+ nla_total_size(sizeof(struct nfqnl_msg_packet_hdr))
+ nla_total_size(sizeof(u_int32_t)) /* ifindex */
+ nla_total_size(sizeof(u_int32_t)) /* ifindex */
@@ -453,7 +453,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
}
if (queue->flags & NFQA_CFG_F_UID_GID) {
- size += (nla_total_size(sizeof(u_int32_t)) /* uid */
+ size += (nla_total_size(sizeof(u_int32_t)) /* uid */
+ nla_total_size(sizeof(u_int32_t))); /* gid */
}
@@ -778,7 +778,7 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
{
unsigned int queued;
struct nfqnl_instance *queue;
- struct sk_buff *skb, *segs;
+ struct sk_buff *skb, *segs, *nskb;
int err = -ENOBUFS;
struct net *net = entry->state.net;
struct nfnl_queue_net *q = nfnl_queue_pernet(net);
@@ -815,8 +815,7 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
goto out_err;
queued = 0;
err = 0;
- do {
- struct sk_buff *nskb = segs->next;
+ skb_list_walk_safe(segs, segs, nskb) {
if (err == 0)
err = __nfqnl_enqueue_packet_gso(net, queue,
segs, entry);
@@ -824,8 +823,7 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
queued++;
else
kfree_skb(segs);
- segs = nskb;
- } while (segs);
+ }
if (queued) {
if (err) /* some segments are already queued */
diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c
index b310b637b550..0ed2281f03be 100644
--- a/net/netfilter/nft_bitwise.c
+++ b/net/netfilter/nft_bitwise.c
@@ -13,25 +13,71 @@
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_core.h>
#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_offload.h>
struct nft_bitwise {
enum nft_registers sreg:8;
enum nft_registers dreg:8;
+ enum nft_bitwise_ops op:8;
u8 len;
struct nft_data mask;
struct nft_data xor;
+ struct nft_data data;
};
+static void nft_bitwise_eval_bool(u32 *dst, const u32 *src,
+ const struct nft_bitwise *priv)
+{
+ unsigned int i;
+
+ for (i = 0; i < DIV_ROUND_UP(priv->len, 4); i++)
+ dst[i] = (src[i] & priv->mask.data[i]) ^ priv->xor.data[i];
+}
+
+static void nft_bitwise_eval_lshift(u32 *dst, const u32 *src,
+ const struct nft_bitwise *priv)
+{
+ u32 shift = priv->data.data[0];
+ unsigned int i;
+ u32 carry = 0;
+
+ for (i = DIV_ROUND_UP(priv->len, sizeof(u32)); i > 0; i--) {
+ dst[i - 1] = (src[i - 1] << shift) | carry;
+ carry = src[i - 1] >> (BITS_PER_TYPE(u32) - shift);
+ }
+}
+
+static void nft_bitwise_eval_rshift(u32 *dst, const u32 *src,
+ const struct nft_bitwise *priv)
+{
+ u32 shift = priv->data.data[0];
+ unsigned int i;
+ u32 carry = 0;
+
+ for (i = 0; i < DIV_ROUND_UP(priv->len, sizeof(u32)); i++) {
+ dst[i] = carry | (src[i] >> shift);
+ carry = src[i] << (BITS_PER_TYPE(u32) - shift);
+ }
+}
+
void nft_bitwise_eval(const struct nft_expr *expr,
struct nft_regs *regs, const struct nft_pktinfo *pkt)
{
const struct nft_bitwise *priv = nft_expr_priv(expr);
const u32 *src = &regs->data[priv->sreg];
u32 *dst = &regs->data[priv->dreg];
- unsigned int i;
- for (i = 0; i < DIV_ROUND_UP(priv->len, 4); i++)
- dst[i] = (src[i] & priv->mask.data[i]) ^ priv->xor.data[i];
+ switch (priv->op) {
+ case NFT_BITWISE_BOOL:
+ nft_bitwise_eval_bool(dst, src, priv);
+ break;
+ case NFT_BITWISE_LSHIFT:
+ nft_bitwise_eval_lshift(dst, src, priv);
+ break;
+ case NFT_BITWISE_RSHIFT:
+ nft_bitwise_eval_rshift(dst, src, priv);
+ break;
+ }
}
static const struct nla_policy nft_bitwise_policy[NFTA_BITWISE_MAX + 1] = {
@@ -40,22 +86,86 @@ static const struct nla_policy nft_bitwise_policy[NFTA_BITWISE_MAX + 1] = {
[NFTA_BITWISE_LEN] = { .type = NLA_U32 },
[NFTA_BITWISE_MASK] = { .type = NLA_NESTED },
[NFTA_BITWISE_XOR] = { .type = NLA_NESTED },
+ [NFTA_BITWISE_OP] = { .type = NLA_U32 },
+ [NFTA_BITWISE_DATA] = { .type = NLA_NESTED },
};
+static int nft_bitwise_init_bool(struct nft_bitwise *priv,
+ const struct nlattr *const tb[])
+{
+ struct nft_data_desc d1, d2;
+ int err;
+
+ if (tb[NFTA_BITWISE_DATA])
+ return -EINVAL;
+
+ if (!tb[NFTA_BITWISE_MASK] ||
+ !tb[NFTA_BITWISE_XOR])
+ return -EINVAL;
+
+ err = nft_data_init(NULL, &priv->mask, sizeof(priv->mask), &d1,
+ tb[NFTA_BITWISE_MASK]);
+ if (err < 0)
+ return err;
+ if (d1.type != NFT_DATA_VALUE || d1.len != priv->len) {
+ err = -EINVAL;
+ goto err1;
+ }
+
+ err = nft_data_init(NULL, &priv->xor, sizeof(priv->xor), &d2,
+ tb[NFTA_BITWISE_XOR]);
+ if (err < 0)
+ goto err1;
+ if (d2.type != NFT_DATA_VALUE || d2.len != priv->len) {
+ err = -EINVAL;
+ goto err2;
+ }
+
+ return 0;
+err2:
+ nft_data_release(&priv->xor, d2.type);
+err1:
+ nft_data_release(&priv->mask, d1.type);
+ return err;
+}
+
+static int nft_bitwise_init_shift(struct nft_bitwise *priv,
+ const struct nlattr *const tb[])
+{
+ struct nft_data_desc d;
+ int err;
+
+ if (tb[NFTA_BITWISE_MASK] ||
+ tb[NFTA_BITWISE_XOR])
+ return -EINVAL;
+
+ if (!tb[NFTA_BITWISE_DATA])
+ return -EINVAL;
+
+ err = nft_data_init(NULL, &priv->data, sizeof(priv->data), &d,
+ tb[NFTA_BITWISE_DATA]);
+ if (err < 0)
+ return err;
+ if (d.type != NFT_DATA_VALUE || d.len != sizeof(u32) ||
+ priv->data.data[0] >= BITS_PER_TYPE(u32)) {
+ nft_data_release(&priv->data, d.type);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int nft_bitwise_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
{
struct nft_bitwise *priv = nft_expr_priv(expr);
- struct nft_data_desc d1, d2;
u32 len;
int err;
- if (tb[NFTA_BITWISE_SREG] == NULL ||
- tb[NFTA_BITWISE_DREG] == NULL ||
- tb[NFTA_BITWISE_LEN] == NULL ||
- tb[NFTA_BITWISE_MASK] == NULL ||
- tb[NFTA_BITWISE_XOR] == NULL)
+ if (!tb[NFTA_BITWISE_SREG] ||
+ !tb[NFTA_BITWISE_DREG] ||
+ !tb[NFTA_BITWISE_LEN])
return -EINVAL;
err = nft_parse_u32_check(tb[NFTA_BITWISE_LEN], U8_MAX, &len);
@@ -75,55 +185,102 @@ static int nft_bitwise_init(const struct nft_ctx *ctx,
if (err < 0)
return err;
- err = nft_data_init(NULL, &priv->mask, sizeof(priv->mask), &d1,
- tb[NFTA_BITWISE_MASK]);
- if (err < 0)
- return err;
- if (d1.len != priv->len) {
- err = -EINVAL;
- goto err1;
+ if (tb[NFTA_BITWISE_OP]) {
+ priv->op = ntohl(nla_get_be32(tb[NFTA_BITWISE_OP]));
+ switch (priv->op) {
+ case NFT_BITWISE_BOOL:
+ case NFT_BITWISE_LSHIFT:
+ case NFT_BITWISE_RSHIFT:
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+ } else {
+ priv->op = NFT_BITWISE_BOOL;
}
- err = nft_data_init(NULL, &priv->xor, sizeof(priv->xor), &d2,
- tb[NFTA_BITWISE_XOR]);
- if (err < 0)
- goto err1;
- if (d2.len != priv->len) {
- err = -EINVAL;
- goto err2;
+ switch(priv->op) {
+ case NFT_BITWISE_BOOL:
+ err = nft_bitwise_init_bool(priv, tb);
+ break;
+ case NFT_BITWISE_LSHIFT:
+ case NFT_BITWISE_RSHIFT:
+ err = nft_bitwise_init_shift(priv, tb);
+ break;
}
- return 0;
-err2:
- nft_data_release(&priv->xor, d2.type);
-err1:
- nft_data_release(&priv->mask, d1.type);
return err;
}
+static int nft_bitwise_dump_bool(struct sk_buff *skb,
+ const struct nft_bitwise *priv)
+{
+ if (nft_data_dump(skb, NFTA_BITWISE_MASK, &priv->mask,
+ NFT_DATA_VALUE, priv->len) < 0)
+ return -1;
+
+ if (nft_data_dump(skb, NFTA_BITWISE_XOR, &priv->xor,
+ NFT_DATA_VALUE, priv->len) < 0)
+ return -1;
+
+ return 0;
+}
+
+static int nft_bitwise_dump_shift(struct sk_buff *skb,
+ const struct nft_bitwise *priv)
+{
+ if (nft_data_dump(skb, NFTA_BITWISE_DATA, &priv->data,
+ NFT_DATA_VALUE, sizeof(u32)) < 0)
+ return -1;
+ return 0;
+}
+
static int nft_bitwise_dump(struct sk_buff *skb, const struct nft_expr *expr)
{
const struct nft_bitwise *priv = nft_expr_priv(expr);
+ int err = 0;
if (nft_dump_register(skb, NFTA_BITWISE_SREG, priv->sreg))
- goto nla_put_failure;
+ return -1;
if (nft_dump_register(skb, NFTA_BITWISE_DREG, priv->dreg))
- goto nla_put_failure;
+ return -1;
if (nla_put_be32(skb, NFTA_BITWISE_LEN, htonl(priv->len)))
- goto nla_put_failure;
+ return -1;
+ if (nla_put_be32(skb, NFTA_BITWISE_OP, htonl(priv->op)))
+ return -1;
- if (nft_data_dump(skb, NFTA_BITWISE_MASK, &priv->mask,
- NFT_DATA_VALUE, priv->len) < 0)
- goto nla_put_failure;
+ switch (priv->op) {
+ case NFT_BITWISE_BOOL:
+ err = nft_bitwise_dump_bool(skb, priv);
+ break;
+ case NFT_BITWISE_LSHIFT:
+ case NFT_BITWISE_RSHIFT:
+ err = nft_bitwise_dump_shift(skb, priv);
+ break;
+ }
- if (nft_data_dump(skb, NFTA_BITWISE_XOR, &priv->xor,
- NFT_DATA_VALUE, priv->len) < 0)
- goto nla_put_failure;
+ return err;
+}
- return 0;
+static struct nft_data zero;
-nla_put_failure:
- return -1;
+static int nft_bitwise_offload(struct nft_offload_ctx *ctx,
+ struct nft_flow_rule *flow,
+ const struct nft_expr *expr)
+{
+ const struct nft_bitwise *priv = nft_expr_priv(expr);
+ struct nft_offload_reg *reg = &ctx->regs[priv->dreg];
+
+ if (priv->op != NFT_BITWISE_BOOL)
+ return -EOPNOTSUPP;
+
+ if (memcmp(&priv->xor, &zero, sizeof(priv->xor)) ||
+ priv->sreg != priv->dreg || priv->len != reg->len)
+ return -EOPNOTSUPP;
+
+ memcpy(&reg->mask, &priv->mask, sizeof(priv->mask));
+
+ return 0;
}
static const struct nft_expr_ops nft_bitwise_ops = {
@@ -132,6 +289,7 @@ static const struct nft_expr_ops nft_bitwise_ops = {
.eval = nft_bitwise_eval,
.init = nft_bitwise_init,
.dump = nft_bitwise_dump,
+ .offload = nft_bitwise_offload,
};
struct nft_expr_type nft_bitwise_type __read_mostly = {
diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c
index e06318428ea0..12bed3f7bbc6 100644
--- a/net/netfilter/nft_byteorder.c
+++ b/net/netfilter/nft_byteorder.c
@@ -43,14 +43,15 @@ void nft_byteorder_eval(const struct nft_expr *expr,
switch (priv->op) {
case NFT_BYTEORDER_NTOH:
for (i = 0; i < priv->len / 8; i++) {
- src64 = get_unaligned((u64 *)&src[i]);
- put_unaligned_be64(src64, &dst[i]);
+ src64 = nft_reg_load64(&src[i]);
+ nft_reg_store64(&dst[i], be64_to_cpu(src64));
}
break;
case NFT_BYTEORDER_HTON:
for (i = 0; i < priv->len / 8; i++) {
- src64 = get_unaligned_be64(&src[i]);
- put_unaligned(src64, (u64 *)&dst[i]);
+ src64 = (__force __u64)
+ cpu_to_be64(nft_reg_load64(&src[i]));
+ nft_reg_store64(&dst[i], src64);
}
break;
}
diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c
index b5d5d071d765..c78d01bc02e9 100644
--- a/net/netfilter/nft_chain_filter.c
+++ b/net/netfilter/nft_chain_filter.c
@@ -287,28 +287,35 @@ static void nft_netdev_event(unsigned long event, struct net_device *dev,
struct nft_ctx *ctx)
{
struct nft_base_chain *basechain = nft_base_chain(ctx->chain);
+ struct nft_hook *hook, *found = NULL;
+ int n = 0;
- switch (event) {
- case NETDEV_UNREGISTER:
- if (strcmp(basechain->dev_name, dev->name) != 0)
- return;
-
- /* UNREGISTER events are also happpening on netns exit.
- *
- * Altough nf_tables core releases all tables/chains, only
- * this event handler provides guarantee that
- * basechain.ops->dev is still accessible, so we cannot
- * skip exiting net namespaces.
- */
- __nft_release_basechain(ctx);
- break;
- case NETDEV_CHANGENAME:
- if (dev->ifindex != basechain->ops.dev->ifindex)
- return;
+ if (event != NETDEV_UNREGISTER)
+ return;
- strncpy(basechain->dev_name, dev->name, IFNAMSIZ);
- break;
+ list_for_each_entry(hook, &basechain->hook_list, list) {
+ if (hook->ops.dev == dev)
+ found = hook;
+
+ n++;
}
+ if (!found)
+ return;
+
+ if (n > 1) {
+ nf_unregister_net_hook(ctx->net, &found->ops);
+ list_del_rcu(&found->list);
+ kfree_rcu(found, rcu);
+ return;
+ }
+
+ /* UNREGISTER events are also happening on netns exit.
+ *
+ * Although nf_tables core releases all tables/chains, only this event
+ * handler provides guarantee that hook->ops.dev is still accessible,
+ * so we cannot skip exiting net namespaces.
+ */
+ __nft_release_basechain(ctx);
}
static int nf_tables_netdev_event(struct notifier_block *this,
diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c
index bd173b1824c6..8a28c127effc 100644
--- a/net/netfilter/nft_cmp.c
+++ b/net/netfilter/nft_cmp.c
@@ -10,6 +10,7 @@
#include <linux/module.h>
#include <linux/netlink.h>
#include <linux/netfilter.h>
+#include <linux/if_arp.h>
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_core.h>
#include <net/netfilter/nf_tables_offload.h>
@@ -80,6 +81,12 @@ static int nft_cmp_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
if (err < 0)
return err;
+ if (desc.type != NFT_DATA_VALUE) {
+ err = -EINVAL;
+ nft_data_release(&priv->data, desc.type);
+ return err;
+ }
+
priv->sreg = nft_parse_register(tb[NFTA_CMP_SREG]);
err = nft_validate_register_load(priv->sreg, desc.len);
if (err < 0)
@@ -116,7 +123,7 @@ static int __nft_cmp_offload(struct nft_offload_ctx *ctx,
u8 *mask = (u8 *)&flow->match.mask;
u8 *key = (u8 *)&flow->match.key;
- if (priv->op != NFT_CMP_EQ)
+ if (priv->op != NFT_CMP_EQ || reg->len != priv->len)
return -EOPNOTSUPP;
memcpy(key + reg->offset, &priv->data, priv->len);
@@ -125,6 +132,11 @@ static int __nft_cmp_offload(struct nft_offload_ctx *ctx,
flow->match.dissector.used_keys |= BIT(reg->key);
flow->match.dissector.offset[reg->key] = reg->base_offset;
+ if (reg->key == FLOW_DISSECTOR_KEY_META &&
+ reg->offset == offsetof(struct nft_flow_key, meta.ingress_iftype) &&
+ nft_reg_load16(priv->data.data) != ARPHRD_ETHER)
+ return -EOPNOTSUPP;
+
nft_offload_update_dependency(ctx, &priv->data, priv->len);
return 0;
diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c
index af1497ab9464..69d6173f91e2 100644
--- a/net/netfilter/nft_connlimit.c
+++ b/net/netfilter/nft_connlimit.c
@@ -218,8 +218,13 @@ static void nft_connlimit_destroy_clone(const struct nft_ctx *ctx,
static bool nft_connlimit_gc(struct net *net, const struct nft_expr *expr)
{
struct nft_connlimit *priv = nft_expr_priv(expr);
+ bool ret;
- return nf_conncount_gc_list(net, &priv->list);
+ local_bh_disable();
+ ret = nf_conncount_gc_list(net, &priv->list);
+ local_bh_enable();
+
+ return ret;
}
static struct nft_expr_type nft_connlimit_type;
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index 46ca8bcca1bd..faea72c2df32 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -440,12 +440,12 @@ static int nft_ct_get_init(const struct nft_ctx *ctx,
switch (ctx->family) {
case NFPROTO_IPV4:
- len = FIELD_SIZEOF(struct nf_conntrack_tuple,
+ len = sizeof_field(struct nf_conntrack_tuple,
src.u3.ip);
break;
case NFPROTO_IPV6:
case NFPROTO_INET:
- len = FIELD_SIZEOF(struct nf_conntrack_tuple,
+ len = sizeof_field(struct nf_conntrack_tuple,
src.u3.ip6);
break;
default:
@@ -457,20 +457,20 @@ static int nft_ct_get_init(const struct nft_ctx *ctx,
if (tb[NFTA_CT_DIRECTION] == NULL)
return -EINVAL;
- len = FIELD_SIZEOF(struct nf_conntrack_tuple, src.u3.ip);
+ len = sizeof_field(struct nf_conntrack_tuple, src.u3.ip);
break;
case NFT_CT_SRC_IP6:
case NFT_CT_DST_IP6:
if (tb[NFTA_CT_DIRECTION] == NULL)
return -EINVAL;
- len = FIELD_SIZEOF(struct nf_conntrack_tuple, src.u3.ip6);
+ len = sizeof_field(struct nf_conntrack_tuple, src.u3.ip6);
break;
case NFT_CT_PROTO_SRC:
case NFT_CT_PROTO_DST:
if (tb[NFTA_CT_DIRECTION] == NULL)
return -EINVAL;
- len = FIELD_SIZEOF(struct nf_conntrack_tuple, src.u.all);
+ len = sizeof_field(struct nf_conntrack_tuple, src.u.all);
break;
case NFT_CT_BYTES:
case NFT_CT_PKTS:
@@ -551,7 +551,7 @@ static int nft_ct_set_init(const struct nft_ctx *ctx,
case NFT_CT_MARK:
if (tb[NFTA_CT_DIRECTION])
return -EINVAL;
- len = FIELD_SIZEOF(struct nf_conn, mark);
+ len = sizeof_field(struct nf_conn, mark);
break;
#endif
#ifdef CONFIG_NF_CONNTRACK_LABELS
diff --git a/net/netfilter/nft_dup_netdev.c b/net/netfilter/nft_dup_netdev.c
index c6052fdd2c40..c2e78c160fd7 100644
--- a/net/netfilter/nft_dup_netdev.c
+++ b/net/netfilter/nft_dup_netdev.c
@@ -10,6 +10,7 @@
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_offload.h>
#include <net/netfilter/nf_dup_netdev.h>
struct nft_dup_netdev {
@@ -56,6 +57,16 @@ nla_put_failure:
return -1;
}
+static int nft_dup_netdev_offload(struct nft_offload_ctx *ctx,
+ struct nft_flow_rule *flow,
+ const struct nft_expr *expr)
+{
+ const struct nft_dup_netdev *priv = nft_expr_priv(expr);
+ int oif = ctx->regs[priv->sreg_dev].data.data[0];
+
+ return nft_fwd_dup_netdev_offload(ctx, flow, FLOW_ACTION_MIRRED, oif);
+}
+
static struct nft_expr_type nft_dup_netdev_type;
static const struct nft_expr_ops nft_dup_netdev_ops = {
.type = &nft_dup_netdev_type,
@@ -63,6 +74,7 @@ static const struct nft_expr_ops nft_dup_netdev_ops = {
.eval = nft_dup_netdev_eval,
.init = nft_dup_netdev_init,
.dump = nft_dup_netdev_dump,
+ .offload = nft_dup_netdev_offload,
};
static struct nft_expr_type nft_dup_netdev_type __read_mostly = {
diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index 33833a0cb989..683785225a3e 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -54,7 +54,7 @@ static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr,
timeout = priv->timeout ? : set->timeout;
elem = nft_set_elem_init(set, &priv->tmpl,
- &regs->data[priv->sreg_key],
+ &regs->data[priv->sreg_key], NULL,
&regs->data[priv->sreg_data],
timeout, 0, GFP_ATOMIC);
if (elem == NULL)
@@ -84,6 +84,11 @@ void nft_dynset_eval(const struct nft_expr *expr,
const struct nft_expr *sexpr;
u64 timeout;
+ if (priv->op == NFT_DYNSET_OP_DELETE) {
+ set->ops->delete(set, &regs->data[priv->sreg_key]);
+ return;
+ }
+
if (set->ops->update(set, &regs->data[priv->sreg_key], nft_dynset_new,
expr, regs, &ext)) {
sexpr = NULL;
@@ -161,6 +166,7 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
priv->op = ntohl(nla_get_be32(tb[NFTA_DYNSET_OP]));
switch (priv->op) {
case NFT_DYNSET_OP_ADD:
+ case NFT_DYNSET_OP_DELETE:
break;
case NFT_DYNSET_OP_UPDATE:
if (!(set->flags & NFT_SET_TIMEOUT))
diff --git a/net/netfilter/nft_fib_netdev.c b/net/netfilter/nft_fib_netdev.c
index 2cf3f32fe6d2..a2e726ae7f07 100644
--- a/net/netfilter/nft_fib_netdev.c
+++ b/net/netfilter/nft_fib_netdev.c
@@ -14,6 +14,7 @@
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_core.h>
#include <net/netfilter/nf_tables.h>
+#include <net/ipv6.h>
#include <net/netfilter/nft_fib.h>
@@ -34,6 +35,8 @@ static void nft_fib_netdev_eval(const struct nft_expr *expr,
}
break;
case ETH_P_IPV6:
+ if (!ipv6_mod_enabled())
+ break;
switch (priv->result) {
case NFT_FIB_RESULT_OIF:
case NFT_FIB_RESULT_OIFNAME:
diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
index 060a4ed46d5e..b70b48996801 100644
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -6,12 +6,13 @@
#include <linux/netfilter.h>
#include <linux/workqueue.h>
#include <linux/spinlock.h>
+#include <linux/netfilter/nf_conntrack_common.h>
#include <linux/netfilter/nf_tables.h>
#include <net/ip.h> /* for ipv4 options. */
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_core.h>
#include <net/netfilter/nf_conntrack_core.h>
-#include <linux/netfilter/nf_conntrack_common.h>
+#include <net/netfilter/nf_conntrack_extend.h>
#include <net/netfilter/nf_flow_table.h>
struct nft_flow_offload {
@@ -114,10 +115,13 @@ static void nft_flow_offload_eval(const struct nft_expr *expr,
if (nft_flow_route(pkt, ct, &route, dir) < 0)
goto err_flow_route;
- flow = flow_offload_alloc(ct, &route);
+ flow = flow_offload_alloc(ct);
if (!flow)
goto err_flow_alloc;
+ if (flow_offload_route_init(flow, &route) < 0)
+ goto err_flow_add;
+
if (tcph) {
ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
@@ -149,6 +153,11 @@ static int nft_flow_offload_validate(const struct nft_ctx *ctx,
return nft_chain_validate_hooks(ctx->chain, hook_mask);
}
+static const struct nla_policy nft_flow_offload_policy[NFTA_FLOW_MAX + 1] = {
+ [NFTA_FLOW_TABLE_NAME] = { .type = NLA_STRING,
+ .len = NFT_NAME_MAXLEN - 1 },
+};
+
static int nft_flow_offload_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
@@ -171,12 +180,26 @@ static int nft_flow_offload_init(const struct nft_ctx *ctx,
return nf_ct_netns_get(ctx->net, ctx->family);
}
-static void nft_flow_offload_destroy(const struct nft_ctx *ctx,
- const struct nft_expr *expr)
+static void nft_flow_offload_deactivate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ enum nft_trans_phase phase)
+{
+ struct nft_flow_offload *priv = nft_expr_priv(expr);
+
+ nf_tables_deactivate_flowtable(ctx, priv->flowtable, phase);
+}
+
+static void nft_flow_offload_activate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
{
struct nft_flow_offload *priv = nft_expr_priv(expr);
- priv->flowtable->use--;
+ priv->flowtable->use++;
+}
+
+static void nft_flow_offload_destroy(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
nf_ct_netns_put(ctx->net, ctx->family);
}
@@ -199,6 +222,8 @@ static const struct nft_expr_ops nft_flow_offload_ops = {
.size = NFT_EXPR_SIZE(sizeof(struct nft_flow_offload)),
.eval = nft_flow_offload_eval,
.init = nft_flow_offload_init,
+ .activate = nft_flow_offload_activate,
+ .deactivate = nft_flow_offload_deactivate,
.destroy = nft_flow_offload_destroy,
.validate = nft_flow_offload_validate,
.dump = nft_flow_offload_dump,
@@ -207,6 +232,7 @@ static const struct nft_expr_ops nft_flow_offload_ops = {
static struct nft_expr_type nft_flow_offload_type __read_mostly = {
.name = "flow_offload",
.ops = &nft_flow_offload_ops,
+ .policy = nft_flow_offload_policy,
.maxattr = NFTA_FLOW_MAX,
.owner = THIS_MODULE,
};
diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c
index 61b7f93ac681..aba11c2333f3 100644
--- a/net/netfilter/nft_fwd_netdev.c
+++ b/net/netfilter/nft_fwd_netdev.c
@@ -12,6 +12,7 @@
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_offload.h>
#include <net/netfilter/nf_dup_netdev.h>
#include <net/neighbour.h>
#include <net/ip.h>
@@ -63,6 +64,16 @@ nla_put_failure:
return -1;
}
+static int nft_fwd_netdev_offload(struct nft_offload_ctx *ctx,
+ struct nft_flow_rule *flow,
+ const struct nft_expr *expr)
+{
+ const struct nft_fwd_netdev *priv = nft_expr_priv(expr);
+ int oif = ctx->regs[priv->sreg_dev].data.data[0];
+
+ return nft_fwd_dup_netdev_offload(ctx, flow, FLOW_ACTION_REDIRECT, oif);
+}
+
struct nft_fwd_neigh {
enum nft_registers sreg_dev:8;
enum nft_registers sreg_addr:8;
@@ -194,6 +205,7 @@ static const struct nft_expr_ops nft_fwd_netdev_ops = {
.eval = nft_fwd_netdev_eval,
.init = nft_fwd_netdev_init,
.dump = nft_fwd_netdev_dump,
+ .offload = nft_fwd_netdev_offload,
};
static const struct nft_expr_ops *
diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c
index ca2ae4b95a8d..c7f0ef73d939 100644
--- a/net/netfilter/nft_immediate.c
+++ b/net/netfilter/nft_immediate.c
@@ -125,17 +125,13 @@ static int nft_immediate_validate(const struct nft_ctx *ctx,
return 0;
}
-static int nft_immediate_offload(struct nft_offload_ctx *ctx,
- struct nft_flow_rule *flow,
- const struct nft_expr *expr)
+static int nft_immediate_offload_verdict(struct nft_offload_ctx *ctx,
+ struct nft_flow_rule *flow,
+ const struct nft_immediate_expr *priv)
{
- const struct nft_immediate_expr *priv = nft_expr_priv(expr);
struct flow_action_entry *entry;
const struct nft_data *data;
- if (priv->dreg != NFT_REG_VERDICT)
- return -EOPNOTSUPP;
-
entry = &flow->rule->action.entries[ctx->num_actions++];
data = &priv->data;
@@ -153,6 +149,20 @@ static int nft_immediate_offload(struct nft_offload_ctx *ctx,
return 0;
}
+static int nft_immediate_offload(struct nft_offload_ctx *ctx,
+ struct nft_flow_rule *flow,
+ const struct nft_expr *expr)
+{
+ const struct nft_immediate_expr *priv = nft_expr_priv(expr);
+
+ if (priv->dreg == NFT_REG_VERDICT)
+ return nft_immediate_offload_verdict(ctx, flow, priv);
+
+ memcpy(&ctx->regs[priv->dreg].data, &priv->data, sizeof(priv->data));
+
+ return 0;
+}
+
static const struct nft_expr_ops nft_imm_ops = {
.type = &nft_imm_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_immediate_expr)),
diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c
index c0560bf3c31b..660bad688e2b 100644
--- a/net/netfilter/nft_lookup.c
+++ b/net/netfilter/nft_lookup.c
@@ -73,9 +73,6 @@ static int nft_lookup_init(const struct nft_ctx *ctx,
if (IS_ERR(set))
return PTR_ERR(set);
- if (set->flags & NFT_SET_EVAL)
- return -EOPNOTSUPP;
-
priv->sreg = nft_parse_register(tb[NFTA_LOOKUP_SREG]);
err = nft_validate_register_load(priv->sreg, set->klen);
if (err < 0)
diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c
index 39dc94f2491e..bc9fd98c5d6d 100644
--- a/net/netfilter/nft_masq.c
+++ b/net/netfilter/nft_masq.c
@@ -43,7 +43,7 @@ static int nft_masq_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
{
- u32 plen = FIELD_SIZEOF(struct nf_nat_range, min_addr.all);
+ u32 plen = sizeof_field(struct nf_nat_range, min_addr.all);
struct nft_masq *priv = nft_expr_priv(expr);
int err;
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index f69afb9ff3cb..951b6e87ed5d 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -17,6 +17,7 @@
#include <linux/smp.h>
#include <linux/static_key.h>
#include <net/dst.h>
+#include <net/ip.h>
#include <net/sock.h>
#include <net/tcp_states.h> /* for TCP_TIME_WAIT */
#include <net/netfilter/nf_tables.h>
@@ -26,16 +27,295 @@
#include <uapi/linux/netfilter_bridge.h> /* NF_BR_PRE_ROUTING */
+#define NFT_META_SECS_PER_MINUTE 60
+#define NFT_META_SECS_PER_HOUR 3600
+#define NFT_META_SECS_PER_DAY 86400
+#define NFT_META_DAYS_PER_WEEK 7
+
static DEFINE_PER_CPU(struct rnd_state, nft_prandom_state);
+static u8 nft_meta_weekday(void)
+{
+ time64_t secs = ktime_get_real_seconds();
+ unsigned int dse;
+ u8 wday;
+
+ secs -= NFT_META_SECS_PER_MINUTE * sys_tz.tz_minuteswest;
+ dse = div_u64(secs, NFT_META_SECS_PER_DAY);
+ wday = (4 + dse) % NFT_META_DAYS_PER_WEEK;
+
+ return wday;
+}
+
+static u32 nft_meta_hour(time64_t secs)
+{
+ struct tm tm;
+
+ time64_to_tm(secs, 0, &tm);
+
+ return tm.tm_hour * NFT_META_SECS_PER_HOUR
+ + tm.tm_min * NFT_META_SECS_PER_MINUTE
+ + tm.tm_sec;
+}
+
+static noinline_for_stack void
+nft_meta_get_eval_time(enum nft_meta_keys key,
+ u32 *dest)
+{
+ switch (key) {
+ case NFT_META_TIME_NS:
+ nft_reg_store64(dest, ktime_get_real_ns());
+ break;
+ case NFT_META_TIME_DAY:
+ nft_reg_store8(dest, nft_meta_weekday());
+ break;
+ case NFT_META_TIME_HOUR:
+ *dest = nft_meta_hour(ktime_get_real_seconds());
+ break;
+ default:
+ break;
+ }
+}
+
+static noinline bool
+nft_meta_get_eval_pkttype_lo(const struct nft_pktinfo *pkt,
+ u32 *dest)
+{
+ const struct sk_buff *skb = pkt->skb;
+
+ switch (nft_pf(pkt)) {
+ case NFPROTO_IPV4:
+ if (ipv4_is_multicast(ip_hdr(skb)->daddr))
+ nft_reg_store8(dest, PACKET_MULTICAST);
+ else
+ nft_reg_store8(dest, PACKET_BROADCAST);
+ break;
+ case NFPROTO_IPV6:
+ nft_reg_store8(dest, PACKET_MULTICAST);
+ break;
+ case NFPROTO_NETDEV:
+ switch (skb->protocol) {
+ case htons(ETH_P_IP): {
+ int noff = skb_network_offset(skb);
+ struct iphdr *iph, _iph;
+
+ iph = skb_header_pointer(skb, noff,
+ sizeof(_iph), &_iph);
+ if (!iph)
+ return false;
+
+ if (ipv4_is_multicast(iph->daddr))
+ nft_reg_store8(dest, PACKET_MULTICAST);
+ else
+ nft_reg_store8(dest, PACKET_BROADCAST);
+
+ break;
+ }
+ case htons(ETH_P_IPV6):
+ nft_reg_store8(dest, PACKET_MULTICAST);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return false;
+ }
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return false;
+ }
+
+ return true;
+}
+
+static noinline bool
+nft_meta_get_eval_skugid(enum nft_meta_keys key,
+ u32 *dest,
+ const struct nft_pktinfo *pkt)
+{
+ struct sock *sk = skb_to_full_sk(pkt->skb);
+ struct socket *sock;
+
+ if (!sk || !sk_fullsock(sk) || !net_eq(nft_net(pkt), sock_net(sk)))
+ return false;
+
+ read_lock_bh(&sk->sk_callback_lock);
+ sock = sk->sk_socket;
+ if (!sock || !sock->file) {
+ read_unlock_bh(&sk->sk_callback_lock);
+ return false;
+ }
+
+ switch (key) {
+ case NFT_META_SKUID:
+ *dest = from_kuid_munged(&init_user_ns,
+ sock->file->f_cred->fsuid);
+ break;
+ case NFT_META_SKGID:
+ *dest = from_kgid_munged(&init_user_ns,
+ sock->file->f_cred->fsgid);
+ break;
+ default:
+ break;
+ }
+
+ read_unlock_bh(&sk->sk_callback_lock);
+ return true;
+}
+
+#ifdef CONFIG_CGROUP_NET_CLASSID
+static noinline bool
+nft_meta_get_eval_cgroup(u32 *dest, const struct nft_pktinfo *pkt)
+{
+ struct sock *sk = skb_to_full_sk(pkt->skb);
+
+ if (!sk || !sk_fullsock(sk) || !net_eq(nft_net(pkt), sock_net(sk)))
+ return false;
+
+ *dest = sock_cgroup_classid(&sk->sk_cgrp_data);
+ return true;
+}
+#endif
+
+static noinline bool nft_meta_get_eval_kind(enum nft_meta_keys key,
+ u32 *dest,
+ const struct nft_pktinfo *pkt)
+{
+ const struct net_device *in = nft_in(pkt), *out = nft_out(pkt);
+
+ switch (key) {
+ case NFT_META_IIFKIND:
+ if (!in || !in->rtnl_link_ops)
+ return false;
+ strncpy((char *)dest, in->rtnl_link_ops->kind, IFNAMSIZ);
+ break;
+ case NFT_META_OIFKIND:
+ if (!out || !out->rtnl_link_ops)
+ return false;
+ strncpy((char *)dest, out->rtnl_link_ops->kind, IFNAMSIZ);
+ break;
+ default:
+ return false;
+ }
+
+ return true;
+}
+
+static void nft_meta_store_ifindex(u32 *dest, const struct net_device *dev)
+{
+ *dest = dev ? dev->ifindex : 0;
+}
+
+static void nft_meta_store_ifname(u32 *dest, const struct net_device *dev)
+{
+ strncpy((char *)dest, dev ? dev->name : "", IFNAMSIZ);
+}
+
+static bool nft_meta_store_iftype(u32 *dest, const struct net_device *dev)
+{
+ if (!dev)
+ return false;
+
+ nft_reg_store16(dest, dev->type);
+ return true;
+}
+
+static bool nft_meta_store_ifgroup(u32 *dest, const struct net_device *dev)
+{
+ if (!dev)
+ return false;
+
+ *dest = dev->group;
+ return true;
+}
+
+static bool nft_meta_get_eval_ifname(enum nft_meta_keys key, u32 *dest,
+ const struct nft_pktinfo *pkt)
+{
+ switch (key) {
+ case NFT_META_IIFNAME:
+ nft_meta_store_ifname(dest, nft_in(pkt));
+ break;
+ case NFT_META_OIFNAME:
+ nft_meta_store_ifname(dest, nft_out(pkt));
+ break;
+ case NFT_META_IIF:
+ nft_meta_store_ifindex(dest, nft_in(pkt));
+ break;
+ case NFT_META_OIF:
+ nft_meta_store_ifindex(dest, nft_out(pkt));
+ break;
+ case NFT_META_IIFTYPE:
+ if (!nft_meta_store_iftype(dest, nft_in(pkt)))
+ return false;
+ break;
+ case NFT_META_OIFTYPE:
+ if (!nft_meta_store_iftype(dest, nft_out(pkt)))
+ return false;
+ break;
+ case NFT_META_IIFGROUP:
+ if (!nft_meta_store_ifgroup(dest, nft_out(pkt)))
+ return false;
+ break;
+ case NFT_META_OIFGROUP:
+ if (!nft_meta_store_ifgroup(dest, nft_out(pkt)))
+ return false;
+ break;
+ default:
+ return false;
+ }
+
+ return true;
+}
+
+static noinline u32 nft_prandom_u32(void)
+{
+ struct rnd_state *state = this_cpu_ptr(&nft_prandom_state);
+
+ return prandom_u32_state(state);
+}
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+static noinline bool
+nft_meta_get_eval_rtclassid(const struct sk_buff *skb, u32 *dest)
+{
+ const struct dst_entry *dst = skb_dst(skb);
+
+ if (!dst)
+ return false;
+
+ *dest = dst->tclassid;
+ return true;
+}
+#endif
+
+static noinline u32 nft_meta_get_eval_sdif(const struct nft_pktinfo *pkt)
+{
+ switch (nft_pf(pkt)) {
+ case NFPROTO_IPV4:
+ return inet_sdif(pkt->skb);
+ case NFPROTO_IPV6:
+ return inet6_sdif(pkt->skb);
+ }
+
+ return 0;
+}
+
+static noinline void
+nft_meta_get_eval_sdifname(u32 *dest, const struct nft_pktinfo *pkt)
+{
+ u32 sdif = nft_meta_get_eval_sdif(pkt);
+ const struct net_device *dev;
+
+ dev = sdif ? dev_get_by_index_rcu(nft_net(pkt), sdif) : NULL;
+ nft_meta_store_ifname(dest, dev);
+}
+
void nft_meta_get_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
{
const struct nft_meta *priv = nft_expr_priv(expr);
const struct sk_buff *skb = pkt->skb;
- const struct net_device *in = nft_in(pkt), *out = nft_out(pkt);
- struct sock *sk;
u32 *dest = &regs->data[priv->dreg];
switch (priv->key) {
@@ -60,69 +340,26 @@ void nft_meta_get_eval(const struct nft_expr *expr,
*dest = skb->mark;
break;
case NFT_META_IIF:
- *dest = in ? in->ifindex : 0;
- break;
case NFT_META_OIF:
- *dest = out ? out->ifindex : 0;
- break;
case NFT_META_IIFNAME:
- strncpy((char *)dest, in ? in->name : "", IFNAMSIZ);
- break;
case NFT_META_OIFNAME:
- strncpy((char *)dest, out ? out->name : "", IFNAMSIZ);
- break;
case NFT_META_IIFTYPE:
- if (in == NULL)
- goto err;
- nft_reg_store16(dest, in->type);
- break;
case NFT_META_OIFTYPE:
- if (out == NULL)
+ case NFT_META_IIFGROUP:
+ case NFT_META_OIFGROUP:
+ if (!nft_meta_get_eval_ifname(priv->key, dest, pkt))
goto err;
- nft_reg_store16(dest, out->type);
break;
case NFT_META_SKUID:
- sk = skb_to_full_sk(skb);
- if (!sk || !sk_fullsock(sk) ||
- !net_eq(nft_net(pkt), sock_net(sk)))
- goto err;
-
- read_lock_bh(&sk->sk_callback_lock);
- if (sk->sk_socket == NULL ||
- sk->sk_socket->file == NULL) {
- read_unlock_bh(&sk->sk_callback_lock);
- goto err;
- }
-
- *dest = from_kuid_munged(&init_user_ns,
- sk->sk_socket->file->f_cred->fsuid);
- read_unlock_bh(&sk->sk_callback_lock);
- break;
case NFT_META_SKGID:
- sk = skb_to_full_sk(skb);
- if (!sk || !sk_fullsock(sk) ||
- !net_eq(nft_net(pkt), sock_net(sk)))
- goto err;
-
- read_lock_bh(&sk->sk_callback_lock);
- if (sk->sk_socket == NULL ||
- sk->sk_socket->file == NULL) {
- read_unlock_bh(&sk->sk_callback_lock);
+ if (!nft_meta_get_eval_skugid(priv->key, dest, pkt))
goto err;
- }
- *dest = from_kgid_munged(&init_user_ns,
- sk->sk_socket->file->f_cred->fsgid);
- read_unlock_bh(&sk->sk_callback_lock);
break;
#ifdef CONFIG_IP_ROUTE_CLASSID
- case NFT_META_RTCLASSID: {
- const struct dst_entry *dst = skb_dst(skb);
-
- if (dst == NULL)
+ case NFT_META_RTCLASSID:
+ if (!nft_meta_get_eval_rtclassid(skb, dest))
goto err;
- *dest = dst->tclassid;
break;
- }
#endif
#ifdef CONFIG_NETWORK_SECMARK
case NFT_META_SECMARK:
@@ -135,88 +372,41 @@ void nft_meta_get_eval(const struct nft_expr *expr,
break;
}
- switch (nft_pf(pkt)) {
- case NFPROTO_IPV4:
- if (ipv4_is_multicast(ip_hdr(skb)->daddr))
- nft_reg_store8(dest, PACKET_MULTICAST);
- else
- nft_reg_store8(dest, PACKET_BROADCAST);
- break;
- case NFPROTO_IPV6:
- nft_reg_store8(dest, PACKET_MULTICAST);
- break;
- case NFPROTO_NETDEV:
- switch (skb->protocol) {
- case htons(ETH_P_IP): {
- int noff = skb_network_offset(skb);
- struct iphdr *iph, _iph;
-
- iph = skb_header_pointer(skb, noff,
- sizeof(_iph), &_iph);
- if (!iph)
- goto err;
-
- if (ipv4_is_multicast(iph->daddr))
- nft_reg_store8(dest, PACKET_MULTICAST);
- else
- nft_reg_store8(dest, PACKET_BROADCAST);
-
- break;
- }
- case htons(ETH_P_IPV6):
- nft_reg_store8(dest, PACKET_MULTICAST);
- break;
- default:
- WARN_ON_ONCE(1);
- goto err;
- }
- break;
- default:
- WARN_ON_ONCE(1);
+ if (!nft_meta_get_eval_pkttype_lo(pkt, dest))
goto err;
- }
break;
case NFT_META_CPU:
*dest = raw_smp_processor_id();
break;
- case NFT_META_IIFGROUP:
- if (in == NULL)
- goto err;
- *dest = in->group;
- break;
- case NFT_META_OIFGROUP:
- if (out == NULL)
- goto err;
- *dest = out->group;
- break;
#ifdef CONFIG_CGROUP_NET_CLASSID
case NFT_META_CGROUP:
- sk = skb_to_full_sk(skb);
- if (!sk || !sk_fullsock(sk) ||
- !net_eq(nft_net(pkt), sock_net(sk)))
+ if (!nft_meta_get_eval_cgroup(dest, pkt))
goto err;
- *dest = sock_cgroup_classid(&sk->sk_cgrp_data);
break;
#endif
- case NFT_META_PRANDOM: {
- struct rnd_state *state = this_cpu_ptr(&nft_prandom_state);
- *dest = prandom_u32_state(state);
+ case NFT_META_PRANDOM:
+ *dest = nft_prandom_u32();
break;
- }
#ifdef CONFIG_XFRM
case NFT_META_SECPATH:
nft_reg_store8(dest, secpath_exists(skb));
break;
#endif
case NFT_META_IIFKIND:
- if (in == NULL || in->rtnl_link_ops == NULL)
- goto err;
- strncpy((char *)dest, in->rtnl_link_ops->kind, IFNAMSIZ);
- break;
case NFT_META_OIFKIND:
- if (out == NULL || out->rtnl_link_ops == NULL)
+ if (!nft_meta_get_eval_kind(priv->key, dest, pkt))
goto err;
- strncpy((char *)dest, out->rtnl_link_ops->kind, IFNAMSIZ);
+ break;
+ case NFT_META_TIME_NS:
+ case NFT_META_TIME_DAY:
+ case NFT_META_TIME_HOUR:
+ nft_meta_get_eval_time(priv->key, dest);
+ break;
+ case NFT_META_SDIF:
+ *dest = nft_meta_get_eval_sdif(pkt);
+ break;
+ case NFT_META_SDIFNAME:
+ nft_meta_get_eval_sdifname(dest, pkt);
break;
default:
WARN_ON(1);
@@ -298,6 +488,7 @@ int nft_meta_get_init(const struct nft_ctx *ctx,
case NFT_META_MARK:
case NFT_META_IIF:
case NFT_META_OIF:
+ case NFT_META_SDIF:
case NFT_META_SKUID:
case NFT_META_SKGID:
#ifdef CONFIG_IP_ROUTE_CLASSID
@@ -319,6 +510,7 @@ int nft_meta_get_init(const struct nft_ctx *ctx,
case NFT_META_OIFNAME:
case NFT_META_IIFKIND:
case NFT_META_OIFKIND:
+ case NFT_META_SDIFNAME:
len = IFNAMSIZ;
break;
case NFT_META_PRANDOM:
@@ -330,6 +522,15 @@ int nft_meta_get_init(const struct nft_ctx *ctx,
len = sizeof(u8);
break;
#endif
+ case NFT_META_TIME_NS:
+ len = sizeof(u64);
+ break;
+ case NFT_META_TIME_DAY:
+ len = sizeof(u8);
+ break;
+ case NFT_META_TIME_HOUR:
+ len = sizeof(u32);
+ break;
default:
return -EOPNOTSUPP;
}
@@ -340,16 +541,28 @@ int nft_meta_get_init(const struct nft_ctx *ctx,
}
EXPORT_SYMBOL_GPL(nft_meta_get_init);
-static int nft_meta_get_validate(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nft_data **data)
+static int nft_meta_get_validate_sdif(const struct nft_ctx *ctx)
{
-#ifdef CONFIG_XFRM
- const struct nft_meta *priv = nft_expr_priv(expr);
unsigned int hooks;
- if (priv->key != NFT_META_SECPATH)
- return 0;
+ switch (ctx->family) {
+ case NFPROTO_IPV4:
+ case NFPROTO_IPV6:
+ case NFPROTO_INET:
+ hooks = (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_FORWARD);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ return nft_chain_validate_hooks(ctx->chain, hooks);
+}
+
+static int nft_meta_get_validate_xfrm(const struct nft_ctx *ctx)
+{
+#ifdef CONFIG_XFRM
+ unsigned int hooks;
switch (ctx->family) {
case NFPROTO_NETDEV:
@@ -372,6 +585,25 @@ static int nft_meta_get_validate(const struct nft_ctx *ctx,
#endif
}
+static int nft_meta_get_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nft_data **data)
+{
+ const struct nft_meta *priv = nft_expr_priv(expr);
+
+ switch (priv->key) {
+ case NFT_META_SECPATH:
+ return nft_meta_get_validate_xfrm(ctx);
+ case NFT_META_SDIF:
+ case NFT_META_SDIFNAME:
+ return nft_meta_get_validate_sdif(ctx);
+ default:
+ break;
+ }
+
+ return 0;
+}
+
int nft_meta_set_validate(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nft_data **data)
@@ -501,6 +733,14 @@ static int nft_meta_get_offload(struct nft_offload_ctx *ctx,
sizeof(__u8), reg);
nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_TRANSPORT);
break;
+ case NFT_META_IIF:
+ NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_META, meta,
+ ingress_ifindex, sizeof(__u32), reg);
+ break;
+ case NFT_META_IIFTYPE:
+ NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_META, meta,
+ ingress_iftype, sizeof(__u16), reg);
+ break;
default:
return -EOPNOTSUPP;
}
diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c
index c3c93e95b46e..8b44a4de5329 100644
--- a/net/netfilter/nft_nat.c
+++ b/net/netfilter/nft_nat.c
@@ -141,10 +141,10 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
switch (family) {
case NFPROTO_IPV4:
- alen = FIELD_SIZEOF(struct nf_nat_range, min_addr.ip);
+ alen = sizeof_field(struct nf_nat_range, min_addr.ip);
break;
case NFPROTO_IPV6:
- alen = FIELD_SIZEOF(struct nf_nat_range, min_addr.ip6);
+ alen = sizeof_field(struct nf_nat_range, min_addr.ip6);
break;
default:
return -EAFNOSUPPORT;
@@ -171,7 +171,7 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
}
}
- plen = FIELD_SIZEOF(struct nf_nat_range, min_addr.all);
+ plen = sizeof_field(struct nf_nat_range, min_addr.all);
if (tb[NFTA_NAT_REG_PROTO_MIN]) {
priv->sreg_proto_min =
nft_parse_register(tb[NFTA_NAT_REG_PROTO_MIN]);
diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c
index f54d6ae15bb1..b42247aa48a9 100644
--- a/net/netfilter/nft_osf.c
+++ b/net/netfilter/nft_osf.c
@@ -61,6 +61,9 @@ static int nft_osf_init(const struct nft_ctx *ctx,
int err;
u8 ttl;
+ if (!tb[NFTA_OSF_DREG])
+ return -EINVAL;
+
if (tb[NFTA_OSF_TTL]) {
ttl = nla_get_u8(tb[NFTA_OSF_TTL]);
if (ttl > 2)
diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c
index 22a80eb60222..1993af3a2979 100644
--- a/net/netfilter/nft_payload.c
+++ b/net/netfilter/nft_payload.c
@@ -23,50 +23,58 @@
#include <linux/ip.h>
#include <linux/ipv6.h>
+static bool nft_payload_rebuild_vlan_hdr(const struct sk_buff *skb, int mac_off,
+ struct vlan_ethhdr *veth)
+{
+ if (skb_copy_bits(skb, mac_off, veth, ETH_HLEN))
+ return false;
+
+ veth->h_vlan_proto = skb->vlan_proto;
+ veth->h_vlan_TCI = htons(skb_vlan_tag_get(skb));
+ veth->h_vlan_encapsulated_proto = skb->protocol;
+
+ return true;
+}
+
/* add vlan header into the user buffer for if tag was removed by offloads */
static bool
nft_payload_copy_vlan(u32 *d, const struct sk_buff *skb, u8 offset, u8 len)
{
int mac_off = skb_mac_header(skb) - skb->data;
- u8 vlan_len, *vlanh, *dst_u8 = (u8 *) d;
+ u8 *vlanh, *dst_u8 = (u8 *) d;
struct vlan_ethhdr veth;
+ u8 vlan_hlen = 0;
+
+ if ((skb->protocol == htons(ETH_P_8021AD) ||
+ skb->protocol == htons(ETH_P_8021Q)) &&
+ offset >= VLAN_ETH_HLEN && offset < VLAN_ETH_HLEN + VLAN_HLEN)
+ vlan_hlen += VLAN_HLEN;
vlanh = (u8 *) &veth;
- if (offset < ETH_HLEN) {
- u8 ethlen = min_t(u8, len, ETH_HLEN - offset);
+ if (offset < VLAN_ETH_HLEN + vlan_hlen) {
+ u8 ethlen = len;
- if (skb_copy_bits(skb, mac_off, &veth, ETH_HLEN))
+ if (vlan_hlen &&
+ skb_copy_bits(skb, mac_off, &veth, VLAN_ETH_HLEN) < 0)
+ return false;
+ else if (!nft_payload_rebuild_vlan_hdr(skb, mac_off, &veth))
return false;
- veth.h_vlan_proto = skb->vlan_proto;
+ if (offset + len > VLAN_ETH_HLEN + vlan_hlen)
+ ethlen -= offset + len - VLAN_ETH_HLEN + vlan_hlen;
- memcpy(dst_u8, vlanh + offset, ethlen);
+ memcpy(dst_u8, vlanh + offset - vlan_hlen, ethlen);
len -= ethlen;
if (len == 0)
return true;
dst_u8 += ethlen;
- offset = ETH_HLEN;
- } else if (offset >= VLAN_ETH_HLEN) {
- offset -= VLAN_HLEN;
- goto skip;
+ offset = ETH_HLEN + vlan_hlen;
+ } else {
+ offset -= VLAN_HLEN + vlan_hlen;
}
- veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb));
- veth.h_vlan_encapsulated_proto = skb->protocol;
-
- vlanh += offset;
-
- vlan_len = min_t(u8, len, VLAN_ETH_HLEN - offset);
- memcpy(dst_u8, vlanh, vlan_len);
-
- len -= vlan_len;
- if (!len)
- return true;
-
- dst_u8 += vlan_len;
- skip:
return skb_copy_bits(skb, offset + mac_off, dst_u8, len) == 0;
}
@@ -161,13 +169,59 @@ static int nft_payload_offload_ll(struct nft_offload_ctx *ctx,
switch (priv->offset) {
case offsetof(struct ethhdr, h_source):
+ if (priv->len != ETH_ALEN)
+ return -EOPNOTSUPP;
+
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_ETH_ADDRS, eth_addrs,
src, ETH_ALEN, reg);
break;
case offsetof(struct ethhdr, h_dest):
+ if (priv->len != ETH_ALEN)
+ return -EOPNOTSUPP;
+
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_ETH_ADDRS, eth_addrs,
dst, ETH_ALEN, reg);
break;
+ case offsetof(struct ethhdr, h_proto):
+ if (priv->len != sizeof(__be16))
+ return -EOPNOTSUPP;
+
+ NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic,
+ n_proto, sizeof(__be16), reg);
+ nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_NETWORK);
+ break;
+ case offsetof(struct vlan_ethhdr, h_vlan_TCI):
+ if (priv->len != sizeof(__be16))
+ return -EOPNOTSUPP;
+
+ NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_VLAN, vlan,
+ vlan_tci, sizeof(__be16), reg);
+ break;
+ case offsetof(struct vlan_ethhdr, h_vlan_encapsulated_proto):
+ if (priv->len != sizeof(__be16))
+ return -EOPNOTSUPP;
+
+ NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_VLAN, vlan,
+ vlan_tpid, sizeof(__be16), reg);
+ nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_NETWORK);
+ break;
+ case offsetof(struct vlan_ethhdr, h_vlan_TCI) + sizeof(struct vlan_hdr):
+ if (priv->len != sizeof(__be16))
+ return -EOPNOTSUPP;
+
+ NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_CVLAN, vlan,
+ vlan_tci, sizeof(__be16), reg);
+ break;
+ case offsetof(struct vlan_ethhdr, h_vlan_encapsulated_proto) +
+ sizeof(struct vlan_hdr):
+ if (priv->len != sizeof(__be16))
+ return -EOPNOTSUPP;
+
+ NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_CVLAN, vlan,
+ vlan_tpid, sizeof(__be16), reg);
+ break;
+ default:
+ return -EOPNOTSUPP;
}
return 0;
@@ -181,14 +235,23 @@ static int nft_payload_offload_ip(struct nft_offload_ctx *ctx,
switch (priv->offset) {
case offsetof(struct iphdr, saddr):
+ if (priv->len != sizeof(struct in_addr))
+ return -EOPNOTSUPP;
+
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4, src,
sizeof(struct in_addr), reg);
break;
case offsetof(struct iphdr, daddr):
+ if (priv->len != sizeof(struct in_addr))
+ return -EOPNOTSUPP;
+
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4, dst,
sizeof(struct in_addr), reg);
break;
case offsetof(struct iphdr, protocol):
+ if (priv->len != sizeof(__u8))
+ return -EOPNOTSUPP;
+
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, ip_proto,
sizeof(__u8), reg);
nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_TRANSPORT);
@@ -208,14 +271,23 @@ static int nft_payload_offload_ip6(struct nft_offload_ctx *ctx,
switch (priv->offset) {
case offsetof(struct ipv6hdr, saddr):
+ if (priv->len != sizeof(struct in6_addr))
+ return -EOPNOTSUPP;
+
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6, src,
sizeof(struct in6_addr), reg);
break;
case offsetof(struct ipv6hdr, daddr):
+ if (priv->len != sizeof(struct in6_addr))
+ return -EOPNOTSUPP;
+
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6, dst,
sizeof(struct in6_addr), reg);
break;
case offsetof(struct ipv6hdr, nexthdr):
+ if (priv->len != sizeof(__u8))
+ return -EOPNOTSUPP;
+
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, ip_proto,
sizeof(__u8), reg);
nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_TRANSPORT);
@@ -255,10 +327,16 @@ static int nft_payload_offload_tcp(struct nft_offload_ctx *ctx,
switch (priv->offset) {
case offsetof(struct tcphdr, source):
+ if (priv->len != sizeof(__be16))
+ return -EOPNOTSUPP;
+
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, src,
sizeof(__be16), reg);
break;
case offsetof(struct tcphdr, dest):
+ if (priv->len != sizeof(__be16))
+ return -EOPNOTSUPP;
+
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, dst,
sizeof(__be16), reg);
break;
@@ -277,10 +355,16 @@ static int nft_payload_offload_udp(struct nft_offload_ctx *ctx,
switch (priv->offset) {
case offsetof(struct udphdr, source):
+ if (priv->len != sizeof(__be16))
+ return -EOPNOTSUPP;
+
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, src,
sizeof(__be16), reg);
break;
case offsetof(struct udphdr, dest):
+ if (priv->len != sizeof(__be16))
+ return -EOPNOTSUPP;
+
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, dst,
sizeof(__be16), reg);
break;
diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c
index c8745d454bf8..4413690591f2 100644
--- a/net/netfilter/nft_quota.c
+++ b/net/netfilter/nft_quota.c
@@ -13,7 +13,7 @@
#include <net/netfilter/nf_tables.h>
struct nft_quota {
- u64 quota;
+ atomic64_t quota;
unsigned long flags;
atomic64_t consumed;
};
@@ -21,7 +21,8 @@ struct nft_quota {
static inline bool nft_overquota(struct nft_quota *priv,
const struct sk_buff *skb)
{
- return atomic64_add_return(skb->len, &priv->consumed) >= priv->quota;
+ return atomic64_add_return(skb->len, &priv->consumed) >=
+ atomic64_read(&priv->quota);
}
static inline bool nft_quota_invert(struct nft_quota *priv)
@@ -89,7 +90,7 @@ static int nft_quota_do_init(const struct nlattr * const tb[],
return -EOPNOTSUPP;
}
- priv->quota = quota;
+ atomic64_set(&priv->quota, quota);
priv->flags = flags;
atomic64_set(&priv->consumed, consumed);
@@ -105,10 +106,22 @@ static int nft_quota_obj_init(const struct nft_ctx *ctx,
return nft_quota_do_init(tb, priv);
}
+static void nft_quota_obj_update(struct nft_object *obj,
+ struct nft_object *newobj)
+{
+ struct nft_quota *newpriv = nft_obj_data(newobj);
+ struct nft_quota *priv = nft_obj_data(obj);
+ u64 newquota;
+
+ newquota = atomic64_read(&newpriv->quota);
+ atomic64_set(&priv->quota, newquota);
+ priv->flags = newpriv->flags;
+}
+
static int nft_quota_do_dump(struct sk_buff *skb, struct nft_quota *priv,
bool reset)
{
- u64 consumed, consumed_cap;
+ u64 consumed, consumed_cap, quota;
u32 flags = priv->flags;
/* Since we inconditionally increment consumed quota for each packet
@@ -116,14 +129,15 @@ static int nft_quota_do_dump(struct sk_buff *skb, struct nft_quota *priv,
* userspace.
*/
consumed = atomic64_read(&priv->consumed);
- if (consumed >= priv->quota) {
- consumed_cap = priv->quota;
+ quota = atomic64_read(&priv->quota);
+ if (consumed >= quota) {
+ consumed_cap = quota;
flags |= NFT_QUOTA_F_DEPLETED;
} else {
consumed_cap = consumed;
}
- if (nla_put_be64(skb, NFTA_QUOTA_BYTES, cpu_to_be64(priv->quota),
+ if (nla_put_be64(skb, NFTA_QUOTA_BYTES, cpu_to_be64(quota),
NFTA_QUOTA_PAD) ||
nla_put_be64(skb, NFTA_QUOTA_CONSUMED, cpu_to_be64(consumed_cap),
NFTA_QUOTA_PAD) ||
@@ -155,6 +169,7 @@ static const struct nft_object_ops nft_quota_obj_ops = {
.init = nft_quota_obj_init,
.eval = nft_quota_obj_eval,
.dump = nft_quota_obj_dump,
+ .update = nft_quota_obj_update,
};
static struct nft_object_type nft_quota_obj_type __read_mostly = {
diff --git a/net/netfilter/nft_range.c b/net/netfilter/nft_range.c
index 4701fa8a45e7..89efcc5a533d 100644
--- a/net/netfilter/nft_range.c
+++ b/net/netfilter/nft_range.c
@@ -66,11 +66,21 @@ static int nft_range_init(const struct nft_ctx *ctx, const struct nft_expr *expr
if (err < 0)
return err;
+ if (desc_from.type != NFT_DATA_VALUE) {
+ err = -EINVAL;
+ goto err1;
+ }
+
err = nft_data_init(NULL, &priv->data_to, sizeof(priv->data_to),
&desc_to, tb[NFTA_RANGE_TO_DATA]);
if (err < 0)
goto err1;
+ if (desc_to.type != NFT_DATA_VALUE) {
+ err = -EINVAL;
+ goto err2;
+ }
+
if (desc_from.len != desc_to.len) {
err = -EINVAL;
goto err2;
diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c
index 43eeb1f609f1..5b779171565c 100644
--- a/net/netfilter/nft_redir.c
+++ b/net/netfilter/nft_redir.c
@@ -48,7 +48,7 @@ static int nft_redir_init(const struct nft_ctx *ctx,
unsigned int plen;
int err;
- plen = FIELD_SIZEOF(struct nf_nat_range, min_addr.all);
+ plen = sizeof_field(struct nf_nat_range, min_addr.all);
if (tb[NFTA_REDIR_REG_PROTO_MIN]) {
priv->sreg_proto_min =
nft_parse_register(tb[NFTA_REDIR_REG_PROTO_MIN]);
diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c
index b5aeccdddb22..87e8d9ba0c9b 100644
--- a/net/netfilter/nft_set_bitmap.c
+++ b/net/netfilter/nft_set_bitmap.c
@@ -10,7 +10,7 @@
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
-#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
struct nft_bitmap_elem {
struct list_head head;
@@ -259,8 +259,8 @@ static u64 nft_bitmap_privsize(const struct nlattr * const nla[],
}
static int nft_bitmap_init(const struct nft_set *set,
- const struct nft_set_desc *desc,
- const struct nlattr * const nla[])
+ const struct nft_set_desc *desc,
+ const struct nlattr * const nla[])
{
struct nft_bitmap *priv = nft_set_priv(set);
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index 6e8d20c03e3d..d350a7cd3af0 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -16,7 +16,7 @@
#include <linux/rhashtable.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
-#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
/* We target a hash table size of 4, element hint is 75% of final size */
#define NFT_RHASH_ELEMENT_HINT 3
@@ -234,6 +234,24 @@ static void nft_rhash_remove(const struct net *net,
rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params);
}
+static bool nft_rhash_delete(const struct nft_set *set,
+ const u32 *key)
+{
+ struct nft_rhash *priv = nft_set_priv(set);
+ struct nft_rhash_cmp_arg arg = {
+ .genmask = NFT_GENMASK_ANY,
+ .set = set,
+ .key = key,
+ };
+ struct nft_rhash_elem *he;
+
+ he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params);
+ if (he == NULL)
+ return false;
+
+ return rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params) == 0;
+}
+
static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set,
struct nft_set_iter *iter)
{
@@ -627,7 +645,7 @@ static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features,
}
static bool nft_hash_fast_estimate(const struct nft_set_desc *desc, u32 features,
- struct nft_set_estimate *est)
+ struct nft_set_estimate *est)
{
if (!desc->size)
return false;
@@ -662,6 +680,7 @@ struct nft_set_type nft_set_rhash_type __read_mostly = {
.remove = nft_rhash_remove,
.lookup = nft_rhash_lookup,
.update = nft_rhash_update,
+ .delete = nft_rhash_delete,
.walk = nft_rhash_walk,
.get = nft_rhash_get,
},
diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c
new file mode 100644
index 000000000000..f0cb1e13af50
--- /dev/null
+++ b/net/netfilter/nft_set_pipapo.c
@@ -0,0 +1,2102 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/* PIPAPO: PIle PAcket POlicies: set for arbitrary concatenations of ranges
+ *
+ * Copyright (c) 2019-2020 Red Hat GmbH
+ *
+ * Author: Stefano Brivio <sbrivio@redhat.com>
+ */
+
+/**
+ * DOC: Theory of Operation
+ *
+ *
+ * Problem
+ * -------
+ *
+ * Match packet bytes against entries composed of ranged or non-ranged packet
+ * field specifiers, mapping them to arbitrary references. For example:
+ *
+ * ::
+ *
+ * --- fields --->
+ * | [net],[port],[net]... => [reference]
+ * entries [net],[port],[net]... => [reference]
+ * | [net],[port],[net]... => [reference]
+ * V ...
+ *
+ * where [net] fields can be IP ranges or netmasks, and [port] fields are port
+ * ranges. Arbitrary packet fields can be matched.
+ *
+ *
+ * Algorithm Overview
+ * ------------------
+ *
+ * This algorithm is loosely inspired by [Ligatti 2010], and fundamentally
+ * relies on the consideration that every contiguous range in a space of b bits
+ * can be converted into b * 2 netmasks, from Theorem 3 in [Rottenstreich 2010],
+ * as also illustrated in Section 9 of [Kogan 2014].
+ *
+ * Classification against a number of entries, that require matching given bits
+ * of a packet field, is performed by grouping those bits in sets of arbitrary
+ * size, and classifying packet bits one group at a time.
+ *
+ * Example:
+ * to match the source port (16 bits) of a packet, we can divide those 16 bits
+ * in 4 groups of 4 bits each. Given the entry:
+ * 0000 0001 0101 1001
+ * and a packet with source port:
+ * 0000 0001 1010 1001
+ * first and second groups match, but the third doesn't. We conclude that the
+ * packet doesn't match the given entry.
+ *
+ * Translate the set to a sequence of lookup tables, one per field. Each table
+ * has two dimensions: bit groups to be matched for a single packet field, and
+ * all the possible values of said groups (buckets). Input entries are
+ * represented as one or more rules, depending on the number of composing
+ * netmasks for the given field specifier, and a group match is indicated as a
+ * set bit, with number corresponding to the rule index, in all the buckets
+ * whose value matches the entry for a given group.
+ *
+ * Rules are mapped between fields through an array of x, n pairs, with each
+ * item mapping a matched rule to one or more rules. The position of the pair in
+ * the array indicates the matched rule to be mapped to the next field, x
+ * indicates the first rule index in the next field, and n the amount of
+ * next-field rules the current rule maps to.
+ *
+ * The mapping array for the last field maps to the desired references.
+ *
+ * To match, we perform table lookups using the values of grouped packet bits,
+ * and use a sequence of bitwise operations to progressively evaluate rule
+ * matching.
+ *
+ * A stand-alone, reference implementation, also including notes about possible
+ * future optimisations, is available at:
+ * https://pipapo.lameexcu.se/
+ *
+ * Insertion
+ * ---------
+ *
+ * - For each packet field:
+ *
+ * - divide the b packet bits we want to classify into groups of size t,
+ * obtaining ceil(b / t) groups
+ *
+ * Example: match on destination IP address, with t = 4: 32 bits, 8 groups
+ * of 4 bits each
+ *
+ * - allocate a lookup table with one column ("bucket") for each possible
+ * value of a group, and with one row for each group
+ *
+ * Example: 8 groups, 2^4 buckets:
+ *
+ * ::
+ *
+ * bucket
+ * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ * 0
+ * 1
+ * 2
+ * 3
+ * 4
+ * 5
+ * 6
+ * 7
+ *
+ * - map the bits we want to classify for the current field, for a given
+ * entry, to a single rule for non-ranged and netmask set items, and to one
+ * or multiple rules for ranges. Ranges are expanded to composing netmasks
+ * by pipapo_expand().
+ *
+ * Example: 2 entries, 10.0.0.5:1024 and 192.168.1.0-192.168.2.1:2048
+ * - rule #0: 10.0.0.5
+ * - rule #1: 192.168.1.0/24
+ * - rule #2: 192.168.2.0/31
+ *
+ * - insert references to the rules in the lookup table, selecting buckets
+ * according to bit values of a rule in the given group. This is done by
+ * pipapo_insert().
+ *
+ * Example: given:
+ * - rule #0: 10.0.0.5 mapping to buckets
+ * < 0 10 0 0 0 0 0 5 >
+ * - rule #1: 192.168.1.0/24 mapping to buckets
+ * < 12 0 10 8 0 1 < 0..15 > < 0..15 > >
+ * - rule #2: 192.168.2.0/31 mapping to buckets
+ * < 12 0 10 8 0 2 0 < 0..1 > >
+ *
+ * these bits are set in the lookup table:
+ *
+ * ::
+ *
+ * bucket
+ * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ * 0 0 1,2
+ * 1 1,2 0
+ * 2 0 1,2
+ * 3 0 1,2
+ * 4 0,1,2
+ * 5 0 1 2
+ * 6 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
+ * 7 1,2 1,2 1 1 1 0,1 1 1 1 1 1 1 1 1 1 1
+ *
+ * - if this is not the last field in the set, fill a mapping array that maps
+ * rules from the lookup table to rules belonging to the same entry in
+ * the next lookup table, done by pipapo_map().
+ *
+ * Note that as rules map to contiguous ranges of rules, given how netmask
+ * expansion and insertion is performed, &union nft_pipapo_map_bucket stores
+ * this information as pairs of first rule index, rule count.
+ *
+ * Example: 2 entries, 10.0.0.5:1024 and 192.168.1.0-192.168.2.1:2048,
+ * given lookup table #0 for field 0 (see example above):
+ *
+ * ::
+ *
+ * bucket
+ * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ * 0 0 1,2
+ * 1 1,2 0
+ * 2 0 1,2
+ * 3 0 1,2
+ * 4 0,1,2
+ * 5 0 1 2
+ * 6 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
+ * 7 1,2 1,2 1 1 1 0,1 1 1 1 1 1 1 1 1 1 1
+ *
+ * and lookup table #1 for field 1 with:
+ * - rule #0: 1024 mapping to buckets
+ * < 0 0 4 0 >
+ * - rule #1: 2048 mapping to buckets
+ * < 0 0 5 0 >
+ *
+ * ::
+ *
+ * bucket
+ * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ * 0 0,1
+ * 1 0,1
+ * 2 0 1
+ * 3 0,1
+ *
+ * we need to map rules for 10.0.0.5 in lookup table #0 (rule #0) to 1024
+ * in lookup table #1 (rule #0) and rules for 192.168.1.0-192.168.2.1
+ * (rules #1, #2) to 2048 in lookup table #2 (rule #1):
+ *
+ * ::
+ *
+ * rule indices in current field: 0 1 2
+ * map to rules in next field: 0 1 1
+ *
+ * - if this is the last field in the set, fill a mapping array that maps
+ * rules from the last lookup table to element pointers, also done by
+ * pipapo_map().
+ *
+ * Note that, in this implementation, we have two elements (start, end) for
+ * each entry. The pointer to the end element is stored in this array, and
+ * the pointer to the start element is linked from it.
+ *
+ * Example: entry 10.0.0.5:1024 has a corresponding &struct nft_pipapo_elem
+ * pointer, 0x66, and element for 192.168.1.0-192.168.2.1:2048 is at 0x42.
+ * From the rules of lookup table #1 as mapped above:
+ *
+ * ::
+ *
+ * rule indices in last field: 0 1
+ * map to elements: 0x42 0x66
+ *
+ *
+ * Matching
+ * --------
+ *
+ * We use a result bitmap, with the size of a single lookup table bucket, to
+ * represent the matching state that applies at every algorithm step. This is
+ * done by pipapo_lookup().
+ *
+ * - For each packet field:
+ *
+ * - start with an all-ones result bitmap (res_map in pipapo_lookup())
+ *
+ * - perform a lookup into the table corresponding to the current field,
+ * for each group, and at every group, AND the current result bitmap with
+ * the value from the lookup table bucket
+ *
+ * ::
+ *
+ * Example: 192.168.1.5 < 12 0 10 8 0 1 0 5 >, with lookup table from
+ * insertion examples.
+ * Lookup table buckets are at least 3 bits wide, we'll assume 8 bits for
+ * convenience in this example. Initial result bitmap is 0xff, the steps
+ * below show the value of the result bitmap after each group is processed:
+ *
+ * bucket
+ * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ * 0 0 1,2
+ * result bitmap is now: 0xff & 0x6 [bucket 12] = 0x6
+ *
+ * 1 1,2 0
+ * result bitmap is now: 0x6 & 0x6 [bucket 0] = 0x6
+ *
+ * 2 0 1,2
+ * result bitmap is now: 0x6 & 0x6 [bucket 10] = 0x6
+ *
+ * 3 0 1,2
+ * result bitmap is now: 0x6 & 0x6 [bucket 8] = 0x6
+ *
+ * 4 0,1,2
+ * result bitmap is now: 0x6 & 0x7 [bucket 0] = 0x6
+ *
+ * 5 0 1 2
+ * result bitmap is now: 0x6 & 0x2 [bucket 1] = 0x2
+ *
+ * 6 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
+ * result bitmap is now: 0x2 & 0x7 [bucket 0] = 0x2
+ *
+ * 7 1,2 1,2 1 1 1 0,1 1 1 1 1 1 1 1 1 1 1
+ * final result bitmap for this field is: 0x2 & 0x3 [bucket 5] = 0x2
+ *
+ * - at the next field, start with a new, all-zeroes result bitmap. For each
+ * bit set in the previous result bitmap, fill the new result bitmap
+ * (fill_map in pipapo_lookup()) with the rule indices from the
+ * corresponding buckets of the mapping field for this field, done by
+ * pipapo_refill()
+ *
+ * Example: with mapping table from insertion examples, with the current
+ * result bitmap from the previous example, 0x02:
+ *
+ * ::
+ *
+ * rule indices in current field: 0 1 2
+ * map to rules in next field: 0 1 1
+ *
+ * the new result bitmap will be 0x02: rule 1 was set, and rule 1 will be
+ * set.
+ *
+ * We can now extend this example to cover the second iteration of the step
+ * above (lookup and AND bitmap): assuming the port field is
+ * 2048 < 0 0 5 0 >, with starting result bitmap 0x2, and lookup table
+ * for "port" field from pre-computation example:
+ *
+ * ::
+ *
+ * bucket
+ * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ * 0 0,1
+ * 1 0,1
+ * 2 0 1
+ * 3 0,1
+ *
+ * operations are: 0x2 & 0x3 [bucket 0] & 0x3 [bucket 0] & 0x2 [bucket 5]
+ * & 0x3 [bucket 0], resulting bitmap is 0x2.
+ *
+ * - if this is the last field in the set, look up the value from the mapping
+ * array corresponding to the final result bitmap
+ *
+ * Example: 0x2 resulting bitmap from 192.168.1.5:2048, mapping array for
+ * last field from insertion example:
+ *
+ * ::
+ *
+ * rule indices in last field: 0 1
+ * map to elements: 0x42 0x66
+ *
+ * the matching element is at 0x42.
+ *
+ *
+ * References
+ * ----------
+ *
+ * [Ligatti 2010]
+ * A Packet-classification Algorithm for Arbitrary Bitmask Rules, with
+ * Automatic Time-space Tradeoffs
+ * Jay Ligatti, Josh Kuhn, and Chris Gage.
+ * Proceedings of the IEEE International Conference on Computer
+ * Communication Networks (ICCCN), August 2010.
+ * http://www.cse.usf.edu/~ligatti/papers/grouper-conf.pdf
+ *
+ * [Rottenstreich 2010]
+ * Worst-Case TCAM Rule Expansion
+ * Ori Rottenstreich and Isaac Keslassy.
+ * 2010 Proceedings IEEE INFOCOM, San Diego, CA, 2010.
+ * http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.212.4592&rep=rep1&type=pdf
+ *
+ * [Kogan 2014]
+ * SAX-PAC (Scalable And eXpressive PAcket Classification)
+ * Kirill Kogan, Sergey Nikolenko, Ori Rottenstreich, William Culhane,
+ * and Patrick Eugster.
+ * Proceedings of the 2014 ACM conference on SIGCOMM, August 2014.
+ * http://www.sigcomm.org/sites/default/files/ccr/papers/2014/August/2619239-2626294.pdf
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/log2.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <uapi/linux/netfilter/nf_tables.h>
+#include <net/ipv6.h> /* For the maximum length of a field */
+#include <linux/bitmap.h>
+#include <linux/bitops.h>
+
+/* Count of concatenated fields depends on count of 32-bit nftables registers */
+#define NFT_PIPAPO_MAX_FIELDS NFT_REG32_COUNT
+
+/* Largest supported field size */
+#define NFT_PIPAPO_MAX_BYTES (sizeof(struct in6_addr))
+#define NFT_PIPAPO_MAX_BITS (NFT_PIPAPO_MAX_BYTES * BITS_PER_BYTE)
+
+/* Number of bits to be grouped together in lookup table buckets, arbitrary */
+#define NFT_PIPAPO_GROUP_BITS 4
+#define NFT_PIPAPO_GROUPS_PER_BYTE (BITS_PER_BYTE / NFT_PIPAPO_GROUP_BITS)
+
+/* Fields are padded to 32 bits in input registers */
+#define NFT_PIPAPO_GROUPS_PADDED_SIZE(x) \
+ (round_up((x) / NFT_PIPAPO_GROUPS_PER_BYTE, sizeof(u32)))
+#define NFT_PIPAPO_GROUPS_PADDING(x) \
+ (NFT_PIPAPO_GROUPS_PADDED_SIZE((x)) - (x) / NFT_PIPAPO_GROUPS_PER_BYTE)
+
+/* Number of buckets, given by 2 ^ n, with n grouped bits */
+#define NFT_PIPAPO_BUCKETS (1 << NFT_PIPAPO_GROUP_BITS)
+
+/* Each n-bit range maps to up to n * 2 rules */
+#define NFT_PIPAPO_MAP_NBITS (const_ilog2(NFT_PIPAPO_MAX_BITS * 2))
+
+/* Use the rest of mapping table buckets for rule indices, but it makes no sense
+ * to exceed 32 bits
+ */
+#if BITS_PER_LONG == 64
+#define NFT_PIPAPO_MAP_TOBITS 32
+#else
+#define NFT_PIPAPO_MAP_TOBITS (BITS_PER_LONG - NFT_PIPAPO_MAP_NBITS)
+#endif
+
+/* ...which gives us the highest allowed index for a rule */
+#define NFT_PIPAPO_RULE0_MAX ((1UL << (NFT_PIPAPO_MAP_TOBITS - 1)) \
+ - (1UL << NFT_PIPAPO_MAP_NBITS))
+
+#define nft_pipapo_for_each_field(field, index, match) \
+ for ((field) = (match)->f, (index) = 0; \
+ (index) < (match)->field_count; \
+ (index)++, (field)++)
+
+/**
+ * union nft_pipapo_map_bucket - Bucket of mapping table
+ * @to: First rule number (in next field) this rule maps to
+ * @n: Number of rules (in next field) this rule maps to
+ * @e: If there's no next field, pointer to element this rule maps to
+ */
+union nft_pipapo_map_bucket {
+ struct {
+#if BITS_PER_LONG == 64
+ static_assert(NFT_PIPAPO_MAP_TOBITS <= 32);
+ u32 to;
+
+ static_assert(NFT_PIPAPO_MAP_NBITS <= 32);
+ u32 n;
+#else
+ unsigned long to:NFT_PIPAPO_MAP_TOBITS;
+ unsigned long n:NFT_PIPAPO_MAP_NBITS;
+#endif
+ };
+ struct nft_pipapo_elem *e;
+};
+
+/**
+ * struct nft_pipapo_field - Lookup, mapping tables and related data for a field
+ * @groups: Amount of 4-bit groups
+ * @rules: Number of inserted rules
+ * @bsize: Size of each bucket in lookup table, in longs
+ * @lt: Lookup table: 'groups' rows of NFT_PIPAPO_BUCKETS buckets
+ * @mt: Mapping table: one bucket per rule
+ */
+struct nft_pipapo_field {
+ int groups;
+ unsigned long rules;
+ size_t bsize;
+ unsigned long *lt;
+ union nft_pipapo_map_bucket *mt;
+};
+
+/**
+ * struct nft_pipapo_match - Data used for lookup and matching
+ * @field_count Amount of fields in set
+ * @scratch: Preallocated per-CPU maps for partial matching results
+ * @bsize_max: Maximum lookup table bucket size of all fields, in longs
+ * @rcu Matching data is swapped on commits
+ * @f: Fields, with lookup and mapping tables
+ */
+struct nft_pipapo_match {
+ int field_count;
+ unsigned long * __percpu *scratch;
+ size_t bsize_max;
+ struct rcu_head rcu;
+ struct nft_pipapo_field f[0];
+};
+
+/* Current working bitmap index, toggled between field matches */
+static DEFINE_PER_CPU(bool, nft_pipapo_scratch_index);
+
+/**
+ * struct nft_pipapo - Representation of a set
+ * @match: Currently in-use matching data
+ * @clone: Copy where pending insertions and deletions are kept
+ * @groups: Total amount of 4-bit groups for fields in this set
+ * @width: Total bytes to be matched for one packet, including padding
+ * @dirty: Working copy has pending insertions or deletions
+ * @last_gc: Timestamp of last garbage collection run, jiffies
+ */
+struct nft_pipapo {
+ struct nft_pipapo_match __rcu *match;
+ struct nft_pipapo_match *clone;
+ int groups;
+ int width;
+ bool dirty;
+ unsigned long last_gc;
+};
+
+struct nft_pipapo_elem;
+
+/**
+ * struct nft_pipapo_elem - API-facing representation of single set element
+ * @ext: nftables API extensions
+ */
+struct nft_pipapo_elem {
+ struct nft_set_ext ext;
+};
+
+/**
+ * pipapo_refill() - For each set bit, set bits from selected mapping table item
+ * @map: Bitmap to be scanned for set bits
+ * @len: Length of bitmap in longs
+ * @rules: Number of rules in field
+ * @dst: Destination bitmap
+ * @mt: Mapping table containing bit set specifiers
+ * @match_only: Find a single bit and return, don't fill
+ *
+ * Iteration over set bits with __builtin_ctzl(): Daniel Lemire, public domain.
+ *
+ * For each bit set in map, select the bucket from mapping table with index
+ * corresponding to the position of the bit set. Use start bit and amount of
+ * bits specified in bucket to fill region in dst.
+ *
+ * Return: -1 on no match, bit position on 'match_only', 0 otherwise.
+ */
+static int pipapo_refill(unsigned long *map, int len, int rules,
+ unsigned long *dst, union nft_pipapo_map_bucket *mt,
+ bool match_only)
+{
+ unsigned long bitset;
+ int k, ret = -1;
+
+ for (k = 0; k < len; k++) {
+ bitset = map[k];
+ while (bitset) {
+ unsigned long t = bitset & -bitset;
+ int r = __builtin_ctzl(bitset);
+ int i = k * BITS_PER_LONG + r;
+
+ if (unlikely(i >= rules)) {
+ map[k] = 0;
+ return -1;
+ }
+
+ if (unlikely(match_only)) {
+ bitmap_clear(map, i, 1);
+ return i;
+ }
+
+ ret = 0;
+
+ bitmap_set(dst, mt[i].to, mt[i].n);
+
+ bitset ^= t;
+ }
+ map[k] = 0;
+ }
+
+ return ret;
+}
+
+/**
+ * nft_pipapo_lookup() - Lookup function
+ * @net: Network namespace
+ * @set: nftables API set representation
+ * @elem: nftables API element representation containing key data
+ * @ext: nftables API extension pointer, filled with matching reference
+ *
+ * For more details, see DOC: Theory of Operation.
+ *
+ * Return: true on match, false otherwise.
+ */
+static bool nft_pipapo_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key, const struct nft_set_ext **ext)
+{
+ struct nft_pipapo *priv = nft_set_priv(set);
+ unsigned long *res_map, *fill_map;
+ u8 genmask = nft_genmask_cur(net);
+ const u8 *rp = (const u8 *)key;
+ struct nft_pipapo_match *m;
+ struct nft_pipapo_field *f;
+ bool map_index;
+ int i;
+
+ local_bh_disable();
+
+ map_index = raw_cpu_read(nft_pipapo_scratch_index);
+
+ m = rcu_dereference(priv->match);
+
+ if (unlikely(!m || !*raw_cpu_ptr(m->scratch)))
+ goto out;
+
+ res_map = *raw_cpu_ptr(m->scratch) + (map_index ? m->bsize_max : 0);
+ fill_map = *raw_cpu_ptr(m->scratch) + (map_index ? 0 : m->bsize_max);
+
+ memset(res_map, 0xff, m->bsize_max * sizeof(*res_map));
+
+ nft_pipapo_for_each_field(f, i, m) {
+ bool last = i == m->field_count - 1;
+ unsigned long *lt = f->lt;
+ int b, group;
+
+ /* For each 4-bit group: select lookup table bucket depending on
+ * packet bytes value, then AND bucket value
+ */
+ for (group = 0; group < f->groups; group += 2) {
+ u8 v;
+
+ v = *rp >> 4;
+ __bitmap_and(res_map, res_map, lt + v * f->bsize,
+ f->bsize * BITS_PER_LONG);
+ lt += f->bsize * NFT_PIPAPO_BUCKETS;
+
+ v = *rp & 0x0f;
+ rp++;
+ __bitmap_and(res_map, res_map, lt + v * f->bsize,
+ f->bsize * BITS_PER_LONG);
+ lt += f->bsize * NFT_PIPAPO_BUCKETS;
+ }
+
+ /* Now populate the bitmap for the next field, unless this is
+ * the last field, in which case return the matched 'ext'
+ * pointer if any.
+ *
+ * Now res_map contains the matching bitmap, and fill_map is the
+ * bitmap for the next field.
+ */
+next_match:
+ b = pipapo_refill(res_map, f->bsize, f->rules, fill_map, f->mt,
+ last);
+ if (b < 0) {
+ raw_cpu_write(nft_pipapo_scratch_index, map_index);
+ local_bh_enable();
+
+ return false;
+ }
+
+ if (last) {
+ *ext = &f->mt[b].e->ext;
+ if (unlikely(nft_set_elem_expired(*ext) ||
+ !nft_set_elem_active(*ext, genmask)))
+ goto next_match;
+
+ /* Last field: we're just returning the key without
+ * filling the initial bitmap for the next field, so the
+ * current inactive bitmap is clean and can be reused as
+ * *next* bitmap (not initial) for the next packet.
+ */
+ raw_cpu_write(nft_pipapo_scratch_index, map_index);
+ local_bh_enable();
+
+ return true;
+ }
+
+ /* Swap bitmap indices: res_map is the initial bitmap for the
+ * next field, and fill_map is guaranteed to be all-zeroes at
+ * this point.
+ */
+ map_index = !map_index;
+ swap(res_map, fill_map);
+
+ rp += NFT_PIPAPO_GROUPS_PADDING(f->groups);
+ }
+
+out:
+ local_bh_enable();
+ return false;
+}
+
+/**
+ * pipapo_get() - Get matching element reference given key data
+ * @net: Network namespace
+ * @set: nftables API set representation
+ * @data: Key data to be matched against existing elements
+ * @genmask: If set, check that element is active in given genmask
+ *
+ * This is essentially the same as the lookup function, except that it matches
+ * key data against the uncommitted copy and doesn't use preallocated maps for
+ * bitmap results.
+ *
+ * Return: pointer to &struct nft_pipapo_elem on match, error pointer otherwise.
+ */
+static struct nft_pipapo_elem *pipapo_get(const struct net *net,
+ const struct nft_set *set,
+ const u8 *data, u8 genmask)
+{
+ struct nft_pipapo_elem *ret = ERR_PTR(-ENOENT);
+ struct nft_pipapo *priv = nft_set_priv(set);
+ struct nft_pipapo_match *m = priv->clone;
+ unsigned long *res_map, *fill_map = NULL;
+ struct nft_pipapo_field *f;
+ int i;
+
+ res_map = kmalloc_array(m->bsize_max, sizeof(*res_map), GFP_ATOMIC);
+ if (!res_map) {
+ ret = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+
+ fill_map = kcalloc(m->bsize_max, sizeof(*res_map), GFP_ATOMIC);
+ if (!fill_map) {
+ ret = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+
+ memset(res_map, 0xff, m->bsize_max * sizeof(*res_map));
+
+ nft_pipapo_for_each_field(f, i, m) {
+ bool last = i == m->field_count - 1;
+ unsigned long *lt = f->lt;
+ int b, group;
+
+ /* For each 4-bit group: select lookup table bucket depending on
+ * packet bytes value, then AND bucket value
+ */
+ for (group = 0; group < f->groups; group++) {
+ u8 v;
+
+ if (group % 2) {
+ v = *data & 0x0f;
+ data++;
+ } else {
+ v = *data >> 4;
+ }
+ __bitmap_and(res_map, res_map, lt + v * f->bsize,
+ f->bsize * BITS_PER_LONG);
+
+ lt += f->bsize * NFT_PIPAPO_BUCKETS;
+ }
+
+ /* Now populate the bitmap for the next field, unless this is
+ * the last field, in which case return the matched 'ext'
+ * pointer if any.
+ *
+ * Now res_map contains the matching bitmap, and fill_map is the
+ * bitmap for the next field.
+ */
+next_match:
+ b = pipapo_refill(res_map, f->bsize, f->rules, fill_map, f->mt,
+ last);
+ if (b < 0)
+ goto out;
+
+ if (last) {
+ if (nft_set_elem_expired(&f->mt[b].e->ext) ||
+ (genmask &&
+ !nft_set_elem_active(&f->mt[b].e->ext, genmask)))
+ goto next_match;
+
+ ret = f->mt[b].e;
+ goto out;
+ }
+
+ data += NFT_PIPAPO_GROUPS_PADDING(f->groups);
+
+ /* Swap bitmap indices: fill_map will be the initial bitmap for
+ * the next field (i.e. the new res_map), and res_map is
+ * guaranteed to be all-zeroes at this point, ready to be filled
+ * according to the next mapping table.
+ */
+ swap(res_map, fill_map);
+ }
+
+out:
+ kfree(fill_map);
+ kfree(res_map);
+ return ret;
+}
+
+/**
+ * nft_pipapo_get() - Get matching element reference given key data
+ * @net: Network namespace
+ * @set: nftables API set representation
+ * @elem: nftables API element representation containing key data
+ * @flags: Unused
+ */
+void *nft_pipapo_get(const struct net *net, const struct nft_set *set,
+ const struct nft_set_elem *elem, unsigned int flags)
+{
+ return pipapo_get(net, set, (const u8 *)elem->key.val.data,
+ nft_genmask_cur(net));
+}
+
+/**
+ * pipapo_resize() - Resize lookup or mapping table, or both
+ * @f: Field containing lookup and mapping tables
+ * @old_rules: Previous amount of rules in field
+ * @rules: New amount of rules
+ *
+ * Increase, decrease or maintain tables size depending on new amount of rules,
+ * and copy data over. In case the new size is smaller, throw away data for
+ * highest-numbered rules.
+ *
+ * Return: 0 on success, -ENOMEM on allocation failure.
+ */
+static int pipapo_resize(struct nft_pipapo_field *f, int old_rules, int rules)
+{
+ long *new_lt = NULL, *new_p, *old_lt = f->lt, *old_p;
+ union nft_pipapo_map_bucket *new_mt, *old_mt = f->mt;
+ size_t new_bucket_size, copy;
+ int group, bucket;
+
+ new_bucket_size = DIV_ROUND_UP(rules, BITS_PER_LONG);
+
+ if (new_bucket_size == f->bsize)
+ goto mt;
+
+ if (new_bucket_size > f->bsize)
+ copy = f->bsize;
+ else
+ copy = new_bucket_size;
+
+ new_lt = kvzalloc(f->groups * NFT_PIPAPO_BUCKETS * new_bucket_size *
+ sizeof(*new_lt), GFP_KERNEL);
+ if (!new_lt)
+ return -ENOMEM;
+
+ new_p = new_lt;
+ old_p = old_lt;
+ for (group = 0; group < f->groups; group++) {
+ for (bucket = 0; bucket < NFT_PIPAPO_BUCKETS; bucket++) {
+ memcpy(new_p, old_p, copy * sizeof(*new_p));
+ new_p += copy;
+ old_p += copy;
+
+ if (new_bucket_size > f->bsize)
+ new_p += new_bucket_size - f->bsize;
+ else
+ old_p += f->bsize - new_bucket_size;
+ }
+ }
+
+mt:
+ new_mt = kvmalloc(rules * sizeof(*new_mt), GFP_KERNEL);
+ if (!new_mt) {
+ kvfree(new_lt);
+ return -ENOMEM;
+ }
+
+ memcpy(new_mt, f->mt, min(old_rules, rules) * sizeof(*new_mt));
+ if (rules > old_rules) {
+ memset(new_mt + old_rules, 0,
+ (rules - old_rules) * sizeof(*new_mt));
+ }
+
+ if (new_lt) {
+ f->bsize = new_bucket_size;
+ f->lt = new_lt;
+ kvfree(old_lt);
+ }
+
+ f->mt = new_mt;
+ kvfree(old_mt);
+
+ return 0;
+}
+
+/**
+ * pipapo_bucket_set() - Set rule bit in bucket given group and group value
+ * @f: Field containing lookup table
+ * @rule: Rule index
+ * @group: Group index
+ * @v: Value of bit group
+ */
+static void pipapo_bucket_set(struct nft_pipapo_field *f, int rule, int group,
+ int v)
+{
+ unsigned long *pos;
+
+ pos = f->lt + f->bsize * NFT_PIPAPO_BUCKETS * group;
+ pos += f->bsize * v;
+
+ __set_bit(rule, pos);
+}
+
+/**
+ * pipapo_insert() - Insert new rule in field given input key and mask length
+ * @f: Field containing lookup table
+ * @k: Input key for classification, without nftables padding
+ * @mask_bits: Length of mask; matches field length for non-ranged entry
+ *
+ * Insert a new rule reference in lookup buckets corresponding to k and
+ * mask_bits.
+ *
+ * Return: 1 on success (one rule inserted), negative error code on failure.
+ */
+static int pipapo_insert(struct nft_pipapo_field *f, const uint8_t *k,
+ int mask_bits)
+{
+ int rule = f->rules++, group, ret;
+
+ ret = pipapo_resize(f, f->rules - 1, f->rules);
+ if (ret)
+ return ret;
+
+ for (group = 0; group < f->groups; group++) {
+ int i, v;
+ u8 mask;
+
+ if (group % 2)
+ v = k[group / 2] & 0x0f;
+ else
+ v = k[group / 2] >> 4;
+
+ if (mask_bits >= (group + 1) * 4) {
+ /* Not masked */
+ pipapo_bucket_set(f, rule, group, v);
+ } else if (mask_bits <= group * 4) {
+ /* Completely masked */
+ for (i = 0; i < NFT_PIPAPO_BUCKETS; i++)
+ pipapo_bucket_set(f, rule, group, i);
+ } else {
+ /* The mask limit falls on this group */
+ mask = 0x0f >> (mask_bits - group * 4);
+ for (i = 0; i < NFT_PIPAPO_BUCKETS; i++) {
+ if ((i & ~mask) == (v & ~mask))
+ pipapo_bucket_set(f, rule, group, i);
+ }
+ }
+ }
+
+ return 1;
+}
+
+/**
+ * pipapo_step_diff() - Check if setting @step bit in netmask would change it
+ * @base: Mask we are expanding
+ * @step: Step bit for given expansion step
+ * @len: Total length of mask space (set and unset bits), bytes
+ *
+ * Convenience function for mask expansion.
+ *
+ * Return: true if step bit changes mask (i.e. isn't set), false otherwise.
+ */
+static bool pipapo_step_diff(u8 *base, int step, int len)
+{
+ /* Network order, byte-addressed */
+#ifdef __BIG_ENDIAN__
+ return !(BIT(step % BITS_PER_BYTE) & base[step / BITS_PER_BYTE]);
+#else
+ return !(BIT(step % BITS_PER_BYTE) &
+ base[len - 1 - step / BITS_PER_BYTE]);
+#endif
+}
+
+/**
+ * pipapo_step_after_end() - Check if mask exceeds range end with given step
+ * @base: Mask we are expanding
+ * @end: End of range
+ * @step: Step bit for given expansion step, highest bit to be set
+ * @len: Total length of mask space (set and unset bits), bytes
+ *
+ * Convenience function for mask expansion.
+ *
+ * Return: true if mask exceeds range setting step bits, false otherwise.
+ */
+static bool pipapo_step_after_end(const u8 *base, const u8 *end, int step,
+ int len)
+{
+ u8 tmp[NFT_PIPAPO_MAX_BYTES];
+ int i;
+
+ memcpy(tmp, base, len);
+
+ /* Network order, byte-addressed */
+ for (i = 0; i <= step; i++)
+#ifdef __BIG_ENDIAN__
+ tmp[i / BITS_PER_BYTE] |= BIT(i % BITS_PER_BYTE);
+#else
+ tmp[len - 1 - i / BITS_PER_BYTE] |= BIT(i % BITS_PER_BYTE);
+#endif
+
+ return memcmp(tmp, end, len) > 0;
+}
+
+/**
+ * pipapo_base_sum() - Sum step bit to given len-sized netmask base with carry
+ * @base: Netmask base
+ * @step: Step bit to sum
+ * @len: Netmask length, bytes
+ */
+static void pipapo_base_sum(u8 *base, int step, int len)
+{
+ bool carry = false;
+ int i;
+
+ /* Network order, byte-addressed */
+#ifdef __BIG_ENDIAN__
+ for (i = step / BITS_PER_BYTE; i < len; i++) {
+#else
+ for (i = len - 1 - step / BITS_PER_BYTE; i >= 0; i--) {
+#endif
+ if (carry)
+ base[i]++;
+ else
+ base[i] += 1 << (step % BITS_PER_BYTE);
+
+ if (base[i])
+ break;
+
+ carry = true;
+ }
+}
+
+/**
+ * pipapo_expand() - Expand to composing netmasks, insert into lookup table
+ * @f: Field containing lookup table
+ * @start: Start of range
+ * @end: End of range
+ * @len: Length of value in bits
+ *
+ * Expand range to composing netmasks and insert corresponding rule references
+ * in lookup buckets.
+ *
+ * Return: number of inserted rules on success, negative error code on failure.
+ */
+static int pipapo_expand(struct nft_pipapo_field *f,
+ const u8 *start, const u8 *end, int len)
+{
+ int step, masks = 0, bytes = DIV_ROUND_UP(len, BITS_PER_BYTE);
+ u8 base[NFT_PIPAPO_MAX_BYTES];
+
+ memcpy(base, start, bytes);
+ while (memcmp(base, end, bytes) <= 0) {
+ int err;
+
+ step = 0;
+ while (pipapo_step_diff(base, step, bytes)) {
+ if (pipapo_step_after_end(base, end, step, bytes))
+ break;
+
+ step++;
+ if (step >= len) {
+ if (!masks) {
+ pipapo_insert(f, base, 0);
+ masks = 1;
+ }
+ goto out;
+ }
+ }
+
+ err = pipapo_insert(f, base, len - step);
+
+ if (err < 0)
+ return err;
+
+ masks++;
+ pipapo_base_sum(base, step, bytes);
+ }
+out:
+ return masks;
+}
+
+/**
+ * pipapo_map() - Insert rules in mapping tables, mapping them between fields
+ * @m: Matching data, including mapping table
+ * @map: Table of rule maps: array of first rule and amount of rules
+ * in next field a given rule maps to, for each field
+ * @ext: For last field, nft_set_ext pointer matching rules map to
+ */
+static void pipapo_map(struct nft_pipapo_match *m,
+ union nft_pipapo_map_bucket map[NFT_PIPAPO_MAX_FIELDS],
+ struct nft_pipapo_elem *e)
+{
+ struct nft_pipapo_field *f;
+ int i, j;
+
+ for (i = 0, f = m->f; i < m->field_count - 1; i++, f++) {
+ for (j = 0; j < map[i].n; j++) {
+ f->mt[map[i].to + j].to = map[i + 1].to;
+ f->mt[map[i].to + j].n = map[i + 1].n;
+ }
+ }
+
+ /* Last field: map to ext instead of mapping to next field */
+ for (j = 0; j < map[i].n; j++)
+ f->mt[map[i].to + j].e = e;
+}
+
+/**
+ * pipapo_realloc_scratch() - Reallocate scratch maps for partial match results
+ * @clone: Copy of matching data with pending insertions and deletions
+ * @bsize_max Maximum bucket size, scratch maps cover two buckets
+ *
+ * Return: 0 on success, -ENOMEM on failure.
+ */
+static int pipapo_realloc_scratch(struct nft_pipapo_match *clone,
+ unsigned long bsize_max)
+{
+ int i;
+
+ for_each_possible_cpu(i) {
+ unsigned long *scratch;
+
+ scratch = kzalloc_node(bsize_max * sizeof(*scratch) * 2,
+ GFP_KERNEL, cpu_to_node(i));
+ if (!scratch) {
+ /* On failure, there's no need to undo previous
+ * allocations: this means that some scratch maps have
+ * a bigger allocated size now (this is only called on
+ * insertion), but the extra space won't be used by any
+ * CPU as new elements are not inserted and m->bsize_max
+ * is not updated.
+ */
+ return -ENOMEM;
+ }
+
+ kfree(*per_cpu_ptr(clone->scratch, i));
+
+ *per_cpu_ptr(clone->scratch, i) = scratch;
+ }
+
+ return 0;
+}
+
+/**
+ * nft_pipapo_insert() - Validate and insert ranged elements
+ * @net: Network namespace
+ * @set: nftables API set representation
+ * @elem: nftables API element representation containing key data
+ * @ext2: Filled with pointer to &struct nft_set_ext in inserted element
+ *
+ * Return: 0 on success, error pointer on failure.
+ */
+static int nft_pipapo_insert(const struct net *net, const struct nft_set *set,
+ const struct nft_set_elem *elem,
+ struct nft_set_ext **ext2)
+{
+ const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+ union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS];
+ const u8 *start = (const u8 *)elem->key.val.data, *end;
+ struct nft_pipapo_elem *e = elem->priv, *dup;
+ struct nft_pipapo *priv = nft_set_priv(set);
+ struct nft_pipapo_match *m = priv->clone;
+ u8 genmask = nft_genmask_next(net);
+ struct nft_pipapo_field *f;
+ int i, bsize_max, err = 0;
+
+ dup = pipapo_get(net, set, start, genmask);
+ if (PTR_ERR(dup) == -ENOENT) {
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END)) {
+ end = (const u8 *)nft_set_ext_key_end(ext)->data;
+ dup = pipapo_get(net, set, end, nft_genmask_next(net));
+ } else {
+ end = start;
+ }
+ }
+
+ if (PTR_ERR(dup) != -ENOENT) {
+ if (IS_ERR(dup))
+ return PTR_ERR(dup);
+ *ext2 = &dup->ext;
+ return -EEXIST;
+ }
+
+ /* Validate */
+ nft_pipapo_for_each_field(f, i, m) {
+ const u8 *start_p = start, *end_p = end;
+
+ if (f->rules >= (unsigned long)NFT_PIPAPO_RULE0_MAX)
+ return -ENOSPC;
+
+ if (memcmp(start_p, end_p,
+ f->groups / NFT_PIPAPO_GROUPS_PER_BYTE) > 0)
+ return -EINVAL;
+
+ start_p += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups);
+ end_p += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups);
+ }
+
+ /* Insert */
+ priv->dirty = true;
+
+ bsize_max = m->bsize_max;
+
+ nft_pipapo_for_each_field(f, i, m) {
+ int ret;
+
+ rulemap[i].to = f->rules;
+
+ ret = memcmp(start, end,
+ f->groups / NFT_PIPAPO_GROUPS_PER_BYTE);
+ if (!ret) {
+ ret = pipapo_insert(f, start,
+ f->groups * NFT_PIPAPO_GROUP_BITS);
+ } else {
+ ret = pipapo_expand(f, start, end,
+ f->groups * NFT_PIPAPO_GROUP_BITS);
+ }
+
+ if (f->bsize > bsize_max)
+ bsize_max = f->bsize;
+
+ rulemap[i].n = ret;
+
+ start += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups);
+ end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups);
+ }
+
+ if (!*this_cpu_ptr(m->scratch) || bsize_max > m->bsize_max) {
+ err = pipapo_realloc_scratch(m, bsize_max);
+ if (err)
+ return err;
+
+ this_cpu_write(nft_pipapo_scratch_index, false);
+
+ m->bsize_max = bsize_max;
+ }
+
+ *ext2 = &e->ext;
+
+ pipapo_map(m, rulemap, e);
+
+ return 0;
+}
+
+/**
+ * pipapo_clone() - Clone matching data to create new working copy
+ * @old: Existing matching data
+ *
+ * Return: copy of matching data passed as 'old', error pointer on failure
+ */
+static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old)
+{
+ struct nft_pipapo_field *dst, *src;
+ struct nft_pipapo_match *new;
+ int i;
+
+ new = kmalloc(sizeof(*new) + sizeof(*dst) * old->field_count,
+ GFP_KERNEL);
+ if (!new)
+ return ERR_PTR(-ENOMEM);
+
+ new->field_count = old->field_count;
+ new->bsize_max = old->bsize_max;
+
+ new->scratch = alloc_percpu(*new->scratch);
+ if (!new->scratch)
+ goto out_scratch;
+
+ rcu_head_init(&new->rcu);
+
+ src = old->f;
+ dst = new->f;
+
+ for (i = 0; i < old->field_count; i++) {
+ memcpy(dst, src, offsetof(struct nft_pipapo_field, lt));
+
+ dst->lt = kvzalloc(src->groups * NFT_PIPAPO_BUCKETS *
+ src->bsize * sizeof(*dst->lt),
+ GFP_KERNEL);
+ if (!dst->lt)
+ goto out_lt;
+
+ memcpy(dst->lt, src->lt,
+ src->bsize * sizeof(*dst->lt) *
+ src->groups * NFT_PIPAPO_BUCKETS);
+
+ dst->mt = kvmalloc(src->rules * sizeof(*src->mt), GFP_KERNEL);
+ if (!dst->mt)
+ goto out_mt;
+
+ memcpy(dst->mt, src->mt, src->rules * sizeof(*src->mt));
+ src++;
+ dst++;
+ }
+
+ return new;
+
+out_mt:
+ kvfree(dst->lt);
+out_lt:
+ for (dst--; i > 0; i--) {
+ kvfree(dst->mt);
+ kvfree(dst->lt);
+ dst--;
+ }
+ free_percpu(new->scratch);
+out_scratch:
+ kfree(new);
+
+ return ERR_PTR(-ENOMEM);
+}
+
+/**
+ * pipapo_rules_same_key() - Get number of rules originated from the same entry
+ * @f: Field containing mapping table
+ * @first: Index of first rule in set of rules mapping to same entry
+ *
+ * Using the fact that all rules in a field that originated from the same entry
+ * will map to the same set of rules in the next field, or to the same element
+ * reference, return the cardinality of the set of rules that originated from
+ * the same entry as the rule with index @first, @first rule included.
+ *
+ * In pictures:
+ * rules
+ * field #0 0 1 2 3 4
+ * map to: 0 1 2-4 2-4 5-9
+ * . . ....... . ...
+ * | | | | \ \
+ * | | | | \ \
+ * | | | | \ \
+ * ' ' ' ' ' \
+ * in field #1 0 1 2 3 4 5 ...
+ *
+ * if this is called for rule 2 on field #0, it will return 3, as also rules 2
+ * and 3 in field 0 map to the same set of rules (2, 3, 4) in the next field.
+ *
+ * For the last field in a set, we can rely on associated entries to map to the
+ * same element references.
+ *
+ * Return: Number of rules that originated from the same entry as @first.
+ */
+static int pipapo_rules_same_key(struct nft_pipapo_field *f, int first)
+{
+ struct nft_pipapo_elem *e = NULL; /* Keep gcc happy */
+ int r;
+
+ for (r = first; r < f->rules; r++) {
+ if (r != first && e != f->mt[r].e)
+ return r - first;
+
+ e = f->mt[r].e;
+ }
+
+ if (r != first)
+ return r - first;
+
+ return 0;
+}
+
+/**
+ * pipapo_unmap() - Remove rules from mapping tables, renumber remaining ones
+ * @mt: Mapping array
+ * @rules: Original amount of rules in mapping table
+ * @start: First rule index to be removed
+ * @n: Amount of rules to be removed
+ * @to_offset: First rule index, in next field, this group of rules maps to
+ * @is_last: If this is the last field, delete reference from mapping array
+ *
+ * This is used to unmap rules from the mapping table for a single field,
+ * maintaining consistency and compactness for the existing ones.
+ *
+ * In pictures: let's assume that we want to delete rules 2 and 3 from the
+ * following mapping array:
+ *
+ * rules
+ * 0 1 2 3 4
+ * map to: 4-10 4-10 11-15 11-15 16-18
+ *
+ * the result will be:
+ *
+ * rules
+ * 0 1 2
+ * map to: 4-10 4-10 11-13
+ *
+ * for fields before the last one. In case this is the mapping table for the
+ * last field in a set, and rules map to pointers to &struct nft_pipapo_elem:
+ *
+ * rules
+ * 0 1 2 3 4
+ * element pointers: 0x42 0x42 0x33 0x33 0x44
+ *
+ * the result will be:
+ *
+ * rules
+ * 0 1 2
+ * element pointers: 0x42 0x42 0x44
+ */
+static void pipapo_unmap(union nft_pipapo_map_bucket *mt, int rules,
+ int start, int n, int to_offset, bool is_last)
+{
+ int i;
+
+ memmove(mt + start, mt + start + n, (rules - start - n) * sizeof(*mt));
+ memset(mt + rules - n, 0, n * sizeof(*mt));
+
+ if (is_last)
+ return;
+
+ for (i = start; i < rules - n; i++)
+ mt[i].to -= to_offset;
+}
+
+/**
+ * pipapo_drop() - Delete entry from lookup and mapping tables, given rule map
+ * @m: Matching data
+ * @rulemap Table of rule maps, arrays of first rule and amount of rules
+ * in next field a given entry maps to, for each field
+ *
+ * For each rule in lookup table buckets mapping to this set of rules, drop
+ * all bits set in lookup table mapping. In pictures, assuming we want to drop
+ * rules 0 and 1 from this lookup table:
+ *
+ * bucket
+ * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ * 0 0 1,2
+ * 1 1,2 0
+ * 2 0 1,2
+ * 3 0 1,2
+ * 4 0,1,2
+ * 5 0 1 2
+ * 6 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
+ * 7 1,2 1,2 1 1 1 0,1 1 1 1 1 1 1 1 1 1 1
+ *
+ * rule 2 becomes rule 0, and the result will be:
+ *
+ * bucket
+ * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ * 0 0
+ * 1 0
+ * 2 0
+ * 3 0
+ * 4 0
+ * 5 0
+ * 6 0
+ * 7 0 0
+ *
+ * once this is done, call unmap() to drop all the corresponding rule references
+ * from mapping tables.
+ */
+static void pipapo_drop(struct nft_pipapo_match *m,
+ union nft_pipapo_map_bucket rulemap[])
+{
+ struct nft_pipapo_field *f;
+ int i;
+
+ nft_pipapo_for_each_field(f, i, m) {
+ int g;
+
+ for (g = 0; g < f->groups; g++) {
+ unsigned long *pos;
+ int b;
+
+ pos = f->lt + g * NFT_PIPAPO_BUCKETS * f->bsize;
+
+ for (b = 0; b < NFT_PIPAPO_BUCKETS; b++) {
+ bitmap_cut(pos, pos, rulemap[i].to,
+ rulemap[i].n,
+ f->bsize * BITS_PER_LONG);
+
+ pos += f->bsize;
+ }
+ }
+
+ pipapo_unmap(f->mt, f->rules, rulemap[i].to, rulemap[i].n,
+ rulemap[i + 1].n, i == m->field_count - 1);
+ if (pipapo_resize(f, f->rules, f->rules - rulemap[i].n)) {
+ /* We can ignore this, a failure to shrink tables down
+ * doesn't make tables invalid.
+ */
+ ;
+ }
+ f->rules -= rulemap[i].n;
+ }
+}
+
+/**
+ * pipapo_gc() - Drop expired entries from set, destroy start and end elements
+ * @set: nftables API set representation
+ * @m: Matching data
+ */
+static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m)
+{
+ struct nft_pipapo *priv = nft_set_priv(set);
+ int rules_f0, first_rule = 0;
+
+ while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) {
+ union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS];
+ struct nft_pipapo_field *f;
+ struct nft_pipapo_elem *e;
+ int i, start, rules_fx;
+
+ start = first_rule;
+ rules_fx = rules_f0;
+
+ nft_pipapo_for_each_field(f, i, m) {
+ rulemap[i].to = start;
+ rulemap[i].n = rules_fx;
+
+ if (i < m->field_count - 1) {
+ rules_fx = f->mt[start].n;
+ start = f->mt[start].to;
+ }
+ }
+
+ /* Pick the last field, and its last index */
+ f--;
+ i--;
+ e = f->mt[rulemap[i].to].e;
+ if (nft_set_elem_expired(&e->ext) &&
+ !nft_set_elem_mark_busy(&e->ext)) {
+ priv->dirty = true;
+ pipapo_drop(m, rulemap);
+
+ rcu_barrier();
+ nft_set_elem_destroy(set, e, true);
+
+ /* And check again current first rule, which is now the
+ * first we haven't checked.
+ */
+ } else {
+ first_rule += rules_f0;
+ }
+ }
+
+ priv->last_gc = jiffies;
+}
+
+/**
+ * pipapo_free_fields() - Free per-field tables contained in matching data
+ * @m: Matching data
+ */
+static void pipapo_free_fields(struct nft_pipapo_match *m)
+{
+ struct nft_pipapo_field *f;
+ int i;
+
+ nft_pipapo_for_each_field(f, i, m) {
+ kvfree(f->lt);
+ kvfree(f->mt);
+ }
+}
+
+/**
+ * pipapo_reclaim_match - RCU callback to free fields from old matching data
+ * @rcu: RCU head
+ */
+static void pipapo_reclaim_match(struct rcu_head *rcu)
+{
+ struct nft_pipapo_match *m;
+ int i;
+
+ m = container_of(rcu, struct nft_pipapo_match, rcu);
+
+ for_each_possible_cpu(i)
+ kfree(*per_cpu_ptr(m->scratch, i));
+
+ free_percpu(m->scratch);
+
+ pipapo_free_fields(m);
+
+ kfree(m);
+}
+
+/**
+ * pipapo_commit() - Replace lookup data with current working copy
+ * @set: nftables API set representation
+ *
+ * While at it, check if we should perform garbage collection on the working
+ * copy before committing it for lookup, and don't replace the table if the
+ * working copy doesn't have pending changes.
+ *
+ * We also need to create a new working copy for subsequent insertions and
+ * deletions.
+ */
+static void pipapo_commit(const struct nft_set *set)
+{
+ struct nft_pipapo *priv = nft_set_priv(set);
+ struct nft_pipapo_match *new_clone, *old;
+
+ if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set)))
+ pipapo_gc(set, priv->clone);
+
+ if (!priv->dirty)
+ return;
+
+ new_clone = pipapo_clone(priv->clone);
+ if (IS_ERR(new_clone))
+ return;
+
+ priv->dirty = false;
+
+ old = rcu_access_pointer(priv->match);
+ rcu_assign_pointer(priv->match, priv->clone);
+ if (old)
+ call_rcu(&old->rcu, pipapo_reclaim_match);
+
+ priv->clone = new_clone;
+}
+
+/**
+ * nft_pipapo_activate() - Mark element reference as active given key, commit
+ * @net: Network namespace
+ * @set: nftables API set representation
+ * @elem: nftables API element representation containing key data
+ *
+ * On insertion, elements are added to a copy of the matching data currently
+ * in use for lookups, and not directly inserted into current lookup data, so
+ * we'll take care of that by calling pipapo_commit() here. Both
+ * nft_pipapo_insert() and nft_pipapo_activate() are called once for each
+ * element, hence we can't purpose either one as a real commit operation.
+ */
+static void nft_pipapo_activate(const struct net *net,
+ const struct nft_set *set,
+ const struct nft_set_elem *elem)
+{
+ struct nft_pipapo_elem *e;
+
+ e = pipapo_get(net, set, (const u8 *)elem->key.val.data, 0);
+ if (IS_ERR(e))
+ return;
+
+ nft_set_elem_change_active(net, set, &e->ext);
+ nft_set_elem_clear_busy(&e->ext);
+
+ pipapo_commit(set);
+}
+
+/**
+ * pipapo_deactivate() - Check that element is in set, mark as inactive
+ * @net: Network namespace
+ * @set: nftables API set representation
+ * @data: Input key data
+ * @ext: nftables API extension pointer, used to check for end element
+ *
+ * This is a convenience function that can be called from both
+ * nft_pipapo_deactivate() and nft_pipapo_flush(), as they are in fact the same
+ * operation.
+ *
+ * Return: deactivated element if found, NULL otherwise.
+ */
+static void *pipapo_deactivate(const struct net *net, const struct nft_set *set,
+ const u8 *data, const struct nft_set_ext *ext)
+{
+ struct nft_pipapo_elem *e;
+
+ e = pipapo_get(net, set, data, nft_genmask_next(net));
+ if (IS_ERR(e))
+ return NULL;
+
+ nft_set_elem_change_active(net, set, &e->ext);
+
+ return e;
+}
+
+/**
+ * nft_pipapo_deactivate() - Call pipapo_deactivate() to make element inactive
+ * @net: Network namespace
+ * @set: nftables API set representation
+ * @elem: nftables API element representation containing key data
+ *
+ * Return: deactivated element if found, NULL otherwise.
+ */
+static void *nft_pipapo_deactivate(const struct net *net,
+ const struct nft_set *set,
+ const struct nft_set_elem *elem)
+{
+ const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+
+ return pipapo_deactivate(net, set, (const u8 *)elem->key.val.data, ext);
+}
+
+/**
+ * nft_pipapo_flush() - Call pipapo_deactivate() to make element inactive
+ * @net: Network namespace
+ * @set: nftables API set representation
+ * @elem: nftables API element representation containing key data
+ *
+ * This is functionally the same as nft_pipapo_deactivate(), with a slightly
+ * different interface, and it's also called once for each element in a set
+ * being flushed, so we can't implement, strictly speaking, a flush operation,
+ * which would otherwise be as simple as allocating an empty copy of the
+ * matching data.
+ *
+ * Note that we could in theory do that, mark the set as flushed, and ignore
+ * subsequent calls, but we would leak all the elements after the first one,
+ * because they wouldn't then be freed as result of API calls.
+ *
+ * Return: true if element was found and deactivated.
+ */
+static bool nft_pipapo_flush(const struct net *net, const struct nft_set *set,
+ void *elem)
+{
+ struct nft_pipapo_elem *e = elem;
+
+ return pipapo_deactivate(net, set, (const u8 *)nft_set_ext_key(&e->ext),
+ &e->ext);
+}
+
+/**
+ * pipapo_get_boundaries() - Get byte interval for associated rules
+ * @f: Field including lookup table
+ * @first_rule: First rule (lowest index)
+ * @rule_count: Number of associated rules
+ * @left: Byte expression for left boundary (start of range)
+ * @right: Byte expression for right boundary (end of range)
+ *
+ * Given the first rule and amount of rules that originated from the same entry,
+ * build the original range associated with the entry, and calculate the length
+ * of the originating netmask.
+ *
+ * In pictures:
+ *
+ * bucket
+ * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ * 0 1,2
+ * 1 1,2
+ * 2 1,2
+ * 3 1,2
+ * 4 1,2
+ * 5 1 2
+ * 6 1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
+ * 7 1,2 1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1
+ *
+ * this is the lookup table corresponding to the IPv4 range
+ * 192.168.1.0-192.168.2.1, which was expanded to the two composing netmasks,
+ * rule #1: 192.168.1.0/24, and rule #2: 192.168.2.0/31.
+ *
+ * This function fills @left and @right with the byte values of the leftmost
+ * and rightmost bucket indices for the lowest and highest rule indices,
+ * respectively. If @first_rule is 1 and @rule_count is 2, we obtain, in
+ * nibbles:
+ * left: < 12, 0, 10, 8, 0, 1, 0, 0 >
+ * right: < 12, 0, 10, 8, 0, 2, 2, 1 >
+ * corresponding to bytes:
+ * left: < 192, 168, 1, 0 >
+ * right: < 192, 168, 2, 1 >
+ * with mask length irrelevant here, unused on return, as the range is already
+ * defined by its start and end points. The mask length is relevant for a single
+ * ranged entry instead: if @first_rule is 1 and @rule_count is 1, we ignore
+ * rule 2 above: @left becomes < 192, 168, 1, 0 >, @right becomes
+ * < 192, 168, 1, 255 >, and the mask length, calculated from the distances
+ * between leftmost and rightmost bucket indices for each group, would be 24.
+ *
+ * Return: mask length, in bits.
+ */
+static int pipapo_get_boundaries(struct nft_pipapo_field *f, int first_rule,
+ int rule_count, u8 *left, u8 *right)
+{
+ u8 *l = left, *r = right;
+ int g, mask_len = 0;
+
+ for (g = 0; g < f->groups; g++) {
+ int b, x0, x1;
+
+ x0 = -1;
+ x1 = -1;
+ for (b = 0; b < NFT_PIPAPO_BUCKETS; b++) {
+ unsigned long *pos;
+
+ pos = f->lt + (g * NFT_PIPAPO_BUCKETS + b) * f->bsize;
+ if (test_bit(first_rule, pos) && x0 == -1)
+ x0 = b;
+ if (test_bit(first_rule + rule_count - 1, pos))
+ x1 = b;
+ }
+
+ if (g % 2) {
+ *(l++) |= x0 & 0x0f;
+ *(r++) |= x1 & 0x0f;
+ } else {
+ *l |= x0 << 4;
+ *r |= x1 << 4;
+ }
+
+ if (x1 - x0 == 0)
+ mask_len += 4;
+ else if (x1 - x0 == 1)
+ mask_len += 3;
+ else if (x1 - x0 == 3)
+ mask_len += 2;
+ else if (x1 - x0 == 7)
+ mask_len += 1;
+ }
+
+ return mask_len;
+}
+
+/**
+ * pipapo_match_field() - Match rules against byte ranges
+ * @f: Field including the lookup table
+ * @first_rule: First of associated rules originating from same entry
+ * @rule_count: Amount of associated rules
+ * @start: Start of range to be matched
+ * @end: End of range to be matched
+ *
+ * Return: true on match, false otherwise.
+ */
+static bool pipapo_match_field(struct nft_pipapo_field *f,
+ int first_rule, int rule_count,
+ const u8 *start, const u8 *end)
+{
+ u8 right[NFT_PIPAPO_MAX_BYTES] = { 0 };
+ u8 left[NFT_PIPAPO_MAX_BYTES] = { 0 };
+
+ pipapo_get_boundaries(f, first_rule, rule_count, left, right);
+
+ return !memcmp(start, left, f->groups / NFT_PIPAPO_GROUPS_PER_BYTE) &&
+ !memcmp(end, right, f->groups / NFT_PIPAPO_GROUPS_PER_BYTE);
+}
+
+/**
+ * nft_pipapo_remove() - Remove element given key, commit
+ * @net: Network namespace
+ * @set: nftables API set representation
+ * @elem: nftables API element representation containing key data
+ *
+ * Similarly to nft_pipapo_activate(), this is used as commit operation by the
+ * API, but it's called once per element in the pending transaction, so we can't
+ * implement this as a single commit operation. Closest we can get is to remove
+ * the matched element here, if any, and commit the updated matching data.
+ */
+static void nft_pipapo_remove(const struct net *net, const struct nft_set *set,
+ const struct nft_set_elem *elem)
+{
+ const u8 *data = (const u8 *)elem->key.val.data;
+ struct nft_pipapo *priv = nft_set_priv(set);
+ struct nft_pipapo_match *m = priv->clone;
+ int rules_f0, first_rule = 0;
+ struct nft_pipapo_elem *e;
+
+ e = pipapo_get(net, set, data, 0);
+ if (IS_ERR(e))
+ return;
+
+ while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) {
+ union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS];
+ const u8 *match_start, *match_end;
+ struct nft_pipapo_field *f;
+ int i, start, rules_fx;
+
+ match_start = data;
+ match_end = (const u8 *)nft_set_ext_key_end(&e->ext)->data;
+
+ start = first_rule;
+ rules_fx = rules_f0;
+
+ nft_pipapo_for_each_field(f, i, m) {
+ if (!pipapo_match_field(f, start, rules_fx,
+ match_start, match_end))
+ break;
+
+ rulemap[i].to = start;
+ rulemap[i].n = rules_fx;
+
+ rules_fx = f->mt[start].n;
+ start = f->mt[start].to;
+
+ match_start += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups);
+ match_end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups);
+ }
+
+ if (i == m->field_count) {
+ priv->dirty = true;
+ pipapo_drop(m, rulemap);
+ pipapo_commit(set);
+ return;
+ }
+
+ first_rule += rules_f0;
+ }
+}
+
+/**
+ * nft_pipapo_walk() - Walk over elements
+ * @ctx: nftables API context
+ * @set: nftables API set representation
+ * @iter: Iterator
+ *
+ * As elements are referenced in the mapping array for the last field, directly
+ * scan that array: there's no need to follow rule mappings from the first
+ * field.
+ */
+static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set,
+ struct nft_set_iter *iter)
+{
+ struct nft_pipapo *priv = nft_set_priv(set);
+ struct nft_pipapo_match *m;
+ struct nft_pipapo_field *f;
+ int i, r;
+
+ rcu_read_lock();
+ m = rcu_dereference(priv->match);
+
+ if (unlikely(!m))
+ goto out;
+
+ for (i = 0, f = m->f; i < m->field_count - 1; i++, f++)
+ ;
+
+ for (r = 0; r < f->rules; r++) {
+ struct nft_pipapo_elem *e;
+ struct nft_set_elem elem;
+
+ if (r < f->rules - 1 && f->mt[r + 1].e == f->mt[r].e)
+ continue;
+
+ if (iter->count < iter->skip)
+ goto cont;
+
+ e = f->mt[r].e;
+ if (nft_set_elem_expired(&e->ext))
+ goto cont;
+
+ elem.priv = e;
+
+ iter->err = iter->fn(ctx, set, iter, &elem);
+ if (iter->err < 0)
+ goto out;
+
+cont:
+ iter->count++;
+ }
+
+out:
+ rcu_read_unlock();
+}
+
+/**
+ * nft_pipapo_privsize() - Return the size of private data for the set
+ * @nla: netlink attributes, ignored as size doesn't depend on them
+ * @desc: Set description, ignored as size doesn't depend on it
+ *
+ * Return: size of private data for this set implementation, in bytes
+ */
+static u64 nft_pipapo_privsize(const struct nlattr * const nla[],
+ const struct nft_set_desc *desc)
+{
+ return sizeof(struct nft_pipapo);
+}
+
+/**
+ * nft_pipapo_estimate() - Estimate set size, space and lookup complexity
+ * @desc: Set description, element count and field description used here
+ * @features: Flags: NFT_SET_INTERVAL needs to be there
+ * @est: Storage for estimation data
+ *
+ * The size for this set type can vary dramatically, as it depends on the number
+ * of rules (composing netmasks) the entries expand to. We compute the worst
+ * case here.
+ *
+ * In general, for a non-ranged entry or a single composing netmask, we need
+ * one bit in each of the sixteen NFT_PIPAPO_BUCKETS, for each 4-bit group (that
+ * is, each input bit needs four bits of matching data), plus a bucket in the
+ * mapping table for each field.
+ *
+ * Return: true only for compatible range concatenations
+ */
+static bool nft_pipapo_estimate(const struct nft_set_desc *desc, u32 features,
+ struct nft_set_estimate *est)
+{
+ unsigned long entry_size;
+ int i;
+
+ if (!(features & NFT_SET_INTERVAL) || desc->field_count <= 1)
+ return false;
+
+ for (i = 0, entry_size = 0; i < desc->field_count; i++) {
+ unsigned long rules;
+
+ if (desc->field_len[i] > NFT_PIPAPO_MAX_BYTES)
+ return false;
+
+ /* Worst-case ranges for each concatenated field: each n-bit
+ * field can expand to up to n * 2 rules in each bucket, and
+ * each rule also needs a mapping bucket.
+ */
+ rules = ilog2(desc->field_len[i] * BITS_PER_BYTE) * 2;
+ entry_size += rules * NFT_PIPAPO_BUCKETS / BITS_PER_BYTE;
+ entry_size += rules * sizeof(union nft_pipapo_map_bucket);
+ }
+
+ /* Rules in lookup and mapping tables are needed for each entry */
+ est->size = desc->size * entry_size;
+ if (est->size && div_u64(est->size, desc->size) != entry_size)
+ return false;
+
+ est->size += sizeof(struct nft_pipapo) +
+ sizeof(struct nft_pipapo_match) * 2;
+
+ est->size += sizeof(struct nft_pipapo_field) * desc->field_count;
+
+ est->lookup = NFT_SET_CLASS_O_LOG_N;
+
+ est->space = NFT_SET_CLASS_O_N;
+
+ return true;
+}
+
+/**
+ * nft_pipapo_init() - Initialise data for a set instance
+ * @set: nftables API set representation
+ * @desc: Set description
+ * @nla: netlink attributes
+ *
+ * Validate number and size of fields passed as NFTA_SET_DESC_CONCAT netlink
+ * attributes, initialise internal set parameters, current instance of matching
+ * data and a copy for subsequent insertions.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+static int nft_pipapo_init(const struct nft_set *set,
+ const struct nft_set_desc *desc,
+ const struct nlattr * const nla[])
+{
+ struct nft_pipapo *priv = nft_set_priv(set);
+ struct nft_pipapo_match *m;
+ struct nft_pipapo_field *f;
+ int err, i;
+
+ if (desc->field_count > NFT_PIPAPO_MAX_FIELDS)
+ return -EINVAL;
+
+ m = kmalloc(sizeof(*priv->match) + sizeof(*f) * desc->field_count,
+ GFP_KERNEL);
+ if (!m)
+ return -ENOMEM;
+
+ m->field_count = desc->field_count;
+ m->bsize_max = 0;
+
+ m->scratch = alloc_percpu(unsigned long *);
+ if (!m->scratch) {
+ err = -ENOMEM;
+ goto out_free;
+ }
+ for_each_possible_cpu(i)
+ *per_cpu_ptr(m->scratch, i) = NULL;
+
+ rcu_head_init(&m->rcu);
+
+ nft_pipapo_for_each_field(f, i, m) {
+ f->groups = desc->field_len[i] * NFT_PIPAPO_GROUPS_PER_BYTE;
+ priv->groups += f->groups;
+
+ priv->width += round_up(desc->field_len[i], sizeof(u32));
+
+ f->bsize = 0;
+ f->rules = 0;
+ f->lt = NULL;
+ f->mt = NULL;
+ }
+
+ /* Create an initial clone of matching data for next insertion */
+ priv->clone = pipapo_clone(m);
+ if (IS_ERR(priv->clone)) {
+ err = PTR_ERR(priv->clone);
+ goto out_free;
+ }
+
+ priv->dirty = false;
+
+ rcu_assign_pointer(priv->match, m);
+
+ return 0;
+
+out_free:
+ free_percpu(m->scratch);
+ kfree(m);
+
+ return err;
+}
+
+/**
+ * nft_pipapo_destroy() - Free private data for set and all committed elements
+ * @set: nftables API set representation
+ */
+static void nft_pipapo_destroy(const struct nft_set *set)
+{
+ struct nft_pipapo *priv = nft_set_priv(set);
+ struct nft_pipapo_match *m;
+ struct nft_pipapo_field *f;
+ int i, r, cpu;
+
+ m = rcu_dereference_protected(priv->match, true);
+ if (m) {
+ rcu_barrier();
+
+ for (i = 0, f = m->f; i < m->field_count - 1; i++, f++)
+ ;
+
+ for (r = 0; r < f->rules; r++) {
+ struct nft_pipapo_elem *e;
+
+ if (r < f->rules - 1 && f->mt[r + 1].e == f->mt[r].e)
+ continue;
+
+ e = f->mt[r].e;
+
+ nft_set_elem_destroy(set, e, true);
+ }
+
+ for_each_possible_cpu(cpu)
+ kfree(*per_cpu_ptr(m->scratch, cpu));
+ free_percpu(m->scratch);
+
+ pipapo_free_fields(m);
+ kfree(m);
+ priv->match = NULL;
+ }
+
+ if (priv->clone) {
+ for_each_possible_cpu(cpu)
+ kfree(*per_cpu_ptr(priv->clone->scratch, cpu));
+ free_percpu(priv->clone->scratch);
+
+ pipapo_free_fields(priv->clone);
+ kfree(priv->clone);
+ priv->clone = NULL;
+ }
+}
+
+/**
+ * nft_pipapo_gc_init() - Initialise garbage collection
+ * @set: nftables API set representation
+ *
+ * Instead of actually setting up a periodic work for garbage collection, as
+ * this operation requires a swap of matching data with the working copy, we'll
+ * do that opportunistically with other commit operations if the interval is
+ * elapsed, so we just need to set the current jiffies timestamp here.
+ */
+static void nft_pipapo_gc_init(const struct nft_set *set)
+{
+ struct nft_pipapo *priv = nft_set_priv(set);
+
+ priv->last_gc = jiffies;
+}
+
+struct nft_set_type nft_set_pipapo_type __read_mostly = {
+ .owner = THIS_MODULE,
+ .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT |
+ NFT_SET_TIMEOUT,
+ .ops = {
+ .lookup = nft_pipapo_lookup,
+ .insert = nft_pipapo_insert,
+ .activate = nft_pipapo_activate,
+ .deactivate = nft_pipapo_deactivate,
+ .flush = nft_pipapo_flush,
+ .remove = nft_pipapo_remove,
+ .walk = nft_pipapo_walk,
+ .get = nft_pipapo_get,
+ .privsize = nft_pipapo_privsize,
+ .estimate = nft_pipapo_estimate,
+ .init = nft_pipapo_init,
+ .destroy = nft_pipapo_destroy,
+ .gc_init = nft_pipapo_gc_init,
+ .elemsize = offsetof(struct nft_pipapo_elem, ext),
+ },
+};
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index 419d58ef802b..5000b938ab1e 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -13,7 +13,7 @@
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
-#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
struct nft_rbtree {
struct rb_root root;
@@ -74,8 +74,13 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set
parent = rcu_dereference_raw(parent->rb_left);
continue;
}
- if (nft_rbtree_interval_end(rbe))
- goto out;
+ if (nft_rbtree_interval_end(rbe)) {
+ if (nft_set_is_anonymous(set))
+ return false;
+ parent = rcu_dereference_raw(parent->rb_left);
+ interval = NULL;
+ continue;
+ }
*ext = &rbe->ext;
return true;
@@ -88,7 +93,7 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set
*ext = &interval->ext;
return true;
}
-out:
+
return false;
}
@@ -139,8 +144,10 @@ static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set,
if (flags & NFT_SET_ELEM_INTERVAL_END)
interval = rbe;
} else {
- if (!nft_set_elem_active(&rbe->ext, genmask))
+ if (!nft_set_elem_active(&rbe->ext, genmask)) {
parent = rcu_dereference_raw(parent->rb_left);
+ continue;
+ }
if (!nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) ||
(*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END) ==
@@ -148,7 +155,11 @@ static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set,
*elem = rbe;
return true;
}
- return false;
+
+ if (nft_rbtree_interval_end(rbe))
+ interval = NULL;
+
+ parent = rcu_dereference_raw(parent->rb_left);
}
}
@@ -455,6 +466,9 @@ static void nft_rbtree_destroy(const struct nft_set *set)
static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features,
struct nft_set_estimate *est)
{
+ if (desc->field_count > 1)
+ return false;
+
if (desc->size)
est->size = sizeof(struct nft_rbtree) +
desc->size * sizeof(struct nft_rbtree_elem);
diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c
index d7f3776dfd71..637ce3e8c575 100644
--- a/net/netfilter/nft_socket.c
+++ b/net/netfilter/nft_socket.c
@@ -47,9 +47,6 @@ static void nft_socket_eval(const struct nft_expr *expr,
return;
}
- /* So that subsequent socket matching not to require other lookups. */
- skb->sk = sk;
-
switch(priv->key) {
case NFT_SOCKET_TRANSPARENT:
nft_reg_store8(dest, inet_sk_transparent(sk));
@@ -66,6 +63,9 @@ static void nft_socket_eval(const struct nft_expr *expr,
WARN_ON(1);
regs->verdict.code = NFT_BREAK;
}
+
+ if (sk != skb->sk)
+ sock_gen_put(sk);
}
static const struct nla_policy nft_socket_policy[NFTA_SOCKET_MAX + 1] = {
diff --git a/net/netfilter/nft_synproxy.c b/net/netfilter/nft_synproxy.c
index 928e661d1517..e2c1fc608841 100644
--- a/net/netfilter/nft_synproxy.c
+++ b/net/netfilter/nft_synproxy.c
@@ -24,15 +24,15 @@ static void nft_synproxy_tcp_options(struct synproxy_options *opts,
const struct tcphdr *tcp,
struct synproxy_net *snet,
struct nf_synproxy_info *info,
- struct nft_synproxy *priv)
+ const struct nft_synproxy *priv)
{
this_cpu_inc(snet->stats->syn_received);
if (tcp->ece && tcp->cwr)
opts->options |= NF_SYNPROXY_OPT_ECN;
opts->options &= priv->info.options;
- opts->mss_encode = opts->mss;
- opts->mss = info->mss;
+ opts->mss_encode = opts->mss_option;
+ opts->mss_option = info->mss;
if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP)
synproxy_init_timestamp_cookie(info, opts);
else
@@ -41,14 +41,13 @@ static void nft_synproxy_tcp_options(struct synproxy_options *opts,
NF_SYNPROXY_OPT_ECN);
}
-static void nft_synproxy_eval_v4(const struct nft_expr *expr,
+static void nft_synproxy_eval_v4(const struct nft_synproxy *priv,
struct nft_regs *regs,
const struct nft_pktinfo *pkt,
const struct tcphdr *tcp,
struct tcphdr *_tcph,
struct synproxy_options *opts)
{
- struct nft_synproxy *priv = nft_expr_priv(expr);
struct nf_synproxy_info info = priv->info;
struct net *net = nft_net(pkt);
struct synproxy_net *snet = synproxy_pernet(net);
@@ -73,14 +72,13 @@ static void nft_synproxy_eval_v4(const struct nft_expr *expr,
}
#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
-static void nft_synproxy_eval_v6(const struct nft_expr *expr,
+static void nft_synproxy_eval_v6(const struct nft_synproxy *priv,
struct nft_regs *regs,
const struct nft_pktinfo *pkt,
const struct tcphdr *tcp,
struct tcphdr *_tcph,
struct synproxy_options *opts)
{
- struct nft_synproxy *priv = nft_expr_priv(expr);
struct nf_synproxy_info info = priv->info;
struct net *net = nft_net(pkt);
struct synproxy_net *snet = synproxy_pernet(net);
@@ -105,9 +103,9 @@ static void nft_synproxy_eval_v6(const struct nft_expr *expr,
}
#endif /* CONFIG_NF_TABLES_IPV6*/
-static void nft_synproxy_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
+static void nft_synproxy_do_eval(const struct nft_synproxy *priv,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
{
struct synproxy_options opts = {};
struct sk_buff *skb = pkt->skb;
@@ -140,23 +138,22 @@ static void nft_synproxy_eval(const struct nft_expr *expr,
switch (skb->protocol) {
case htons(ETH_P_IP):
- nft_synproxy_eval_v4(expr, regs, pkt, tcp, &_tcph, &opts);
+ nft_synproxy_eval_v4(priv, regs, pkt, tcp, &_tcph, &opts);
return;
#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
case htons(ETH_P_IPV6):
- nft_synproxy_eval_v6(expr, regs, pkt, tcp, &_tcph, &opts);
+ nft_synproxy_eval_v6(priv, regs, pkt, tcp, &_tcph, &opts);
return;
#endif
}
regs->verdict.code = NFT_BREAK;
}
-static int nft_synproxy_init(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nlattr * const tb[])
+static int nft_synproxy_do_init(const struct nft_ctx *ctx,
+ const struct nlattr * const tb[],
+ struct nft_synproxy *priv)
{
struct synproxy_net *snet = synproxy_pernet(ctx->net);
- struct nft_synproxy *priv = nft_expr_priv(expr);
u32 flags;
int err;
@@ -206,8 +203,7 @@ nf_ct_failure:
return err;
}
-static void nft_synproxy_destroy(const struct nft_ctx *ctx,
- const struct nft_expr *expr)
+static void nft_synproxy_do_destroy(const struct nft_ctx *ctx)
{
struct synproxy_net *snet = synproxy_pernet(ctx->net);
@@ -229,10 +225,8 @@ static void nft_synproxy_destroy(const struct nft_ctx *ctx,
nf_ct_netns_put(ctx->net, ctx->family);
}
-static int nft_synproxy_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_synproxy_do_dump(struct sk_buff *skb, struct nft_synproxy *priv)
{
- const struct nft_synproxy *priv = nft_expr_priv(expr);
-
if (nla_put_be16(skb, NFTA_SYNPROXY_MSS, htons(priv->info.mss)) ||
nla_put_u8(skb, NFTA_SYNPROXY_WSCALE, priv->info.wscale) ||
nla_put_be32(skb, NFTA_SYNPROXY_FLAGS, htonl(priv->info.options)))
@@ -244,6 +238,15 @@ nla_put_failure:
return -1;
}
+static void nft_synproxy_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_synproxy *priv = nft_expr_priv(expr);
+
+ nft_synproxy_do_eval(priv, regs, pkt);
+}
+
static int nft_synproxy_validate(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nft_data **data)
@@ -252,6 +255,28 @@ static int nft_synproxy_validate(const struct nft_ctx *ctx,
(1 << NF_INET_FORWARD));
}
+static int nft_synproxy_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_synproxy *priv = nft_expr_priv(expr);
+
+ return nft_synproxy_do_init(ctx, tb, priv);
+}
+
+static void nft_synproxy_destroy(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ nft_synproxy_do_destroy(ctx);
+}
+
+static int nft_synproxy_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ struct nft_synproxy *priv = nft_expr_priv(expr);
+
+ return nft_synproxy_do_dump(skb, priv);
+}
+
static struct nft_expr_type nft_synproxy_type;
static const struct nft_expr_ops nft_synproxy_ops = {
.eval = nft_synproxy_eval,
@@ -271,14 +296,89 @@ static struct nft_expr_type nft_synproxy_type __read_mostly = {
.maxattr = NFTA_SYNPROXY_MAX,
};
+static int nft_synproxy_obj_init(const struct nft_ctx *ctx,
+ const struct nlattr * const tb[],
+ struct nft_object *obj)
+{
+ struct nft_synproxy *priv = nft_obj_data(obj);
+
+ return nft_synproxy_do_init(ctx, tb, priv);
+}
+
+static void nft_synproxy_obj_destroy(const struct nft_ctx *ctx,
+ struct nft_object *obj)
+{
+ nft_synproxy_do_destroy(ctx);
+}
+
+static int nft_synproxy_obj_dump(struct sk_buff *skb,
+ struct nft_object *obj, bool reset)
+{
+ struct nft_synproxy *priv = nft_obj_data(obj);
+
+ return nft_synproxy_do_dump(skb, priv);
+}
+
+static void nft_synproxy_obj_eval(struct nft_object *obj,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_synproxy *priv = nft_obj_data(obj);
+
+ nft_synproxy_do_eval(priv, regs, pkt);
+}
+
+static void nft_synproxy_obj_update(struct nft_object *obj,
+ struct nft_object *newobj)
+{
+ struct nft_synproxy *newpriv = nft_obj_data(newobj);
+ struct nft_synproxy *priv = nft_obj_data(obj);
+
+ priv->info = newpriv->info;
+}
+
+static struct nft_object_type nft_synproxy_obj_type;
+static const struct nft_object_ops nft_synproxy_obj_ops = {
+ .type = &nft_synproxy_obj_type,
+ .size = sizeof(struct nft_synproxy),
+ .init = nft_synproxy_obj_init,
+ .destroy = nft_synproxy_obj_destroy,
+ .dump = nft_synproxy_obj_dump,
+ .eval = nft_synproxy_obj_eval,
+ .update = nft_synproxy_obj_update,
+};
+
+static struct nft_object_type nft_synproxy_obj_type __read_mostly = {
+ .type = NFT_OBJECT_SYNPROXY,
+ .ops = &nft_synproxy_obj_ops,
+ .maxattr = NFTA_SYNPROXY_MAX,
+ .policy = nft_synproxy_policy,
+ .owner = THIS_MODULE,
+};
+
static int __init nft_synproxy_module_init(void)
{
- return nft_register_expr(&nft_synproxy_type);
+ int err;
+
+ err = nft_register_obj(&nft_synproxy_obj_type);
+ if (err < 0)
+ return err;
+
+ err = nft_register_expr(&nft_synproxy_type);
+ if (err < 0)
+ goto err;
+
+ return 0;
+
+err:
+ nft_unregister_obj(&nft_synproxy_obj_type);
+ return err;
}
static void __exit nft_synproxy_module_exit(void)
{
- return nft_unregister_expr(&nft_synproxy_type);
+ nft_unregister_expr(&nft_synproxy_type);
+ nft_unregister_obj(&nft_synproxy_obj_type);
}
module_init(nft_synproxy_module_init);
@@ -287,3 +387,4 @@ module_exit(nft_synproxy_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Fernando Fernandez <ffmancera@riseup.net>");
MODULE_ALIAS_NFT_EXPR("synproxy");
+MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_SYNPROXY);
diff --git a/net/netfilter/nft_tproxy.c b/net/netfilter/nft_tproxy.c
index f92a82c73880..d67f83a0958d 100644
--- a/net/netfilter/nft_tproxy.c
+++ b/net/netfilter/nft_tproxy.c
@@ -50,7 +50,7 @@ static void nft_tproxy_eval_v4(const struct nft_expr *expr,
taddr = nf_tproxy_laddr4(skb, taddr, iph->daddr);
if (priv->sreg_port)
- tport = regs->data[priv->sreg_port];
+ tport = nft_reg_load16(&regs->data[priv->sreg_port]);
if (!tport)
tport = hp->dest;
@@ -117,7 +117,7 @@ static void nft_tproxy_eval_v6(const struct nft_expr *expr,
taddr = *nf_tproxy_laddr6(skb, &taddr, &iph->daddr);
if (priv->sreg_port)
- tport = regs->data[priv->sreg_port];
+ tport = nft_reg_load16(&regs->data[priv->sreg_port]);
if (!tport)
tport = hp->dest;
@@ -218,14 +218,14 @@ static int nft_tproxy_init(const struct nft_ctx *ctx,
switch (priv->family) {
case NFPROTO_IPV4:
- alen = FIELD_SIZEOF(union nf_inet_addr, in);
+ alen = sizeof_field(union nf_inet_addr, in);
err = nf_defrag_ipv4_enable(ctx->net);
if (err)
return err;
break;
#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
case NFPROTO_IPV6:
- alen = FIELD_SIZEOF(union nf_inet_addr, in6);
+ alen = sizeof_field(union nf_inet_addr, in6);
err = nf_defrag_ipv6_enable(ctx->net);
if (err)
return err;
diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c
index 3d4c2ae605a8..4c3f2e24c7cb 100644
--- a/net/netfilter/nft_tunnel.c
+++ b/net/netfilter/nft_tunnel.c
@@ -76,7 +76,7 @@ static int nft_tunnel_get_init(const struct nft_ctx *ctx,
struct nft_tunnel *priv = nft_expr_priv(expr);
u32 len;
- if (!tb[NFTA_TUNNEL_KEY] &&
+ if (!tb[NFTA_TUNNEL_KEY] ||
!tb[NFTA_TUNNEL_DREG])
return -EINVAL;
@@ -248,8 +248,9 @@ static int nft_tunnel_obj_vxlan_init(const struct nlattr *attr,
}
static const struct nla_policy nft_tunnel_opts_erspan_policy[NFTA_TUNNEL_KEY_ERSPAN_MAX + 1] = {
+ [NFTA_TUNNEL_KEY_ERSPAN_VERSION] = { .type = NLA_U32 },
[NFTA_TUNNEL_KEY_ERSPAN_V1_INDEX] = { .type = NLA_U32 },
- [NFTA_TUNNEL_KEY_ERSPAN_V2_DIR] = { .type = NLA_U8 },
+ [NFTA_TUNNEL_KEY_ERSPAN_V2_DIR] = { .type = NLA_U8 },
[NFTA_TUNNEL_KEY_ERSPAN_V2_HWID] = { .type = NLA_U8 },
};
@@ -266,6 +267,9 @@ static int nft_tunnel_obj_erspan_init(const struct nlattr *attr,
if (err < 0)
return err;
+ if (!tb[NFTA_TUNNEL_KEY_ERSPAN_VERSION])
+ return -EINVAL;
+
version = ntohl(nla_get_be32(tb[NFTA_TUNNEL_KEY_ERSPAN_VERSION]));
switch (version) {
case ERSPAN_VERSION:
@@ -442,10 +446,15 @@ static int nft_tunnel_ip_dump(struct sk_buff *skb, struct ip_tunnel_info *info)
if (!nest)
return -1;
- if (nla_put_in6_addr(skb, NFTA_TUNNEL_KEY_IP6_SRC, &info->key.u.ipv6.src) < 0 ||
- nla_put_in6_addr(skb, NFTA_TUNNEL_KEY_IP6_DST, &info->key.u.ipv6.dst) < 0 ||
- nla_put_be32(skb, NFTA_TUNNEL_KEY_IP6_FLOWLABEL, info->key.label))
+ if (nla_put_in6_addr(skb, NFTA_TUNNEL_KEY_IP6_SRC,
+ &info->key.u.ipv6.src) < 0 ||
+ nla_put_in6_addr(skb, NFTA_TUNNEL_KEY_IP6_DST,
+ &info->key.u.ipv6.dst) < 0 ||
+ nla_put_be32(skb, NFTA_TUNNEL_KEY_IP6_FLOWLABEL,
+ info->key.label)) {
+ nla_nest_cancel(skb, nest);
return -1;
+ }
nla_nest_end(skb, nest);
} else {
@@ -453,9 +462,13 @@ static int nft_tunnel_ip_dump(struct sk_buff *skb, struct ip_tunnel_info *info)
if (!nest)
return -1;
- if (nla_put_in_addr(skb, NFTA_TUNNEL_KEY_IP_SRC, info->key.u.ipv4.src) < 0 ||
- nla_put_in_addr(skb, NFTA_TUNNEL_KEY_IP_DST, info->key.u.ipv4.dst) < 0)
+ if (nla_put_in_addr(skb, NFTA_TUNNEL_KEY_IP_SRC,
+ info->key.u.ipv4.src) < 0 ||
+ nla_put_in_addr(skb, NFTA_TUNNEL_KEY_IP_DST,
+ info->key.u.ipv4.dst) < 0) {
+ nla_nest_cancel(skb, nest);
return -1;
+ }
nla_nest_end(skb, nest);
}
@@ -467,42 +480,58 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb,
struct nft_tunnel_obj *priv)
{
struct nft_tunnel_opts *opts = &priv->opts;
- struct nlattr *nest;
+ struct nlattr *nest, *inner;
nest = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS);
if (!nest)
return -1;
if (opts->flags & TUNNEL_VXLAN_OPT) {
+ inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_VXLAN);
+ if (!inner)
+ goto failure;
if (nla_put_be32(skb, NFTA_TUNNEL_KEY_VXLAN_GBP,
htonl(opts->u.vxlan.gbp)))
- return -1;
+ goto inner_failure;
+ nla_nest_end(skb, inner);
} else if (opts->flags & TUNNEL_ERSPAN_OPT) {
+ inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_ERSPAN);
+ if (!inner)
+ goto failure;
+ if (nla_put_be32(skb, NFTA_TUNNEL_KEY_ERSPAN_VERSION,
+ htonl(opts->u.erspan.version)))
+ goto inner_failure;
switch (opts->u.erspan.version) {
case ERSPAN_VERSION:
if (nla_put_be32(skb, NFTA_TUNNEL_KEY_ERSPAN_V1_INDEX,
opts->u.erspan.u.index))
- return -1;
+ goto inner_failure;
break;
case ERSPAN_VERSION2:
if (nla_put_u8(skb, NFTA_TUNNEL_KEY_ERSPAN_V2_HWID,
get_hwid(&opts->u.erspan.u.md2)) ||
nla_put_u8(skb, NFTA_TUNNEL_KEY_ERSPAN_V2_DIR,
opts->u.erspan.u.md2.dir))
- return -1;
+ goto inner_failure;
break;
}
+ nla_nest_end(skb, inner);
}
nla_nest_end(skb, nest);
-
return 0;
+
+inner_failure:
+ nla_nest_cancel(skb, inner);
+failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
}
static int nft_tunnel_ports_dump(struct sk_buff *skb,
struct ip_tunnel_info *info)
{
- if (nla_put_be16(skb, NFTA_TUNNEL_KEY_SPORT, htons(info->key.tp_src)) < 0 ||
- nla_put_be16(skb, NFTA_TUNNEL_KEY_DPORT, htons(info->key.tp_dst)) < 0)
+ if (nla_put_be16(skb, NFTA_TUNNEL_KEY_SPORT, info->key.tp_src) < 0 ||
+ nla_put_be16(skb, NFTA_TUNNEL_KEY_DPORT, info->key.tp_dst) < 0)
return -1;
return 0;
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index ce70c2576bb2..e27c6c5ba9df 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -939,14 +939,14 @@ EXPORT_SYMBOL(xt_check_entry_offsets);
*
* @size: number of entries
*
- * Return: NULL or kmalloc'd or vmalloc'd array
+ * Return: NULL or zeroed kmalloc'd or vmalloc'd array
*/
unsigned int *xt_alloc_entry_offsets(unsigned int size)
{
if (size > XT_MAX_TABLE_SIZE / sizeof(unsigned int))
return NULL;
- return kvmalloc_array(size, sizeof(unsigned int), GFP_KERNEL | __GFP_ZERO);
+ return kvcalloc(size, sizeof(unsigned int), GFP_KERNEL);
}
EXPORT_SYMBOL(xt_alloc_entry_offsets);
diff --git a/net/netfilter/xt_HMARK.c b/net/netfilter/xt_HMARK.c
index be7798a50546..713fb38541df 100644
--- a/net/netfilter/xt_HMARK.c
+++ b/net/netfilter/xt_HMARK.c
@@ -239,11 +239,7 @@ static int get_inner_hdr(const struct sk_buff *skb, int iphsz, int *nhoff)
return 0;
/* Error message? */
- if (icmph->type != ICMP_DEST_UNREACH &&
- icmph->type != ICMP_SOURCE_QUENCH &&
- icmph->type != ICMP_TIME_EXCEEDED &&
- icmph->type != ICMP_PARAMETERPROB &&
- icmph->type != ICMP_REDIRECT)
+ if (!icmp_is_err(icmph->type))
return 0;
*nhoff += iphsz + sizeof(_ih);
diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c
index 9cec9eae556a..f56d3ed93e56 100644
--- a/net/netfilter/xt_IDLETIMER.c
+++ b/net/netfilter/xt_IDLETIMER.c
@@ -283,7 +283,7 @@ static int __init idletimer_tg_init(void)
idletimer_tg_kobj = &idletimer_tg_device->kobj;
- err = xt_register_target(&idletimer_tg);
+ err = xt_register_target(&idletimer_tg);
if (err < 0) {
pr_debug("couldn't register xt target\n");
goto out_dev;
diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c
index 2236455b10a3..37253d399c6b 100644
--- a/net/netfilter/xt_RATEEST.c
+++ b/net/netfilter/xt_RATEEST.c
@@ -30,7 +30,7 @@ static unsigned int jhash_rnd __read_mostly;
static unsigned int xt_rateest_hash(const char *name)
{
- return jhash(name, FIELD_SIZEOF(struct xt_rateest, name), jhash_rnd) &
+ return jhash(name, sizeof_field(struct xt_rateest, name), jhash_rnd) &
(RATEEST_HSIZE - 1);
}
diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c
index bc6c8ab0fa62..46fcac75f726 100644
--- a/net/netfilter/xt_connlimit.c
+++ b/net/netfilter/xt_connlimit.c
@@ -13,6 +13,8 @@
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/ip.h>
+#include <linux/ipv6.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/netfilter/x_tables.h>
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 2d2691dd51e0..bccd47cd7190 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -34,9 +34,14 @@
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_ipv4/ip_tables.h>
#include <linux/netfilter_ipv6/ip6_tables.h>
-#include <linux/netfilter/xt_hashlimit.h>
#include <linux/mutex.h>
#include <linux/kernel.h>
+#include <uapi/linux/netfilter/xt_hashlimit.h>
+
+#define XT_HASHLIMIT_ALL (XT_HASHLIMIT_HASH_DIP | XT_HASHLIMIT_HASH_DPT | \
+ XT_HASHLIMIT_HASH_SIP | XT_HASHLIMIT_HASH_SPT | \
+ XT_HASHLIMIT_INVERT | XT_HASHLIMIT_BYTES |\
+ XT_HASHLIMIT_RATE_MATCH)
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
@@ -352,21 +357,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg,
return 0;
}
-static bool select_all(const struct xt_hashlimit_htable *ht,
- const struct dsthash_ent *he)
-{
- return true;
-}
-
-static bool select_gc(const struct xt_hashlimit_htable *ht,
- const struct dsthash_ent *he)
-{
- return time_after_eq(jiffies, he->expires);
-}
-
-static void htable_selective_cleanup(struct xt_hashlimit_htable *ht,
- bool (*select)(const struct xt_hashlimit_htable *ht,
- const struct dsthash_ent *he))
+static void htable_selective_cleanup(struct xt_hashlimit_htable *ht, bool select_all)
{
unsigned int i;
@@ -376,7 +367,7 @@ static void htable_selective_cleanup(struct xt_hashlimit_htable *ht,
spin_lock_bh(&ht->lock);
hlist_for_each_entry_safe(dh, n, &ht->hash[i], node) {
- if ((*select)(ht, dh))
+ if (time_after_eq(jiffies, dh->expires) || select_all)
dsthash_free(ht, dh);
}
spin_unlock_bh(&ht->lock);
@@ -390,7 +381,7 @@ static void htable_gc(struct work_struct *work)
ht = container_of(work, struct xt_hashlimit_htable, gc_work.work);
- htable_selective_cleanup(ht, select_gc);
+ htable_selective_cleanup(ht, false);
queue_delayed_work(system_power_efficient_wq,
&ht->gc_work, msecs_to_jiffies(ht->cfg.gc_interval));
@@ -414,7 +405,7 @@ static void htable_destroy(struct xt_hashlimit_htable *hinfo)
{
cancel_delayed_work_sync(&hinfo->gc_work);
htable_remove_proc_entry(hinfo);
- htable_selective_cleanup(hinfo, select_all);
+ htable_selective_cleanup(hinfo, true);
kfree(hinfo->name);
vfree(hinfo);
}
diff --git a/net/netfilter/xt_nfacct.c b/net/netfilter/xt_nfacct.c
index d0ab1adf5bff..5aab6df74e0f 100644
--- a/net/netfilter/xt_nfacct.c
+++ b/net/netfilter/xt_nfacct.c
@@ -54,25 +54,39 @@ nfacct_mt_destroy(const struct xt_mtdtor_param *par)
nfnl_acct_put(info->nfacct);
}
-static struct xt_match nfacct_mt_reg __read_mostly = {
- .name = "nfacct",
- .family = NFPROTO_UNSPEC,
- .checkentry = nfacct_mt_checkentry,
- .match = nfacct_mt,
- .destroy = nfacct_mt_destroy,
- .matchsize = sizeof(struct xt_nfacct_match_info),
- .usersize = offsetof(struct xt_nfacct_match_info, nfacct),
- .me = THIS_MODULE,
+static struct xt_match nfacct_mt_reg[] __read_mostly = {
+ {
+ .name = "nfacct",
+ .revision = 0,
+ .family = NFPROTO_UNSPEC,
+ .checkentry = nfacct_mt_checkentry,
+ .match = nfacct_mt,
+ .destroy = nfacct_mt_destroy,
+ .matchsize = sizeof(struct xt_nfacct_match_info),
+ .usersize = offsetof(struct xt_nfacct_match_info, nfacct),
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "nfacct",
+ .revision = 1,
+ .family = NFPROTO_UNSPEC,
+ .checkentry = nfacct_mt_checkentry,
+ .match = nfacct_mt,
+ .destroy = nfacct_mt_destroy,
+ .matchsize = sizeof(struct xt_nfacct_match_info_v1),
+ .usersize = offsetof(struct xt_nfacct_match_info_v1, nfacct),
+ .me = THIS_MODULE,
+ },
};
static int __init nfacct_mt_init(void)
{
- return xt_register_match(&nfacct_mt_reg);
+ return xt_register_matches(nfacct_mt_reg, ARRAY_SIZE(nfacct_mt_reg));
}
static void __exit nfacct_mt_exit(void)
{
- xt_unregister_match(&nfacct_mt_reg);
+ xt_unregister_matches(nfacct_mt_reg, ARRAY_SIZE(nfacct_mt_reg));
}
module_init(nfacct_mt_init);
diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c
index ead7c6022208..ec6ed6fda96c 100644
--- a/net/netfilter/xt_physdev.c
+++ b/net/netfilter/xt_physdev.c
@@ -5,12 +5,13 @@
/* (C) 2001-2003 Bart De Schuymer <bdschuym@pandora.be>
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/if.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/netfilter_bridge.h>
-#include <linux/netfilter/xt_physdev.h>
#include <linux/netfilter/x_tables.h>
-#include <net/netfilter/br_netfilter.h>
+#include <uapi/linux/netfilter/xt_physdev.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>");
@@ -101,11 +102,9 @@ static int physdev_mt_check(const struct xt_mtchk_param *par)
if (info->bitmask & (XT_PHYSDEV_OP_OUT | XT_PHYSDEV_OP_ISOUT) &&
(!(info->bitmask & XT_PHYSDEV_OP_BRIDGED) ||
info->invert & XT_PHYSDEV_OP_BRIDGED) &&
- par->hook_mask & ((1 << NF_INET_LOCAL_OUT) |
- (1 << NF_INET_FORWARD) | (1 << NF_INET_POST_ROUTING))) {
+ par->hook_mask & (1 << NF_INET_LOCAL_OUT)) {
pr_info_ratelimited("--physdev-out and --physdev-is-out only supported in the FORWARD and POSTROUTING chains with bridged traffic\n");
- if (par->hook_mask & (1 << NF_INET_LOCAL_OUT))
- return -EINVAL;
+ return -EINVAL;
}
if (!brnf_probed) {
diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c
index 781e0b482189..0a9708004e20 100644
--- a/net/netfilter/xt_recent.c
+++ b/net/netfilter/xt_recent.c
@@ -103,7 +103,7 @@ static DEFINE_SPINLOCK(recent_lock);
static DEFINE_MUTEX(recent_mutex);
#ifdef CONFIG_PROC_FS
-static const struct file_operations recent_mt_fops;
+static const struct proc_ops recent_mt_proc_ops;
#endif
static u_int32_t hash_rnd __read_mostly;
@@ -405,7 +405,7 @@ static int recent_mt_check(const struct xt_mtchk_param *par,
goto out;
}
pde = proc_create_data(t->name, ip_list_perms, recent_net->xt_recent,
- &recent_mt_fops, t);
+ &recent_mt_proc_ops, t);
if (pde == NULL) {
recent_table_free(t);
ret = -ENOMEM;
@@ -616,13 +616,12 @@ recent_mt_proc_write(struct file *file, const char __user *input,
return size + 1;
}
-static const struct file_operations recent_mt_fops = {
- .open = recent_seq_open,
- .read = seq_read,
- .write = recent_mt_proc_write,
- .release = seq_release_private,
- .owner = THIS_MODULE,
- .llseek = seq_lseek,
+static const struct proc_ops recent_mt_proc_ops = {
+ .proc_open = recent_seq_open,
+ .proc_read = seq_read,
+ .proc_write = recent_mt_proc_write,
+ .proc_release = seq_release_private,
+ .proc_lseek = seq_lseek,
};
static int __net_init recent_proc_net_init(struct net *net)
diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c
index ecbfa291fb70..731bc2cafae4 100644
--- a/net/netfilter/xt_set.c
+++ b/net/netfilter/xt_set.c
@@ -14,7 +14,6 @@
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter/ipset/ip_set.h>
-#include <linux/netfilter/ipset/ip_set_timeout.h>
#include <uapi/linux/netfilter/xt_set.h>
MODULE_LICENSE("GPL");
diff --git a/net/netfilter/xt_time.c b/net/netfilter/xt_time.c
index 8dbb4d48f2ed..67cb98489415 100644
--- a/net/netfilter/xt_time.c
+++ b/net/netfilter/xt_time.c
@@ -77,12 +77,12 @@ static inline bool is_leap(unsigned int y)
* This is done in three separate functions so that the most expensive
* calculations are done last, in case a "simple match" can be found earlier.
*/
-static inline unsigned int localtime_1(struct xtm *r, time_t time)
+static inline unsigned int localtime_1(struct xtm *r, time64_t time)
{
unsigned int v, w;
/* Each day has 86400s, so finding the hour/minute is actually easy. */
- v = time % SECONDS_PER_DAY;
+ div_u64_rem(time, SECONDS_PER_DAY, &v);
r->second = v % 60;
w = v / 60;
r->minute = w % 60;
@@ -90,13 +90,13 @@ static inline unsigned int localtime_1(struct xtm *r, time_t time)
return v;
}
-static inline void localtime_2(struct xtm *r, time_t time)
+static inline void localtime_2(struct xtm *r, time64_t time)
{
/*
* Here comes the rest (weekday, monthday). First, divide the SSTE
* by seconds-per-day to get the number of _days_ since the epoch.
*/
- r->dse = time / 86400;
+ r->dse = div_u64(time, SECONDS_PER_DAY);
/*
* 1970-01-01 (w=0) was a Thursday (4).
@@ -105,7 +105,7 @@ static inline void localtime_2(struct xtm *r, time_t time)
r->weekday = (4 + r->dse - 1) % 7 + 1;
}
-static void localtime_3(struct xtm *r, time_t time)
+static void localtime_3(struct xtm *r, time64_t time)
{
unsigned int year, i, w = r->dse;
@@ -160,7 +160,7 @@ time_mt(const struct sk_buff *skb, struct xt_action_param *par)
const struct xt_time_info *info = par->matchinfo;
unsigned int packet_time;
struct xtm current_time;
- s64 stamp;
+ time64_t stamp;
/*
* We need real time here, but we can neither use skb->tstamp
@@ -173,14 +173,14 @@ time_mt(const struct sk_buff *skb, struct xt_action_param *par)
* 1. match before 13:00
* 2. match after 13:00
*
- * If you match against processing time (get_seconds) it
+ * If you match against processing time (ktime_get_real_seconds) it
* may happen that the same packet matches both rules if
* it arrived at the right moment before 13:00, so it would be
* better to check skb->tstamp and set it via __net_timestamp()
* if needed. This however breaks outgoing packets tx timestamp,
* and causes them to get delayed forever by fq packet scheduler.
*/
- stamp = get_seconds();
+ stamp = ktime_get_real_seconds();
if (info->flags & XT_TIME_LOCAL_TZ)
/* Adjust for local timezone */
@@ -193,6 +193,9 @@ time_mt(const struct sk_buff *skb, struct xt_action_param *par)
* - 'now' is in the weekday mask
* - 'now' is in the daytime range time_start..time_end
* (and by default, libxt_time will set these so as to match)
+ *
+ * note: info->date_start/stop are unsigned 32-bit values that
+ * can hold values beyond y2038, but not after y2106.
*/
if (stamp < info->date_start || stamp > info->date_stop)
diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c
index 2b0ef55cf89e..409a3ae47ce2 100644
--- a/net/netlabel/netlabel_kapi.c
+++ b/net/netlabel/netlabel_kapi.c
@@ -607,7 +607,7 @@ catmap_getnode_alloc:
*/
int netlbl_catmap_walk(struct netlbl_lsm_catmap *catmap, u32 offset)
{
- struct netlbl_lsm_catmap *iter = catmap;
+ struct netlbl_lsm_catmap *iter;
u32 idx;
u32 bit;
NETLBL_CATMAP_MAPTYPE bitmap;
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 90b2ab9dd449..4e31721e7293 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -2755,7 +2755,7 @@ static int __init netlink_proto_init(void)
if (err != 0)
goto out;
- BUILD_BUG_ON(sizeof(struct netlink_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb));
+ BUILD_BUG_ON(sizeof(struct netlink_skb_parms) > sizeof_field(struct sk_buff, cb));
nl_table = kcalloc(MAX_LINKS, sizeof(*nl_table), GFP_KERNEL);
if (!nl_table)
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index efccd1ac9a66..0522b2b1fd95 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -458,10 +458,63 @@ void *genlmsg_put(struct sk_buff *skb, u32 portid, u32 seq,
}
EXPORT_SYMBOL(genlmsg_put);
+static struct genl_dumpit_info *genl_dumpit_info_alloc(void)
+{
+ return kmalloc(sizeof(struct genl_dumpit_info), GFP_KERNEL);
+}
+
+static void genl_dumpit_info_free(const struct genl_dumpit_info *info)
+{
+ kfree(info);
+}
+
+static struct nlattr **
+genl_family_rcv_msg_attrs_parse(const struct genl_family *family,
+ struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack,
+ const struct genl_ops *ops,
+ int hdrlen,
+ enum genl_validate_flags no_strict_flag,
+ bool parallel)
+{
+ enum netlink_validation validate = ops->validate & no_strict_flag ?
+ NL_VALIDATE_LIBERAL :
+ NL_VALIDATE_STRICT;
+ struct nlattr **attrbuf;
+ int err;
+
+ if (!family->maxattr)
+ return NULL;
+
+ if (parallel) {
+ attrbuf = kmalloc_array(family->maxattr + 1,
+ sizeof(struct nlattr *), GFP_KERNEL);
+ if (!attrbuf)
+ return ERR_PTR(-ENOMEM);
+ } else {
+ attrbuf = family->attrbuf;
+ }
+
+ err = __nlmsg_parse(nlh, hdrlen, attrbuf, family->maxattr,
+ family->policy, validate, extack);
+ if (err && parallel) {
+ kfree(attrbuf);
+ return ERR_PTR(err);
+ }
+ return attrbuf;
+}
+
+static void genl_family_rcv_msg_attrs_free(const struct genl_family *family,
+ struct nlattr **attrbuf,
+ bool parallel)
+{
+ if (parallel)
+ kfree(attrbuf);
+}
+
static int genl_lock_start(struct netlink_callback *cb)
{
- /* our ops are always const - netlink API doesn't propagate that */
- const struct genl_ops *ops = cb->data;
+ const struct genl_ops *ops = genl_dumpit_info(cb)->ops;
int rc = 0;
if (ops->start) {
@@ -474,8 +527,7 @@ static int genl_lock_start(struct netlink_callback *cb)
static int genl_lock_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
{
- /* our ops are always const - netlink API doesn't propagate that */
- const struct genl_ops *ops = cb->data;
+ const struct genl_ops *ops = genl_dumpit_info(cb)->ops;
int rc;
genl_lock();
@@ -486,8 +538,8 @@ static int genl_lock_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
static int genl_lock_done(struct netlink_callback *cb)
{
- /* our ops are always const - netlink API doesn't propagate that */
- const struct genl_ops *ops = cb->data;
+ const struct genl_dumpit_info *info = genl_dumpit_info(cb);
+ const struct genl_ops *ops = info->ops;
int rc = 0;
if (ops->done) {
@@ -495,120 +547,111 @@ static int genl_lock_done(struct netlink_callback *cb)
rc = ops->done(cb);
genl_unlock();
}
+ genl_family_rcv_msg_attrs_free(info->family, info->attrs, true);
+ genl_dumpit_info_free(info);
return rc;
}
-static int genl_family_rcv_msg(const struct genl_family *family,
- struct sk_buff *skb,
- struct nlmsghdr *nlh,
- struct netlink_ext_ack *extack)
+static int genl_parallel_done(struct netlink_callback *cb)
{
- const struct genl_ops *ops;
- struct net *net = sock_net(skb->sk);
- struct genl_info info;
- struct genlmsghdr *hdr = nlmsg_data(nlh);
- struct nlattr **attrbuf;
- int hdrlen, err;
+ const struct genl_dumpit_info *info = genl_dumpit_info(cb);
+ const struct genl_ops *ops = info->ops;
+ int rc = 0;
- /* this family doesn't exist in this netns */
- if (!family->netnsok && !net_eq(net, &init_net))
- return -ENOENT;
+ if (ops->done)
+ rc = ops->done(cb);
+ genl_family_rcv_msg_attrs_free(info->family, info->attrs, true);
+ genl_dumpit_info_free(info);
+ return rc;
+}
- hdrlen = GENL_HDRLEN + family->hdrsize;
- if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen))
- return -EINVAL;
+static int genl_family_rcv_msg_dumpit(const struct genl_family *family,
+ struct sk_buff *skb,
+ struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack,
+ const struct genl_ops *ops,
+ int hdrlen, struct net *net)
+{
+ struct genl_dumpit_info *info;
+ struct nlattr **attrs = NULL;
+ int err;
- ops = genl_get_cmd(hdr->cmd, family);
- if (ops == NULL)
+ if (!ops->dumpit)
return -EOPNOTSUPP;
- if ((ops->flags & GENL_ADMIN_PERM) &&
- !netlink_capable(skb, CAP_NET_ADMIN))
- return -EPERM;
-
- if ((ops->flags & GENL_UNS_ADMIN_PERM) &&
- !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
- return -EPERM;
-
- if ((nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP) {
- int rc;
-
- if (ops->dumpit == NULL)
- return -EOPNOTSUPP;
-
- if (!(ops->validate & GENL_DONT_VALIDATE_DUMP)) {
- int hdrlen = GENL_HDRLEN + family->hdrsize;
-
- if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen))
- return -EINVAL;
+ if (ops->validate & GENL_DONT_VALIDATE_DUMP)
+ goto no_attrs;
- if (family->maxattr) {
- unsigned int validate = NL_VALIDATE_STRICT;
-
- if (ops->validate &
- GENL_DONT_VALIDATE_DUMP_STRICT)
- validate = NL_VALIDATE_LIBERAL;
- rc = __nla_validate(nlmsg_attrdata(nlh, hdrlen),
- nlmsg_attrlen(nlh, hdrlen),
- family->maxattr,
- family->policy,
- validate, extack);
- if (rc)
- return rc;
- }
- }
+ if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen))
+ return -EINVAL;
- if (!family->parallel_ops) {
- struct netlink_dump_control c = {
- .module = family->module,
- /* we have const, but the netlink API doesn't */
- .data = (void *)ops,
- .start = genl_lock_start,
- .dump = genl_lock_dumpit,
- .done = genl_lock_done,
- };
+ attrs = genl_family_rcv_msg_attrs_parse(family, nlh, extack,
+ ops, hdrlen,
+ GENL_DONT_VALIDATE_DUMP_STRICT,
+ true);
+ if (IS_ERR(attrs))
+ return PTR_ERR(attrs);
+
+no_attrs:
+ /* Allocate dumpit info. It is going to be freed by done() callback. */
+ info = genl_dumpit_info_alloc();
+ if (!info) {
+ genl_family_rcv_msg_attrs_free(family, attrs, true);
+ return -ENOMEM;
+ }
- genl_unlock();
- rc = __netlink_dump_start(net->genl_sock, skb, nlh, &c);
- genl_lock();
+ info->family = family;
+ info->ops = ops;
+ info->attrs = attrs;
- } else {
- struct netlink_dump_control c = {
- .module = family->module,
- .start = ops->start,
- .dump = ops->dumpit,
- .done = ops->done,
- };
+ if (!family->parallel_ops) {
+ struct netlink_dump_control c = {
+ .module = family->module,
+ .data = info,
+ .start = genl_lock_start,
+ .dump = genl_lock_dumpit,
+ .done = genl_lock_done,
+ };
- rc = __netlink_dump_start(net->genl_sock, skb, nlh, &c);
- }
+ genl_unlock();
+ err = __netlink_dump_start(net->genl_sock, skb, nlh, &c);
+ genl_lock();
- return rc;
+ } else {
+ struct netlink_dump_control c = {
+ .module = family->module,
+ .data = info,
+ .start = ops->start,
+ .dump = ops->dumpit,
+ .done = genl_parallel_done,
+ };
+
+ err = __netlink_dump_start(net->genl_sock, skb, nlh, &c);
}
- if (ops->doit == NULL)
- return -EOPNOTSUPP;
-
- if (family->maxattr && family->parallel_ops) {
- attrbuf = kmalloc_array(family->maxattr + 1,
- sizeof(struct nlattr *),
- GFP_KERNEL);
- if (attrbuf == NULL)
- return -ENOMEM;
- } else
- attrbuf = family->attrbuf;
+ return err;
+}
- if (attrbuf) {
- enum netlink_validation validate = NL_VALIDATE_STRICT;
+static int genl_family_rcv_msg_doit(const struct genl_family *family,
+ struct sk_buff *skb,
+ struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack,
+ const struct genl_ops *ops,
+ int hdrlen, struct net *net)
+{
+ struct nlattr **attrbuf;
+ struct genl_info info;
+ int err;
- if (ops->validate & GENL_DONT_VALIDATE_STRICT)
- validate = NL_VALIDATE_LIBERAL;
+ if (!ops->doit)
+ return -EOPNOTSUPP;
- err = __nlmsg_parse(nlh, hdrlen, attrbuf, family->maxattr,
- family->policy, validate, extack);
- if (err < 0)
- goto out;
- }
+ attrbuf = genl_family_rcv_msg_attrs_parse(family, nlh, extack,
+ ops, hdrlen,
+ GENL_DONT_VALIDATE_STRICT,
+ family->parallel_ops);
+ if (IS_ERR(attrbuf))
+ return PTR_ERR(attrbuf);
info.snd_seq = nlh->nlmsg_seq;
info.snd_portid = NETLINK_CB(skb).portid;
@@ -632,12 +675,49 @@ static int genl_family_rcv_msg(const struct genl_family *family,
family->post_doit(ops, skb, &info);
out:
- if (family->parallel_ops)
- kfree(attrbuf);
+ genl_family_rcv_msg_attrs_free(family, attrbuf, family->parallel_ops);
return err;
}
+static int genl_family_rcv_msg(const struct genl_family *family,
+ struct sk_buff *skb,
+ struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ const struct genl_ops *ops;
+ struct net *net = sock_net(skb->sk);
+ struct genlmsghdr *hdr = nlmsg_data(nlh);
+ int hdrlen;
+
+ /* this family doesn't exist in this netns */
+ if (!family->netnsok && !net_eq(net, &init_net))
+ return -ENOENT;
+
+ hdrlen = GENL_HDRLEN + family->hdrsize;
+ if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen))
+ return -EINVAL;
+
+ ops = genl_get_cmd(hdr->cmd, family);
+ if (ops == NULL)
+ return -EOPNOTSUPP;
+
+ if ((ops->flags & GENL_ADMIN_PERM) &&
+ !netlink_capable(skb, CAP_NET_ADMIN))
+ return -EPERM;
+
+ if ((ops->flags & GENL_UNS_ADMIN_PERM) &&
+ !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ if ((nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP)
+ return genl_family_rcv_msg_dumpit(family, skb, nlh, extack,
+ ops, hdrlen, net);
+ else
+ return genl_family_rcv_msg_doit(family, skb, nlh, extack,
+ ops, hdrlen, net);
+}
+
static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
@@ -1088,25 +1168,6 @@ problem:
subsys_initcall(genl_init);
-/**
- * genl_family_attrbuf - return family's attrbuf
- * @family: the family
- *
- * Return the family's attrbuf, while validating that it's
- * actually valid to access it.
- *
- * You cannot use this function with a family that has parallel_ops
- * and you can only use it within (pre/post) doit/dumpit callbacks.
- */
-struct nlattr **genl_family_attrbuf(const struct genl_family *family)
-{
- if (!WARN_ON(family->parallel_ops))
- lockdep_assert_held(&genl_mutex);
-
- return family->attrbuf;
-}
-EXPORT_SYMBOL(genl_family_attrbuf);
-
static int genlmsg_mcast(struct sk_buff *skb, u32 portid, unsigned long group,
gfp_t flags)
{
diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c
index c4f54ad2b98a..58d5373c513c 100644
--- a/net/netrom/af_netrom.c
+++ b/net/netrom/af_netrom.c
@@ -64,28 +64,6 @@ static DEFINE_SPINLOCK(nr_list_lock);
static const struct proto_ops nr_proto_ops;
/*
- * NETROM network devices are virtual network devices encapsulating NETROM
- * frames into AX.25 which will be sent through an AX.25 device, so form a
- * special "super class" of normal net devices; split their locks off into a
- * separate class since they always nest.
- */
-static struct lock_class_key nr_netdev_xmit_lock_key;
-static struct lock_class_key nr_netdev_addr_lock_key;
-
-static void nr_set_lockdep_one(struct net_device *dev,
- struct netdev_queue *txq,
- void *_unused)
-{
- lockdep_set_class(&txq->_xmit_lock, &nr_netdev_xmit_lock_key);
-}
-
-static void nr_set_lockdep_key(struct net_device *dev)
-{
- lockdep_set_class(&dev->addr_list_lock, &nr_netdev_addr_lock_key);
- netdev_for_each_tx_queue(dev, nr_set_lockdep_one, NULL);
-}
-
-/*
* Socket removal during an interrupt is now safe.
*/
static void nr_remove_socket(struct sock *sk)
@@ -1414,7 +1392,6 @@ static int __init nr_proto_init(void)
free_netdev(dev);
goto fail;
}
- nr_set_lockdep_key(dev);
dev_nr[i] = dev;
}
diff --git a/net/nfc/hci/Kconfig b/net/nfc/hci/Kconfig
index 97bd3a2c5c98..4822d6f46947 100644
--- a/net/nfc/hci/Kconfig
+++ b/net/nfc/hci/Kconfig
@@ -1,12 +1,12 @@
# SPDX-License-Identifier: GPL-2.0-only
config NFC_HCI
- depends on NFC
- tristate "NFC HCI implementation"
- default n
- help
- Say Y here if you want to build support for a kernel NFC HCI
- implementation. This is mostly needed for devices that only process
- HCI frames, like for example the NXP pn544.
+ depends on NFC
+ tristate "NFC HCI implementation"
+ default n
+ help
+ Say Y here if you want to build support for a kernel NFC HCI
+ implementation. This is mostly needed for devices that only process
+ HCI frames, like for example the NXP pn544.
config NFC_SHDLC
depends on NFC_HCI
diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c
index 9b8742947aff..28604414dec1 100644
--- a/net/nfc/llcp_sock.c
+++ b/net/nfc/llcp_sock.c
@@ -107,9 +107,14 @@ static int llcp_sock_bind(struct socket *sock, struct sockaddr *addr, int alen)
llcp_sock->service_name = kmemdup(llcp_addr.service_name,
llcp_sock->service_name_len,
GFP_KERNEL);
-
+ if (!llcp_sock->service_name) {
+ ret = -ENOMEM;
+ goto put_dev;
+ }
llcp_sock->ssap = nfc_llcp_get_sdp_ssap(local, llcp_sock);
if (llcp_sock->ssap == LLCP_SAP_MAX) {
+ kfree(llcp_sock->service_name);
+ llcp_sock->service_name = NULL;
ret = -EADDRINUSE;
goto put_dev;
}
@@ -549,11 +554,11 @@ static __poll_t llcp_sock_poll(struct file *file, struct socket *sock,
if (sk->sk_state == LLCP_LISTEN)
return llcp_accept_poll(sk);
- if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
+ if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
mask |= EPOLLERR |
(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
- if (!skb_queue_empty(&sk->sk_receive_queue))
+ if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
mask |= EPOLLIN | EPOLLRDNORM;
if (sk->sk_state == LLCP_CLOSED)
@@ -1004,10 +1009,13 @@ static int llcp_sock_create(struct net *net, struct socket *sock,
sock->type != SOCK_RAW)
return -ESOCKTNOSUPPORT;
- if (sock->type == SOCK_RAW)
+ if (sock->type == SOCK_RAW) {
+ if (!capable(CAP_NET_RAW))
+ return -EPERM;
sock->ops = &llcp_rawsock_ops;
- else
+ } else {
sock->ops = &llcp_sock_ops;
+ }
sk = nfc_llcp_sock_alloc(sock, sock->type, GFP_ATOMIC, kern);
if (sk == NULL)
diff --git a/net/nfc/nci/spi.c b/net/nfc/nci/spi.c
index 9dd8a1096916..7d8e10e27c20 100644
--- a/net/nfc/nci/spi.c
+++ b/net/nfc/nci/spi.c
@@ -44,7 +44,8 @@ static int __nci_spi_send(struct nci_spi *nspi, struct sk_buff *skb,
t.len = 0;
}
t.cs_change = cs_change;
- t.delay_usecs = nspi->xfer_udelay;
+ t.delay.value = nspi->xfer_udelay;
+ t.delay.unit = SPI_DELAY_UNIT_USECS;
t.speed_hz = nspi->xfer_speed_hz;
spi_message_init(&m);
@@ -216,7 +217,8 @@ static struct sk_buff *__nci_spi_read(struct nci_spi *nspi)
rx.rx_buf = skb_put(skb, rx_len);
rx.len = rx_len;
rx.cs_change = 0;
- rx.delay_usecs = nspi->xfer_udelay;
+ rx.delay.value = nspi->xfer_udelay;
+ rx.delay.unit = SPI_DELAY_UNIT_USECS;
rx.speed_hz = nspi->xfer_speed_hz;
spi_message_add_tail(&rx, &m);
diff --git a/net/nfc/nci/uart.c b/net/nfc/nci/uart.c
index 78fe622eba65..11b554ce07ff 100644
--- a/net/nfc/nci/uart.c
+++ b/net/nfc/nci/uart.c
@@ -346,7 +346,7 @@ static int nci_uart_default_recv_buf(struct nci_uart *nu, const u8 *data,
nu->rx_packet_len = -1;
nu->rx_skb = nci_skb_alloc(nu->ndev,
NCI_MAX_PACKET_SIZE,
- GFP_KERNEL);
+ GFP_ATOMIC);
if (!nu->rx_skb)
return -ENOMEM;
}
diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c
index ea64c90b14e8..eee0dddb7749 100644
--- a/net/nfc/netlink.c
+++ b/net/nfc/netlink.c
@@ -102,22 +102,14 @@ nla_put_failure:
static struct nfc_dev *__get_device_from_cb(struct netlink_callback *cb)
{
- struct nlattr **attrbuf = genl_family_attrbuf(&nfc_genl_family);
+ const struct genl_dumpit_info *info = genl_dumpit_info(cb);
struct nfc_dev *dev;
- int rc;
u32 idx;
- rc = nlmsg_parse_deprecated(cb->nlh,
- GENL_HDRLEN + nfc_genl_family.hdrsize,
- attrbuf, nfc_genl_family.maxattr,
- nfc_genl_policy, NULL);
- if (rc < 0)
- return ERR_PTR(rc);
-
- if (!attrbuf[NFC_ATTR_DEVICE_INDEX])
+ if (!info->attrs[NFC_ATTR_DEVICE_INDEX])
return ERR_PTR(-EINVAL);
- idx = nla_get_u32(attrbuf[NFC_ATTR_DEVICE_INDEX]);
+ idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]);
dev = nfc_get_device(idx);
if (!dev)
@@ -970,7 +962,8 @@ static int nfc_genl_dep_link_down(struct sk_buff *skb, struct genl_info *info)
int rc;
u32 idx;
- if (!info->attrs[NFC_ATTR_DEVICE_INDEX])
+ if (!info->attrs[NFC_ATTR_DEVICE_INDEX] ||
+ !info->attrs[NFC_ATTR_TARGET_INDEX])
return -EINVAL;
idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]);
@@ -1018,7 +1011,8 @@ static int nfc_genl_llc_get_params(struct sk_buff *skb, struct genl_info *info)
struct sk_buff *msg = NULL;
u32 idx;
- if (!info->attrs[NFC_ATTR_DEVICE_INDEX])
+ if (!info->attrs[NFC_ATTR_DEVICE_INDEX] ||
+ !info->attrs[NFC_ATTR_FIRMWARE_NAME])
return -EINVAL;
idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]);
@@ -1097,7 +1091,6 @@ static int nfc_genl_llc_set_params(struct sk_buff *skb, struct genl_info *info)
local = nfc_llcp_find_local(dev);
if (!local) {
- nfc_put_device(dev);
rc = -ENODEV;
goto exit;
}
@@ -1157,7 +1150,6 @@ static int nfc_genl_llc_sdreq(struct sk_buff *skb, struct genl_info *info)
local = nfc_llcp_find_local(dev);
if (!local) {
- nfc_put_device(dev);
rc = -ENODEV;
goto exit;
}
@@ -1695,7 +1687,8 @@ static const struct genl_ops nfc_genl_ops[] = {
},
{
.cmd = NFC_CMD_GET_TARGET,
- .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .validate = GENL_DONT_VALIDATE_STRICT |
+ GENL_DONT_VALIDATE_DUMP_STRICT,
.dumpit = nfc_genl_dump_targets,
.done = nfc_genl_dump_targets_done,
},
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 3572e11b6f21..7fbfe2adfffa 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -161,14 +161,17 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
const struct nlattr *attr, int len);
static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key,
- const struct ovs_action_push_mpls *mpls)
+ __be32 mpls_lse, __be16 mpls_ethertype, __u16 mac_len)
{
int err;
- err = skb_mpls_push(skb, mpls->mpls_lse, mpls->mpls_ethertype);
+ err = skb_mpls_push(skb, mpls_lse, mpls_ethertype, mac_len, !!mac_len);
if (err)
return err;
+ if (!mac_len)
+ key->mac_proto = MAC_PROTO_NONE;
+
invalidate_flow_key(key);
return 0;
}
@@ -178,10 +181,14 @@ static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key,
{
int err;
- err = skb_mpls_pop(skb, ethertype);
+ err = skb_mpls_pop(skb, ethertype, skb->mac_len,
+ ovs_key_mac_proto(key) == MAC_PROTO_ETHERNET);
if (err)
return err;
+ if (ethertype == htons(ETH_P_TEB))
+ key->mac_proto = MAC_PROTO_ETHERNET;
+
invalidate_flow_key(key);
return 0;
}
@@ -199,7 +206,7 @@ static int set_mpls(struct sk_buff *skb, struct sw_flow_key *flow_key,
if (err)
return err;
- flow_key->mpls.top_lse = lse;
+ flow_key->mpls.lse[0] = lse;
return 0;
}
@@ -1226,10 +1233,24 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
execute_hash(skb, key, a);
break;
- case OVS_ACTION_ATTR_PUSH_MPLS:
- err = push_mpls(skb, key, nla_data(a));
+ case OVS_ACTION_ATTR_PUSH_MPLS: {
+ struct ovs_action_push_mpls *mpls = nla_data(a);
+
+ err = push_mpls(skb, key, mpls->mpls_lse,
+ mpls->mpls_ethertype, skb->mac_len);
break;
+ }
+ case OVS_ACTION_ATTR_ADD_MPLS: {
+ struct ovs_action_add_mpls *mpls = nla_data(a);
+ __u16 mac_len = 0;
+
+ if (mpls->tun_flags & OVS_MPLS_L3_TUNNEL_FLAG_MASK)
+ mac_len = skb->mac_len;
+ err = push_mpls(skb, key, mpls->mpls_lse,
+ mpls->mpls_ethertype, mac_len);
+ break;
+ }
case OVS_ACTION_ATTR_POP_MPLS:
err = pop_mpls(skb, key, nla_get_be16(a));
break;
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 848c6eb55064..e726159cfcfa 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -67,6 +67,7 @@ struct ovs_conntrack_info {
struct md_mark mark;
struct md_labels labels;
char timeout[CTNL_TIMEOUT_NAME_MAX];
+ struct nf_ct_timeout *nf_ct_timeout;
#if IS_ENABLED(CONFIG_NF_NAT)
struct nf_nat_range2 range; /* Only present for SRC NAT and DST NAT. */
#endif
@@ -524,6 +525,11 @@ static int handle_fragments(struct net *net, struct sw_flow_key *key,
return -EPFNOSUPPORT;
}
+ /* The key extracted from the fragment that completed this datagram
+ * likely didn't have an L4 header, so regenerate it.
+ */
+ ovs_flow_key_update_l3l4(skb, key);
+
key->ip.frag = OVS_FRAG_TYPE_NONE;
skb_clear_hash(skb);
skb->ignore_df = 1;
@@ -697,6 +703,14 @@ static bool skb_nfct_cached(struct net *net,
if (help && rcu_access_pointer(help->helper) != info->helper)
return false;
}
+ if (info->nf_ct_timeout) {
+ struct nf_conn_timeout *timeout_ext;
+
+ timeout_ext = nf_ct_timeout_find(ct);
+ if (!timeout_ext || info->nf_ct_timeout !=
+ rcu_dereference(timeout_ext->timeout))
+ return false;
+ }
/* Force conntrack entry direction to the current packet? */
if (info->force && CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) {
/* Delete the conntrack entry if confirmed, else just release
@@ -889,6 +903,17 @@ static int ovs_ct_nat(struct net *net, struct sw_flow_key *key,
}
err = ovs_ct_nat_execute(skb, ct, ctinfo, &info->range, maniptype);
+ if (err == NF_ACCEPT &&
+ ct->status & IPS_SRC_NAT && ct->status & IPS_DST_NAT) {
+ if (maniptype == NF_NAT_MANIP_SRC)
+ maniptype = NF_NAT_MANIP_DST;
+ else
+ maniptype = NF_NAT_MANIP_SRC;
+
+ err = ovs_ct_nat_execute(skb, ct, ctinfo, &info->range,
+ maniptype);
+ }
+
/* Mark NAT done if successful and update the flow key. */
if (err == NF_ACCEPT)
ovs_nat_update_key(key, skb, maniptype);
@@ -957,6 +982,8 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
ct = nf_ct_get(skb, &ctinfo);
if (ct) {
+ bool add_helper = false;
+
/* Packets starting a new connection must be NATted before the
* helper, so that the helper knows about the NAT. We enforce
* this by delaying both NAT and helper calls for unconfirmed
@@ -974,16 +1001,17 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
}
/* Userspace may decide to perform a ct lookup without a helper
- * specified followed by a (recirculate and) commit with one.
- * Therefore, for unconfirmed connections which we will commit,
- * we need to attach the helper here.
+ * specified followed by a (recirculate and) commit with one,
+ * or attach a helper in a later commit. Therefore, for
+ * connections which we will commit, we may need to attach
+ * the helper here.
*/
- if (!nf_ct_is_confirmed(ct) && info->commit &&
- info->helper && !nfct_help(ct)) {
+ if (info->commit && info->helper && !nfct_help(ct)) {
int err = __nf_ct_try_assign_helper(ct, info->ct,
GFP_ATOMIC);
if (err)
return err;
+ add_helper = true;
/* helper installed, add seqadj if NAT is required */
if (info->nat && !nfct_seqadj(ct)) {
@@ -993,11 +1021,13 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
}
/* Call the helper only if:
- * - nf_conntrack_in() was executed above ("!cached") for a
- * confirmed connection, or
+ * - nf_conntrack_in() was executed above ("!cached") or a
+ * helper was just attached ("add_helper") for a confirmed
+ * connection, or
* - When committing an unconfirmed connection.
*/
- if ((nf_ct_is_confirmed(ct) ? !cached : info->commit) &&
+ if ((nf_ct_is_confirmed(ct) ? !cached || add_helper :
+ info->commit) &&
ovs_ct_helper(skb, info->family) != NF_ACCEPT) {
return -EINVAL;
}
@@ -1565,7 +1595,7 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info,
case OVS_CT_ATTR_TIMEOUT:
memcpy(info->timeout, nla_data(a), nla_len(a));
if (!memchr(info->timeout, '\0', nla_len(a))) {
- OVS_NLERR(log, "Invalid conntrack helper");
+ OVS_NLERR(log, "Invalid conntrack timeout");
return -EINVAL;
}
break;
@@ -1657,6 +1687,10 @@ int ovs_ct_copy_action(struct net *net, const struct nlattr *attr,
ct_info.timeout))
pr_info_ratelimited("Failed to associated timeout "
"policy `%s'\n", ct_info.timeout);
+ else
+ ct_info.nf_ct_timeout = rcu_dereference(
+ nf_ct_timeout_find(ct_info.ct)->timeout);
+
}
if (helper) {
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index d01410e52097..659c2a790fe7 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -222,14 +222,15 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key)
struct dp_stats_percpu *stats;
u64 *stats_counter;
u32 n_mask_hit;
+ int error;
stats = this_cpu_ptr(dp->stats_percpu);
/* Look up flow. */
- flow = ovs_flow_tbl_lookup_stats(&dp->table, key, &n_mask_hit);
+ flow = ovs_flow_tbl_lookup_stats(&dp->table, key, skb_get_hash(skb),
+ &n_mask_hit);
if (unlikely(!flow)) {
struct dp_upcall_info upcall;
- int error;
memset(&upcall, 0, sizeof(upcall));
upcall.cmd = OVS_PACKET_CMD_MISS;
@@ -246,7 +247,10 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key)
ovs_flow_stats_update(flow, key->tp.flags, skb);
sf_acts = rcu_dereference(flow->sf_acts);
- ovs_execute_actions(dp, skb, sf_acts, key);
+ error = ovs_execute_actions(dp, skb, sf_acts, key);
+ if (unlikely(error))
+ net_dbg_ratelimited("ovs: action execution error on datapath %s: %d\n",
+ ovs_dp_name(dp), error);
stats_counter = &stats->n_hit;
@@ -317,8 +321,7 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb,
}
/* Queue all of the segments. */
- skb = segs;
- do {
+ skb_list_walk_safe(segs, skb, nskb) {
if (gso_type & SKB_GSO_UDP && skb != segs)
key = &later_key;
@@ -326,17 +329,15 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb,
if (err)
break;
- } while ((skb = skb->next));
+ }
/* Free all of the segments. */
- skb = segs;
- do {
- nskb = skb->next;
+ skb_list_walk_safe(segs, skb, nskb) {
if (err)
kfree_skb(skb);
else
consume_skb(skb);
- } while ((skb = nskb));
+ }
return err;
}
@@ -346,7 +347,8 @@ static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info,
size_t size = NLMSG_ALIGN(sizeof(struct ovs_header))
+ nla_total_size(hdrlen) /* OVS_PACKET_ATTR_PACKET */
+ nla_total_size(ovs_key_attr_size()) /* OVS_PACKET_ATTR_KEY */
- + nla_total_size(sizeof(unsigned int)); /* OVS_PACKET_ATTR_LEN */
+ + nla_total_size(sizeof(unsigned int)) /* OVS_PACKET_ATTR_LEN */
+ + nla_total_size(sizeof(u64)); /* OVS_PACKET_ATTR_HASH */
/* OVS_PACKET_ATTR_USERDATA */
if (upcall_info->userdata)
@@ -389,6 +391,7 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
size_t len;
unsigned int hlen;
int err, dp_ifindex;
+ u64 hash;
dp_ifindex = get_dpifindex(dp);
if (!dp_ifindex)
@@ -481,23 +484,30 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
}
/* Add OVS_PACKET_ATTR_MRU */
- if (upcall_info->mru) {
- if (nla_put_u16(user_skb, OVS_PACKET_ATTR_MRU,
- upcall_info->mru)) {
- err = -ENOBUFS;
- goto out;
- }
- pad_packet(dp, user_skb);
+ if (upcall_info->mru &&
+ nla_put_u16(user_skb, OVS_PACKET_ATTR_MRU, upcall_info->mru)) {
+ err = -ENOBUFS;
+ goto out;
}
/* Add OVS_PACKET_ATTR_LEN when packet is truncated */
- if (cutlen > 0) {
- if (nla_put_u32(user_skb, OVS_PACKET_ATTR_LEN,
- skb->len)) {
- err = -ENOBUFS;
- goto out;
- }
- pad_packet(dp, user_skb);
+ if (cutlen > 0 &&
+ nla_put_u32(user_skb, OVS_PACKET_ATTR_LEN, skb->len)) {
+ err = -ENOBUFS;
+ goto out;
+ }
+
+ /* Add OVS_PACKET_ATTR_HASH */
+ hash = skb_get_hash_raw(skb);
+ if (skb->sw_hash)
+ hash |= OVS_PACKET_HASH_SW_BIT;
+
+ if (skb->l4_hash)
+ hash |= OVS_PACKET_HASH_L4_BIT;
+
+ if (nla_put(user_skb, OVS_PACKET_ATTR_HASH, sizeof (u64), &hash)) {
+ err = -ENOBUFS;
+ goto out;
}
/* Only reserve room for attribute header, packet data is added
@@ -539,6 +549,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
struct datapath *dp;
struct vport *input_vport;
u16 mru = 0;
+ u64 hash;
int len;
int err;
bool log = !a[OVS_PACKET_ATTR_PROBE];
@@ -564,6 +575,14 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
}
OVS_CB(packet)->mru = mru;
+ if (a[OVS_PACKET_ATTR_HASH]) {
+ hash = nla_get_u64(a[OVS_PACKET_ATTR_HASH]);
+
+ __skb_set_hash(packet, hash & 0xFFFFFFFFULL,
+ !!(hash & OVS_PACKET_HASH_SW_BIT),
+ !!(hash & OVS_PACKET_HASH_L4_BIT));
+ }
+
/* Build an sw_flow for sending this packet. */
flow = ovs_flow_alloc();
err = PTR_ERR(flow);
@@ -701,9 +720,13 @@ static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts,
{
size_t len = NLMSG_ALIGN(sizeof(struct ovs_header));
- /* OVS_FLOW_ATTR_UFID */
+ /* OVS_FLOW_ATTR_UFID, or unmasked flow key as fallback
+ * see ovs_nla_put_identifier()
+ */
if (sfid && ovs_identifier_is_ufid(sfid))
len += nla_total_size(sfid->ufid_len);
+ else
+ len += nla_total_size(ovs_key_attr_size());
/* OVS_FLOW_ATTR_KEY */
if (!sfid || should_fill_key(sfid, ufid_flags))
@@ -879,7 +902,10 @@ static struct sk_buff *ovs_flow_cmd_build_info(const struct sw_flow *flow,
retval = ovs_flow_cmd_fill_info(flow, dp_ifindex, skb,
info->snd_portid, info->snd_seq, 0,
cmd, ufid_flags);
- BUG_ON(retval < 0);
+ if (WARN_ON_ONCE(retval < 0)) {
+ kfree_skb(skb);
+ skb = ERR_PTR(retval);
+ }
return skb;
}
@@ -1343,7 +1369,10 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
OVS_FLOW_CMD_DEL,
ufid_flags);
rcu_read_unlock();
- BUG_ON(err < 0);
+ if (WARN_ON_ONCE(err < 0)) {
+ kfree_skb(reply);
+ goto out_free;
+ }
ovs_notify(&dp_flow_genl_family, reply, info);
} else {
@@ -1351,6 +1380,7 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
}
}
+out_free:
ovs_flow_free(flow, true);
return 0;
unlock:
@@ -1542,10 +1572,59 @@ static void ovs_dp_reset_user_features(struct sk_buff *skb, struct genl_info *in
dp->user_features = 0;
}
-static void ovs_dp_change(struct datapath *dp, struct nlattr *a[])
+DEFINE_STATIC_KEY_FALSE(tc_recirc_sharing_support);
+
+static int ovs_dp_change(struct datapath *dp, struct nlattr *a[])
+{
+ u32 user_features = 0;
+
+ if (a[OVS_DP_ATTR_USER_FEATURES]) {
+ user_features = nla_get_u32(a[OVS_DP_ATTR_USER_FEATURES]);
+
+ if (user_features & ~(OVS_DP_F_VPORT_PIDS |
+ OVS_DP_F_UNALIGNED |
+ OVS_DP_F_TC_RECIRC_SHARING))
+ return -EOPNOTSUPP;
+
+#if !IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
+ if (user_features & OVS_DP_F_TC_RECIRC_SHARING)
+ return -EOPNOTSUPP;
+#endif
+ }
+
+ dp->user_features = user_features;
+
+ if (dp->user_features & OVS_DP_F_TC_RECIRC_SHARING)
+ static_branch_enable(&tc_recirc_sharing_support);
+ else
+ static_branch_disable(&tc_recirc_sharing_support);
+
+ return 0;
+}
+
+static int ovs_dp_stats_init(struct datapath *dp)
+{
+ dp->stats_percpu = netdev_alloc_pcpu_stats(struct dp_stats_percpu);
+ if (!dp->stats_percpu)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static int ovs_dp_vport_init(struct datapath *dp)
{
- if (a[OVS_DP_ATTR_USER_FEATURES])
- dp->user_features = nla_get_u32(a[OVS_DP_ATTR_USER_FEATURES]);
+ int i;
+
+ dp->ports = kmalloc_array(DP_VPORT_HASH_BUCKETS,
+ sizeof(struct hlist_head),
+ GFP_KERNEL);
+ if (!dp->ports)
+ return -ENOMEM;
+
+ for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++)
+ INIT_HLIST_HEAD(&dp->ports[i]);
+
+ return 0;
}
static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
@@ -1556,7 +1635,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
struct datapath *dp;
struct vport *vport;
struct ovs_net *ovs_net;
- int err, i;
+ int err;
err = -EINVAL;
if (!a[OVS_DP_ATTR_NAME] || !a[OVS_DP_ATTR_UPCALL_PID])
@@ -1569,35 +1648,26 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
err = -ENOMEM;
dp = kzalloc(sizeof(*dp), GFP_KERNEL);
if (dp == NULL)
- goto err_free_reply;
+ goto err_destroy_reply;
ovs_dp_set_net(dp, sock_net(skb->sk));
/* Allocate table. */
err = ovs_flow_tbl_init(&dp->table);
if (err)
- goto err_free_dp;
+ goto err_destroy_dp;
- dp->stats_percpu = netdev_alloc_pcpu_stats(struct dp_stats_percpu);
- if (!dp->stats_percpu) {
- err = -ENOMEM;
+ err = ovs_dp_stats_init(dp);
+ if (err)
goto err_destroy_table;
- }
-
- dp->ports = kmalloc_array(DP_VPORT_HASH_BUCKETS,
- sizeof(struct hlist_head),
- GFP_KERNEL);
- if (!dp->ports) {
- err = -ENOMEM;
- goto err_destroy_percpu;
- }
- for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++)
- INIT_HLIST_HEAD(&dp->ports[i]);
+ err = ovs_dp_vport_init(dp);
+ if (err)
+ goto err_destroy_stats;
err = ovs_meters_init(dp);
if (err)
- goto err_destroy_ports_array;
+ goto err_destroy_ports;
/* Set up our datapath device. */
parms.name = nla_data(a[OVS_DP_ATTR_NAME]);
@@ -1607,7 +1677,9 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
parms.port_no = OVSP_LOCAL;
parms.upcall_portids = a[OVS_DP_ATTR_UPCALL_PID];
- ovs_dp_change(dp, a);
+ err = ovs_dp_change(dp, a);
+ if (err)
+ goto err_destroy_meters;
/* So far only local changes have been made, now need the lock. */
ovs_lock();
@@ -1627,6 +1699,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
ovs_dp_reset_user_features(skb, info);
}
+ ovs_unlock();
goto err_destroy_meters;
}
@@ -1643,17 +1716,16 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
return 0;
err_destroy_meters:
- ovs_unlock();
ovs_meters_exit(dp);
-err_destroy_ports_array:
+err_destroy_ports:
kfree(dp->ports);
-err_destroy_percpu:
+err_destroy_stats:
free_percpu(dp->stats_percpu);
err_destroy_table:
ovs_flow_tbl_destroy(&dp->table);
-err_free_dp:
+err_destroy_dp:
kfree(dp);
-err_free_reply:
+err_destroy_reply:
kfree_skb(reply);
err:
return err;
@@ -1733,7 +1805,9 @@ static int ovs_dp_cmd_set(struct sk_buff *skb, struct genl_info *info)
if (IS_ERR(dp))
goto err_unlock_free;
- ovs_dp_change(dp, info->attrs);
+ err = ovs_dp_change(dp, info->attrs);
+ if (err)
+ goto err_unlock_free;
err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid,
info->snd_seq, 0, OVS_DP_CMD_SET);
@@ -1850,7 +1924,7 @@ static struct genl_family dp_datapath_genl_family __ro_after_init = {
/* Called with ovs_mutex or RCU read lock. */
static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb,
struct net *net, u32 portid, u32 seq,
- u32 flags, u8 cmd)
+ u32 flags, u8 cmd, gfp_t gfp)
{
struct ovs_header *ovs_header;
struct ovs_vport_stats vport_stats;
@@ -1871,7 +1945,7 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb,
goto nla_put_failure;
if (!net_eq(net, dev_net(vport->dev))) {
- int id = peernet2id_alloc(net, dev_net(vport->dev));
+ int id = peernet2id_alloc(net, dev_net(vport->dev), gfp);
if (nla_put_s32(skb, OVS_VPORT_ATTR_NETNSID, id))
goto nla_put_failure;
@@ -1912,11 +1986,12 @@ struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, struct net *net,
struct sk_buff *skb;
int retval;
- skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
+ skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!skb)
return ERR_PTR(-ENOMEM);
- retval = ovs_vport_cmd_fill_info(vport, skb, net, portid, seq, 0, cmd);
+ retval = ovs_vport_cmd_fill_info(vport, skb, net, portid, seq, 0, cmd,
+ GFP_KERNEL);
BUG_ON(retval < 0);
return skb;
@@ -2058,7 +2133,7 @@ restart:
err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info),
info->snd_portid, info->snd_seq, 0,
- OVS_VPORT_CMD_NEW);
+ OVS_VPORT_CMD_NEW, GFP_KERNEL);
new_headroom = netdev_get_fwd_headroom(vport->dev);
@@ -2119,7 +2194,7 @@ static int ovs_vport_cmd_set(struct sk_buff *skb, struct genl_info *info)
err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info),
info->snd_portid, info->snd_seq, 0,
- OVS_VPORT_CMD_SET);
+ OVS_VPORT_CMD_SET, GFP_KERNEL);
BUG_ON(err < 0);
ovs_unlock();
@@ -2159,7 +2234,7 @@ static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info)
err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info),
info->snd_portid, info->snd_seq, 0,
- OVS_VPORT_CMD_DEL);
+ OVS_VPORT_CMD_DEL, GFP_KERNEL);
BUG_ON(err < 0);
/* the vport deletion may trigger dp headroom update */
@@ -2206,7 +2281,7 @@ static int ovs_vport_cmd_get(struct sk_buff *skb, struct genl_info *info)
goto exit_unlock_free;
err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info),
info->snd_portid, info->snd_seq, 0,
- OVS_VPORT_CMD_GET);
+ OVS_VPORT_CMD_GET, GFP_ATOMIC);
BUG_ON(err < 0);
rcu_read_unlock();
@@ -2242,7 +2317,8 @@ static int ovs_vport_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
NLM_F_MULTI,
- OVS_VPORT_CMD_GET) < 0)
+ OVS_VPORT_CMD_GET,
+ GFP_ATOMIC) < 0)
goto out;
j++;
@@ -2263,7 +2339,7 @@ static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = {
[OVS_VPORT_ATTR_STATS] = { .len = sizeof(struct ovs_vport_stats) },
[OVS_VPORT_ATTR_PORT_NO] = { .type = NLA_U32 },
[OVS_VPORT_ATTR_TYPE] = { .type = NLA_U32 },
- [OVS_VPORT_ATTR_UPCALL_PID] = { .type = NLA_U32 },
+ [OVS_VPORT_ATTR_UPCALL_PID] = { .type = NLA_UNSPEC },
[OVS_VPORT_ATTR_OPTIONS] = { .type = NLA_NESTED },
[OVS_VPORT_ATTR_IFINDEX] = { .type = NLA_U32 },
[OVS_VPORT_ATTR_NETNSID] = { .type = NLA_S32 },
@@ -2418,7 +2494,7 @@ static int __init dp_init(void)
{
int err;
- BUILD_BUG_ON(sizeof(struct ovs_skb_cb) > FIELD_SIZEOF(struct sk_buff, cb));
+ BUILD_BUG_ON(sizeof(struct ovs_skb_cb) > sizeof_field(struct sk_buff, cb));
pr_info("Open vSwitch switching datapath\n");
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index 751d34accdf9..e239a46c2f94 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -139,6 +139,18 @@ struct ovs_net {
bool xt_label;
};
+/**
+ * enum ovs_pkt_hash_types - hash info to include with a packet
+ * to send to userspace.
+ * @OVS_PACKET_HASH_SW_BIT: indicates hash was computed in software stack.
+ * @OVS_PACKET_HASH_L4_BIT: indicates hash is a canonical 4-tuple hash
+ * over transport ports.
+ */
+enum ovs_pkt_hash_types {
+ OVS_PACKET_HASH_SW_BIT = (1ULL << 32),
+ OVS_PACKET_HASH_L4_BIT = (1ULL << 33),
+};
+
extern unsigned int ovs_net_id;
void ovs_lock(void);
void ovs_unlock(void);
@@ -218,6 +230,8 @@ static inline struct datapath *get_dp(struct net *net, int dp_ifindex)
extern struct notifier_block ovs_dp_device_notifier;
extern struct genl_family dp_vport_genl_family;
+DECLARE_STATIC_KEY_FALSE(tc_recirc_sharing_support);
+
void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key);
void ovs_dp_detach_port(struct vport *);
int ovs_dp_upcall(struct datapath *, struct sk_buff *,
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index bc89e16e0505..9d375e74b607 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -523,78 +523,15 @@ static int parse_nsh(struct sk_buff *skb, struct sw_flow_key *key)
}
/**
- * key_extract - extracts a flow key from an Ethernet frame.
+ * key_extract_l3l4 - extracts L3/L4 header information.
* @skb: sk_buff that contains the frame, with skb->data pointing to the
- * Ethernet header
+ * L3 header
* @key: output flow key
*
- * The caller must ensure that skb->len >= ETH_HLEN.
- *
- * Returns 0 if successful, otherwise a negative errno value.
- *
- * Initializes @skb header fields as follows:
- *
- * - skb->mac_header: the L2 header.
- *
- * - skb->network_header: just past the L2 header, or just past the
- * VLAN header, to the first byte of the L2 payload.
- *
- * - skb->transport_header: If key->eth.type is ETH_P_IP or ETH_P_IPV6
- * on output, then just past the IP header, if one is present and
- * of a correct length, otherwise the same as skb->network_header.
- * For other key->eth.type values it is left untouched.
- *
- * - skb->protocol: the type of the data starting at skb->network_header.
- * Equals to key->eth.type.
*/
-static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
+static int key_extract_l3l4(struct sk_buff *skb, struct sw_flow_key *key)
{
int error;
- struct ethhdr *eth;
-
- /* Flags are always used as part of stats */
- key->tp.flags = 0;
-
- skb_reset_mac_header(skb);
-
- /* Link layer. */
- clear_vlan(key);
- if (ovs_key_mac_proto(key) == MAC_PROTO_NONE) {
- if (unlikely(eth_type_vlan(skb->protocol)))
- return -EINVAL;
-
- skb_reset_network_header(skb);
- key->eth.type = skb->protocol;
- } else {
- eth = eth_hdr(skb);
- ether_addr_copy(key->eth.src, eth->h_source);
- ether_addr_copy(key->eth.dst, eth->h_dest);
-
- __skb_pull(skb, 2 * ETH_ALEN);
- /* We are going to push all headers that we pull, so no need to
- * update skb->csum here.
- */
-
- if (unlikely(parse_vlan(skb, key)))
- return -ENOMEM;
-
- key->eth.type = parse_ethertype(skb);
- if (unlikely(key->eth.type == htons(0)))
- return -ENOMEM;
-
- /* Multiple tagged packets need to retain TPID to satisfy
- * skb_vlan_pop(), which will later shift the ethertype into
- * skb->protocol.
- */
- if (key->eth.cvlan.tci & htons(VLAN_CFI_MASK))
- skb->protocol = key->eth.cvlan.tpid;
- else
- skb->protocol = key->eth.type;
-
- skb_reset_network_header(skb);
- __skb_push(skb, skb->data - skb_mac_header(skb));
- }
- skb_reset_mac_len(skb);
/* Network layer. */
if (key->eth.type == htons(ETH_P_IP)) {
@@ -623,6 +560,7 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
offset = nh->frag_off & htons(IP_OFFSET);
if (offset) {
key->ip.frag = OVS_FRAG_TYPE_LATER;
+ memset(&key->tp, 0, sizeof(key->tp));
return 0;
}
if (nh->frag_off & htons(IP_MF) ||
@@ -699,27 +637,35 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
memset(&key->ipv4, 0, sizeof(key->ipv4));
}
} else if (eth_p_mpls(key->eth.type)) {
- size_t stack_len = MPLS_HLEN;
+ u8 label_count = 1;
+ memset(&key->mpls, 0, sizeof(key->mpls));
skb_set_inner_network_header(skb, skb->mac_len);
while (1) {
__be32 lse;
- error = check_header(skb, skb->mac_len + stack_len);
+ error = check_header(skb, skb->mac_len +
+ label_count * MPLS_HLEN);
if (unlikely(error))
return 0;
memcpy(&lse, skb_inner_network_header(skb), MPLS_HLEN);
- if (stack_len == MPLS_HLEN)
- memcpy(&key->mpls.top_lse, &lse, MPLS_HLEN);
+ if (label_count <= MPLS_LABEL_DEPTH)
+ memcpy(&key->mpls.lse[label_count - 1], &lse,
+ MPLS_HLEN);
- skb_set_inner_network_header(skb, skb->mac_len + stack_len);
+ skb_set_inner_network_header(skb, skb->mac_len +
+ label_count * MPLS_HLEN);
if (lse & htonl(MPLS_LS_S_MASK))
break;
- stack_len += MPLS_HLEN;
+ label_count++;
}
+ if (label_count > MPLS_LABEL_DEPTH)
+ label_count = MPLS_LABEL_DEPTH;
+
+ key->mpls.num_labels_mask = GENMASK(label_count - 1, 0);
} else if (key->eth.type == htons(ETH_P_IPV6)) {
int nh_len; /* IPv6 Header + Extensions */
@@ -740,8 +686,10 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
return error;
}
- if (key->ip.frag == OVS_FRAG_TYPE_LATER)
+ if (key->ip.frag == OVS_FRAG_TYPE_LATER) {
+ memset(&key->tp, 0, sizeof(key->tp));
return 0;
+ }
if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
key->ip.frag = OVS_FRAG_TYPE_FIRST;
@@ -788,6 +736,92 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
return 0;
}
+/**
+ * key_extract - extracts a flow key from an Ethernet frame.
+ * @skb: sk_buff that contains the frame, with skb->data pointing to the
+ * Ethernet header
+ * @key: output flow key
+ *
+ * The caller must ensure that skb->len >= ETH_HLEN.
+ *
+ * Returns 0 if successful, otherwise a negative errno value.
+ *
+ * Initializes @skb header fields as follows:
+ *
+ * - skb->mac_header: the L2 header.
+ *
+ * - skb->network_header: just past the L2 header, or just past the
+ * VLAN header, to the first byte of the L2 payload.
+ *
+ * - skb->transport_header: If key->eth.type is ETH_P_IP or ETH_P_IPV6
+ * on output, then just past the IP header, if one is present and
+ * of a correct length, otherwise the same as skb->network_header.
+ * For other key->eth.type values it is left untouched.
+ *
+ * - skb->protocol: the type of the data starting at skb->network_header.
+ * Equals to key->eth.type.
+ */
+static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
+{
+ struct ethhdr *eth;
+
+ /* Flags are always used as part of stats */
+ key->tp.flags = 0;
+
+ skb_reset_mac_header(skb);
+
+ /* Link layer. */
+ clear_vlan(key);
+ if (ovs_key_mac_proto(key) == MAC_PROTO_NONE) {
+ if (unlikely(eth_type_vlan(skb->protocol)))
+ return -EINVAL;
+
+ skb_reset_network_header(skb);
+ key->eth.type = skb->protocol;
+ } else {
+ eth = eth_hdr(skb);
+ ether_addr_copy(key->eth.src, eth->h_source);
+ ether_addr_copy(key->eth.dst, eth->h_dest);
+
+ __skb_pull(skb, 2 * ETH_ALEN);
+ /* We are going to push all headers that we pull, so no need to
+ * update skb->csum here.
+ */
+
+ if (unlikely(parse_vlan(skb, key)))
+ return -ENOMEM;
+
+ key->eth.type = parse_ethertype(skb);
+ if (unlikely(key->eth.type == htons(0)))
+ return -ENOMEM;
+
+ /* Multiple tagged packets need to retain TPID to satisfy
+ * skb_vlan_pop(), which will later shift the ethertype into
+ * skb->protocol.
+ */
+ if (key->eth.cvlan.tci & htons(VLAN_CFI_MASK))
+ skb->protocol = key->eth.cvlan.tpid;
+ else
+ skb->protocol = key->eth.type;
+
+ skb_reset_network_header(skb);
+ __skb_push(skb, skb->data - skb_mac_header(skb));
+ }
+
+ skb_reset_mac_len(skb);
+
+ /* Fill out L3/L4 key info, if any */
+ return key_extract_l3l4(skb, key);
+}
+
+/* In the case of conntrack fragment handling it expects L3 headers,
+ * add a helper.
+ */
+int ovs_flow_key_update_l3l4(struct sk_buff *skb, struct sw_flow_key *key)
+{
+ return key_extract_l3l4(skb, key);
+}
+
int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key)
{
int res;
@@ -816,6 +850,9 @@ static int key_extract_mac_proto(struct sk_buff *skb)
int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
struct sk_buff *skb, struct sw_flow_key *key)
{
+#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
+ struct tc_skb_ext *tc_ext;
+#endif
int res, err;
/* Extract metadata from packet. */
@@ -848,7 +885,17 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
if (res < 0)
return res;
key->mac_proto = res;
+
+#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
+ if (static_branch_unlikely(&tc_recirc_sharing_support)) {
+ tc_ext = skb_ext_find(skb, TC_SKB_EXT);
+ key->recirc_id = tc_ext ? tc_ext->chain : 0;
+ } else {
+ key->recirc_id = 0;
+ }
+#else
key->recirc_id = 0;
+#endif
err = key_extract(skb, key);
if (!err)
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index a5506e2d4b7a..758a8c77f736 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -30,13 +30,14 @@ enum sw_flow_mac_proto {
MAC_PROTO_ETHERNET,
};
#define SW_FLOW_KEY_INVALID 0x80
+#define MPLS_LABEL_DEPTH 3
/* Store options at the end of the array if they are less than the
* maximum size. This allows us to get the benefits of variable length
* matching for small options.
*/
#define TUN_METADATA_OFFSET(opt_len) \
- (FIELD_SIZEOF(struct sw_flow_key, tun_opts) - opt_len)
+ (sizeof_field(struct sw_flow_key, tun_opts) - opt_len)
#define TUN_METADATA_OPTS(flow_key, opt_len) \
((void *)((flow_key)->tun_opts + TUN_METADATA_OFFSET(opt_len)))
@@ -51,7 +52,7 @@ struct vlan_head {
#define OVS_SW_FLOW_KEY_METADATA_SIZE \
(offsetof(struct sw_flow_key, recirc_id) + \
- FIELD_SIZEOF(struct sw_flow_key, recirc_id))
+ sizeof_field(struct sw_flow_key, recirc_id))
struct ovs_key_nsh {
struct ovs_nsh_key_base base;
@@ -85,9 +86,6 @@ struct sw_flow_key {
*/
union {
struct {
- __be32 top_lse; /* top label stack entry */
- } mpls;
- struct {
u8 proto; /* IP protocol or lower 8 bits of ARP opcode. */
u8 tos; /* IP ToS. */
u8 ttl; /* IP TTL/hop limit. */
@@ -135,6 +133,11 @@ struct sw_flow_key {
} nd;
};
} ipv6;
+ struct {
+ u32 num_labels_mask; /* labels present bitmap of effective length MPLS_LABEL_DEPTH */
+ __be32 lse[MPLS_LABEL_DEPTH]; /* label stack entry */
+ } mpls;
+
struct ovs_key_nsh nsh; /* network service header */
};
struct {
@@ -166,7 +169,6 @@ struct sw_flow_key_range {
struct sw_flow_mask {
int ref_count;
struct rcu_head rcu;
- struct list_head list;
struct sw_flow_key_range range;
struct sw_flow_key key;
};
@@ -270,6 +272,7 @@ void ovs_flow_stats_clear(struct sw_flow *);
u64 ovs_flow_used_time(unsigned long flow_jiffies);
int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key);
+int ovs_flow_key_update_l3l4(struct sk_buff *skb, struct sw_flow_key *key);
int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
struct sk_buff *skb,
struct sw_flow_key *key);
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index d7559c64795d..7da4230627f5 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -79,6 +79,7 @@ static bool actions_may_change_flow(const struct nlattr *actions)
case OVS_ACTION_ATTR_SET_MASKED:
case OVS_ACTION_ATTR_METER:
case OVS_ACTION_ATTR_CHECK_PKT_LEN:
+ case OVS_ACTION_ATTR_ADD_MPLS:
default:
return true;
}
@@ -424,7 +425,7 @@ static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = {
[OVS_KEY_ATTR_DP_HASH] = { .len = sizeof(u32) },
[OVS_KEY_ATTR_TUNNEL] = { .len = OVS_ATTR_NESTED,
.next = ovs_tunnel_key_lens, },
- [OVS_KEY_ATTR_MPLS] = { .len = sizeof(struct ovs_key_mpls) },
+ [OVS_KEY_ATTR_MPLS] = { .len = OVS_ATTR_VARIABLE },
[OVS_KEY_ATTR_CT_STATE] = { .len = sizeof(u32) },
[OVS_KEY_ATTR_CT_ZONE] = { .len = sizeof(u16) },
[OVS_KEY_ATTR_CT_MARK] = { .len = sizeof(u32) },
@@ -1628,10 +1629,25 @@ static int ovs_key_from_nlattrs(struct net *net, struct sw_flow_match *match,
if (attrs & (1 << OVS_KEY_ATTR_MPLS)) {
const struct ovs_key_mpls *mpls_key;
+ u32 hdr_len;
+ u32 label_count, label_count_mask, i;
mpls_key = nla_data(a[OVS_KEY_ATTR_MPLS]);
- SW_FLOW_KEY_PUT(match, mpls.top_lse,
- mpls_key->mpls_lse, is_mask);
+ hdr_len = nla_len(a[OVS_KEY_ATTR_MPLS]);
+ label_count = hdr_len / sizeof(struct ovs_key_mpls);
+
+ if (label_count == 0 || label_count > MPLS_LABEL_DEPTH ||
+ hdr_len % sizeof(struct ovs_key_mpls))
+ return -EINVAL;
+
+ label_count_mask = GENMASK(label_count - 1, 0);
+
+ for (i = 0 ; i < label_count; i++)
+ SW_FLOW_KEY_PUT(match, mpls.lse[i],
+ mpls_key[i].mpls_lse, is_mask);
+
+ SW_FLOW_KEY_PUT(match, mpls.num_labels_mask,
+ label_count_mask, is_mask);
attrs &= ~(1 << OVS_KEY_ATTR_MPLS);
}
@@ -2114,13 +2130,18 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey,
ether_addr_copy(arp_key->arp_sha, output->ipv4.arp.sha);
ether_addr_copy(arp_key->arp_tha, output->ipv4.arp.tha);
} else if (eth_p_mpls(swkey->eth.type)) {
+ u8 i, num_labels;
struct ovs_key_mpls *mpls_key;
- nla = nla_reserve(skb, OVS_KEY_ATTR_MPLS, sizeof(*mpls_key));
+ num_labels = hweight_long(output->mpls.num_labels_mask);
+ nla = nla_reserve(skb, OVS_KEY_ATTR_MPLS,
+ num_labels * sizeof(*mpls_key));
if (!nla)
goto nla_put_failure;
+
mpls_key = nla_data(nla);
- mpls_key->mpls_lse = output->mpls.top_lse;
+ for (i = 0; i < num_labels; i++)
+ mpls_key[i].mpls_lse = output->mpls.lse[i];
}
if ((swkey->eth.type == htons(ETH_P_IP) ||
@@ -2406,13 +2427,14 @@ static inline void add_nested_action_end(struct sw_flow_actions *sfa,
static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
const struct sw_flow_key *key,
struct sw_flow_actions **sfa,
- __be16 eth_type, __be16 vlan_tci, bool log);
+ __be16 eth_type, __be16 vlan_tci,
+ u32 mpls_label_count, bool log);
static int validate_and_copy_sample(struct net *net, const struct nlattr *attr,
const struct sw_flow_key *key,
struct sw_flow_actions **sfa,
__be16 eth_type, __be16 vlan_tci,
- bool log, bool last)
+ u32 mpls_label_count, bool log, bool last)
{
const struct nlattr *attrs[OVS_SAMPLE_ATTR_MAX + 1];
const struct nlattr *probability, *actions;
@@ -2463,7 +2485,7 @@ static int validate_and_copy_sample(struct net *net, const struct nlattr *attr,
return err;
err = __ovs_nla_copy_actions(net, actions, key, sfa,
- eth_type, vlan_tci, log);
+ eth_type, vlan_tci, mpls_label_count, log);
if (err)
return err;
@@ -2478,7 +2500,7 @@ static int validate_and_copy_clone(struct net *net,
const struct sw_flow_key *key,
struct sw_flow_actions **sfa,
__be16 eth_type, __be16 vlan_tci,
- bool log, bool last)
+ u32 mpls_label_count, bool log, bool last)
{
int start, err;
u32 exec;
@@ -2498,7 +2520,7 @@ static int validate_and_copy_clone(struct net *net,
return err;
err = __ovs_nla_copy_actions(net, attr, key, sfa,
- eth_type, vlan_tci, log);
+ eth_type, vlan_tci, mpls_label_count, log);
if (err)
return err;
@@ -2864,6 +2886,7 @@ static int validate_and_copy_check_pkt_len(struct net *net,
const struct sw_flow_key *key,
struct sw_flow_actions **sfa,
__be16 eth_type, __be16 vlan_tci,
+ u32 mpls_label_count,
bool log, bool last)
{
const struct nlattr *acts_if_greater, *acts_if_lesser_eq;
@@ -2912,7 +2935,7 @@ static int validate_and_copy_check_pkt_len(struct net *net,
return nested_acts_start;
err = __ovs_nla_copy_actions(net, acts_if_lesser_eq, key, sfa,
- eth_type, vlan_tci, log);
+ eth_type, vlan_tci, mpls_label_count, log);
if (err)
return err;
@@ -2925,7 +2948,7 @@ static int validate_and_copy_check_pkt_len(struct net *net,
return nested_acts_start;
err = __ovs_nla_copy_actions(net, acts_if_greater, key, sfa,
- eth_type, vlan_tci, log);
+ eth_type, vlan_tci, mpls_label_count, log);
if (err)
return err;
@@ -2952,7 +2975,8 @@ static int copy_action(const struct nlattr *from,
static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
const struct sw_flow_key *key,
struct sw_flow_actions **sfa,
- __be16 eth_type, __be16 vlan_tci, bool log)
+ __be16 eth_type, __be16 vlan_tci,
+ u32 mpls_label_count, bool log)
{
u8 mac_proto = ovs_key_mac_proto(key);
const struct nlattr *a;
@@ -2982,6 +3006,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
[OVS_ACTION_ATTR_METER] = sizeof(u32),
[OVS_ACTION_ATTR_CLONE] = (u32)-1,
[OVS_ACTION_ATTR_CHECK_PKT_LEN] = (u32)-1,
+ [OVS_ACTION_ATTR_ADD_MPLS] = sizeof(struct ovs_action_add_mpls),
};
const struct ovs_action_push_vlan *vlan;
int type = nla_type(a);
@@ -3049,6 +3074,33 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
case OVS_ACTION_ATTR_RECIRC:
break;
+ case OVS_ACTION_ATTR_ADD_MPLS: {
+ const struct ovs_action_add_mpls *mpls = nla_data(a);
+
+ if (!eth_p_mpls(mpls->mpls_ethertype))
+ return -EINVAL;
+
+ if (mpls->tun_flags & OVS_MPLS_L3_TUNNEL_FLAG_MASK) {
+ if (vlan_tci & htons(VLAN_CFI_MASK) ||
+ (eth_type != htons(ETH_P_IP) &&
+ eth_type != htons(ETH_P_IPV6) &&
+ eth_type != htons(ETH_P_ARP) &&
+ eth_type != htons(ETH_P_RARP) &&
+ !eth_p_mpls(eth_type)))
+ return -EINVAL;
+ mpls_label_count++;
+ } else {
+ if (mac_proto == MAC_PROTO_ETHERNET) {
+ mpls_label_count = 1;
+ mac_proto = MAC_PROTO_NONE;
+ } else {
+ mpls_label_count++;
+ }
+ }
+ eth_type = mpls->mpls_ethertype;
+ break;
+ }
+
case OVS_ACTION_ATTR_PUSH_MPLS: {
const struct ovs_action_push_mpls *mpls = nla_data(a);
@@ -3065,25 +3117,41 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
!eth_p_mpls(eth_type)))
return -EINVAL;
eth_type = mpls->mpls_ethertype;
+ mpls_label_count++;
break;
}
- case OVS_ACTION_ATTR_POP_MPLS:
+ case OVS_ACTION_ATTR_POP_MPLS: {
+ __be16 proto;
if (vlan_tci & htons(VLAN_CFI_MASK) ||
!eth_p_mpls(eth_type))
return -EINVAL;
- /* Disallow subsequent L2.5+ set and mpls_pop actions
- * as there is no check here to ensure that the new
- * eth_type is valid and thus set actions could
- * write off the end of the packet or otherwise
- * corrupt it.
+ /* Disallow subsequent L2.5+ set actions and mpls_pop
+ * actions once the last MPLS label in the packet is
+ * is popped as there is no check here to ensure that
+ * the new eth type is valid and thus set actions could
+ * write off the end of the packet or otherwise corrupt
+ * it.
*
* Support for these actions is planned using packet
* recirculation.
*/
- eth_type = htons(0);
+ proto = nla_get_be16(a);
+
+ if (proto == htons(ETH_P_TEB) &&
+ mac_proto != MAC_PROTO_NONE)
+ return -EINVAL;
+
+ mpls_label_count--;
+
+ if (!eth_p_mpls(proto) || !mpls_label_count)
+ eth_type = htons(0);
+ else
+ eth_type = proto;
+
break;
+ }
case OVS_ACTION_ATTR_SET:
err = validate_set(a, key, sfa,
@@ -3106,6 +3174,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
err = validate_and_copy_sample(net, a, key, sfa,
eth_type, vlan_tci,
+ mpls_label_count,
log, last);
if (err)
return err;
@@ -3176,6 +3245,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
err = validate_and_copy_clone(net, a, key, sfa,
eth_type, vlan_tci,
+ mpls_label_count,
log, last);
if (err)
return err;
@@ -3188,8 +3258,9 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
err = validate_and_copy_check_pkt_len(net, a, key, sfa,
eth_type,
- vlan_tci, log,
- last);
+ vlan_tci,
+ mpls_label_count,
+ log, last);
if (err)
return err;
skip_copy = true;
@@ -3219,14 +3290,18 @@ int ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
struct sw_flow_actions **sfa, bool log)
{
int err;
+ u32 mpls_label_count = 0;
*sfa = nla_alloc_flow_actions(min(nla_len(attr), MAX_ACTIONS_BUFSIZE));
if (IS_ERR(*sfa))
return PTR_ERR(*sfa);
+ if (eth_p_mpls(key->eth.type))
+ mpls_label_count = hweight_long(key->mpls.num_labels_mask);
+
(*sfa)->orig_len = nla_len(attr);
err = __ovs_nla_copy_actions(net, attr, key, sfa, key->eth.type,
- key->eth.vlan.tci, log);
+ key->eth.vlan.tci, mpls_label_count, log);
if (err)
ovs_nla_free_flow_actions(*sfa);
diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
index cf3582c5ed70..5904e93e5765 100644
--- a/net/openvswitch/flow_table.c
+++ b/net/openvswitch/flow_table.c
@@ -34,8 +34,13 @@
#include <net/ndisc.h>
#define TBL_MIN_BUCKETS 1024
+#define MASK_ARRAY_SIZE_MIN 16
#define REHASH_INTERVAL (10 * 60 * HZ)
+#define MC_HASH_SHIFT 8
+#define MC_HASH_ENTRIES (1u << MC_HASH_SHIFT)
+#define MC_HASH_SEGS ((sizeof(uint32_t) * 8) / MC_HASH_SHIFT)
+
static struct kmem_cache *flow_cache;
struct kmem_cache *flow_stats_cache __read_mostly;
@@ -164,14 +169,133 @@ static struct table_instance *table_instance_alloc(int new_size)
return ti;
}
+static struct mask_array *tbl_mask_array_alloc(int size)
+{
+ struct mask_array *new;
+
+ size = max(MASK_ARRAY_SIZE_MIN, size);
+ new = kzalloc(sizeof(struct mask_array) +
+ sizeof(struct sw_flow_mask *) * size, GFP_KERNEL);
+ if (!new)
+ return NULL;
+
+ new->count = 0;
+ new->max = size;
+
+ return new;
+}
+
+static int tbl_mask_array_realloc(struct flow_table *tbl, int size)
+{
+ struct mask_array *old;
+ struct mask_array *new;
+
+ new = tbl_mask_array_alloc(size);
+ if (!new)
+ return -ENOMEM;
+
+ old = ovsl_dereference(tbl->mask_array);
+ if (old) {
+ int i;
+
+ for (i = 0; i < old->max; i++) {
+ if (ovsl_dereference(old->masks[i]))
+ new->masks[new->count++] = old->masks[i];
+ }
+ }
+
+ rcu_assign_pointer(tbl->mask_array, new);
+ kfree_rcu(old, rcu);
+
+ return 0;
+}
+
+static int tbl_mask_array_add_mask(struct flow_table *tbl,
+ struct sw_flow_mask *new)
+{
+ struct mask_array *ma = ovsl_dereference(tbl->mask_array);
+ int err, ma_count = READ_ONCE(ma->count);
+
+ if (ma_count >= ma->max) {
+ err = tbl_mask_array_realloc(tbl, ma->max +
+ MASK_ARRAY_SIZE_MIN);
+ if (err)
+ return err;
+
+ ma = ovsl_dereference(tbl->mask_array);
+ }
+
+ BUG_ON(ovsl_dereference(ma->masks[ma_count]));
+
+ rcu_assign_pointer(ma->masks[ma_count], new);
+ WRITE_ONCE(ma->count, ma_count +1);
+
+ return 0;
+}
+
+static void tbl_mask_array_del_mask(struct flow_table *tbl,
+ struct sw_flow_mask *mask)
+{
+ struct mask_array *ma = ovsl_dereference(tbl->mask_array);
+ int i, ma_count = READ_ONCE(ma->count);
+
+ /* Remove the deleted mask pointers from the array */
+ for (i = 0; i < ma_count; i++) {
+ if (mask == ovsl_dereference(ma->masks[i]))
+ goto found;
+ }
+
+ BUG();
+ return;
+
+found:
+ WRITE_ONCE(ma->count, ma_count -1);
+
+ rcu_assign_pointer(ma->masks[i], ma->masks[ma_count -1]);
+ RCU_INIT_POINTER(ma->masks[ma_count -1], NULL);
+
+ kfree_rcu(mask, rcu);
+
+ /* Shrink the mask array if necessary. */
+ if (ma->max >= (MASK_ARRAY_SIZE_MIN * 2) &&
+ ma_count <= (ma->max / 3))
+ tbl_mask_array_realloc(tbl, ma->max / 2);
+}
+
+/* Remove 'mask' from the mask list, if it is not needed any more. */
+static void flow_mask_remove(struct flow_table *tbl, struct sw_flow_mask *mask)
+{
+ if (mask) {
+ /* ovs-lock is required to protect mask-refcount and
+ * mask list.
+ */
+ ASSERT_OVSL();
+ BUG_ON(!mask->ref_count);
+ mask->ref_count--;
+
+ if (!mask->ref_count)
+ tbl_mask_array_del_mask(tbl, mask);
+ }
+}
+
int ovs_flow_tbl_init(struct flow_table *table)
{
struct table_instance *ti, *ufid_ti;
+ struct mask_array *ma;
- ti = table_instance_alloc(TBL_MIN_BUCKETS);
+ table->mask_cache = __alloc_percpu(sizeof(struct mask_cache_entry) *
+ MC_HASH_ENTRIES,
+ __alignof__(struct mask_cache_entry));
+ if (!table->mask_cache)
+ return -ENOMEM;
+ ma = tbl_mask_array_alloc(MASK_ARRAY_SIZE_MIN);
+ if (!ma)
+ goto free_mask_cache;
+
+ ti = table_instance_alloc(TBL_MIN_BUCKETS);
if (!ti)
- return -ENOMEM;
+ goto free_mask_array;
ufid_ti = table_instance_alloc(TBL_MIN_BUCKETS);
if (!ufid_ti)
@@ -179,7 +303,7 @@ int ovs_flow_tbl_init(struct flow_table *table)
rcu_assign_pointer(table->ti, ti);
rcu_assign_pointer(table->ufid_ti, ufid_ti);
- INIT_LIST_HEAD(&table->mask_list);
+ rcu_assign_pointer(table->mask_array, ma);
table->last_rehash = jiffies;
table->count = 0;
table->ufid_count = 0;
@@ -187,6 +311,10 @@ int ovs_flow_tbl_init(struct flow_table *table)
free_ti:
__table_instance_destroy(ti);
+free_mask_array:
+ kfree(ma);
+free_mask_cache:
+ free_percpu(table->mask_cache);
return -ENOMEM;
}
@@ -197,7 +325,28 @@ static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu)
__table_instance_destroy(ti);
}
-static void table_instance_destroy(struct table_instance *ti,
+static void table_instance_flow_free(struct flow_table *table,
+ struct table_instance *ti,
+ struct table_instance *ufid_ti,
+ struct sw_flow *flow,
+ bool count)
+{
+ hlist_del_rcu(&flow->flow_table.node[ti->node_ver]);
+ if (count)
+ table->count--;
+
+ if (ovs_identifier_is_ufid(&flow->id)) {
+ hlist_del_rcu(&flow->ufid_table.node[ufid_ti->node_ver]);
+
+ if (count)
+ table->ufid_count--;
+ }
+
+ flow_mask_remove(table, flow->mask);
+}
+
+static void table_instance_destroy(struct flow_table *table,
+ struct table_instance *ti,
struct table_instance *ufid_ti,
bool deferred)
{
@@ -214,13 +363,12 @@ static void table_instance_destroy(struct table_instance *ti,
struct sw_flow *flow;
struct hlist_head *head = &ti->buckets[i];
struct hlist_node *n;
- int ver = ti->node_ver;
- int ufid_ver = ufid_ti->node_ver;
- hlist_for_each_entry_safe(flow, n, head, flow_table.node[ver]) {
- hlist_del_rcu(&flow->flow_table.node[ver]);
- if (ovs_identifier_is_ufid(&flow->id))
- hlist_del_rcu(&flow->ufid_table.node[ufid_ver]);
+ hlist_for_each_entry_safe(flow, n, head,
+ flow_table.node[ti->node_ver]) {
+
+ table_instance_flow_free(table, ti, ufid_ti,
+ flow, false);
ovs_flow_free(flow, deferred);
}
}
@@ -243,7 +391,9 @@ void ovs_flow_tbl_destroy(struct flow_table *table)
struct table_instance *ti = rcu_dereference_raw(table->ti);
struct table_instance *ufid_ti = rcu_dereference_raw(table->ufid_ti);
- table_instance_destroy(ti, ufid_ti, false);
+ free_percpu(table->mask_cache);
+ kfree_rcu(rcu_dereference_raw(table->mask_array), rcu);
+ table_instance_destroy(table, ti, ufid_ti, false);
}
struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *ti,
@@ -359,7 +509,7 @@ int ovs_flow_tbl_flush(struct flow_table *flow_table)
flow_table->count = 0;
flow_table->ufid_count = 0;
- table_instance_destroy(old_ti, old_ufid_ti, true);
+ table_instance_destroy(flow_table, old_ti, old_ufid_ti, true);
return 0;
err_free_ti:
@@ -370,13 +520,10 @@ err_free_ti:
static u32 flow_hash(const struct sw_flow_key *key,
const struct sw_flow_key_range *range)
{
- int key_start = range->start;
- int key_end = range->end;
- const u32 *hash_key = (const u32 *)((const u8 *)key + key_start);
- int hash_u32s = (key_end - key_start) >> 2;
+ const u32 *hash_key = (const u32 *)((const u8 *)key + range->start);
/* Make sure number of hash bytes are multiple of u32. */
- BUILD_BUG_ON(sizeof(long) % sizeof(u32));
+ int hash_u32s = range_n_bytes(range) >> 2;
return jhash2(hash_key, hash_u32s, 0);
}
@@ -425,7 +572,8 @@ static bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow,
static struct sw_flow *masked_flow_lookup(struct table_instance *ti,
const struct sw_flow_key *unmasked,
- const struct sw_flow_mask *mask)
+ const struct sw_flow_mask *mask,
+ u32 *n_mask_hit)
{
struct sw_flow *flow;
struct hlist_head *head;
@@ -435,6 +583,8 @@ static struct sw_flow *masked_flow_lookup(struct table_instance *ti,
ovs_flow_mask_key(&masked_key, unmasked, false, mask);
hash = flow_hash(&masked_key, &mask->range);
head = find_bucket(ti, hash);
+ (*n_mask_hit)++;
+
hlist_for_each_entry_rcu(flow, head, flow_table.node[ti->node_ver]) {
if (flow->mask == mask && flow->flow_table.hash == hash &&
flow_cmp_masked_key(flow, &masked_key, &mask->range))
@@ -443,46 +593,147 @@ static struct sw_flow *masked_flow_lookup(struct table_instance *ti,
return NULL;
}
-struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl,
- const struct sw_flow_key *key,
- u32 *n_mask_hit)
+/* Flow lookup does full lookup on flow table. It starts with
+ * mask from index passed in *index.
+ */
+static struct sw_flow *flow_lookup(struct flow_table *tbl,
+ struct table_instance *ti,
+ struct mask_array *ma,
+ const struct sw_flow_key *key,
+ u32 *n_mask_hit,
+ u32 *index)
{
- struct table_instance *ti = rcu_dereference_ovsl(tbl->ti);
+ struct sw_flow *flow;
struct sw_flow_mask *mask;
+ int i;
+
+ if (likely(*index < ma->max)) {
+ mask = rcu_dereference_ovsl(ma->masks[*index]);
+ if (mask) {
+ flow = masked_flow_lookup(ti, key, mask, n_mask_hit);
+ if (flow)
+ return flow;
+ }
+ }
+
+ for (i = 0; i < ma->max; i++) {
+
+ if (i == *index)
+ continue;
+
+ mask = rcu_dereference_ovsl(ma->masks[i]);
+ if (unlikely(!mask))
+ break;
+
+ flow = masked_flow_lookup(ti, key, mask, n_mask_hit);
+ if (flow) { /* Found */
+ *index = i;
+ return flow;
+ }
+ }
+
+ return NULL;
+}
+
+/*
+ * mask_cache maps flow to probable mask. This cache is not tightly
+ * coupled cache, It means updates to mask list can result in inconsistent
+ * cache entry in mask cache.
+ * This is per cpu cache and is divided in MC_HASH_SEGS segments.
+ * In case of a hash collision the entry is hashed in next segment.
+ * */
+struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl,
+ const struct sw_flow_key *key,
+ u32 skb_hash,
+ u32 *n_mask_hit)
+{
+ struct mask_array *ma = rcu_dereference(tbl->mask_array);
+ struct table_instance *ti = rcu_dereference(tbl->ti);
+ struct mask_cache_entry *entries, *ce;
struct sw_flow *flow;
+ u32 hash;
+ int seg;
*n_mask_hit = 0;
- list_for_each_entry_rcu(mask, &tbl->mask_list, list) {
- (*n_mask_hit)++;
- flow = masked_flow_lookup(ti, key, mask);
- if (flow) /* Found */
+ if (unlikely(!skb_hash)) {
+ u32 mask_index = 0;
+
+ return flow_lookup(tbl, ti, ma, key, n_mask_hit, &mask_index);
+ }
+
+ /* Pre and post recirulation flows usually have the same skb_hash
+ * value. To avoid hash collisions, rehash the 'skb_hash' with
+ * 'recirc_id'. */
+ if (key->recirc_id)
+ skb_hash = jhash_1word(skb_hash, key->recirc_id);
+
+ ce = NULL;
+ hash = skb_hash;
+ entries = this_cpu_ptr(tbl->mask_cache);
+
+ /* Find the cache entry 'ce' to operate on. */
+ for (seg = 0; seg < MC_HASH_SEGS; seg++) {
+ int index = hash & (MC_HASH_ENTRIES - 1);
+ struct mask_cache_entry *e;
+
+ e = &entries[index];
+ if (e->skb_hash == skb_hash) {
+ flow = flow_lookup(tbl, ti, ma, key, n_mask_hit,
+ &e->mask_index);
+ if (!flow)
+ e->skb_hash = 0;
return flow;
+ }
+
+ if (!ce || e->skb_hash < ce->skb_hash)
+ ce = e; /* A better replacement cache candidate. */
+
+ hash >>= MC_HASH_SHIFT;
}
- return NULL;
+
+ /* Cache miss, do full lookup. */
+ flow = flow_lookup(tbl, ti, ma, key, n_mask_hit, &ce->mask_index);
+ if (flow)
+ ce->skb_hash = skb_hash;
+
+ return flow;
}
struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl,
const struct sw_flow_key *key)
{
+ struct table_instance *ti = rcu_dereference_ovsl(tbl->ti);
+ struct mask_array *ma = rcu_dereference_ovsl(tbl->mask_array);
u32 __always_unused n_mask_hit;
+ u32 index = 0;
- return ovs_flow_tbl_lookup_stats(tbl, key, &n_mask_hit);
+ return flow_lookup(tbl, ti, ma, key, &n_mask_hit, &index);
}
struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl,
const struct sw_flow_match *match)
{
- struct table_instance *ti = rcu_dereference_ovsl(tbl->ti);
- struct sw_flow_mask *mask;
- struct sw_flow *flow;
+ struct mask_array *ma = ovsl_dereference(tbl->mask_array);
+ int i;
/* Always called under ovs-mutex. */
- list_for_each_entry(mask, &tbl->mask_list, list) {
- flow = masked_flow_lookup(ti, match->key, mask);
+ for (i = 0; i < ma->max; i++) {
+ struct table_instance *ti = rcu_dereference_ovsl(tbl->ti);
+ u32 __always_unused n_mask_hit;
+ struct sw_flow_mask *mask;
+ struct sw_flow *flow;
+
+ mask = ovsl_dereference(ma->masks[i]);
+ if (!mask)
+ continue;
+
+ flow = masked_flow_lookup(ti, match->key, mask, &n_mask_hit);
if (flow && ovs_identifier_is_key(&flow->id) &&
- ovs_flow_cmp_unmasked_key(flow, match))
+ ovs_flow_cmp_unmasked_key(flow, match)) {
return flow;
+ }
}
+
return NULL;
}
@@ -528,13 +779,8 @@ struct sw_flow *ovs_flow_tbl_lookup_ufid(struct flow_table *tbl,
int ovs_flow_tbl_num_masks(const struct flow_table *table)
{
- struct sw_flow_mask *mask;
- int num = 0;
-
- list_for_each_entry(mask, &table->mask_list, list)
- num++;
-
- return num;
+ struct mask_array *ma = rcu_dereference_ovsl(table->mask_array);
+ return READ_ONCE(ma->count);
}
static struct table_instance *table_instance_expand(struct table_instance *ti,
@@ -543,24 +789,6 @@ static struct table_instance *table_instance_expand(struct table_instance *ti,
return table_instance_rehash(ti, ti->n_buckets * 2, ufid);
}
-/* Remove 'mask' from the mask list, if it is not needed any more. */
-static void flow_mask_remove(struct flow_table *tbl, struct sw_flow_mask *mask)
-{
- if (mask) {
- /* ovs-lock is required to protect mask-refcount and
- * mask list.
- */
- ASSERT_OVSL();
- BUG_ON(!mask->ref_count);
- mask->ref_count--;
-
- if (!mask->ref_count) {
- list_del_rcu(&mask->list);
- kfree_rcu(mask, rcu);
- }
- }
-}
-
/* Must be called with OVS mutex held. */
void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow)
{
@@ -568,17 +796,7 @@ void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow)
struct table_instance *ufid_ti = ovsl_dereference(table->ufid_ti);
BUG_ON(table->count == 0);
- hlist_del_rcu(&flow->flow_table.node[ti->node_ver]);
- table->count--;
- if (ovs_identifier_is_ufid(&flow->id)) {
- hlist_del_rcu(&flow->ufid_table.node[ufid_ti->node_ver]);
- table->ufid_count--;
- }
-
- /* RCU delete the mask. 'flow->mask' is not NULLed, as it should be
- * accessible as long as the RCU read lock is held.
- */
- flow_mask_remove(table, flow->mask);
+ table_instance_flow_free(table, ti, ufid_ti, flow, true);
}
static struct sw_flow_mask *mask_alloc(void)
@@ -606,13 +824,16 @@ static bool mask_equal(const struct sw_flow_mask *a,
static struct sw_flow_mask *flow_mask_find(const struct flow_table *tbl,
const struct sw_flow_mask *mask)
{
- struct list_head *ml;
+ struct mask_array *ma;
+ int i;
+
+ ma = ovsl_dereference(tbl->mask_array);
+ for (i = 0; i < ma->max; i++) {
+ struct sw_flow_mask *t;
+ t = ovsl_dereference(ma->masks[i]);
- list_for_each(ml, &tbl->mask_list) {
- struct sw_flow_mask *m;
- m = container_of(ml, struct sw_flow_mask, list);
- if (mask_equal(mask, m))
- return m;
+ if (t && mask_equal(mask, t))
+ return t;
}
return NULL;
@@ -623,6 +844,7 @@ static int flow_mask_insert(struct flow_table *tbl, struct sw_flow *flow,
const struct sw_flow_mask *new)
{
struct sw_flow_mask *mask;
+
mask = flow_mask_find(tbl, new);
if (!mask) {
/* Allocate a new mask if none exsits. */
@@ -631,7 +853,12 @@ static int flow_mask_insert(struct flow_table *tbl, struct sw_flow *flow,
return -ENOMEM;
mask->key = new->key;
mask->range = new->range;
- list_add_rcu(&mask->list, &tbl->mask_list);
+
+ /* Add mask to mask-list. */
+ if (tbl_mask_array_add_mask(tbl, mask)) {
+ kfree(mask);
+ return -ENOMEM;
+ }
} else {
BUG_ON(!mask->ref_count);
mask->ref_count++;
diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h
index bc52045b63ff..8a5cea6ae111 100644
--- a/net/openvswitch/flow_table.h
+++ b/net/openvswitch/flow_table.h
@@ -22,6 +22,17 @@
#include "flow.h"
+struct mask_cache_entry {
+ u32 skb_hash;
+ u32 mask_index;
+};
+
+struct mask_array {
+ struct rcu_head rcu;
+ int count, max;
+ struct sw_flow_mask __rcu *masks[];
+};
+
struct table_instance {
struct hlist_head *buckets;
unsigned int n_buckets;
@@ -34,7 +45,8 @@ struct table_instance {
struct flow_table {
struct table_instance __rcu *ti;
struct table_instance __rcu *ufid_ti;
- struct list_head mask_list;
+ struct mask_cache_entry __percpu *mask_cache;
+ struct mask_array __rcu *mask_array;
unsigned long last_rehash;
unsigned int count;
unsigned int ufid_count;
@@ -60,8 +72,9 @@ int ovs_flow_tbl_num_masks(const struct flow_table *table);
struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *table,
u32 *bucket, u32 *idx);
struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *,
- const struct sw_flow_key *,
- u32 *n_mask_hit);
+ const struct sw_flow_key *,
+ u32 skb_hash,
+ u32 *n_mask_hit);
struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *,
const struct sw_flow_key *);
struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl,
diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c
index d2437b5b2f6a..58a7b8312c28 100644
--- a/net/openvswitch/vport-internal_dev.c
+++ b/net/openvswitch/vport-internal_dev.c
@@ -137,7 +137,7 @@ static void do_setup(struct net_device *netdev)
netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH |
IFF_NO_QUEUE;
netdev->needs_free_netdev = true;
- netdev->priv_destructor = internal_dev_destructor;
+ netdev->priv_destructor = NULL;
netdev->ethtool_ops = &internal_dev_ethtool_ops;
netdev->rtnl_link_ops = &internal_dev_link_ops;
@@ -159,7 +159,6 @@ static struct vport *internal_dev_create(const struct vport_parms *parms)
struct internal_dev *internal_dev;
struct net_device *dev;
int err;
- bool free_vport = true;
vport = ovs_vport_alloc(0, &ovs_internal_vport_ops, parms);
if (IS_ERR(vport)) {
@@ -190,10 +189,9 @@ static struct vport *internal_dev_create(const struct vport_parms *parms)
rtnl_lock();
err = register_netdevice(vport->dev);
- if (err) {
- free_vport = false;
+ if (err)
goto error_unlock;
- }
+ vport->dev->priv_destructor = internal_dev_destructor;
dev_set_promiscuity(vport->dev, 1);
rtnl_unlock();
@@ -207,8 +205,7 @@ error_unlock:
error_free_netdev:
free_netdev(dev);
error_free_vport:
- if (free_vport)
- ovs_vport_free(vport);
+ ovs_vport_free(vport);
error:
return ERR_PTR(err);
}
@@ -237,7 +234,7 @@ static netdev_tx_t internal_dev_recv(struct sk_buff *skb)
}
skb_dst_drop(skb);
- nf_reset(skb);
+ nf_reset_ct(skb);
secpath_reset(skb);
skb->pkt_type = PACKET_HOST;
diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
index 3fc38d16c456..5da9392b03d6 100644
--- a/net/openvswitch/vport.c
+++ b/net/openvswitch/vport.c
@@ -403,8 +403,9 @@ u32 ovs_vport_find_upcall_portid(const struct vport *vport, struct sk_buff *skb)
ids = rcu_dereference(vport->upcall_portids);
- if (ids->n_ids == 1 && ids->ids[0] == 0)
- return 0;
+ /* If there is only one portid, select it in the fast-path. */
+ if (ids->n_ids == 1)
+ return ids->ids[0];
hash = skb_get_hash(skb);
ids_index = hash - ids->n_ids * reciprocal_divide(hash, ids->rn_ids);
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index e2742b006d25..30c6879d6774 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -408,17 +408,17 @@ static int __packet_get_status(const struct packet_sock *po, void *frame)
}
}
-static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
+static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec64 *ts,
unsigned int flags)
{
struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
if (shhwtstamps &&
(flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
- ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
+ ktime_to_timespec64_cond(shhwtstamps->hwtstamp, ts))
return TP_STATUS_TS_RAW_HARDWARE;
- if (ktime_to_timespec_cond(skb->tstamp, ts))
+ if (ktime_to_timespec64_cond(skb->tstamp, ts))
return TP_STATUS_TS_SOFTWARE;
return 0;
@@ -428,13 +428,20 @@ static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
struct sk_buff *skb)
{
union tpacket_uhdr h;
- struct timespec ts;
+ struct timespec64 ts;
__u32 ts_status;
if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
return 0;
h.raw = frame;
+ /*
+ * versions 1 through 3 overflow the timestamps in y2106, since they
+ * all store the seconds in a 32-bit unsigned integer.
+ * If we create a version 4, that should have a 64-bit timestamp,
+ * either 64-bit seconds + 32-bit nanoseconds, or just 64-bit
+ * nanoseconds.
+ */
switch (po->tp_version) {
case TPACKET_V1:
h.h1->tp_sec = ts.tv_sec;
@@ -520,7 +527,7 @@ static int prb_calc_retire_blk_tmo(struct packet_sock *po,
int blk_size_in_bytes)
{
struct net_device *dev;
- unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
+ unsigned int mbits, div;
struct ethtool_link_ksettings ecmd;
int err;
@@ -532,30 +539,25 @@ static int prb_calc_retire_blk_tmo(struct packet_sock *po,
}
err = __ethtool_get_link_ksettings(dev, &ecmd);
rtnl_unlock();
- if (!err) {
- /*
- * If the link speed is so slow you don't really
- * need to worry about perf anyways
- */
- if (ecmd.base.speed < SPEED_1000 ||
- ecmd.base.speed == SPEED_UNKNOWN) {
- return DEFAULT_PRB_RETIRE_TOV;
- } else {
- msec = 1;
- div = ecmd.base.speed / 1000;
- }
- }
+ if (err)
+ return DEFAULT_PRB_RETIRE_TOV;
+
+ /* If the link speed is so slow you don't really
+ * need to worry about perf anyways
+ */
+ if (ecmd.base.speed < SPEED_1000 ||
+ ecmd.base.speed == SPEED_UNKNOWN)
+ return DEFAULT_PRB_RETIRE_TOV;
+ div = ecmd.base.speed / 1000;
mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
if (div)
mbits /= div;
- tmo = mbits * msec;
-
if (div)
- return tmo+1;
- return tmo;
+ return mbits + 1;
+ return mbits;
}
static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
@@ -774,8 +776,8 @@ static void prb_close_block(struct tpacket_kbdq_core *pkc1,
* It shouldn't really happen as we don't close empty
* blocks. See prb_retire_rx_blk_timer_expired().
*/
- struct timespec ts;
- getnstimeofday(&ts);
+ struct timespec64 ts;
+ ktime_get_real_ts64(&ts);
h1->ts_last_pkt.ts_sec = ts.tv_sec;
h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
}
@@ -805,7 +807,7 @@ static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
static void prb_open_block(struct tpacket_kbdq_core *pkc1,
struct tpacket_block_desc *pbd1)
{
- struct timespec ts;
+ struct timespec64 ts;
struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
smp_rmb();
@@ -818,7 +820,7 @@ static void prb_open_block(struct tpacket_kbdq_core *pkc1,
BLOCK_NUM_PKTS(pbd1) = 0;
BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
- getnstimeofday(&ts);
+ ktime_get_real_ts64(&ts);
h1->ts_first_pkt.ts_sec = ts.tv_sec;
h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
@@ -1295,15 +1297,21 @@ static void packet_sock_destruct(struct sock *sk)
static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
{
- u32 rxhash;
+ u32 *history = po->rollover->history;
+ u32 victim, rxhash;
int i, count = 0;
rxhash = skb_get_hash(skb);
for (i = 0; i < ROLLOVER_HLEN; i++)
- if (po->rollover->history[i] == rxhash)
+ if (READ_ONCE(history[i]) == rxhash)
count++;
- po->rollover->history[prandom_u32() % ROLLOVER_HLEN] = rxhash;
+ victim = prandom_u32() % ROLLOVER_HLEN;
+
+ /* Avoid dirtying the cache line if possible */
+ if (READ_ONCE(history[victim]) != rxhash)
+ WRITE_ONCE(history[victim], rxhash);
+
return count > (ROLLOVER_HLEN >> 1);
}
@@ -1821,7 +1829,7 @@ static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
skb_dst_drop(skb);
/* drop conntrack reference */
- nf_reset(skb);
+ nf_reset_ct(skb);
spkt = &PACKET_SKB_CB(skb)->sa.pkt;
@@ -2121,7 +2129,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
skb_dst_drop(skb);
/* drop conntrack reference */
- nf_reset(skb);
+ nf_reset_ct(skb);
spin_lock(&sk->sk_receive_queue.lock);
po->stats.stats1.tp_packets++;
@@ -2162,7 +2170,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
unsigned long status = TP_STATUS_USER;
unsigned short macoff, netoff, hdrlen;
struct sk_buff *copy_skb = NULL;
- struct timespec ts;
+ struct timespec64 ts;
__u32 ts_status;
bool is_drop_n_account = false;
bool do_vnet = false;
@@ -2294,7 +2302,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
- getnstimeofday(&ts);
+ ktime_get_real_ts64(&ts);
status |= ts_status;
diff --git a/net/phonet/pn_dev.c b/net/phonet/pn_dev.c
index 49bd76a87461..ac0fae06cc15 100644
--- a/net/phonet/pn_dev.c
+++ b/net/phonet/pn_dev.c
@@ -35,8 +35,6 @@ static unsigned int phonet_net_id __read_mostly;
static struct phonet_net *phonet_pernet(struct net *net)
{
- BUG_ON(!net);
-
return net_generic(net, phonet_net_id);
}
diff --git a/net/phonet/socket.c b/net/phonet/socket.c
index 96ea9f254ae9..76d499f6af9a 100644
--- a/net/phonet/socket.c
+++ b/net/phonet/socket.c
@@ -338,9 +338,9 @@ static __poll_t pn_socket_poll(struct file *file, struct socket *sock,
if (sk->sk_state == TCP_CLOSE)
return EPOLLERR;
- if (!skb_queue_empty(&sk->sk_receive_queue))
+ if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
mask |= EPOLLIN | EPOLLRDNORM;
- if (!skb_queue_empty(&pn->ctrlreq_queue))
+ if (!skb_queue_empty_lockless(&pn->ctrlreq_queue))
mask |= EPOLLPRI;
if (!mask && sk->sk_state == TCP_CLOSE_WAIT)
return EPOLLHUP;
diff --git a/net/psample/psample.c b/net/psample/psample.c
index 841f198ea1a8..6f2fbc6b9eb2 100644
--- a/net/psample/psample.c
+++ b/net/psample/psample.c
@@ -73,7 +73,7 @@ static int psample_nl_cmd_get_group_dumpit(struct sk_buff *msg,
int idx = 0;
int err;
- spin_lock(&psample_groups_lock);
+ spin_lock_bh(&psample_groups_lock);
list_for_each_entry(group, &psample_groups_list, list) {
if (!net_eq(group->net, sock_net(msg->sk)))
continue;
@@ -89,7 +89,7 @@ static int psample_nl_cmd_get_group_dumpit(struct sk_buff *msg,
idx++;
}
- spin_unlock(&psample_groups_lock);
+ spin_unlock_bh(&psample_groups_lock);
cb->args[0] = idx;
return msg->len;
}
@@ -154,7 +154,7 @@ static void psample_group_destroy(struct psample_group *group)
{
psample_group_notify(group, PSAMPLE_CMD_DEL_GROUP);
list_del(&group->list);
- kfree(group);
+ kfree_rcu(group, rcu);
}
static struct psample_group *
@@ -172,7 +172,7 @@ struct psample_group *psample_group_get(struct net *net, u32 group_num)
{
struct psample_group *group;
- spin_lock(&psample_groups_lock);
+ spin_lock_bh(&psample_groups_lock);
group = psample_group_lookup(net, group_num);
if (!group) {
@@ -183,19 +183,27 @@ struct psample_group *psample_group_get(struct net *net, u32 group_num)
group->refcount++;
out:
- spin_unlock(&psample_groups_lock);
+ spin_unlock_bh(&psample_groups_lock);
return group;
}
EXPORT_SYMBOL_GPL(psample_group_get);
+void psample_group_take(struct psample_group *group)
+{
+ spin_lock_bh(&psample_groups_lock);
+ group->refcount++;
+ spin_unlock_bh(&psample_groups_lock);
+}
+EXPORT_SYMBOL_GPL(psample_group_take);
+
void psample_group_put(struct psample_group *group)
{
- spin_lock(&psample_groups_lock);
+ spin_lock_bh(&psample_groups_lock);
if (--group->refcount == 0)
psample_group_destroy(group);
- spin_unlock(&psample_groups_lock);
+ spin_unlock_bh(&psample_groups_lock);
}
EXPORT_SYMBOL_GPL(psample_group_put);
@@ -221,7 +229,7 @@ void psample_sample_packet(struct psample_group *group, struct sk_buff *skb,
data_len = PSAMPLE_MAX_PACKET_SIZE - meta_len - NLA_HDRLEN
- NLA_ALIGNTO;
- nl_skb = genlmsg_new(meta_len + data_len, GFP_ATOMIC);
+ nl_skb = genlmsg_new(meta_len + nla_total_size(data_len), GFP_ATOMIC);
if (unlikely(!nl_skb))
return;
diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c
index 6c8b0f6d28f9..5a8e42ad1504 100644
--- a/net/qrtr/qrtr.c
+++ b/net/qrtr/qrtr.c
@@ -8,6 +8,8 @@
#include <linux/qrtr.h>
#include <linux/termios.h> /* For TIOCINQ/OUTQ */
#include <linux/numa.h>
+#include <linux/spinlock.h>
+#include <linux/wait.h>
#include <net/sock.h>
@@ -97,10 +99,11 @@ static inline struct qrtr_sock *qrtr_sk(struct sock *sk)
static unsigned int qrtr_local_nid = NUMA_NO_NODE;
/* for node ids */
-static RADIX_TREE(qrtr_nodes, GFP_KERNEL);
+static RADIX_TREE(qrtr_nodes, GFP_ATOMIC);
+static DEFINE_SPINLOCK(qrtr_nodes_lock);
/* broadcast list */
static LIST_HEAD(qrtr_all_nodes);
-/* lock for qrtr_nodes, qrtr_all_nodes and node reference */
+/* lock for qrtr_all_nodes and node reference */
static DEFINE_MUTEX(qrtr_node_lock);
/* local port allocation management */
@@ -113,8 +116,9 @@ static DEFINE_MUTEX(qrtr_port_lock);
* @ep: endpoint
* @ref: reference count for node
* @nid: node id
+ * @qrtr_tx_flow: tree of qrtr_tx_flow, keyed by node << 32 | port
+ * @qrtr_tx_lock: lock for qrtr_tx_flow inserts
* @rx_queue: receive queue
- * @work: scheduled work struct for recv work
* @item: list item for broadcast list
*/
struct qrtr_node {
@@ -123,17 +127,36 @@ struct qrtr_node {
struct kref ref;
unsigned int nid;
+ struct radix_tree_root qrtr_tx_flow;
+ struct mutex qrtr_tx_lock; /* for qrtr_tx_flow */
+
struct sk_buff_head rx_queue;
- struct work_struct work;
struct list_head item;
};
+/**
+ * struct qrtr_tx_flow - tx flow control
+ * @resume_tx: waiters for a resume tx from the remote
+ * @pending: number of waiting senders
+ * @tx_failed: indicates that a message with confirm_rx flag was lost
+ */
+struct qrtr_tx_flow {
+ struct wait_queue_head resume_tx;
+ int pending;
+ int tx_failed;
+};
+
+#define QRTR_TX_FLOW_HIGH 10
+#define QRTR_TX_FLOW_LOW 5
+
static int qrtr_local_enqueue(struct qrtr_node *node, struct sk_buff *skb,
int type, struct sockaddr_qrtr *from,
struct sockaddr_qrtr *to);
static int qrtr_bcast_enqueue(struct qrtr_node *node, struct sk_buff *skb,
int type, struct sockaddr_qrtr *from,
struct sockaddr_qrtr *to);
+static struct qrtr_sock *qrtr_port_lookup(int port);
+static void qrtr_port_put(struct qrtr_sock *ipc);
/* Release node resources and free the node.
*
@@ -143,14 +166,25 @@ static int qrtr_bcast_enqueue(struct qrtr_node *node, struct sk_buff *skb,
static void __qrtr_node_release(struct kref *kref)
{
struct qrtr_node *node = container_of(kref, struct qrtr_node, ref);
+ struct radix_tree_iter iter;
+ unsigned long flags;
+ void __rcu **slot;
+ spin_lock_irqsave(&qrtr_nodes_lock, flags);
if (node->nid != QRTR_EP_NID_AUTO)
radix_tree_delete(&qrtr_nodes, node->nid);
+ spin_unlock_irqrestore(&qrtr_nodes_lock, flags);
list_del(&node->item);
mutex_unlock(&qrtr_node_lock);
skb_queue_purge(&node->rx_queue);
+
+ /* Free tx flow counters */
+ radix_tree_for_each_slot(slot, &node->qrtr_tx_flow, &iter, 0) {
+ radix_tree_iter_delete(&node->qrtr_tx_flow, &iter, slot);
+ kfree(*slot);
+ }
kfree(node);
}
@@ -170,6 +204,126 @@ static void qrtr_node_release(struct qrtr_node *node)
kref_put_mutex(&node->ref, __qrtr_node_release, &qrtr_node_lock);
}
+/**
+ * qrtr_tx_resume() - reset flow control counter
+ * @node: qrtr_node that the QRTR_TYPE_RESUME_TX packet arrived on
+ * @skb: resume_tx packet
+ */
+static void qrtr_tx_resume(struct qrtr_node *node, struct sk_buff *skb)
+{
+ struct qrtr_ctrl_pkt *pkt = (struct qrtr_ctrl_pkt *)skb->data;
+ u64 remote_node = le32_to_cpu(pkt->client.node);
+ u32 remote_port = le32_to_cpu(pkt->client.port);
+ struct qrtr_tx_flow *flow;
+ unsigned long key;
+
+ key = remote_node << 32 | remote_port;
+
+ rcu_read_lock();
+ flow = radix_tree_lookup(&node->qrtr_tx_flow, key);
+ rcu_read_unlock();
+ if (flow) {
+ spin_lock(&flow->resume_tx.lock);
+ flow->pending = 0;
+ spin_unlock(&flow->resume_tx.lock);
+ wake_up_interruptible_all(&flow->resume_tx);
+ }
+
+ consume_skb(skb);
+}
+
+/**
+ * qrtr_tx_wait() - flow control for outgoing packets
+ * @node: qrtr_node that the packet is to be send to
+ * @dest_node: node id of the destination
+ * @dest_port: port number of the destination
+ * @type: type of message
+ *
+ * The flow control scheme is based around the low and high "watermarks". When
+ * the low watermark is passed the confirm_rx flag is set on the outgoing
+ * message, which will trigger the remote to send a control message of the type
+ * QRTR_TYPE_RESUME_TX to reset the counter. If the high watermark is hit
+ * further transmision should be paused.
+ *
+ * Return: 1 if confirm_rx should be set, 0 otherwise or errno failure
+ */
+static int qrtr_tx_wait(struct qrtr_node *node, int dest_node, int dest_port,
+ int type)
+{
+ unsigned long key = (u64)dest_node << 32 | dest_port;
+ struct qrtr_tx_flow *flow;
+ int confirm_rx = 0;
+ int ret;
+
+ /* Never set confirm_rx on non-data packets */
+ if (type != QRTR_TYPE_DATA)
+ return 0;
+
+ mutex_lock(&node->qrtr_tx_lock);
+ flow = radix_tree_lookup(&node->qrtr_tx_flow, key);
+ if (!flow) {
+ flow = kzalloc(sizeof(*flow), GFP_KERNEL);
+ if (flow) {
+ init_waitqueue_head(&flow->resume_tx);
+ radix_tree_insert(&node->qrtr_tx_flow, key, flow);
+ }
+ }
+ mutex_unlock(&node->qrtr_tx_lock);
+
+ /* Set confirm_rx if we where unable to find and allocate a flow */
+ if (!flow)
+ return 1;
+
+ spin_lock_irq(&flow->resume_tx.lock);
+ ret = wait_event_interruptible_locked_irq(flow->resume_tx,
+ flow->pending < QRTR_TX_FLOW_HIGH ||
+ flow->tx_failed ||
+ !node->ep);
+ if (ret < 0) {
+ confirm_rx = ret;
+ } else if (!node->ep) {
+ confirm_rx = -EPIPE;
+ } else if (flow->tx_failed) {
+ flow->tx_failed = 0;
+ confirm_rx = 1;
+ } else {
+ flow->pending++;
+ confirm_rx = flow->pending == QRTR_TX_FLOW_LOW;
+ }
+ spin_unlock_irq(&flow->resume_tx.lock);
+
+ return confirm_rx;
+}
+
+/**
+ * qrtr_tx_flow_failed() - flag that tx of confirm_rx flagged messages failed
+ * @node: qrtr_node that the packet is to be send to
+ * @dest_node: node id of the destination
+ * @dest_port: port number of the destination
+ *
+ * Signal that the transmission of a message with confirm_rx flag failed. The
+ * flow's "pending" counter will keep incrementing towards QRTR_TX_FLOW_HIGH,
+ * at which point transmission would stall forever waiting for the resume TX
+ * message associated with the dropped confirm_rx message.
+ * Work around this by marking the flow as having a failed transmission and
+ * cause the next transmission attempt to be sent with the confirm_rx.
+ */
+static void qrtr_tx_flow_failed(struct qrtr_node *node, int dest_node,
+ int dest_port)
+{
+ unsigned long key = (u64)dest_node << 32 | dest_port;
+ struct qrtr_tx_flow *flow;
+
+ rcu_read_lock();
+ flow = radix_tree_lookup(&node->qrtr_tx_flow, key);
+ rcu_read_unlock();
+ if (flow) {
+ spin_lock_irq(&flow->resume_tx.lock);
+ flow->tx_failed = 1;
+ spin_unlock_irq(&flow->resume_tx.lock);
+ }
+}
+
/* Pass an outgoing packet socket buffer to the endpoint driver. */
static int qrtr_node_enqueue(struct qrtr_node *node, struct sk_buff *skb,
int type, struct sockaddr_qrtr *from,
@@ -178,6 +332,13 @@ static int qrtr_node_enqueue(struct qrtr_node *node, struct sk_buff *skb,
struct qrtr_hdr_v1 *hdr;
size_t len = skb->len;
int rc = -ENODEV;
+ int confirm_rx;
+
+ confirm_rx = qrtr_tx_wait(node, to->sq_node, to->sq_port, type);
+ if (confirm_rx < 0) {
+ kfree_skb(skb);
+ return confirm_rx;
+ }
hdr = skb_push(skb, sizeof(*hdr));
hdr->version = cpu_to_le32(QRTR_PROTO_VER_1);
@@ -193,9 +354,9 @@ static int qrtr_node_enqueue(struct qrtr_node *node, struct sk_buff *skb,
}
hdr->size = cpu_to_le32(len);
- hdr->confirm_rx = 0;
+ hdr->confirm_rx = !!confirm_rx;
- skb_put_padto(skb, ALIGN(len, 4));
+ skb_put_padto(skb, ALIGN(len, 4) + sizeof(*hdr));
mutex_lock(&node->ep_lock);
if (node->ep)
@@ -204,6 +365,11 @@ static int qrtr_node_enqueue(struct qrtr_node *node, struct sk_buff *skb,
kfree_skb(skb);
mutex_unlock(&node->ep_lock);
+ /* Need to ensure that a subsequent message carries the otherwise lost
+ * confirm_rx flag if we dropped this one */
+ if (rc && confirm_rx)
+ qrtr_tx_flow_failed(node, to->sq_node, to->sq_port);
+
return rc;
}
@@ -214,11 +380,12 @@ static int qrtr_node_enqueue(struct qrtr_node *node, struct sk_buff *skb,
static struct qrtr_node *qrtr_node_lookup(unsigned int nid)
{
struct qrtr_node *node;
+ unsigned long flags;
- mutex_lock(&qrtr_node_lock);
+ spin_lock_irqsave(&qrtr_nodes_lock, flags);
node = radix_tree_lookup(&qrtr_nodes, nid);
node = qrtr_node_acquire(node);
- mutex_unlock(&qrtr_node_lock);
+ spin_unlock_irqrestore(&qrtr_nodes_lock, flags);
return node;
}
@@ -230,13 +397,15 @@ static struct qrtr_node *qrtr_node_lookup(unsigned int nid)
*/
static void qrtr_node_assign(struct qrtr_node *node, unsigned int nid)
{
+ unsigned long flags;
+
if (node->nid != QRTR_EP_NID_AUTO || nid == QRTR_EP_NID_AUTO)
return;
- mutex_lock(&qrtr_node_lock);
+ spin_lock_irqsave(&qrtr_nodes_lock, flags);
radix_tree_insert(&qrtr_nodes, nid, node);
node->nid = nid;
- mutex_unlock(&qrtr_node_lock);
+ spin_unlock_irqrestore(&qrtr_nodes_lock, flags);
}
/**
@@ -252,6 +421,7 @@ int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len)
struct qrtr_node *node = ep->node;
const struct qrtr_hdr_v1 *v1;
const struct qrtr_hdr_v2 *v2;
+ struct qrtr_sock *ipc;
struct sk_buff *skb;
struct qrtr_cb *cb;
unsigned int size;
@@ -310,13 +480,26 @@ int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len)
if (len != ALIGN(size, 4) + hdrlen)
goto err;
- if (cb->dst_port != QRTR_PORT_CTRL && cb->type != QRTR_TYPE_DATA)
+ if (cb->dst_port != QRTR_PORT_CTRL && cb->type != QRTR_TYPE_DATA &&
+ cb->type != QRTR_TYPE_RESUME_TX)
goto err;
skb_put_data(skb, data + hdrlen, size);
- skb_queue_tail(&node->rx_queue, skb);
- schedule_work(&node->work);
+ qrtr_node_assign(node, cb->src_node);
+
+ if (cb->type == QRTR_TYPE_RESUME_TX) {
+ qrtr_tx_resume(node, skb);
+ } else {
+ ipc = qrtr_port_lookup(cb->dst_port);
+ if (!ipc)
+ goto err;
+
+ if (sock_queue_rcv_skb(&ipc->sk, skb))
+ goto err;
+
+ qrtr_port_put(ipc);
+ }
return 0;
@@ -351,61 +534,6 @@ static struct sk_buff *qrtr_alloc_ctrl_packet(struct qrtr_ctrl_pkt **pkt)
return skb;
}
-static struct qrtr_sock *qrtr_port_lookup(int port);
-static void qrtr_port_put(struct qrtr_sock *ipc);
-
-/* Handle and route a received packet.
- *
- * This will auto-reply with resume-tx packet as necessary.
- */
-static void qrtr_node_rx_work(struct work_struct *work)
-{
- struct qrtr_node *node = container_of(work, struct qrtr_node, work);
- struct qrtr_ctrl_pkt *pkt;
- struct sockaddr_qrtr dst;
- struct sockaddr_qrtr src;
- struct sk_buff *skb;
-
- while ((skb = skb_dequeue(&node->rx_queue)) != NULL) {
- struct qrtr_sock *ipc;
- struct qrtr_cb *cb;
- int confirm;
-
- cb = (struct qrtr_cb *)skb->cb;
- src.sq_node = cb->src_node;
- src.sq_port = cb->src_port;
- dst.sq_node = cb->dst_node;
- dst.sq_port = cb->dst_port;
- confirm = !!cb->confirm_rx;
-
- qrtr_node_assign(node, cb->src_node);
-
- ipc = qrtr_port_lookup(cb->dst_port);
- if (!ipc) {
- kfree_skb(skb);
- } else {
- if (sock_queue_rcv_skb(&ipc->sk, skb))
- kfree_skb(skb);
-
- qrtr_port_put(ipc);
- }
-
- if (confirm) {
- skb = qrtr_alloc_ctrl_packet(&pkt);
- if (!skb)
- break;
-
- pkt->cmd = cpu_to_le32(QRTR_TYPE_RESUME_TX);
- pkt->client.node = cpu_to_le32(dst.sq_node);
- pkt->client.port = cpu_to_le32(dst.sq_port);
-
- if (qrtr_node_enqueue(node, skb, QRTR_TYPE_RESUME_TX,
- &dst, &src))
- break;
- }
- }
-}
-
/**
* qrtr_endpoint_register() - register a new endpoint
* @ep: endpoint to register
@@ -425,13 +553,15 @@ int qrtr_endpoint_register(struct qrtr_endpoint *ep, unsigned int nid)
if (!node)
return -ENOMEM;
- INIT_WORK(&node->work, qrtr_node_rx_work);
kref_init(&node->ref);
mutex_init(&node->ep_lock);
skb_queue_head_init(&node->rx_queue);
node->nid = QRTR_EP_NID_AUTO;
node->ep = ep;
+ INIT_RADIX_TREE(&node->qrtr_tx_flow, GFP_KERNEL);
+ mutex_init(&node->qrtr_tx_lock);
+
qrtr_node_assign(node, nid);
mutex_lock(&qrtr_node_lock);
@@ -452,8 +582,11 @@ void qrtr_endpoint_unregister(struct qrtr_endpoint *ep)
struct qrtr_node *node = ep->node;
struct sockaddr_qrtr src = {AF_QIPCRTR, node->nid, QRTR_PORT_CTRL};
struct sockaddr_qrtr dst = {AF_QIPCRTR, qrtr_local_nid, QRTR_PORT_CTRL};
+ struct radix_tree_iter iter;
struct qrtr_ctrl_pkt *pkt;
+ struct qrtr_tx_flow *flow;
struct sk_buff *skb;
+ void __rcu **slot;
mutex_lock(&node->ep_lock);
node->ep = NULL;
@@ -466,6 +599,14 @@ void qrtr_endpoint_unregister(struct qrtr_endpoint *ep)
qrtr_local_enqueue(NULL, skb, QRTR_TYPE_BYE, &src, &dst);
}
+ /* Wake up any transmitters waiting for resume-tx from the node */
+ mutex_lock(&node->qrtr_tx_lock);
+ radix_tree_for_each_slot(slot, &node->qrtr_tx_flow, &iter, 0) {
+ flow = *slot;
+ wake_up_interruptible_all(&flow->resume_tx);
+ }
+ mutex_unlock(&node->qrtr_tx_lock);
+
qrtr_node_release(node);
ep->node = NULL;
}
@@ -482,11 +623,11 @@ static struct qrtr_sock *qrtr_port_lookup(int port)
if (port == QRTR_PORT_CTRL)
port = 0;
- mutex_lock(&qrtr_port_lock);
+ rcu_read_lock();
ipc = idr_find(&qrtr_ports, port);
if (ipc)
sock_hold(&ipc->sk);
- mutex_unlock(&qrtr_port_lock);
+ rcu_read_unlock();
return ipc;
}
@@ -528,6 +669,10 @@ static void qrtr_port_remove(struct qrtr_sock *ipc)
mutex_lock(&qrtr_port_lock);
idr_remove(&qrtr_ports, port);
mutex_unlock(&qrtr_port_lock);
+
+ /* Ensure that if qrtr_port_lookup() did enter the RCU read section we
+ * wait for it to up increment the refcount */
+ synchronize_rcu();
}
/* Assign port number to socket.
@@ -815,6 +960,34 @@ out_node:
return rc;
}
+static int qrtr_send_resume_tx(struct qrtr_cb *cb)
+{
+ struct sockaddr_qrtr remote = { AF_QIPCRTR, cb->src_node, cb->src_port };
+ struct sockaddr_qrtr local = { AF_QIPCRTR, cb->dst_node, cb->dst_port };
+ struct qrtr_ctrl_pkt *pkt;
+ struct qrtr_node *node;
+ struct sk_buff *skb;
+ int ret;
+
+ node = qrtr_node_lookup(remote.sq_node);
+ if (!node)
+ return -EINVAL;
+
+ skb = qrtr_alloc_ctrl_packet(&pkt);
+ if (!skb)
+ return -ENOMEM;
+
+ pkt->cmd = cpu_to_le32(QRTR_TYPE_RESUME_TX);
+ pkt->client.node = cpu_to_le32(cb->dst_node);
+ pkt->client.port = cpu_to_le32(cb->dst_port);
+
+ ret = qrtr_node_enqueue(node, skb, QRTR_TYPE_RESUME_TX, &local, &remote);
+
+ qrtr_node_release(node);
+
+ return ret;
+}
+
static int qrtr_recvmsg(struct socket *sock, struct msghdr *msg,
size_t size, int flags)
{
@@ -837,6 +1010,7 @@ static int qrtr_recvmsg(struct socket *sock, struct msghdr *msg,
release_sock(sk);
return rc;
}
+ cb = (struct qrtr_cb *)skb->cb;
copied = skb->len;
if (copied > size) {
@@ -850,7 +1024,6 @@ static int qrtr_recvmsg(struct socket *sock, struct msghdr *msg,
rc = copied;
if (addr) {
- cb = (struct qrtr_cb *)skb->cb;
addr->sq_family = AF_QIPCRTR;
addr->sq_node = cb->src_node;
addr->sq_port = cb->src_port;
@@ -858,6 +1031,9 @@ static int qrtr_recvmsg(struct socket *sock, struct msghdr *msg,
}
out:
+ if (cb->confirm_rx)
+ qrtr_send_resume_tx(cb);
+
skb_free_datagram(sk, skb);
release_sock(sk);
diff --git a/net/qrtr/tun.c b/net/qrtr/tun.c
index ccff1e544c21..15ce9b642b25 100644
--- a/net/qrtr/tun.c
+++ b/net/qrtr/tun.c
@@ -84,11 +84,14 @@ static ssize_t qrtr_tun_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (!kbuf)
return -ENOMEM;
- if (!copy_from_iter_full(kbuf, len, from))
+ if (!copy_from_iter_full(kbuf, len, from)) {
+ kfree(kbuf);
return -EFAULT;
+ }
ret = qrtr_endpoint_post(&tun->ep, kbuf, len);
+ kfree(kbuf);
return ret < 0 ? ret : len;
}
@@ -108,15 +111,11 @@ static __poll_t qrtr_tun_poll(struct file *filp, poll_table *wait)
static int qrtr_tun_release(struct inode *inode, struct file *filp)
{
struct qrtr_tun *tun = filp->private_data;
- struct sk_buff *skb;
qrtr_endpoint_unregister(&tun->ep);
/* Discard all SKBs */
- while (!skb_queue_empty(&tun->queue)) {
- skb = skb_dequeue(&tun->queue);
- kfree_skb(skb);
- }
+ skb_queue_purge(&tun->queue);
kfree(tun);
diff --git a/net/rds/Kconfig b/net/rds/Kconfig
index 38ea7f0f2699..c64e154bc18f 100644
--- a/net/rds/Kconfig
+++ b/net/rds/Kconfig
@@ -23,6 +23,6 @@ config RDS_TCP
This transport does not support RDMA operations.
config RDS_DEBUG
- bool "RDS debugging messages"
+ bool "RDS debugging messages"
depends on RDS
- default n
+ default n
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index 2b969f99ef13..1a5bf3fa4578 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -559,7 +559,7 @@ static int rds_connect(struct socket *sock, struct sockaddr *uaddr,
ret = -EDESTADDRREQ;
break;
}
- if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) ||
+ if (ipv4_is_multicast(sin->sin_addr.s_addr) ||
sin->sin_addr.s_addr == htonl(INADDR_BROADCAST)) {
ret = -EINVAL;
break;
@@ -593,7 +593,7 @@ static int rds_connect(struct socket *sock, struct sockaddr *uaddr,
addr4 = sin6->sin6_addr.s6_addr32[3];
if (addr4 == htonl(INADDR_ANY) ||
addr4 == htonl(INADDR_BROADCAST) ||
- IN_MULTICAST(ntohl(addr4))) {
+ ipv4_is_multicast(addr4)) {
ret = -EPROTOTYPE;
break;
}
@@ -705,7 +705,7 @@ static int rds_create(struct net *net, struct socket *sock, int protocol,
if (sock->type != SOCK_SEQPACKET || protocol)
return -ESOCKTNOSUPPORT;
- sk = sk_alloc(net, AF_RDS, GFP_ATOMIC, &rds_proto, kern);
+ sk = sk_alloc(net, AF_RDS, GFP_KERNEL, &rds_proto, kern);
if (!sk)
return -ENOMEM;
@@ -741,6 +741,10 @@ static void rds_sock_inc_info(struct socket *sock, unsigned int len,
spin_lock_bh(&rds_sock_lock);
list_for_each_entry(rs, &rds_sock_list, rs_item) {
+ /* This option only supports IPv4 sockets. */
+ if (!ipv6_addr_v4mapped(&rs->rs_bound_addr))
+ continue;
+
read_lock(&rs->rs_recv_lock);
/* XXX too lazy to maintain counts.. */
@@ -762,21 +766,60 @@ static void rds_sock_inc_info(struct socket *sock, unsigned int len,
lens->each = sizeof(struct rds_info_message);
}
+#if IS_ENABLED(CONFIG_IPV6)
+static void rds6_sock_inc_info(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens)
+{
+ struct rds_incoming *inc;
+ unsigned int total = 0;
+ struct rds_sock *rs;
+
+ len /= sizeof(struct rds6_info_message);
+
+ spin_lock_bh(&rds_sock_lock);
+
+ list_for_each_entry(rs, &rds_sock_list, rs_item) {
+ read_lock(&rs->rs_recv_lock);
+
+ list_for_each_entry(inc, &rs->rs_recv_queue, i_item) {
+ total++;
+ if (total <= len)
+ rds6_inc_info_copy(inc, iter, &inc->i_saddr,
+ &rs->rs_bound_addr, 1);
+ }
+
+ read_unlock(&rs->rs_recv_lock);
+ }
+
+ spin_unlock_bh(&rds_sock_lock);
+
+ lens->nr = total;
+ lens->each = sizeof(struct rds6_info_message);
+}
+#endif
+
static void rds_sock_info(struct socket *sock, unsigned int len,
struct rds_info_iterator *iter,
struct rds_info_lengths *lens)
{
struct rds_info_socket sinfo;
+ unsigned int cnt = 0;
struct rds_sock *rs;
len /= sizeof(struct rds_info_socket);
spin_lock_bh(&rds_sock_lock);
- if (len < rds_sock_count)
+ if (len < rds_sock_count) {
+ cnt = rds_sock_count;
goto out;
+ }
list_for_each_entry(rs, &rds_sock_list, rs_item) {
+ /* This option only supports IPv4 sockets. */
+ if (!ipv6_addr_v4mapped(&rs->rs_bound_addr))
+ continue;
sinfo.sndbuf = rds_sk_sndbuf(rs);
sinfo.rcvbuf = rds_sk_rcvbuf(rs);
sinfo.bound_addr = rs->rs_bound_addr_v4;
@@ -786,15 +829,51 @@ static void rds_sock_info(struct socket *sock, unsigned int len,
sinfo.inum = sock_i_ino(rds_rs_to_sk(rs));
rds_info_copy(iter, &sinfo, sizeof(sinfo));
+ cnt++;
}
out:
- lens->nr = rds_sock_count;
+ lens->nr = cnt;
lens->each = sizeof(struct rds_info_socket);
spin_unlock_bh(&rds_sock_lock);
}
+#if IS_ENABLED(CONFIG_IPV6)
+static void rds6_sock_info(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens)
+{
+ struct rds6_info_socket sinfo6;
+ struct rds_sock *rs;
+
+ len /= sizeof(struct rds6_info_socket);
+
+ spin_lock_bh(&rds_sock_lock);
+
+ if (len < rds_sock_count)
+ goto out;
+
+ list_for_each_entry(rs, &rds_sock_list, rs_item) {
+ sinfo6.sndbuf = rds_sk_sndbuf(rs);
+ sinfo6.rcvbuf = rds_sk_rcvbuf(rs);
+ sinfo6.bound_addr = rs->rs_bound_addr;
+ sinfo6.connected_addr = rs->rs_conn_addr;
+ sinfo6.bound_port = rs->rs_bound_port;
+ sinfo6.connected_port = rs->rs_conn_port;
+ sinfo6.inum = sock_i_ino(rds_rs_to_sk(rs));
+
+ rds_info_copy(iter, &sinfo6, sizeof(sinfo6));
+ }
+
+ out:
+ lens->nr = rds_sock_count;
+ lens->each = sizeof(struct rds6_info_socket);
+
+ spin_unlock_bh(&rds_sock_lock);
+}
+#endif
+
static void rds_exit(void)
{
sock_unregister(rds_family_ops.family);
@@ -808,6 +887,10 @@ static void rds_exit(void)
rds_bind_lock_destroy();
rds_info_deregister_func(RDS_INFO_SOCKETS, rds_sock_info);
rds_info_deregister_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info);
+#if IS_ENABLED(CONFIG_IPV6)
+ rds_info_deregister_func(RDS6_INFO_SOCKETS, rds6_sock_info);
+ rds_info_deregister_func(RDS6_INFO_RECV_MESSAGES, rds6_sock_inc_info);
+#endif
}
module_exit(rds_exit);
@@ -845,6 +928,10 @@ static int rds_init(void)
rds_info_register_func(RDS_INFO_SOCKETS, rds_sock_info);
rds_info_register_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info);
+#if IS_ENABLED(CONFIG_IPV6)
+ rds_info_register_func(RDS6_INFO_SOCKETS, rds6_sock_info);
+ rds_info_register_func(RDS6_INFO_RECV_MESSAGES, rds6_sock_inc_info);
+#endif
goto out;
diff --git a/net/rds/bind.c b/net/rds/bind.c
index 0f4398e7f2a7..5b5fb4ca8d3e 100644
--- a/net/rds/bind.c
+++ b/net/rds/bind.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -181,7 +181,7 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
if (addr_len < sizeof(struct sockaddr_in) ||
sin->sin_addr.s_addr == htonl(INADDR_ANY) ||
sin->sin_addr.s_addr == htonl(INADDR_BROADCAST) ||
- IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
+ ipv4_is_multicast(sin->sin_addr.s_addr))
return -EINVAL;
ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &v6addr);
binding_addr = &v6addr;
@@ -206,7 +206,7 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
addr4 = sin6->sin6_addr.s6_addr32[3];
if (addr4 == htonl(INADDR_ANY) ||
addr4 == htonl(INADDR_BROADCAST) ||
- IN_MULTICAST(ntohl(addr4)))
+ ipv4_is_multicast(addr4))
return -EINVAL;
}
/* The scope ID must be specified for link local address. */
@@ -239,34 +239,33 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
goto out;
}
- sock_set_flag(sk, SOCK_RCU_FREE);
- ret = rds_add_bound(rs, binding_addr, &port, scope_id);
- if (ret)
- goto out;
-
- if (rs->rs_transport) { /* previously bound */
+ /* The transport can be set using SO_RDS_TRANSPORT option before the
+ * socket is bound.
+ */
+ if (rs->rs_transport) {
trans = rs->rs_transport;
- if (trans->laddr_check(sock_net(sock->sk),
+ if (!trans->laddr_check ||
+ trans->laddr_check(sock_net(sock->sk),
binding_addr, scope_id) != 0) {
ret = -ENOPROTOOPT;
- rds_remove_bound(rs);
- } else {
- ret = 0;
+ goto out;
}
- goto out;
- }
- trans = rds_trans_get_preferred(sock_net(sock->sk), binding_addr,
- scope_id);
- if (!trans) {
- ret = -EADDRNOTAVAIL;
- rds_remove_bound(rs);
- pr_info_ratelimited("RDS: %s could not find a transport for %pI6c, load rds_tcp or rds_rdma?\n",
- __func__, binding_addr);
- goto out;
+ } else {
+ trans = rds_trans_get_preferred(sock_net(sock->sk),
+ binding_addr, scope_id);
+ if (!trans) {
+ ret = -EADDRNOTAVAIL;
+ pr_info_ratelimited("RDS: %s could not find a transport for %pI6c, load rds_tcp or rds_rdma?\n",
+ __func__, binding_addr);
+ goto out;
+ }
+ rs->rs_transport = trans;
}
- rs->rs_transport = trans;
- ret = 0;
+ sock_set_flag(sk, SOCK_RCU_FREE);
+ ret = rds_add_bound(rs, binding_addr, &port, scope_id);
+ if (ret)
+ rs->rs_transport = NULL;
out:
release_sock(sk);
diff --git a/net/rds/ib.c b/net/rds/ib.c
index ec05d91aa9a2..a792d8a3872a 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -30,6 +30,7 @@
* SOFTWARE.
*
*/
+#include <linux/dmapool.h>
#include <linux/kernel.h>
#include <linux/in.h>
#include <linux/if.h>
@@ -107,6 +108,7 @@ static void rds_ib_dev_free(struct work_struct *work)
rds_ib_destroy_mr_pool(rds_ibdev->mr_1m_pool);
if (rds_ibdev->pd)
ib_dealloc_pd(rds_ibdev->pd);
+ dma_pool_destroy(rds_ibdev->rid_hdrs_pool);
list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) {
list_del(&i_ipaddr->list);
@@ -143,6 +145,9 @@ static void rds_ib_add_one(struct ib_device *device)
refcount_set(&rds_ibdev->refcount, 1);
INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free);
+ INIT_LIST_HEAD(&rds_ibdev->ipaddr_list);
+ INIT_LIST_HEAD(&rds_ibdev->conn_list);
+
rds_ibdev->max_wrs = device->attrs.max_qp_wr;
rds_ibdev->max_sge = min(device->attrs.max_send_sge, RDS_IB_MAX_SGE);
@@ -151,6 +156,13 @@ static void rds_ib_add_one(struct ib_device *device)
has_fmr = (device->ops.alloc_fmr && device->ops.dealloc_fmr &&
device->ops.map_phys_fmr && device->ops.unmap_fmr);
rds_ibdev->use_fastreg = (has_fr && !has_fmr);
+ rds_ibdev->odp_capable =
+ !!(device->attrs.device_cap_flags &
+ IB_DEVICE_ON_DEMAND_PAGING) &&
+ !!(device->attrs.odp_caps.per_transport_caps.rc_odp_caps &
+ IB_ODP_SUPPORT_WRITE) &&
+ !!(device->attrs.odp_caps.per_transport_caps.rc_odp_caps &
+ IB_ODP_SUPPORT_READ);
rds_ibdev->fmr_max_remaps = device->attrs.max_map_per_fmr?: 32;
rds_ibdev->max_1m_mrs = device->attrs.max_mr ?
@@ -179,6 +191,12 @@ static void rds_ib_add_one(struct ib_device *device)
rds_ibdev->pd = NULL;
goto put_dev;
}
+ rds_ibdev->rid_hdrs_pool = dma_pool_create(device->name,
+ device->dma_device,
+ sizeof(struct rds_header),
+ L1_CACHE_BYTES, 0);
+ if (!rds_ibdev->rid_hdrs_pool)
+ goto put_dev;
rds_ibdev->mr_1m_pool =
rds_ib_create_mr_pool(rds_ibdev, RDS_IB_MR_1M_POOL);
@@ -203,9 +221,6 @@ static void rds_ib_add_one(struct ib_device *device)
device->name,
rds_ibdev->use_fastreg ? "FRMR" : "FMR");
- INIT_LIST_HEAD(&rds_ibdev->ipaddr_list);
- INIT_LIST_HEAD(&rds_ibdev->conn_list);
-
down_write(&rds_ib_devices_lock);
list_add_tail_rcu(&rds_ibdev->list, &rds_ib_devices);
up_write(&rds_ib_devices_lock);
@@ -291,7 +306,7 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn,
void *buffer)
{
struct rds_info_rdma_connection *iinfo = buffer;
- struct rds_ib_connection *ic;
+ struct rds_ib_connection *ic = conn->c_transport_data;
/* We will only ever look at IB transports */
if (conn->c_trans != &rds_ib_transport)
@@ -301,15 +316,16 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn,
iinfo->src_addr = conn->c_laddr.s6_addr32[3];
iinfo->dst_addr = conn->c_faddr.s6_addr32[3];
- iinfo->tos = conn->c_tos;
+ if (ic) {
+ iinfo->tos = conn->c_tos;
+ iinfo->sl = ic->i_sl;
+ }
memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid));
memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid));
if (rds_conn_state(conn) == RDS_CONN_UP) {
struct rds_ib_device *rds_ibdev;
- ic = conn->c_transport_data;
-
rdma_read_gids(ic->i_cm_id, (union ib_gid *)&iinfo->src_gid,
(union ib_gid *)&iinfo->dst_gid);
@@ -329,7 +345,7 @@ static int rds6_ib_conn_info_visitor(struct rds_connection *conn,
void *buffer)
{
struct rds6_info_rdma_connection *iinfo6 = buffer;
- struct rds_ib_connection *ic;
+ struct rds_ib_connection *ic = conn->c_transport_data;
/* We will only ever look at IB transports */
if (conn->c_trans != &rds_ib_transport)
@@ -337,6 +353,10 @@ static int rds6_ib_conn_info_visitor(struct rds_connection *conn,
iinfo6->src_addr = conn->c_laddr;
iinfo6->dst_addr = conn->c_faddr;
+ if (ic) {
+ iinfo6->tos = conn->c_tos;
+ iinfo6->sl = ic->i_sl;
+ }
memset(&iinfo6->src_gid, 0, sizeof(iinfo6->src_gid));
memset(&iinfo6->dst_gid, 0, sizeof(iinfo6->dst_gid));
@@ -344,7 +364,6 @@ static int rds6_ib_conn_info_visitor(struct rds_connection *conn,
if (rds_conn_state(conn) == RDS_CONN_UP) {
struct rds_ib_device *rds_ibdev;
- ic = conn->c_transport_data;
rdma_read_gids(ic->i_cm_id, (union ib_gid *)&iinfo6->src_gid,
(union ib_gid *)&iinfo6->dst_gid);
rds_ibdev = ic->rds_ibdev;
diff --git a/net/rds/ib.h b/net/rds/ib.h
index 303c6ee8bdb7..0296f1f7acda 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -165,8 +165,8 @@ struct rds_ib_connection {
/* tx */
struct rds_ib_work_ring i_send_ring;
struct rm_data_op *i_data_op;
- struct rds_header *i_send_hdrs;
- dma_addr_t i_send_hdrs_dma;
+ struct rds_header **i_send_hdrs;
+ dma_addr_t *i_send_hdrs_dma;
struct rds_ib_send_work *i_sends;
atomic_t i_signaled_sends;
@@ -175,8 +175,8 @@ struct rds_ib_connection {
struct rds_ib_work_ring i_recv_ring;
struct rds_ib_incoming *i_ibinc;
u32 i_recv_data_rem;
- struct rds_header *i_recv_hdrs;
- dma_addr_t i_recv_hdrs_dma;
+ struct rds_header **i_recv_hdrs;
+ dma_addr_t *i_recv_hdrs_dma;
struct rds_ib_recv_work *i_recvs;
u64 i_ack_recv; /* last ACK received */
struct rds_ib_refill_cache i_cache_incs;
@@ -220,6 +220,7 @@ struct rds_ib_connection {
/* Send/Recv vectors */
int i_scq_vector;
int i_rcq_vector;
+ u8 i_sl;
};
/* This assumes that atomic_t is at least 32 bits */
@@ -245,7 +246,9 @@ struct rds_ib_device {
struct list_head conn_list;
struct ib_device *dev;
struct ib_pd *pd;
- bool use_fastreg;
+ struct dma_pool *rid_hdrs_pool; /* RDS headers DMA pool */
+ u8 use_fastreg:1;
+ u8 odp_capable:1;
unsigned int max_mrs;
struct rds_ib_mr_pool *mr_1m_pool;
@@ -380,7 +383,11 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6);
void rds_ib_cm_connect_complete(struct rds_connection *conn,
struct rdma_cm_event *event);
-
+struct rds_header **rds_dma_hdrs_alloc(struct ib_device *ibdev,
+ struct dma_pool *pool,
+ dma_addr_t **dma_addrs, u32 num_hdrs);
+void rds_dma_hdrs_free(struct dma_pool *pool, struct rds_header **hdrs,
+ dma_addr_t *dma_addrs, u32 num_hdrs);
#define rds_ib_conn_error(conn, fmt...) \
__rds_ib_conn_error(conn, KERN_WARNING "RDS/IB: " fmt)
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index fddaa09f7b0d..c71f4328d138 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -30,6 +30,7 @@
* SOFTWARE.
*
*/
+#include <linux/dmapool.h>
#include <linux/kernel.h>
#include <linux/in.h>
#include <linux/slab.h>
@@ -152,6 +153,9 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
RDS_PROTOCOL_MINOR(conn->c_version),
ic->i_flowctl ? ", flow control" : "");
+ /* receive sl from the peer */
+ ic->i_sl = ic->i_cm_id->route.path_rec->sl;
+
atomic_set(&ic->i_cq_quiesce, 0);
/* Init rings and fill recv. this needs to wait until protocol
@@ -436,6 +440,68 @@ static inline void ibdev_put_vector(struct rds_ib_device *rds_ibdev, int index)
rds_ibdev->vector_load[index]--;
}
+/* Allocate DMA coherent memory to be used to store struct rds_header for
+ * sending/receiving packets. The pointers to the DMA memory and the
+ * associated DMA addresses are stored in two arrays.
+ *
+ * @ibdev: the IB device
+ * @pool: the DMA memory pool
+ * @dma_addrs: pointer to the array for storing DMA addresses
+ * @num_hdrs: number of headers to allocate
+ *
+ * It returns the pointer to the array storing the DMA memory pointers. On
+ * error, NULL pointer is returned.
+ */
+struct rds_header **rds_dma_hdrs_alloc(struct ib_device *ibdev,
+ struct dma_pool *pool,
+ dma_addr_t **dma_addrs, u32 num_hdrs)
+{
+ struct rds_header **hdrs;
+ dma_addr_t *hdr_daddrs;
+ u32 i;
+
+ hdrs = kvmalloc_node(sizeof(*hdrs) * num_hdrs, GFP_KERNEL,
+ ibdev_to_node(ibdev));
+ if (!hdrs)
+ return NULL;
+
+ hdr_daddrs = kvmalloc_node(sizeof(*hdr_daddrs) * num_hdrs, GFP_KERNEL,
+ ibdev_to_node(ibdev));
+ if (!hdr_daddrs) {
+ kvfree(hdrs);
+ return NULL;
+ }
+
+ for (i = 0; i < num_hdrs; i++) {
+ hdrs[i] = dma_pool_zalloc(pool, GFP_KERNEL, &hdr_daddrs[i]);
+ if (!hdrs[i]) {
+ rds_dma_hdrs_free(pool, hdrs, hdr_daddrs, i);
+ return NULL;
+ }
+ }
+
+ *dma_addrs = hdr_daddrs;
+ return hdrs;
+}
+
+/* Free the DMA memory used to store struct rds_header.
+ *
+ * @pool: the DMA memory pool
+ * @hdrs: pointer to the array storing DMA memory pointers
+ * @dma_addrs: pointer to the array storing DMA addresses
+ * @num_hdars: number of headers to free.
+ */
+void rds_dma_hdrs_free(struct dma_pool *pool, struct rds_header **hdrs,
+ dma_addr_t *dma_addrs, u32 num_hdrs)
+{
+ u32 i;
+
+ for (i = 0; i < num_hdrs; i++)
+ dma_pool_free(pool, hdrs[i], dma_addrs[i]);
+ kvfree(hdrs);
+ kvfree(dma_addrs);
+}
+
/*
* This needs to be very careful to not leave IS_ERR pointers around for
* cleanup to trip over.
@@ -447,7 +513,9 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
struct ib_qp_init_attr attr;
struct ib_cq_init_attr cq_attr = {};
struct rds_ib_device *rds_ibdev;
+ unsigned long max_wrs;
int ret, fr_queue_space;
+ struct dma_pool *pool;
/*
* It's normal to see a null device if an incoming connection races
@@ -466,10 +534,15 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
/* add the conn now so that connection establishment has the dev */
rds_ib_add_conn(rds_ibdev, conn);
- if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1)
- rds_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1);
- if (rds_ibdev->max_wrs < ic->i_recv_ring.w_nr + 1)
- rds_ib_ring_resize(&ic->i_recv_ring, rds_ibdev->max_wrs - 1);
+ max_wrs = rds_ibdev->max_wrs < rds_ib_sysctl_max_send_wr + 1 ?
+ rds_ibdev->max_wrs - 1 : rds_ib_sysctl_max_send_wr;
+ if (ic->i_send_ring.w_nr != max_wrs)
+ rds_ib_ring_resize(&ic->i_send_ring, max_wrs);
+
+ max_wrs = rds_ibdev->max_wrs < rds_ib_sysctl_max_recv_wr + 1 ?
+ rds_ibdev->max_wrs - 1 : rds_ib_sysctl_max_recv_wr;
+ if (ic->i_recv_ring.w_nr != max_wrs)
+ rds_ib_ring_resize(&ic->i_recv_ring, max_wrs);
/* Protection domain and memory range */
ic->i_pd = rds_ibdev->pd;
@@ -538,31 +611,28 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
goto recv_cq_out;
}
- ic->i_send_hdrs = ib_dma_alloc_coherent(dev,
- ic->i_send_ring.w_nr *
- sizeof(struct rds_header),
- &ic->i_send_hdrs_dma, GFP_KERNEL);
+ pool = rds_ibdev->rid_hdrs_pool;
+ ic->i_send_hdrs = rds_dma_hdrs_alloc(dev, pool, &ic->i_send_hdrs_dma,
+ ic->i_send_ring.w_nr);
if (!ic->i_send_hdrs) {
ret = -ENOMEM;
- rdsdebug("ib_dma_alloc_coherent send failed\n");
+ rdsdebug("DMA send hdrs alloc failed\n");
goto qp_out;
}
- ic->i_recv_hdrs = ib_dma_alloc_coherent(dev,
- ic->i_recv_ring.w_nr *
- sizeof(struct rds_header),
- &ic->i_recv_hdrs_dma, GFP_KERNEL);
+ ic->i_recv_hdrs = rds_dma_hdrs_alloc(dev, pool, &ic->i_recv_hdrs_dma,
+ ic->i_recv_ring.w_nr);
if (!ic->i_recv_hdrs) {
ret = -ENOMEM;
- rdsdebug("ib_dma_alloc_coherent recv failed\n");
+ rdsdebug("DMA recv hdrs alloc failed\n");
goto send_hdrs_dma_out;
}
- ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header),
- &ic->i_ack_dma, GFP_KERNEL);
+ ic->i_ack = dma_pool_zalloc(pool, GFP_KERNEL,
+ &ic->i_ack_dma);
if (!ic->i_ack) {
ret = -ENOMEM;
- rdsdebug("ib_dma_alloc_coherent ack failed\n");
+ rdsdebug("DMA ack header alloc failed\n");
goto recv_hdrs_dma_out;
}
@@ -593,17 +663,23 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
sends_out:
vfree(ic->i_sends);
+
ack_dma_out:
- ib_dma_free_coherent(dev, sizeof(struct rds_header),
- ic->i_ack, ic->i_ack_dma);
+ dma_pool_free(pool, ic->i_ack, ic->i_ack_dma);
+ ic->i_ack = NULL;
+
recv_hdrs_dma_out:
- ib_dma_free_coherent(dev, ic->i_recv_ring.w_nr *
- sizeof(struct rds_header),
- ic->i_recv_hdrs, ic->i_recv_hdrs_dma);
+ rds_dma_hdrs_free(pool, ic->i_recv_hdrs, ic->i_recv_hdrs_dma,
+ ic->i_recv_ring.w_nr);
+ ic->i_recv_hdrs = NULL;
+ ic->i_recv_hdrs_dma = NULL;
+
send_hdrs_dma_out:
- ib_dma_free_coherent(dev, ic->i_send_ring.w_nr *
- sizeof(struct rds_header),
- ic->i_send_hdrs, ic->i_send_hdrs_dma);
+ rds_dma_hdrs_free(pool, ic->i_send_hdrs, ic->i_send_hdrs_dma,
+ ic->i_send_ring.w_nr);
+ ic->i_send_hdrs = NULL;
+ ic->i_send_hdrs_dma = NULL;
+
qp_out:
rdma_destroy_qp(ic->i_cm_id);
recv_cq_out:
@@ -981,8 +1057,6 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp)
ic->i_cm_id ? ic->i_cm_id->qp : NULL);
if (ic->i_cm_id) {
- struct ib_device *dev = ic->i_cm_id->device;
-
rdsdebug("disconnecting cm %p\n", ic->i_cm_id);
err = rdma_disconnect(ic->i_cm_id);
if (err) {
@@ -1032,24 +1106,39 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp)
ib_destroy_cq(ic->i_recv_cq);
}
- /* then free the resources that ib callbacks use */
- if (ic->i_send_hdrs)
- ib_dma_free_coherent(dev,
- ic->i_send_ring.w_nr *
- sizeof(struct rds_header),
- ic->i_send_hdrs,
- ic->i_send_hdrs_dma);
-
- if (ic->i_recv_hdrs)
- ib_dma_free_coherent(dev,
- ic->i_recv_ring.w_nr *
- sizeof(struct rds_header),
- ic->i_recv_hdrs,
- ic->i_recv_hdrs_dma);
-
- if (ic->i_ack)
- ib_dma_free_coherent(dev, sizeof(struct rds_header),
- ic->i_ack, ic->i_ack_dma);
+ if (ic->rds_ibdev) {
+ struct dma_pool *pool;
+
+ pool = ic->rds_ibdev->rid_hdrs_pool;
+
+ /* then free the resources that ib callbacks use */
+ if (ic->i_send_hdrs) {
+ rds_dma_hdrs_free(pool, ic->i_send_hdrs,
+ ic->i_send_hdrs_dma,
+ ic->i_send_ring.w_nr);
+ ic->i_send_hdrs = NULL;
+ ic->i_send_hdrs_dma = NULL;
+ }
+
+ if (ic->i_recv_hdrs) {
+ rds_dma_hdrs_free(pool, ic->i_recv_hdrs,
+ ic->i_recv_hdrs_dma,
+ ic->i_recv_ring.w_nr);
+ ic->i_recv_hdrs = NULL;
+ ic->i_recv_hdrs_dma = NULL;
+ }
+
+ if (ic->i_ack) {
+ dma_pool_free(pool, ic->i_ack, ic->i_ack_dma);
+ ic->i_ack = NULL;
+ }
+ } else {
+ WARN_ON(ic->i_send_hdrs);
+ WARN_ON(ic->i_send_hdrs_dma);
+ WARN_ON(ic->i_recv_hdrs);
+ WARN_ON(ic->i_recv_hdrs_dma);
+ WARN_ON(ic->i_ack);
+ }
if (ic->i_sends)
rds_ib_send_clear_ring(ic);
@@ -1068,9 +1157,6 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp)
ic->i_pd = NULL;
ic->i_send_cq = NULL;
ic->i_recv_cq = NULL;
- ic->i_send_hdrs = NULL;
- ic->i_recv_hdrs = NULL;
- ic->i_ack = NULL;
}
BUG_ON(ic->rds_ibdev);
@@ -1096,8 +1182,9 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp)
ic->i_flowctl = 0;
atomic_set(&ic->i_credits, 0);
- rds_ib_ring_init(&ic->i_send_ring, rds_ib_sysctl_max_send_wr);
- rds_ib_ring_init(&ic->i_recv_ring, rds_ib_sysctl_max_recv_wr);
+ /* Re-init rings, but retain sizes. */
+ rds_ib_ring_init(&ic->i_send_ring, ic->i_send_ring.w_nr);
+ rds_ib_ring_init(&ic->i_recv_ring, ic->i_recv_ring.w_nr);
if (ic->i_ibinc) {
rds_inc_put(&ic->i_ibinc->ii_inc);
@@ -1144,8 +1231,8 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
* rds_ib_conn_shutdown() waits for these to be emptied so they
* must be initialized before it can be called.
*/
- rds_ib_ring_init(&ic->i_send_ring, rds_ib_sysctl_max_send_wr);
- rds_ib_ring_init(&ic->i_recv_ring, rds_ib_sysctl_max_recv_wr);
+ rds_ib_ring_init(&ic->i_send_ring, 0);
+ rds_ib_ring_init(&ic->i_recv_ring, 0);
ic->conn = conn;
conn->c_transport_data = ic;
diff --git a/net/rds/ib_mr.h b/net/rds/ib_mr.h
index 9045a8c0edff..0c8252d7fe2b 100644
--- a/net/rds/ib_mr.h
+++ b/net/rds/ib_mr.h
@@ -67,6 +67,7 @@ struct rds_ib_frmr {
/* This is stored as mr->r_trans_private. */
struct rds_ib_mr {
+ struct delayed_work work;
struct rds_ib_device *device;
struct rds_ib_mr_pool *pool;
struct rds_ib_connection *ic;
@@ -81,9 +82,11 @@ struct rds_ib_mr {
unsigned int sg_len;
int sg_dma_len;
+ u8 odp:1;
union {
struct rds_ib_fmr fmr;
struct rds_ib_frmr frmr;
+ struct ib_mr *mr;
} u;
};
@@ -122,12 +125,14 @@ void rds6_ib_get_mr_info(struct rds_ib_device *rds_ibdev,
void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *);
void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
struct rds_sock *rs, u32 *key_ret,
- struct rds_connection *conn);
+ struct rds_connection *conn, u64 start, u64 length,
+ int need_odp);
void rds_ib_sync_mr(void *trans_private, int dir);
void rds_ib_free_mr(void *trans_private, int invalidate);
void rds_ib_flush_mrs(void);
int rds_ib_mr_init(void);
void rds_ib_mr_exit(void);
+u32 rds_ib_get_lkey(void *trans_private);
void __rds_ib_teardown_mr(struct rds_ib_mr *);
void rds_ib_teardown_mr(struct rds_ib_mr *);
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index c8c1e3ae8d84..b34b24e237f8 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -37,8 +37,15 @@
#include "rds_single_path.h"
#include "ib_mr.h"
+#include "rds.h"
struct workqueue_struct *rds_ib_mr_wq;
+struct rds_ib_dereg_odp_mr {
+ struct work_struct work;
+ struct ib_mr *mr;
+};
+
+static void rds_ib_odp_mr_worker(struct work_struct *work);
static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr)
{
@@ -213,6 +220,9 @@ void rds_ib_sync_mr(void *trans_private, int direction)
struct rds_ib_mr *ibmr = trans_private;
struct rds_ib_device *rds_ibdev = ibmr->device;
+ if (ibmr->odp)
+ return;
+
switch (direction) {
case DMA_FROM_DEVICE:
ib_dma_sync_sg_for_cpu(rds_ibdev->dev, ibmr->sg,
@@ -482,6 +492,16 @@ void rds_ib_free_mr(void *trans_private, int invalidate)
rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len);
+ if (ibmr->odp) {
+ /* A MR created and marked as use_once. We use delayed work,
+ * because there is a change that we are in interrupt and can't
+ * call to ib_dereg_mr() directly.
+ */
+ INIT_DELAYED_WORK(&ibmr->work, rds_ib_odp_mr_worker);
+ queue_delayed_work(rds_ib_mr_wq, &ibmr->work, 0);
+ return;
+ }
+
/* Return it to the pool's free list */
if (rds_ibdev->use_fastreg)
rds_ib_free_frmr_list(ibmr);
@@ -526,9 +546,17 @@ void rds_ib_flush_mrs(void)
up_read(&rds_ib_devices_lock);
}
+u32 rds_ib_get_lkey(void *trans_private)
+{
+ struct rds_ib_mr *ibmr = trans_private;
+
+ return ibmr->u.mr->lkey;
+}
+
void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
struct rds_sock *rs, u32 *key_ret,
- struct rds_connection *conn)
+ struct rds_connection *conn,
+ u64 start, u64 length, int need_odp)
{
struct rds_ib_device *rds_ibdev;
struct rds_ib_mr *ibmr = NULL;
@@ -541,6 +569,51 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
goto out;
}
+ if (need_odp == ODP_ZEROBASED || need_odp == ODP_VIRTUAL) {
+ u64 virt_addr = need_odp == ODP_ZEROBASED ? 0 : start;
+ int access_flags =
+ (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ |
+ IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_ATOMIC |
+ IB_ACCESS_ON_DEMAND);
+ struct ib_sge sge = {};
+ struct ib_mr *ib_mr;
+
+ if (!rds_ibdev->odp_capable) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ ib_mr = ib_reg_user_mr(rds_ibdev->pd, start, length, virt_addr,
+ access_flags);
+
+ if (IS_ERR(ib_mr)) {
+ rdsdebug("rds_ib_get_user_mr returned %d\n",
+ IS_ERR(ib_mr));
+ ret = PTR_ERR(ib_mr);
+ goto out;
+ }
+ if (key_ret)
+ *key_ret = ib_mr->rkey;
+
+ ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL);
+ if (!ibmr) {
+ ib_dereg_mr(ib_mr);
+ ret = -ENOMEM;
+ goto out;
+ }
+ ibmr->u.mr = ib_mr;
+ ibmr->odp = 1;
+
+ sge.addr = virt_addr;
+ sge.length = length;
+ sge.lkey = ib_mr->lkey;
+
+ ib_advise_mr(rds_ibdev->pd,
+ IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE,
+ IB_UVERBS_ADVISE_MR_FLAG_FLUSH, &sge, 1);
+ return ibmr;
+ }
+
if (conn)
ic = conn->c_transport_data;
@@ -629,3 +702,12 @@ void rds_ib_mr_exit(void)
{
destroy_workqueue(rds_ib_mr_wq);
}
+
+static void rds_ib_odp_mr_worker(struct work_struct *work)
+{
+ struct rds_ib_mr *ibmr;
+
+ ibmr = container_of(work, struct rds_ib_mr, work.work);
+ ib_dereg_mr(ibmr->u.mr);
+ kfree(ibmr);
+}
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index 3cae88cbdaa0..694d411dc72f 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -61,7 +61,7 @@ void rds_ib_recv_init_ring(struct rds_ib_connection *ic)
recv->r_wr.num_sge = RDS_IB_RECV_SGE;
sge = &recv->r_sge[0];
- sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header));
+ sge->addr = ic->i_recv_hdrs_dma[i];
sge->length = sizeof(struct rds_header);
sge->lkey = ic->i_pd->local_dma_lkey;
@@ -343,7 +343,7 @@ static int rds_ib_recv_refill_one(struct rds_connection *conn,
WARN_ON(ret != 1);
sge = &recv->r_sge[0];
- sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header);
+ sge->addr = ic->i_recv_hdrs_dma[recv - ic->i_recvs];
sge->length = sizeof(struct rds_header);
sge = &recv->r_sge[1];
@@ -385,6 +385,7 @@ void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp)
unsigned int posted = 0;
int ret = 0;
bool can_wait = !!(gfp & __GFP_DIRECT_RECLAIM);
+ bool must_wake = false;
u32 pos;
/* the goal here is to just make sure that someone, somewhere
@@ -405,6 +406,7 @@ void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp)
recv = &ic->i_recvs[pos];
ret = rds_ib_recv_refill_one(conn, recv, gfp);
if (ret) {
+ must_wake = true;
break;
}
@@ -423,6 +425,11 @@ void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp)
}
posted++;
+
+ if ((posted > 128 && need_resched()) || posted > 8192) {
+ must_wake = true;
+ break;
+ }
}
/* We're doing flow control - update the window. */
@@ -445,10 +452,13 @@ void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp)
* if we should requeue.
*/
if (rds_conn_up(conn) &&
- ((can_wait && rds_ib_ring_low(&ic->i_recv_ring)) ||
+ (must_wake ||
+ (can_wait && rds_ib_ring_low(&ic->i_recv_ring)) ||
rds_ib_ring_empty(&ic->i_recv_ring))) {
queue_delayed_work(rds_wq, &conn->c_recv_w, 1);
}
+ if (can_wait)
+ cond_resched();
}
/*
@@ -851,7 +861,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
}
data_len -= sizeof(struct rds_header);
- ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs];
+ ihdr = ic->i_recv_hdrs[recv - ic->i_recvs];
/* Validate the checksum. */
if (!rds_message_verify_checksum(ihdr)) {
@@ -983,10 +993,11 @@ void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic,
} else {
/* We expect errors as the qp is drained during shutdown */
if (rds_conn_up(conn) || rds_conn_connecting(conn))
- rds_ib_conn_error(conn, "recv completion on <%pI6c,%pI6c, %d> had status %u (%s), disconnecting and reconnecting\n",
+ rds_ib_conn_error(conn, "recv completion on <%pI6c,%pI6c, %d> had status %u (%s), vendor err 0x%x, disconnecting and reconnecting\n",
&conn->c_laddr, &conn->c_faddr,
conn->c_tos, wc->status,
- ib_wc_status_msg(wc->status));
+ ib_wc_status_msg(wc->status),
+ wc->vendor_err);
}
/* rds_ib_process_recv() doesn't always consume the frag, and
@@ -1038,9 +1049,14 @@ int rds_ib_recv_init(void)
si_meminfo(&si);
rds_ib_sysctl_max_recv_allocation = si.totalram / 3 * PAGE_SIZE / RDS_FRAG_SIZE;
- rds_ib_incoming_slab = kmem_cache_create("rds_ib_incoming",
- sizeof(struct rds_ib_incoming),
- 0, SLAB_HWCACHE_ALIGN, NULL);
+ rds_ib_incoming_slab =
+ kmem_cache_create_usercopy("rds_ib_incoming",
+ sizeof(struct rds_ib_incoming),
+ 0, SLAB_HWCACHE_ALIGN,
+ offsetof(struct rds_ib_incoming,
+ ii_inc.i_usercopy),
+ sizeof(struct rds_inc_usercopy),
+ NULL);
if (!rds_ib_incoming_slab)
goto out;
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index dfe6237dafe2..dfe778220657 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -39,6 +39,7 @@
#include "rds_single_path.h"
#include "rds.h"
#include "ib.h"
+#include "ib_mr.h"
/*
* Convert IB-specific error message to RDS error message and call core
@@ -201,7 +202,8 @@ void rds_ib_send_init_ring(struct rds_ib_connection *ic)
send->s_wr.ex.imm_data = 0;
sge = &send->s_sge[0];
- sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header));
+ sge->addr = ic->i_send_hdrs_dma[i];
+
sge->length = sizeof(struct rds_header);
sge->lkey = ic->i_pd->local_dma_lkey;
@@ -300,10 +302,10 @@ void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
/* We expect errors as the qp is drained during shutdown */
if (wc->status != IB_WC_SUCCESS && rds_conn_up(conn)) {
- rds_ib_conn_error(conn, "send completion on <%pI6c,%pI6c,%d> had status %u (%s), disconnecting and reconnecting\n",
+ rds_ib_conn_error(conn, "send completion on <%pI6c,%pI6c,%d> had status %u (%s), vendor err 0x%x, disconnecting and reconnecting\n",
&conn->c_laddr, &conn->c_faddr,
conn->c_tos, wc->status,
- ib_wc_status_msg(wc->status));
+ ib_wc_status_msg(wc->status), wc->vendor_err);
}
}
@@ -631,11 +633,14 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
send->s_queued = jiffies;
send->s_op = NULL;
- send->s_sge[0].addr = ic->i_send_hdrs_dma
- + (pos * sizeof(struct rds_header));
+ send->s_sge[0].addr = ic->i_send_hdrs_dma[pos];
+
send->s_sge[0].length = sizeof(struct rds_header);
+ send->s_sge[0].lkey = ic->i_pd->local_dma_lkey;
+
+ memcpy(ic->i_send_hdrs[pos], &rm->m_inc.i_hdr,
+ sizeof(struct rds_header));
- memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header));
/* Set up the data, if present */
if (i < work_alloc
@@ -647,6 +652,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
send->s_sge[1].addr = sg_dma_address(scat);
send->s_sge[1].addr += rm->data.op_dmaoff;
send->s_sge[1].length = len;
+ send->s_sge[1].lkey = ic->i_pd->local_dma_lkey;
bytes_sent += len;
rm->data.op_dmaoff += len;
@@ -674,7 +680,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
&send->s_wr, send->s_wr.num_sge, send->s_wr.next);
if (ic->i_flowctl && adv_credits) {
- struct rds_header *hdr = &ic->i_send_hdrs[pos];
+ struct rds_header *hdr = ic->i_send_hdrs[pos];
/* add credit and redo the header checksum */
hdr->h_credit = adv_credits;
@@ -855,20 +861,29 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
int ret;
int num_sge;
int nr_sig = 0;
+ u64 odp_addr = op->op_odp_addr;
+ u32 odp_lkey = 0;
/* map the op the first time we see it */
- if (!op->op_mapped) {
- op->op_count = ib_dma_map_sg(ic->i_cm_id->device,
- op->op_sg, op->op_nents, (op->op_write) ?
- DMA_TO_DEVICE : DMA_FROM_DEVICE);
- rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count);
- if (op->op_count == 0) {
- rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
- ret = -ENOMEM; /* XXX ? */
- goto out;
+ if (!op->op_odp_mr) {
+ if (!op->op_mapped) {
+ op->op_count =
+ ib_dma_map_sg(ic->i_cm_id->device, op->op_sg,
+ op->op_nents,
+ (op->op_write) ? DMA_TO_DEVICE :
+ DMA_FROM_DEVICE);
+ rdsdebug("ic %p mapping op %p: %d\n", ic, op,
+ op->op_count);
+ if (op->op_count == 0) {
+ rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
+ ret = -ENOMEM; /* XXX ? */
+ goto out;
+ }
+ op->op_mapped = 1;
}
-
- op->op_mapped = 1;
+ } else {
+ op->op_count = op->op_nents;
+ odp_lkey = rds_ib_get_lkey(op->op_odp_mr->r_trans_private);
}
/*
@@ -920,14 +935,20 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
for (j = 0; j < send->s_rdma_wr.wr.num_sge &&
scat != &op->op_sg[op->op_count]; j++) {
len = sg_dma_len(scat);
- send->s_sge[j].addr = sg_dma_address(scat);
+ if (!op->op_odp_mr) {
+ send->s_sge[j].addr = sg_dma_address(scat);
+ send->s_sge[j].lkey = ic->i_pd->local_dma_lkey;
+ } else {
+ send->s_sge[j].addr = odp_addr;
+ send->s_sge[j].lkey = odp_lkey;
+ }
send->s_sge[j].length = len;
- send->s_sge[j].lkey = ic->i_pd->local_dma_lkey;
sent += len;
rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr);
remote_addr += len;
+ odp_addr += len;
scat++;
}
diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c
index 9252ad126335..ac46d8961b61 100644
--- a/net/rds/ib_stats.c
+++ b/net/rds/ib_stats.c
@@ -42,7 +42,7 @@ DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_ib_statistics, rds_ib_stats);
static const char *const rds_ib_stat_names[] = {
"ib_connect_raced",
"ib_listen_closed_stale",
- "s_ib_evt_handler_call",
+ "ib_evt_handler_call",
"ib_tasklet_call",
"ib_tx_cq_event",
"ib_tx_ring_full",
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index 916f5ec373d8..3341eee87bf9 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -156,11 +156,13 @@ void rds_rdma_drop_keys(struct rds_sock *rs)
static int rds_pin_pages(unsigned long user_addr, unsigned int nr_pages,
struct page **pages, int write)
{
+ unsigned int gup_flags = FOLL_LONGTERM;
int ret;
- ret = get_user_pages_fast(user_addr, nr_pages, write ? FOLL_WRITE : 0,
- pages);
+ if (write)
+ gup_flags |= FOLL_WRITE;
+ ret = get_user_pages_fast(user_addr, nr_pages, gup_flags, pages);
if (ret >= 0 && ret < nr_pages) {
while (ret--)
put_page(pages[ret]);
@@ -175,13 +177,14 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
struct rds_conn_path *cp)
{
struct rds_mr *mr = NULL, *found;
+ struct scatterlist *sg = NULL;
unsigned int nr_pages;
struct page **pages = NULL;
- struct scatterlist *sg;
void *trans_private;
unsigned long flags;
rds_rdma_cookie_t cookie;
- unsigned int nents;
+ unsigned int nents = 0;
+ int need_odp = 0;
long i;
int ret;
@@ -195,6 +198,21 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
goto out;
}
+ /* If the combination of the addr and size requested for this memory
+ * region causes an integer overflow, return error.
+ */
+ if (((args->vec.addr + args->vec.bytes) < args->vec.addr) ||
+ PAGE_ALIGN(args->vec.addr + args->vec.bytes) <
+ (args->vec.addr + args->vec.bytes)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (!can_do_mlock()) {
+ ret = -EPERM;
+ goto out;
+ }
+
nr_pages = rds_pages_in_vec(&args->vec);
if (nr_pages == 0) {
ret = -EINVAL;
@@ -248,36 +266,44 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
* the zero page.
*/
ret = rds_pin_pages(args->vec.addr, nr_pages, pages, 1);
- if (ret < 0)
- goto out;
-
- nents = ret;
- sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL);
- if (!sg) {
- ret = -ENOMEM;
+ if (ret == -EOPNOTSUPP) {
+ need_odp = 1;
+ } else if (ret <= 0) {
goto out;
- }
- WARN_ON(!nents);
- sg_init_table(sg, nents);
-
- /* Stick all pages into the scatterlist */
- for (i = 0 ; i < nents; i++)
- sg_set_page(&sg[i], pages[i], PAGE_SIZE, 0);
+ } else {
+ nents = ret;
+ sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL);
+ if (!sg) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ WARN_ON(!nents);
+ sg_init_table(sg, nents);
- rdsdebug("RDS: trans_private nents is %u\n", nents);
+ /* Stick all pages into the scatterlist */
+ for (i = 0 ; i < nents; i++)
+ sg_set_page(&sg[i], pages[i], PAGE_SIZE, 0);
+ rdsdebug("RDS: trans_private nents is %u\n", nents);
+ }
/* Obtain a transport specific MR. If this succeeds, the
* s/g list is now owned by the MR.
* Note that dma_map() implies that pending writes are
* flushed to RAM, so no dma_sync is needed here. */
- trans_private = rs->rs_transport->get_mr(sg, nents, rs,
- &mr->r_key,
- cp ? cp->cp_conn : NULL);
+ trans_private = rs->rs_transport->get_mr(
+ sg, nents, rs, &mr->r_key, cp ? cp->cp_conn : NULL,
+ args->vec.addr, args->vec.bytes,
+ need_odp ? ODP_ZEROBASED : ODP_NOT_NEEDED);
if (IS_ERR(trans_private)) {
- for (i = 0 ; i < nents; i++)
- put_page(sg_page(&sg[i]));
- kfree(sg);
+ /* In ODP case, we don't GUP pages, so don't need
+ * to release anything.
+ */
+ if (!need_odp) {
+ for (i = 0 ; i < nents; i++)
+ put_page(sg_page(&sg[i]));
+ kfree(sg);
+ }
ret = PTR_ERR(trans_private);
goto out;
}
@@ -291,7 +317,11 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
* map page aligned regions. So we keep the offset, and build
* a 64bit cookie containing <R_Key, offset> and pass that
* around. */
- cookie = rds_rdma_make_cookie(mr->r_key, args->vec.addr & ~PAGE_MASK);
+ if (need_odp)
+ cookie = rds_rdma_make_cookie(mr->r_key, 0);
+ else
+ cookie = rds_rdma_make_cookie(mr->r_key,
+ args->vec.addr & ~PAGE_MASK);
if (cookie_ret)
*cookie_ret = cookie;
@@ -456,22 +486,26 @@ void rds_rdma_free_op(struct rm_rdma_op *ro)
{
unsigned int i;
- for (i = 0; i < ro->op_nents; i++) {
- struct page *page = sg_page(&ro->op_sg[i]);
-
- /* Mark page dirty if it was possibly modified, which
- * is the case for a RDMA_READ which copies from remote
- * to local memory */
- if (!ro->op_write) {
- WARN_ON(!page->mapping && irqs_disabled());
- set_page_dirty(page);
+ if (ro->op_odp_mr) {
+ rds_mr_put(ro->op_odp_mr);
+ } else {
+ for (i = 0; i < ro->op_nents; i++) {
+ struct page *page = sg_page(&ro->op_sg[i]);
+
+ /* Mark page dirty if it was possibly modified, which
+ * is the case for a RDMA_READ which copies from remote
+ * to local memory
+ */
+ if (!ro->op_write)
+ set_page_dirty(page);
+ put_page(page);
}
- put_page(page);
}
kfree(ro->op_notifier);
ro->op_notifier = NULL;
ro->op_active = 0;
+ ro->op_odp_mr = NULL;
}
void rds_atomic_free_op(struct rm_atomic_op *ao)
@@ -581,6 +615,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
struct rds_iovec *iovs;
unsigned int i, j;
int ret = 0;
+ bool odp_supported = true;
if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args))
|| rm->rdma.op_active)
@@ -602,6 +637,9 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
ret = -EINVAL;
goto out_ret;
}
+ /* odp-mr is not supported for multiple requests within one message */
+ if (args->nr_local != 1)
+ odp_supported = false;
iovs = vec->iov;
@@ -623,6 +661,8 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
op->op_silent = !!(args->flags & RDS_RDMA_SILENT);
op->op_active = 1;
op->op_recverr = rs->rs_recverr;
+ op->op_odp_mr = NULL;
+
WARN_ON(!nr_pages);
op->op_sg = rds_message_alloc_sgs(rm, nr_pages, &ret);
if (!op->op_sg)
@@ -672,10 +712,44 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
* If it's a READ operation, we need to pin the pages for writing.
*/
ret = rds_pin_pages(iov->addr, nr, pages, !op->op_write);
- if (ret < 0)
+ if ((!odp_supported && ret <= 0) ||
+ (odp_supported && ret <= 0 && ret != -EOPNOTSUPP))
goto out_pages;
- else
- ret = 0;
+
+ if (ret == -EOPNOTSUPP) {
+ struct rds_mr *local_odp_mr;
+
+ if (!rs->rs_transport->get_mr) {
+ ret = -EOPNOTSUPP;
+ goto out_pages;
+ }
+ local_odp_mr =
+ kzalloc(sizeof(*local_odp_mr), GFP_KERNEL);
+ if (!local_odp_mr) {
+ ret = -ENOMEM;
+ goto out_pages;
+ }
+ RB_CLEAR_NODE(&local_odp_mr->r_rb_node);
+ refcount_set(&local_odp_mr->r_refcount, 1);
+ local_odp_mr->r_trans = rs->rs_transport;
+ local_odp_mr->r_sock = rs;
+ local_odp_mr->r_trans_private =
+ rs->rs_transport->get_mr(
+ NULL, 0, rs, &local_odp_mr->r_key, NULL,
+ iov->addr, iov->bytes, ODP_VIRTUAL);
+ if (IS_ERR(local_odp_mr->r_trans_private)) {
+ ret = IS_ERR(local_odp_mr->r_trans_private);
+ rdsdebug("get_mr ret %d %p\"", ret,
+ local_odp_mr->r_trans_private);
+ kfree(local_odp_mr);
+ ret = -EOPNOTSUPP;
+ goto out_pages;
+ }
+ rdsdebug("Need odp; local_odp_mr %p trans_private %p\n",
+ local_odp_mr, local_odp_mr->r_trans_private);
+ op->op_odp_mr = local_odp_mr;
+ op->op_odp_addr = iov->addr;
+ }
rdsdebug("RDS: nr_bytes %u nr %u iov->bytes %llu iov->addr %llx\n",
nr_bytes, nr, iov->bytes, iov->addr);
@@ -691,6 +765,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
min_t(unsigned int, iov->bytes, PAGE_SIZE - offset),
offset);
+ sg_dma_len(sg) = sg->length;
rdsdebug("RDS: sg->offset %x sg->len %x iov->addr %llx iov->bytes %llu\n",
sg->offset, sg->length, iov->addr, iov->bytes);
@@ -709,6 +784,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
goto out_pages;
}
op->op_bytes = nr_bytes;
+ ret = 0;
out_pages:
kfree(pages);
@@ -755,7 +831,8 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
if (mr) {
- mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE);
+ mr->r_trans->sync_mr(mr->r_trans_private,
+ DMA_TO_DEVICE);
rm->rdma.op_rdma_mr = mr;
}
return err;
diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c
index 9986d6065c4d..5f741e51b4ba 100644
--- a/net/rds/rdma_transport.c
+++ b/net/rds/rdma_transport.c
@@ -43,6 +43,9 @@ static struct rdma_cm_id *rds_rdma_listen_id;
static struct rdma_cm_id *rds6_rdma_listen_id;
#endif
+/* Per IB specification 7.7.3, service level is a 4-bit field. */
+#define TOS_TO_SL(tos) ((tos) & 0xF)
+
static int rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id,
struct rdma_cm_event *event,
bool isv6)
@@ -97,10 +100,13 @@ static int rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id,
struct rds_ib_connection *ibic;
ibic = conn->c_transport_data;
- if (ibic && ibic->i_cm_id == cm_id)
+ if (ibic && ibic->i_cm_id == cm_id) {
+ cm_id->route.path_rec[0].sl =
+ TOS_TO_SL(conn->c_tos);
ret = trans->cm_initiate_connect(cm_id, isv6);
- else
+ } else {
rds_conn_drop(conn);
+ }
}
break;
diff --git a/net/rds/rds.h b/net/rds/rds.h
index f0066d168499..e4a603523083 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -40,7 +40,6 @@
#ifdef ATOMIC64_INIT
#define KERNEL_HAS_ATOMIC64
#endif
-
#ifdef RDS_DEBUG
#define rdsdebug(fmt, args...) pr_debug("%s(): " fmt, __func__ , ##args)
#else
@@ -271,6 +270,12 @@ struct rds_ext_header_rdma_dest {
#define RDS_MSG_RX_END 2
#define RDS_MSG_RX_CMSG 3
+/* The following values are whitelisted for usercopy */
+struct rds_inc_usercopy {
+ rds_rdma_cookie_t rdma_cookie;
+ ktime_t rx_tstamp;
+};
+
struct rds_incoming {
refcount_t i_refcount;
struct list_head i_item;
@@ -280,8 +285,7 @@ struct rds_incoming {
unsigned long i_rx_jiffies;
struct in6_addr i_saddr;
- rds_rdma_cookie_t i_rdma_cookie;
- ktime_t i_rx_tstamp;
+ struct rds_inc_usercopy i_usercopy;
u64 i_rx_lat_trace[RDS_RX_MAX_TRACES];
};
@@ -473,6 +477,9 @@ struct rds_message {
struct rds_notifier *op_notifier;
struct rds_mr *op_rdma_mr;
+
+ u64 op_odp_addr;
+ struct rds_mr *op_odp_mr;
} rdma;
struct rm_data_op {
unsigned int op_active:1;
@@ -568,7 +575,8 @@ struct rds_transport {
void (*exit)(void);
void *(*get_mr)(struct scatterlist *sg, unsigned long nr_sg,
struct rds_sock *rs, u32 *key_ret,
- struct rds_connection *conn);
+ struct rds_connection *conn,
+ u64 start, u64 length, int need_odp);
void (*sync_mr)(void *trans_private, int direction);
void (*free_mr)(void *trans_private, int invalidate);
void (*flush_mrs)(void);
@@ -717,7 +725,7 @@ struct rds_statistics {
uint64_t s_cong_send_blocked;
uint64_t s_recv_bytes_added_to_socket;
uint64_t s_recv_bytes_removed_from_socket;
-
+ uint64_t s_send_stuck_rm;
};
/* af_rds.c */
@@ -951,6 +959,12 @@ static inline bool rds_destroy_pending(struct rds_connection *conn)
(conn->c_trans->t_unloading && conn->c_trans->t_unloading(conn));
}
+enum {
+ ODP_NOT_NEEDED,
+ ODP_ZEROBASED,
+ ODP_VIRTUAL
+};
+
/* stats.c */
DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats);
#define rds_stats_inc_which(which, member) do { \
diff --git a/net/rds/recv.c b/net/rds/recv.c
index 853de4876088..c8404971d5ab 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -47,8 +47,8 @@ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
INIT_LIST_HEAD(&inc->i_item);
inc->i_conn = conn;
inc->i_saddr = *saddr;
- inc->i_rdma_cookie = 0;
- inc->i_rx_tstamp = ktime_set(0, 0);
+ inc->i_usercopy.rdma_cookie = 0;
+ inc->i_usercopy.rx_tstamp = ktime_set(0, 0);
memset(inc->i_rx_lat_trace, 0, sizeof(inc->i_rx_lat_trace));
}
@@ -62,8 +62,8 @@ void rds_inc_path_init(struct rds_incoming *inc, struct rds_conn_path *cp,
inc->i_conn = cp->cp_conn;
inc->i_conn_path = cp;
inc->i_saddr = *saddr;
- inc->i_rdma_cookie = 0;
- inc->i_rx_tstamp = ktime_set(0, 0);
+ inc->i_usercopy.rdma_cookie = 0;
+ inc->i_usercopy.rx_tstamp = ktime_set(0, 0);
}
EXPORT_SYMBOL_GPL(rds_inc_path_init);
@@ -186,7 +186,7 @@ static void rds_recv_incoming_exthdrs(struct rds_incoming *inc, struct rds_sock
case RDS_EXTHDR_RDMA_DEST:
/* We ignore the size for now. We could stash it
* somewhere and use it for error checking. */
- inc->i_rdma_cookie = rds_rdma_make_cookie(
+ inc->i_usercopy.rdma_cookie = rds_rdma_make_cookie(
be32_to_cpu(buffer.rdma_dest.h_rdma_rkey),
be32_to_cpu(buffer.rdma_dest.h_rdma_offset));
@@ -380,7 +380,7 @@ void rds_recv_incoming(struct rds_connection *conn, struct in6_addr *saddr,
be32_to_cpu(inc->i_hdr.h_len),
inc->i_hdr.h_dport);
if (sock_flag(sk, SOCK_RCVTSTAMP))
- inc->i_rx_tstamp = ktime_get_real();
+ inc->i_usercopy.rx_tstamp = ktime_get_real();
rds_inc_addref(inc);
inc->i_rx_lat_trace[RDS_MSG_RX_END] = local_clock();
list_add_tail(&inc->i_item, &rs->rs_recv_queue);
@@ -540,16 +540,18 @@ static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg,
{
int ret = 0;
- if (inc->i_rdma_cookie) {
+ if (inc->i_usercopy.rdma_cookie) {
ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RDMA_DEST,
- sizeof(inc->i_rdma_cookie), &inc->i_rdma_cookie);
+ sizeof(inc->i_usercopy.rdma_cookie),
+ &inc->i_usercopy.rdma_cookie);
if (ret)
goto out;
}
- if ((inc->i_rx_tstamp != 0) &&
+ if ((inc->i_usercopy.rx_tstamp != 0) &&
sock_flag(rds_rs_to_sk(rs), SOCK_RCVTSTAMP)) {
- struct __kernel_old_timeval tv = ns_to_kernel_old_timeval(inc->i_rx_tstamp);
+ struct __kernel_old_timeval tv =
+ ns_to_kernel_old_timeval(inc->i_usercopy.rx_tstamp);
if (!sock_flag(rds_rs_to_sk(rs), SOCK_TSTAMP_NEW)) {
ret = put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD,
@@ -811,6 +813,7 @@ void rds6_inc_info_copy(struct rds_incoming *inc,
minfo6.seq = be64_to_cpu(inc->i_hdr.h_sequence);
minfo6.len = be32_to_cpu(inc->i_hdr.h_len);
+ minfo6.tos = inc->i_conn->c_tos;
if (flip) {
minfo6.laddr = *daddr;
@@ -824,6 +827,8 @@ void rds6_inc_info_copy(struct rds_incoming *inc,
minfo6.fport = inc->i_hdr.h_dport;
}
+ minfo6.flags = 0;
+
rds_info_copy(iter, &minfo6, sizeof(minfo6));
}
#endif
diff --git a/net/rds/send.c b/net/rds/send.c
index 031b1e97a466..82dcd8b84fe7 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -145,6 +145,7 @@ int rds_send_xmit(struct rds_conn_path *cp)
LIST_HEAD(to_be_dropped);
int batch_count;
unsigned long send_gen = 0;
+ int same_rm = 0;
restart:
batch_count = 0;
@@ -200,6 +201,17 @@ restart:
rm = cp->cp_xmit_rm;
+ if (!rm) {
+ same_rm = 0;
+ } else {
+ same_rm++;
+ if (same_rm >= 4096) {
+ rds_stats_inc(s_send_stuck_rm);
+ ret = -EAGAIN;
+ break;
+ }
+ }
+
/*
* If between sending messages, we can send a pending congestion
* map update.
@@ -1132,7 +1144,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
case AF_INET:
if (usin->sin_addr.s_addr == htonl(INADDR_ANY) ||
usin->sin_addr.s_addr == htonl(INADDR_BROADCAST) ||
- IN_MULTICAST(ntohl(usin->sin_addr.s_addr))) {
+ ipv4_is_multicast(usin->sin_addr.s_addr)) {
ret = -EINVAL;
goto out;
}
@@ -1163,7 +1175,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
addr4 = sin6->sin6_addr.s6_addr32[3];
if (addr4 == htonl(INADDR_ANY) ||
addr4 == htonl(INADDR_BROADCAST) ||
- IN_MULTICAST(ntohl(addr4))) {
+ ipv4_is_multicast(addr4)) {
ret = -EINVAL;
goto out;
}
diff --git a/net/rds/stats.c b/net/rds/stats.c
index 73be187d389e..9e87da43c004 100644
--- a/net/rds/stats.c
+++ b/net/rds/stats.c
@@ -76,6 +76,9 @@ static const char *const rds_stat_names[] = {
"cong_update_received",
"cong_send_error",
"cong_send_blocked",
+ "recv_bytes_added_to_sock",
+ "recv_bytes_freed_fromsock",
+ "send_stuck_rm",
};
void rds_stats_info_copy(struct rds_info_iterator *iter,
diff --git a/net/rfkill/core.c b/net/rfkill/core.c
index f9b08a6d8dbe..971c73c7d34c 100644
--- a/net/rfkill/core.c
+++ b/net/rfkill/core.c
@@ -1002,10 +1002,13 @@ static void rfkill_sync_work(struct work_struct *work)
int __must_check rfkill_register(struct rfkill *rfkill)
{
static unsigned long rfkill_no;
- struct device *dev = &rfkill->dev;
+ struct device *dev;
int error;
- BUG_ON(!rfkill);
+ if (!rfkill)
+ return -EINVAL;
+
+ dev = &rfkill->dev;
mutex_lock(&rfkill_global_mutex);
@@ -1311,15 +1314,17 @@ static const struct file_operations rfkill_fops = {
.release = rfkill_fop_release,
#ifdef CONFIG_RFKILL_INPUT
.unlocked_ioctl = rfkill_fop_ioctl,
- .compat_ioctl = rfkill_fop_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
#endif
.llseek = no_llseek,
};
+#define RFKILL_NAME "rfkill"
+
static struct miscdevice rfkill_miscdev = {
- .name = "rfkill",
.fops = &rfkill_fops,
- .minor = MISC_DYNAMIC_MINOR,
+ .name = RFKILL_NAME,
+ .minor = RFKILL_MINOR,
};
static int __init rfkill_init(void)
@@ -1371,3 +1376,6 @@ static void __exit rfkill_exit(void)
class_unregister(&rfkill_class);
}
module_exit(rfkill_exit);
+
+MODULE_ALIAS_MISCDEV(RFKILL_MINOR);
+MODULE_ALIAS("devname:" RFKILL_NAME);
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
index f0e9ccf472a9..1e8eeb044b07 100644
--- a/net/rose/af_rose.c
+++ b/net/rose/af_rose.c
@@ -65,28 +65,6 @@ static const struct proto_ops rose_proto_ops;
ax25_address rose_callsign;
/*
- * ROSE network devices are virtual network devices encapsulating ROSE
- * frames into AX.25 which will be sent through an AX.25 device, so form a
- * special "super class" of normal net devices; split their locks off into a
- * separate class since they always nest.
- */
-static struct lock_class_key rose_netdev_xmit_lock_key;
-static struct lock_class_key rose_netdev_addr_lock_key;
-
-static void rose_set_lockdep_one(struct net_device *dev,
- struct netdev_queue *txq,
- void *_unused)
-{
- lockdep_set_class(&txq->_xmit_lock, &rose_netdev_xmit_lock_key);
-}
-
-static void rose_set_lockdep_key(struct net_device *dev)
-{
- lockdep_set_class(&dev->addr_list_lock, &rose_netdev_addr_lock_key);
- netdev_for_each_tx_queue(dev, rose_set_lockdep_one, NULL);
-}
-
-/*
* Convert a ROSE address into text.
*/
char *rose2asc(char *buf, const rose_address *addr)
@@ -928,7 +906,7 @@ static int rose_accept(struct socket *sock, struct socket *newsock, int flags,
/* Now attach up the new socket */
skb->sk = NULL;
kfree_skb(skb);
- sk->sk_ack_backlog--;
+ sk_acceptq_removed(sk);
out_release:
release_sock(sk);
@@ -1033,7 +1011,7 @@ int rose_rx_call_request(struct sk_buff *skb, struct net_device *dev, struct ros
make_rose->va = 0;
make_rose->vr = 0;
make_rose->vl = 0;
- sk->sk_ack_backlog++;
+ sk_acceptq_added(sk);
rose_insert_socket(make);
@@ -1497,7 +1475,7 @@ static int __init rose_proto_init(void)
int rc;
if (rose_ndevs > 0x7FFFFFFF/sizeof(struct net_device *)) {
- printk(KERN_ERR "ROSE: rose_proto_init - rose_ndevs parameter to large\n");
+ printk(KERN_ERR "ROSE: rose_proto_init - rose_ndevs parameter too large\n");
rc = -EINVAL;
goto out;
}
@@ -1533,7 +1511,6 @@ static int __init rose_proto_init(void)
free_netdev(dev);
goto fail;
}
- rose_set_lockdep_key(dev);
dev_rose[i] = dev;
}
diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c
index c53307623236..5277631fa14c 100644
--- a/net/rose/rose_route.c
+++ b/net/rose/rose_route.c
@@ -696,7 +696,6 @@ struct rose_neigh *rose_get_neigh(rose_address *addr, unsigned char *cause,
for (i = 0; i < node->count; i++) {
if (!rose_ftimer_running(node->neighbour[i])) {
res = node->neighbour[i];
- failed = 0;
goto out;
}
failed = 1;
diff --git a/net/rxrpc/Kconfig b/net/rxrpc/Kconfig
index 05610c3a3d25..57ebb29c26ad 100644
--- a/net/rxrpc/Kconfig
+++ b/net/rxrpc/Kconfig
@@ -49,7 +49,7 @@ config RXKAD
depends on AF_RXRPC
select CRYPTO
select CRYPTO_MANAGER
- select CRYPTO_BLKCIPHER
+ select CRYPTO_SKCIPHER
select CRYPTO_PCBC
select CRYPTO_FCRYPT
help
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index 0dbbfd1b6487..fe42f986cd94 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -194,6 +194,7 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len)
service_in_use:
write_unlock(&local->services_lock);
rxrpc_unuse_local(local);
+ rxrpc_put_local(local);
ret = -EADDRINUSE;
error_unlock:
release_sock(&rx->sk);
@@ -862,7 +863,6 @@ static void rxrpc_sock_destructor(struct sock *sk)
static int rxrpc_release_sock(struct sock *sk)
{
struct rxrpc_sock *rx = rxrpc_sk(sk);
- struct rxrpc_net *rxnet = rxrpc_net(sock_net(&rx->sk));
_enter("%p{%d,%d}", sk, sk->sk_state, refcount_read(&sk->sk_refcnt));
@@ -898,10 +898,9 @@ static int rxrpc_release_sock(struct sock *sk)
rxrpc_release_calls_on_socket(rx);
flush_workqueue(rxrpc_workqueue);
rxrpc_purge_queue(&sk->sk_receive_queue);
- rxrpc_queue_work(&rxnet->service_conn_reaper);
- rxrpc_queue_work(&rxnet->client_conn_reaper);
rxrpc_unuse_local(rx->local);
+ rxrpc_put_local(rx->local);
rx->local = NULL;
key_put(rx->key);
rx->key = NULL;
@@ -975,7 +974,7 @@ static int __init af_rxrpc_init(void)
int ret = -1;
unsigned int tmp;
- BUILD_BUG_ON(sizeof(struct rxrpc_skb_priv) > FIELD_SIZEOF(struct sk_buff, cb));
+ BUILD_BUG_ON(sizeof(struct rxrpc_skb_priv) > sizeof_field(struct sk_buff, cb));
get_random_bytes(&tmp, sizeof(tmp));
tmp &= 0x3fffffff;
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 145335611af6..7d730c438404 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -185,11 +185,17 @@ struct rxrpc_host_header {
* - max 48 bytes (struct sk_buff::cb)
*/
struct rxrpc_skb_priv {
- union {
- u8 nr_jumbo; /* Number of jumbo subpackets */
- };
+ atomic_t nr_ring_pins; /* Number of rxtx ring pins */
+ u8 nr_subpackets; /* Number of subpackets */
+ u8 rx_flags; /* Received packet flags */
+#define RXRPC_SKB_INCL_LAST 0x01 /* - Includes last packet */
+#define RXRPC_SKB_TX_BUFFER 0x02 /* - Is transmit buffer */
union {
int remain; /* amount of space remaining for next write */
+
+ /* List of requested ACKs on subpackets */
+ unsigned long rx_req_ack[(RXRPC_MAX_NR_JUMBO + BITS_PER_LONG - 1) /
+ BITS_PER_LONG];
};
struct rxrpc_host_header hdr; /* RxRPC packet header from this packet */
@@ -203,6 +209,7 @@ struct rxrpc_skb_priv {
struct rxrpc_security {
const char *name; /* name of this service */
u8 security_index; /* security type provided */
+ u32 no_key_abort; /* Abort code indicating no key */
/* Initialise a security service */
int (*init)(void);
@@ -226,6 +233,9 @@ struct rxrpc_security {
int (*verify_packet)(struct rxrpc_call *, struct sk_buff *,
unsigned int, unsigned int, rxrpc_seq_t, u16);
+ /* Free crypto request on a call */
+ void (*free_call_crypto)(struct rxrpc_call *);
+
/* Locate the data in a received packet that has been verified. */
void (*locate_data)(struct rxrpc_call *, struct sk_buff *,
unsigned int *, unsigned int *);
@@ -480,6 +490,7 @@ enum rxrpc_call_flag {
RXRPC_CALL_RX_HEARD, /* The peer responded at least once to this call */
RXRPC_CALL_RX_UNDERRUN, /* Got data underrun */
RXRPC_CALL_IS_INTR, /* The call is interruptible */
+ RXRPC_CALL_DISCONNECTED, /* The call has been disconnected */
};
/*
@@ -547,6 +558,7 @@ struct rxrpc_call {
struct rxrpc_peer *peer; /* Peer record for remote address */
struct rxrpc_sock __rcu *socket; /* socket responsible */
struct rxrpc_net *rxnet; /* Network namespace to which call belongs */
+ const struct rxrpc_security *security; /* applied security module */
struct mutex user_mutex; /* User access mutex */
unsigned long ack_at; /* When deferred ACK needs to happen */
unsigned long ack_lost_at; /* When ACK is figured as lost */
@@ -558,6 +570,7 @@ struct rxrpc_call {
unsigned long expect_term_by; /* When we expect call termination by */
u32 next_rx_timo; /* Timeout for next Rx packet (jif) */
u32 next_req_timo; /* Timeout for next Rx request packet (jif) */
+ struct skcipher_request *cipher_req; /* Packet cipher request buffer */
struct timer_list timer; /* Combined event timer */
struct work_struct processor; /* Event processor */
rxrpc_notify_rx_t notify_rx; /* kernel service Rx notification function */
@@ -590,6 +603,7 @@ struct rxrpc_call {
int debug_id; /* debug ID for printks */
unsigned short rx_pkt_offset; /* Current recvmsg packet offset */
unsigned short rx_pkt_len; /* Current recvmsg packet len */
+ bool rx_pkt_last; /* Current recvmsg packet is last */
/* Rx/Tx circular buffer, depending on phase.
*
@@ -613,8 +627,7 @@ struct rxrpc_call {
#define RXRPC_TX_ANNO_LAST 0x04
#define RXRPC_TX_ANNO_RESENT 0x08
-#define RXRPC_RX_ANNO_JUMBO 0x3f /* Jumbo subpacket number + 1 if not zero */
-#define RXRPC_RX_ANNO_JLAST 0x40 /* Set if last element of a jumbo packet */
+#define RXRPC_RX_ANNO_SUBPACKET 0x3f /* Subpacket number in jumbogram */
#define RXRPC_RX_ANNO_VERIFIED 0x80 /* Set if verified and decrypted */
rxrpc_seq_t tx_hard_ack; /* Dead slot in buffer; the first transmitted but
* not hard-ACK'd packet follows this.
@@ -905,6 +918,7 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *);
void rxrpc_put_client_conn(struct rxrpc_connection *);
void rxrpc_discard_expired_client_conns(struct work_struct *);
void rxrpc_destroy_all_client_connections(struct rxrpc_net *);
+void rxrpc_clean_up_local_conns(struct rxrpc_local *);
/*
* conn_event.c
@@ -965,8 +979,9 @@ static inline void rxrpc_reduce_conn_timer(struct rxrpc_connection *conn,
struct rxrpc_connection *rxrpc_find_service_conn_rcu(struct rxrpc_peer *,
struct sk_buff *);
struct rxrpc_connection *rxrpc_prealloc_service_connection(struct rxrpc_net *, gfp_t);
-void rxrpc_new_incoming_connection(struct rxrpc_sock *,
- struct rxrpc_connection *, struct sk_buff *);
+void rxrpc_new_incoming_connection(struct rxrpc_sock *, struct rxrpc_connection *,
+ const struct rxrpc_security *, struct key *,
+ struct sk_buff *);
void rxrpc_unpublish_service_conn(struct rxrpc_connection *);
/*
@@ -1007,6 +1022,16 @@ void rxrpc_unuse_local(struct rxrpc_local *);
void rxrpc_queue_local(struct rxrpc_local *);
void rxrpc_destroy_all_locals(struct rxrpc_net *);
+static inline bool __rxrpc_unuse_local(struct rxrpc_local *local)
+{
+ return atomic_dec_return(&local->active_users) == 0;
+}
+
+static inline bool __rxrpc_use_local(struct rxrpc_local *local)
+{
+ return atomic_fetch_add_unless(&local->active_users, 1, 0) != 0;
+}
+
/*
* misc.c
*/
@@ -1091,7 +1116,9 @@ extern const struct rxrpc_security rxkad;
int __init rxrpc_init_security(void);
void rxrpc_exit_security(void);
int rxrpc_init_client_conn_security(struct rxrpc_connection *);
-int rxrpc_init_server_conn_security(struct rxrpc_connection *);
+bool rxrpc_look_up_server_security(struct rxrpc_local *, struct rxrpc_sock *,
+ const struct rxrpc_security **, struct key **,
+ struct sk_buff *);
/*
* sendmsg.c
@@ -1105,6 +1132,7 @@ void rxrpc_kernel_data_consumed(struct rxrpc_call *, struct sk_buff *);
void rxrpc_packet_destructor(struct sk_buff *);
void rxrpc_new_skb(struct sk_buff *, enum rxrpc_skb_trace);
void rxrpc_see_skb(struct sk_buff *, enum rxrpc_skb_trace);
+void rxrpc_eaten_skb(struct sk_buff *, enum rxrpc_skb_trace);
void rxrpc_get_skb(struct sk_buff *, enum rxrpc_skb_trace);
void rxrpc_free_skb(struct sk_buff *, enum rxrpc_skb_trace);
void rxrpc_purge_queue(struct sk_buff_head *);
diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c
index 00c095d74145..70e44abf106c 100644
--- a/net/rxrpc/call_accept.c
+++ b/net/rxrpc/call_accept.c
@@ -84,7 +84,7 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx,
smp_store_release(&b->conn_backlog_head,
(head + 1) & (size - 1));
- trace_rxrpc_conn(conn, rxrpc_conn_new_service,
+ trace_rxrpc_conn(conn->debug_id, rxrpc_conn_new_service,
atomic_read(&conn->usage), here);
}
@@ -97,7 +97,7 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx,
call->flags |= (1 << RXRPC_CALL_IS_SERVICE);
call->state = RXRPC_CALL_SERVER_PREALLOC;
- trace_rxrpc_call(call, rxrpc_call_new_service,
+ trace_rxrpc_call(call->debug_id, rxrpc_call_new_service,
atomic_read(&call->usage),
here, (const void *)user_call_ID);
@@ -240,6 +240,22 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx)
}
/*
+ * Ping the other end to fill our RTT cache and to retrieve the rwind
+ * and MTU parameters.
+ */
+static void rxrpc_send_ping(struct rxrpc_call *call, struct sk_buff *skb)
+{
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+ ktime_t now = skb->tstamp;
+
+ if (call->peer->rtt_usage < 3 ||
+ ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), now))
+ rxrpc_propose_ACK(call, RXRPC_ACK_PING, sp->hdr.serial,
+ true, true,
+ rxrpc_propose_ack_ping_for_params);
+}
+
+/*
* Allocate a new incoming call from the prealloc pool, along with a connection
* and a peer as necessary.
*/
@@ -247,6 +263,8 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx,
struct rxrpc_local *local,
struct rxrpc_peer *peer,
struct rxrpc_connection *conn,
+ const struct rxrpc_security *sec,
+ struct key *key,
struct sk_buff *skb)
{
struct rxrpc_backlog *b = rx->backlog;
@@ -294,7 +312,7 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx,
conn->params.local = rxrpc_get_local(local);
conn->params.peer = peer;
rxrpc_see_connection(conn);
- rxrpc_new_incoming_connection(rx, conn, skb);
+ rxrpc_new_incoming_connection(rx, conn, sec, key, skb);
} else {
rxrpc_get_connection(conn);
}
@@ -307,6 +325,7 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx,
rxrpc_see_call(call);
call->conn = conn;
+ call->security = conn->security;
call->peer = rxrpc_get_peer(conn->params.peer);
call->cong_cwnd = call->peer->cong_cwnd;
return call;
@@ -332,9 +351,11 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local,
struct sk_buff *skb)
{
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+ const struct rxrpc_security *sec = NULL;
struct rxrpc_connection *conn;
struct rxrpc_peer *peer = NULL;
- struct rxrpc_call *call;
+ struct rxrpc_call *call = NULL;
+ struct key *key = NULL;
_enter("");
@@ -345,9 +366,7 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local,
sp->hdr.seq, RX_INVALID_OPERATION, ESHUTDOWN);
skb->mark = RXRPC_SKB_MARK_REJECT_ABORT;
skb->priority = RX_INVALID_OPERATION;
- _leave(" = NULL [close]");
- call = NULL;
- goto out;
+ goto no_call;
}
/* The peer, connection and call may all have sprung into existence due
@@ -357,29 +376,19 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local,
*/
conn = rxrpc_find_connection_rcu(local, skb, &peer);
- call = rxrpc_alloc_incoming_call(rx, local, peer, conn, skb);
+ if (!conn && !rxrpc_look_up_server_security(local, rx, &sec, &key, skb))
+ goto no_call;
+
+ call = rxrpc_alloc_incoming_call(rx, local, peer, conn, sec, key, skb);
+ key_put(key);
if (!call) {
skb->mark = RXRPC_SKB_MARK_REJECT_BUSY;
- _leave(" = NULL [busy]");
- call = NULL;
- goto out;
+ goto no_call;
}
trace_rxrpc_receive(call, rxrpc_receive_incoming,
sp->hdr.serial, sp->hdr.seq);
- /* Lock the call to prevent rxrpc_kernel_send/recv_data() and
- * sendmsg()/recvmsg() inconveniently stealing the mutex once the
- * notification is generated.
- *
- * The BUG should never happen because the kernel should be well
- * behaved enough not to access the call before the first notification
- * event and userspace is prevented from doing so until the state is
- * appropriate.
- */
- if (!mutex_trylock(&call->user_mutex))
- BUG();
-
/* Make the call live. */
rxrpc_incoming_call(rx, call, skb);
conn = call->conn;
@@ -420,6 +429,9 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local,
BUG();
}
spin_unlock(&conn->state_lock);
+ spin_unlock(&rx->incoming_lock);
+
+ rxrpc_send_ping(call, skb);
if (call->state == RXRPC_CALL_SERVER_ACCEPTING)
rxrpc_notify_socket(call);
@@ -432,9 +444,12 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local,
rxrpc_put_call(call, rxrpc_call_put);
_leave(" = %p{%d}", call, call->debug_id);
-out:
- spin_unlock(&rx->incoming_lock);
return call;
+
+no_call:
+ spin_unlock(&rx->incoming_lock);
+ _leave(" = NULL [%u]", skb->mark);
+ return NULL;
}
/*
diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c
index c767679bfa5d..cedbbb3a7c2e 100644
--- a/net/rxrpc/call_event.c
+++ b/net/rxrpc/call_event.c
@@ -199,7 +199,7 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j)
continue;
skb = call->rxtx_buffer[ix];
- rxrpc_see_skb(skb, rxrpc_skb_tx_seen);
+ rxrpc_see_skb(skb, rxrpc_skb_seen);
if (anno_type == RXRPC_TX_ANNO_UNACK) {
if (ktime_after(skb->tstamp, max_age)) {
@@ -255,18 +255,18 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j)
continue;
skb = call->rxtx_buffer[ix];
- rxrpc_get_skb(skb, rxrpc_skb_tx_got);
+ rxrpc_get_skb(skb, rxrpc_skb_got);
spin_unlock_bh(&call->lock);
if (rxrpc_send_data_packet(call, skb, true) < 0) {
- rxrpc_free_skb(skb, rxrpc_skb_tx_freed);
+ rxrpc_free_skb(skb, rxrpc_skb_freed);
return;
}
if (rxrpc_is_client_call(call))
rxrpc_expose_client_call(call);
- rxrpc_free_skb(skb, rxrpc_skb_tx_freed);
+ rxrpc_free_skb(skb, rxrpc_skb_freed);
spin_lock_bh(&call->lock);
/* We need to clear the retransmit state, but there are two
diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
index 217b12be9e08..c9f34b0a11df 100644
--- a/net/rxrpc/call_object.c
+++ b/net/rxrpc/call_object.c
@@ -240,7 +240,8 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
if (p->intr)
__set_bit(RXRPC_CALL_IS_INTR, &call->flags);
call->tx_total_len = p->tx_total_len;
- trace_rxrpc_call(call, rxrpc_call_new_client, atomic_read(&call->usage),
+ trace_rxrpc_call(call->debug_id, rxrpc_call_new_client,
+ atomic_read(&call->usage),
here, (const void *)p->user_call_ID);
/* We need to protect a partially set up call against the user as we
@@ -290,8 +291,8 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
if (ret < 0)
goto error;
- trace_rxrpc_call(call, rxrpc_call_connected, atomic_read(&call->usage),
- here, NULL);
+ trace_rxrpc_call(call->debug_id, rxrpc_call_connected,
+ atomic_read(&call->usage), here, NULL);
rxrpc_start_call_timer(call);
@@ -313,8 +314,8 @@ error_dup_user_ID:
error:
__rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR,
RX_CALL_DEAD, ret);
- trace_rxrpc_call(call, rxrpc_call_error, atomic_read(&call->usage),
- here, ERR_PTR(ret));
+ trace_rxrpc_call(call->debug_id, rxrpc_call_error,
+ atomic_read(&call->usage), here, ERR_PTR(ret));
rxrpc_release_call(rx, call);
mutex_unlock(&call->user_mutex);
rxrpc_put_call(call, rxrpc_call_put);
@@ -376,7 +377,8 @@ bool rxrpc_queue_call(struct rxrpc_call *call)
if (n == 0)
return false;
if (rxrpc_queue_work(&call->processor))
- trace_rxrpc_call(call, rxrpc_call_queued, n + 1, here, NULL);
+ trace_rxrpc_call(call->debug_id, rxrpc_call_queued, n + 1,
+ here, NULL);
else
rxrpc_put_call(call, rxrpc_call_put_noqueue);
return true;
@@ -391,7 +393,8 @@ bool __rxrpc_queue_call(struct rxrpc_call *call)
int n = atomic_read(&call->usage);
ASSERTCMP(n, >=, 1);
if (rxrpc_queue_work(&call->processor))
- trace_rxrpc_call(call, rxrpc_call_queued_ref, n, here, NULL);
+ trace_rxrpc_call(call->debug_id, rxrpc_call_queued_ref, n,
+ here, NULL);
else
rxrpc_put_call(call, rxrpc_call_put_noqueue);
return true;
@@ -406,7 +409,8 @@ void rxrpc_see_call(struct rxrpc_call *call)
if (call) {
int n = atomic_read(&call->usage);
- trace_rxrpc_call(call, rxrpc_call_seen, n, here, NULL);
+ trace_rxrpc_call(call->debug_id, rxrpc_call_seen, n,
+ here, NULL);
}
}
@@ -418,7 +422,20 @@ void rxrpc_get_call(struct rxrpc_call *call, enum rxrpc_call_trace op)
const void *here = __builtin_return_address(0);
int n = atomic_inc_return(&call->usage);
- trace_rxrpc_call(call, op, n, here, NULL);
+ trace_rxrpc_call(call->debug_id, op, n, here, NULL);
+}
+
+/*
+ * Clean up the RxTx skb ring.
+ */
+static void rxrpc_cleanup_ring(struct rxrpc_call *call)
+{
+ int i;
+
+ for (i = 0; i < RXRPC_RXTX_BUFF_SIZE; i++) {
+ rxrpc_free_skb(call->rxtx_buffer[i], rxrpc_skb_cleaned);
+ call->rxtx_buffer[i] = NULL;
+ }
}
/*
@@ -429,11 +446,11 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call)
const void *here = __builtin_return_address(0);
struct rxrpc_connection *conn = call->conn;
bool put = false;
- int i;
_enter("{%d,%d}", call->debug_id, atomic_read(&call->usage));
- trace_rxrpc_call(call, rxrpc_call_release, atomic_read(&call->usage),
+ trace_rxrpc_call(call->debug_id, rxrpc_call_release,
+ atomic_read(&call->usage),
here, (const void *)call->flags);
ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE);
@@ -476,16 +493,12 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call)
_debug("RELEASE CALL %p (%d CONN %p)", call, call->debug_id, conn);
- if (conn)
+ if (conn && !test_bit(RXRPC_CALL_DISCONNECTED, &call->flags))
rxrpc_disconnect_call(call);
+ if (call->security)
+ call->security->free_call_crypto(call);
- for (i = 0; i < RXRPC_RXTX_BUFF_SIZE; i++) {
- rxrpc_free_skb(call->rxtx_buffer[i],
- (call->tx_phase ? rxrpc_skb_tx_cleaned :
- rxrpc_skb_rx_cleaned));
- call->rxtx_buffer[i] = NULL;
- }
-
+ rxrpc_cleanup_ring(call);
_leave("");
}
@@ -526,12 +539,13 @@ void rxrpc_put_call(struct rxrpc_call *call, enum rxrpc_call_trace op)
{
struct rxrpc_net *rxnet = call->rxnet;
const void *here = __builtin_return_address(0);
+ unsigned int debug_id = call->debug_id;
int n;
ASSERT(call != NULL);
n = atomic_dec_return(&call->usage);
- trace_rxrpc_call(call, op, n, here, NULL);
+ trace_rxrpc_call(debug_id, op, n, here, NULL);
ASSERTCMP(n, >=, 0);
if (n == 0) {
_debug("call %d dead", call->debug_id);
@@ -548,13 +562,14 @@ void rxrpc_put_call(struct rxrpc_call *call, enum rxrpc_call_trace op)
}
/*
- * Final call destruction under RCU.
+ * Final call destruction - but must be done in process context.
*/
-static void rxrpc_rcu_destroy_call(struct rcu_head *rcu)
+static void rxrpc_destroy_call(struct work_struct *work)
{
- struct rxrpc_call *call = container_of(rcu, struct rxrpc_call, rcu);
+ struct rxrpc_call *call = container_of(work, struct rxrpc_call, processor);
struct rxrpc_net *rxnet = call->rxnet;
+ rxrpc_put_connection(call->conn);
rxrpc_put_peer(call->peer);
kfree(call->rxtx_buffer);
kfree(call->rxtx_annotations);
@@ -564,12 +579,26 @@ static void rxrpc_rcu_destroy_call(struct rcu_head *rcu)
}
/*
+ * Final call destruction under RCU.
+ */
+static void rxrpc_rcu_destroy_call(struct rcu_head *rcu)
+{
+ struct rxrpc_call *call = container_of(rcu, struct rxrpc_call, rcu);
+
+ if (in_softirq()) {
+ INIT_WORK(&call->processor, rxrpc_destroy_call);
+ if (!rxrpc_queue_work(&call->processor))
+ BUG();
+ } else {
+ rxrpc_destroy_call(&call->processor);
+ }
+}
+
+/*
* clean up a call
*/
void rxrpc_cleanup_call(struct rxrpc_call *call)
{
- int i;
-
_net("DESTROY CALL %d", call->debug_id);
memset(&call->sock_node, 0xcd, sizeof(call->sock_node));
@@ -578,15 +607,9 @@ void rxrpc_cleanup_call(struct rxrpc_call *call)
ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE);
ASSERT(test_bit(RXRPC_CALL_RELEASED, &call->flags));
- ASSERTCMP(call->conn, ==, NULL);
-
- /* Clean up the Rx/Tx buffer */
- for (i = 0; i < RXRPC_RXTX_BUFF_SIZE; i++)
- rxrpc_free_skb(call->rxtx_buffer[i],
- (call->tx_phase ? rxrpc_skb_tx_cleaned :
- rxrpc_skb_rx_cleaned));
- rxrpc_free_skb(call->tx_pending, rxrpc_skb_tx_cleaned);
+ rxrpc_cleanup_ring(call);
+ rxrpc_free_skb(call->tx_pending, rxrpc_skb_cleaned);
call_rcu(&call->rcu, rxrpc_rcu_destroy_call);
}
diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c
index aea82f909c60..ea7d4c21f889 100644
--- a/net/rxrpc/conn_client.c
+++ b/net/rxrpc/conn_client.c
@@ -212,7 +212,8 @@ rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, gfp_t gfp)
rxrpc_get_local(conn->params.local);
key_get(conn->params.key);
- trace_rxrpc_conn(conn, rxrpc_conn_new_client, atomic_read(&conn->usage),
+ trace_rxrpc_conn(conn->debug_id, rxrpc_conn_new_client,
+ atomic_read(&conn->usage),
__builtin_return_address(0));
trace_rxrpc_client(conn, -1, rxrpc_client_alloc);
_leave(" = %p", conn);
@@ -352,6 +353,7 @@ static int rxrpc_get_client_conn(struct rxrpc_sock *rx,
if (cp->exclusive) {
call->conn = candidate;
+ call->security = candidate->security;
call->security_ix = candidate->security_ix;
call->service_id = candidate->service_id;
_leave(" = 0 [exclusive %d]", candidate->debug_id);
@@ -403,6 +405,7 @@ static int rxrpc_get_client_conn(struct rxrpc_sock *rx,
candidate_published:
set_bit(RXRPC_CONN_IN_CLIENT_CONNS, &candidate->flags);
call->conn = candidate;
+ call->security = candidate->security;
call->security_ix = candidate->security_ix;
call->service_id = candidate->service_id;
spin_unlock(&local->client_conns_lock);
@@ -425,6 +428,7 @@ found_extant_conn:
spin_lock(&conn->channel_lock);
call->conn = conn;
+ call->security = conn->security;
call->security_ix = conn->security_ix;
call->service_id = conn->service_id;
list_add_tail(&call->chan_wait_link, &conn->waiting_calls);
@@ -781,6 +785,7 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call)
u32 cid;
spin_lock(&conn->channel_lock);
+ set_bit(RXRPC_CALL_DISCONNECTED, &call->flags);
cid = call->cid;
if (cid) {
@@ -788,7 +793,6 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call)
chan = &conn->channels[channel];
}
trace_rxrpc_client(conn, channel, rxrpc_client_chan_disconnect);
- call->conn = NULL;
/* Calls that have never actually been assigned a channel can simply be
* discarded. If the conn didn't get used either, it will follow
@@ -904,7 +908,6 @@ out:
spin_unlock(&rxnet->client_conn_cache_lock);
out_2:
spin_unlock(&conn->channel_lock);
- rxrpc_put_connection(conn);
_leave("");
return;
@@ -985,11 +988,12 @@ rxrpc_put_one_client_conn(struct rxrpc_connection *conn)
void rxrpc_put_client_conn(struct rxrpc_connection *conn)
{
const void *here = __builtin_return_address(0);
+ unsigned int debug_id = conn->debug_id;
int n;
do {
n = atomic_dec_return(&conn->usage);
- trace_rxrpc_conn(conn, rxrpc_conn_put_client, n, here);
+ trace_rxrpc_conn(debug_id, rxrpc_conn_put_client, n, here);
if (n > 0)
return;
ASSERTCMP(n, >=, 0);
@@ -1162,3 +1166,47 @@ void rxrpc_destroy_all_client_connections(struct rxrpc_net *rxnet)
_leave("");
}
+
+/*
+ * Clean up the client connections on a local endpoint.
+ */
+void rxrpc_clean_up_local_conns(struct rxrpc_local *local)
+{
+ struct rxrpc_connection *conn, *tmp;
+ struct rxrpc_net *rxnet = local->rxnet;
+ unsigned int nr_active;
+ LIST_HEAD(graveyard);
+
+ _enter("");
+
+ spin_lock(&rxnet->client_conn_cache_lock);
+ nr_active = rxnet->nr_active_client_conns;
+
+ list_for_each_entry_safe(conn, tmp, &rxnet->idle_client_conns,
+ cache_link) {
+ if (conn->params.local == local) {
+ ASSERTCMP(conn->cache_state, ==, RXRPC_CONN_CLIENT_IDLE);
+
+ trace_rxrpc_client(conn, -1, rxrpc_client_discard);
+ if (!test_and_clear_bit(RXRPC_CONN_EXPOSED, &conn->flags))
+ BUG();
+ conn->cache_state = RXRPC_CONN_CLIENT_INACTIVE;
+ list_move(&conn->cache_link, &graveyard);
+ nr_active--;
+ }
+ }
+
+ rxnet->nr_active_client_conns = nr_active;
+ spin_unlock(&rxnet->client_conn_cache_lock);
+ ASSERTCMP(nr_active, >=, 0);
+
+ while (!list_empty(&graveyard)) {
+ conn = list_entry(graveyard.next,
+ struct rxrpc_connection, cache_link);
+ list_del_init(&conn->cache_link);
+
+ rxrpc_put_connection(conn);
+ }
+
+ _leave(" [culled]");
+}
diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c
index df6624c140be..06fcff2ebbba 100644
--- a/net/rxrpc/conn_event.c
+++ b/net/rxrpc/conn_event.c
@@ -376,21 +376,7 @@ static void rxrpc_secure_connection(struct rxrpc_connection *conn)
_enter("{%d}", conn->debug_id);
ASSERT(conn->security_ix != 0);
-
- if (!conn->params.key) {
- _debug("set up security");
- ret = rxrpc_init_server_conn_security(conn);
- switch (ret) {
- case 0:
- break;
- case -ENOENT:
- abort_code = RX_CALL_DEAD;
- goto abort;
- default:
- abort_code = RXKADNOAUTH;
- goto abort;
- }
- }
+ ASSERT(conn->server_key);
if (conn->security->issue_challenge(conn) < 0) {
abort_code = RX_CALL_DEAD;
@@ -452,16 +438,12 @@ again:
/*
* connection-level event processor
*/
-void rxrpc_process_connection(struct work_struct *work)
+static void rxrpc_do_process_connection(struct rxrpc_connection *conn)
{
- struct rxrpc_connection *conn =
- container_of(work, struct rxrpc_connection, processor);
struct sk_buff *skb;
u32 abort_code = RX_PROTOCOL_ERROR;
int ret;
- rxrpc_see_connection(conn);
-
if (test_and_clear_bit(RXRPC_CONN_EV_CHALLENGE, &conn->events))
rxrpc_secure_connection(conn);
@@ -472,7 +454,7 @@ void rxrpc_process_connection(struct work_struct *work)
/* go through the conn-level event packets, releasing the ref on this
* connection that each one has when we've finished with it */
while ((skb = skb_dequeue(&conn->rx_queue))) {
- rxrpc_see_skb(skb, rxrpc_skb_rx_seen);
+ rxrpc_see_skb(skb, rxrpc_skb_seen);
ret = rxrpc_process_event(conn, skb, &abort_code);
switch (ret) {
case -EPROTO:
@@ -484,23 +466,37 @@ void rxrpc_process_connection(struct work_struct *work)
goto requeue_and_leave;
case -ECONNABORTED:
default:
- rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
+ rxrpc_free_skb(skb, rxrpc_skb_freed);
break;
}
}
-out:
- rxrpc_put_connection(conn);
- _leave("");
return;
requeue_and_leave:
skb_queue_head(&conn->rx_queue, skb);
- goto out;
+ return;
protocol_error:
if (rxrpc_abort_connection(conn, ret, abort_code) < 0)
goto requeue_and_leave;
- rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
- goto out;
+ rxrpc_free_skb(skb, rxrpc_skb_freed);
+ return;
+}
+
+void rxrpc_process_connection(struct work_struct *work)
+{
+ struct rxrpc_connection *conn =
+ container_of(work, struct rxrpc_connection, processor);
+
+ rxrpc_see_connection(conn);
+
+ if (__rxrpc_use_local(conn->params.local)) {
+ rxrpc_do_process_connection(conn);
+ rxrpc_unuse_local(conn->params.local);
+ }
+
+ rxrpc_put_connection(conn);
+ _leave("");
+ return;
}
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
index 434ef392212b..19e141eeed17 100644
--- a/net/rxrpc/conn_object.c
+++ b/net/rxrpc/conn_object.c
@@ -223,9 +223,8 @@ void rxrpc_disconnect_call(struct rxrpc_call *call)
__rxrpc_disconnect_call(conn, call);
spin_unlock(&conn->channel_lock);
- call->conn = NULL;
+ set_bit(RXRPC_CALL_DISCONNECTED, &call->flags);
conn->idle_timestamp = jiffies;
- rxrpc_put_connection(conn);
}
/*
@@ -269,7 +268,7 @@ bool rxrpc_queue_conn(struct rxrpc_connection *conn)
if (n == 0)
return false;
if (rxrpc_queue_work(&conn->processor))
- trace_rxrpc_conn(conn, rxrpc_conn_queued, n + 1, here);
+ trace_rxrpc_conn(conn->debug_id, rxrpc_conn_queued, n + 1, here);
else
rxrpc_put_connection(conn);
return true;
@@ -284,7 +283,7 @@ void rxrpc_see_connection(struct rxrpc_connection *conn)
if (conn) {
int n = atomic_read(&conn->usage);
- trace_rxrpc_conn(conn, rxrpc_conn_seen, n, here);
+ trace_rxrpc_conn(conn->debug_id, rxrpc_conn_seen, n, here);
}
}
@@ -296,7 +295,7 @@ void rxrpc_get_connection(struct rxrpc_connection *conn)
const void *here = __builtin_return_address(0);
int n = atomic_inc_return(&conn->usage);
- trace_rxrpc_conn(conn, rxrpc_conn_got, n, here);
+ trace_rxrpc_conn(conn->debug_id, rxrpc_conn_got, n, here);
}
/*
@@ -310,7 +309,7 @@ rxrpc_get_connection_maybe(struct rxrpc_connection *conn)
if (conn) {
int n = atomic_fetch_add_unless(&conn->usage, 1, 0);
if (n > 0)
- trace_rxrpc_conn(conn, rxrpc_conn_got, n + 1, here);
+ trace_rxrpc_conn(conn->debug_id, rxrpc_conn_got, n + 1, here);
else
conn = NULL;
}
@@ -333,10 +332,11 @@ static void rxrpc_set_service_reap_timer(struct rxrpc_net *rxnet,
void rxrpc_put_service_conn(struct rxrpc_connection *conn)
{
const void *here = __builtin_return_address(0);
+ unsigned int debug_id = conn->debug_id;
int n;
n = atomic_dec_return(&conn->usage);
- trace_rxrpc_conn(conn, rxrpc_conn_put_service, n, here);
+ trace_rxrpc_conn(debug_id, rxrpc_conn_put_service, n, here);
ASSERTCMP(n, >=, 0);
if (n == 1)
rxrpc_set_service_reap_timer(conn->params.local->rxnet,
@@ -398,7 +398,7 @@ void rxrpc_service_connection_reaper(struct work_struct *work)
if (conn->state == RXRPC_CONN_SERVICE_PREALLOC)
continue;
- if (rxnet->live) {
+ if (rxnet->live && !conn->params.local->dead) {
idle_timestamp = READ_ONCE(conn->idle_timestamp);
expire_at = idle_timestamp + rxrpc_connection_expiry * HZ;
if (conn->params.local->service_closed)
@@ -420,7 +420,7 @@ void rxrpc_service_connection_reaper(struct work_struct *work)
*/
if (atomic_cmpxchg(&conn->usage, 1, 0) != 1)
continue;
- trace_rxrpc_conn(conn, rxrpc_conn_reap_service, 0, NULL);
+ trace_rxrpc_conn(conn->debug_id, rxrpc_conn_reap_service, 0, NULL);
if (rxrpc_conn_is_client(conn))
BUG();
diff --git a/net/rxrpc/conn_service.c b/net/rxrpc/conn_service.c
index b30e13f6d95f..21da48e3d2e5 100644
--- a/net/rxrpc/conn_service.c
+++ b/net/rxrpc/conn_service.c
@@ -134,7 +134,7 @@ struct rxrpc_connection *rxrpc_prealloc_service_connection(struct rxrpc_net *rxn
list_add_tail(&conn->proc_link, &rxnet->conn_proc_list);
write_unlock(&rxnet->conn_lock);
- trace_rxrpc_conn(conn, rxrpc_conn_new_service,
+ trace_rxrpc_conn(conn->debug_id, rxrpc_conn_new_service,
atomic_read(&conn->usage),
__builtin_return_address(0));
}
@@ -148,6 +148,8 @@ struct rxrpc_connection *rxrpc_prealloc_service_connection(struct rxrpc_net *rxn
*/
void rxrpc_new_incoming_connection(struct rxrpc_sock *rx,
struct rxrpc_connection *conn,
+ const struct rxrpc_security *sec,
+ struct key *key,
struct sk_buff *skb)
{
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
@@ -160,6 +162,8 @@ void rxrpc_new_incoming_connection(struct rxrpc_sock *rx,
conn->service_id = sp->hdr.serviceId;
conn->security_ix = sp->hdr.securityIndex;
conn->out_clientflag = 0;
+ conn->security = sec;
+ conn->server_key = key_get(key);
if (conn->security_ix)
conn->state = RXRPC_CONN_SERVICE_UNSECURED;
else
diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index dd47d465d1d3..ef10fbf71b15 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -193,22 +193,6 @@ send_extra_data:
}
/*
- * Ping the other end to fill our RTT cache and to retrieve the rwind
- * and MTU parameters.
- */
-static void rxrpc_send_ping(struct rxrpc_call *call, struct sk_buff *skb)
-{
- struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
- ktime_t now = skb->tstamp;
-
- if (call->peer->rtt_usage < 3 ||
- ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), now))
- rxrpc_propose_ACK(call, RXRPC_ACK_PING, sp->hdr.serial,
- true, true,
- rxrpc_propose_ack_ping_for_params);
-}
-
-/*
* Apply a hard ACK by advancing the Tx window.
*/
static bool rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to,
@@ -233,7 +217,7 @@ static bool rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to,
ix = call->tx_hard_ack & RXRPC_RXTX_BUFF_MASK;
skb = call->rxtx_buffer[ix];
annotation = call->rxtx_annotations[ix];
- rxrpc_see_skb(skb, rxrpc_skb_tx_rotated);
+ rxrpc_see_skb(skb, rxrpc_skb_rotated);
call->rxtx_buffer[ix] = NULL;
call->rxtx_annotations[ix] = 0;
skb->next = list;
@@ -258,7 +242,7 @@ static bool rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to,
skb = list;
list = skb->next;
skb_mark_not_on_list(skb);
- rxrpc_free_skb(skb, rxrpc_skb_tx_freed);
+ rxrpc_free_skb(skb, rxrpc_skb_freed);
}
return rot_last;
@@ -347,7 +331,7 @@ static bool rxrpc_receiving_reply(struct rxrpc_call *call)
}
/*
- * Scan a jumbo packet to validate its structure and to work out how many
+ * Scan a data packet to validate its structure and to work out how many
* subpackets it contains.
*
* A jumbo packet is a collection of consecutive packets glued together with
@@ -358,16 +342,21 @@ static bool rxrpc_receiving_reply(struct rxrpc_call *call)
* the last are RXRPC_JUMBO_DATALEN in size. The last subpacket may be of any
* size.
*/
-static bool rxrpc_validate_jumbo(struct sk_buff *skb)
+static bool rxrpc_validate_data(struct sk_buff *skb)
{
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
unsigned int offset = sizeof(struct rxrpc_wire_header);
unsigned int len = skb->len;
- int nr_jumbo = 1;
u8 flags = sp->hdr.flags;
- do {
- nr_jumbo++;
+ for (;;) {
+ if (flags & RXRPC_REQUEST_ACK)
+ __set_bit(sp->nr_subpackets, sp->rx_req_ack);
+ sp->nr_subpackets++;
+
+ if (!(flags & RXRPC_JUMBO_PACKET))
+ break;
+
if (len - offset < RXRPC_JUMBO_SUBPKTLEN)
goto protocol_error;
if (flags & RXRPC_LAST_PACKET)
@@ -376,9 +365,10 @@ static bool rxrpc_validate_jumbo(struct sk_buff *skb)
if (skb_copy_bits(skb, offset, &flags, 1) < 0)
goto protocol_error;
offset += sizeof(struct rxrpc_jumbo_header);
- } while (flags & RXRPC_JUMBO_PACKET);
+ }
- sp->nr_jumbo = nr_jumbo;
+ if (flags & RXRPC_LAST_PACKET)
+ sp->rx_flags |= RXRPC_SKB_INCL_LAST;
return true;
protocol_error:
@@ -399,10 +389,10 @@ protocol_error:
* (that information is encoded in the ACK packet).
*/
static void rxrpc_input_dup_data(struct rxrpc_call *call, rxrpc_seq_t seq,
- u8 annotation, bool *_jumbo_bad)
+ bool is_jumbo, bool *_jumbo_bad)
{
/* Discard normal packets that are duplicates. */
- if (annotation == 0)
+ if (is_jumbo)
return;
/* Skip jumbo subpackets that are duplicates. When we've had three or
@@ -416,29 +406,30 @@ static void rxrpc_input_dup_data(struct rxrpc_call *call, rxrpc_seq_t seq,
}
/*
- * Process a DATA packet, adding the packet to the Rx ring.
+ * Process a DATA packet, adding the packet to the Rx ring. The caller's
+ * packet ref must be passed on or discarded.
*/
static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb)
{
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
enum rxrpc_call_state state;
- unsigned int offset = sizeof(struct rxrpc_wire_header);
- unsigned int ix;
+ unsigned int j, nr_subpackets;
rxrpc_serial_t serial = sp->hdr.serial, ack_serial = 0;
- rxrpc_seq_t seq = sp->hdr.seq, hard_ack;
- bool immediate_ack = false, jumbo_bad = false, queued;
- u16 len;
- u8 ack = 0, flags, annotation = 0;
+ rxrpc_seq_t seq0 = sp->hdr.seq, hard_ack;
+ bool immediate_ack = false, jumbo_bad = false;
+ u8 ack = 0;
_enter("{%u,%u},{%u,%u}",
- call->rx_hard_ack, call->rx_top, skb->len, seq);
+ call->rx_hard_ack, call->rx_top, skb->len, seq0);
- _proto("Rx DATA %%%u { #%u f=%02x }",
- sp->hdr.serial, seq, sp->hdr.flags);
+ _proto("Rx DATA %%%u { #%u f=%02x n=%u }",
+ sp->hdr.serial, seq0, sp->hdr.flags, sp->nr_subpackets);
state = READ_ONCE(call->state);
- if (state >= RXRPC_CALL_COMPLETE)
+ if (state >= RXRPC_CALL_COMPLETE) {
+ rxrpc_free_skb(skb, rxrpc_skb_freed);
return;
+ }
if (call->state == RXRPC_CALL_SERVER_RECV_REQUEST) {
unsigned long timo = READ_ONCE(call->next_req_timo);
@@ -463,137 +454,139 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb)
!rxrpc_receiving_reply(call))
goto unlock;
- call->ackr_prev_seq = seq;
-
+ call->ackr_prev_seq = seq0;
hard_ack = READ_ONCE(call->rx_hard_ack);
- if (after(seq, hard_ack + call->rx_winsize)) {
- ack = RXRPC_ACK_EXCEEDS_WINDOW;
- ack_serial = serial;
- goto ack;
- }
- flags = sp->hdr.flags;
- if (flags & RXRPC_JUMBO_PACKET) {
+ nr_subpackets = sp->nr_subpackets;
+ if (nr_subpackets > 1) {
if (call->nr_jumbo_bad > 3) {
ack = RXRPC_ACK_NOSPACE;
ack_serial = serial;
goto ack;
}
- annotation = 1;
}
-next_subpacket:
- queued = false;
- ix = seq & RXRPC_RXTX_BUFF_MASK;
- len = skb->len;
- if (flags & RXRPC_JUMBO_PACKET)
- len = RXRPC_JUMBO_DATALEN;
-
- if (flags & RXRPC_LAST_PACKET) {
- if (test_bit(RXRPC_CALL_RX_LAST, &call->flags) &&
- seq != call->rx_top) {
- rxrpc_proto_abort("LSN", call, seq);
- goto unlock;
- }
- } else {
- if (test_bit(RXRPC_CALL_RX_LAST, &call->flags) &&
- after_eq(seq, call->rx_top)) {
- rxrpc_proto_abort("LSA", call, seq);
- goto unlock;
+ for (j = 0; j < nr_subpackets; j++) {
+ rxrpc_serial_t serial = sp->hdr.serial + j;
+ rxrpc_seq_t seq = seq0 + j;
+ unsigned int ix = seq & RXRPC_RXTX_BUFF_MASK;
+ bool terminal = (j == nr_subpackets - 1);
+ bool last = terminal && (sp->rx_flags & RXRPC_SKB_INCL_LAST);
+ u8 flags, annotation = j;
+
+ _proto("Rx DATA+%u %%%u { #%x t=%u l=%u }",
+ j, serial, seq, terminal, last);
+
+ if (last) {
+ if (test_bit(RXRPC_CALL_RX_LAST, &call->flags) &&
+ seq != call->rx_top) {
+ rxrpc_proto_abort("LSN", call, seq);
+ goto unlock;
+ }
+ } else {
+ if (test_bit(RXRPC_CALL_RX_LAST, &call->flags) &&
+ after_eq(seq, call->rx_top)) {
+ rxrpc_proto_abort("LSA", call, seq);
+ goto unlock;
+ }
}
- }
- trace_rxrpc_rx_data(call->debug_id, seq, serial, flags, annotation);
- if (before_eq(seq, hard_ack)) {
- ack = RXRPC_ACK_DUPLICATE;
- ack_serial = serial;
- goto skip;
- }
-
- if (flags & RXRPC_REQUEST_ACK && !ack) {
- ack = RXRPC_ACK_REQUESTED;
- ack_serial = serial;
- }
+ flags = 0;
+ if (last)
+ flags |= RXRPC_LAST_PACKET;
+ if (!terminal)
+ flags |= RXRPC_JUMBO_PACKET;
+ if (test_bit(j, sp->rx_req_ack))
+ flags |= RXRPC_REQUEST_ACK;
+ trace_rxrpc_rx_data(call->debug_id, seq, serial, flags, annotation);
- if (call->rxtx_buffer[ix]) {
- rxrpc_input_dup_data(call, seq, annotation, &jumbo_bad);
- if (ack != RXRPC_ACK_DUPLICATE) {
+ if (before_eq(seq, hard_ack)) {
ack = RXRPC_ACK_DUPLICATE;
ack_serial = serial;
+ continue;
}
- immediate_ack = true;
- goto skip;
- }
-
- /* Queue the packet. We use a couple of memory barriers here as need
- * to make sure that rx_top is perceived to be set after the buffer
- * pointer and that the buffer pointer is set after the annotation and
- * the skb data.
- *
- * Barriers against rxrpc_recvmsg_data() and rxrpc_rotate_rx_window()
- * and also rxrpc_fill_out_ack().
- */
- rxrpc_get_skb(skb, rxrpc_skb_rx_got);
- call->rxtx_annotations[ix] = annotation;
- smp_wmb();
- call->rxtx_buffer[ix] = skb;
- if (after(seq, call->rx_top)) {
- smp_store_release(&call->rx_top, seq);
- } else if (before(seq, call->rx_top)) {
- /* Send an immediate ACK if we fill in a hole */
- if (!ack) {
- ack = RXRPC_ACK_DELAY;
- ack_serial = serial;
- }
- immediate_ack = true;
- }
- if (flags & RXRPC_LAST_PACKET) {
- set_bit(RXRPC_CALL_RX_LAST, &call->flags);
- trace_rxrpc_receive(call, rxrpc_receive_queue_last, serial, seq);
- } else {
- trace_rxrpc_receive(call, rxrpc_receive_queue, serial, seq);
- }
- queued = true;
- if (after_eq(seq, call->rx_expect_next)) {
- if (after(seq, call->rx_expect_next)) {
- _net("OOS %u > %u", seq, call->rx_expect_next);
- ack = RXRPC_ACK_OUT_OF_SEQUENCE;
- ack_serial = serial;
+ if (call->rxtx_buffer[ix]) {
+ rxrpc_input_dup_data(call, seq, nr_subpackets > 1,
+ &jumbo_bad);
+ if (ack != RXRPC_ACK_DUPLICATE) {
+ ack = RXRPC_ACK_DUPLICATE;
+ ack_serial = serial;
+ }
+ immediate_ack = true;
+ continue;
}
- call->rx_expect_next = seq + 1;
- }
-skip:
- offset += len;
- if (flags & RXRPC_JUMBO_PACKET) {
- if (skb_copy_bits(skb, offset, &flags, 1) < 0) {
- rxrpc_proto_abort("XJF", call, seq);
- goto unlock;
- }
- offset += sizeof(struct rxrpc_jumbo_header);
- seq++;
- serial++;
- annotation++;
- if (flags & RXRPC_JUMBO_PACKET)
- annotation |= RXRPC_RX_ANNO_JLAST;
if (after(seq, hard_ack + call->rx_winsize)) {
ack = RXRPC_ACK_EXCEEDS_WINDOW;
ack_serial = serial;
- if (!jumbo_bad) {
- call->nr_jumbo_bad++;
- jumbo_bad = true;
+ if (flags & RXRPC_JUMBO_PACKET) {
+ if (!jumbo_bad) {
+ call->nr_jumbo_bad++;
+ jumbo_bad = true;
+ }
}
+
goto ack;
}
- _proto("Rx DATA Jumbo %%%u", serial);
- goto next_subpacket;
- }
+ if (flags & RXRPC_REQUEST_ACK && !ack) {
+ ack = RXRPC_ACK_REQUESTED;
+ ack_serial = serial;
+ }
+
+ /* Queue the packet. We use a couple of memory barriers here as need
+ * to make sure that rx_top is perceived to be set after the buffer
+ * pointer and that the buffer pointer is set after the annotation and
+ * the skb data.
+ *
+ * Barriers against rxrpc_recvmsg_data() and rxrpc_rotate_rx_window()
+ * and also rxrpc_fill_out_ack().
+ */
+ if (!terminal)
+ rxrpc_get_skb(skb, rxrpc_skb_got);
+ call->rxtx_annotations[ix] = annotation;
+ smp_wmb();
+ call->rxtx_buffer[ix] = skb;
+ if (after(seq, call->rx_top)) {
+ smp_store_release(&call->rx_top, seq);
+ } else if (before(seq, call->rx_top)) {
+ /* Send an immediate ACK if we fill in a hole */
+ if (!ack) {
+ ack = RXRPC_ACK_DELAY;
+ ack_serial = serial;
+ }
+ immediate_ack = true;
+ }
+
+ if (terminal) {
+ /* From this point on, we're not allowed to touch the
+ * packet any longer as its ref now belongs to the Rx
+ * ring.
+ */
+ skb = NULL;
+ sp = NULL;
+ }
- if (queued && flags & RXRPC_LAST_PACKET && !ack) {
- ack = RXRPC_ACK_DELAY;
- ack_serial = serial;
+ if (last) {
+ set_bit(RXRPC_CALL_RX_LAST, &call->flags);
+ if (!ack) {
+ ack = RXRPC_ACK_DELAY;
+ ack_serial = serial;
+ }
+ trace_rxrpc_receive(call, rxrpc_receive_queue_last, serial, seq);
+ } else {
+ trace_rxrpc_receive(call, rxrpc_receive_queue, serial, seq);
+ }
+
+ if (after_eq(seq, call->rx_expect_next)) {
+ if (after(seq, call->rx_expect_next)) {
+ _net("OOS %u > %u", seq, call->rx_expect_next);
+ ack = RXRPC_ACK_OUT_OF_SEQUENCE;
+ ack_serial = serial;
+ }
+ call->rx_expect_next = seq + 1;
+ }
}
ack:
@@ -606,13 +599,12 @@ ack:
false, true,
rxrpc_propose_ack_input_data);
- if (sp->hdr.seq == READ_ONCE(call->rx_hard_ack) + 1) {
- trace_rxrpc_notify_socket(call->debug_id, serial);
- rxrpc_notify_socket(call);
- }
+ trace_rxrpc_notify_socket(call->debug_id, serial);
+ rxrpc_notify_socket(call);
unlock:
spin_unlock(&call->input_lock);
+ rxrpc_free_skb(skb, rxrpc_skb_freed);
_leave(" [queued]");
}
@@ -1021,7 +1013,7 @@ static void rxrpc_input_call_packet(struct rxrpc_call *call,
switch (sp->hdr.type) {
case RXRPC_PACKET_TYPE_DATA:
rxrpc_input_data(call, skb);
- break;
+ goto no_free;
case RXRPC_PACKET_TYPE_ACK:
rxrpc_input_ack(call, skb);
@@ -1048,6 +1040,8 @@ static void rxrpc_input_call_packet(struct rxrpc_call *call,
break;
}
+ rxrpc_free_skb(skb, rxrpc_skb_freed);
+no_free:
_leave("");
}
@@ -1109,7 +1103,7 @@ static void rxrpc_post_packet_to_local(struct rxrpc_local *local,
skb_queue_tail(&local->event_queue, skb);
rxrpc_queue_local(local);
} else {
- rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
+ rxrpc_free_skb(skb, rxrpc_skb_freed);
}
}
@@ -1124,7 +1118,7 @@ static void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb)
skb_queue_tail(&local->reject_queue, skb);
rxrpc_queue_local(local);
} else {
- rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
+ rxrpc_free_skb(skb, rxrpc_skb_freed);
}
}
@@ -1188,7 +1182,7 @@ int rxrpc_input_packet(struct sock *udp_sk, struct sk_buff *skb)
if (skb->tstamp == 0)
skb->tstamp = ktime_get_real();
- rxrpc_new_skb(skb, rxrpc_skb_rx_received);
+ rxrpc_new_skb(skb, rxrpc_skb_received);
skb_pull(skb, sizeof(struct udphdr));
@@ -1205,7 +1199,7 @@ int rxrpc_input_packet(struct sock *udp_sk, struct sk_buff *skb)
static int lose;
if ((lose++ & 7) == 7) {
trace_rxrpc_rx_lose(sp);
- rxrpc_free_skb(skb, rxrpc_skb_rx_lost);
+ rxrpc_free_skb(skb, rxrpc_skb_lost);
return 0;
}
}
@@ -1237,9 +1231,26 @@ int rxrpc_input_packet(struct sock *udp_sk, struct sk_buff *skb)
if (sp->hdr.callNumber == 0 ||
sp->hdr.seq == 0)
goto bad_message;
- if (sp->hdr.flags & RXRPC_JUMBO_PACKET &&
- !rxrpc_validate_jumbo(skb))
+ if (!rxrpc_validate_data(skb))
goto bad_message;
+
+ /* Unshare the packet so that it can be modified for in-place
+ * decryption.
+ */
+ if (sp->hdr.securityIndex != 0) {
+ struct sk_buff *nskb = skb_unshare(skb, GFP_ATOMIC);
+ if (!nskb) {
+ rxrpc_eaten_skb(skb, rxrpc_skb_unshared_nomem);
+ goto out;
+ }
+
+ if (nskb != skb) {
+ rxrpc_eaten_skb(skb, rxrpc_skb_received);
+ skb = nskb;
+ rxrpc_new_skb(skb, rxrpc_skb_unshared);
+ sp = rxrpc_skb(skb);
+ }
+ }
break;
case RXRPC_PACKET_TYPE_CHALLENGE:
@@ -1369,15 +1380,16 @@ int rxrpc_input_packet(struct sock *udp_sk, struct sk_buff *skb)
call = rxrpc_new_incoming_call(local, rx, skb);
if (!call)
goto reject_packet;
- rxrpc_send_ping(call, skb);
- mutex_unlock(&call->user_mutex);
}
+ /* Process a call packet; this either discards or passes on the ref
+ * elsewhere.
+ */
rxrpc_input_call_packet(call, skb);
- goto discard;
+ goto out;
discard:
- rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
+ rxrpc_free_skb(skb, rxrpc_skb_freed);
out:
trace_rxrpc_rx_done(0, 0);
return 0;
diff --git a/net/rxrpc/insecure.c b/net/rxrpc/insecure.c
index a29d26c273b5..f6c59f5fae9d 100644
--- a/net/rxrpc/insecure.c
+++ b/net/rxrpc/insecure.c
@@ -33,6 +33,10 @@ static int none_verify_packet(struct rxrpc_call *call, struct sk_buff *skb,
return 0;
}
+static void none_free_call_crypto(struct rxrpc_call *call)
+{
+}
+
static void none_locate_data(struct rxrpc_call *call, struct sk_buff *skb,
unsigned int *_offset, unsigned int *_len)
{
@@ -83,6 +87,7 @@ const struct rxrpc_security rxrpc_no_security = {
.exit = none_exit,
.init_connection_security = none_init_connection_security,
.prime_packet_security = none_prime_packet_security,
+ .free_call_crypto = none_free_call_crypto,
.secure_packet = none_secure_packet,
.verify_packet = none_verify_packet,
.locate_data = none_locate_data,
diff --git a/net/rxrpc/local_event.c b/net/rxrpc/local_event.c
index e93a78f7c05e..3ce6d628cd75 100644
--- a/net/rxrpc/local_event.c
+++ b/net/rxrpc/local_event.c
@@ -90,7 +90,7 @@ void rxrpc_process_local_events(struct rxrpc_local *local)
if (skb) {
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
- rxrpc_see_skb(skb, rxrpc_skb_rx_seen);
+ rxrpc_see_skb(skb, rxrpc_skb_seen);
_debug("{%d},{%u}", local->debug_id, sp->hdr.type);
switch (sp->hdr.type) {
@@ -108,7 +108,7 @@ void rxrpc_process_local_events(struct rxrpc_local *local)
break;
}
- rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
+ rxrpc_free_skb(skb, rxrpc_skb_freed);
}
_leave("");
diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c
index 72a6e12a9304..a6c1349e965d 100644
--- a/net/rxrpc/local_object.c
+++ b/net/rxrpc/local_object.c
@@ -364,11 +364,14 @@ void rxrpc_queue_local(struct rxrpc_local *local)
void rxrpc_put_local(struct rxrpc_local *local)
{
const void *here = __builtin_return_address(0);
+ unsigned int debug_id;
int n;
if (local) {
+ debug_id = local->debug_id;
+
n = atomic_dec_return(&local->usage);
- trace_rxrpc_local(local->debug_id, rxrpc_local_put, n, here);
+ trace_rxrpc_local(debug_id, rxrpc_local_put, n, here);
if (n == 0)
call_rcu(&local->rcu, rxrpc_local_rcu);
@@ -380,14 +383,11 @@ void rxrpc_put_local(struct rxrpc_local *local)
*/
struct rxrpc_local *rxrpc_use_local(struct rxrpc_local *local)
{
- unsigned int au;
-
local = rxrpc_get_local_maybe(local);
if (!local)
return NULL;
- au = atomic_fetch_add_unless(&local->active_users, 1, 0);
- if (au == 0) {
+ if (!__rxrpc_use_local(local)) {
rxrpc_put_local(local);
return NULL;
}
@@ -401,14 +401,11 @@ struct rxrpc_local *rxrpc_use_local(struct rxrpc_local *local)
*/
void rxrpc_unuse_local(struct rxrpc_local *local)
{
- unsigned int au;
-
if (local) {
- au = atomic_dec_return(&local->active_users);
- if (au == 0)
+ if (__rxrpc_unuse_local(local)) {
+ rxrpc_get_local(local);
rxrpc_queue_local(local);
- else
- rxrpc_put_local(local);
+ }
}
}
@@ -426,11 +423,14 @@ static void rxrpc_local_destroyer(struct rxrpc_local *local)
_enter("%d", local->debug_id);
+ local->dead = true;
+
mutex_lock(&rxnet->local_mutex);
list_del_init(&local->link);
mutex_unlock(&rxnet->local_mutex);
- ASSERT(RB_EMPTY_ROOT(&local->client_conns));
+ rxrpc_clean_up_local_conns(local);
+ rxrpc_service_connection_reaper(&rxnet->service_conn_reaper);
ASSERT(!local->service);
if (socket) {
@@ -462,7 +462,7 @@ static void rxrpc_local_processor(struct work_struct *work)
do {
again = false;
- if (atomic_read(&local->active_users) == 0) {
+ if (!__rxrpc_use_local(local)) {
rxrpc_local_destroyer(local);
break;
}
@@ -476,6 +476,8 @@ static void rxrpc_local_processor(struct work_struct *work)
rxrpc_process_local_events(local);
again = true;
}
+
+ __rxrpc_unuse_local(local);
} while (again);
rxrpc_put_local(local);
diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c
index 369e516c4bdf..bad3d2420344 100644
--- a/net/rxrpc/output.c
+++ b/net/rxrpc/output.c
@@ -129,7 +129,7 @@ static size_t rxrpc_fill_out_ack(struct rxrpc_connection *conn,
int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping,
rxrpc_serial_t *_serial)
{
- struct rxrpc_connection *conn = NULL;
+ struct rxrpc_connection *conn;
struct rxrpc_ack_buffer *pkt;
struct msghdr msg;
struct kvec iov[2];
@@ -139,18 +139,14 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping,
int ret;
u8 reason;
- spin_lock_bh(&call->lock);
- if (call->conn)
- conn = rxrpc_get_connection_maybe(call->conn);
- spin_unlock_bh(&call->lock);
- if (!conn)
+ if (test_bit(RXRPC_CALL_DISCONNECTED, &call->flags))
return -ECONNRESET;
pkt = kzalloc(sizeof(*pkt), GFP_KERNEL);
- if (!pkt) {
- rxrpc_put_connection(conn);
+ if (!pkt)
return -ENOMEM;
- }
+
+ conn = call->conn;
msg.msg_name = &call->peer->srx.transport;
msg.msg_namelen = call->peer->srx.transport_len;
@@ -244,7 +240,6 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping,
}
out:
- rxrpc_put_connection(conn);
kfree(pkt);
return ret;
}
@@ -254,7 +249,7 @@ out:
*/
int rxrpc_send_abort_packet(struct rxrpc_call *call)
{
- struct rxrpc_connection *conn = NULL;
+ struct rxrpc_connection *conn;
struct rxrpc_abort_buffer pkt;
struct msghdr msg;
struct kvec iov[1];
@@ -271,13 +266,11 @@ int rxrpc_send_abort_packet(struct rxrpc_call *call)
test_bit(RXRPC_CALL_TX_LAST, &call->flags))
return 0;
- spin_lock_bh(&call->lock);
- if (call->conn)
- conn = rxrpc_get_connection_maybe(call->conn);
- spin_unlock_bh(&call->lock);
- if (!conn)
+ if (test_bit(RXRPC_CALL_DISCONNECTED, &call->flags))
return -ECONNRESET;
+ conn = call->conn;
+
msg.msg_name = &call->peer->srx.transport;
msg.msg_namelen = call->peer->srx.transport_len;
msg.msg_control = NULL;
@@ -312,8 +305,6 @@ int rxrpc_send_abort_packet(struct rxrpc_call *call)
trace_rxrpc_tx_packet(call->debug_id, &pkt.whdr,
rxrpc_tx_point_call_abort);
rxrpc_tx_backoff(call, ret);
-
- rxrpc_put_connection(conn);
return ret;
}
@@ -565,7 +556,7 @@ void rxrpc_reject_packets(struct rxrpc_local *local)
memset(&whdr, 0, sizeof(whdr));
while ((skb = skb_dequeue(&local->reject_queue))) {
- rxrpc_see_skb(skb, rxrpc_skb_rx_seen);
+ rxrpc_see_skb(skb, rxrpc_skb_seen);
sp = rxrpc_skb(skb);
switch (skb->mark) {
@@ -581,7 +572,7 @@ void rxrpc_reject_packets(struct rxrpc_local *local)
ioc = 2;
break;
default:
- rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
+ rxrpc_free_skb(skb, rxrpc_skb_freed);
continue;
}
@@ -606,7 +597,7 @@ void rxrpc_reject_packets(struct rxrpc_local *local)
rxrpc_tx_point_reject);
}
- rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
+ rxrpc_free_skb(skb, rxrpc_skb_freed);
}
_leave("");
diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c
index 7666ec72d37e..923b263c401b 100644
--- a/net/rxrpc/peer_event.c
+++ b/net/rxrpc/peer_event.c
@@ -147,10 +147,16 @@ void rxrpc_error_report(struct sock *sk)
{
struct sock_exterr_skb *serr;
struct sockaddr_rxrpc srx;
- struct rxrpc_local *local = sk->sk_user_data;
+ struct rxrpc_local *local;
struct rxrpc_peer *peer;
struct sk_buff *skb;
+ rcu_read_lock();
+ local = rcu_dereference_sk_user_data(sk);
+ if (unlikely(!local)) {
+ rcu_read_unlock();
+ return;
+ }
_enter("%p{%d}", sk, local->debug_id);
/* Clear the outstanding error value on the socket so that it doesn't
@@ -160,24 +166,25 @@ void rxrpc_error_report(struct sock *sk)
skb = sock_dequeue_err_skb(sk);
if (!skb) {
+ rcu_read_unlock();
_leave("UDP socket errqueue empty");
return;
}
- rxrpc_new_skb(skb, rxrpc_skb_rx_received);
+ rxrpc_new_skb(skb, rxrpc_skb_received);
serr = SKB_EXT_ERR(skb);
if (!skb->len && serr->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING) {
_leave("UDP empty message");
- rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
+ rcu_read_unlock();
+ rxrpc_free_skb(skb, rxrpc_skb_freed);
return;
}
- rcu_read_lock();
peer = rxrpc_lookup_peer_icmp_rcu(local, skb, &srx);
if (peer && !rxrpc_get_peer_maybe(peer))
peer = NULL;
if (!peer) {
rcu_read_unlock();
- rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
+ rxrpc_free_skb(skb, rxrpc_skb_freed);
_leave(" [no peer]");
return;
}
@@ -189,7 +196,7 @@ void rxrpc_error_report(struct sock *sk)
serr->ee.ee_code == ICMP_FRAG_NEEDED)) {
rxrpc_adjust_mtu(peer, serr);
rcu_read_unlock();
- rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
+ rxrpc_free_skb(skb, rxrpc_skb_freed);
rxrpc_put_peer(peer);
_leave(" [MTU update]");
return;
@@ -197,7 +204,7 @@ void rxrpc_error_report(struct sock *sk)
rxrpc_store_error(peer, serr);
rcu_read_unlock();
- rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
+ rxrpc_free_skb(skb, rxrpc_skb_freed);
rxrpc_put_peer(peer);
_leave("");
@@ -357,27 +364,31 @@ static void rxrpc_peer_keepalive_dispatch(struct rxrpc_net *rxnet,
if (!rxrpc_get_peer_maybe(peer))
continue;
- spin_unlock_bh(&rxnet->peer_hash_lock);
+ if (__rxrpc_use_local(peer->local)) {
+ spin_unlock_bh(&rxnet->peer_hash_lock);
- keepalive_at = peer->last_tx_at + RXRPC_KEEPALIVE_TIME;
- slot = keepalive_at - base;
- _debug("%02x peer %u t=%d {%pISp}",
- cursor, peer->debug_id, slot, &peer->srx.transport);
+ keepalive_at = peer->last_tx_at + RXRPC_KEEPALIVE_TIME;
+ slot = keepalive_at - base;
+ _debug("%02x peer %u t=%d {%pISp}",
+ cursor, peer->debug_id, slot, &peer->srx.transport);
- if (keepalive_at <= base ||
- keepalive_at > base + RXRPC_KEEPALIVE_TIME) {
- rxrpc_send_keepalive(peer);
- slot = RXRPC_KEEPALIVE_TIME;
- }
+ if (keepalive_at <= base ||
+ keepalive_at > base + RXRPC_KEEPALIVE_TIME) {
+ rxrpc_send_keepalive(peer);
+ slot = RXRPC_KEEPALIVE_TIME;
+ }
- /* A transmission to this peer occurred since last we examined
- * it so put it into the appropriate future bucket.
- */
- slot += cursor;
- slot &= mask;
- spin_lock_bh(&rxnet->peer_hash_lock);
- list_add_tail(&peer->keepalive_link,
- &rxnet->peer_keepalive[slot & mask]);
+ /* A transmission to this peer occurred since last we
+ * examined it so put it into the appropriate future
+ * bucket.
+ */
+ slot += cursor;
+ slot &= mask;
+ spin_lock_bh(&rxnet->peer_hash_lock);
+ list_add_tail(&peer->keepalive_link,
+ &rxnet->peer_keepalive[slot & mask]);
+ rxrpc_unuse_local(peer->local);
+ }
rxrpc_put_peer_locked(peer);
}
diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c
index 9c3ac96f71cb..452163eadb98 100644
--- a/net/rxrpc/peer_object.c
+++ b/net/rxrpc/peer_object.c
@@ -209,6 +209,7 @@ static void rxrpc_assess_MTU_size(struct rxrpc_sock *rx,
*/
struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp)
{
+ const void *here = __builtin_return_address(0);
struct rxrpc_peer *peer;
_enter("");
@@ -216,7 +217,7 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp)
peer = kzalloc(sizeof(struct rxrpc_peer), gfp);
if (peer) {
atomic_set(&peer->usage, 1);
- peer->local = local;
+ peer->local = rxrpc_get_local(local);
INIT_HLIST_HEAD(&peer->error_targets);
peer->service_conns = RB_ROOT;
seqlock_init(&peer->service_conn_lock);
@@ -230,6 +231,7 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp)
peer->cong_cwnd = 3;
else
peer->cong_cwnd = 4;
+ trace_rxrpc_peer(peer->debug_id, rxrpc_peer_new, 1, here);
}
_leave(" = %p", peer);
@@ -307,7 +309,6 @@ void rxrpc_new_incoming_peer(struct rxrpc_sock *rx, struct rxrpc_local *local,
unsigned long hash_key;
hash_key = rxrpc_peer_hash_key(local, &peer->srx);
- peer->local = local;
rxrpc_init_peer(rx, peer, hash_key);
spin_lock(&rxnet->peer_hash_lock);
@@ -382,7 +383,7 @@ struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *peer)
int n;
n = atomic_inc_return(&peer->usage);
- trace_rxrpc_peer(peer, rxrpc_peer_got, n, here);
+ trace_rxrpc_peer(peer->debug_id, rxrpc_peer_got, n, here);
return peer;
}
@@ -396,7 +397,7 @@ struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *peer)
if (peer) {
int n = atomic_fetch_add_unless(&peer->usage, 1, 0);
if (n > 0)
- trace_rxrpc_peer(peer, rxrpc_peer_got, n + 1, here);
+ trace_rxrpc_peer(peer->debug_id, rxrpc_peer_got, n + 1, here);
else
peer = NULL;
}
@@ -417,6 +418,7 @@ static void __rxrpc_put_peer(struct rxrpc_peer *peer)
list_del_init(&peer->keepalive_link);
spin_unlock_bh(&rxnet->peer_hash_lock);
+ rxrpc_put_local(peer->local);
kfree_rcu(peer, rcu);
}
@@ -426,11 +428,13 @@ static void __rxrpc_put_peer(struct rxrpc_peer *peer)
void rxrpc_put_peer(struct rxrpc_peer *peer)
{
const void *here = __builtin_return_address(0);
+ unsigned int debug_id;
int n;
if (peer) {
+ debug_id = peer->debug_id;
n = atomic_dec_return(&peer->usage);
- trace_rxrpc_peer(peer, rxrpc_peer_put, n, here);
+ trace_rxrpc_peer(debug_id, rxrpc_peer_put, n, here);
if (n == 0)
__rxrpc_put_peer(peer);
}
@@ -443,13 +447,15 @@ void rxrpc_put_peer(struct rxrpc_peer *peer)
void rxrpc_put_peer_locked(struct rxrpc_peer *peer)
{
const void *here = __builtin_return_address(0);
+ unsigned int debug_id = peer->debug_id;
int n;
n = atomic_dec_return(&peer->usage);
- trace_rxrpc_peer(peer, rxrpc_peer_put, n, here);
+ trace_rxrpc_peer(debug_id, rxrpc_peer_put, n, here);
if (n == 0) {
hash_del_rcu(&peer->hash_link);
list_del_init(&peer->keepalive_link);
+ rxrpc_put_local(peer->local);
kfree_rcu(peer, rcu);
}
}
diff --git a/net/rxrpc/protocol.h b/net/rxrpc/protocol.h
index 99ce322d7caa..49bb972539aa 100644
--- a/net/rxrpc/protocol.h
+++ b/net/rxrpc/protocol.h
@@ -89,6 +89,15 @@ struct rxrpc_jumbo_header {
#define RXRPC_JUMBO_DATALEN 1412 /* non-terminal jumbo packet data length */
#define RXRPC_JUMBO_SUBPKTLEN (RXRPC_JUMBO_DATALEN + sizeof(struct rxrpc_jumbo_header))
+/*
+ * The maximum number of subpackets that can possibly fit in a UDP packet is:
+ *
+ * ((max_IP - IP_hdr - UDP_hdr) / RXRPC_JUMBO_SUBPKTLEN) + 1
+ * = ((65535 - 28 - 28) / 1416) + 1
+ * = 46 non-terminal packets and 1 terminal packet.
+ */
+#define RXRPC_MAX_NR_JUMBO 47
+
/*****************************************************************************/
/*
* on-the-wire Rx ACK packet data payload
diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c
index 9a7e1bc9791d..8578c39ec839 100644
--- a/net/rxrpc/recvmsg.c
+++ b/net/rxrpc/recvmsg.c
@@ -177,7 +177,8 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call)
struct sk_buff *skb;
rxrpc_serial_t serial;
rxrpc_seq_t hard_ack, top;
- u8 flags;
+ bool last = false;
+ u8 subpacket;
int ix;
_enter("%d", call->debug_id);
@@ -189,23 +190,25 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call)
hard_ack++;
ix = hard_ack & RXRPC_RXTX_BUFF_MASK;
skb = call->rxtx_buffer[ix];
- rxrpc_see_skb(skb, rxrpc_skb_rx_rotated);
+ rxrpc_see_skb(skb, rxrpc_skb_rotated);
sp = rxrpc_skb(skb);
- flags = sp->hdr.flags;
- serial = sp->hdr.serial;
- if (call->rxtx_annotations[ix] & RXRPC_RX_ANNO_JUMBO)
- serial += (call->rxtx_annotations[ix] & RXRPC_RX_ANNO_JUMBO) - 1;
+
+ subpacket = call->rxtx_annotations[ix] & RXRPC_RX_ANNO_SUBPACKET;
+ serial = sp->hdr.serial + subpacket;
+
+ if (subpacket == sp->nr_subpackets - 1 &&
+ sp->rx_flags & RXRPC_SKB_INCL_LAST)
+ last = true;
call->rxtx_buffer[ix] = NULL;
call->rxtx_annotations[ix] = 0;
/* Barrier against rxrpc_input_data(). */
smp_store_release(&call->rx_hard_ack, hard_ack);
- rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
+ rxrpc_free_skb(skb, rxrpc_skb_freed);
- _debug("%u,%u,%02x", hard_ack, top, flags);
trace_rxrpc_receive(call, rxrpc_receive_rotate, serial, hard_ack);
- if (flags & RXRPC_LAST_PACKET) {
+ if (last) {
rxrpc_end_rx_phase(call, serial);
} else {
/* Check to see if there's an ACK that needs sending. */
@@ -233,22 +236,23 @@ static int rxrpc_verify_packet(struct rxrpc_call *call, struct sk_buff *skb,
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
rxrpc_seq_t seq = sp->hdr.seq;
u16 cksum = sp->hdr.cksum;
+ u8 subpacket = annotation & RXRPC_RX_ANNO_SUBPACKET;
_enter("");
/* For all but the head jumbo subpacket, the security checksum is in a
* jumbo header immediately prior to the data.
*/
- if ((annotation & RXRPC_RX_ANNO_JUMBO) > 1) {
+ if (subpacket > 0) {
__be16 tmp;
if (skb_copy_bits(skb, offset - 2, &tmp, 2) < 0)
BUG();
cksum = ntohs(tmp);
- seq += (annotation & RXRPC_RX_ANNO_JUMBO) - 1;
+ seq += subpacket;
}
- return call->conn->security->verify_packet(call, skb, offset, len,
- seq, cksum);
+ return call->security->verify_packet(call, skb, offset, len,
+ seq, cksum);
}
/*
@@ -263,21 +267,24 @@ static int rxrpc_verify_packet(struct rxrpc_call *call, struct sk_buff *skb,
*/
static int rxrpc_locate_data(struct rxrpc_call *call, struct sk_buff *skb,
u8 *_annotation,
- unsigned int *_offset, unsigned int *_len)
+ unsigned int *_offset, unsigned int *_len,
+ bool *_last)
{
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
unsigned int offset = sizeof(struct rxrpc_wire_header);
unsigned int len;
+ bool last = false;
int ret;
u8 annotation = *_annotation;
+ u8 subpacket = annotation & RXRPC_RX_ANNO_SUBPACKET;
/* Locate the subpacket */
+ offset += subpacket * RXRPC_JUMBO_SUBPKTLEN;
len = skb->len - offset;
- if ((annotation & RXRPC_RX_ANNO_JUMBO) > 0) {
- offset += (((annotation & RXRPC_RX_ANNO_JUMBO) - 1) *
- RXRPC_JUMBO_SUBPKTLEN);
- len = (annotation & RXRPC_RX_ANNO_JLAST) ?
- skb->len - offset : RXRPC_JUMBO_SUBPKTLEN;
- }
+ if (subpacket < sp->nr_subpackets - 1)
+ len = RXRPC_JUMBO_DATALEN;
+ else if (sp->rx_flags & RXRPC_SKB_INCL_LAST)
+ last = true;
if (!(annotation & RXRPC_RX_ANNO_VERIFIED)) {
ret = rxrpc_verify_packet(call, skb, annotation, offset, len);
@@ -288,7 +295,8 @@ static int rxrpc_locate_data(struct rxrpc_call *call, struct sk_buff *skb,
*_offset = offset;
*_len = len;
- call->conn->security->locate_data(call, skb, _offset, _len);
+ *_last = last;
+ call->security->locate_data(call, skb, _offset, _len);
return 0;
}
@@ -303,9 +311,10 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call,
{
struct rxrpc_skb_priv *sp;
struct sk_buff *skb;
+ rxrpc_serial_t serial;
rxrpc_seq_t hard_ack, top, seq;
size_t remain;
- bool last;
+ bool rx_pkt_last;
unsigned int rx_pkt_offset, rx_pkt_len;
int ix, copy, ret = -EAGAIN, ret2;
@@ -315,6 +324,7 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call,
rx_pkt_offset = call->rx_pkt_offset;
rx_pkt_len = call->rx_pkt_len;
+ rx_pkt_last = call->rx_pkt_last;
if (call->state >= RXRPC_CALL_SERVER_ACK_REQUEST) {
seq = call->rx_hard_ack;
@@ -325,6 +335,7 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call,
/* Barriers against rxrpc_input_data(). */
hard_ack = call->rx_hard_ack;
seq = hard_ack + 1;
+
while (top = smp_load_acquire(&call->rx_top),
before_eq(seq, top)
) {
@@ -336,12 +347,15 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call,
break;
}
smp_rmb();
- rxrpc_see_skb(skb, rxrpc_skb_rx_seen);
+ rxrpc_see_skb(skb, rxrpc_skb_seen);
sp = rxrpc_skb(skb);
- if (!(flags & MSG_PEEK))
+ if (!(flags & MSG_PEEK)) {
+ serial = sp->hdr.serial;
+ serial += call->rxtx_annotations[ix] & RXRPC_RX_ANNO_SUBPACKET;
trace_rxrpc_receive(call, rxrpc_receive_front,
- sp->hdr.serial, seq);
+ serial, seq);
+ }
if (msg)
sock_recv_timestamp(msg, sock->sk, skb);
@@ -349,7 +363,8 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call,
if (rx_pkt_offset == 0) {
ret2 = rxrpc_locate_data(call, skb,
&call->rxtx_annotations[ix],
- &rx_pkt_offset, &rx_pkt_len);
+ &rx_pkt_offset, &rx_pkt_len,
+ &rx_pkt_last);
trace_rxrpc_recvmsg(call, rxrpc_recvmsg_next, seq,
rx_pkt_offset, rx_pkt_len, ret2);
if (ret2 < 0) {
@@ -389,13 +404,12 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call,
}
/* The whole packet has been transferred. */
- last = sp->hdr.flags & RXRPC_LAST_PACKET;
if (!(flags & MSG_PEEK))
rxrpc_rotate_rx_window(call);
rx_pkt_offset = 0;
rx_pkt_len = 0;
- if (last) {
+ if (rx_pkt_last) {
ASSERTCMP(seq, ==, READ_ONCE(call->rx_top));
ret = 1;
goto out;
@@ -408,6 +422,7 @@ out:
if (!(flags & MSG_PEEK)) {
call->rx_pkt_offset = rx_pkt_offset;
call->rx_pkt_len = rx_pkt_len;
+ call->rx_pkt_last = rx_pkt_last;
}
done:
trace_rxrpc_recvmsg(call, rxrpc_recvmsg_data_return, seq,
diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c
index ae8cd8926456..098f1f9ec53b 100644
--- a/net/rxrpc/rxkad.c
+++ b/net/rxrpc/rxkad.c
@@ -43,6 +43,7 @@ struct rxkad_level2_hdr {
* packets
*/
static struct crypto_sync_skcipher *rxkad_ci;
+static struct skcipher_request *rxkad_ci_req;
static DEFINE_MUTEX(rxkad_ci_mutex);
/*
@@ -99,8 +100,8 @@ error:
*/
static int rxkad_prime_packet_security(struct rxrpc_connection *conn)
{
+ struct skcipher_request *req;
struct rxrpc_key_token *token;
- SYNC_SKCIPHER_REQUEST_ON_STACK(req, conn->cipher);
struct scatterlist sg;
struct rxrpc_crypt iv;
__be32 *tmpbuf;
@@ -115,6 +116,12 @@ static int rxkad_prime_packet_security(struct rxrpc_connection *conn)
if (!tmpbuf)
return -ENOMEM;
+ req = skcipher_request_alloc(&conn->cipher->base, GFP_NOFS);
+ if (!req) {
+ kfree(tmpbuf);
+ return -ENOMEM;
+ }
+
token = conn->params.key->payload.data[0];
memcpy(&iv, token->kad->session_key, sizeof(iv));
@@ -128,7 +135,7 @@ static int rxkad_prime_packet_security(struct rxrpc_connection *conn)
skcipher_request_set_callback(req, 0, NULL, NULL);
skcipher_request_set_crypt(req, &sg, &sg, tmpsize, iv.x);
crypto_skcipher_encrypt(req);
- skcipher_request_zero(req);
+ skcipher_request_free(req);
memcpy(&conn->csum_iv, tmpbuf + 2, sizeof(conn->csum_iv));
kfree(tmpbuf);
@@ -137,6 +144,35 @@ static int rxkad_prime_packet_security(struct rxrpc_connection *conn)
}
/*
+ * Allocate and prepare the crypto request on a call. For any particular call,
+ * this is called serially for the packets, so no lock should be necessary.
+ */
+static struct skcipher_request *rxkad_get_call_crypto(struct rxrpc_call *call)
+{
+ struct crypto_skcipher *tfm = &call->conn->cipher->base;
+ struct skcipher_request *cipher_req = call->cipher_req;
+
+ if (!cipher_req) {
+ cipher_req = skcipher_request_alloc(tfm, GFP_NOFS);
+ if (!cipher_req)
+ return NULL;
+ call->cipher_req = cipher_req;
+ }
+
+ return cipher_req;
+}
+
+/*
+ * Clean up the crypto on a call.
+ */
+static void rxkad_free_call_crypto(struct rxrpc_call *call)
+{
+ if (call->cipher_req)
+ skcipher_request_free(call->cipher_req);
+ call->cipher_req = NULL;
+}
+
+/*
* partially encrypt a packet (level 1 security)
*/
static int rxkad_secure_packet_auth(const struct rxrpc_call *call,
@@ -187,10 +223,8 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call,
struct rxrpc_skb_priv *sp;
struct rxrpc_crypt iv;
struct scatterlist sg[16];
- struct sk_buff *trailer;
unsigned int len;
u16 check;
- int nsg;
int err;
sp = rxrpc_skb(skb);
@@ -214,15 +248,14 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call,
crypto_skcipher_encrypt(req);
/* we want to encrypt the skbuff in-place */
- nsg = skb_cow_data(skb, 0, &trailer);
- err = -ENOMEM;
- if (nsg < 0 || nsg > 16)
+ err = -EMSGSIZE;
+ if (skb_shinfo(skb)->nr_frags > 16)
goto out;
len = data_size + call->conn->size_align - 1;
len &= ~(call->conn->size_align - 1);
- sg_init_table(sg, nsg);
+ sg_init_table(sg, ARRAY_SIZE(sg));
err = skb_to_sgvec(skb, sg, 0, len);
if (unlikely(err < 0))
goto out;
@@ -246,7 +279,7 @@ static int rxkad_secure_packet(struct rxrpc_call *call,
void *sechdr)
{
struct rxrpc_skb_priv *sp;
- SYNC_SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher);
+ struct skcipher_request *req;
struct rxrpc_crypt iv;
struct scatterlist sg;
u32 x, y;
@@ -265,6 +298,10 @@ static int rxkad_secure_packet(struct rxrpc_call *call,
if (ret < 0)
return ret;
+ req = rxkad_get_call_crypto(call);
+ if (!req)
+ return -ENOMEM;
+
/* continue encrypting from where we left off */
memcpy(&iv, call->conn->csum_iv.x, sizeof(iv));
@@ -319,11 +356,10 @@ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb,
struct rxkad_level1_hdr sechdr;
struct rxrpc_crypt iv;
struct scatterlist sg[16];
- struct sk_buff *trailer;
bool aborted;
u32 data_size, buf;
u16 check;
- int nsg, ret;
+ int ret;
_enter("");
@@ -336,11 +372,7 @@ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb,
/* Decrypt the skbuff in-place. TODO: We really want to decrypt
* directly into the target buffer.
*/
- nsg = skb_cow_data(skb, 0, &trailer);
- if (nsg < 0 || nsg > 16)
- goto nomem;
-
- sg_init_table(sg, nsg);
+ sg_init_table(sg, ARRAY_SIZE(sg));
ret = skb_to_sgvec(skb, sg, offset, 8);
if (unlikely(ret < 0))
return ret;
@@ -388,10 +420,6 @@ protocol_error:
if (aborted)
rxrpc_send_abort_packet(call);
return -EPROTO;
-
-nomem:
- _leave(" = -ENOMEM");
- return -ENOMEM;
}
/*
@@ -406,7 +434,6 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb,
struct rxkad_level2_hdr sechdr;
struct rxrpc_crypt iv;
struct scatterlist _sg[4], *sg;
- struct sk_buff *trailer;
bool aborted;
u32 data_size, buf;
u16 check;
@@ -423,12 +450,11 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb,
/* Decrypt the skbuff in-place. TODO: We really want to decrypt
* directly into the target buffer.
*/
- nsg = skb_cow_data(skb, 0, &trailer);
- if (nsg < 0)
- goto nomem;
-
sg = _sg;
- if (unlikely(nsg > 4)) {
+ nsg = skb_shinfo(skb)->nr_frags;
+ if (nsg <= 4) {
+ nsg = 4;
+ } else {
sg = kmalloc_array(nsg, sizeof(*sg), GFP_NOIO);
if (!sg)
goto nomem;
@@ -502,7 +528,7 @@ static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb,
unsigned int offset, unsigned int len,
rxrpc_seq_t seq, u16 expected_cksum)
{
- SYNC_SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher);
+ struct skcipher_request *req;
struct rxrpc_crypt iv;
struct scatterlist sg;
bool aborted;
@@ -515,6 +541,10 @@ static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb,
if (!call->conn->cipher)
return 0;
+ req = rxkad_get_call_crypto(call);
+ if (!req)
+ return -ENOMEM;
+
/* continue encrypting from where we left off */
memcpy(&iv, call->conn->csum_iv.x, sizeof(iv));
@@ -618,9 +648,9 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn)
u32 serial;
int ret;
- _enter("{%d,%x}", conn->debug_id, key_serial(conn->params.key));
+ _enter("{%d,%x}", conn->debug_id, key_serial(conn->server_key));
- ret = key_validate(conn->params.key);
+ ret = key_validate(conn->server_key);
if (ret < 0)
return ret;
@@ -747,14 +777,18 @@ static void rxkad_calc_response_checksum(struct rxkad_response *response)
/*
* encrypt the response packet
*/
-static void rxkad_encrypt_response(struct rxrpc_connection *conn,
- struct rxkad_response *resp,
- const struct rxkad_key *s2)
+static int rxkad_encrypt_response(struct rxrpc_connection *conn,
+ struct rxkad_response *resp,
+ const struct rxkad_key *s2)
{
- SYNC_SKCIPHER_REQUEST_ON_STACK(req, conn->cipher);
+ struct skcipher_request *req;
struct rxrpc_crypt iv;
struct scatterlist sg[1];
+ req = skcipher_request_alloc(&conn->cipher->base, GFP_NOFS);
+ if (!req)
+ return -ENOMEM;
+
/* continue encrypting from where we left off */
memcpy(&iv, s2->session_key, sizeof(iv));
@@ -764,7 +798,8 @@ static void rxkad_encrypt_response(struct rxrpc_connection *conn,
skcipher_request_set_callback(req, 0, NULL, NULL);
skcipher_request_set_crypt(req, sg, sg, sizeof(resp->encrypted), iv.x);
crypto_skcipher_encrypt(req);
- skcipher_request_zero(req);
+ skcipher_request_free(req);
+ return 0;
}
/*
@@ -839,8 +874,9 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn,
/* calculate the response checksum and then do the encryption */
rxkad_calc_response_checksum(resp);
- rxkad_encrypt_response(conn, resp, token->kad);
- ret = rxkad_send_response(conn, &sp->hdr, resp, token->kad);
+ ret = rxkad_encrypt_response(conn, resp, token->kad);
+ if (ret == 0)
+ ret = rxkad_send_response(conn, &sp->hdr, resp, token->kad);
kfree(resp);
return ret;
@@ -1017,18 +1053,16 @@ static void rxkad_decrypt_response(struct rxrpc_connection *conn,
struct rxkad_response *resp,
const struct rxrpc_crypt *session_key)
{
- SYNC_SKCIPHER_REQUEST_ON_STACK(req, rxkad_ci);
+ struct skcipher_request *req = rxkad_ci_req;
struct scatterlist sg[1];
struct rxrpc_crypt iv;
_enter(",,%08x%08x",
ntohl(session_key->n[0]), ntohl(session_key->n[1]));
- ASSERT(rxkad_ci != NULL);
-
mutex_lock(&rxkad_ci_mutex);
if (crypto_sync_skcipher_setkey(rxkad_ci, session_key->x,
- sizeof(*session_key)) < 0)
+ sizeof(*session_key)) < 0)
BUG();
memcpy(&iv, session_key, sizeof(iv));
@@ -1222,10 +1256,26 @@ static void rxkad_clear(struct rxrpc_connection *conn)
*/
static int rxkad_init(void)
{
+ struct crypto_sync_skcipher *tfm;
+ struct skcipher_request *req;
+
/* pin the cipher we need so that the crypto layer doesn't invoke
* keventd to go get it */
- rxkad_ci = crypto_alloc_sync_skcipher("pcbc(fcrypt)", 0, 0);
- return PTR_ERR_OR_ZERO(rxkad_ci);
+ tfm = crypto_alloc_sync_skcipher("pcbc(fcrypt)", 0, 0);
+ if (IS_ERR(tfm))
+ return PTR_ERR(tfm);
+
+ req = skcipher_request_alloc(&tfm->base, GFP_KERNEL);
+ if (!req)
+ goto nomem_tfm;
+
+ rxkad_ci_req = req;
+ rxkad_ci = tfm;
+ return 0;
+
+nomem_tfm:
+ crypto_free_sync_skcipher(tfm);
+ return -ENOMEM;
}
/*
@@ -1233,8 +1283,8 @@ static int rxkad_init(void)
*/
static void rxkad_exit(void)
{
- if (rxkad_ci)
- crypto_free_sync_skcipher(rxkad_ci);
+ crypto_free_sync_skcipher(rxkad_ci);
+ skcipher_request_free(rxkad_ci_req);
}
/*
@@ -1243,12 +1293,14 @@ static void rxkad_exit(void)
const struct rxrpc_security rxkad = {
.name = "rxkad",
.security_index = RXRPC_SECURITY_RXKAD,
+ .no_key_abort = RXKADUNKNOWNKEY,
.init = rxkad_init,
.exit = rxkad_exit,
.init_connection_security = rxkad_init_connection_security,
.prime_packet_security = rxkad_prime_packet_security,
.secure_packet = rxkad_secure_packet,
.verify_packet = rxkad_verify_packet,
+ .free_call_crypto = rxkad_free_call_crypto,
.locate_data = rxkad_locate_data,
.issue_challenge = rxkad_issue_challenge,
.respond_to_challenge = rxkad_respond_to_challenge,
diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c
index a4c47d2b7054..9b1fb9ed0717 100644
--- a/net/rxrpc/security.c
+++ b/net/rxrpc/security.c
@@ -101,62 +101,58 @@ int rxrpc_init_client_conn_security(struct rxrpc_connection *conn)
}
/*
- * initialise the security on a server connection
+ * Find the security key for a server connection.
*/
-int rxrpc_init_server_conn_security(struct rxrpc_connection *conn)
+bool rxrpc_look_up_server_security(struct rxrpc_local *local, struct rxrpc_sock *rx,
+ const struct rxrpc_security **_sec,
+ struct key **_key,
+ struct sk_buff *skb)
{
const struct rxrpc_security *sec;
- struct rxrpc_local *local = conn->params.local;
- struct rxrpc_sock *rx;
- struct key *key;
- key_ref_t kref;
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+ key_ref_t kref = NULL;
char kdesc[5 + 1 + 3 + 1];
_enter("");
- sprintf(kdesc, "%u:%u", conn->service_id, conn->security_ix);
+ sprintf(kdesc, "%u:%u", sp->hdr.serviceId, sp->hdr.securityIndex);
- sec = rxrpc_security_lookup(conn->security_ix);
+ sec = rxrpc_security_lookup(sp->hdr.securityIndex);
if (!sec) {
- _leave(" = -ENOKEY [lookup]");
- return -ENOKEY;
+ trace_rxrpc_abort(0, "SVS",
+ sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq,
+ RX_INVALID_OPERATION, EKEYREJECTED);
+ skb->mark = RXRPC_SKB_MARK_REJECT_ABORT;
+ skb->priority = RX_INVALID_OPERATION;
+ return false;
}
- /* find the service */
- read_lock(&local->services_lock);
- rx = rcu_dereference_protected(local->service,
- lockdep_is_held(&local->services_lock));
- if (rx && (rx->srx.srx_service == conn->service_id ||
- rx->second_service == conn->service_id))
- goto found_service;
+ if (sp->hdr.securityIndex == RXRPC_SECURITY_NONE)
+ goto out;
- /* the service appears to have died */
- read_unlock(&local->services_lock);
- _leave(" = -ENOENT");
- return -ENOENT;
-
-found_service:
if (!rx->securities) {
- read_unlock(&local->services_lock);
- _leave(" = -ENOKEY");
- return -ENOKEY;
+ trace_rxrpc_abort(0, "SVR",
+ sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq,
+ RX_INVALID_OPERATION, EKEYREJECTED);
+ skb->mark = RXRPC_SKB_MARK_REJECT_ABORT;
+ skb->priority = RX_INVALID_OPERATION;
+ return false;
}
/* look through the service's keyring */
kref = keyring_search(make_key_ref(rx->securities, 1UL),
&key_type_rxrpc_s, kdesc, true);
if (IS_ERR(kref)) {
- read_unlock(&local->services_lock);
- _leave(" = %ld [search]", PTR_ERR(kref));
- return PTR_ERR(kref);
+ trace_rxrpc_abort(0, "SVK",
+ sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq,
+ sec->no_key_abort, EKEYREJECTED);
+ skb->mark = RXRPC_SKB_MARK_REJECT_ABORT;
+ skb->priority = sec->no_key_abort;
+ return false;
}
- key = key_ref_to_ptr(kref);
- read_unlock(&local->services_lock);
-
- conn->server_key = key;
- conn->security = sec;
-
- _leave(" = 0");
- return 0;
+out:
+ *_sec = sec;
+ *_key = key_ref_to_ptr(kref);
+ return true;
}
diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c
index bae14438f869..813fd6888142 100644
--- a/net/rxrpc/sendmsg.c
+++ b/net/rxrpc/sendmsg.c
@@ -176,7 +176,7 @@ static int rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call,
skb->tstamp = ktime_get_real();
ix = seq & RXRPC_RXTX_BUFF_MASK;
- rxrpc_get_skb(skb, rxrpc_skb_tx_got);
+ rxrpc_get_skb(skb, rxrpc_skb_got);
call->rxtx_annotations[ix] = annotation;
smp_wmb();
call->rxtx_buffer[ix] = skb;
@@ -248,7 +248,7 @@ static int rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call,
}
out:
- rxrpc_free_skb(skb, rxrpc_skb_tx_freed);
+ rxrpc_free_skb(skb, rxrpc_skb_freed);
_leave(" = %d", ret);
return ret;
}
@@ -289,7 +289,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx,
skb = call->tx_pending;
call->tx_pending = NULL;
- rxrpc_see_skb(skb, rxrpc_skb_tx_seen);
+ rxrpc_see_skb(skb, rxrpc_skb_seen);
copied = 0;
do {
@@ -336,7 +336,9 @@ static int rxrpc_send_data(struct rxrpc_sock *rx,
if (!skb)
goto maybe_error;
- rxrpc_new_skb(skb, rxrpc_skb_tx_new);
+ sp = rxrpc_skb(skb);
+ sp->rx_flags |= RXRPC_SKB_TX_BUFFER;
+ rxrpc_new_skb(skb, rxrpc_skb_new);
_debug("ALLOC SEND %p", skb);
@@ -346,7 +348,6 @@ static int rxrpc_send_data(struct rxrpc_sock *rx,
skb_reserve(skb, call->conn->security_size);
skb->len += call->conn->security_size;
- sp = rxrpc_skb(skb);
sp->remain = chunk;
if (sp->remain > skb_tailroom(skb))
sp->remain = skb_tailroom(skb);
@@ -418,7 +419,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx,
call->tx_winsize)
sp->hdr.flags |= RXRPC_MORE_PACKETS;
- ret = conn->security->secure_packet(
+ ret = call->security->secure_packet(
call, skb, skb->mark, skb->head);
if (ret < 0)
goto out;
@@ -439,7 +440,7 @@ out:
return ret;
call_terminated:
- rxrpc_free_skb(skb, rxrpc_skb_tx_freed);
+ rxrpc_free_skb(skb, rxrpc_skb_freed);
_leave(" = %d", call->error);
return call->error;
@@ -660,6 +661,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len)
case RXRPC_CALL_SERVER_PREALLOC:
case RXRPC_CALL_SERVER_SECURING:
case RXRPC_CALL_SERVER_ACCEPTING:
+ rxrpc_put_call(call, rxrpc_call_put);
ret = -EBUSY;
goto error_release_sock;
default:
diff --git a/net/rxrpc/skbuff.c b/net/rxrpc/skbuff.c
index 9ad5045b7c2f..0348d2bf6f7d 100644
--- a/net/rxrpc/skbuff.c
+++ b/net/rxrpc/skbuff.c
@@ -14,7 +14,8 @@
#include <net/af_rxrpc.h>
#include "ar-internal.h"
-#define select_skb_count(op) (op >= rxrpc_skb_tx_cleaned ? &rxrpc_n_tx_skbs : &rxrpc_n_rx_skbs)
+#define is_tx_skb(skb) (rxrpc_skb(skb)->rx_flags & RXRPC_SKB_TX_BUFFER)
+#define select_skb_count(skb) (is_tx_skb(skb) ? &rxrpc_n_tx_skbs : &rxrpc_n_rx_skbs)
/*
* Note the allocation or reception of a socket buffer.
@@ -22,8 +23,9 @@
void rxrpc_new_skb(struct sk_buff *skb, enum rxrpc_skb_trace op)
{
const void *here = __builtin_return_address(0);
- int n = atomic_inc_return(select_skb_count(op));
- trace_rxrpc_skb(skb, op, refcount_read(&skb->users), n, here);
+ int n = atomic_inc_return(select_skb_count(skb));
+ trace_rxrpc_skb(skb, op, refcount_read(&skb->users), n,
+ rxrpc_skb(skb)->rx_flags, here);
}
/*
@@ -33,8 +35,9 @@ void rxrpc_see_skb(struct sk_buff *skb, enum rxrpc_skb_trace op)
{
const void *here = __builtin_return_address(0);
if (skb) {
- int n = atomic_read(select_skb_count(op));
- trace_rxrpc_skb(skb, op, refcount_read(&skb->users), n, here);
+ int n = atomic_read(select_skb_count(skb));
+ trace_rxrpc_skb(skb, op, refcount_read(&skb->users), n,
+ rxrpc_skb(skb)->rx_flags, here);
}
}
@@ -44,12 +47,23 @@ void rxrpc_see_skb(struct sk_buff *skb, enum rxrpc_skb_trace op)
void rxrpc_get_skb(struct sk_buff *skb, enum rxrpc_skb_trace op)
{
const void *here = __builtin_return_address(0);
- int n = atomic_inc_return(select_skb_count(op));
- trace_rxrpc_skb(skb, op, refcount_read(&skb->users), n, here);
+ int n = atomic_inc_return(select_skb_count(skb));
+ trace_rxrpc_skb(skb, op, refcount_read(&skb->users), n,
+ rxrpc_skb(skb)->rx_flags, here);
skb_get(skb);
}
/*
+ * Note the dropping of a ref on a socket buffer by the core.
+ */
+void rxrpc_eaten_skb(struct sk_buff *skb, enum rxrpc_skb_trace op)
+{
+ const void *here = __builtin_return_address(0);
+ int n = atomic_inc_return(&rxrpc_n_rx_skbs);
+ trace_rxrpc_skb(skb, op, 0, n, 0, here);
+}
+
+/*
* Note the destruction of a socket buffer.
*/
void rxrpc_free_skb(struct sk_buff *skb, enum rxrpc_skb_trace op)
@@ -58,8 +72,9 @@ void rxrpc_free_skb(struct sk_buff *skb, enum rxrpc_skb_trace op)
if (skb) {
int n;
CHECK_SLAB_OKAY(&skb->users);
- n = atomic_dec_return(select_skb_count(op));
- trace_rxrpc_skb(skb, op, refcount_read(&skb->users), n, here);
+ n = atomic_dec_return(select_skb_count(skb));
+ trace_rxrpc_skb(skb, op, refcount_read(&skb->users), n,
+ rxrpc_skb(skb)->rx_flags, here);
kfree_skb(skb);
}
}
@@ -72,9 +87,10 @@ void rxrpc_purge_queue(struct sk_buff_head *list)
const void *here = __builtin_return_address(0);
struct sk_buff *skb;
while ((skb = skb_dequeue((list))) != NULL) {
- int n = atomic_dec_return(select_skb_count(rxrpc_skb_rx_purged));
- trace_rxrpc_skb(skb, rxrpc_skb_rx_purged,
- refcount_read(&skb->users), n, here);
+ int n = atomic_dec_return(select_skb_count(skb));
+ trace_rxrpc_skb(skb, rxrpc_skb_purged,
+ refcount_read(&skb->users), n,
+ rxrpc_skb(skb)->rx_flags, here);
kfree_skb(skb);
}
}
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index afd2ba157a13..edde0e519438 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -324,7 +324,7 @@ config NET_SCH_CAKE
tristate "Common Applications Kept Enhanced (CAKE)"
help
Say Y here if you want to use the Common Applications Kept Enhanced
- (CAKE) queue management algorithm.
+ (CAKE) queue management algorithm.
To compile this driver as a module, choose M here: the module
will be called sch_cake.
@@ -366,6 +366,19 @@ config NET_SCH_PIE
If unsure, say N.
+config NET_SCH_FQ_PIE
+ depends on NET_SCH_PIE
+ tristate "Flow Queue Proportional Integral controller Enhanced (FQ-PIE)"
+ help
+ Say Y here if you want to use the Flow Queue Proportional Integral
+ controller Enhanced (FQ-PIE) packet scheduling algorithm.
+ For more information, please see https://tools.ietf.org/html/rfc8033
+
+ To compile this driver as a module, choose M here: the module
+ will be called sch_fq_pie.
+
+ If unsure, say N.
+
config NET_SCH_INGRESS
tristate "Ingress/classifier-action Qdisc"
depends on NET_CLS_ACT
@@ -409,6 +422,23 @@ config NET_SCH_PLUG
To compile this code as a module, choose M here: the
module will be called sch_plug.
+config NET_SCH_ETS
+ tristate "Enhanced transmission selection scheduler (ETS)"
+ help
+ The Enhanced Transmission Selection scheduler is a classful
+ queuing discipline that merges functionality of PRIO and DRR
+ qdiscs in one scheduler. ETS makes it easy to configure a set of
+ strict and bandwidth-sharing bands to implement the transmission
+ selection described in 802.1Qaz.
+
+ Say Y here if you want to use the ETS packet scheduling
+ algorithm.
+
+ To compile this driver as a module, choose M here: the module
+ will be called sch_ets.
+
+ If unsure, say N.
+
menuconfig NET_SCH_DEFAULT
bool "Allow override default queue discipline"
---help---
@@ -730,8 +760,8 @@ config NET_CLS_ACT
config NET_ACT_POLICE
tristate "Traffic Policing"
- depends on NET_CLS_ACT
- ---help---
+ depends on NET_CLS_ACT
+ ---help---
Say Y here if you want to do traffic policing, i.e. strict
bandwidth limiting. This action replaces the existing policing
module.
@@ -740,9 +770,9 @@ config NET_ACT_POLICE
module will be called act_police.
config NET_ACT_GACT
- tristate "Generic actions"
- depends on NET_CLS_ACT
- ---help---
+ tristate "Generic actions"
+ depends on NET_CLS_ACT
+ ---help---
Say Y here to take generic actions such as dropping and
accepting packets.
@@ -750,15 +780,15 @@ config NET_ACT_GACT
module will be called act_gact.
config GACT_PROB
- bool "Probability support"
- depends on NET_ACT_GACT
- ---help---
+ bool "Probability support"
+ depends on NET_ACT_GACT
+ ---help---
Say Y here to use the generic action randomly or deterministically.
config NET_ACT_MIRRED
- tristate "Redirecting and Mirroring"
- depends on NET_CLS_ACT
- ---help---
+ tristate "Redirecting and Mirroring"
+ depends on NET_CLS_ACT
+ ---help---
Say Y here to allow packets to be mirrored or redirected to
other devices.
@@ -766,10 +796,10 @@ config NET_ACT_MIRRED
module will be called act_mirred.
config NET_ACT_SAMPLE
- tristate "Traffic Sampling"
- depends on NET_CLS_ACT
- select PSAMPLE
- ---help---
+ tristate "Traffic Sampling"
+ depends on NET_CLS_ACT
+ select PSAMPLE
+ ---help---
Say Y here to allow packet sampling tc action. The packet sample
action consists of statistically choosing packets and sampling
them using the psample module.
@@ -778,9 +808,9 @@ config NET_ACT_SAMPLE
module will be called act_sample.
config NET_ACT_IPT
- tristate "IPtables targets"
- depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
- ---help---
+ tristate "IPtables targets"
+ depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
+ ---help---
Say Y here to be able to invoke iptables targets after successful
classification.
@@ -788,9 +818,9 @@ config NET_ACT_IPT
module will be called act_ipt.
config NET_ACT_NAT
- tristate "Stateless NAT"
- depends on NET_CLS_ACT
- ---help---
+ tristate "Stateless NAT"
+ depends on NET_CLS_ACT
+ ---help---
Say Y here to do stateless NAT on IPv4 packets. You should use
netfilter for NAT unless you know what you are doing.
@@ -798,18 +828,18 @@ config NET_ACT_NAT
module will be called act_nat.
config NET_ACT_PEDIT
- tristate "Packet Editing"
- depends on NET_CLS_ACT
- ---help---
+ tristate "Packet Editing"
+ depends on NET_CLS_ACT
+ ---help---
Say Y here if you want to mangle the content of packets.
To compile this code as a module, choose M here: the
module will be called act_pedit.
config NET_ACT_SIMP
- tristate "Simple Example (Debug)"
- depends on NET_CLS_ACT
- ---help---
+ tristate "Simple Example (Debug)"
+ depends on NET_CLS_ACT
+ ---help---
Say Y here to add a simple action for demonstration purposes.
It is meant as an example and for debugging purposes. It will
print a configured policy string followed by the packet count
@@ -821,9 +851,9 @@ config NET_ACT_SIMP
module will be called act_simple.
config NET_ACT_SKBEDIT
- tristate "SKB Editing"
- depends on NET_CLS_ACT
- ---help---
+ tristate "SKB Editing"
+ depends on NET_CLS_ACT
+ ---help---
Say Y here to change skb priority or queue_mapping settings.
If unsure, say N.
@@ -832,10 +862,10 @@ config NET_ACT_SKBEDIT
module will be called act_skbedit.
config NET_ACT_CSUM
- tristate "Checksum Updating"
- depends on NET_CLS_ACT && INET
- select LIBCRC32C
- ---help---
+ tristate "Checksum Updating"
+ depends on NET_CLS_ACT && INET
+ select LIBCRC32C
+ ---help---
Say Y here to update some common checksum after some direct
packet alterations.
@@ -854,9 +884,9 @@ config NET_ACT_MPLS
module will be called act_mpls.
config NET_ACT_VLAN
- tristate "Vlan manipulation"
- depends on NET_CLS_ACT
- ---help---
+ tristate "Vlan manipulation"
+ depends on NET_CLS_ACT
+ ---help---
Say Y here to push or pop vlan headers.
If unsure, say N.
@@ -865,9 +895,9 @@ config NET_ACT_VLAN
module will be called act_vlan.
config NET_ACT_BPF
- tristate "BPF based action"
- depends on NET_CLS_ACT
- ---help---
+ tristate "BPF based action"
+ depends on NET_CLS_ACT
+ ---help---
Say Y here to execute BPF code on packets. The BPF code will decide
if the packet should be dropped or not.
@@ -877,10 +907,10 @@ config NET_ACT_BPF
module will be called act_bpf.
config NET_ACT_CONNMARK
- tristate "Netfilter Connection Mark Retriever"
- depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
- depends on NF_CONNTRACK && NF_CONNTRACK_MARK
- ---help---
+ tristate "Netfilter Connection Mark Retriever"
+ depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
+ depends on NF_CONNTRACK && NF_CONNTRACK_MARK
+ ---help---
Say Y here to allow retrieving of conn mark
If unsure, say N.
@@ -889,10 +919,10 @@ config NET_ACT_CONNMARK
module will be called act_connmark.
config NET_ACT_CTINFO
- tristate "Netfilter Connection Mark Actions"
- depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
- depends on NF_CONNTRACK && NF_CONNTRACK_MARK
- help
+ tristate "Netfilter Connection Mark Actions"
+ depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
+ depends on NF_CONNTRACK && NF_CONNTRACK_MARK
+ help
Say Y here to allow transfer of a connmark stored information.
Current actions transfer connmark stored DSCP into
ipv4/v6 diffserv and/or to transfer connmark to packet
@@ -906,21 +936,21 @@ config NET_ACT_CTINFO
module will be called act_ctinfo.
config NET_ACT_SKBMOD
- tristate "skb data modification action"
- depends on NET_CLS_ACT
- ---help---
- Say Y here to allow modification of skb data
+ tristate "skb data modification action"
+ depends on NET_CLS_ACT
+ ---help---
+ Say Y here to allow modification of skb data
- If unsure, say N.
+ If unsure, say N.
- To compile this code as a module, choose M here: the
- module will be called act_skbmod.
+ To compile this code as a module, choose M here: the
+ module will be called act_skbmod.
config NET_ACT_IFE
- tristate "Inter-FE action based on IETF ForCES InterFE LFB"
- depends on NET_CLS_ACT
- select NET_IFE
- ---help---
+ tristate "Inter-FE action based on IETF ForCES InterFE LFB"
+ depends on NET_CLS_ACT
+ select NET_IFE
+ ---help---
Say Y here to allow for sourcing and terminating metadata
For details refer to netdev01 paper:
"Distributing Linux Traffic Control Classifier-Action Subsystem"
@@ -930,9 +960,9 @@ config NET_ACT_IFE
module will be called act_ife.
config NET_ACT_TUNNEL_KEY
- tristate "IP tunnel metadata manipulation"
- depends on NET_CLS_ACT
- ---help---
+ tristate "IP tunnel metadata manipulation"
+ depends on NET_CLS_ACT
+ ---help---
Say Y here to set/release ip tunnel metadata.
If unsure, say N.
@@ -941,9 +971,9 @@ config NET_ACT_TUNNEL_KEY
module will be called act_tunnel_key.
config NET_ACT_CT
- tristate "connection tracking tc action"
- depends on NET_CLS_ACT && NF_CONNTRACK && NF_NAT
- help
+ tristate "connection tracking tc action"
+ depends on NET_CLS_ACT && NF_CONNTRACK && NF_NAT
+ help
Say Y here to allow sending the packets to conntrack module.
If unsure, say N.
@@ -952,16 +982,28 @@ config NET_ACT_CT
module will be called act_ct.
config NET_IFE_SKBMARK
- tristate "Support to encoding decoding skb mark on IFE action"
- depends on NET_ACT_IFE
+ tristate "Support to encoding decoding skb mark on IFE action"
+ depends on NET_ACT_IFE
config NET_IFE_SKBPRIO
- tristate "Support to encoding decoding skb prio on IFE action"
- depends on NET_ACT_IFE
+ tristate "Support to encoding decoding skb prio on IFE action"
+ depends on NET_ACT_IFE
config NET_IFE_SKBTCINDEX
- tristate "Support to encoding decoding skb tcindex on IFE action"
- depends on NET_ACT_IFE
+ tristate "Support to encoding decoding skb tcindex on IFE action"
+ depends on NET_ACT_IFE
+
+config NET_TC_SKB_EXT
+ bool "TC recirculation support"
+ depends on NET_CLS_ACT
+ select SKB_EXTENSIONS
+
+ help
+ Say Y here to allow tc chain misses to continue in OvS datapath in
+ the correct recirc_id, and hardware chain misses to continue in
+ the correct chain in tc software datapath.
+
+ Say N here if you won't be using tc<->ovs offload or tc chains offload.
endif # NET_SCHED
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 415d1e1f237e..31c367a6cd09 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -48,6 +48,7 @@ obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o
obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o
obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o
obj-$(CONFIG_NET_SCH_PLUG) += sch_plug.o
+obj-$(CONFIG_NET_SCH_ETS) += sch_ets.o
obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o
obj-$(CONFIG_NET_SCH_SKBPRIO) += sch_skbprio.o
obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o
@@ -58,6 +59,7 @@ obj-$(CONFIG_NET_SCH_CAKE) += sch_cake.o
obj-$(CONFIG_NET_SCH_FQ) += sch_fq.o
obj-$(CONFIG_NET_SCH_HHF) += sch_hhf.o
obj-$(CONFIG_NET_SCH_PIE) += sch_pie.o
+obj-$(CONFIG_NET_SCH_FQ_PIE) += sch_fq_pie.o
obj-$(CONFIG_NET_SCH_CBS) += sch_cbs.o
obj-$(CONFIG_NET_SCH_ETF) += sch_etf.o
obj-$(CONFIG_NET_SCH_TAPRIO) += sch_taprio.o
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 339712296164..90a31b15585f 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -88,7 +88,7 @@ struct tcf_chain *tcf_action_set_ctrlact(struct tc_action *a, int action,
struct tcf_chain *goto_chain)
{
a->tcfa_action = action;
- rcu_swap_protected(a->goto_chain, goto_chain, 1);
+ goto_chain = rcu_replace_pointer(a->goto_chain, goto_chain, 1);
return goto_chain;
}
EXPORT_SYMBOL(tcf_action_set_ctrlact);
@@ -188,6 +188,8 @@ static size_t tcf_action_shared_attrs_size(const struct tc_action *act)
+ nla_total_size(0) /* TCA_ACT_STATS nested */
/* TCA_STATS_BASIC */
+ nla_total_size_64bit(sizeof(struct gnet_stats_basic))
+ /* TCA_STATS_PKT64 */
+ + nla_total_size_64bit(sizeof(u64))
/* TCA_STATS_QUEUE */
+ nla_total_size_64bit(sizeof(struct gnet_stats_queue))
+ nla_total_size(0) /* TCA_OPTIONS nested */
@@ -399,7 +401,7 @@ static int tcf_idr_delete_index(struct tcf_idrinfo *idrinfo, u32 index)
int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
struct tc_action **a, const struct tc_action_ops *ops,
- int bind, bool cpustats)
+ int bind, bool cpustats, u32 flags)
{
struct tc_action *p = kzalloc(ops->size, GFP_KERNEL);
struct tcf_idrinfo *idrinfo = tn->idrinfo;
@@ -427,6 +429,7 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
p->tcfa_tm.install = jiffies;
p->tcfa_tm.lastuse = jiffies;
p->tcfa_tm.firstuse = 0;
+ p->tcfa_flags = flags;
if (est) {
err = gen_new_estimator(&p->tcfa_bstats, p->cpu_bstats,
&p->tcfa_rate_est,
@@ -451,6 +454,17 @@ err1:
}
EXPORT_SYMBOL(tcf_idr_create);
+int tcf_idr_create_from_flags(struct tc_action_net *tn, u32 index,
+ struct nlattr *est, struct tc_action **a,
+ const struct tc_action_ops *ops, int bind,
+ u32 flags)
+{
+ /* Set cpustats according to actions flags. */
+ return tcf_idr_create(tn, index, est, a, ops, bind,
+ !(flags & TCA_ACT_FLAGS_NO_PERCPU_STATS), flags);
+}
+EXPORT_SYMBOL(tcf_idr_create_from_flags);
+
void tcf_idr_insert(struct tc_action_net *tn, struct tc_action *a)
{
struct tcf_idrinfo *idrinfo = tn->idrinfo;
@@ -773,6 +787,14 @@ tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
}
rcu_read_unlock();
+ if (a->tcfa_flags) {
+ struct nla_bitfield32 flags = { a->tcfa_flags,
+ a->tcfa_flags, };
+
+ if (nla_put(skb, TCA_ACT_FLAGS, sizeof(flags), &flags))
+ goto nla_put_failure;
+ }
+
nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
@@ -831,12 +853,24 @@ static struct tc_cookie *nla_memdup_cookie(struct nlattr **tb)
return c;
}
+static const u32 tca_act_flags_allowed = TCA_ACT_FLAGS_NO_PERCPU_STATS;
+static const struct nla_policy tcf_action_policy[TCA_ACT_MAX + 1] = {
+ [TCA_ACT_KIND] = { .type = NLA_STRING },
+ [TCA_ACT_INDEX] = { .type = NLA_U32 },
+ [TCA_ACT_COOKIE] = { .type = NLA_BINARY,
+ .len = TC_COOKIE_MAX_SIZE },
+ [TCA_ACT_OPTIONS] = { .type = NLA_NESTED },
+ [TCA_ACT_FLAGS] = { .type = NLA_BITFIELD32,
+ .validation_data = &tca_act_flags_allowed },
+};
+
struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
struct nlattr *nla, struct nlattr *est,
char *name, int ovr, int bind,
bool rtnl_held,
struct netlink_ext_ack *extack)
{
+ struct nla_bitfield32 flags = { 0, 0 };
struct tc_action *a;
struct tc_action_ops *a_o;
struct tc_cookie *cookie = NULL;
@@ -846,8 +880,8 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
int err;
if (name == NULL) {
- err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, NULL,
- extack);
+ err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla,
+ tcf_action_policy, extack);
if (err < 0)
goto err_out;
err = -EINVAL;
@@ -861,13 +895,6 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
goto err_out;
}
if (tb[TCA_ACT_COOKIE]) {
- int cklen = nla_len(tb[TCA_ACT_COOKIE]);
-
- if (cklen > TC_COOKIE_MAX_SIZE) {
- NL_SET_ERR_MSG(extack, "TC cookie size above the maximum");
- goto err_out;
- }
-
cookie = nla_memdup_cookie(tb);
if (!cookie) {
NL_SET_ERR_MSG(extack, "No memory to generate TC cookie");
@@ -875,6 +902,8 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
goto err_out;
}
}
+ if (tb[TCA_ACT_FLAGS])
+ flags = nla_get_bitfield32(tb[TCA_ACT_FLAGS]);
} else {
if (strlcpy(act_name, name, IFNAMSIZ) >= IFNAMSIZ) {
NL_SET_ERR_MSG(extack, "TC action name too long");
@@ -913,10 +942,10 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
/* backward compatibility for policer */
if (name == NULL)
err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, &a, ovr, bind,
- rtnl_held, tp, extack);
+ rtnl_held, tp, flags.value, extack);
else
err = a_o->init(net, nla, est, &a, ovr, bind, rtnl_held,
- tp, extack);
+ tp, flags.value, extack);
if (err < 0)
goto err_mod;
@@ -974,7 +1003,6 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla,
err = PTR_ERR(act);
goto err;
}
- act->order = i;
sz += tcf_action_fill_size(act);
/* Start from index 0 */
actions[i - 1] = act;
@@ -988,6 +1016,29 @@ err:
return err;
}
+void tcf_action_update_stats(struct tc_action *a, u64 bytes, u32 packets,
+ bool drop, bool hw)
+{
+ if (a->cpu_bstats) {
+ _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets);
+
+ if (drop)
+ this_cpu_ptr(a->cpu_qstats)->drops += packets;
+
+ if (hw)
+ _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw),
+ bytes, packets);
+ return;
+ }
+
+ _bstats_update(&a->tcfa_bstats, bytes, packets);
+ if (drop)
+ a->tcfa_qstats.drops += packets;
+ if (hw)
+ _bstats_update(&a->tcfa_bstats_hw, bytes, packets);
+}
+EXPORT_SYMBOL(tcf_action_update_stats);
+
int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p,
int compat_mode)
{
@@ -1098,7 +1149,8 @@ static struct tc_action *tcf_action_get_1(struct net *net, struct nlattr *nla,
int index;
int err;
- err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, NULL, extack);
+ err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla,
+ tcf_action_policy, extack);
if (err < 0)
goto err_out;
@@ -1152,7 +1204,8 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,
b = skb_tail_pointer(skb);
- err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, NULL, extack);
+ err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla,
+ tcf_action_policy, extack);
if (err < 0)
goto err_out;
@@ -1350,11 +1403,16 @@ static int tcf_action_add(struct net *net, struct nlattr *nla,
struct netlink_ext_ack *extack)
{
size_t attr_size = 0;
- int ret = 0;
+ int loop, ret;
struct tc_action *actions[TCA_ACT_MAX_PRIO] = {};
- ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0, actions,
- &attr_size, true, extack);
+ for (loop = 0; loop < 10; loop++) {
+ ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0,
+ actions, &attr_size, true, extack);
+ if (ret != -EAGAIN)
+ break;
+ }
+
if (ret < 0)
return ret;
ret = tcf_add_notify(net, n, actions, portid, attr_size, extack);
@@ -1404,11 +1462,8 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n,
*/
if (n->nlmsg_flags & NLM_F_REPLACE)
ovr = 1;
-replay:
ret = tcf_action_add(net, tca[TCA_ACT_TAB], n, portid, ovr,
extack);
- if (ret == -EAGAIN)
- goto replay;
break;
case RTM_DELACTION:
ret = tca_action_gd(net, tca[TCA_ACT_TAB], n,
@@ -1440,7 +1495,7 @@ static struct nlattr *find_dump_kind(struct nlattr **nla)
if (tb[1] == NULL)
return NULL;
- if (nla_parse_nested_deprecated(tb2, TCA_ACT_MAX, tb[1], NULL, NULL) < 0)
+ if (nla_parse_nested_deprecated(tb2, TCA_ACT_MAX, tb[1], tcf_action_policy, NULL) < 0)
return NULL;
kind = tb2[TCA_ACT_KIND];
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index fd1f7e799e23..46f47e58b3be 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -275,7 +275,8 @@ static void tcf_bpf_prog_fill_cfg(const struct tcf_bpf *prog,
static int tcf_bpf_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **act,
int replace, int bind, bool rtnl_held,
- struct tcf_proto *tp, struct netlink_ext_ack *extack)
+ struct tcf_proto *tp, u32 flags,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, bpf_net_id);
struct nlattr *tb[TCA_ACT_BPF_MAX + 1];
@@ -303,7 +304,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
ret = tcf_idr_check_alloc(tn, &index, act, bind);
if (!ret) {
ret = tcf_idr_create(tn, index, est, act,
- &act_bpf_ops, bind, true);
+ &act_bpf_ops, bind, true, 0);
if (ret < 0) {
tcf_idr_cleanup(tn, index);
return ret;
@@ -422,7 +423,7 @@ static __net_init int bpf_init_net(struct net *net)
{
struct tc_action_net *tn = net_generic(net, bpf_net_id);
- return tc_action_net_init(tn, &act_bpf_ops);
+ return tc_action_net_init(net, tn, &act_bpf_ops);
}
static void __net_exit bpf_exit_net(struct list_head *net_list)
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index 32ac04d77a45..43a243081e7d 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -94,7 +94,7 @@ static const struct nla_policy connmark_policy[TCA_CONNMARK_MAX + 1] = {
static int tcf_connmark_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
int ovr, int bind, bool rtnl_held,
- struct tcf_proto *tp,
+ struct tcf_proto *tp, u32 flags,
struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, connmark_net_id);
@@ -121,7 +121,7 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla,
ret = tcf_idr_check_alloc(tn, &index, a, bind);
if (!ret) {
ret = tcf_idr_create(tn, index, est, a,
- &act_connmark_ops, bind, false);
+ &act_connmark_ops, bind, false, 0);
if (ret) {
tcf_idr_cleanup(tn, index);
return ret;
@@ -231,7 +231,7 @@ static __net_init int connmark_init_net(struct net *net)
{
struct tc_action_net *tn = net_generic(net, connmark_net_id);
- return tc_action_net_init(tn, &act_connmark_ops);
+ return tc_action_net_init(net, tn, &act_connmark_ops);
}
static void __net_exit connmark_exit_net(struct list_head *net_list)
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index 9b9288267a54..cb8608f0a77a 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -43,7 +43,7 @@ static struct tc_action_ops act_csum_ops;
static int tcf_csum_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a, int ovr,
int bind, bool rtnl_held, struct tcf_proto *tp,
- struct netlink_ext_ack *extack)
+ u32 flags, struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, csum_net_id);
struct tcf_csum_params *params_new;
@@ -68,8 +68,8 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla,
index = parm->index;
err = tcf_idr_check_alloc(tn, &index, a, bind);
if (!err) {
- ret = tcf_idr_create(tn, index, est, a,
- &act_csum_ops, bind, true);
+ ret = tcf_idr_create_from_flags(tn, index, est, a,
+ &act_csum_ops, bind, flags);
if (ret) {
tcf_idr_cleanup(tn, index);
return ret;
@@ -101,8 +101,8 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla,
spin_lock_bh(&p->tcf_lock);
goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
- rcu_swap_protected(p->params, params_new,
- lockdep_is_held(&p->tcf_lock));
+ params_new = rcu_replace_pointer(p->params, params_new,
+ lockdep_is_held(&p->tcf_lock));
spin_unlock_bh(&p->tcf_lock);
if (goto_ch)
@@ -580,7 +580,7 @@ static int tcf_csum_act(struct sk_buff *skb, const struct tc_action *a,
params = rcu_dereference_bh(p->params);
tcf_lastuse_update(&p->tcf_tm);
- bstats_cpu_update(this_cpu_ptr(p->common.cpu_bstats), skb);
+ tcf_action_update_bstats(&p->common, skb);
action = READ_ONCE(p->tcf_action);
if (unlikely(action == TC_ACT_SHOT))
@@ -624,7 +624,7 @@ out:
return action;
drop:
- qstats_drop_inc(this_cpu_ptr(p->common.cpu_qstats));
+ tcf_action_inc_drop_qstats(&p->common);
action = TC_ACT_SHOT;
goto out;
}
@@ -714,7 +714,7 @@ static __net_init int csum_init_net(struct net *net)
{
struct tc_action_net *tn = net_generic(net, csum_net_id);
- return tc_action_net_init(tn, &act_csum_ops);
+ return tc_action_net_init(net, tn, &act_csum_ops);
}
static void __net_exit csum_exit_net(struct list_head *net_list)
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index 33a1a7406e87..f685c0d73708 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -24,12 +24,12 @@
#include <uapi/linux/tc_act/tc_ct.h>
#include <net/tc_act/tc_ct.h>
-#include <linux/netfilter/nf_nat.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+#include <uapi/linux/netfilter/nf_nat.h>
static struct tc_action_ops act_ct_ops;
static unsigned int ct_net_id;
@@ -312,7 +312,7 @@ static void tcf_ct_act_set_labels(struct nf_conn *ct,
u32 *labels_m)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS)
- size_t labels_sz = FIELD_SIZEOF(struct tcf_ct_params, labels);
+ size_t labels_sz = sizeof_field(struct tcf_ct_params, labels);
if (!memchr_inv(labels_m, 0, labels_sz))
return;
@@ -329,6 +329,7 @@ static int tcf_ct_act_nat(struct sk_buff *skb,
bool commit)
{
#if IS_ENABLED(CONFIG_NF_NAT)
+ int err;
enum nf_nat_manip_type maniptype;
if (!(ct_action & TCA_CT_ACT_NAT))
@@ -359,7 +360,17 @@ static int tcf_ct_act_nat(struct sk_buff *skb,
return NF_ACCEPT;
}
- return ct_nat_execute(skb, ct, ctinfo, range, maniptype);
+ err = ct_nat_execute(skb, ct, ctinfo, range, maniptype);
+ if (err == NF_ACCEPT &&
+ ct->status & IPS_SRC_NAT && ct->status & IPS_DST_NAT) {
+ if (maniptype == NF_NAT_MANIP_SRC)
+ maniptype = NF_NAT_MANIP_DST;
+ else
+ maniptype = NF_NAT_MANIP_SRC;
+
+ err = ct_nat_execute(skb, ct, ctinfo, range, maniptype);
+ }
+ return err;
#else
return NF_ACCEPT;
#endif
@@ -465,16 +476,15 @@ out_push:
skb_push_rcsum(skb, nh_ofs);
out:
- bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), skb);
+ tcf_action_update_bstats(&c->common, skb);
return retval;
drop:
- qstats_drop_inc(this_cpu_ptr(a->cpu_qstats));
+ tcf_action_inc_drop_qstats(&c->common);
return TC_ACT_SHOT;
}
static const struct nla_policy ct_policy[TCA_CT_MAX + 1] = {
- [TCA_CT_UNSPEC] = { .strict_start_type = TCA_CT_UNSPEC + 1 },
[TCA_CT_ACTION] = { .type = NLA_U16 },
[TCA_CT_PARMS] = { .type = NLA_EXACT_LEN, .len = sizeof(struct tc_ct) },
[TCA_CT_ZONE] = { .type = NLA_U16 },
@@ -656,7 +666,7 @@ static int tcf_ct_fill_params(struct net *net,
static int tcf_ct_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
int replace, int bind, bool rtnl_held,
- struct tcf_proto *tp,
+ struct tcf_proto *tp, u32 flags,
struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, ct_net_id);
@@ -688,8 +698,8 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla,
return err;
if (!err) {
- err = tcf_idr_create(tn, index, est, a,
- &act_ct_ops, bind, true);
+ err = tcf_idr_create_from_flags(tn, index, est, a,
+ &act_ct_ops, bind, flags);
if (err) {
tcf_idr_cleanup(tn, index);
return err;
@@ -722,7 +732,8 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla,
spin_lock_bh(&c->tcf_lock);
goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
- rcu_swap_protected(c->params, params, lockdep_is_held(&c->tcf_lock));
+ params = rcu_replace_pointer(c->params, params,
+ lockdep_is_held(&c->tcf_lock));
spin_unlock_bh(&c->tcf_lock);
if (goto_ch)
@@ -905,11 +916,7 @@ static void tcf_stats_update(struct tc_action *a, u64 bytes, u32 packets,
{
struct tcf_ct *c = to_ct(a);
- _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets);
-
- if (hw)
- _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw),
- bytes, packets);
+ tcf_action_update_stats(a, bytes, packets, false, hw);
c->tcf_tm.lastuse = max_t(u64, c->tcf_tm.lastuse, lastuse);
}
@@ -929,7 +936,7 @@ static struct tc_action_ops act_ct_ops = {
static __net_init int ct_init_net(struct net *net)
{
- unsigned int n_bits = FIELD_SIZEOF(struct tcf_ct_params, labels) * 8;
+ unsigned int n_bits = sizeof_field(struct tcf_ct_params, labels) * 8;
struct tc_ct_action_net *tn = net_generic(net, ct_net_id);
if (nf_connlabels_get(net, n_bits - 1)) {
@@ -939,7 +946,7 @@ static __net_init int ct_init_net(struct net *net)
tn->labels = true;
}
- return tc_action_net_init(&tn->tn, &act_ct_ops);
+ return tc_action_net_init(net, &tn->tn, &act_ct_ops);
}
static void __net_exit ct_exit_net(struct list_head *net_list)
diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c
index 06ef74b74911..19649623493b 100644
--- a/net/sched/act_ctinfo.c
+++ b/net/sched/act_ctinfo.c
@@ -153,7 +153,7 @@ static const struct nla_policy ctinfo_policy[TCA_CTINFO_MAX + 1] = {
static int tcf_ctinfo_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
int ovr, int bind, bool rtnl_held,
- struct tcf_proto *tp,
+ struct tcf_proto *tp, u32 flags,
struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, ctinfo_net_id);
@@ -210,7 +210,7 @@ static int tcf_ctinfo_init(struct net *net, struct nlattr *nla,
err = tcf_idr_check_alloc(tn, &index, a, bind);
if (!err) {
ret = tcf_idr_create(tn, index, est, a,
- &act_ctinfo_ops, bind, false);
+ &act_ctinfo_ops, bind, false, 0);
if (ret) {
tcf_idr_cleanup(tn, index);
return ret;
@@ -257,8 +257,8 @@ static int tcf_ctinfo_init(struct net *net, struct nlattr *nla,
spin_lock_bh(&ci->tcf_lock);
goto_ch = tcf_action_set_ctrlact(*a, actparm->action, goto_ch);
- rcu_swap_protected(ci->params, cp_new,
- lockdep_is_held(&ci->tcf_lock));
+ cp_new = rcu_replace_pointer(ci->params, cp_new,
+ lockdep_is_held(&ci->tcf_lock));
spin_unlock_bh(&ci->tcf_lock);
if (goto_ch)
@@ -360,6 +360,16 @@ static int tcf_ctinfo_search(struct net *net, struct tc_action **a, u32 index)
return tcf_idr_search(tn, a, index);
}
+static void tcf_ctinfo_cleanup(struct tc_action *a)
+{
+ struct tcf_ctinfo *ci = to_ctinfo(a);
+ struct tcf_ctinfo_params *cp;
+
+ cp = rcu_dereference_protected(ci->params, 1);
+ if (cp)
+ kfree_rcu(cp, rcu);
+}
+
static struct tc_action_ops act_ctinfo_ops = {
.kind = "ctinfo",
.id = TCA_ID_CTINFO,
@@ -367,6 +377,7 @@ static struct tc_action_ops act_ctinfo_ops = {
.act = tcf_ctinfo_act,
.dump = tcf_ctinfo_dump,
.init = tcf_ctinfo_init,
+ .cleanup= tcf_ctinfo_cleanup,
.walk = tcf_ctinfo_walker,
.lookup = tcf_ctinfo_search,
.size = sizeof(struct tcf_ctinfo),
@@ -376,7 +387,7 @@ static __net_init int ctinfo_init_net(struct net *net)
{
struct tc_action_net *tn = net_generic(net, ctinfo_net_id);
- return tc_action_net_init(tn, &act_ctinfo_ops);
+ return tc_action_net_init(net, tn, &act_ctinfo_ops);
}
static void __net_exit ctinfo_exit_net(struct list_head *net_list)
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index 8f0140c6ca58..416065772719 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -53,7 +53,8 @@ static const struct nla_policy gact_policy[TCA_GACT_MAX + 1] = {
static int tcf_gact_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
int ovr, int bind, bool rtnl_held,
- struct tcf_proto *tp, struct netlink_ext_ack *extack)
+ struct tcf_proto *tp, u32 flags,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, gact_net_id);
struct nlattr *tb[TCA_GACT_MAX + 1];
@@ -98,8 +99,8 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla,
err = tcf_idr_check_alloc(tn, &index, a, bind);
if (!err) {
- ret = tcf_idr_create(tn, index, est, a,
- &act_gact_ops, bind, true);
+ ret = tcf_idr_create_from_flags(tn, index, est, a,
+ &act_gact_ops, bind, flags);
if (ret) {
tcf_idr_cleanup(tn, index);
return ret;
@@ -161,9 +162,9 @@ static int tcf_gact_act(struct sk_buff *skb, const struct tc_action *a,
action = gact_rand[ptype](gact);
}
#endif
- bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats), skb);
+ tcf_action_update_bstats(&gact->common, skb);
if (action == TC_ACT_SHOT)
- qstats_drop_inc(this_cpu_ptr(gact->common.cpu_qstats));
+ tcf_action_inc_drop_qstats(&gact->common);
tcf_lastuse_update(&gact->tcf_tm);
@@ -177,15 +178,7 @@ static void tcf_gact_stats_update(struct tc_action *a, u64 bytes, u32 packets,
int action = READ_ONCE(gact->tcf_action);
struct tcf_t *tm = &gact->tcf_tm;
- _bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats), bytes,
- packets);
- if (action == TC_ACT_SHOT)
- this_cpu_ptr(gact->common.cpu_qstats)->drops += packets;
-
- if (hw)
- _bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats_hw),
- bytes, packets);
-
+ tcf_action_update_stats(a, bytes, packets, action == TC_ACT_SHOT, hw);
tm->lastuse = max_t(u64, tm->lastuse, lastuse);
}
@@ -278,7 +271,7 @@ static __net_init int gact_init_net(struct net *net)
{
struct tc_action_net *tn = net_generic(net, gact_net_id);
- return tc_action_net_init(tn, &act_gact_ops);
+ return tc_action_net_init(net, tn, &act_gact_ops);
}
static void __net_exit gact_exit_net(struct list_head *net_list)
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index 92ee853d43e6..c1fcd85719d6 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -465,7 +465,8 @@ static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb,
static int tcf_ife_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
int ovr, int bind, bool rtnl_held,
- struct tcf_proto *tp, struct netlink_ext_ack *extack)
+ struct tcf_proto *tp, u32 flags,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, ife_net_id);
struct nlattr *tb[TCA_IFE_MAX + 1];
@@ -522,7 +523,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
if (!exists) {
ret = tcf_idr_create(tn, index, est, a, &act_ife_ops,
- bind, true);
+ bind, true, 0);
if (ret) {
tcf_idr_cleanup(tn, index);
kfree(p);
@@ -536,6 +537,9 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
}
ife = to_ife(*a);
+ if (ret == ACT_P_CREATED)
+ INIT_LIST_HEAD(&ife->metalist);
+
err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
if (err < 0)
goto release_idr;
@@ -565,10 +569,6 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
p->eth_type = ife_type;
}
-
- if (ret == ACT_P_CREATED)
- INIT_LIST_HEAD(&ife->metalist);
-
if (tb[TCA_IFE_METALST]) {
err = nla_parse_nested_deprecated(tb2, IFE_META_MAX,
tb[TCA_IFE_METALST], NULL,
@@ -594,7 +594,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
spin_lock_bh(&ife->tcf_lock);
/* protected by tcf_lock when modifying existing action */
goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
- rcu_swap_protected(ife->params, p, 1);
+ p = rcu_replace_pointer(ife->params, p, 1);
if (exists)
spin_unlock_bh(&ife->tcf_lock);
@@ -890,7 +890,7 @@ static __net_init int ife_init_net(struct net *net)
{
struct tc_action_net *tn = net_generic(net, ife_net_id);
- return tc_action_net_init(tn, &act_ife_ops);
+ return tc_action_net_init(net, tn, &act_ife_ops);
}
static void __net_exit ife_exit_net(struct list_head *net_list)
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index ce2c30a591d2..400a2cfe8452 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -61,12 +61,13 @@ static int ipt_init_target(struct net *net, struct xt_entry_target *t,
return 0;
}
-static void ipt_destroy_target(struct xt_entry_target *t)
+static void ipt_destroy_target(struct xt_entry_target *t, struct net *net)
{
struct xt_tgdtor_param par = {
.target = t->u.kernel.target,
.targinfo = t->data,
.family = NFPROTO_IPV4,
+ .net = net,
};
if (par.target->destroy != NULL)
par.target->destroy(&par);
@@ -78,7 +79,7 @@ static void tcf_ipt_release(struct tc_action *a)
struct tcf_ipt *ipt = to_ipt(a);
if (ipt->tcfi_t) {
- ipt_destroy_target(ipt->tcfi_t);
+ ipt_destroy_target(ipt->tcfi_t, a->idrinfo->net);
kfree(ipt->tcfi_t);
}
kfree(ipt->tcfi_tname);
@@ -94,7 +95,7 @@ static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = {
static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
const struct tc_action_ops *ops, int ovr, int bind,
- struct tcf_proto *tp)
+ struct tcf_proto *tp, u32 flags)
{
struct tc_action_net *tn = net_generic(net, id);
struct nlattr *tb[TCA_IPT_MAX + 1];
@@ -143,7 +144,7 @@ static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla,
if (!exists) {
ret = tcf_idr_create(tn, index, est, a, ops, bind,
- false);
+ false, 0);
if (ret) {
tcf_idr_cleanup(tn, index);
return ret;
@@ -180,7 +181,7 @@ static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla,
spin_lock_bh(&ipt->tcf_lock);
if (ret != ACT_P_CREATED) {
- ipt_destroy_target(ipt->tcfi_t);
+ ipt_destroy_target(ipt->tcfi_t, net);
kfree(ipt->tcfi_tname);
kfree(ipt->tcfi_t);
}
@@ -204,19 +205,19 @@ err1:
static int tcf_ipt_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a, int ovr,
int bind, bool rtnl_held, struct tcf_proto *tp,
- struct netlink_ext_ack *extack)
+ u32 flags, struct netlink_ext_ack *extack)
{
return __tcf_ipt_init(net, ipt_net_id, nla, est, a, &act_ipt_ops, ovr,
- bind, tp);
+ bind, tp, flags);
}
static int tcf_xt_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a, int ovr,
int bind, bool unlocked, struct tcf_proto *tp,
- struct netlink_ext_ack *extack)
+ u32 flags, struct netlink_ext_ack *extack)
{
return __tcf_ipt_init(net, xt_net_id, nla, est, a, &act_xt_ops, ovr,
- bind, tp);
+ bind, tp, flags);
}
static int tcf_ipt_act(struct sk_buff *skb, const struct tc_action *a,
@@ -350,7 +351,7 @@ static __net_init int ipt_init_net(struct net *net)
{
struct tc_action_net *tn = net_generic(net, ipt_net_id);
- return tc_action_net_init(tn, &act_ipt_ops);
+ return tc_action_net_init(net, tn, &act_ipt_ops);
}
static void __net_exit ipt_exit_net(struct list_head *net_list)
@@ -399,7 +400,7 @@ static __net_init int xt_init_net(struct net *net)
{
struct tc_action_net *tn = net_generic(net, xt_net_id);
- return tc_action_net_init(tn, &act_xt_ops);
+ return tc_action_net_init(net, tn, &act_xt_ops);
}
static void __net_exit xt_exit_net(struct list_head *net_list)
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index be3f88dfc37e..1ad300e6dbc0 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -93,7 +93,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
int ovr, int bind, bool rtnl_held,
struct tcf_proto *tp,
- struct netlink_ext_ack *extack)
+ u32 flags, struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, mirred_net_id);
struct nlattr *tb[TCA_MIRRED_MAX + 1];
@@ -148,8 +148,8 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
NL_SET_ERR_MSG_MOD(extack, "Specified device does not exist");
return -EINVAL;
}
- ret = tcf_idr_create(tn, index, est, a,
- &act_mirred_ops, bind, true);
+ ret = tcf_idr_create_from_flags(tn, index, est, a,
+ &act_mirred_ops, bind, flags);
if (ret) {
tcf_idr_cleanup(tn, index);
return ret;
@@ -178,8 +178,8 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
goto put_chain;
}
mac_header_xmit = dev_is_mac_header_xmit(dev);
- rcu_swap_protected(m->tcfm_dev, dev,
- lockdep_is_held(&m->tcf_lock));
+ dev = rcu_replace_pointer(m->tcfm_dev, dev,
+ lockdep_is_held(&m->tcf_lock));
if (dev)
dev_put(dev);
m->tcfm_mac_header_xmit = mac_header_xmit;
@@ -219,8 +219,10 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a,
bool use_reinsert;
bool want_ingress;
bool is_redirect;
+ bool expects_nh;
int m_eaction;
int mac_len;
+ bool at_nh;
rec_level = __this_cpu_inc_return(mirred_rec_level);
if (unlikely(rec_level > MIRRED_RECURSION_LIMIT)) {
@@ -231,7 +233,7 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a,
}
tcf_lastuse_update(&m->tcf_tm);
- bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb);
+ tcf_action_update_bstats(&m->common, skb);
m_mac_header_xmit = READ_ONCE(m->tcfm_mac_header_xmit);
m_eaction = READ_ONCE(m->tcfm_eaction);
@@ -261,19 +263,19 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a,
goto out;
}
- /* If action's target direction differs than filter's direction,
- * and devices expect a mac header on xmit, then mac push/pull is
- * needed.
- */
want_ingress = tcf_mirred_act_wants_ingress(m_eaction);
- if (skb_at_tc_ingress(skb) != want_ingress && m_mac_header_xmit) {
- if (!skb_at_tc_ingress(skb)) {
- /* caught at egress, act ingress: pull mac */
- mac_len = skb_network_header(skb) - skb_mac_header(skb);
+
+ expects_nh = want_ingress || !m_mac_header_xmit;
+ at_nh = skb->data == skb_network_header(skb);
+ if (at_nh != expects_nh) {
+ mac_len = skb_at_tc_ingress(skb) ? skb->mac_len :
+ skb_network_header(skb) - skb_mac_header(skb);
+ if (expects_nh) {
+ /* target device/action expect data at nh */
skb_pull_rcsum(skb2, mac_len);
} else {
- /* caught at ingress, act egress: push mac */
- skb_push_rcsum(skb2, skb->mac_len);
+ /* target device/action expect data at mac */
+ skb_push_rcsum(skb2, mac_len);
}
}
@@ -289,8 +291,8 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a,
/* let's the caller reinsert the packet, if possible */
if (use_reinsert) {
res->ingress = want_ingress;
- res->qstats = this_cpu_ptr(m->common.cpu_qstats);
- skb_tc_reinsert(skb, res);
+ if (skb_tc_reinsert(skb, res))
+ tcf_action_inc_overlimit_qstats(&m->common);
__this_cpu_dec(mirred_rec_level);
return TC_ACT_CONSUMED;
}
@@ -303,7 +305,7 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a,
if (err) {
out:
- qstats_overlimit_inc(this_cpu_ptr(m->common.cpu_qstats));
+ tcf_action_inc_overlimit_qstats(&m->common);
if (tcf_mirred_is_act_redirect(m_eaction))
retval = TC_ACT_SHOT;
}
@@ -318,10 +320,7 @@ static void tcf_stats_update(struct tc_action *a, u64 bytes, u32 packets,
struct tcf_mirred *m = to_mirred(a);
struct tcf_t *tm = &m->tcf_tm;
- _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets);
- if (hw)
- _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw),
- bytes, packets);
+ tcf_action_update_stats(a, bytes, packets, false, hw);
tm->lastuse = max_t(u64, tm->lastuse, lastuse);
}
@@ -408,25 +407,31 @@ static struct notifier_block mirred_device_notifier = {
.notifier_call = mirred_device_event,
};
-static struct net_device *tcf_mirred_get_dev(const struct tc_action *a)
+static void tcf_mirred_dev_put(void *priv)
+{
+ struct net_device *dev = priv;
+
+ dev_put(dev);
+}
+
+static struct net_device *
+tcf_mirred_get_dev(const struct tc_action *a,
+ tc_action_priv_destructor *destructor)
{
struct tcf_mirred *m = to_mirred(a);
struct net_device *dev;
rcu_read_lock();
dev = rcu_dereference(m->tcfm_dev);
- if (dev)
+ if (dev) {
dev_hold(dev);
+ *destructor = tcf_mirred_dev_put;
+ }
rcu_read_unlock();
return dev;
}
-static void tcf_mirred_put_dev(struct net_device *dev)
-{
- dev_put(dev);
-}
-
static size_t tcf_mirred_get_fill_size(const struct tc_action *act)
{
return nla_total_size(sizeof(struct tc_mirred));
@@ -446,14 +451,13 @@ static struct tc_action_ops act_mirred_ops = {
.get_fill_size = tcf_mirred_get_fill_size,
.size = sizeof(struct tcf_mirred),
.get_dev = tcf_mirred_get_dev,
- .put_dev = tcf_mirred_put_dev,
};
static __net_init int mirred_init_net(struct net *net)
{
struct tc_action_net *tn = net_generic(net, mirred_net_id);
- return tc_action_net_init(tn, &act_mirred_ops);
+ return tc_action_net_init(net, tn, &act_mirred_ops);
}
static void __net_exit mirred_exit_net(struct list_head *net_list)
@@ -479,7 +483,11 @@ static int __init mirred_init_module(void)
return err;
pr_info("Mirror/redirect action on\n");
- return tcf_register_action(&act_mirred_ops, &mirred_net_ops);
+ err = tcf_register_action(&act_mirred_ops, &mirred_net_ops);
+ if (err)
+ unregister_netdevice_notifier(&mirred_device_notifier);
+
+ return err;
}
static void __exit mirred_cleanup_module(void)
diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c
index 0f299e3b618c..be3f215cd027 100644
--- a/net/sched/act_mpls.c
+++ b/net/sched/act_mpls.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
/* Copyright (C) 2019 Netronome Systems, Inc. */
+#include <linux/if_arp.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
@@ -55,7 +56,7 @@ static int tcf_mpls_act(struct sk_buff *skb, const struct tc_action *a,
struct tcf_mpls *m = to_mpls(a);
struct tcf_mpls_params *p;
__be32 new_lse;
- int ret;
+ int ret, mac_len;
tcf_lastuse_update(&m->tcf_tm);
bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb);
@@ -63,8 +64,12 @@ static int tcf_mpls_act(struct sk_buff *skb, const struct tc_action *a,
/* Ensure 'data' points at mac_header prior calling mpls manipulating
* functions.
*/
- if (skb_at_tc_ingress(skb))
+ if (skb_at_tc_ingress(skb)) {
skb_push_rcsum(skb, skb->mac_len);
+ mac_len = skb->mac_len;
+ } else {
+ mac_len = skb_network_header(skb) - skb_mac_header(skb);
+ }
ret = READ_ONCE(m->tcf_action);
@@ -72,12 +77,14 @@ static int tcf_mpls_act(struct sk_buff *skb, const struct tc_action *a,
switch (p->tcfm_action) {
case TCA_MPLS_ACT_POP:
- if (skb_mpls_pop(skb, p->tcfm_proto))
+ if (skb_mpls_pop(skb, p->tcfm_proto, mac_len,
+ skb->dev && skb->dev->type == ARPHRD_ETHER))
goto drop;
break;
case TCA_MPLS_ACT_PUSH:
new_lse = tcf_mpls_get_lse(NULL, p, !eth_p_mpls(skb->protocol));
- if (skb_mpls_push(skb, new_lse, p->tcfm_proto))
+ if (skb_mpls_push(skb, new_lse, p->tcfm_proto, mac_len,
+ skb->dev && skb->dev->type == ARPHRD_ETHER))
goto drop;
break;
case TCA_MPLS_ACT_MODIFY:
@@ -115,7 +122,6 @@ static int valid_label(const struct nlattr *attr,
}
static const struct nla_policy mpls_policy[TCA_MPLS_MAX + 1] = {
- [TCA_MPLS_UNSPEC] = { .strict_start_type = TCA_MPLS_UNSPEC + 1 },
[TCA_MPLS_PARMS] = NLA_POLICY_EXACT_LEN(sizeof(struct tc_mpls)),
[TCA_MPLS_PROTO] = { .type = NLA_U16 },
[TCA_MPLS_LABEL] = NLA_POLICY_VALIDATE_FN(NLA_U32, valid_label),
@@ -127,7 +133,8 @@ static const struct nla_policy mpls_policy[TCA_MPLS_MAX + 1] = {
static int tcf_mpls_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
int ovr, int bind, bool rtnl_held,
- struct tcf_proto *tp, struct netlink_ext_ack *extack)
+ struct tcf_proto *tp, u32 flags,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, mpls_net_id);
struct nlattr *tb[TCA_MPLS_MAX + 1];
@@ -220,7 +227,7 @@ static int tcf_mpls_init(struct net *net, struct nlattr *nla,
if (!exists) {
ret = tcf_idr_create(tn, index, est, a,
- &act_mpls_ops, bind, true);
+ &act_mpls_ops, bind, true, 0);
if (ret) {
tcf_idr_cleanup(tn, index);
return ret;
@@ -258,7 +265,7 @@ static int tcf_mpls_init(struct net *net, struct nlattr *nla,
spin_lock_bh(&m->tcf_lock);
goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
- rcu_swap_protected(m->mpls_p, p, lockdep_is_held(&m->tcf_lock));
+ p = rcu_replace_pointer(m->mpls_p, p, lockdep_is_held(&m->tcf_lock));
spin_unlock_bh(&m->tcf_lock);
if (goto_ch)
@@ -375,7 +382,7 @@ static __net_init int mpls_init_net(struct net *net)
{
struct tc_action_net *tn = net_generic(net, mpls_net_id);
- return tc_action_net_init(tn, &act_mpls_ops);
+ return tc_action_net_init(net, tn, &act_mpls_ops);
}
static void __net_exit mpls_exit_net(struct list_head *net_list)
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index 7b858c11b1b5..855a6fa16a62 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -36,7 +36,7 @@ static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = {
static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
struct tc_action **a, int ovr, int bind,
bool rtnl_held, struct tcf_proto *tp,
- struct netlink_ext_ack *extack)
+ u32 flags, struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, nat_net_id);
struct nlattr *tb[TCA_NAT_MAX + 1];
@@ -61,7 +61,7 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
err = tcf_idr_check_alloc(tn, &index, a, bind);
if (!err) {
ret = tcf_idr_create(tn, index, est, a,
- &act_nat_ops, bind, false);
+ &act_nat_ops, bind, false, 0);
if (ret) {
tcf_idr_cleanup(tn, index);
return ret;
@@ -206,9 +206,7 @@ static int tcf_nat_act(struct sk_buff *skb, const struct tc_action *a,
icmph = (void *)(skb_network_header(skb) + ihl);
- if ((icmph->type != ICMP_DEST_UNREACH) &&
- (icmph->type != ICMP_TIME_EXCEEDED) &&
- (icmph->type != ICMP_PARAMETERPROB))
+ if (!icmp_is_err(icmph->type))
break;
if (!pskb_may_pull(skb, ihl + sizeof(*icmph) + sizeof(*iph) +
@@ -327,7 +325,7 @@ static __net_init int nat_init_net(struct net *net)
{
struct tc_action_net *tn = net_generic(net, nat_net_id);
- return tc_action_net_init(tn, &act_nat_ops);
+ return tc_action_net_init(net, tn, &act_nat_ops);
}
static void __net_exit nat_exit_net(struct list_head *net_list)
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index 17360c6faeaa..3ad718576304 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -43,7 +43,7 @@ static struct tcf_pedit_key_ex *tcf_pedit_keys_ex_parse(struct nlattr *nla,
int err = -EINVAL;
int rem;
- if (!nla || !n)
+ if (!nla)
return NULL;
keys_ex = kcalloc(n, sizeof(*k), GFP_KERNEL);
@@ -137,7 +137,8 @@ nla_failure:
static int tcf_pedit_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
int ovr, int bind, bool rtnl_held,
- struct tcf_proto *tp, struct netlink_ext_ack *extack)
+ struct tcf_proto *tp, u32 flags,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, pedit_net_id);
struct nlattr *tb[TCA_PEDIT_MAX + 1];
@@ -170,6 +171,10 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
}
parm = nla_data(pattr);
+ if (!parm->nkeys) {
+ NL_SET_ERR_MSG_MOD(extack, "Pedit requires keys to be passed");
+ return -EINVAL;
+ }
ksize = parm->nkeys * sizeof(struct tc_pedit_key);
if (nla_len(pattr) < sizeof(*parm) + ksize) {
NL_SET_ERR_MSG_ATTR(extack, pattr, "Length of TCA_PEDIT_PARMS or TCA_PEDIT_PARMS_EX pedit attribute is invalid");
@@ -183,14 +188,8 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
index = parm->index;
err = tcf_idr_check_alloc(tn, &index, a, bind);
if (!err) {
- if (!parm->nkeys) {
- tcf_idr_cleanup(tn, index);
- NL_SET_ERR_MSG_MOD(extack, "Pedit requires keys to be passed");
- ret = -EINVAL;
- goto out_free;
- }
ret = tcf_idr_create(tn, index, est, a,
- &act_pedit_ops, bind, false);
+ &act_pedit_ops, bind, false, 0);
if (ret) {
tcf_idr_cleanup(tn, index);
goto out_free;
@@ -498,7 +497,7 @@ static __net_init int pedit_init_net(struct net *net)
{
struct tc_action_net *tn = net_generic(net, pedit_net_id);
- return tc_action_net_init(tn, &act_pedit_ops);
+ return tc_action_net_init(net, tn, &act_pedit_ops);
}
static void __net_exit pedit_exit_net(struct list_head *net_list)
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 49cec3e64a4d..8b7a0ac96c51 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -40,12 +40,14 @@ static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = {
[TCA_POLICE_PEAKRATE] = { .len = TC_RTAB_SIZE },
[TCA_POLICE_AVRATE] = { .type = NLA_U32 },
[TCA_POLICE_RESULT] = { .type = NLA_U32 },
+ [TCA_POLICE_RATE64] = { .type = NLA_U64 },
+ [TCA_POLICE_PEAKRATE64] = { .type = NLA_U64 },
};
static int tcf_police_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
int ovr, int bind, bool rtnl_held,
- struct tcf_proto *tp,
+ struct tcf_proto *tp, u32 flags,
struct netlink_ext_ack *extack)
{
int ret = 0, tcfp_result = TC_ACT_OK, err, size;
@@ -58,6 +60,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla,
struct tcf_police_params *new;
bool exists = false;
u32 index;
+ u64 rate64, prate64;
if (nla == NULL)
return -EINVAL;
@@ -84,7 +87,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla,
if (!exists) {
ret = tcf_idr_create(tn, index, NULL, a,
- &act_police_ops, bind, true);
+ &act_police_ops, bind, true, 0);
if (ret) {
tcf_idr_cleanup(tn, index);
return ret;
@@ -155,14 +158,18 @@ static int tcf_police_init(struct net *net, struct nlattr *nla,
}
if (R_tab) {
new->rate_present = true;
- psched_ratecfg_precompute(&new->rate, &R_tab->rate, 0);
+ rate64 = tb[TCA_POLICE_RATE64] ?
+ nla_get_u64(tb[TCA_POLICE_RATE64]) : 0;
+ psched_ratecfg_precompute(&new->rate, &R_tab->rate, rate64);
qdisc_put_rtab(R_tab);
} else {
new->rate_present = false;
}
if (P_tab) {
new->peak_present = true;
- psched_ratecfg_precompute(&new->peak, &P_tab->rate, 0);
+ prate64 = tb[TCA_POLICE_PEAKRATE64] ?
+ nla_get_u64(tb[TCA_POLICE_PEAKRATE64]) : 0;
+ psched_ratecfg_precompute(&new->peak, &P_tab->rate, prate64);
qdisc_put_rtab(P_tab);
} else {
new->peak_present = false;
@@ -184,9 +191,9 @@ static int tcf_police_init(struct net *net, struct nlattr *nla,
police->tcfp_ptoks = new->tcfp_mtu_ptoks;
spin_unlock_bh(&police->tcfp_lock);
goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
- rcu_swap_protected(police->params,
- new,
- lockdep_is_held(&police->tcf_lock));
+ new = rcu_replace_pointer(police->params,
+ new,
+ lockdep_is_held(&police->tcf_lock));
spin_unlock_bh(&police->tcf_lock);
if (goto_ch)
@@ -287,10 +294,7 @@ static void tcf_police_stats_update(struct tc_action *a,
struct tcf_police *police = to_police(a);
struct tcf_t *tm = &police->tcf_tm;
- _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets);
- if (hw)
- _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw),
- bytes, packets);
+ tcf_action_update_stats(a, bytes, packets, false, hw);
tm->lastuse = max_t(u64, tm->lastuse, lastuse);
}
@@ -313,10 +317,22 @@ static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a,
lockdep_is_held(&police->tcf_lock));
opt.mtu = p->tcfp_mtu;
opt.burst = PSCHED_NS2TICKS(p->tcfp_burst);
- if (p->rate_present)
+ if (p->rate_present) {
psched_ratecfg_getrate(&opt.rate, &p->rate);
- if (p->peak_present)
+ if ((police->params->rate.rate_bytes_ps >= (1ULL << 32)) &&
+ nla_put_u64_64bit(skb, TCA_POLICE_RATE64,
+ police->params->rate.rate_bytes_ps,
+ TCA_POLICE_PAD))
+ goto nla_put_failure;
+ }
+ if (p->peak_present) {
psched_ratecfg_getrate(&opt.peakrate, &p->peak);
+ if ((police->params->peak.rate_bytes_ps >= (1ULL << 32)) &&
+ nla_put_u64_64bit(skb, TCA_POLICE_PEAKRATE64,
+ police->params->peak.rate_bytes_ps,
+ TCA_POLICE_PAD))
+ goto nla_put_failure;
+ }
if (nla_put(skb, TCA_POLICE_TBF, sizeof(opt), &opt))
goto nla_put_failure;
if (p->tcfp_result &&
@@ -326,10 +342,7 @@ static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a,
nla_put_u32(skb, TCA_POLICE_AVRATE, p->tcfp_ewma_rate))
goto nla_put_failure;
- t.install = jiffies_to_clock_t(jiffies - police->tcf_tm.install);
- t.lastuse = jiffies_to_clock_t(jiffies - police->tcf_tm.lastuse);
- t.firstuse = jiffies_to_clock_t(jiffies - police->tcf_tm.firstuse);
- t.expires = jiffies_to_clock_t(police->tcf_tm.expires);
+ tcf_tm_dump(&t, &police->tcf_tm);
if (nla_put_64bit(skb, TCA_POLICE_TM, sizeof(t), &t, TCA_POLICE_PAD))
goto nla_put_failure;
spin_unlock_bh(&police->tcf_lock);
@@ -371,7 +384,7 @@ static __net_init int police_init_net(struct net *net)
{
struct tc_action_net *tn = net_generic(net, police_net_id);
- return tc_action_net_init(tn, &act_police_ops);
+ return tc_action_net_init(net, tn, &act_police_ops);
}
static void __net_exit police_exit_net(struct list_head *net_list)
diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c
index 595308d60133..ce948c1e24dc 100644
--- a/net/sched/act_sample.c
+++ b/net/sched/act_sample.c
@@ -36,7 +36,7 @@ static const struct nla_policy sample_policy[TCA_SAMPLE_MAX + 1] = {
static int tcf_sample_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a, int ovr,
int bind, bool rtnl_held, struct tcf_proto *tp,
- struct netlink_ext_ack *extack)
+ u32 flags, struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, sample_net_id);
struct nlattr *tb[TCA_SAMPLE_MAX + 1];
@@ -69,7 +69,7 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla,
if (!exists) {
ret = tcf_idr_create(tn, index, est, a,
- &act_sample_ops, bind, true);
+ &act_sample_ops, bind, true, 0);
if (ret) {
tcf_idr_cleanup(tn, index);
return ret;
@@ -102,13 +102,17 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla,
goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
s->rate = rate;
s->psample_group_num = psample_group_num;
- RCU_INIT_POINTER(s->psample_group, psample_group);
+ psample_group = rcu_replace_pointer(s->psample_group, psample_group,
+ lockdep_is_held(&s->tcf_lock));
if (tb[TCA_SAMPLE_TRUNC_SIZE]) {
s->truncate = true;
s->trunc_size = nla_get_u32(tb[TCA_SAMPLE_TRUNC_SIZE]);
}
spin_unlock_bh(&s->tcf_lock);
+
+ if (psample_group)
+ psample_group_put(psample_group);
if (goto_ch)
tcf_chain_put_by_act(goto_ch);
@@ -142,6 +146,7 @@ static bool tcf_sample_dev_ok_push(struct net_device *dev)
case ARPHRD_TUNNEL6:
case ARPHRD_SIT:
case ARPHRD_IPGRE:
+ case ARPHRD_IP6GRE:
case ARPHRD_VOID:
case ARPHRD_NONE:
return false;
@@ -248,6 +253,32 @@ static int tcf_sample_search(struct net *net, struct tc_action **a, u32 index)
return tcf_idr_search(tn, a, index);
}
+static void tcf_psample_group_put(void *priv)
+{
+ struct psample_group *group = priv;
+
+ psample_group_put(group);
+}
+
+static struct psample_group *
+tcf_sample_get_group(const struct tc_action *a,
+ tc_action_priv_destructor *destructor)
+{
+ struct tcf_sample *s = to_sample(a);
+ struct psample_group *group;
+
+ spin_lock_bh(&s->tcf_lock);
+ group = rcu_dereference_protected(s->psample_group,
+ lockdep_is_held(&s->tcf_lock));
+ if (group) {
+ psample_group_take(group);
+ *destructor = tcf_psample_group_put;
+ }
+ spin_unlock_bh(&s->tcf_lock);
+
+ return group;
+}
+
static struct tc_action_ops act_sample_ops = {
.kind = "sample",
.id = TCA_ID_SAMPLE,
@@ -258,6 +289,7 @@ static struct tc_action_ops act_sample_ops = {
.cleanup = tcf_sample_cleanup,
.walk = tcf_sample_walker,
.lookup = tcf_sample_search,
+ .get_psample_group = tcf_sample_get_group,
.size = sizeof(struct tcf_sample),
};
@@ -265,7 +297,7 @@ static __net_init int sample_init_net(struct net *net)
{
struct tc_action_net *tn = net_generic(net, sample_net_id);
- return tc_action_net_init(tn, &act_sample_ops);
+ return tc_action_net_init(net, tn, &act_sample_ops);
}
static void __net_exit sample_exit_net(struct list_head *net_list)
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index 33aefa25b545..9813ca4006dd 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -35,7 +35,7 @@ static int tcf_simp_act(struct sk_buff *skb, const struct tc_action *a,
* Example if this was the 3rd packet and the string was "hello"
* then it would look like "hello_3" (without quotes)
*/
- pr_info("simple: %s_%d\n",
+ pr_info("simple: %s_%llu\n",
(char *)d->tcfd_defdata, d->tcf_bstats.packets);
spin_unlock(&d->tcf_lock);
return d->tcf_action;
@@ -86,7 +86,8 @@ static const struct nla_policy simple_policy[TCA_DEF_MAX + 1] = {
static int tcf_simp_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
int ovr, int bind, bool rtnl_held,
- struct tcf_proto *tp, struct netlink_ext_ack *extack)
+ struct tcf_proto *tp, u32 flags,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, simp_net_id);
struct nlattr *tb[TCA_DEF_MAX + 1];
@@ -127,7 +128,7 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla,
if (!exists) {
ret = tcf_idr_create(tn, index, est, a,
- &act_simp_ops, bind, false);
+ &act_simp_ops, bind, false, 0);
if (ret) {
tcf_idr_cleanup(tn, index);
return ret;
@@ -232,7 +233,7 @@ static __net_init int simp_init_net(struct net *net)
{
struct tc_action_net *tn = net_generic(net, simp_net_id);
- return tc_action_net_init(tn, &act_simp_ops);
+ return tc_action_net_init(net, tn, &act_simp_ops);
}
static void __net_exit simp_exit_net(struct list_head *net_list)
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index 37dced00b63d..e857424c387c 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -86,7 +86,7 @@ static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = {
static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
int ovr, int bind, bool rtnl_held,
- struct tcf_proto *tp,
+ struct tcf_proto *tp, u32 act_flags,
struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, skbedit_net_id);
@@ -165,7 +165,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
if (!exists) {
ret = tcf_idr_create(tn, index, est, a,
- &act_skbedit_ops, bind, true);
+ &act_skbedit_ops, bind, true, 0);
if (ret) {
tcf_idr_cleanup(tn, index);
return ret;
@@ -206,8 +206,8 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
spin_lock_bh(&d->tcf_lock);
goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
- rcu_swap_protected(d->params, params_new,
- lockdep_is_held(&d->tcf_lock));
+ params_new = rcu_replace_pointer(d->params, params_new,
+ lockdep_is_held(&d->tcf_lock));
spin_unlock_bh(&d->tcf_lock);
if (params_new)
kfree_rcu(params_new, rcu);
@@ -336,7 +336,7 @@ static __net_init int skbedit_init_net(struct net *net)
{
struct tc_action_net *tn = net_generic(net, skbedit_net_id);
- return tc_action_net_init(tn, &act_skbedit_ops);
+ return tc_action_net_init(net, tn, &act_skbedit_ops);
}
static void __net_exit skbedit_exit_net(struct list_head *net_list)
diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c
index 7da3518e18ef..39e6d94cfafb 100644
--- a/net/sched/act_skbmod.c
+++ b/net/sched/act_skbmod.c
@@ -79,7 +79,7 @@ static const struct nla_policy skbmod_policy[TCA_SKBMOD_MAX + 1] = {
static int tcf_skbmod_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
int ovr, int bind, bool rtnl_held,
- struct tcf_proto *tp,
+ struct tcf_proto *tp, u32 flags,
struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, skbmod_net_id);
@@ -143,7 +143,7 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla,
if (!exists) {
ret = tcf_idr_create(tn, index, est, a,
- &act_skbmod_ops, bind, true);
+ &act_skbmod_ops, bind, true, 0);
if (ret) {
tcf_idr_cleanup(tn, index);
return ret;
@@ -287,7 +287,7 @@ static __net_init int skbmod_init_net(struct net *net)
{
struct tc_action_net *tn = net_generic(net, skbmod_net_id);
- return tc_action_net_init(tn, &act_skbmod_ops);
+ return tc_action_net_init(net, tn, &act_skbmod_ops);
}
static void __net_exit skbmod_exit_net(struct list_head *net_list)
diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
index 6d0debdc9b97..536c4bc31be6 100644
--- a/net/sched/act_tunnel_key.c
+++ b/net/sched/act_tunnel_key.c
@@ -10,6 +10,8 @@
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <net/geneve.h>
+#include <net/vxlan.h>
+#include <net/erspan.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/dst.h>
@@ -31,7 +33,7 @@ static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a,
params = rcu_dereference_bh(t->params);
tcf_lastuse_update(&t->tcf_tm);
- bstats_cpu_update(this_cpu_ptr(t->common.cpu_bstats), skb);
+ tcf_action_update_bstats(&t->common, skb);
action = READ_ONCE(t->tcf_action);
switch (params->tcft_action) {
@@ -53,7 +55,11 @@ static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a,
static const struct nla_policy
enc_opts_policy[TCA_TUNNEL_KEY_ENC_OPTS_MAX + 1] = {
+ [TCA_TUNNEL_KEY_ENC_OPTS_UNSPEC] = {
+ .strict_start_type = TCA_TUNNEL_KEY_ENC_OPTS_VXLAN },
[TCA_TUNNEL_KEY_ENC_OPTS_GENEVE] = { .type = NLA_NESTED },
+ [TCA_TUNNEL_KEY_ENC_OPTS_VXLAN] = { .type = NLA_NESTED },
+ [TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN] = { .type = NLA_NESTED },
};
static const struct nla_policy
@@ -64,6 +70,19 @@ geneve_opt_policy[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX + 1] = {
.len = 128 },
};
+static const struct nla_policy
+vxlan_opt_policy[TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX + 1] = {
+ [TCA_TUNNEL_KEY_ENC_OPT_VXLAN_GBP] = { .type = NLA_U32 },
+};
+
+static const struct nla_policy
+erspan_opt_policy[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_MAX + 1] = {
+ [TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_VER] = { .type = NLA_U8 },
+ [TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_INDEX] = { .type = NLA_U32 },
+ [TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_DIR] = { .type = NLA_U8 },
+ [TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_HWID] = { .type = NLA_U8 },
+};
+
static int
tunnel_key_copy_geneve_opt(const struct nlattr *nla, void *dst, int dst_len,
struct netlink_ext_ack *extack)
@@ -116,10 +135,89 @@ tunnel_key_copy_geneve_opt(const struct nlattr *nla, void *dst, int dst_len,
return opt_len;
}
+static int
+tunnel_key_copy_vxlan_opt(const struct nlattr *nla, void *dst, int dst_len,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX + 1];
+ int err;
+
+ err = nla_parse_nested(tb, TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX, nla,
+ vxlan_opt_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (!tb[TCA_TUNNEL_KEY_ENC_OPT_VXLAN_GBP]) {
+ NL_SET_ERR_MSG(extack, "Missing tunnel key vxlan option gbp");
+ return -EINVAL;
+ }
+
+ if (dst) {
+ struct vxlan_metadata *md = dst;
+
+ md->gbp = nla_get_u32(tb[TCA_TUNNEL_KEY_ENC_OPT_VXLAN_GBP]);
+ }
+
+ return sizeof(struct vxlan_metadata);
+}
+
+static int
+tunnel_key_copy_erspan_opt(const struct nlattr *nla, void *dst, int dst_len,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_MAX + 1];
+ int err;
+ u8 ver;
+
+ err = nla_parse_nested(tb, TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_MAX, nla,
+ erspan_opt_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (!tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_VER]) {
+ NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option ver");
+ return -EINVAL;
+ }
+
+ ver = nla_get_u8(tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_VER]);
+ if (ver == 1) {
+ if (!tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_INDEX]) {
+ NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option index");
+ return -EINVAL;
+ }
+ } else if (ver == 2) {
+ if (!tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_DIR] ||
+ !tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_HWID]) {
+ NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option dir or hwid");
+ return -EINVAL;
+ }
+ } else {
+ NL_SET_ERR_MSG(extack, "Tunnel key erspan option ver is incorrect");
+ return -EINVAL;
+ }
+
+ if (dst) {
+ struct erspan_metadata *md = dst;
+
+ md->version = ver;
+ if (ver == 1) {
+ nla = tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_INDEX];
+ md->u.index = nla_get_be32(nla);
+ } else {
+ nla = tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_DIR];
+ md->u.md2.dir = nla_get_u8(nla);
+ nla = tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_HWID];
+ set_hwid(&md->u.md2, nla_get_u8(nla));
+ }
+ }
+
+ return sizeof(struct erspan_metadata);
+}
+
static int tunnel_key_copy_opts(const struct nlattr *nla, u8 *dst,
int dst_len, struct netlink_ext_ack *extack)
{
- int err, rem, opt_len, len = nla_len(nla), opts_len = 0;
+ int err, rem, opt_len, len = nla_len(nla), opts_len = 0, type = 0;
const struct nlattr *attr, *head = nla_data(nla);
err = nla_validate_deprecated(head, len, TCA_TUNNEL_KEY_ENC_OPTS_MAX,
@@ -130,15 +228,48 @@ static int tunnel_key_copy_opts(const struct nlattr *nla, u8 *dst,
nla_for_each_attr(attr, head, len, rem) {
switch (nla_type(attr)) {
case TCA_TUNNEL_KEY_ENC_OPTS_GENEVE:
+ if (type && type != TUNNEL_GENEVE_OPT) {
+ NL_SET_ERR_MSG(extack, "Duplicate type for geneve options");
+ return -EINVAL;
+ }
opt_len = tunnel_key_copy_geneve_opt(attr, dst,
dst_len, extack);
if (opt_len < 0)
return opt_len;
opts_len += opt_len;
+ if (opts_len > IP_TUNNEL_OPTS_MAX) {
+ NL_SET_ERR_MSG(extack, "Tunnel options exceeds max size");
+ return -EINVAL;
+ }
if (dst) {
dst_len -= opt_len;
dst += opt_len;
}
+ type = TUNNEL_GENEVE_OPT;
+ break;
+ case TCA_TUNNEL_KEY_ENC_OPTS_VXLAN:
+ if (type) {
+ NL_SET_ERR_MSG(extack, "Duplicate type for vxlan options");
+ return -EINVAL;
+ }
+ opt_len = tunnel_key_copy_vxlan_opt(attr, dst,
+ dst_len, extack);
+ if (opt_len < 0)
+ return opt_len;
+ opts_len += opt_len;
+ type = TUNNEL_VXLAN_OPT;
+ break;
+ case TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN:
+ if (type) {
+ NL_SET_ERR_MSG(extack, "Duplicate type for erspan options");
+ return -EINVAL;
+ }
+ opt_len = tunnel_key_copy_erspan_opt(attr, dst,
+ dst_len, extack);
+ if (opt_len < 0)
+ return opt_len;
+ opts_len += opt_len;
+ type = TUNNEL_ERSPAN_OPT;
break;
}
}
@@ -175,6 +306,22 @@ static int tunnel_key_opts_set(struct nlattr *nla, struct ip_tunnel_info *info,
#else
return -EAFNOSUPPORT;
#endif
+ case TCA_TUNNEL_KEY_ENC_OPTS_VXLAN:
+#if IS_ENABLED(CONFIG_INET)
+ info->key.tun_flags |= TUNNEL_VXLAN_OPT;
+ return tunnel_key_copy_opts(nla, ip_tunnel_info_opts(info),
+ opts_len, extack);
+#else
+ return -EAFNOSUPPORT;
+#endif
+ case TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN:
+#if IS_ENABLED(CONFIG_INET)
+ info->key.tun_flags |= TUNNEL_ERSPAN_OPT;
+ return tunnel_key_copy_opts(nla, ip_tunnel_info_opts(info),
+ opts_len, extack);
+#else
+ return -EAFNOSUPPORT;
+#endif
default:
NL_SET_ERR_MSG(extack, "Cannot set tunnel options for unknown tunnel type");
return -EINVAL;
@@ -208,7 +355,7 @@ static void tunnel_key_release_params(struct tcf_tunnel_key_params *p)
static int tunnel_key_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
int ovr, int bind, bool rtnl_held,
- struct tcf_proto *tp,
+ struct tcf_proto *tp, u32 act_flags,
struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
@@ -347,8 +494,9 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
}
if (!exists) {
- ret = tcf_idr_create(tn, index, est, a,
- &act_tunnel_key_ops, bind, true);
+ ret = tcf_idr_create_from_flags(tn, index, est, a,
+ &act_tunnel_key_ops, bind,
+ act_flags);
if (ret) {
NL_SET_ERR_MSG(extack, "Cannot create TC IDR");
goto release_tun_meta;
@@ -381,8 +529,8 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
spin_lock_bh(&t->tcf_lock);
goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
- rcu_swap_protected(t->params, params_new,
- lockdep_is_held(&t->tcf_lock));
+ params_new = rcu_replace_pointer(t->params, params_new,
+ lockdep_is_held(&t->tcf_lock));
spin_unlock_bh(&t->tcf_lock);
tunnel_key_release_params(params_new);
if (goto_ch)
@@ -450,6 +598,56 @@ static int tunnel_key_geneve_opts_dump(struct sk_buff *skb,
return 0;
}
+static int tunnel_key_vxlan_opts_dump(struct sk_buff *skb,
+ const struct ip_tunnel_info *info)
+{
+ struct vxlan_metadata *md = (struct vxlan_metadata *)(info + 1);
+ struct nlattr *start;
+
+ start = nla_nest_start_noflag(skb, TCA_TUNNEL_KEY_ENC_OPTS_VXLAN);
+ if (!start)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(skb, TCA_TUNNEL_KEY_ENC_OPT_VXLAN_GBP, md->gbp)) {
+ nla_nest_cancel(skb, start);
+ return -EMSGSIZE;
+ }
+
+ nla_nest_end(skb, start);
+ return 0;
+}
+
+static int tunnel_key_erspan_opts_dump(struct sk_buff *skb,
+ const struct ip_tunnel_info *info)
+{
+ struct erspan_metadata *md = (struct erspan_metadata *)(info + 1);
+ struct nlattr *start;
+
+ start = nla_nest_start_noflag(skb, TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN);
+ if (!start)
+ return -EMSGSIZE;
+
+ if (nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_VER, md->version))
+ goto err;
+
+ if (md->version == 1 &&
+ nla_put_be32(skb, TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_INDEX, md->u.index))
+ goto err;
+
+ if (md->version == 2 &&
+ (nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_DIR,
+ md->u.md2.dir) ||
+ nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_HWID,
+ get_hwid(&md->u.md2))))
+ goto err;
+
+ nla_nest_end(skb, start);
+ return 0;
+err:
+ nla_nest_cancel(skb, start);
+ return -EMSGSIZE;
+}
+
static int tunnel_key_opts_dump(struct sk_buff *skb,
const struct ip_tunnel_info *info)
{
@@ -467,6 +665,14 @@ static int tunnel_key_opts_dump(struct sk_buff *skb,
err = tunnel_key_geneve_opts_dump(skb, info);
if (err)
goto err_out;
+ } else if (info->key.tun_flags & TUNNEL_VXLAN_OPT) {
+ err = tunnel_key_vxlan_opts_dump(skb, info);
+ if (err)
+ goto err_out;
+ } else if (info->key.tun_flags & TUNNEL_ERSPAN_OPT) {
+ err = tunnel_key_erspan_opts_dump(skb, info);
+ if (err)
+ goto err_out;
} else {
err_out:
nla_nest_cancel(skb, start);
@@ -600,7 +806,7 @@ static __net_init int tunnel_key_init_net(struct net *net)
{
struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
- return tc_action_net_init(tn, &act_tunnel_key_ops);
+ return tc_action_net_init(net, tn, &act_tunnel_key_ops);
}
static void __net_exit tunnel_key_exit_net(struct list_head *net_list)
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index a3c9eea1ee8a..c91d3958fcbb 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -29,7 +29,7 @@ static int tcf_vlan_act(struct sk_buff *skb, const struct tc_action *a,
u16 tci;
tcf_lastuse_update(&v->tcf_tm);
- bstats_cpu_update(this_cpu_ptr(v->common.cpu_bstats), skb);
+ tcf_action_update_bstats(&v->common, skb);
/* Ensure 'data' points at mac_header prior calling vlan manipulating
* functions.
@@ -88,7 +88,7 @@ out:
return action;
drop:
- qstats_drop_inc(this_cpu_ptr(v->common.cpu_qstats));
+ tcf_action_inc_drop_qstats(&v->common);
return TC_ACT_SHOT;
}
@@ -102,7 +102,8 @@ static const struct nla_policy vlan_policy[TCA_VLAN_MAX + 1] = {
static int tcf_vlan_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
int ovr, int bind, bool rtnl_held,
- struct tcf_proto *tp, struct netlink_ext_ack *extack)
+ struct tcf_proto *tp, u32 flags,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, vlan_net_id);
struct nlattr *tb[TCA_VLAN_MAX + 1];
@@ -188,8 +189,8 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
action = parm->v_action;
if (!exists) {
- ret = tcf_idr_create(tn, index, est, a,
- &act_vlan_ops, bind, true);
+ ret = tcf_idr_create_from_flags(tn, index, est, a,
+ &act_vlan_ops, bind, flags);
if (ret) {
tcf_idr_cleanup(tn, index);
return ret;
@@ -220,7 +221,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
spin_lock_bh(&v->tcf_lock);
goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
- rcu_swap_protected(v->vlan_p, p, lockdep_is_held(&v->tcf_lock));
+ p = rcu_replace_pointer(v->vlan_p, p, lockdep_is_held(&v->tcf_lock));
spin_unlock_bh(&v->tcf_lock);
if (goto_ch)
@@ -301,6 +302,16 @@ static int tcf_vlan_walker(struct net *net, struct sk_buff *skb,
return tcf_generic_walker(tn, skb, cb, type, ops, extack);
}
+static void tcf_vlan_stats_update(struct tc_action *a, u64 bytes, u32 packets,
+ u64 lastuse, bool hw)
+{
+ struct tcf_vlan *v = to_vlan(a);
+ struct tcf_t *tm = &v->tcf_tm;
+
+ tcf_action_update_stats(a, bytes, packets, false, hw);
+ tm->lastuse = max_t(u64, tm->lastuse, lastuse);
+}
+
static int tcf_vlan_search(struct net *net, struct tc_action **a, u32 index)
{
struct tc_action_net *tn = net_generic(net, vlan_net_id);
@@ -325,6 +336,7 @@ static struct tc_action_ops act_vlan_ops = {
.init = tcf_vlan_init,
.cleanup = tcf_vlan_cleanup,
.walk = tcf_vlan_walker,
+ .stats_update = tcf_vlan_stats_update,
.get_fill_size = tcf_vlan_get_fill_size,
.lookup = tcf_vlan_search,
.size = sizeof(struct tcf_vlan),
@@ -334,7 +346,7 @@ static __net_init int vlan_init_net(struct net *net)
{
struct tc_action_net *tn = net_generic(net, vlan_net_id);
- return tc_action_net_init(tn, &act_vlan_ops);
+ return tc_action_net_init(net, tn, &act_vlan_ops);
}
static void __net_exit vlan_exit_net(struct list_head *net_list)
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index efd3cfb80a2a..c2cdd0fc2e70 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -21,6 +21,7 @@
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/rhashtable.h>
+#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/netlink.h>
@@ -36,6 +37,8 @@
#include <net/tc_act/tc_sample.h>
#include <net/tc_act/tc_skbedit.h>
#include <net/tc_act/tc_ct.h>
+#include <net/tc_act/tc_mpls.h>
+#include <net/flow_offload.h>
extern const struct nla_policy rtm_tca_policy[TCA_MAX + 1];
@@ -45,6 +48,62 @@ static LIST_HEAD(tcf_proto_base);
/* Protects list of registered TC modules. It is pure SMP lock. */
static DEFINE_RWLOCK(cls_mod_lock);
+static u32 destroy_obj_hashfn(const struct tcf_proto *tp)
+{
+ return jhash_3words(tp->chain->index, tp->prio,
+ (__force __u32)tp->protocol, 0);
+}
+
+static void tcf_proto_signal_destroying(struct tcf_chain *chain,
+ struct tcf_proto *tp)
+{
+ struct tcf_block *block = chain->block;
+
+ mutex_lock(&block->proto_destroy_lock);
+ hash_add_rcu(block->proto_destroy_ht, &tp->destroy_ht_node,
+ destroy_obj_hashfn(tp));
+ mutex_unlock(&block->proto_destroy_lock);
+}
+
+static bool tcf_proto_cmp(const struct tcf_proto *tp1,
+ const struct tcf_proto *tp2)
+{
+ return tp1->chain->index == tp2->chain->index &&
+ tp1->prio == tp2->prio &&
+ tp1->protocol == tp2->protocol;
+}
+
+static bool tcf_proto_exists_destroying(struct tcf_chain *chain,
+ struct tcf_proto *tp)
+{
+ u32 hash = destroy_obj_hashfn(tp);
+ struct tcf_proto *iter;
+ bool found = false;
+
+ rcu_read_lock();
+ hash_for_each_possible_rcu(chain->block->proto_destroy_ht, iter,
+ destroy_ht_node, hash) {
+ if (tcf_proto_cmp(tp, iter)) {
+ found = true;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return found;
+}
+
+static void
+tcf_proto_signal_destroyed(struct tcf_chain *chain, struct tcf_proto *tp)
+{
+ struct tcf_block *block = chain->block;
+
+ mutex_lock(&block->proto_destroy_lock);
+ if (hash_hashed(&tp->destroy_ht_node))
+ hash_del_rcu(&tp->destroy_ht_node);
+ mutex_unlock(&block->proto_destroy_lock);
+}
+
/* Find classifier type by string name */
static const struct tcf_proto_ops *__tcf_proto_lookup_ops(const char *kind)
@@ -160,11 +219,22 @@ static inline u32 tcf_auto_prio(struct tcf_proto *tp)
return TC_H_MAJ(first);
}
+static bool tcf_proto_check_kind(struct nlattr *kind, char *name)
+{
+ if (kind)
+ return nla_strlcpy(name, kind, IFNAMSIZ) >= IFNAMSIZ;
+ memset(name, 0, IFNAMSIZ);
+ return false;
+}
+
static bool tcf_proto_is_unlocked(const char *kind)
{
const struct tcf_proto_ops *ops;
bool ret;
+ if (strlen(kind) == 0)
+ return false;
+
ops = tcf_proto_lookup_ops(kind, false, NULL);
/* On error return false to take rtnl lock. Proto lookup/create
* functions will perform lookup again and properly handle errors.
@@ -221,9 +291,11 @@ static void tcf_proto_get(struct tcf_proto *tp)
static void tcf_chain_put(struct tcf_chain *chain);
static void tcf_proto_destroy(struct tcf_proto *tp, bool rtnl_held,
- struct netlink_ext_ack *extack)
+ bool sig_destroy, struct netlink_ext_ack *extack)
{
tp->ops->destroy(tp, rtnl_held, extack);
+ if (sig_destroy)
+ tcf_proto_signal_destroyed(tp->chain, tp);
tcf_chain_put(tp->chain);
module_put(tp->ops->owner);
kfree_rcu(tp, rcu);
@@ -233,36 +305,15 @@ static void tcf_proto_put(struct tcf_proto *tp, bool rtnl_held,
struct netlink_ext_ack *extack)
{
if (refcount_dec_and_test(&tp->refcnt))
- tcf_proto_destroy(tp, rtnl_held, extack);
-}
-
-static int walker_check_empty(struct tcf_proto *tp, void *fh,
- struct tcf_walker *arg)
-{
- if (fh) {
- arg->nonempty = true;
- return -1;
- }
- return 0;
+ tcf_proto_destroy(tp, rtnl_held, true, extack);
}
-static bool tcf_proto_is_empty(struct tcf_proto *tp, bool rtnl_held)
+static bool tcf_proto_check_delete(struct tcf_proto *tp)
{
- struct tcf_walker walker = { .fn = walker_check_empty, };
+ if (tp->ops->delete_empty)
+ return tp->ops->delete_empty(tp);
- if (tp->ops->walk) {
- tp->ops->walk(tp, &walker, rtnl_held);
- return !walker.nonempty;
- }
- return true;
-}
-
-static bool tcf_proto_check_delete(struct tcf_proto *tp, bool rtnl_held)
-{
- spin_lock(&tp->lock);
- if (tcf_proto_is_empty(tp, rtnl_held))
- tp->deleting = true;
- spin_unlock(&tp->lock);
+ tp->deleting = true;
return tp->deleting;
}
@@ -357,6 +408,7 @@ static bool tcf_chain_detach(struct tcf_chain *chain)
static void tcf_block_destroy(struct tcf_block *block)
{
mutex_destroy(&block->lock);
+ mutex_destroy(&block->proto_destroy_lock);
kfree_rcu(block, rcu);
}
@@ -532,6 +584,12 @@ static void tcf_chain_flush(struct tcf_chain *chain, bool rtnl_held)
mutex_lock(&chain->filter_chain_lock);
tp = tcf_chain_dereference(chain->filter_chain, chain);
+ while (tp) {
+ tp_next = rcu_dereference_protected(tp->next, 1);
+ tcf_proto_signal_destroying(chain, tp);
+ tp = tp_next;
+ }
+ tp = tcf_chain_dereference(chain->filter_chain, chain);
RCU_INIT_POINTER(chain->filter_chain, NULL);
tcf_chain0_head_change(chain, NULL);
chain->flushing = true;
@@ -544,235 +602,87 @@ static void tcf_chain_flush(struct tcf_chain *chain, bool rtnl_held)
}
}
-static struct tcf_block *tc_dev_ingress_block(struct net_device *dev)
-{
- const struct Qdisc_class_ops *cops;
- struct Qdisc *qdisc;
-
- if (!dev_ingress_queue(dev))
- return NULL;
-
- qdisc = dev_ingress_queue(dev)->qdisc_sleeping;
- if (!qdisc)
- return NULL;
-
- cops = qdisc->ops->cl_ops;
- if (!cops)
- return NULL;
-
- if (!cops->tcf_block)
- return NULL;
-
- return cops->tcf_block(qdisc, TC_H_MIN_INGRESS, NULL);
-}
-
-static struct rhashtable indr_setup_block_ht;
-
-struct tc_indr_block_dev {
- struct rhash_head ht_node;
- struct net_device *dev;
- unsigned int refcnt;
- struct list_head cb_list;
- struct tcf_block *block;
-};
-
-struct tc_indr_block_cb {
- struct list_head list;
- void *cb_priv;
- tc_indr_block_bind_cb_t *cb;
- void *cb_ident;
-};
-
-static const struct rhashtable_params tc_indr_setup_block_ht_params = {
- .key_offset = offsetof(struct tc_indr_block_dev, dev),
- .head_offset = offsetof(struct tc_indr_block_dev, ht_node),
- .key_len = sizeof(struct net_device *),
-};
-
-static struct tc_indr_block_dev *
-tc_indr_block_dev_lookup(struct net_device *dev)
-{
- return rhashtable_lookup_fast(&indr_setup_block_ht, &dev,
- tc_indr_setup_block_ht_params);
-}
-
-static struct tc_indr_block_dev *tc_indr_block_dev_get(struct net_device *dev)
-{
- struct tc_indr_block_dev *indr_dev;
-
- indr_dev = tc_indr_block_dev_lookup(dev);
- if (indr_dev)
- goto inc_ref;
-
- indr_dev = kzalloc(sizeof(*indr_dev), GFP_KERNEL);
- if (!indr_dev)
- return NULL;
-
- INIT_LIST_HEAD(&indr_dev->cb_list);
- indr_dev->dev = dev;
- indr_dev->block = tc_dev_ingress_block(dev);
- if (rhashtable_insert_fast(&indr_setup_block_ht, &indr_dev->ht_node,
- tc_indr_setup_block_ht_params)) {
- kfree(indr_dev);
- return NULL;
- }
-
-inc_ref:
- indr_dev->refcnt++;
- return indr_dev;
-}
-
-static void tc_indr_block_dev_put(struct tc_indr_block_dev *indr_dev)
-{
- if (--indr_dev->refcnt)
- return;
-
- rhashtable_remove_fast(&indr_setup_block_ht, &indr_dev->ht_node,
- tc_indr_setup_block_ht_params);
- kfree(indr_dev);
-}
-
-static struct tc_indr_block_cb *
-tc_indr_block_cb_lookup(struct tc_indr_block_dev *indr_dev,
- tc_indr_block_bind_cb_t *cb, void *cb_ident)
-{
- struct tc_indr_block_cb *indr_block_cb;
-
- list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list)
- if (indr_block_cb->cb == cb &&
- indr_block_cb->cb_ident == cb_ident)
- return indr_block_cb;
- return NULL;
-}
-
-static struct tc_indr_block_cb *
-tc_indr_block_cb_add(struct tc_indr_block_dev *indr_dev, void *cb_priv,
- tc_indr_block_bind_cb_t *cb, void *cb_ident)
-{
- struct tc_indr_block_cb *indr_block_cb;
-
- indr_block_cb = tc_indr_block_cb_lookup(indr_dev, cb, cb_ident);
- if (indr_block_cb)
- return ERR_PTR(-EEXIST);
-
- indr_block_cb = kzalloc(sizeof(*indr_block_cb), GFP_KERNEL);
- if (!indr_block_cb)
- return ERR_PTR(-ENOMEM);
-
- indr_block_cb->cb_priv = cb_priv;
- indr_block_cb->cb = cb;
- indr_block_cb->cb_ident = cb_ident;
- list_add(&indr_block_cb->list, &indr_dev->cb_list);
-
- return indr_block_cb;
-}
-
-static void tc_indr_block_cb_del(struct tc_indr_block_cb *indr_block_cb)
-{
- list_del(&indr_block_cb->list);
- kfree(indr_block_cb);
-}
-
static int tcf_block_setup(struct tcf_block *block,
struct flow_block_offload *bo);
-static void tc_indr_block_ing_cmd(struct tc_indr_block_dev *indr_dev,
- struct tc_indr_block_cb *indr_block_cb,
- enum flow_block_command command)
+static void tc_indr_block_cmd(struct net_device *dev, struct tcf_block *block,
+ flow_indr_block_bind_cb_t *cb, void *cb_priv,
+ enum flow_block_command command, bool ingress)
{
struct flow_block_offload bo = {
.command = command,
- .binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS,
- .net = dev_net(indr_dev->dev),
- .block_shared = tcf_block_non_null_shared(indr_dev->block),
+ .binder_type = ingress ?
+ FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS :
+ FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS,
+ .net = dev_net(dev),
+ .block_shared = tcf_block_non_null_shared(block),
};
INIT_LIST_HEAD(&bo.cb_list);
- if (!indr_dev->block)
+ if (!block)
return;
- bo.block = &indr_dev->block->flow_block;
+ bo.block = &block->flow_block;
+
+ down_write(&block->cb_lock);
+ cb(dev, cb_priv, TC_SETUP_BLOCK, &bo);
- indr_block_cb->cb(indr_dev->dev, indr_block_cb->cb_priv, TC_SETUP_BLOCK,
- &bo);
- tcf_block_setup(indr_dev->block, &bo);
+ tcf_block_setup(block, &bo);
+ up_write(&block->cb_lock);
}
-int __tc_indr_block_cb_register(struct net_device *dev, void *cb_priv,
- tc_indr_block_bind_cb_t *cb, void *cb_ident)
+static struct tcf_block *tc_dev_block(struct net_device *dev, bool ingress)
{
- struct tc_indr_block_cb *indr_block_cb;
- struct tc_indr_block_dev *indr_dev;
- int err;
+ const struct Qdisc_class_ops *cops;
+ const struct Qdisc_ops *ops;
+ struct Qdisc *qdisc;
- indr_dev = tc_indr_block_dev_get(dev);
- if (!indr_dev)
- return -ENOMEM;
+ if (!dev_ingress_queue(dev))
+ return NULL;
- indr_block_cb = tc_indr_block_cb_add(indr_dev, cb_priv, cb, cb_ident);
- err = PTR_ERR_OR_ZERO(indr_block_cb);
- if (err)
- goto err_dev_put;
+ qdisc = dev_ingress_queue(dev)->qdisc_sleeping;
+ if (!qdisc)
+ return NULL;
- tc_indr_block_ing_cmd(indr_dev, indr_block_cb, FLOW_BLOCK_BIND);
- return 0;
+ ops = qdisc->ops;
+ if (!ops)
+ return NULL;
-err_dev_put:
- tc_indr_block_dev_put(indr_dev);
- return err;
-}
-EXPORT_SYMBOL_GPL(__tc_indr_block_cb_register);
+ if (!ingress && !strcmp("ingress", ops->id))
+ return NULL;
-int tc_indr_block_cb_register(struct net_device *dev, void *cb_priv,
- tc_indr_block_bind_cb_t *cb, void *cb_ident)
-{
- int err;
+ cops = ops->cl_ops;
+ if (!cops)
+ return NULL;
- rtnl_lock();
- err = __tc_indr_block_cb_register(dev, cb_priv, cb, cb_ident);
- rtnl_unlock();
+ if (!cops->tcf_block)
+ return NULL;
- return err;
+ return cops->tcf_block(qdisc,
+ ingress ? TC_H_MIN_INGRESS : TC_H_MIN_EGRESS,
+ NULL);
}
-EXPORT_SYMBOL_GPL(tc_indr_block_cb_register);
-void __tc_indr_block_cb_unregister(struct net_device *dev,
- tc_indr_block_bind_cb_t *cb, void *cb_ident)
+static void tc_indr_block_get_and_cmd(struct net_device *dev,
+ flow_indr_block_bind_cb_t *cb,
+ void *cb_priv,
+ enum flow_block_command command)
{
- struct tc_indr_block_cb *indr_block_cb;
- struct tc_indr_block_dev *indr_dev;
-
- indr_dev = tc_indr_block_dev_lookup(dev);
- if (!indr_dev)
- return;
+ struct tcf_block *block;
- indr_block_cb = tc_indr_block_cb_lookup(indr_dev, cb, cb_ident);
- if (!indr_block_cb)
- return;
+ block = tc_dev_block(dev, true);
+ tc_indr_block_cmd(dev, block, cb, cb_priv, command, true);
- /* Send unbind message if required to free any block cbs. */
- tc_indr_block_ing_cmd(indr_dev, indr_block_cb, FLOW_BLOCK_UNBIND);
- tc_indr_block_cb_del(indr_block_cb);
- tc_indr_block_dev_put(indr_dev);
-}
-EXPORT_SYMBOL_GPL(__tc_indr_block_cb_unregister);
-
-void tc_indr_block_cb_unregister(struct net_device *dev,
- tc_indr_block_bind_cb_t *cb, void *cb_ident)
-{
- rtnl_lock();
- __tc_indr_block_cb_unregister(dev, cb, cb_ident);
- rtnl_unlock();
+ block = tc_dev_block(dev, false);
+ tc_indr_block_cmd(dev, block, cb, cb_priv, command, false);
}
-EXPORT_SYMBOL_GPL(tc_indr_block_cb_unregister);
-static void tc_indr_block_call(struct tcf_block *block, struct net_device *dev,
+static void tc_indr_block_call(struct tcf_block *block,
+ struct net_device *dev,
struct tcf_block_ext_info *ei,
enum flow_block_command command,
struct netlink_ext_ack *extack)
{
- struct tc_indr_block_cb *indr_block_cb;
- struct tc_indr_block_dev *indr_dev;
struct flow_block_offload bo = {
.command = command,
.binder_type = ei->binder_type,
@@ -783,22 +693,13 @@ static void tc_indr_block_call(struct tcf_block *block, struct net_device *dev,
};
INIT_LIST_HEAD(&bo.cb_list);
- indr_dev = tc_indr_block_dev_lookup(dev);
- if (!indr_dev)
- return;
-
- indr_dev->block = command == FLOW_BLOCK_BIND ? block : NULL;
-
- list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list)
- indr_block_cb->cb(dev, indr_block_cb->cb_priv, TC_SETUP_BLOCK,
- &bo);
-
+ flow_indr_block_call(dev, &bo, command);
tcf_block_setup(block, &bo);
}
static bool tcf_block_offload_in_use(struct tcf_block *block)
{
- return block->offloadcnt;
+ return atomic_read(&block->offloadcnt);
}
static int tcf_block_offload_cmd(struct tcf_block *block,
@@ -832,6 +733,7 @@ static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q,
struct net_device *dev = q->dev_queue->dev;
int err;
+ down_write(&block->cb_lock);
if (!dev->netdev_ops->ndo_setup_tc)
goto no_offload_dev_inc;
@@ -840,24 +742,31 @@ static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q,
*/
if (!tc_can_offload(dev) && tcf_block_offload_in_use(block)) {
NL_SET_ERR_MSG(extack, "Bind to offloaded block failed as dev has offload disabled");
- return -EOPNOTSUPP;
+ err = -EOPNOTSUPP;
+ goto err_unlock;
}
err = tcf_block_offload_cmd(block, dev, ei, FLOW_BLOCK_BIND, extack);
if (err == -EOPNOTSUPP)
goto no_offload_dev_inc;
if (err)
- return err;
+ goto err_unlock;
tc_indr_block_call(block, dev, ei, FLOW_BLOCK_BIND, extack);
+ up_write(&block->cb_lock);
return 0;
no_offload_dev_inc:
- if (tcf_block_offload_in_use(block))
- return -EOPNOTSUPP;
+ if (tcf_block_offload_in_use(block)) {
+ err = -EOPNOTSUPP;
+ goto err_unlock;
+ }
+ err = 0;
block->nooffloaddevcnt++;
tc_indr_block_call(block, dev, ei, FLOW_BLOCK_BIND, extack);
- return 0;
+err_unlock:
+ up_write(&block->cb_lock);
+ return err;
}
static void tcf_block_offload_unbind(struct tcf_block *block, struct Qdisc *q,
@@ -866,6 +775,7 @@ static void tcf_block_offload_unbind(struct tcf_block *block, struct Qdisc *q,
struct net_device *dev = q->dev_queue->dev;
int err;
+ down_write(&block->cb_lock);
tc_indr_block_call(block, dev, ei, FLOW_BLOCK_UNBIND, NULL);
if (!dev->netdev_ops->ndo_setup_tc)
@@ -873,10 +783,12 @@ static void tcf_block_offload_unbind(struct tcf_block *block, struct Qdisc *q,
err = tcf_block_offload_cmd(block, dev, ei, FLOW_BLOCK_UNBIND, NULL);
if (err == -EOPNOTSUPP)
goto no_offload_dev_dec;
+ up_write(&block->cb_lock);
return;
no_offload_dev_dec:
WARN_ON(block->nooffloaddevcnt-- == 0);
+ up_write(&block->cb_lock);
}
static int
@@ -991,6 +903,8 @@ static struct tcf_block *tcf_block_create(struct net *net, struct Qdisc *q,
return ERR_PTR(-ENOMEM);
}
mutex_init(&block->lock);
+ mutex_init(&block->proto_destroy_lock);
+ init_rwsem(&block->cb_lock);
flow_block_init(&block->flow_block);
INIT_LIST_HEAD(&block->chain_list);
INIT_LIST_HEAD(&block->owner_list);
@@ -1526,6 +1440,8 @@ tcf_block_playback_offloads(struct tcf_block *block, flow_setup_cb_t *cb,
struct tcf_proto *tp, *tp_prev;
int err;
+ lockdep_assert_held(&block->cb_lock);
+
for (chain = __tcf_get_next_chain(block, NULL);
chain;
chain_prev = chain,
@@ -1564,6 +1480,8 @@ static int tcf_block_bind(struct tcf_block *block,
struct flow_block_cb *block_cb, *next;
int err, i = 0;
+ lockdep_assert_held(&block->cb_lock);
+
list_for_each_entry(block_cb, &bo->cb_list, list) {
err = tcf_block_playback_offloads(block, block_cb->cb,
block_cb->cb_priv, true,
@@ -1571,6 +1489,8 @@ static int tcf_block_bind(struct tcf_block *block,
bo->extack);
if (err)
goto err_unroll;
+ if (!bo->unlocked_driver_cb)
+ block->lockeddevcnt++;
i++;
}
@@ -1586,6 +1506,8 @@ err_unroll:
block_cb->cb_priv, false,
tcf_block_offload_in_use(block),
NULL);
+ if (!bo->unlocked_driver_cb)
+ block->lockeddevcnt--;
}
flow_block_cb_free(block_cb);
}
@@ -1598,6 +1520,8 @@ static void tcf_block_unbind(struct tcf_block *block,
{
struct flow_block_cb *block_cb, *next;
+ lockdep_assert_held(&block->cb_lock);
+
list_for_each_entry_safe(block_cb, next, &bo->cb_list, list) {
tcf_block_playback_offloads(block, block_cb->cb,
block_cb->cb_priv, false,
@@ -1605,6 +1529,8 @@ static void tcf_block_unbind(struct tcf_block *block,
NULL);
list_del(&block_cb->list);
flow_block_cb_free(block_cb);
+ if (!bo->unlocked_driver_cb)
+ block->lockeddevcnt--;
}
}
@@ -1659,6 +1585,18 @@ reclassify:
goto reset;
} else if (unlikely(TC_ACT_EXT_CMP(err, TC_ACT_GOTO_CHAIN))) {
first_tp = res->goto_tp;
+
+#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
+ {
+ struct tc_skb_ext *ext;
+
+ ext = skb_ext_add(skb, TC_SKB_EXT);
+ if (WARN_ON_ONCE(!ext))
+ return TC_ACT_SHOT;
+
+ ext->chain = err & TC_ACT_EXT_VAL_MASK;
+ }
+#endif
goto reset;
}
#endif
@@ -1743,6 +1681,12 @@ static struct tcf_proto *tcf_chain_tp_insert_unique(struct tcf_chain *chain,
mutex_lock(&chain->filter_chain_lock);
+ if (tcf_proto_exists_destroying(chain, tp_new)) {
+ mutex_unlock(&chain->filter_chain_lock);
+ tcf_proto_destroy(tp_new, rtnl_held, false, NULL);
+ return ERR_PTR(-EAGAIN);
+ }
+
tp = tcf_chain_tp_find(chain, &chain_info,
protocol, prio, false);
if (!tp)
@@ -1750,10 +1694,10 @@ static struct tcf_proto *tcf_chain_tp_insert_unique(struct tcf_chain *chain,
mutex_unlock(&chain->filter_chain_lock);
if (tp) {
- tcf_proto_destroy(tp_new, rtnl_held, NULL);
+ tcf_proto_destroy(tp_new, rtnl_held, false, NULL);
tp_new = tp;
} else if (err) {
- tcf_proto_destroy(tp_new, rtnl_held, NULL);
+ tcf_proto_destroy(tp_new, rtnl_held, false, NULL);
tp_new = ERR_PTR(err);
}
@@ -1786,11 +1730,12 @@ static void tcf_chain_tp_delete_empty(struct tcf_chain *chain,
* concurrently.
* Mark tp for deletion if it is empty.
*/
- if (!tp_iter || !tcf_proto_check_delete(tp, rtnl_held)) {
+ if (!tp_iter || !tcf_proto_check_delete(tp)) {
mutex_unlock(&chain->filter_chain_lock);
return;
}
+ tcf_proto_signal_destroying(chain, tp);
next = tcf_chain_dereference(chain_info.next, chain);
if (tp == chain->filter_chain)
tcf_chain0_head_change(chain, next);
@@ -1976,6 +1921,7 @@ static int tc_new_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
{
struct net *net = sock_net(skb->sk);
struct nlattr *tca[TCA_MAX + 1];
+ char name[IFNAMSIZ];
struct tcmsg *t;
u32 protocol;
u32 prio;
@@ -2032,13 +1978,19 @@ replay:
if (err)
return err;
+ if (tcf_proto_check_kind(tca[TCA_KIND], name)) {
+ NL_SET_ERR_MSG(extack, "Specified TC filter name too long");
+ err = -EINVAL;
+ goto errout;
+ }
+
/* Take rtnl mutex if rtnl_held was set to true on previous iteration,
* block is shared (no qdisc found), qdisc is not unlocked, classifier
* type is not specified, classifier is not unlocked.
*/
if (rtnl_held ||
(q && !(q->ops->cl_ops->flags & QDISC_CLASS_OPS_DOIT_UNLOCKED)) ||
- !tca[TCA_KIND] || !tcf_proto_is_unlocked(nla_data(tca[TCA_KIND]))) {
+ !tcf_proto_is_unlocked(name)) {
rtnl_held = true;
rtnl_lock();
}
@@ -2103,9 +2055,8 @@ replay:
&chain_info));
mutex_unlock(&chain->filter_chain_lock);
- tp_new = tcf_proto_create(nla_data(tca[TCA_KIND]),
- protocol, prio, chain, rtnl_held,
- extack);
+ tp_new = tcf_proto_create(name, protocol, prio, chain,
+ rtnl_held, extack);
if (IS_ERR(tp_new)) {
err = PTR_ERR(tp_new);
goto errout_tp;
@@ -2196,6 +2147,7 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
{
struct net *net = sock_net(skb->sk);
struct nlattr *tca[TCA_MAX + 1];
+ char name[IFNAMSIZ];
struct tcmsg *t;
u32 protocol;
u32 prio;
@@ -2235,13 +2187,18 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
if (err)
return err;
+ if (tcf_proto_check_kind(tca[TCA_KIND], name)) {
+ NL_SET_ERR_MSG(extack, "Specified TC filter name too long");
+ err = -EINVAL;
+ goto errout;
+ }
/* Take rtnl mutex if flushing whole chain, block is shared (no qdisc
* found), qdisc is not unlocked, classifier type is not specified,
* classifier is not unlocked.
*/
if (!prio ||
(q && !(q->ops->cl_ops->flags & QDISC_CLASS_OPS_DOIT_UNLOCKED)) ||
- !tca[TCA_KIND] || !tcf_proto_is_unlocked(nla_data(tca[TCA_KIND]))) {
+ !tcf_proto_is_unlocked(name)) {
rtnl_held = true;
rtnl_lock();
}
@@ -2297,6 +2254,7 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
err = -EINVAL;
goto errout_locked;
} else if (t->tcm_handle == 0) {
+ tcf_proto_signal_destroying(chain, tp);
tcf_chain_tp_remove(chain, &chain_info, tp);
mutex_unlock(&chain->filter_chain_lock);
@@ -2349,6 +2307,7 @@ static int tc_get_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
{
struct net *net = sock_net(skb->sk);
struct nlattr *tca[TCA_MAX + 1];
+ char name[IFNAMSIZ];
struct tcmsg *t;
u32 protocol;
u32 prio;
@@ -2385,12 +2344,17 @@ static int tc_get_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
if (err)
return err;
+ if (tcf_proto_check_kind(tca[TCA_KIND], name)) {
+ NL_SET_ERR_MSG(extack, "Specified TC filter name too long");
+ err = -EINVAL;
+ goto errout;
+ }
/* Take rtnl mutex if block is shared (no qdisc found), qdisc is not
* unlocked, classifier type is not specified, classifier is not
* unlocked.
*/
if ((q && !(q->ops->cl_ops->flags & QDISC_CLASS_OPS_DOIT_UNLOCKED)) ||
- !tca[TCA_KIND] || !tcf_proto_is_unlocked(nla_data(tca[TCA_KIND]))) {
+ !tcf_proto_is_unlocked(name)) {
rtnl_held = true;
rtnl_lock();
}
@@ -2749,13 +2713,19 @@ static int tc_chain_tmplt_add(struct tcf_chain *chain, struct net *net,
struct netlink_ext_ack *extack)
{
const struct tcf_proto_ops *ops;
+ char name[IFNAMSIZ];
void *tmplt_priv;
/* If kind is not set, user did not specify template. */
if (!tca[TCA_KIND])
return 0;
- ops = tcf_proto_lookup_ops(nla_data(tca[TCA_KIND]), true, extack);
+ if (tcf_proto_check_kind(tca[TCA_KIND], name)) {
+ NL_SET_ERR_MSG(extack, "Specified TC chain template name too long");
+ return -EINVAL;
+ }
+
+ ops = tcf_proto_lookup_ops(name, true, extack);
if (IS_ERR(ops))
return PTR_ERR(ops);
if (!ops->tmplt_create || !ops->tmplt_destroy || !ops->tmplt_dump) {
@@ -3027,8 +2997,10 @@ out:
void tcf_exts_destroy(struct tcf_exts *exts)
{
#ifdef CONFIG_NET_CLS_ACT
- tcf_action_destroy(exts->actions, TCA_ACT_UNBIND);
- kfree(exts->actions);
+ if (exts->actions) {
+ tcf_action_destroy(exts->actions, TCA_ACT_UNBIND);
+ kfree(exts->actions);
+ }
exts->nr_actions = 0;
#endif
}
@@ -3151,17 +3123,61 @@ int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts)
}
EXPORT_SYMBOL(tcf_exts_dump_stats);
-int tc_setup_cb_call(struct tcf_block *block, enum tc_setup_type type,
- void *type_data, bool err_stop)
+static void tcf_block_offload_inc(struct tcf_block *block, u32 *flags)
+{
+ if (*flags & TCA_CLS_FLAGS_IN_HW)
+ return;
+ *flags |= TCA_CLS_FLAGS_IN_HW;
+ atomic_inc(&block->offloadcnt);
+}
+
+static void tcf_block_offload_dec(struct tcf_block *block, u32 *flags)
+{
+ if (!(*flags & TCA_CLS_FLAGS_IN_HW))
+ return;
+ *flags &= ~TCA_CLS_FLAGS_IN_HW;
+ atomic_dec(&block->offloadcnt);
+}
+
+static void tc_cls_offload_cnt_update(struct tcf_block *block,
+ struct tcf_proto *tp, u32 *cnt,
+ u32 *flags, u32 diff, bool add)
+{
+ lockdep_assert_held(&block->cb_lock);
+
+ spin_lock(&tp->lock);
+ if (add) {
+ if (!*cnt)
+ tcf_block_offload_inc(block, flags);
+ *cnt += diff;
+ } else {
+ *cnt -= diff;
+ if (!*cnt)
+ tcf_block_offload_dec(block, flags);
+ }
+ spin_unlock(&tp->lock);
+}
+
+static void
+tc_cls_offload_cnt_reset(struct tcf_block *block, struct tcf_proto *tp,
+ u32 *cnt, u32 *flags)
+{
+ lockdep_assert_held(&block->cb_lock);
+
+ spin_lock(&tp->lock);
+ tcf_block_offload_dec(block, flags);
+ *cnt = 0;
+ spin_unlock(&tp->lock);
+}
+
+static int
+__tc_setup_cb_call(struct tcf_block *block, enum tc_setup_type type,
+ void *type_data, bool err_stop)
{
struct flow_block_cb *block_cb;
int ok_count = 0;
int err;
- /* Make sure all netdevs sharing this block are offload-capable. */
- if (block->nooffloaddevcnt && err_stop)
- return -EOPNOTSUPP;
-
list_for_each_entry(block_cb, &block->flow_block.cb_list, list) {
err = block_cb->cb(type, type_data, block_cb->cb_priv);
if (err) {
@@ -3173,17 +3189,261 @@ int tc_setup_cb_call(struct tcf_block *block, enum tc_setup_type type,
}
return ok_count;
}
+
+int tc_setup_cb_call(struct tcf_block *block, enum tc_setup_type type,
+ void *type_data, bool err_stop, bool rtnl_held)
+{
+ bool take_rtnl = READ_ONCE(block->lockeddevcnt) && !rtnl_held;
+ int ok_count;
+
+retry:
+ if (take_rtnl)
+ rtnl_lock();
+ down_read(&block->cb_lock);
+ /* Need to obtain rtnl lock if block is bound to devs that require it.
+ * In block bind code cb_lock is obtained while holding rtnl, so we must
+ * obtain the locks in same order here.
+ */
+ if (!rtnl_held && !take_rtnl && block->lockeddevcnt) {
+ up_read(&block->cb_lock);
+ take_rtnl = true;
+ goto retry;
+ }
+
+ ok_count = __tc_setup_cb_call(block, type, type_data, err_stop);
+
+ up_read(&block->cb_lock);
+ if (take_rtnl)
+ rtnl_unlock();
+ return ok_count;
+}
EXPORT_SYMBOL(tc_setup_cb_call);
+/* Non-destructive filter add. If filter that wasn't already in hardware is
+ * successfully offloaded, increment block offloads counter. On failure,
+ * previously offloaded filter is considered to be intact and offloads counter
+ * is not decremented.
+ */
+
+int tc_setup_cb_add(struct tcf_block *block, struct tcf_proto *tp,
+ enum tc_setup_type type, void *type_data, bool err_stop,
+ u32 *flags, unsigned int *in_hw_count, bool rtnl_held)
+{
+ bool take_rtnl = READ_ONCE(block->lockeddevcnt) && !rtnl_held;
+ int ok_count;
+
+retry:
+ if (take_rtnl)
+ rtnl_lock();
+ down_read(&block->cb_lock);
+ /* Need to obtain rtnl lock if block is bound to devs that require it.
+ * In block bind code cb_lock is obtained while holding rtnl, so we must
+ * obtain the locks in same order here.
+ */
+ if (!rtnl_held && !take_rtnl && block->lockeddevcnt) {
+ up_read(&block->cb_lock);
+ take_rtnl = true;
+ goto retry;
+ }
+
+ /* Make sure all netdevs sharing this block are offload-capable. */
+ if (block->nooffloaddevcnt && err_stop) {
+ ok_count = -EOPNOTSUPP;
+ goto err_unlock;
+ }
+
+ ok_count = __tc_setup_cb_call(block, type, type_data, err_stop);
+ if (ok_count < 0)
+ goto err_unlock;
+
+ if (tp->ops->hw_add)
+ tp->ops->hw_add(tp, type_data);
+ if (ok_count > 0)
+ tc_cls_offload_cnt_update(block, tp, in_hw_count, flags,
+ ok_count, true);
+err_unlock:
+ up_read(&block->cb_lock);
+ if (take_rtnl)
+ rtnl_unlock();
+ return ok_count < 0 ? ok_count : 0;
+}
+EXPORT_SYMBOL(tc_setup_cb_add);
+
+/* Destructive filter replace. If filter that wasn't already in hardware is
+ * successfully offloaded, increment block offload counter. On failure,
+ * previously offloaded filter is considered to be destroyed and offload counter
+ * is decremented.
+ */
+
+int tc_setup_cb_replace(struct tcf_block *block, struct tcf_proto *tp,
+ enum tc_setup_type type, void *type_data, bool err_stop,
+ u32 *old_flags, unsigned int *old_in_hw_count,
+ u32 *new_flags, unsigned int *new_in_hw_count,
+ bool rtnl_held)
+{
+ bool take_rtnl = READ_ONCE(block->lockeddevcnt) && !rtnl_held;
+ int ok_count;
+
+retry:
+ if (take_rtnl)
+ rtnl_lock();
+ down_read(&block->cb_lock);
+ /* Need to obtain rtnl lock if block is bound to devs that require it.
+ * In block bind code cb_lock is obtained while holding rtnl, so we must
+ * obtain the locks in same order here.
+ */
+ if (!rtnl_held && !take_rtnl && block->lockeddevcnt) {
+ up_read(&block->cb_lock);
+ take_rtnl = true;
+ goto retry;
+ }
+
+ /* Make sure all netdevs sharing this block are offload-capable. */
+ if (block->nooffloaddevcnt && err_stop) {
+ ok_count = -EOPNOTSUPP;
+ goto err_unlock;
+ }
+
+ tc_cls_offload_cnt_reset(block, tp, old_in_hw_count, old_flags);
+ if (tp->ops->hw_del)
+ tp->ops->hw_del(tp, type_data);
+
+ ok_count = __tc_setup_cb_call(block, type, type_data, err_stop);
+ if (ok_count < 0)
+ goto err_unlock;
+
+ if (tp->ops->hw_add)
+ tp->ops->hw_add(tp, type_data);
+ if (ok_count > 0)
+ tc_cls_offload_cnt_update(block, tp, new_in_hw_count,
+ new_flags, ok_count, true);
+err_unlock:
+ up_read(&block->cb_lock);
+ if (take_rtnl)
+ rtnl_unlock();
+ return ok_count < 0 ? ok_count : 0;
+}
+EXPORT_SYMBOL(tc_setup_cb_replace);
+
+/* Destroy filter and decrement block offload counter, if filter was previously
+ * offloaded.
+ */
+
+int tc_setup_cb_destroy(struct tcf_block *block, struct tcf_proto *tp,
+ enum tc_setup_type type, void *type_data, bool err_stop,
+ u32 *flags, unsigned int *in_hw_count, bool rtnl_held)
+{
+ bool take_rtnl = READ_ONCE(block->lockeddevcnt) && !rtnl_held;
+ int ok_count;
+
+retry:
+ if (take_rtnl)
+ rtnl_lock();
+ down_read(&block->cb_lock);
+ /* Need to obtain rtnl lock if block is bound to devs that require it.
+ * In block bind code cb_lock is obtained while holding rtnl, so we must
+ * obtain the locks in same order here.
+ */
+ if (!rtnl_held && !take_rtnl && block->lockeddevcnt) {
+ up_read(&block->cb_lock);
+ take_rtnl = true;
+ goto retry;
+ }
+
+ ok_count = __tc_setup_cb_call(block, type, type_data, err_stop);
+
+ tc_cls_offload_cnt_reset(block, tp, in_hw_count, flags);
+ if (tp->ops->hw_del)
+ tp->ops->hw_del(tp, type_data);
+
+ up_read(&block->cb_lock);
+ if (take_rtnl)
+ rtnl_unlock();
+ return ok_count < 0 ? ok_count : 0;
+}
+EXPORT_SYMBOL(tc_setup_cb_destroy);
+
+int tc_setup_cb_reoffload(struct tcf_block *block, struct tcf_proto *tp,
+ bool add, flow_setup_cb_t *cb,
+ enum tc_setup_type type, void *type_data,
+ void *cb_priv, u32 *flags, unsigned int *in_hw_count)
+{
+ int err = cb(type, type_data, cb_priv);
+
+ if (err) {
+ if (add && tc_skip_sw(*flags))
+ return err;
+ } else {
+ tc_cls_offload_cnt_update(block, tp, in_hw_count, flags, 1,
+ add);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(tc_setup_cb_reoffload);
+
+void tc_cleanup_flow_action(struct flow_action *flow_action)
+{
+ struct flow_action_entry *entry;
+ int i;
+
+ flow_action_for_each(i, entry, flow_action)
+ if (entry->destructor)
+ entry->destructor(entry->destructor_priv);
+}
+EXPORT_SYMBOL(tc_cleanup_flow_action);
+
+static void tcf_mirred_get_dev(struct flow_action_entry *entry,
+ const struct tc_action *act)
+{
+#ifdef CONFIG_NET_CLS_ACT
+ entry->dev = act->ops->get_dev(act, &entry->destructor);
+ if (!entry->dev)
+ return;
+ entry->destructor_priv = entry->dev;
+#endif
+}
+
+static void tcf_tunnel_encap_put_tunnel(void *priv)
+{
+ struct ip_tunnel_info *tunnel = priv;
+
+ kfree(tunnel);
+}
+
+static int tcf_tunnel_encap_get_tunnel(struct flow_action_entry *entry,
+ const struct tc_action *act)
+{
+ entry->tunnel = tcf_tunnel_info_copy(act);
+ if (!entry->tunnel)
+ return -ENOMEM;
+ entry->destructor = tcf_tunnel_encap_put_tunnel;
+ entry->destructor_priv = entry->tunnel;
+ return 0;
+}
+
+static void tcf_sample_get_group(struct flow_action_entry *entry,
+ const struct tc_action *act)
+{
+#ifdef CONFIG_NET_CLS_ACT
+ entry->sample.psample_group =
+ act->ops->get_psample_group(act, &entry->destructor);
+ entry->destructor_priv = entry->sample.psample_group;
+#endif
+}
+
int tc_setup_flow_action(struct flow_action *flow_action,
- const struct tcf_exts *exts)
+ const struct tcf_exts *exts, bool rtnl_held)
{
const struct tc_action *act;
- int i, j, k;
+ int i, j, k, err = 0;
if (!exts)
return 0;
+ if (!rtnl_held)
+ rtnl_lock();
+
j = 0;
tcf_exts_for_each_action(i, act, exts) {
struct flow_action_entry *entry;
@@ -3200,10 +3460,16 @@ int tc_setup_flow_action(struct flow_action *flow_action,
entry->chain_index = tcf_gact_goto_chain_index(act);
} else if (is_tcf_mirred_egress_redirect(act)) {
entry->id = FLOW_ACTION_REDIRECT;
- entry->dev = tcf_mirred_dev(act);
+ tcf_mirred_get_dev(entry, act);
} else if (is_tcf_mirred_egress_mirror(act)) {
entry->id = FLOW_ACTION_MIRRED;
- entry->dev = tcf_mirred_dev(act);
+ tcf_mirred_get_dev(entry, act);
+ } else if (is_tcf_mirred_ingress_redirect(act)) {
+ entry->id = FLOW_ACTION_REDIRECT_INGRESS;
+ tcf_mirred_get_dev(entry, act);
+ } else if (is_tcf_mirred_ingress_mirror(act)) {
+ entry->id = FLOW_ACTION_MIRRED_INGRESS;
+ tcf_mirred_get_dev(entry, act);
} else if (is_tcf_vlan(act)) {
switch (tcf_vlan_action(act)) {
case TCA_VLAN_ACT_PUSH:
@@ -3222,11 +3488,14 @@ int tc_setup_flow_action(struct flow_action *flow_action,
entry->vlan.prio = tcf_vlan_push_prio(act);
break;
default:
+ err = -EOPNOTSUPP;
goto err_out;
}
} else if (is_tcf_tunnel_set(act)) {
entry->id = FLOW_ACTION_TUNNEL_ENCAP;
- entry->tunnel = tcf_tunnel_info(act);
+ err = tcf_tunnel_encap_get_tunnel(entry, act);
+ if (err)
+ goto err_out;
} else if (is_tcf_tunnel_release(act)) {
entry->id = FLOW_ACTION_TUNNEL_DECAP;
} else if (is_tcf_pedit(act)) {
@@ -3239,6 +3508,7 @@ int tc_setup_flow_action(struct flow_action *flow_action,
entry->id = FLOW_ACTION_ADD;
break;
default:
+ err = -EOPNOTSUPP;
goto err_out;
}
entry->mangle.htype = tcf_pedit_htype(act, k);
@@ -3255,11 +3525,10 @@ int tc_setup_flow_action(struct flow_action *flow_action,
entry->mark = tcf_skbedit_mark(act);
} else if (is_tcf_sample(act)) {
entry->id = FLOW_ACTION_SAMPLE;
- entry->sample.psample_group =
- tcf_sample_psample_group(act);
entry->sample.trunc_size = tcf_sample_trunc_size(act);
entry->sample.truncate = tcf_sample_truncate(act);
entry->sample.rate = tcf_sample_rate(act);
+ tcf_sample_get_group(entry, act);
} else if (is_tcf_police(act)) {
entry->id = FLOW_ACTION_POLICE;
entry->police.burst = tcf_police_tcfp_burst(act);
@@ -3269,16 +3538,50 @@ int tc_setup_flow_action(struct flow_action *flow_action,
entry->id = FLOW_ACTION_CT;
entry->ct.action = tcf_ct_action(act);
entry->ct.zone = tcf_ct_zone(act);
+ } else if (is_tcf_mpls(act)) {
+ switch (tcf_mpls_action(act)) {
+ case TCA_MPLS_ACT_PUSH:
+ entry->id = FLOW_ACTION_MPLS_PUSH;
+ entry->mpls_push.proto = tcf_mpls_proto(act);
+ entry->mpls_push.label = tcf_mpls_label(act);
+ entry->mpls_push.tc = tcf_mpls_tc(act);
+ entry->mpls_push.bos = tcf_mpls_bos(act);
+ entry->mpls_push.ttl = tcf_mpls_ttl(act);
+ break;
+ case TCA_MPLS_ACT_POP:
+ entry->id = FLOW_ACTION_MPLS_POP;
+ entry->mpls_pop.proto = tcf_mpls_proto(act);
+ break;
+ case TCA_MPLS_ACT_MODIFY:
+ entry->id = FLOW_ACTION_MPLS_MANGLE;
+ entry->mpls_mangle.label = tcf_mpls_label(act);
+ entry->mpls_mangle.tc = tcf_mpls_tc(act);
+ entry->mpls_mangle.bos = tcf_mpls_bos(act);
+ entry->mpls_mangle.ttl = tcf_mpls_ttl(act);
+ break;
+ default:
+ goto err_out;
+ }
+ } else if (is_tcf_skbedit_ptype(act)) {
+ entry->id = FLOW_ACTION_PTYPE;
+ entry->ptype = tcf_skbedit_ptype(act);
} else {
+ err = -EOPNOTSUPP;
goto err_out;
}
if (!is_tcf_pedit(act))
j++;
}
- return 0;
+
err_out:
- return -EOPNOTSUPP;
+ if (!rtnl_held)
+ rtnl_unlock();
+
+ if (err)
+ tc_cleanup_flow_action(flow_action);
+
+ return err;
}
EXPORT_SYMBOL(tc_setup_flow_action);
@@ -3321,6 +3624,11 @@ static struct pernet_operations tcf_net_ops = {
.size = sizeof(struct tcf_net),
};
+static struct flow_indr_block_entry block_entry = {
+ .cb = tc_indr_block_get_and_cmd,
+ .list = LIST_HEAD_INIT(block_entry.list),
+};
+
static int __init tc_filter_init(void)
{
int err;
@@ -3333,10 +3641,7 @@ static int __init tc_filter_init(void)
if (err)
goto err_register_pernet_subsys;
- err = rhashtable_init(&indr_setup_block_ht,
- &tc_indr_setup_block_ht_params);
- if (err)
- goto err_rhash_setup_block_ht;
+ flow_indr_add_block_cb(&block_entry);
rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_new_tfilter, NULL,
RTNL_FLAG_DOIT_UNLOCKED);
@@ -3351,8 +3656,6 @@ static int __init tc_filter_init(void)
return 0;
-err_rhash_setup_block_ht:
- unregister_pernet_subsys(&tcf_net_ops);
err_register_pernet_subsys:
destroy_workqueue(tc_filter_wq);
return err;
diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c
index 4aafbe3d435c..f256a7c69093 100644
--- a/net/sched/cls_basic.c
+++ b/net/sched/cls_basic.c
@@ -263,12 +263,17 @@ skip:
}
}
-static void basic_bind_class(void *fh, u32 classid, unsigned long cl)
+static void basic_bind_class(void *fh, u32 classid, unsigned long cl, void *q,
+ unsigned long base)
{
struct basic_filter *f = fh;
- if (f && f->res.classid == classid)
- f->res.class = cl;
+ if (f && f->res.classid == classid) {
+ if (cl)
+ __tcf_bind_filter(q, &f->res, base);
+ else
+ __tcf_unbind_filter(q, &f->res);
+ }
}
static int basic_dump(struct net *net, struct tcf_proto *tp, void *fh,
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index 3f7a9c02b70c..6e3e63db0e01 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -162,18 +162,24 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog,
cls_bpf.name = obj->bpf_name;
cls_bpf.exts_integrated = obj->exts_integrated;
- if (oldprog)
- tcf_block_offload_dec(block, &oldprog->gen_flags);
+ if (oldprog && prog)
+ err = tc_setup_cb_replace(block, tp, TC_SETUP_CLSBPF, &cls_bpf,
+ skip_sw, &oldprog->gen_flags,
+ &oldprog->in_hw_count,
+ &prog->gen_flags, &prog->in_hw_count,
+ true);
+ else if (prog)
+ err = tc_setup_cb_add(block, tp, TC_SETUP_CLSBPF, &cls_bpf,
+ skip_sw, &prog->gen_flags,
+ &prog->in_hw_count, true);
+ else
+ err = tc_setup_cb_destroy(block, tp, TC_SETUP_CLSBPF, &cls_bpf,
+ skip_sw, &oldprog->gen_flags,
+ &oldprog->in_hw_count, true);
- err = tc_setup_cb_call(block, TC_SETUP_CLSBPF, &cls_bpf, skip_sw);
- if (prog) {
- if (err < 0) {
- cls_bpf_offload_cmd(tp, oldprog, prog, extack);
- return err;
- } else if (err > 0) {
- prog->in_hw_count = err;
- tcf_block_offload_inc(block, &prog->gen_flags);
- }
+ if (prog && err) {
+ cls_bpf_offload_cmd(tp, oldprog, prog, extack);
+ return err;
}
if (prog && skip_sw && !(prog->gen_flags & TCA_CLS_FLAGS_IN_HW))
@@ -230,7 +236,7 @@ static void cls_bpf_offload_update_stats(struct tcf_proto *tp,
cls_bpf.name = prog->bpf_name;
cls_bpf.exts_integrated = prog->exts_integrated;
- tc_setup_cb_call(block, TC_SETUP_CLSBPF, &cls_bpf, false);
+ tc_setup_cb_call(block, TC_SETUP_CLSBPF, &cls_bpf, false, true);
}
static int cls_bpf_init(struct tcf_proto *tp)
@@ -625,12 +631,17 @@ nla_put_failure:
return -1;
}
-static void cls_bpf_bind_class(void *fh, u32 classid, unsigned long cl)
+static void cls_bpf_bind_class(void *fh, u32 classid, unsigned long cl,
+ void *q, unsigned long base)
{
struct cls_bpf_prog *prog = fh;
- if (prog && prog->res.classid == classid)
- prog->res.class = cl;
+ if (prog && prog->res.classid == classid) {
+ if (cl)
+ __tcf_bind_filter(q, &prog->res, base);
+ else
+ __tcf_unbind_filter(q, &prog->res);
+ }
}
static void cls_bpf_walk(struct tcf_proto *tp, struct tcf_walker *arg,
@@ -673,15 +684,11 @@ static int cls_bpf_reoffload(struct tcf_proto *tp, bool add, flow_setup_cb_t *cb
cls_bpf.name = prog->bpf_name;
cls_bpf.exts_integrated = prog->exts_integrated;
- err = cb(TC_SETUP_CLSBPF, &cls_bpf, cb_priv);
- if (err) {
- if (add && tc_skip_sw(prog->gen_flags))
- return err;
- continue;
- }
-
- tc_cls_offload_cnt_update(block, &prog->in_hw_count,
- &prog->gen_flags, add);
+ err = tc_setup_cb_reoffload(block, tp, add, cb, TC_SETUP_CLSBPF,
+ &cls_bpf, cb_priv, &prog->gen_flags,
+ &prog->in_hw_count);
+ if (err)
+ return err;
}
return 0;
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 054123742e32..7e54d2ab5254 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -22,6 +22,8 @@
#include <net/ip.h>
#include <net/flow_dissector.h>
#include <net/geneve.h>
+#include <net/vxlan.h>
+#include <net/erspan.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
@@ -54,8 +56,13 @@ struct fl_flow_key {
struct flow_dissector_key_ip ip;
struct flow_dissector_key_ip enc_ip;
struct flow_dissector_key_enc_opts enc_opts;
- struct flow_dissector_key_ports tp_min;
- struct flow_dissector_key_ports tp_max;
+ union {
+ struct flow_dissector_key_ports tp;
+ struct {
+ struct flow_dissector_key_ports tp_min;
+ struct flow_dissector_key_ports tp_max;
+ };
+ } tp_range;
struct flow_dissector_key_ct ct;
} __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */
@@ -198,19 +205,19 @@ static bool fl_range_port_dst_cmp(struct cls_fl_filter *filter,
{
__be16 min_mask, max_mask, min_val, max_val;
- min_mask = htons(filter->mask->key.tp_min.dst);
- max_mask = htons(filter->mask->key.tp_max.dst);
- min_val = htons(filter->key.tp_min.dst);
- max_val = htons(filter->key.tp_max.dst);
+ min_mask = htons(filter->mask->key.tp_range.tp_min.dst);
+ max_mask = htons(filter->mask->key.tp_range.tp_max.dst);
+ min_val = htons(filter->key.tp_range.tp_min.dst);
+ max_val = htons(filter->key.tp_range.tp_max.dst);
if (min_mask && max_mask) {
- if (htons(key->tp.dst) < min_val ||
- htons(key->tp.dst) > max_val)
+ if (htons(key->tp_range.tp.dst) < min_val ||
+ htons(key->tp_range.tp.dst) > max_val)
return false;
/* skb does not have min and max values */
- mkey->tp_min.dst = filter->mkey.tp_min.dst;
- mkey->tp_max.dst = filter->mkey.tp_max.dst;
+ mkey->tp_range.tp_min.dst = filter->mkey.tp_range.tp_min.dst;
+ mkey->tp_range.tp_max.dst = filter->mkey.tp_range.tp_max.dst;
}
return true;
}
@@ -221,19 +228,19 @@ static bool fl_range_port_src_cmp(struct cls_fl_filter *filter,
{
__be16 min_mask, max_mask, min_val, max_val;
- min_mask = htons(filter->mask->key.tp_min.src);
- max_mask = htons(filter->mask->key.tp_max.src);
- min_val = htons(filter->key.tp_min.src);
- max_val = htons(filter->key.tp_max.src);
+ min_mask = htons(filter->mask->key.tp_range.tp_min.src);
+ max_mask = htons(filter->mask->key.tp_range.tp_max.src);
+ min_val = htons(filter->key.tp_range.tp_min.src);
+ max_val = htons(filter->key.tp_range.tp_max.src);
if (min_mask && max_mask) {
- if (htons(key->tp.src) < min_val ||
- htons(key->tp.src) > max_val)
+ if (htons(key->tp_range.tp.src) < min_val ||
+ htons(key->tp_range.tp.src) > max_val)
return false;
/* skb does not have min and max values */
- mkey->tp_min.src = filter->mkey.tp_min.src;
- mkey->tp_max.src = filter->mkey.tp_max.src;
+ mkey->tp_range.tp_min.src = filter->mkey.tp_range.tp_min.src;
+ mkey->tp_range.tp_max.src = filter->mkey.tp_range.tp_max.src;
}
return true;
}
@@ -412,41 +419,27 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f,
struct tcf_block *block = tp->chain->block;
struct flow_cls_offload cls_flower = {};
- if (!rtnl_held)
- rtnl_lock();
-
tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, extack);
cls_flower.command = FLOW_CLS_DESTROY;
cls_flower.cookie = (unsigned long) f;
- tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false);
- spin_lock(&tp->lock);
- list_del_init(&f->hw_list);
- tcf_block_offload_dec(block, &f->flags);
- spin_unlock(&tp->lock);
+ tc_setup_cb_destroy(block, tp, TC_SETUP_CLSFLOWER, &cls_flower, false,
+ &f->flags, &f->in_hw_count, rtnl_held);
- if (!rtnl_held)
- rtnl_unlock();
}
static int fl_hw_replace_filter(struct tcf_proto *tp,
struct cls_fl_filter *f, bool rtnl_held,
struct netlink_ext_ack *extack)
{
- struct cls_fl_head *head = fl_head_dereference(tp);
struct tcf_block *block = tp->chain->block;
struct flow_cls_offload cls_flower = {};
bool skip_sw = tc_skip_sw(f->flags);
int err = 0;
- if (!rtnl_held)
- rtnl_lock();
-
cls_flower.rule = flow_rule_alloc(tcf_exts_num_actions(&f->exts));
- if (!cls_flower.rule) {
- err = -ENOMEM;
- goto errout;
- }
+ if (!cls_flower.rule)
+ return -ENOMEM;
tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, extack);
cls_flower.command = FLOW_CLS_REPLACE;
@@ -456,43 +449,31 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
cls_flower.rule->match.key = &f->mkey;
cls_flower.classid = f->res.classid;
- err = tc_setup_flow_action(&cls_flower.rule->action, &f->exts);
+ err = tc_setup_flow_action(&cls_flower.rule->action, &f->exts,
+ rtnl_held);
if (err) {
kfree(cls_flower.rule);
- if (skip_sw)
+ if (skip_sw) {
NL_SET_ERR_MSG_MOD(extack, "Failed to setup flow action");
- else
- err = 0;
- goto errout;
+ return err;
+ }
+ return 0;
}
- err = tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, skip_sw);
+ err = tc_setup_cb_add(block, tp, TC_SETUP_CLSFLOWER, &cls_flower,
+ skip_sw, &f->flags, &f->in_hw_count, rtnl_held);
+ tc_cleanup_flow_action(&cls_flower.rule->action);
kfree(cls_flower.rule);
- if (err < 0) {
- fl_hw_destroy_filter(tp, f, true, NULL);
- goto errout;
- } else if (err > 0) {
- f->in_hw_count = err;
- err = 0;
- spin_lock(&tp->lock);
- tcf_block_offload_inc(block, &f->flags);
- spin_unlock(&tp->lock);
- }
-
- if (skip_sw && !(f->flags & TCA_CLS_FLAGS_IN_HW)) {
- err = -EINVAL;
- goto errout;
+ if (err) {
+ fl_hw_destroy_filter(tp, f, rtnl_held, NULL);
+ return err;
}
- spin_lock(&tp->lock);
- list_add(&f->hw_list, &head->hw_filters);
- spin_unlock(&tp->lock);
-errout:
- if (!rtnl_held)
- rtnl_unlock();
+ if (skip_sw && !(f->flags & TCA_CLS_FLAGS_IN_HW))
+ return -EINVAL;
- return err;
+ return 0;
}
static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f,
@@ -501,22 +482,17 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f,
struct tcf_block *block = tp->chain->block;
struct flow_cls_offload cls_flower = {};
- if (!rtnl_held)
- rtnl_lock();
-
tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, NULL);
cls_flower.command = FLOW_CLS_STATS;
cls_flower.cookie = (unsigned long) f;
cls_flower.classid = f->res.classid;
- tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false);
+ tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false,
+ rtnl_held);
tcf_exts_stats_update(&f->exts, cls_flower.stats.bytes,
cls_flower.stats.pkts,
cls_flower.stats.lastused);
-
- if (!rtnl_held)
- rtnl_unlock();
}
static void __fl_put(struct cls_fl_filter *f)
@@ -715,11 +691,16 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = {
.len = 128 / BITS_PER_BYTE },
[TCA_FLOWER_KEY_CT_LABELS_MASK] = { .type = NLA_BINARY,
.len = 128 / BITS_PER_BYTE },
+ [TCA_FLOWER_FLAGS] = { .type = NLA_U32 },
};
static const struct nla_policy
enc_opts_policy[TCA_FLOWER_KEY_ENC_OPTS_MAX + 1] = {
+ [TCA_FLOWER_KEY_ENC_OPTS_UNSPEC] = {
+ .strict_start_type = TCA_FLOWER_KEY_ENC_OPTS_VXLAN },
[TCA_FLOWER_KEY_ENC_OPTS_GENEVE] = { .type = NLA_NESTED },
+ [TCA_FLOWER_KEY_ENC_OPTS_VXLAN] = { .type = NLA_NESTED },
+ [TCA_FLOWER_KEY_ENC_OPTS_ERSPAN] = { .type = NLA_NESTED },
};
static const struct nla_policy
@@ -730,6 +711,19 @@ geneve_opt_policy[TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX + 1] = {
.len = 128 },
};
+static const struct nla_policy
+vxlan_opt_policy[TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX + 1] = {
+ [TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP] = { .type = NLA_U32 },
+};
+
+static const struct nla_policy
+erspan_opt_policy[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX + 1] = {
+ [TCA_FLOWER_KEY_ENC_OPT_ERSPAN_VER] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX] = { .type = NLA_U32 },
+ [TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID] = { .type = NLA_U8 },
+};
+
static void fl_set_key_val(struct nlattr **tb,
void *val, int val_type,
void *mask, int mask_type, int len)
@@ -746,23 +740,25 @@ static void fl_set_key_val(struct nlattr **tb,
static int fl_set_key_port_range(struct nlattr **tb, struct fl_flow_key *key,
struct fl_flow_key *mask)
{
- fl_set_key_val(tb, &key->tp_min.dst,
- TCA_FLOWER_KEY_PORT_DST_MIN, &mask->tp_min.dst,
- TCA_FLOWER_UNSPEC, sizeof(key->tp_min.dst));
- fl_set_key_val(tb, &key->tp_max.dst,
- TCA_FLOWER_KEY_PORT_DST_MAX, &mask->tp_max.dst,
- TCA_FLOWER_UNSPEC, sizeof(key->tp_max.dst));
- fl_set_key_val(tb, &key->tp_min.src,
- TCA_FLOWER_KEY_PORT_SRC_MIN, &mask->tp_min.src,
- TCA_FLOWER_UNSPEC, sizeof(key->tp_min.src));
- fl_set_key_val(tb, &key->tp_max.src,
- TCA_FLOWER_KEY_PORT_SRC_MAX, &mask->tp_max.src,
- TCA_FLOWER_UNSPEC, sizeof(key->tp_max.src));
-
- if ((mask->tp_min.dst && mask->tp_max.dst &&
- htons(key->tp_max.dst) <= htons(key->tp_min.dst)) ||
- (mask->tp_min.src && mask->tp_max.src &&
- htons(key->tp_max.src) <= htons(key->tp_min.src)))
+ fl_set_key_val(tb, &key->tp_range.tp_min.dst,
+ TCA_FLOWER_KEY_PORT_DST_MIN, &mask->tp_range.tp_min.dst,
+ TCA_FLOWER_UNSPEC, sizeof(key->tp_range.tp_min.dst));
+ fl_set_key_val(tb, &key->tp_range.tp_max.dst,
+ TCA_FLOWER_KEY_PORT_DST_MAX, &mask->tp_range.tp_max.dst,
+ TCA_FLOWER_UNSPEC, sizeof(key->tp_range.tp_max.dst));
+ fl_set_key_val(tb, &key->tp_range.tp_min.src,
+ TCA_FLOWER_KEY_PORT_SRC_MIN, &mask->tp_range.tp_min.src,
+ TCA_FLOWER_UNSPEC, sizeof(key->tp_range.tp_min.src));
+ fl_set_key_val(tb, &key->tp_range.tp_max.src,
+ TCA_FLOWER_KEY_PORT_SRC_MAX, &mask->tp_range.tp_max.src,
+ TCA_FLOWER_UNSPEC, sizeof(key->tp_range.tp_max.src));
+
+ if ((mask->tp_range.tp_min.dst && mask->tp_range.tp_max.dst &&
+ htons(key->tp_range.tp_max.dst) <=
+ htons(key->tp_range.tp_min.dst)) ||
+ (mask->tp_range.tp_min.src && mask->tp_range.tp_max.src &&
+ htons(key->tp_range.tp_max.src) <=
+ htons(key->tp_range.tp_min.src)))
return -EINVAL;
return 0;
@@ -959,6 +955,105 @@ static int fl_set_geneve_opt(const struct nlattr *nla, struct fl_flow_key *key,
return sizeof(struct geneve_opt) + data_len;
}
+static int fl_set_vxlan_opt(const struct nlattr *nla, struct fl_flow_key *key,
+ int depth, int option_len,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX + 1];
+ struct vxlan_metadata *md;
+ int err;
+
+ md = (struct vxlan_metadata *)&key->enc_opts.data[key->enc_opts.len];
+ memset(md, 0xff, sizeof(*md));
+
+ if (!depth)
+ return sizeof(*md);
+
+ if (nla_type(nla) != TCA_FLOWER_KEY_ENC_OPTS_VXLAN) {
+ NL_SET_ERR_MSG(extack, "Non-vxlan option type for mask");
+ return -EINVAL;
+ }
+
+ err = nla_parse_nested(tb, TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX, nla,
+ vxlan_opt_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (!option_len && !tb[TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP]) {
+ NL_SET_ERR_MSG(extack, "Missing tunnel key vxlan option gbp");
+ return -EINVAL;
+ }
+
+ if (tb[TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP])
+ md->gbp = nla_get_u32(tb[TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP]);
+
+ return sizeof(*md);
+}
+
+static int fl_set_erspan_opt(const struct nlattr *nla, struct fl_flow_key *key,
+ int depth, int option_len,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX + 1];
+ struct erspan_metadata *md;
+ int err;
+
+ md = (struct erspan_metadata *)&key->enc_opts.data[key->enc_opts.len];
+ memset(md, 0xff, sizeof(*md));
+ md->version = 1;
+
+ if (!depth)
+ return sizeof(*md);
+
+ if (nla_type(nla) != TCA_FLOWER_KEY_ENC_OPTS_ERSPAN) {
+ NL_SET_ERR_MSG(extack, "Non-erspan option type for mask");
+ return -EINVAL;
+ }
+
+ err = nla_parse_nested(tb, TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX, nla,
+ erspan_opt_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (!option_len && !tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_VER]) {
+ NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option ver");
+ return -EINVAL;
+ }
+
+ if (tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_VER])
+ md->version = nla_get_u8(tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_VER]);
+
+ if (md->version == 1) {
+ if (!option_len && !tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX]) {
+ NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option index");
+ return -EINVAL;
+ }
+ if (tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX]) {
+ nla = tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX];
+ md->u.index = nla_get_be32(nla);
+ }
+ } else if (md->version == 2) {
+ if (!option_len && (!tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR] ||
+ !tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID])) {
+ NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option dir or hwid");
+ return -EINVAL;
+ }
+ if (tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR]) {
+ nla = tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR];
+ md->u.md2.dir = nla_get_u8(nla);
+ }
+ if (tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID]) {
+ nla = tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID];
+ set_hwid(&md->u.md2, nla_get_u8(nla));
+ }
+ } else {
+ NL_SET_ERR_MSG(extack, "Tunnel key erspan option ver is incorrect");
+ return -EINVAL;
+ }
+
+ return sizeof(*md);
+}
+
static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key,
struct fl_flow_key *mask,
struct netlink_ext_ack *extack)
@@ -989,6 +1084,11 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key,
nla_len(tb[TCA_FLOWER_KEY_ENC_OPTS]), key_depth) {
switch (nla_type(nla_opt_key)) {
case TCA_FLOWER_KEY_ENC_OPTS_GENEVE:
+ if (key->enc_opts.dst_opt_type &&
+ key->enc_opts.dst_opt_type != TUNNEL_GENEVE_OPT) {
+ NL_SET_ERR_MSG(extack, "Duplicate type for geneve options");
+ return -EINVAL;
+ }
option_len = 0;
key->enc_opts.dst_opt_type = TUNNEL_GENEVE_OPT;
option_len = fl_set_geneve_opt(nla_opt_key, key,
@@ -1017,6 +1117,72 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key,
if (msk_depth)
nla_opt_msk = nla_next(nla_opt_msk, &msk_depth);
break;
+ case TCA_FLOWER_KEY_ENC_OPTS_VXLAN:
+ if (key->enc_opts.dst_opt_type) {
+ NL_SET_ERR_MSG(extack, "Duplicate type for vxlan options");
+ return -EINVAL;
+ }
+ option_len = 0;
+ key->enc_opts.dst_opt_type = TUNNEL_VXLAN_OPT;
+ option_len = fl_set_vxlan_opt(nla_opt_key, key,
+ key_depth, option_len,
+ extack);
+ if (option_len < 0)
+ return option_len;
+
+ key->enc_opts.len += option_len;
+ /* At the same time we need to parse through the mask
+ * in order to verify exact and mask attribute lengths.
+ */
+ mask->enc_opts.dst_opt_type = TUNNEL_VXLAN_OPT;
+ option_len = fl_set_vxlan_opt(nla_opt_msk, mask,
+ msk_depth, option_len,
+ extack);
+ if (option_len < 0)
+ return option_len;
+
+ mask->enc_opts.len += option_len;
+ if (key->enc_opts.len != mask->enc_opts.len) {
+ NL_SET_ERR_MSG(extack, "Key and mask miss aligned");
+ return -EINVAL;
+ }
+
+ if (msk_depth)
+ nla_opt_msk = nla_next(nla_opt_msk, &msk_depth);
+ break;
+ case TCA_FLOWER_KEY_ENC_OPTS_ERSPAN:
+ if (key->enc_opts.dst_opt_type) {
+ NL_SET_ERR_MSG(extack, "Duplicate type for erspan options");
+ return -EINVAL;
+ }
+ option_len = 0;
+ key->enc_opts.dst_opt_type = TUNNEL_ERSPAN_OPT;
+ option_len = fl_set_erspan_opt(nla_opt_key, key,
+ key_depth, option_len,
+ extack);
+ if (option_len < 0)
+ return option_len;
+
+ key->enc_opts.len += option_len;
+ /* At the same time we need to parse through the mask
+ * in order to verify exact and mask attribute lengths.
+ */
+ mask->enc_opts.dst_opt_type = TUNNEL_ERSPAN_OPT;
+ option_len = fl_set_erspan_opt(nla_opt_msk, mask,
+ msk_depth, option_len,
+ extack);
+ if (option_len < 0)
+ return option_len;
+
+ mask->enc_opts.len += option_len;
+ if (key->enc_opts.len != mask->enc_opts.len) {
+ NL_SET_ERR_MSG(extack, "Key and mask miss aligned");
+ return -EINVAL;
+ }
+
+ if (msk_depth)
+ nla_opt_msk = nla_next(nla_opt_msk, &msk_depth);
+ break;
default:
NL_SET_ERR_MSG(extack, "Unknown tunnel option type");
return -EINVAL;
@@ -1316,7 +1482,7 @@ static int fl_init_mask_hashtable(struct fl_flow_mask *mask)
}
#define FL_KEY_MEMBER_OFFSET(member) offsetof(struct fl_flow_key, member)
-#define FL_KEY_MEMBER_SIZE(member) FIELD_SIZEOF(struct fl_flow_key, member)
+#define FL_KEY_MEMBER_SIZE(member) sizeof_field(struct fl_flow_key, member)
#define FL_KEY_IS_MASKED(mask, member) \
memchr_inv(((char *)mask) + FL_KEY_MEMBER_OFFSET(member), \
@@ -1351,9 +1517,10 @@ static void fl_init_dissector(struct flow_dissector *dissector,
FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4);
FL_KEY_SET_IF_MASKED(mask, keys, cnt,
FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6);
- if (FL_KEY_IS_MASKED(mask, tp) ||
- FL_KEY_IS_MASKED(mask, tp_min) || FL_KEY_IS_MASKED(mask, tp_max))
- FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_PORTS, tp);
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
+ FLOW_DISSECTOR_KEY_PORTS, tp);
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
+ FLOW_DISSECTOR_KEY_PORTS_RANGE, tp_range);
FL_KEY_SET_IF_MASKED(mask, keys, cnt,
FLOW_DISSECTOR_KEY_IP, ip);
FL_KEY_SET_IF_MASKED(mask, keys, cnt,
@@ -1402,8 +1569,10 @@ static struct fl_flow_mask *fl_create_new_mask(struct cls_fl_head *head,
fl_mask_copy(newmask, mask);
- if ((newmask->key.tp_min.dst && newmask->key.tp_max.dst) ||
- (newmask->key.tp_min.src && newmask->key.tp_max.src))
+ if ((newmask->key.tp_range.tp_min.dst &&
+ newmask->key.tp_range.tp_max.dst) ||
+ (newmask->key.tp_range.tp_min.src &&
+ newmask->key.tp_range.tp_max.src))
newmask->flags |= TCA_FLOWER_MASK_FLAGS_RANGE;
err = fl_init_mask_hashtable(newmask);
@@ -1831,7 +2000,8 @@ static int fl_reoffload(struct tcf_proto *tp, bool add, flow_setup_cb_t *cb,
cls_flower.rule->match.mask = &f->mask->key;
cls_flower.rule->match.key = &f->mkey;
- err = tc_setup_flow_action(&cls_flower.rule->action, &f->exts);
+ err = tc_setup_flow_action(&cls_flower.rule->action, &f->exts,
+ true);
if (err) {
kfree(cls_flower.rule);
if (tc_skip_sw(f->flags)) {
@@ -1844,21 +2014,17 @@ static int fl_reoffload(struct tcf_proto *tp, bool add, flow_setup_cb_t *cb,
cls_flower.classid = f->res.classid;
- err = cb(TC_SETUP_CLSFLOWER, &cls_flower, cb_priv);
+ err = tc_setup_cb_reoffload(block, tp, add, cb,
+ TC_SETUP_CLSFLOWER, &cls_flower,
+ cb_priv, &f->flags,
+ &f->in_hw_count);
+ tc_cleanup_flow_action(&cls_flower.rule->action);
kfree(cls_flower.rule);
if (err) {
- if (add && tc_skip_sw(f->flags)) {
- __fl_put(f);
- return err;
- }
- goto next_flow;
+ __fl_put(f);
+ return err;
}
-
- spin_lock(&tp->lock);
- tc_cls_offload_cnt_update(block, &f->in_hw_count, &f->flags,
- add);
- spin_unlock(&tp->lock);
next_flow:
__fl_put(f);
}
@@ -1866,6 +2032,30 @@ next_flow:
return 0;
}
+static void fl_hw_add(struct tcf_proto *tp, void *type_data)
+{
+ struct flow_cls_offload *cls_flower = type_data;
+ struct cls_fl_filter *f =
+ (struct cls_fl_filter *) cls_flower->cookie;
+ struct cls_fl_head *head = fl_head_dereference(tp);
+
+ spin_lock(&tp->lock);
+ list_add(&f->hw_list, &head->hw_filters);
+ spin_unlock(&tp->lock);
+}
+
+static void fl_hw_del(struct tcf_proto *tp, void *type_data)
+{
+ struct flow_cls_offload *cls_flower = type_data;
+ struct cls_fl_filter *f =
+ (struct cls_fl_filter *) cls_flower->cookie;
+
+ spin_lock(&tp->lock);
+ if (!list_empty(&f->hw_list))
+ list_del_init(&f->hw_list);
+ spin_unlock(&tp->lock);
+}
+
static int fl_hw_create_tmplt(struct tcf_chain *chain,
struct fl_flow_tmplt *tmplt)
{
@@ -1886,7 +2076,7 @@ static int fl_hw_create_tmplt(struct tcf_chain *chain,
/* We don't care if driver (any of them) fails to handle this
* call. It serves just as a hint for it.
*/
- tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false);
+ tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false, true);
kfree(cls_flower.rule);
return 0;
@@ -1902,7 +2092,7 @@ static void fl_hw_destroy_tmplt(struct tcf_chain *chain,
cls_flower.command = FLOW_CLS_TMPLT_DESTROY;
cls_flower.cookie = (unsigned long) tmplt;
- tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false);
+ tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false, true);
}
static void *fl_tmplt_create(struct net *net, struct tcf_chain *chain,
@@ -1980,18 +2170,22 @@ static int fl_dump_key_val(struct sk_buff *skb,
static int fl_dump_key_port_range(struct sk_buff *skb, struct fl_flow_key *key,
struct fl_flow_key *mask)
{
- if (fl_dump_key_val(skb, &key->tp_min.dst, TCA_FLOWER_KEY_PORT_DST_MIN,
- &mask->tp_min.dst, TCA_FLOWER_UNSPEC,
- sizeof(key->tp_min.dst)) ||
- fl_dump_key_val(skb, &key->tp_max.dst, TCA_FLOWER_KEY_PORT_DST_MAX,
- &mask->tp_max.dst, TCA_FLOWER_UNSPEC,
- sizeof(key->tp_max.dst)) ||
- fl_dump_key_val(skb, &key->tp_min.src, TCA_FLOWER_KEY_PORT_SRC_MIN,
- &mask->tp_min.src, TCA_FLOWER_UNSPEC,
- sizeof(key->tp_min.src)) ||
- fl_dump_key_val(skb, &key->tp_max.src, TCA_FLOWER_KEY_PORT_SRC_MAX,
- &mask->tp_max.src, TCA_FLOWER_UNSPEC,
- sizeof(key->tp_max.src)))
+ if (fl_dump_key_val(skb, &key->tp_range.tp_min.dst,
+ TCA_FLOWER_KEY_PORT_DST_MIN,
+ &mask->tp_range.tp_min.dst, TCA_FLOWER_UNSPEC,
+ sizeof(key->tp_range.tp_min.dst)) ||
+ fl_dump_key_val(skb, &key->tp_range.tp_max.dst,
+ TCA_FLOWER_KEY_PORT_DST_MAX,
+ &mask->tp_range.tp_max.dst, TCA_FLOWER_UNSPEC,
+ sizeof(key->tp_range.tp_max.dst)) ||
+ fl_dump_key_val(skb, &key->tp_range.tp_min.src,
+ TCA_FLOWER_KEY_PORT_SRC_MIN,
+ &mask->tp_range.tp_min.src, TCA_FLOWER_UNSPEC,
+ sizeof(key->tp_range.tp_min.src)) ||
+ fl_dump_key_val(skb, &key->tp_range.tp_max.src,
+ TCA_FLOWER_KEY_PORT_SRC_MAX,
+ &mask->tp_range.tp_max.src, TCA_FLOWER_UNSPEC,
+ sizeof(key->tp_range.tp_max.src)))
return -1;
return 0;
@@ -2145,6 +2339,61 @@ nla_put_failure:
return -EMSGSIZE;
}
+static int fl_dump_key_vxlan_opt(struct sk_buff *skb,
+ struct flow_dissector_key_enc_opts *enc_opts)
+{
+ struct vxlan_metadata *md;
+ struct nlattr *nest;
+
+ nest = nla_nest_start_noflag(skb, TCA_FLOWER_KEY_ENC_OPTS_VXLAN);
+ if (!nest)
+ goto nla_put_failure;
+
+ md = (struct vxlan_metadata *)&enc_opts->data[0];
+ if (nla_put_u32(skb, TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP, md->gbp))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
+static int fl_dump_key_erspan_opt(struct sk_buff *skb,
+ struct flow_dissector_key_enc_opts *enc_opts)
+{
+ struct erspan_metadata *md;
+ struct nlattr *nest;
+
+ nest = nla_nest_start_noflag(skb, TCA_FLOWER_KEY_ENC_OPTS_ERSPAN);
+ if (!nest)
+ goto nla_put_failure;
+
+ md = (struct erspan_metadata *)&enc_opts->data[0];
+ if (nla_put_u8(skb, TCA_FLOWER_KEY_ENC_OPT_ERSPAN_VER, md->version))
+ goto nla_put_failure;
+
+ if (md->version == 1 &&
+ nla_put_be32(skb, TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX, md->u.index))
+ goto nla_put_failure;
+
+ if (md->version == 2 &&
+ (nla_put_u8(skb, TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR,
+ md->u.md2.dir) ||
+ nla_put_u8(skb, TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID,
+ get_hwid(&md->u.md2))))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
static int fl_dump_key_ct(struct sk_buff *skb,
struct flow_dissector_key_ct *key,
struct flow_dissector_key_ct *mask)
@@ -2198,6 +2447,16 @@ static int fl_dump_key_options(struct sk_buff *skb, int enc_opt_type,
if (err)
goto nla_put_failure;
break;
+ case TUNNEL_VXLAN_OPT:
+ err = fl_dump_key_vxlan_opt(skb, enc_opts);
+ if (err)
+ goto nla_put_failure;
+ break;
+ case TUNNEL_ERSPAN_OPT:
+ err = fl_dump_key_erspan_opt(skb, enc_opts);
+ if (err)
+ goto nla_put_failure;
+ break;
default:
goto nla_put_failure;
}
@@ -2507,12 +2766,28 @@ nla_put_failure:
return -EMSGSIZE;
}
-static void fl_bind_class(void *fh, u32 classid, unsigned long cl)
+static void fl_bind_class(void *fh, u32 classid, unsigned long cl, void *q,
+ unsigned long base)
{
struct cls_fl_filter *f = fh;
- if (f && f->res.classid == classid)
- f->res.class = cl;
+ if (f && f->res.classid == classid) {
+ if (cl)
+ __tcf_bind_filter(q, &f->res, base);
+ else
+ __tcf_unbind_filter(q, &f->res);
+ }
+}
+
+static bool fl_delete_empty(struct tcf_proto *tp)
+{
+ struct cls_fl_head *head = fl_head_dereference(tp);
+
+ spin_lock(&tp->lock);
+ tp->deleting = idr_is_empty(&head->handle_idr);
+ spin_unlock(&tp->lock);
+
+ return tp->deleting;
}
static struct tcf_proto_ops cls_fl_ops __read_mostly = {
@@ -2524,8 +2799,11 @@ static struct tcf_proto_ops cls_fl_ops __read_mostly = {
.put = fl_put,
.change = fl_change,
.delete = fl_delete,
+ .delete_empty = fl_delete_empty,
.walk = fl_walk,
.reoffload = fl_reoffload,
+ .hw_add = fl_hw_add,
+ .hw_del = fl_hw_del,
.dump = fl_dump,
.bind_class = fl_bind_class,
.tmplt_create = fl_tmplt_create,
diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c
index c9496c920d6f..ec945294626a 100644
--- a/net/sched/cls_fw.c
+++ b/net/sched/cls_fw.c
@@ -419,12 +419,17 @@ nla_put_failure:
return -1;
}
-static void fw_bind_class(void *fh, u32 classid, unsigned long cl)
+static void fw_bind_class(void *fh, u32 classid, unsigned long cl, void *q,
+ unsigned long base)
{
struct fw_filter *f = fh;
- if (f && f->res.classid == classid)
- f->res.class = cl;
+ if (f && f->res.classid == classid) {
+ if (cl)
+ __tcf_bind_filter(q, &f->res, base);
+ else
+ __tcf_unbind_filter(q, &f->res);
+ }
}
static struct tcf_proto_ops cls_fw_ops __read_mostly = {
diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index 455ea2793f9b..610a0b728161 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c
@@ -75,8 +75,8 @@ static void mall_destroy_hw_filter(struct tcf_proto *tp,
cls_mall.command = TC_CLSMATCHALL_DESTROY;
cls_mall.cookie = cookie;
- tc_setup_cb_call(block, TC_SETUP_CLSMATCHALL, &cls_mall, false);
- tcf_block_offload_dec(block, &head->flags);
+ tc_setup_cb_destroy(block, tp, TC_SETUP_CLSMATCHALL, &cls_mall, false,
+ &head->flags, &head->in_hw_count, true);
}
static int mall_replace_hw_filter(struct tcf_proto *tp,
@@ -97,7 +97,7 @@ static int mall_replace_hw_filter(struct tcf_proto *tp,
cls_mall.command = TC_CLSMATCHALL_REPLACE;
cls_mall.cookie = cookie;
- err = tc_setup_flow_action(&cls_mall.rule->action, &head->exts);
+ err = tc_setup_flow_action(&cls_mall.rule->action, &head->exts, true);
if (err) {
kfree(cls_mall.rule);
mall_destroy_hw_filter(tp, head, cookie, NULL);
@@ -109,15 +109,14 @@ static int mall_replace_hw_filter(struct tcf_proto *tp,
return err;
}
- err = tc_setup_cb_call(block, TC_SETUP_CLSMATCHALL, &cls_mall, skip_sw);
+ err = tc_setup_cb_add(block, tp, TC_SETUP_CLSMATCHALL, &cls_mall,
+ skip_sw, &head->flags, &head->in_hw_count, true);
+ tc_cleanup_flow_action(&cls_mall.rule->action);
kfree(cls_mall.rule);
- if (err < 0) {
+ if (err) {
mall_destroy_hw_filter(tp, head, cookie, NULL);
return err;
- } else if (err > 0) {
- head->in_hw_count = err;
- tcf_block_offload_inc(block, &head->flags);
}
if (skip_sw && !(head->flags & TCA_CLS_FLAGS_IN_HW))
@@ -158,6 +157,7 @@ static void *mall_get(struct tcf_proto *tp, u32 handle)
static const struct nla_policy mall_policy[TCA_MATCHALL_MAX + 1] = {
[TCA_MATCHALL_UNSPEC] = { .type = NLA_UNSPEC },
[TCA_MATCHALL_CLASSID] = { .type = NLA_U32 },
+ [TCA_MATCHALL_FLAGS] = { .type = NLA_U32 },
};
static int mall_set_parms(struct net *net, struct tcf_proto *tp,
@@ -302,7 +302,7 @@ static int mall_reoffload(struct tcf_proto *tp, bool add, flow_setup_cb_t *cb,
TC_CLSMATCHALL_REPLACE : TC_CLSMATCHALL_DESTROY;
cls_mall.cookie = (unsigned long)head;
- err = tc_setup_flow_action(&cls_mall.rule->action, &head->exts);
+ err = tc_setup_flow_action(&cls_mall.rule->action, &head->exts, true);
if (err) {
kfree(cls_mall.rule);
if (add && tc_skip_sw(head->flags)) {
@@ -312,16 +312,14 @@ static int mall_reoffload(struct tcf_proto *tp, bool add, flow_setup_cb_t *cb,
return 0;
}
- err = cb(TC_SETUP_CLSMATCHALL, &cls_mall, cb_priv);
+ err = tc_setup_cb_reoffload(block, tp, add, cb, TC_SETUP_CLSMATCHALL,
+ &cls_mall, cb_priv, &head->flags,
+ &head->in_hw_count);
+ tc_cleanup_flow_action(&cls_mall.rule->action);
kfree(cls_mall.rule);
- if (err) {
- if (add && tc_skip_sw(head->flags))
- return err;
- return 0;
- }
-
- tc_cls_offload_cnt_update(block, &head->in_hw_count, &head->flags, add);
+ if (err)
+ return err;
return 0;
}
@@ -337,7 +335,7 @@ static void mall_stats_hw_filter(struct tcf_proto *tp,
cls_mall.command = TC_CLSMATCHALL_STATS;
cls_mall.cookie = cookie;
- tc_setup_cb_call(block, TC_SETUP_CLSMATCHALL, &cls_mall, false);
+ tc_setup_cb_call(block, TC_SETUP_CLSMATCHALL, &cls_mall, false, true);
tcf_exts_stats_update(&head->exts, cls_mall.stats.bytes,
cls_mall.stats.pkts, cls_mall.stats.lastused);
@@ -396,12 +394,17 @@ nla_put_failure:
return -1;
}
-static void mall_bind_class(void *fh, u32 classid, unsigned long cl)
+static void mall_bind_class(void *fh, u32 classid, unsigned long cl, void *q,
+ unsigned long base)
{
struct cls_mall_head *head = fh;
- if (head && head->res.classid == classid)
- head->res.class = cl;
+ if (head && head->res.classid == classid) {
+ if (cl)
+ __tcf_bind_filter(q, &head->res, base);
+ else
+ __tcf_unbind_filter(q, &head->res);
+ }
}
static struct tcf_proto_ops cls_mall_ops __read_mostly = {
diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c
index 2d9e0b4484ea..6f8786b06bde 100644
--- a/net/sched/cls_route.c
+++ b/net/sched/cls_route.c
@@ -641,12 +641,17 @@ nla_put_failure:
return -1;
}
-static void route4_bind_class(void *fh, u32 classid, unsigned long cl)
+static void route4_bind_class(void *fh, u32 classid, unsigned long cl, void *q,
+ unsigned long base)
{
struct route4_filter *f = fh;
- if (f && f->res.classid == classid)
- f->res.class = cl;
+ if (f && f->res.classid == classid) {
+ if (cl)
+ __tcf_bind_filter(q, &f->res, base);
+ else
+ __tcf_unbind_filter(q, &f->res);
+ }
}
static struct tcf_proto_ops cls_route4_ops __read_mostly = {
diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h
index 2f3c03b25d5d..d36949d9382c 100644
--- a/net/sched/cls_rsvp.h
+++ b/net/sched/cls_rsvp.h
@@ -463,10 +463,8 @@ static u32 gen_tunnel(struct rsvp_head *data)
static const struct nla_policy rsvp_policy[TCA_RSVP_MAX + 1] = {
[TCA_RSVP_CLASSID] = { .type = NLA_U32 },
- [TCA_RSVP_DST] = { .type = NLA_BINARY,
- .len = RSVP_DST_LEN * sizeof(u32) },
- [TCA_RSVP_SRC] = { .type = NLA_BINARY,
- .len = RSVP_DST_LEN * sizeof(u32) },
+ [TCA_RSVP_DST] = { .len = RSVP_DST_LEN * sizeof(u32) },
+ [TCA_RSVP_SRC] = { .len = RSVP_DST_LEN * sizeof(u32) },
[TCA_RSVP_PINFO] = { .len = sizeof(struct tc_rsvp_pinfo) },
};
@@ -738,12 +736,17 @@ nla_put_failure:
return -1;
}
-static void rsvp_bind_class(void *fh, u32 classid, unsigned long cl)
+static void rsvp_bind_class(void *fh, u32 classid, unsigned long cl, void *q,
+ unsigned long base)
{
struct rsvp_filter *f = fh;
- if (f && f->res.classid == classid)
- f->res.class = cl;
+ if (f && f->res.classid == classid) {
+ if (cl)
+ __tcf_bind_filter(q, &f->res, base);
+ else
+ __tcf_unbind_filter(q, &f->res);
+ }
}
static struct tcf_proto_ops RSVP_OPS __read_mostly = {
diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c
index e573e5a5c794..09b7dc5fe7e0 100644
--- a/net/sched/cls_tcindex.c
+++ b/net/sched/cls_tcindex.c
@@ -333,12 +333,31 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
cp->fall_through = p->fall_through;
cp->tp = tp;
+ if (tb[TCA_TCINDEX_HASH])
+ cp->hash = nla_get_u32(tb[TCA_TCINDEX_HASH]);
+
+ if (tb[TCA_TCINDEX_MASK])
+ cp->mask = nla_get_u16(tb[TCA_TCINDEX_MASK]);
+
+ if (tb[TCA_TCINDEX_SHIFT])
+ cp->shift = nla_get_u32(tb[TCA_TCINDEX_SHIFT]);
+
+ if (!cp->hash) {
+ /* Hash not specified, use perfect hash if the upper limit
+ * of the hashing index is below the threshold.
+ */
+ if ((cp->mask >> cp->shift) < PERFECT_HASH_THRESHOLD)
+ cp->hash = (cp->mask >> cp->shift) + 1;
+ else
+ cp->hash = DEFAULT_HASH_SIZE;
+ }
+
if (p->perfect) {
int i;
if (tcindex_alloc_perfect_hash(net, cp) < 0)
goto errout;
- for (i = 0; i < cp->hash; i++)
+ for (i = 0; i < min(cp->hash, p->hash); i++)
cp->perfect[i].res = p->perfect[i].res;
balloc = 1;
}
@@ -346,19 +365,10 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
err = tcindex_filter_result_init(&new_filter_result, net);
if (err < 0)
- goto errout1;
+ goto errout_alloc;
if (old_r)
cr = r->res;
- if (tb[TCA_TCINDEX_HASH])
- cp->hash = nla_get_u32(tb[TCA_TCINDEX_HASH]);
-
- if (tb[TCA_TCINDEX_MASK])
- cp->mask = nla_get_u16(tb[TCA_TCINDEX_MASK]);
-
- if (tb[TCA_TCINDEX_SHIFT])
- cp->shift = nla_get_u32(tb[TCA_TCINDEX_SHIFT]);
-
err = -EBUSY;
/* Hash already allocated, make sure that we still meet the
@@ -376,16 +386,6 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
if (tb[TCA_TCINDEX_FALL_THROUGH])
cp->fall_through = nla_get_u32(tb[TCA_TCINDEX_FALL_THROUGH]);
- if (!cp->hash) {
- /* Hash not specified, use perfect hash if the upper limit
- * of the hashing index is below the threshold.
- */
- if ((cp->mask >> cp->shift) < PERFECT_HASH_THRESHOLD)
- cp->hash = (cp->mask >> cp->shift) + 1;
- else
- cp->hash = DEFAULT_HASH_SIZE;
- }
-
if (!cp->perfect && !cp->h)
cp->alloc_hash = cp->hash;
@@ -484,7 +484,6 @@ errout_alloc:
tcindex_free_perfect_hash(cp);
else if (balloc == 2)
kfree(cp->h);
-errout1:
tcf_exts_destroy(&new_filter_result.exts);
errout:
kfree(cp);
@@ -654,12 +653,17 @@ nla_put_failure:
return -1;
}
-static void tcindex_bind_class(void *fh, u32 classid, unsigned long cl)
+static void tcindex_bind_class(void *fh, u32 classid, unsigned long cl,
+ void *q, unsigned long base)
{
struct tcindex_filter_result *r = fh;
- if (r && r->res.classid == classid)
- r->res.class = cl;
+ if (r && r->res.classid == classid) {
+ if (cl)
+ __tcf_bind_filter(q, &r->res, base);
+ else
+ __tcf_unbind_filter(q, &r->res);
+ }
}
static struct tcf_proto_ops cls_tcindex_ops __read_mostly = {
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 8614088edd1b..e15ff335953d 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -480,7 +480,7 @@ static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h,
cls_u32.hnode.handle = h->handle;
cls_u32.hnode.prio = h->prio;
- tc_setup_cb_call(block, TC_SETUP_CLSU32, &cls_u32, false);
+ tc_setup_cb_call(block, TC_SETUP_CLSU32, &cls_u32, false, true);
}
static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h,
@@ -498,7 +498,7 @@ static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h,
cls_u32.hnode.handle = h->handle;
cls_u32.hnode.prio = h->prio;
- err = tc_setup_cb_call(block, TC_SETUP_CLSU32, &cls_u32, skip_sw);
+ err = tc_setup_cb_call(block, TC_SETUP_CLSU32, &cls_u32, skip_sw, true);
if (err < 0) {
u32_clear_hw_hnode(tp, h, NULL);
return err;
@@ -522,8 +522,8 @@ static void u32_remove_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n,
cls_u32.command = TC_CLSU32_DELETE_KNODE;
cls_u32.knode.handle = n->handle;
- tc_setup_cb_call(block, TC_SETUP_CLSU32, &cls_u32, false);
- tcf_block_offload_dec(block, &n->flags);
+ tc_setup_cb_destroy(block, tp, TC_SETUP_CLSU32, &cls_u32, false,
+ &n->flags, &n->in_hw_count, true);
}
static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n,
@@ -552,13 +552,11 @@ static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n,
if (n->ht_down)
cls_u32.knode.link_handle = ht->handle;
- err = tc_setup_cb_call(block, TC_SETUP_CLSU32, &cls_u32, skip_sw);
- if (err < 0) {
+ err = tc_setup_cb_add(block, tp, TC_SETUP_CLSU32, &cls_u32, skip_sw,
+ &n->flags, &n->in_hw_count, true);
+ if (err) {
u32_remove_hw_knode(tp, n, NULL);
return err;
- } else if (err > 0) {
- n->in_hw_count = err;
- tcf_block_offload_inc(block, &n->flags);
}
if (skip_sw && !(n->flags & TCA_CLS_FLAGS_IN_HW))
@@ -1201,14 +1199,11 @@ static int u32_reoffload_knode(struct tcf_proto *tp, struct tc_u_knode *n,
cls_u32.knode.link_handle = ht->handle;
}
- err = cb(TC_SETUP_CLSU32, &cls_u32, cb_priv);
- if (err) {
- if (add && tc_skip_sw(n->flags))
- return err;
- return 0;
- }
-
- tc_cls_offload_cnt_update(block, &n->in_hw_count, &n->flags, add);
+ err = tc_setup_cb_reoffload(block, tp, add, cb, TC_SETUP_CLSU32,
+ &cls_u32, cb_priv, &n->flags,
+ &n->in_hw_count);
+ if (err)
+ return err;
return 0;
}
@@ -1260,12 +1255,17 @@ static int u32_reoffload(struct tcf_proto *tp, bool add, flow_setup_cb_t *cb,
return 0;
}
-static void u32_bind_class(void *fh, u32 classid, unsigned long cl)
+static void u32_bind_class(void *fh, u32 classid, unsigned long cl, void *q,
+ unsigned long base)
{
struct tc_u_knode *n = fh;
- if (n && n->res.classid == classid)
- n->res.class = cl;
+ if (n && n->res.classid == classid) {
+ if (cl)
+ __tcf_bind_filter(q, &n->res, base);
+ else
+ __tcf_unbind_filter(q, &n->res);
+ }
}
static int u32_dump(struct net *net, struct tcf_proto *tp, void *fh,
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index 82bd14e7ac93..d99966a55c84 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -446,7 +446,7 @@ META_COLLECTOR(int_sk_wmem_queued)
*err = -1;
return;
}
- dst->value = sk->sk_wmem_queued;
+ dst->value = READ_ONCE(sk->sk_wmem_queued);
}
META_COLLECTOR(int_sk_fwd_alloc)
@@ -521,7 +521,7 @@ META_COLLECTOR(int_sk_ack_bl)
*err = -1;
return;
}
- dst->value = sk->sk_ack_backlog;
+ dst->value = READ_ONCE(sk->sk_ack_backlog);
}
META_COLLECTOR(int_sk_max_ack_bl)
@@ -532,7 +532,7 @@ META_COLLECTOR(int_sk_max_ack_bl)
*err = -1;
return;
}
- dst->value = sk->sk_max_ack_backlog;
+ dst->value = READ_ONCE(sk->sk_max_ack_backlog);
}
META_COLLECTOR(int_sk_prio)
@@ -554,7 +554,7 @@ META_COLLECTOR(int_sk_rcvlowat)
*err = -1;
return;
}
- dst->value = sk->sk_rcvlowat;
+ dst->value = READ_ONCE(sk->sk_rcvlowat);
}
META_COLLECTOR(int_sk_rcvtimeo)
diff --git a/net/sched/ematch.c b/net/sched/ematch.c
index 8f2ad706784d..dd3b8c11a2e0 100644
--- a/net/sched/ematch.c
+++ b/net/sched/ematch.c
@@ -238,6 +238,9 @@ static int tcf_em_validate(struct tcf_proto *tp,
goto errout;
if (em->ops->change) {
+ err = -EINVAL;
+ if (em_hdr->flags & TCF_EM_SIMPLE)
+ goto errout;
err = em->ops->change(net, data, data_len, em);
if (err < 0)
goto errout;
@@ -263,12 +266,12 @@ static int tcf_em_validate(struct tcf_proto *tp,
}
em->data = (unsigned long) v;
}
+ em->datalen = data_len;
}
}
em->matchid = em_hdr->matchid;
em->flags = em_hdr->flags;
- em->datalen = data_len;
em->net = net;
err = 0;
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 04faee7ccbce..50794125bf02 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1891,8 +1891,9 @@ static int tclass_del_notify(struct net *net,
struct tcf_bind_args {
struct tcf_walker w;
- u32 classid;
+ unsigned long base;
unsigned long cl;
+ u32 classid;
};
static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
@@ -1903,26 +1904,30 @@ static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
struct Qdisc *q = tcf_block_q(tp->chain->block);
sch_tree_lock(q);
- tp->ops->bind_class(n, a->classid, a->cl);
+ tp->ops->bind_class(n, a->classid, a->cl, q, a->base);
sch_tree_unlock(q);
}
return 0;
}
-static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
- unsigned long new_cl)
+struct tc_bind_class_args {
+ struct qdisc_walker w;
+ unsigned long new_cl;
+ u32 portid;
+ u32 clid;
+};
+
+static int tc_bind_class_walker(struct Qdisc *q, unsigned long cl,
+ struct qdisc_walker *w)
{
+ struct tc_bind_class_args *a = (struct tc_bind_class_args *)w;
const struct Qdisc_class_ops *cops = q->ops->cl_ops;
struct tcf_block *block;
struct tcf_chain *chain;
- unsigned long cl;
- cl = cops->find(q, portid);
- if (!cl)
- return;
block = cops->tcf_block(q, cl, NULL);
if (!block)
- return;
+ return 0;
for (chain = tcf_get_next_chain(block, NULL);
chain;
chain = tcf_get_next_chain(block, chain)) {
@@ -1933,11 +1938,29 @@ static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
struct tcf_bind_args arg = {};
arg.w.fn = tcf_node_bind;
- arg.classid = clid;
- arg.cl = new_cl;
+ arg.classid = a->clid;
+ arg.base = cl;
+ arg.cl = a->new_cl;
tp->ops->walk(tp, &arg.w, true);
}
}
+
+ return 0;
+}
+
+static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
+ unsigned long new_cl)
+{
+ const struct Qdisc_class_ops *cops = q->ops->cl_ops;
+ struct tc_bind_class_args args = {};
+
+ if (!cops->tcf_block)
+ return;
+ args.portid = portid;
+ args.clid = clid;
+ args.new_cl = new_cl;
+ args.w.fn = tc_bind_class_walker;
+ q->ops->cl_ops->walk(q, &args.w);
}
#else
diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c
index 53a80bc6b13a..1496e87cd07b 100644
--- a/net/sched/sch_cake.c
+++ b/net/sched/sch_cake.c
@@ -173,8 +173,7 @@ struct cake_tin_data {
u64 tin_rate_bps;
u16 tin_rate_shft;
- u16 tin_quantum_prio;
- u16 tin_quantum_band;
+ u16 tin_quantum;
s32 tin_deficit;
u32 tin_backlog;
u32 tin_dropped;
@@ -1683,8 +1682,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
if (IS_ERR_OR_NULL(segs))
return qdisc_drop(skb, sch, to_free);
- while (segs) {
- nskb = segs->next;
+ skb_list_walk_safe(segs, segs, nskb) {
skb_mark_not_on_list(segs);
qdisc_skb_cb(segs)->pkt_len = segs->len;
cobalt_set_enqueue_time(segs, now);
@@ -1697,7 +1695,6 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
slen += segs->len;
q->buffer_used += segs->truesize;
b->packets++;
- segs = nskb;
}
/* stats */
@@ -1769,7 +1766,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
q->avg_window_begin));
u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC;
- do_div(b, window_interval);
+ b = div64_u64(b, window_interval);
q->avg_peak_bandwidth =
cake_ewma(q->avg_peak_bandwidth, b,
b > q->avg_peak_bandwidth ? 2 : 8);
@@ -1919,7 +1916,7 @@ begin:
while (b->tin_deficit < 0 ||
!(b->sparse_flow_count + b->bulk_flow_count)) {
if (b->tin_deficit <= 0)
- b->tin_deficit += b->tin_quantum_band;
+ b->tin_deficit += b->tin_quantum;
if (b->sparse_flow_count + b->bulk_flow_count)
empty = false;
@@ -2184,6 +2181,7 @@ static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = {
[TCA_CAKE_MPU] = { .type = NLA_U32 },
[TCA_CAKE_INGRESS] = { .type = NLA_U32 },
[TCA_CAKE_ACK_FILTER] = { .type = NLA_U32 },
+ [TCA_CAKE_SPLIT_GSO] = { .type = NLA_U32 },
[TCA_CAKE_FWMARK] = { .type = NLA_U32 },
};
@@ -2240,8 +2238,7 @@ static int cake_config_besteffort(struct Qdisc *sch)
cake_set_rate(b, rate, mtu,
us_to_ns(q->target), us_to_ns(q->interval));
- b->tin_quantum_band = 65535;
- b->tin_quantum_prio = 65535;
+ b->tin_quantum = 65535;
return 0;
}
@@ -2252,8 +2249,7 @@ static int cake_config_precedence(struct Qdisc *sch)
struct cake_sched_data *q = qdisc_priv(sch);
u32 mtu = psched_mtu(qdisc_dev(sch));
u64 rate = q->rate_bps;
- u32 quantum1 = 256;
- u32 quantum2 = 256;
+ u32 quantum = 256;
u32 i;
q->tin_cnt = 8;
@@ -2266,18 +2262,14 @@ static int cake_config_precedence(struct Qdisc *sch)
cake_set_rate(b, rate, mtu, us_to_ns(q->target),
us_to_ns(q->interval));
- b->tin_quantum_prio = max_t(u16, 1U, quantum1);
- b->tin_quantum_band = max_t(u16, 1U, quantum2);
+ b->tin_quantum = max_t(u16, 1U, quantum);
/* calculate next class's parameters */
rate *= 7;
rate >>= 3;
- quantum1 *= 3;
- quantum1 >>= 1;
-
- quantum2 *= 7;
- quantum2 >>= 3;
+ quantum *= 7;
+ quantum >>= 3;
}
return 0;
@@ -2346,8 +2338,7 @@ static int cake_config_diffserv8(struct Qdisc *sch)
struct cake_sched_data *q = qdisc_priv(sch);
u32 mtu = psched_mtu(qdisc_dev(sch));
u64 rate = q->rate_bps;
- u32 quantum1 = 256;
- u32 quantum2 = 256;
+ u32 quantum = 256;
u32 i;
q->tin_cnt = 8;
@@ -2363,18 +2354,14 @@ static int cake_config_diffserv8(struct Qdisc *sch)
cake_set_rate(b, rate, mtu, us_to_ns(q->target),
us_to_ns(q->interval));
- b->tin_quantum_prio = max_t(u16, 1U, quantum1);
- b->tin_quantum_band = max_t(u16, 1U, quantum2);
+ b->tin_quantum = max_t(u16, 1U, quantum);
/* calculate next class's parameters */
rate *= 7;
rate >>= 3;
- quantum1 *= 3;
- quantum1 >>= 1;
-
- quantum2 *= 7;
- quantum2 >>= 3;
+ quantum *= 7;
+ quantum >>= 3;
}
return 0;
@@ -2413,17 +2400,11 @@ static int cake_config_diffserv4(struct Qdisc *sch)
cake_set_rate(&q->tins[3], rate >> 2, mtu,
us_to_ns(q->target), us_to_ns(q->interval));
- /* priority weights */
- q->tins[0].tin_quantum_prio = quantum;
- q->tins[1].tin_quantum_prio = quantum >> 4;
- q->tins[2].tin_quantum_prio = quantum << 2;
- q->tins[3].tin_quantum_prio = quantum << 4;
-
/* bandwidth-sharing weights */
- q->tins[0].tin_quantum_band = quantum;
- q->tins[1].tin_quantum_band = quantum >> 4;
- q->tins[2].tin_quantum_band = quantum >> 1;
- q->tins[3].tin_quantum_band = quantum >> 2;
+ q->tins[0].tin_quantum = quantum;
+ q->tins[1].tin_quantum = quantum >> 4;
+ q->tins[2].tin_quantum = quantum >> 1;
+ q->tins[3].tin_quantum = quantum >> 2;
return 0;
}
@@ -2454,15 +2435,10 @@ static int cake_config_diffserv3(struct Qdisc *sch)
cake_set_rate(&q->tins[2], rate >> 2, mtu,
us_to_ns(q->target), us_to_ns(q->interval));
- /* priority weights */
- q->tins[0].tin_quantum_prio = quantum;
- q->tins[1].tin_quantum_prio = quantum >> 4;
- q->tins[2].tin_quantum_prio = quantum << 4;
-
/* bandwidth-sharing weights */
- q->tins[0].tin_quantum_band = quantum;
- q->tins[1].tin_quantum_band = quantum >> 4;
- q->tins[2].tin_quantum_band = quantum >> 2;
+ q->tins[0].tin_quantum = quantum;
+ q->tins[1].tin_quantum = quantum >> 4;
+ q->tins[2].tin_quantum = quantum >> 2;
return 0;
}
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index 06c7a2da21bc..39b427dc7512 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -1127,6 +1127,33 @@ static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
[TCA_CBQ_POLICE] = { .len = sizeof(struct tc_cbq_police) },
};
+static int cbq_opt_parse(struct nlattr *tb[TCA_CBQ_MAX + 1],
+ struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ int err;
+
+ if (!opt) {
+ NL_SET_ERR_MSG(extack, "CBQ options are required for this operation");
+ return -EINVAL;
+ }
+
+ err = nla_parse_nested_deprecated(tb, TCA_CBQ_MAX, opt,
+ cbq_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_CBQ_WRROPT]) {
+ const struct tc_cbq_wrropt *wrr = nla_data(tb[TCA_CBQ_WRROPT]);
+
+ if (wrr->priority > TC_CBQ_MAXPRIO) {
+ NL_SET_ERR_MSG(extack, "priority is bigger than TC_CBQ_MAXPRIO");
+ err = -EINVAL;
+ }
+ }
+ return err;
+}
+
static int cbq_init(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
@@ -1139,13 +1166,7 @@ static int cbq_init(struct Qdisc *sch, struct nlattr *opt,
hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
q->delay_timer.function = cbq_undelay;
- if (!opt) {
- NL_SET_ERR_MSG(extack, "CBQ options are required for this operation");
- return -EINVAL;
- }
-
- err = nla_parse_nested_deprecated(tb, TCA_CBQ_MAX, opt, cbq_policy,
- extack);
+ err = cbq_opt_parse(tb, opt, extack);
if (err < 0)
return err;
@@ -1464,13 +1485,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t
struct cbq_class *parent;
struct qdisc_rate_table *rtab = NULL;
- if (!opt) {
- NL_SET_ERR_MSG(extack, "Mandatory qdisc options missing");
- return -EINVAL;
- }
-
- err = nla_parse_nested_deprecated(tb, TCA_CBQ_MAX, opt, cbq_policy,
- extack);
+ err = cbq_opt_parse(tb, opt, extack);
if (err < 0)
return err;
diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c
index 732e109c3055..b2905b03a432 100644
--- a/net/sched/sch_cbs.c
+++ b/net/sched/sch_cbs.c
@@ -181,11 +181,6 @@ static struct sk_buff *cbs_dequeue_soft(struct Qdisc *sch)
s64 credits;
int len;
- if (atomic64_read(&q->port_rate) == -1) {
- WARN_ONCE(1, "cbs: dequeue() called with unknown port rate.");
- return NULL;
- }
-
if (q->credits < 0) {
credits = timediff_to_credits(now - q->last, q->idleslope);
@@ -303,11 +298,19 @@ static int cbs_enable_offload(struct net_device *dev, struct cbs_sched_data *q,
static void cbs_set_port_rate(struct net_device *dev, struct cbs_sched_data *q)
{
struct ethtool_link_ksettings ecmd;
- int port_rate = -1;
+ int speed = SPEED_10;
+ int port_rate;
+ int err;
+
+ err = __ethtool_get_link_ksettings(dev, &ecmd);
+ if (err < 0)
+ goto skip;
+
+ if (ecmd.base.speed && ecmd.base.speed != SPEED_UNKNOWN)
+ speed = ecmd.base.speed;
- if (!__ethtool_get_link_ksettings(dev, &ecmd) &&
- ecmd.base.speed != SPEED_UNKNOWN)
- port_rate = ecmd.base.speed * 1000 * BYTES_PER_KBIT;
+skip:
+ port_rate = speed * 1000 * BYTES_PER_KBIT;
atomic64_set(&q->port_rate, port_rate);
netdev_dbg(dev, "cbs: set %s's port_rate to: %lld, linkspeed: %d\n",
@@ -389,7 +392,6 @@ static int cbs_init(struct Qdisc *sch, struct nlattr *opt,
{
struct cbs_sched_data *q = qdisc_priv(sch);
struct net_device *dev = qdisc_dev(sch);
- int err;
if (!opt) {
NL_SET_ERR_MSG(extack, "Missing CBS qdisc options which are mandatory");
@@ -401,6 +403,10 @@ static int cbs_init(struct Qdisc *sch, struct nlattr *opt,
if (!q->qdisc)
return -ENOMEM;
+ spin_lock(&cbs_list_lock);
+ list_add(&q->cbs_list, &cbs_list);
+ spin_unlock(&cbs_list_lock);
+
qdisc_hash_add(q->qdisc, false);
q->queue = sch->dev_queue - netdev_get_tx_queue(dev, 0);
@@ -410,17 +416,7 @@ static int cbs_init(struct Qdisc *sch, struct nlattr *opt,
qdisc_watchdog_init(&q->watchdog, sch);
- err = cbs_change(sch, opt, extack);
- if (err)
- return err;
-
- if (!q->offload) {
- spin_lock(&cbs_list_lock);
- list_add(&q->cbs_list, &cbs_list);
- spin_unlock(&cbs_list_lock);
- }
-
- return 0;
+ return cbs_change(sch, opt, extack);
}
static void cbs_destroy(struct Qdisc *sch)
@@ -428,15 +424,18 @@ static void cbs_destroy(struct Qdisc *sch)
struct cbs_sched_data *q = qdisc_priv(sch);
struct net_device *dev = qdisc_dev(sch);
- spin_lock(&cbs_list_lock);
- list_del(&q->cbs_list);
- spin_unlock(&cbs_list_lock);
+ /* Nothing to do if we couldn't create the underlying qdisc */
+ if (!q->qdisc)
+ return;
qdisc_watchdog_cancel(&q->watchdog);
cbs_disable_offload(dev, q);
- if (q->qdisc)
- qdisc_put(q->qdisc);
+ spin_lock(&cbs_list_lock);
+ list_del(&q->cbs_list);
+ spin_unlock(&cbs_list_lock);
+
+ qdisc_put(q->qdisc);
}
static int cbs_dump(struct Qdisc *sch, struct sk_buff *skb)
diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
index dba70377bbd9..a36974e9c601 100644
--- a/net/sched/sch_choke.c
+++ b/net/sched/sch_choke.c
@@ -377,7 +377,7 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt,
if (mask != q->tab_mask) {
struct sk_buff **ntab;
- ntab = kvmalloc_array((mask + 1), sizeof(struct sk_buff *), GFP_KERNEL | __GFP_ZERO);
+ ntab = kvcalloc(mask + 1, sizeof(struct sk_buff *), GFP_KERNEL);
if (!ntab)
return -ENOMEM;
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index bad1cbe59a56..05605b30bef3 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -361,6 +361,8 @@ static int dsmark_init(struct Qdisc *sch, struct nlattr *opt,
goto errout;
err = -EINVAL;
+ if (!tb[TCA_DSMARK_INDICES])
+ goto errout;
indices = nla_get_u16(tb[TCA_DSMARK_INDICES]);
if (hweight32(indices) != 1)
diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c
index cebfb65d8556..b1da5589a0c6 100644
--- a/net/sched/sch_etf.c
+++ b/net/sched/sch_etf.c
@@ -177,7 +177,7 @@ static int etf_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch,
parent = *p;
skb = rb_to_skb(parent);
- if (ktime_after(txtime, skb->tstamp)) {
+ if (ktime_compare(txtime, skb->tstamp) >= 0) {
p = &parent->rb_right;
leftmost = false;
} else {
diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c
new file mode 100644
index 000000000000..a87e9159338c
--- /dev/null
+++ b/net/sched/sch_ets.c
@@ -0,0 +1,828 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * net/sched/sch_ets.c Enhanced Transmission Selection scheduler
+ *
+ * Description
+ * -----------
+ *
+ * The Enhanced Transmission Selection scheduler is a classful queuing
+ * discipline that merges functionality of PRIO and DRR qdiscs in one scheduler.
+ * ETS makes it easy to configure a set of strict and bandwidth-sharing bands to
+ * implement the transmission selection described in 802.1Qaz.
+ *
+ * Although ETS is technically classful, it's not possible to add and remove
+ * classes at will. Instead one specifies number of classes, how many are
+ * PRIO-like and how many DRR-like, and quanta for the latter.
+ *
+ * Algorithm
+ * ---------
+ *
+ * The strict classes, if any, are tried for traffic first: first band 0, if it
+ * has no traffic then band 1, etc.
+ *
+ * When there is no traffic in any of the strict queues, the bandwidth-sharing
+ * ones are tried next. Each band is assigned a deficit counter, initialized to
+ * "quantum" of that band. ETS maintains a list of active bandwidth-sharing
+ * bands whose qdiscs are non-empty. A packet is dequeued from the band at the
+ * head of the list if the packet size is smaller or equal to the deficit
+ * counter. If the counter is too small, it is increased by "quantum" and the
+ * scheduler moves on to the next band in the active list.
+ */
+
+#include <linux/module.h>
+#include <net/gen_stats.h>
+#include <net/netlink.h>
+#include <net/pkt_cls.h>
+#include <net/pkt_sched.h>
+#include <net/sch_generic.h>
+
+struct ets_class {
+ struct list_head alist; /* In struct ets_sched.active. */
+ struct Qdisc *qdisc;
+ u32 quantum;
+ u32 deficit;
+ struct gnet_stats_basic_packed bstats;
+ struct gnet_stats_queue qstats;
+};
+
+struct ets_sched {
+ struct list_head active;
+ struct tcf_proto __rcu *filter_list;
+ struct tcf_block *block;
+ unsigned int nbands;
+ unsigned int nstrict;
+ u8 prio2band[TC_PRIO_MAX + 1];
+ struct ets_class classes[TCQ_ETS_MAX_BANDS];
+};
+
+static const struct nla_policy ets_policy[TCA_ETS_MAX + 1] = {
+ [TCA_ETS_NBANDS] = { .type = NLA_U8 },
+ [TCA_ETS_NSTRICT] = { .type = NLA_U8 },
+ [TCA_ETS_QUANTA] = { .type = NLA_NESTED },
+ [TCA_ETS_PRIOMAP] = { .type = NLA_NESTED },
+};
+
+static const struct nla_policy ets_priomap_policy[TCA_ETS_MAX + 1] = {
+ [TCA_ETS_PRIOMAP_BAND] = { .type = NLA_U8 },
+};
+
+static const struct nla_policy ets_quanta_policy[TCA_ETS_MAX + 1] = {
+ [TCA_ETS_QUANTA_BAND] = { .type = NLA_U32 },
+};
+
+static const struct nla_policy ets_class_policy[TCA_ETS_MAX + 1] = {
+ [TCA_ETS_QUANTA_BAND] = { .type = NLA_U32 },
+};
+
+static int ets_quantum_parse(struct Qdisc *sch, const struct nlattr *attr,
+ unsigned int *quantum,
+ struct netlink_ext_ack *extack)
+{
+ *quantum = nla_get_u32(attr);
+ if (!*quantum) {
+ NL_SET_ERR_MSG(extack, "ETS quantum cannot be zero");
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static struct ets_class *
+ets_class_from_arg(struct Qdisc *sch, unsigned long arg)
+{
+ struct ets_sched *q = qdisc_priv(sch);
+
+ return &q->classes[arg - 1];
+}
+
+static u32 ets_class_id(struct Qdisc *sch, const struct ets_class *cl)
+{
+ struct ets_sched *q = qdisc_priv(sch);
+ int band = cl - q->classes;
+
+ return TC_H_MAKE(sch->handle, band + 1);
+}
+
+static void ets_offload_change(struct Qdisc *sch)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct ets_sched *q = qdisc_priv(sch);
+ struct tc_ets_qopt_offload qopt;
+ unsigned int w_psum_prev = 0;
+ unsigned int q_psum = 0;
+ unsigned int q_sum = 0;
+ unsigned int quantum;
+ unsigned int w_psum;
+ unsigned int weight;
+ unsigned int i;
+
+ if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
+ return;
+
+ qopt.command = TC_ETS_REPLACE;
+ qopt.handle = sch->handle;
+ qopt.parent = sch->parent;
+ qopt.replace_params.bands = q->nbands;
+ qopt.replace_params.qstats = &sch->qstats;
+ memcpy(&qopt.replace_params.priomap,
+ q->prio2band, sizeof(q->prio2band));
+
+ for (i = 0; i < q->nbands; i++)
+ q_sum += q->classes[i].quantum;
+
+ for (i = 0; i < q->nbands; i++) {
+ quantum = q->classes[i].quantum;
+ q_psum += quantum;
+ w_psum = quantum ? q_psum * 100 / q_sum : 0;
+ weight = w_psum - w_psum_prev;
+ w_psum_prev = w_psum;
+
+ qopt.replace_params.quanta[i] = quantum;
+ qopt.replace_params.weights[i] = weight;
+ }
+
+ dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_ETS, &qopt);
+}
+
+static void ets_offload_destroy(struct Qdisc *sch)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct tc_ets_qopt_offload qopt;
+
+ if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
+ return;
+
+ qopt.command = TC_ETS_DESTROY;
+ qopt.handle = sch->handle;
+ qopt.parent = sch->parent;
+ dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_ETS, &qopt);
+}
+
+static void ets_offload_graft(struct Qdisc *sch, struct Qdisc *new,
+ struct Qdisc *old, unsigned long arg,
+ struct netlink_ext_ack *extack)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct tc_ets_qopt_offload qopt;
+
+ qopt.command = TC_ETS_GRAFT;
+ qopt.handle = sch->handle;
+ qopt.parent = sch->parent;
+ qopt.graft_params.band = arg - 1;
+ qopt.graft_params.child_handle = new->handle;
+
+ qdisc_offload_graft_helper(dev, sch, new, old, TC_SETUP_QDISC_ETS,
+ &qopt, extack);
+}
+
+static int ets_offload_dump(struct Qdisc *sch)
+{
+ struct tc_ets_qopt_offload qopt;
+
+ qopt.command = TC_ETS_STATS;
+ qopt.handle = sch->handle;
+ qopt.parent = sch->parent;
+ qopt.stats.bstats = &sch->bstats;
+ qopt.stats.qstats = &sch->qstats;
+
+ return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_ETS, &qopt);
+}
+
+static bool ets_class_is_strict(struct ets_sched *q, const struct ets_class *cl)
+{
+ unsigned int band = cl - q->classes;
+
+ return band < q->nstrict;
+}
+
+static int ets_class_change(struct Qdisc *sch, u32 classid, u32 parentid,
+ struct nlattr **tca, unsigned long *arg,
+ struct netlink_ext_ack *extack)
+{
+ struct ets_class *cl = ets_class_from_arg(sch, *arg);
+ struct ets_sched *q = qdisc_priv(sch);
+ struct nlattr *opt = tca[TCA_OPTIONS];
+ struct nlattr *tb[TCA_ETS_MAX + 1];
+ unsigned int quantum;
+ int err;
+
+ /* Classes can be added and removed only through Qdisc_ops.change
+ * interface.
+ */
+ if (!cl) {
+ NL_SET_ERR_MSG(extack, "Fine-grained class addition and removal is not supported");
+ return -EOPNOTSUPP;
+ }
+
+ if (!opt) {
+ NL_SET_ERR_MSG(extack, "ETS options are required for this operation");
+ return -EINVAL;
+ }
+
+ err = nla_parse_nested(tb, TCA_ETS_MAX, opt, ets_class_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (!tb[TCA_ETS_QUANTA_BAND])
+ /* Nothing to configure. */
+ return 0;
+
+ if (ets_class_is_strict(q, cl)) {
+ NL_SET_ERR_MSG(extack, "Strict bands do not have a configurable quantum");
+ return -EINVAL;
+ }
+
+ err = ets_quantum_parse(sch, tb[TCA_ETS_QUANTA_BAND], &quantum,
+ extack);
+ if (err)
+ return err;
+
+ sch_tree_lock(sch);
+ cl->quantum = quantum;
+ sch_tree_unlock(sch);
+
+ ets_offload_change(sch);
+ return 0;
+}
+
+static int ets_class_graft(struct Qdisc *sch, unsigned long arg,
+ struct Qdisc *new, struct Qdisc **old,
+ struct netlink_ext_ack *extack)
+{
+ struct ets_class *cl = ets_class_from_arg(sch, arg);
+
+ if (!new) {
+ new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+ ets_class_id(sch, cl), NULL);
+ if (!new)
+ new = &noop_qdisc;
+ else
+ qdisc_hash_add(new, true);
+ }
+
+ *old = qdisc_replace(sch, new, &cl->qdisc);
+ ets_offload_graft(sch, new, *old, arg, extack);
+ return 0;
+}
+
+static struct Qdisc *ets_class_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ struct ets_class *cl = ets_class_from_arg(sch, arg);
+
+ return cl->qdisc;
+}
+
+static unsigned long ets_class_find(struct Qdisc *sch, u32 classid)
+{
+ unsigned long band = TC_H_MIN(classid);
+ struct ets_sched *q = qdisc_priv(sch);
+
+ if (band - 1 >= q->nbands)
+ return 0;
+ return band;
+}
+
+static void ets_class_qlen_notify(struct Qdisc *sch, unsigned long arg)
+{
+ struct ets_class *cl = ets_class_from_arg(sch, arg);
+ struct ets_sched *q = qdisc_priv(sch);
+
+ /* We get notified about zero-length child Qdiscs as well if they are
+ * offloaded. Those aren't on the active list though, so don't attempt
+ * to remove them.
+ */
+ if (!ets_class_is_strict(q, cl) && sch->q.qlen)
+ list_del(&cl->alist);
+}
+
+static int ets_class_dump(struct Qdisc *sch, unsigned long arg,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ struct ets_class *cl = ets_class_from_arg(sch, arg);
+ struct ets_sched *q = qdisc_priv(sch);
+ struct nlattr *nest;
+
+ tcm->tcm_parent = TC_H_ROOT;
+ tcm->tcm_handle = ets_class_id(sch, cl);
+ tcm->tcm_info = cl->qdisc->handle;
+
+ nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
+ if (!nest)
+ goto nla_put_failure;
+ if (!ets_class_is_strict(q, cl)) {
+ if (nla_put_u32(skb, TCA_ETS_QUANTA_BAND, cl->quantum))
+ goto nla_put_failure;
+ }
+ return nla_nest_end(skb, nest);
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
+static int ets_class_dump_stats(struct Qdisc *sch, unsigned long arg,
+ struct gnet_dump *d)
+{
+ struct ets_class *cl = ets_class_from_arg(sch, arg);
+ struct Qdisc *cl_q = cl->qdisc;
+
+ if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
+ d, NULL, &cl_q->bstats) < 0 ||
+ qdisc_qstats_copy(d, cl_q) < 0)
+ return -1;
+
+ return 0;
+}
+
+static void ets_qdisc_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+ struct ets_sched *q = qdisc_priv(sch);
+ int i;
+
+ if (arg->stop)
+ return;
+
+ for (i = 0; i < q->nbands; i++) {
+ if (arg->count < arg->skip) {
+ arg->count++;
+ continue;
+ }
+ if (arg->fn(sch, i + 1, arg) < 0) {
+ arg->stop = 1;
+ break;
+ }
+ arg->count++;
+ }
+}
+
+static struct tcf_block *
+ets_qdisc_tcf_block(struct Qdisc *sch, unsigned long cl,
+ struct netlink_ext_ack *extack)
+{
+ struct ets_sched *q = qdisc_priv(sch);
+
+ if (cl) {
+ NL_SET_ERR_MSG(extack, "ETS classid must be zero");
+ return NULL;
+ }
+
+ return q->block;
+}
+
+static unsigned long ets_qdisc_bind_tcf(struct Qdisc *sch, unsigned long parent,
+ u32 classid)
+{
+ return ets_class_find(sch, classid);
+}
+
+static void ets_qdisc_unbind_tcf(struct Qdisc *sch, unsigned long arg)
+{
+}
+
+static struct ets_class *ets_classify(struct sk_buff *skb, struct Qdisc *sch,
+ int *qerr)
+{
+ struct ets_sched *q = qdisc_priv(sch);
+ u32 band = skb->priority;
+ struct tcf_result res;
+ struct tcf_proto *fl;
+ int err;
+
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+ if (TC_H_MAJ(skb->priority) != sch->handle) {
+ fl = rcu_dereference_bh(q->filter_list);
+ err = tcf_classify(skb, fl, &res, false);
+#ifdef CONFIG_NET_CLS_ACT
+ switch (err) {
+ case TC_ACT_STOLEN:
+ case TC_ACT_QUEUED:
+ case TC_ACT_TRAP:
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+ /* fall through */
+ case TC_ACT_SHOT:
+ return NULL;
+ }
+#endif
+ if (!fl || err < 0) {
+ if (TC_H_MAJ(band))
+ band = 0;
+ return &q->classes[q->prio2band[band & TC_PRIO_MAX]];
+ }
+ band = res.classid;
+ }
+ band = TC_H_MIN(band) - 1;
+ if (band >= q->nbands)
+ return &q->classes[q->prio2band[0]];
+ return &q->classes[band];
+}
+
+static int ets_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+ unsigned int len = qdisc_pkt_len(skb);
+ struct ets_sched *q = qdisc_priv(sch);
+ struct ets_class *cl;
+ int err = 0;
+ bool first;
+
+ cl = ets_classify(skb, sch, &err);
+ if (!cl) {
+ if (err & __NET_XMIT_BYPASS)
+ qdisc_qstats_drop(sch);
+ __qdisc_drop(skb, to_free);
+ return err;
+ }
+
+ first = !cl->qdisc->q.qlen;
+ err = qdisc_enqueue(skb, cl->qdisc, to_free);
+ if (unlikely(err != NET_XMIT_SUCCESS)) {
+ if (net_xmit_drop_count(err)) {
+ cl->qstats.drops++;
+ qdisc_qstats_drop(sch);
+ }
+ return err;
+ }
+
+ if (first && !ets_class_is_strict(q, cl)) {
+ list_add_tail(&cl->alist, &q->active);
+ cl->deficit = cl->quantum;
+ }
+
+ sch->qstats.backlog += len;
+ sch->q.qlen++;
+ return err;
+}
+
+static struct sk_buff *
+ets_qdisc_dequeue_skb(struct Qdisc *sch, struct sk_buff *skb)
+{
+ qdisc_bstats_update(sch, skb);
+ qdisc_qstats_backlog_dec(sch, skb);
+ sch->q.qlen--;
+ return skb;
+}
+
+static struct sk_buff *ets_qdisc_dequeue(struct Qdisc *sch)
+{
+ struct ets_sched *q = qdisc_priv(sch);
+ struct ets_class *cl;
+ struct sk_buff *skb;
+ unsigned int band;
+ unsigned int len;
+
+ while (1) {
+ for (band = 0; band < q->nstrict; band++) {
+ cl = &q->classes[band];
+ skb = qdisc_dequeue_peeked(cl->qdisc);
+ if (skb)
+ return ets_qdisc_dequeue_skb(sch, skb);
+ }
+
+ if (list_empty(&q->active))
+ goto out;
+
+ cl = list_first_entry(&q->active, struct ets_class, alist);
+ skb = cl->qdisc->ops->peek(cl->qdisc);
+ if (!skb) {
+ qdisc_warn_nonwc(__func__, cl->qdisc);
+ goto out;
+ }
+
+ len = qdisc_pkt_len(skb);
+ if (len <= cl->deficit) {
+ cl->deficit -= len;
+ skb = qdisc_dequeue_peeked(cl->qdisc);
+ if (unlikely(!skb))
+ goto out;
+ if (cl->qdisc->q.qlen == 0)
+ list_del(&cl->alist);
+ return ets_qdisc_dequeue_skb(sch, skb);
+ }
+
+ cl->deficit += cl->quantum;
+ list_move_tail(&cl->alist, &q->active);
+ }
+out:
+ return NULL;
+}
+
+static int ets_qdisc_priomap_parse(struct nlattr *priomap_attr,
+ unsigned int nbands, u8 *priomap,
+ struct netlink_ext_ack *extack)
+{
+ const struct nlattr *attr;
+ int prio = 0;
+ u8 band;
+ int rem;
+ int err;
+
+ err = __nla_validate_nested(priomap_attr, TCA_ETS_MAX,
+ ets_priomap_policy, NL_VALIDATE_STRICT,
+ extack);
+ if (err)
+ return err;
+
+ nla_for_each_nested(attr, priomap_attr, rem) {
+ switch (nla_type(attr)) {
+ case TCA_ETS_PRIOMAP_BAND:
+ if (prio > TC_PRIO_MAX) {
+ NL_SET_ERR_MSG_MOD(extack, "Too many priorities in ETS priomap");
+ return -EINVAL;
+ }
+ band = nla_get_u8(attr);
+ if (band >= nbands) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid band number in ETS priomap");
+ return -EINVAL;
+ }
+ priomap[prio++] = band;
+ break;
+ default:
+ WARN_ON_ONCE(1); /* Validate should have caught this. */
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int ets_qdisc_quanta_parse(struct Qdisc *sch, struct nlattr *quanta_attr,
+ unsigned int nbands, unsigned int nstrict,
+ unsigned int *quanta,
+ struct netlink_ext_ack *extack)
+{
+ const struct nlattr *attr;
+ int band = nstrict;
+ int rem;
+ int err;
+
+ err = __nla_validate_nested(quanta_attr, TCA_ETS_MAX,
+ ets_quanta_policy, NL_VALIDATE_STRICT,
+ extack);
+ if (err < 0)
+ return err;
+
+ nla_for_each_nested(attr, quanta_attr, rem) {
+ switch (nla_type(attr)) {
+ case TCA_ETS_QUANTA_BAND:
+ if (band >= nbands) {
+ NL_SET_ERR_MSG_MOD(extack, "ETS quanta has more values than bands");
+ return -EINVAL;
+ }
+ err = ets_quantum_parse(sch, attr, &quanta[band++],
+ extack);
+ if (err)
+ return err;
+ break;
+ default:
+ WARN_ON_ONCE(1); /* Validate should have caught this. */
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ unsigned int quanta[TCQ_ETS_MAX_BANDS] = {0};
+ struct Qdisc *queues[TCQ_ETS_MAX_BANDS];
+ struct ets_sched *q = qdisc_priv(sch);
+ struct nlattr *tb[TCA_ETS_MAX + 1];
+ unsigned int oldbands = q->nbands;
+ u8 priomap[TC_PRIO_MAX + 1];
+ unsigned int nstrict = 0;
+ unsigned int nbands;
+ unsigned int i;
+ int err;
+
+ if (!opt) {
+ NL_SET_ERR_MSG(extack, "ETS options are required for this operation");
+ return -EINVAL;
+ }
+
+ err = nla_parse_nested(tb, TCA_ETS_MAX, opt, ets_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (!tb[TCA_ETS_NBANDS]) {
+ NL_SET_ERR_MSG_MOD(extack, "Number of bands is a required argument");
+ return -EINVAL;
+ }
+ nbands = nla_get_u8(tb[TCA_ETS_NBANDS]);
+ if (nbands < 1 || nbands > TCQ_ETS_MAX_BANDS) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid number of bands");
+ return -EINVAL;
+ }
+ /* Unless overridden, traffic goes to the last band. */
+ memset(priomap, nbands - 1, sizeof(priomap));
+
+ if (tb[TCA_ETS_NSTRICT]) {
+ nstrict = nla_get_u8(tb[TCA_ETS_NSTRICT]);
+ if (nstrict > nbands) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid number of strict bands");
+ return -EINVAL;
+ }
+ }
+
+ if (tb[TCA_ETS_PRIOMAP]) {
+ err = ets_qdisc_priomap_parse(tb[TCA_ETS_PRIOMAP],
+ nbands, priomap, extack);
+ if (err)
+ return err;
+ }
+
+ if (tb[TCA_ETS_QUANTA]) {
+ err = ets_qdisc_quanta_parse(sch, tb[TCA_ETS_QUANTA],
+ nbands, nstrict, quanta, extack);
+ if (err)
+ return err;
+ }
+ /* If there are more bands than strict + quanta provided, the remaining
+ * ones are ETS with quantum of MTU. Initialize the missing values here.
+ */
+ for (i = nstrict; i < nbands; i++) {
+ if (!quanta[i])
+ quanta[i] = psched_mtu(qdisc_dev(sch));
+ }
+
+ /* Before commit, make sure we can allocate all new qdiscs */
+ for (i = oldbands; i < nbands; i++) {
+ queues[i] = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+ ets_class_id(sch, &q->classes[i]),
+ extack);
+ if (!queues[i]) {
+ while (i > oldbands)
+ qdisc_put(queues[--i]);
+ return -ENOMEM;
+ }
+ }
+
+ sch_tree_lock(sch);
+
+ q->nbands = nbands;
+ q->nstrict = nstrict;
+ memcpy(q->prio2band, priomap, sizeof(priomap));
+
+ for (i = q->nbands; i < oldbands; i++)
+ qdisc_tree_flush_backlog(q->classes[i].qdisc);
+
+ for (i = 0; i < q->nbands; i++)
+ q->classes[i].quantum = quanta[i];
+
+ for (i = oldbands; i < q->nbands; i++) {
+ q->classes[i].qdisc = queues[i];
+ if (q->classes[i].qdisc != &noop_qdisc)
+ qdisc_hash_add(q->classes[i].qdisc, true);
+ }
+
+ sch_tree_unlock(sch);
+
+ ets_offload_change(sch);
+ for (i = q->nbands; i < oldbands; i++) {
+ qdisc_put(q->classes[i].qdisc);
+ memset(&q->classes[i], 0, sizeof(q->classes[i]));
+ }
+ return 0;
+}
+
+static int ets_qdisc_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct ets_sched *q = qdisc_priv(sch);
+ int err;
+
+ if (!opt)
+ return -EINVAL;
+
+ err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
+ if (err)
+ return err;
+
+ INIT_LIST_HEAD(&q->active);
+ return ets_qdisc_change(sch, opt, extack);
+}
+
+static void ets_qdisc_reset(struct Qdisc *sch)
+{
+ struct ets_sched *q = qdisc_priv(sch);
+ int band;
+
+ for (band = q->nstrict; band < q->nbands; band++) {
+ if (q->classes[band].qdisc->q.qlen)
+ list_del(&q->classes[band].alist);
+ }
+ for (band = 0; band < q->nbands; band++)
+ qdisc_reset(q->classes[band].qdisc);
+ sch->qstats.backlog = 0;
+ sch->q.qlen = 0;
+}
+
+static void ets_qdisc_destroy(struct Qdisc *sch)
+{
+ struct ets_sched *q = qdisc_priv(sch);
+ int band;
+
+ ets_offload_destroy(sch);
+ tcf_block_put(q->block);
+ for (band = 0; band < q->nbands; band++)
+ qdisc_put(q->classes[band].qdisc);
+}
+
+static int ets_qdisc_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct ets_sched *q = qdisc_priv(sch);
+ struct nlattr *opts;
+ struct nlattr *nest;
+ int band;
+ int prio;
+ int err;
+
+ err = ets_offload_dump(sch);
+ if (err)
+ return err;
+
+ opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
+ if (!opts)
+ goto nla_err;
+
+ if (nla_put_u8(skb, TCA_ETS_NBANDS, q->nbands))
+ goto nla_err;
+
+ if (q->nstrict &&
+ nla_put_u8(skb, TCA_ETS_NSTRICT, q->nstrict))
+ goto nla_err;
+
+ if (q->nbands > q->nstrict) {
+ nest = nla_nest_start(skb, TCA_ETS_QUANTA);
+ if (!nest)
+ goto nla_err;
+
+ for (band = q->nstrict; band < q->nbands; band++) {
+ if (nla_put_u32(skb, TCA_ETS_QUANTA_BAND,
+ q->classes[band].quantum))
+ goto nla_err;
+ }
+
+ nla_nest_end(skb, nest);
+ }
+
+ nest = nla_nest_start(skb, TCA_ETS_PRIOMAP);
+ if (!nest)
+ goto nla_err;
+
+ for (prio = 0; prio <= TC_PRIO_MAX; prio++) {
+ if (nla_put_u8(skb, TCA_ETS_PRIOMAP_BAND, q->prio2band[prio]))
+ goto nla_err;
+ }
+
+ nla_nest_end(skb, nest);
+
+ return nla_nest_end(skb, opts);
+
+nla_err:
+ nla_nest_cancel(skb, opts);
+ return -EMSGSIZE;
+}
+
+static const struct Qdisc_class_ops ets_class_ops = {
+ .change = ets_class_change,
+ .graft = ets_class_graft,
+ .leaf = ets_class_leaf,
+ .find = ets_class_find,
+ .qlen_notify = ets_class_qlen_notify,
+ .dump = ets_class_dump,
+ .dump_stats = ets_class_dump_stats,
+ .walk = ets_qdisc_walk,
+ .tcf_block = ets_qdisc_tcf_block,
+ .bind_tcf = ets_qdisc_bind_tcf,
+ .unbind_tcf = ets_qdisc_unbind_tcf,
+};
+
+static struct Qdisc_ops ets_qdisc_ops __read_mostly = {
+ .cl_ops = &ets_class_ops,
+ .id = "ets",
+ .priv_size = sizeof(struct ets_sched),
+ .enqueue = ets_qdisc_enqueue,
+ .dequeue = ets_qdisc_dequeue,
+ .peek = qdisc_peek_dequeued,
+ .change = ets_qdisc_change,
+ .init = ets_qdisc_init,
+ .reset = ets_qdisc_reset,
+ .destroy = ets_qdisc_destroy,
+ .dump = ets_qdisc_dump,
+ .owner = THIS_MODULE,
+};
+
+static int __init ets_init(void)
+{
+ return register_qdisc(&ets_qdisc_ops);
+}
+
+static void __exit ets_exit(void)
+{
+ unregister_qdisc(&ets_qdisc_ops);
+}
+
+module_init(ets_init);
+module_exit(ets_exit);
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 98dd87ce1510..a5a295477ecc 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -301,6 +301,9 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
f->socket_hash != sk->sk_hash)) {
f->credit = q->initial_quantum;
f->socket_hash = sk->sk_hash;
+ if (q->rate_enable)
+ smp_store_release(&sk->sk_pacing_status,
+ SK_PACING_FQ);
if (fq_flow_is_throttled(f))
fq_flow_unset_throttled(q, f);
f->time_next_packet = 0ULL;
@@ -322,8 +325,12 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
fq_flow_set_detached(f);
f->sk = sk;
- if (skb->sk == sk)
+ if (skb->sk == sk) {
f->socket_hash = sk->sk_hash;
+ if (q->rate_enable)
+ smp_store_release(&sk->sk_pacing_status,
+ SK_PACING_FQ);
+ }
f->credit = q->initial_quantum;
rb_link_node(&f->fq_node, parent, p);
@@ -428,17 +435,9 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
f->qlen++;
qdisc_qstats_backlog_inc(sch, skb);
if (fq_flow_is_detached(f)) {
- struct sock *sk = skb->sk;
-
fq_flow_add_tail(&q->new_flows, f);
if (time_after(jiffies, f->age + q->flow_refill_delay))
f->credit = max_t(u32, f->credit, q->quantum);
- if (sk && q->rate_enable) {
- if (unlikely(smp_load_acquire(&sk->sk_pacing_status) !=
- SK_PACING_FQ))
- smp_store_release(&sk->sk_pacing_status,
- SK_PACING_FQ);
- }
q->inactive_flows--;
}
@@ -530,8 +529,7 @@ begin:
fq_flow_set_throttled(q, f);
goto begin;
}
- if (time_next_packet &&
- (s64)(now - time_next_packet - q->ce_threshold) > 0) {
+ if ((s64)(now - time_next_packet - q->ce_threshold) > 0) {
INET_ECN_set_ce(skb);
q->stat_ce_mark++;
}
@@ -788,10 +786,12 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt,
if (tb[TCA_FQ_QUANTUM]) {
u32 quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]);
- if (quantum > 0)
+ if (quantum > 0 && quantum <= (1 << 20)) {
q->quantum = quantum;
- else
+ } else {
+ NL_SET_ERR_MSG_MOD(extack, "invalid quantum");
err = -EINVAL;
+ }
}
if (tb[TCA_FQ_INITIAL_QUANTUM])
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index d59fbcc745d1..968519ff36e9 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -14,7 +14,6 @@
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/skbuff.h>
-#include <linux/jhash.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/netlink.h>
@@ -45,7 +44,6 @@ struct fq_codel_flow {
struct sk_buff *tail;
struct list_head flowchain;
int deficit;
- u32 dropped; /* number of drops (or ECN marks) on this flow */
struct codel_vars cvars;
}; /* please try to keep this structure <= 64 bytes */
@@ -173,7 +171,8 @@ static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets,
__qdisc_drop(skb, to_free);
} while (++i < max_packets && len < threshold);
- flow->dropped += i;
+ /* Tell codel to increase its signal strength also */
+ flow->cvars.count += i;
q->backlogs[idx] -= len;
q->memory_usage -= mem;
sch->qstats.drops += i;
@@ -211,7 +210,6 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch,
list_add_tail(&flow->flowchain, &q->new_flows);
q->new_flow_count++;
flow->deficit = q->quantum;
- flow->dropped = 0;
}
get_codel_cb(skb)->mem_usage = skb->truesize;
q->memory_usage += get_codel_cb(skb)->mem_usage;
@@ -286,7 +284,6 @@ static struct sk_buff *fq_codel_dequeue(struct Qdisc *sch)
struct sk_buff *skb;
struct fq_codel_flow *flow;
struct list_head *head;
- u32 prev_drop_count, prev_ecn_mark;
begin:
head = &q->new_flows;
@@ -303,16 +300,10 @@ begin:
goto begin;
}
- prev_drop_count = q->cstats.drop_count;
- prev_ecn_mark = q->cstats.ecn_mark;
-
skb = codel_dequeue(sch, &sch->qstats.backlog, &q->cparams,
&flow->cvars, &q->cstats, qdisc_pkt_len,
codel_get_enqueue_time, drop_func, dequeue_func);
- flow->dropped += q->cstats.drop_count - prev_drop_count;
- flow->dropped += q->cstats.ecn_mark - prev_ecn_mark;
-
if (!skb) {
/* force a pass through old_flows to prevent starvation */
if ((head == &q->new_flows) && !list_empty(&q->old_flows))
@@ -658,7 +649,7 @@ static int fq_codel_dump_class_stats(struct Qdisc *sch, unsigned long cl,
sch_tree_unlock(sch);
}
qs.backlog = q->backlogs[idx];
- qs.drops = flow->dropped;
+ qs.drops = 0;
}
if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
return -1;
diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c
new file mode 100644
index 000000000000..214657eb3dfd
--- /dev/null
+++ b/net/sched/sch_fq_pie.c
@@ -0,0 +1,562 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Flow Queue PIE discipline
+ *
+ * Copyright (C) 2019 Mohit P. Tahiliani <tahiliani@nitk.edu.in>
+ * Copyright (C) 2019 Sachin D. Patil <sdp.sachin@gmail.com>
+ * Copyright (C) 2019 V. Saicharan <vsaicharan1998@gmail.com>
+ * Copyright (C) 2019 Mohit Bhasi <mohitbhasi1998@gmail.com>
+ * Copyright (C) 2019 Leslie Monis <lesliemonis@gmail.com>
+ * Copyright (C) 2019 Gautam Ramakrishnan <gautamramk@gmail.com>
+ */
+
+#include <linux/jhash.h>
+#include <linux/sizes.h>
+#include <linux/vmalloc.h>
+#include <net/pkt_cls.h>
+#include <net/pie.h>
+
+/* Flow Queue PIE
+ *
+ * Principles:
+ * - Packets are classified on flows.
+ * - This is a Stochastic model (as we use a hash, several flows might
+ * be hashed to the same slot)
+ * - Each flow has a PIE managed queue.
+ * - Flows are linked onto two (Round Robin) lists,
+ * so that new flows have priority on old ones.
+ * - For a given flow, packets are not reordered.
+ * - Drops during enqueue only.
+ * - ECN capability is off by default.
+ * - ECN threshold (if ECN is enabled) is at 10% by default.
+ * - Uses timestamps to calculate queue delay by default.
+ */
+
+/**
+ * struct fq_pie_flow - contains data for each flow
+ * @vars: pie vars associated with the flow
+ * @deficit: number of remaining byte credits
+ * @backlog: size of data in the flow
+ * @qlen: number of packets in the flow
+ * @flowchain: flowchain for the flow
+ * @head: first packet in the flow
+ * @tail: last packet in the flow
+ */
+struct fq_pie_flow {
+ struct pie_vars vars;
+ s32 deficit;
+ u32 backlog;
+ u32 qlen;
+ struct list_head flowchain;
+ struct sk_buff *head;
+ struct sk_buff *tail;
+};
+
+struct fq_pie_sched_data {
+ struct tcf_proto __rcu *filter_list; /* optional external classifier */
+ struct tcf_block *block;
+ struct fq_pie_flow *flows;
+ struct Qdisc *sch;
+ struct list_head old_flows;
+ struct list_head new_flows;
+ struct pie_params p_params;
+ u32 ecn_prob;
+ u32 flows_cnt;
+ u32 quantum;
+ u32 memory_limit;
+ u32 new_flow_count;
+ u32 memory_usage;
+ u32 overmemory;
+ struct pie_stats stats;
+ struct timer_list adapt_timer;
+};
+
+static unsigned int fq_pie_hash(const struct fq_pie_sched_data *q,
+ struct sk_buff *skb)
+{
+ return reciprocal_scale(skb_get_hash(skb), q->flows_cnt);
+}
+
+static unsigned int fq_pie_classify(struct sk_buff *skb, struct Qdisc *sch,
+ int *qerr)
+{
+ struct fq_pie_sched_data *q = qdisc_priv(sch);
+ struct tcf_proto *filter;
+ struct tcf_result res;
+ int result;
+
+ if (TC_H_MAJ(skb->priority) == sch->handle &&
+ TC_H_MIN(skb->priority) > 0 &&
+ TC_H_MIN(skb->priority) <= q->flows_cnt)
+ return TC_H_MIN(skb->priority);
+
+ filter = rcu_dereference_bh(q->filter_list);
+ if (!filter)
+ return fq_pie_hash(q, skb) + 1;
+
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+ result = tcf_classify(skb, filter, &res, false);
+ if (result >= 0) {
+#ifdef CONFIG_NET_CLS_ACT
+ switch (result) {
+ case TC_ACT_STOLEN:
+ case TC_ACT_QUEUED:
+ case TC_ACT_TRAP:
+ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+ /* fall through */
+ case TC_ACT_SHOT:
+ return 0;
+ }
+#endif
+ if (TC_H_MIN(res.classid) <= q->flows_cnt)
+ return TC_H_MIN(res.classid);
+ }
+ return 0;
+}
+
+/* add skb to flow queue (tail add) */
+static inline void flow_queue_add(struct fq_pie_flow *flow,
+ struct sk_buff *skb)
+{
+ if (!flow->head)
+ flow->head = skb;
+ else
+ flow->tail->next = skb;
+ flow->tail = skb;
+ skb->next = NULL;
+}
+
+static int fq_pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+ struct fq_pie_sched_data *q = qdisc_priv(sch);
+ struct fq_pie_flow *sel_flow;
+ int uninitialized_var(ret);
+ u8 memory_limited = false;
+ u8 enqueue = false;
+ u32 pkt_len;
+ u32 idx;
+
+ /* Classifies packet into corresponding flow */
+ idx = fq_pie_classify(skb, sch, &ret);
+ sel_flow = &q->flows[idx];
+
+ /* Checks whether adding a new packet would exceed memory limit */
+ get_pie_cb(skb)->mem_usage = skb->truesize;
+ memory_limited = q->memory_usage > q->memory_limit + skb->truesize;
+
+ /* Checks if the qdisc is full */
+ if (unlikely(qdisc_qlen(sch) >= sch->limit)) {
+ q->stats.overlimit++;
+ goto out;
+ } else if (unlikely(memory_limited)) {
+ q->overmemory++;
+ }
+
+ if (!pie_drop_early(sch, &q->p_params, &sel_flow->vars,
+ sel_flow->backlog, skb->len)) {
+ enqueue = true;
+ } else if (q->p_params.ecn &&
+ sel_flow->vars.prob <= (MAX_PROB / 100) * q->ecn_prob &&
+ INET_ECN_set_ce(skb)) {
+ /* If packet is ecn capable, mark it if drop probability
+ * is lower than the parameter ecn_prob, else drop it.
+ */
+ q->stats.ecn_mark++;
+ enqueue = true;
+ }
+ if (enqueue) {
+ /* Set enqueue time only when dq_rate_estimator is disabled. */
+ if (!q->p_params.dq_rate_estimator)
+ pie_set_enqueue_time(skb);
+
+ pkt_len = qdisc_pkt_len(skb);
+ q->stats.packets_in++;
+ q->memory_usage += skb->truesize;
+ sch->qstats.backlog += pkt_len;
+ sch->q.qlen++;
+ flow_queue_add(sel_flow, skb);
+ if (list_empty(&sel_flow->flowchain)) {
+ list_add_tail(&sel_flow->flowchain, &q->new_flows);
+ q->new_flow_count++;
+ sel_flow->deficit = q->quantum;
+ sel_flow->qlen = 0;
+ sel_flow->backlog = 0;
+ }
+ sel_flow->qlen++;
+ sel_flow->backlog += pkt_len;
+ return NET_XMIT_SUCCESS;
+ }
+out:
+ q->stats.dropped++;
+ sel_flow->vars.accu_prob = 0;
+ sel_flow->vars.accu_prob_overflows = 0;
+ __qdisc_drop(skb, to_free);
+ qdisc_qstats_drop(sch);
+ return NET_XMIT_CN;
+}
+
+static const struct nla_policy fq_pie_policy[TCA_FQ_PIE_MAX + 1] = {
+ [TCA_FQ_PIE_LIMIT] = {.type = NLA_U32},
+ [TCA_FQ_PIE_FLOWS] = {.type = NLA_U32},
+ [TCA_FQ_PIE_TARGET] = {.type = NLA_U32},
+ [TCA_FQ_PIE_TUPDATE] = {.type = NLA_U32},
+ [TCA_FQ_PIE_ALPHA] = {.type = NLA_U32},
+ [TCA_FQ_PIE_BETA] = {.type = NLA_U32},
+ [TCA_FQ_PIE_QUANTUM] = {.type = NLA_U32},
+ [TCA_FQ_PIE_MEMORY_LIMIT] = {.type = NLA_U32},
+ [TCA_FQ_PIE_ECN_PROB] = {.type = NLA_U32},
+ [TCA_FQ_PIE_ECN] = {.type = NLA_U32},
+ [TCA_FQ_PIE_BYTEMODE] = {.type = NLA_U32},
+ [TCA_FQ_PIE_DQ_RATE_ESTIMATOR] = {.type = NLA_U32},
+};
+
+static inline struct sk_buff *dequeue_head(struct fq_pie_flow *flow)
+{
+ struct sk_buff *skb = flow->head;
+
+ flow->head = skb->next;
+ skb->next = NULL;
+ return skb;
+}
+
+static struct sk_buff *fq_pie_qdisc_dequeue(struct Qdisc *sch)
+{
+ struct fq_pie_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb = NULL;
+ struct fq_pie_flow *flow;
+ struct list_head *head;
+ u32 pkt_len;
+
+begin:
+ head = &q->new_flows;
+ if (list_empty(head)) {
+ head = &q->old_flows;
+ if (list_empty(head))
+ return NULL;
+ }
+
+ flow = list_first_entry(head, struct fq_pie_flow, flowchain);
+ /* Flow has exhausted all its credits */
+ if (flow->deficit <= 0) {
+ flow->deficit += q->quantum;
+ list_move_tail(&flow->flowchain, &q->old_flows);
+ goto begin;
+ }
+
+ if (flow->head) {
+ skb = dequeue_head(flow);
+ pkt_len = qdisc_pkt_len(skb);
+ sch->qstats.backlog -= pkt_len;
+ sch->q.qlen--;
+ qdisc_bstats_update(sch, skb);
+ }
+
+ if (!skb) {
+ /* force a pass through old_flows to prevent starvation */
+ if (head == &q->new_flows && !list_empty(&q->old_flows))
+ list_move_tail(&flow->flowchain, &q->old_flows);
+ else
+ list_del_init(&flow->flowchain);
+ goto begin;
+ }
+
+ flow->qlen--;
+ flow->deficit -= pkt_len;
+ flow->backlog -= pkt_len;
+ q->memory_usage -= get_pie_cb(skb)->mem_usage;
+ pie_process_dequeue(skb, &q->p_params, &flow->vars, flow->backlog);
+ return skb;
+}
+
+static int fq_pie_change(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct fq_pie_sched_data *q = qdisc_priv(sch);
+ struct nlattr *tb[TCA_FQ_PIE_MAX + 1];
+ unsigned int len_dropped = 0;
+ unsigned int num_dropped = 0;
+ int err;
+
+ if (!opt)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_FQ_PIE_MAX, opt, fq_pie_policy, extack);
+ if (err < 0)
+ return err;
+
+ sch_tree_lock(sch);
+ if (tb[TCA_FQ_PIE_LIMIT]) {
+ u32 limit = nla_get_u32(tb[TCA_FQ_PIE_LIMIT]);
+
+ q->p_params.limit = limit;
+ sch->limit = limit;
+ }
+ if (tb[TCA_FQ_PIE_FLOWS]) {
+ if (q->flows) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Number of flows cannot be changed");
+ goto flow_error;
+ }
+ q->flows_cnt = nla_get_u32(tb[TCA_FQ_PIE_FLOWS]);
+ if (!q->flows_cnt || q->flows_cnt > 65536) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Number of flows must be < 65536");
+ goto flow_error;
+ }
+ }
+
+ /* convert from microseconds to pschedtime */
+ if (tb[TCA_FQ_PIE_TARGET]) {
+ /* target is in us */
+ u32 target = nla_get_u32(tb[TCA_FQ_PIE_TARGET]);
+
+ /* convert to pschedtime */
+ q->p_params.target =
+ PSCHED_NS2TICKS((u64)target * NSEC_PER_USEC);
+ }
+
+ /* tupdate is in jiffies */
+ if (tb[TCA_FQ_PIE_TUPDATE])
+ q->p_params.tupdate =
+ usecs_to_jiffies(nla_get_u32(tb[TCA_FQ_PIE_TUPDATE]));
+
+ if (tb[TCA_FQ_PIE_ALPHA])
+ q->p_params.alpha = nla_get_u32(tb[TCA_FQ_PIE_ALPHA]);
+
+ if (tb[TCA_FQ_PIE_BETA])
+ q->p_params.beta = nla_get_u32(tb[TCA_FQ_PIE_BETA]);
+
+ if (tb[TCA_FQ_PIE_QUANTUM])
+ q->quantum = nla_get_u32(tb[TCA_FQ_PIE_QUANTUM]);
+
+ if (tb[TCA_FQ_PIE_MEMORY_LIMIT])
+ q->memory_limit = nla_get_u32(tb[TCA_FQ_PIE_MEMORY_LIMIT]);
+
+ if (tb[TCA_FQ_PIE_ECN_PROB])
+ q->ecn_prob = nla_get_u32(tb[TCA_FQ_PIE_ECN_PROB]);
+
+ if (tb[TCA_FQ_PIE_ECN])
+ q->p_params.ecn = nla_get_u32(tb[TCA_FQ_PIE_ECN]);
+
+ if (tb[TCA_FQ_PIE_BYTEMODE])
+ q->p_params.bytemode = nla_get_u32(tb[TCA_FQ_PIE_BYTEMODE]);
+
+ if (tb[TCA_FQ_PIE_DQ_RATE_ESTIMATOR])
+ q->p_params.dq_rate_estimator =
+ nla_get_u32(tb[TCA_FQ_PIE_DQ_RATE_ESTIMATOR]);
+
+ /* Drop excess packets if new limit is lower */
+ while (sch->q.qlen > sch->limit) {
+ struct sk_buff *skb = fq_pie_qdisc_dequeue(sch);
+
+ len_dropped += qdisc_pkt_len(skb);
+ num_dropped += 1;
+ rtnl_kfree_skbs(skb, skb);
+ }
+ qdisc_tree_reduce_backlog(sch, num_dropped, len_dropped);
+
+ sch_tree_unlock(sch);
+ return 0;
+
+flow_error:
+ sch_tree_unlock(sch);
+ return -EINVAL;
+}
+
+static void fq_pie_timer(struct timer_list *t)
+{
+ struct fq_pie_sched_data *q = from_timer(q, t, adapt_timer);
+ struct Qdisc *sch = q->sch;
+ spinlock_t *root_lock; /* to lock qdisc for probability calculations */
+ u16 idx;
+
+ root_lock = qdisc_lock(qdisc_root_sleeping(sch));
+ spin_lock(root_lock);
+
+ for (idx = 0; idx < q->flows_cnt; idx++)
+ pie_calculate_probability(&q->p_params, &q->flows[idx].vars,
+ q->flows[idx].backlog);
+
+ /* reset the timer to fire after 'tupdate' jiffies. */
+ if (q->p_params.tupdate)
+ mod_timer(&q->adapt_timer, jiffies + q->p_params.tupdate);
+
+ spin_unlock(root_lock);
+}
+
+static int fq_pie_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct fq_pie_sched_data *q = qdisc_priv(sch);
+ int err;
+ u16 idx;
+
+ pie_params_init(&q->p_params);
+ sch->limit = 10 * 1024;
+ q->p_params.limit = sch->limit;
+ q->quantum = psched_mtu(qdisc_dev(sch));
+ q->sch = sch;
+ q->ecn_prob = 10;
+ q->flows_cnt = 1024;
+ q->memory_limit = SZ_32M;
+
+ INIT_LIST_HEAD(&q->new_flows);
+ INIT_LIST_HEAD(&q->old_flows);
+
+ if (opt) {
+ err = fq_pie_change(sch, opt, extack);
+
+ if (err)
+ return err;
+ }
+
+ err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
+ if (err)
+ goto init_failure;
+
+ q->flows = kvcalloc(q->flows_cnt, sizeof(struct fq_pie_flow),
+ GFP_KERNEL);
+ if (!q->flows) {
+ err = -ENOMEM;
+ goto init_failure;
+ }
+ for (idx = 0; idx < q->flows_cnt; idx++) {
+ struct fq_pie_flow *flow = q->flows + idx;
+
+ INIT_LIST_HEAD(&flow->flowchain);
+ pie_vars_init(&flow->vars);
+ }
+
+ timer_setup(&q->adapt_timer, fq_pie_timer, 0);
+ mod_timer(&q->adapt_timer, jiffies + HZ / 2);
+
+ return 0;
+
+init_failure:
+ q->flows_cnt = 0;
+
+ return err;
+}
+
+static int fq_pie_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct fq_pie_sched_data *q = qdisc_priv(sch);
+ struct nlattr *opts;
+
+ opts = nla_nest_start(skb, TCA_OPTIONS);
+ if (!opts)
+ return -EMSGSIZE;
+
+ /* convert target from pschedtime to us */
+ if (nla_put_u32(skb, TCA_FQ_PIE_LIMIT, sch->limit) ||
+ nla_put_u32(skb, TCA_FQ_PIE_FLOWS, q->flows_cnt) ||
+ nla_put_u32(skb, TCA_FQ_PIE_TARGET,
+ ((u32)PSCHED_TICKS2NS(q->p_params.target)) /
+ NSEC_PER_USEC) ||
+ nla_put_u32(skb, TCA_FQ_PIE_TUPDATE,
+ jiffies_to_usecs(q->p_params.tupdate)) ||
+ nla_put_u32(skb, TCA_FQ_PIE_ALPHA, q->p_params.alpha) ||
+ nla_put_u32(skb, TCA_FQ_PIE_BETA, q->p_params.beta) ||
+ nla_put_u32(skb, TCA_FQ_PIE_QUANTUM, q->quantum) ||
+ nla_put_u32(skb, TCA_FQ_PIE_MEMORY_LIMIT, q->memory_limit) ||
+ nla_put_u32(skb, TCA_FQ_PIE_ECN_PROB, q->ecn_prob) ||
+ nla_put_u32(skb, TCA_FQ_PIE_ECN, q->p_params.ecn) ||
+ nla_put_u32(skb, TCA_FQ_PIE_BYTEMODE, q->p_params.bytemode) ||
+ nla_put_u32(skb, TCA_FQ_PIE_DQ_RATE_ESTIMATOR,
+ q->p_params.dq_rate_estimator))
+ goto nla_put_failure;
+
+ return nla_nest_end(skb, opts);
+
+nla_put_failure:
+ nla_nest_cancel(skb, opts);
+ return -EMSGSIZE;
+}
+
+static int fq_pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+ struct fq_pie_sched_data *q = qdisc_priv(sch);
+ struct tc_fq_pie_xstats st = {
+ .packets_in = q->stats.packets_in,
+ .overlimit = q->stats.overlimit,
+ .overmemory = q->overmemory,
+ .dropped = q->stats.dropped,
+ .ecn_mark = q->stats.ecn_mark,
+ .new_flow_count = q->new_flow_count,
+ .memory_usage = q->memory_usage,
+ };
+ struct list_head *pos;
+
+ sch_tree_lock(sch);
+ list_for_each(pos, &q->new_flows)
+ st.new_flows_len++;
+
+ list_for_each(pos, &q->old_flows)
+ st.old_flows_len++;
+ sch_tree_unlock(sch);
+
+ return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+
+static void fq_pie_reset(struct Qdisc *sch)
+{
+ struct fq_pie_sched_data *q = qdisc_priv(sch);
+ u16 idx;
+
+ INIT_LIST_HEAD(&q->new_flows);
+ INIT_LIST_HEAD(&q->old_flows);
+ for (idx = 0; idx < q->flows_cnt; idx++) {
+ struct fq_pie_flow *flow = q->flows + idx;
+
+ /* Removes all packets from flow */
+ rtnl_kfree_skbs(flow->head, flow->tail);
+ flow->head = NULL;
+
+ INIT_LIST_HEAD(&flow->flowchain);
+ pie_vars_init(&flow->vars);
+ }
+
+ sch->q.qlen = 0;
+ sch->qstats.backlog = 0;
+}
+
+static void fq_pie_destroy(struct Qdisc *sch)
+{
+ struct fq_pie_sched_data *q = qdisc_priv(sch);
+
+ tcf_block_put(q->block);
+ del_timer_sync(&q->adapt_timer);
+ kvfree(q->flows);
+}
+
+static struct Qdisc_ops fq_pie_qdisc_ops __read_mostly = {
+ .id = "fq_pie",
+ .priv_size = sizeof(struct fq_pie_sched_data),
+ .enqueue = fq_pie_qdisc_enqueue,
+ .dequeue = fq_pie_qdisc_dequeue,
+ .peek = qdisc_peek_dequeued,
+ .init = fq_pie_init,
+ .destroy = fq_pie_destroy,
+ .reset = fq_pie_reset,
+ .change = fq_pie_change,
+ .dump = fq_pie_dump,
+ .dump_stats = fq_pie_dump_stats,
+ .owner = THIS_MODULE,
+};
+
+static int __init fq_pie_module_init(void)
+{
+ return register_qdisc(&fq_pie_qdisc_ops);
+}
+
+static void __exit fq_pie_module_exit(void)
+{
+ unregister_qdisc(&fq_pie_qdisc_ops);
+}
+
+module_init(fq_pie_module_init);
+module_exit(fq_pie_module_exit);
+
+MODULE_DESCRIPTION("Flow Queue Proportional Integral controller Enhanced (FQ-PIE)");
+MODULE_AUTHOR("Mohit P. Tahiliani");
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 11c03cf4aa74..6c9595f1048a 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -46,6 +46,8 @@ EXPORT_SYMBOL(default_qdisc_ops);
* - updates to tree and tree walking are only done under the rtnl mutex.
*/
+#define SKB_XOFF_MAGIC ((struct sk_buff *)1UL)
+
static inline struct sk_buff *__skb_dequeue_bad_txq(struct Qdisc *q)
{
const struct netdev_queue *txq = q->dev_queue;
@@ -71,7 +73,7 @@ static inline struct sk_buff *__skb_dequeue_bad_txq(struct Qdisc *q)
q->q.qlen--;
}
} else {
- skb = NULL;
+ skb = SKB_XOFF_MAGIC;
}
}
@@ -253,8 +255,11 @@ validate:
return skb;
skb = qdisc_dequeue_skb_bad_txq(q);
- if (unlikely(skb))
+ if (unlikely(skb)) {
+ if (skb == SKB_XOFF_MAGIC)
+ return NULL;
goto bulk;
+ }
skb = q->dequeue(q);
if (skb) {
bulk:
@@ -377,13 +382,8 @@ void __qdisc_run(struct Qdisc *q)
int packets;
while (qdisc_restart(q, &packets)) {
- /*
- * Ordered by possible occurrence: Postpone processing if
- * 1. we've exceeded packet quota
- * 2. another process needs the CPU;
- */
quota -= packets;
- if (quota <= 0 || need_resched()) {
+ if (quota <= 0) {
__netif_schedule(q);
break;
}
@@ -441,7 +441,7 @@ static void dev_watchdog(struct timer_list *t)
trace_net_dev_xmit_timeout(dev, i);
WARN_ONCE(1, KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit queue %u timed out\n",
dev->name, netdev_drivername(dev), i);
- dev->netdev_ops->ndo_tx_timeout(dev);
+ dev->netdev_ops->ndo_tx_timeout(dev, i);
}
if (!mod_timer(&dev->watchdog_timer,
round_jiffies(jiffies +
@@ -624,8 +624,12 @@ static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
err = skb_array_produce(q, skb);
- if (unlikely(err))
- return qdisc_drop_cpu(skb, qdisc, to_free);
+ if (unlikely(err)) {
+ if (qdisc_is_percpu_stats(qdisc))
+ return qdisc_drop_cpu(skb, qdisc, to_free);
+ else
+ return qdisc_drop(skb, qdisc, to_free);
+ }
qdisc_update_stats_at_enqueue(qdisc, pkt_len);
return NET_XMIT_SUCCESS;
@@ -648,7 +652,7 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
if (likely(skb)) {
qdisc_update_stats_at_dequeue(qdisc, skb);
} else {
- qdisc->empty = true;
+ WRITE_ONCE(qdisc->empty, true);
}
return skb;
@@ -688,11 +692,14 @@ static void pfifo_fast_reset(struct Qdisc *qdisc)
kfree_skb(skb);
}
- for_each_possible_cpu(i) {
- struct gnet_stats_queue *q = per_cpu_ptr(qdisc->cpu_qstats, i);
+ if (qdisc_is_percpu_stats(qdisc)) {
+ for_each_possible_cpu(i) {
+ struct gnet_stats_queue *q;
- q->backlog = 0;
- q->qlen = 0;
+ q = per_cpu_ptr(qdisc->cpu_qstats, i);
+ q->backlog = 0;
+ q->qlen = 0;
+ }
}
}
@@ -787,9 +794,6 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = {
};
EXPORT_SYMBOL(pfifo_fast_ops);
-static struct lock_class_key qdisc_tx_busylock;
-static struct lock_class_key qdisc_running_key;
-
struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
const struct Qdisc_ops *ops,
struct netlink_ext_ack *extack)
@@ -842,17 +846,9 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
}
spin_lock_init(&sch->busylock);
- lockdep_set_class(&sch->busylock,
- dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
-
/* seqlock has the same scope of busylock, for NOLOCK qdisc */
spin_lock_init(&sch->seqlock);
- lockdep_set_class(&sch->busylock,
- dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
-
seqcount_init(&sch->running);
- lockdep_set_class(&sch->running,
- dev->qdisc_running_key ?: &qdisc_running_key);
sch->ops = ops;
sch->flags = ops->static_flags;
@@ -863,6 +859,12 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
dev_hold(dev);
refcount_set(&sch->refcnt, 1);
+ if (sch != &noop_qdisc) {
+ lockdep_set_class(&sch->busylock, &dev->qdisc_tx_busylock_key);
+ lockdep_set_class(&sch->seqlock, &dev->qdisc_tx_busylock_key);
+ lockdep_set_class(&sch->running, &dev->qdisc_running_key);
+ }
+
return sch;
errout1:
kfree(p);
@@ -973,6 +975,9 @@ static void qdisc_destroy(struct Qdisc *qdisc)
void qdisc_put(struct Qdisc *qdisc)
{
+ if (!qdisc)
+ return;
+
if (qdisc->flags & TCQ_F_BUILTIN ||
!refcount_dec_and_test(&qdisc->refcnt))
return;
@@ -1028,6 +1033,8 @@ static void attach_one_default_qdisc(struct net_device *dev,
if (dev->priv_flags & IFF_NO_QUEUE)
ops = &noqueue_qdisc_ops;
+ else if(dev->type == ARPHRD_CAN)
+ ops = &pfifo_fast_ops;
qdisc = qdisc_create_dflt(dev_queue, ops, TC_H_ROOT, NULL);
if (!qdisc) {
@@ -1202,8 +1209,13 @@ void dev_deactivate_many(struct list_head *head)
/* Wait for outstanding qdisc_run calls. */
list_for_each_entry(dev, head, close_list) {
- while (some_qdisc_is_busy(dev))
- yield();
+ while (some_qdisc_is_busy(dev)) {
+ /* wait_event() would avoid this sleep-loop but would
+ * require expensive checks in the fast paths of packet
+ * processing which isn't worth it.
+ */
+ schedule_timeout_uninterruptible(1);
+ }
/* The new qdisc is assigned at this point so we can safely
* unwind stale skb lists and qdisc statistics
*/
diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c
index cee6971c1c82..be35f03b657b 100644
--- a/net/sched/sch_hhf.c
+++ b/net/sched/sch_hhf.c
@@ -5,11 +5,11 @@
* Copyright (C) 2013 Nandita Dukkipati <nanditad@google.com>
*/
-#include <linux/jhash.h>
#include <linux/jiffies.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
+#include <linux/siphash.h>
#include <net/pkt_sched.h>
#include <net/sock.h>
@@ -126,7 +126,7 @@ struct wdrr_bucket {
struct hhf_sched_data {
struct wdrr_bucket buckets[WDRR_BUCKET_CNT];
- u32 perturbation; /* hash perturbation */
+ siphash_key_t perturbation; /* hash perturbation */
u32 quantum; /* psched_mtu(qdisc_dev(sch)); */
u32 drop_overlimit; /* number of times max qdisc packet
* limit was hit
@@ -264,7 +264,7 @@ static enum wdrr_bucket_idx hhf_classify(struct sk_buff *skb, struct Qdisc *sch)
}
/* Get hashed flow-id of the skb. */
- hash = skb_get_hash_perturb(skb, q->perturbation);
+ hash = skb_get_hash_perturb(skb, &q->perturbation);
/* Check if this packet belongs to an already established HH flow. */
flow_pos = hash & HHF_BIT_MASK;
@@ -531,7 +531,7 @@ static int hhf_change(struct Qdisc *sch, struct nlattr *opt,
new_hhf_non_hh_weight = nla_get_u32(tb[TCA_HHF_NON_HH_WEIGHT]);
non_hh_quantum = (u64)new_quantum * new_hhf_non_hh_weight;
- if (non_hh_quantum > INT_MAX)
+ if (non_hh_quantum == 0 || non_hh_quantum > INT_MAX)
return -EINVAL;
sch_tree_lock(sch);
@@ -582,7 +582,7 @@ static int hhf_init(struct Qdisc *sch, struct nlattr *opt,
sch->limit = 1000;
q->quantum = psched_mtu(qdisc_dev(sch));
- q->perturbation = prandom_u32();
+ get_random_bytes(&q->perturbation, sizeof(q->perturbation));
INIT_LIST_HEAD(&q->new_buckets);
INIT_LIST_HEAD(&q->old_buckets);
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 7bcf20ef9145..8184c87da8be 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -1302,6 +1302,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
struct htb_class *cl = (struct htb_class *)*arg, *parent;
struct nlattr *opt = tca[TCA_OPTIONS];
struct nlattr *tb[TCA_HTB_MAX + 1];
+ struct Qdisc *parent_qdisc = NULL;
struct tc_htb_opt *hopt;
u64 rate64, ceil64;
int warn = 0;
@@ -1401,7 +1402,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
if (parent && !parent->level) {
/* turn parent into inner node */
qdisc_purge_queue(parent->leaf.q);
- qdisc_put(parent->leaf.q);
+ parent_qdisc = parent->leaf.q;
if (parent->prio_activity)
htb_deactivate(q, parent);
@@ -1480,6 +1481,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
cl->cbuffer = PSCHED_TICKS2NS(hopt->cbuffer);
sch_tree_unlock(sch);
+ qdisc_put(parent_qdisc);
if (warn)
pr_warn("HTB: quantum of class %X is %s. Consider r2q change.\n",
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index 0d578333e967..e79f1afe0cfd 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -153,6 +153,7 @@ static int mq_dump(struct Qdisc *sch, struct sk_buff *skb)
__gnet_stats_copy_queue(&sch->qstats,
qdisc->cpu_qstats,
&qdisc->qstats, qlen);
+ sch->q.qlen += qlen;
} else {
sch->q.qlen += qdisc->q.qlen;
sch->bstats.bytes += qdisc->bstats.bytes;
@@ -245,7 +246,8 @@ static int mq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
struct netdev_queue *dev_queue = mq_queue_get(sch, cl);
sch = dev_queue->qdisc_sleeping;
- if (gnet_stats_copy_basic(&sch->running, d, NULL, &sch->bstats) < 0 ||
+ if (gnet_stats_copy_basic(&sch->running, d, sch->cpu_bstats,
+ &sch->bstats) < 0 ||
qdisc_qstats_copy(d, sch) < 0)
return -1;
return 0;
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index 46980b8d66c5..8766ab5b8788 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -411,6 +411,7 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb)
__gnet_stats_copy_queue(&sch->qstats,
qdisc->cpu_qstats,
&qdisc->qstats, qlen);
+ sch->q.qlen += qlen;
} else {
sch->q.qlen += qdisc->q.qlen;
sch->bstats.bytes += qdisc->bstats.bytes;
@@ -433,7 +434,7 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb)
opt.offset[tc] = dev->tc_to_txq[tc].offset;
}
- if (nla_put(skb, TCA_OPTIONS, NLA_ALIGN(sizeof(opt)), &opt))
+ if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
goto nla_put_failure;
if ((priv->flags & TC_MQPRIO_F_MODE) &&
@@ -557,8 +558,8 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);
sch = dev_queue->qdisc_sleeping;
- if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
- d, NULL, &sch->bstats) < 0 ||
+ if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), d,
+ sch->cpu_bstats, &sch->bstats) < 0 ||
qdisc_qstats_copy(d, sch) < 0)
return -1;
}
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index e1087746f6a2..1330ad224931 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -174,7 +174,8 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt,
{
struct multiq_sched_data *q = qdisc_priv(sch);
struct tc_multiq_qopt *qopt;
- int i;
+ struct Qdisc **removed;
+ int i, n_removed = 0;
if (!netif_is_multiqueue(qdisc_dev(sch)))
return -EOPNOTSUPP;
@@ -185,6 +186,11 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt,
qopt->bands = qdisc_dev(sch)->real_num_tx_queues;
+ removed = kmalloc(sizeof(*removed) * (q->max_bands - q->bands),
+ GFP_KERNEL);
+ if (!removed)
+ return -ENOMEM;
+
sch_tree_lock(sch);
q->bands = qopt->bands;
for (i = q->bands; i < q->max_bands; i++) {
@@ -192,13 +198,17 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt,
struct Qdisc *child = q->queues[i];
q->queues[i] = &noop_qdisc;
- qdisc_tree_flush_backlog(child);
- qdisc_put(child);
+ qdisc_purge_queue(child);
+ removed[n_removed++] = child;
}
}
sch_tree_unlock(sch);
+ for (i = 0; i < n_removed; i++)
+ qdisc_put(removed[i]);
+ kfree(removed);
+
for (i = 0; i < q->bands; i++) {
if (q->queues[i] == &noop_qdisc) {
struct Qdisc *child, *old;
@@ -213,11 +223,10 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt,
if (child != &noop_qdisc)
qdisc_hash_add(child, true);
- if (old != &noop_qdisc) {
- qdisc_tree_flush_backlog(old);
- qdisc_put(old);
- }
+ if (old != &noop_qdisc)
+ qdisc_purge_queue(old);
sch_tree_unlock(sch);
+ qdisc_put(old);
}
}
}
@@ -330,7 +339,7 @@ static int multiq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
cl_q = q->queues[cl - 1];
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
- d, NULL, &cl_q->bstats) < 0 ||
+ d, cl_q->cpu_bstats, &cl_q->bstats) < 0 ||
qdisc_qstats_copy(d, cl_q) < 0)
return -1;
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index b17f2ed970e2..42e557d48e4e 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -476,7 +476,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
* skb will be queued.
*/
if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
- struct Qdisc *rootq = qdisc_root(sch);
+ struct Qdisc *rootq = qdisc_root_bh(sch);
u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
q->duplicate = 0;
@@ -509,6 +509,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
if (skb->ip_summed == CHECKSUM_PARTIAL &&
skb_checksum_help(skb)) {
qdisc_drop(skb, sch, to_free);
+ skb = NULL;
goto finish_segs;
}
@@ -593,9 +594,10 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
finish_segs:
if (segs) {
unsigned int len, last_len;
- int nb = 0;
+ int nb;
- len = skb->len;
+ len = skb ? skb->len : 0;
+ nb = skb ? 1 : 0;
while (segs) {
skb2 = segs->next;
@@ -612,7 +614,10 @@ finish_segs:
}
segs = skb2;
}
- qdisc_tree_reduce_backlog(sch, -nb, prev_len - len);
+ /* Parent qdiscs accounted for 1 skb of size @prev_len */
+ qdisc_tree_reduce_backlog(sch, -(nb - 1), -(len - prev_len));
+ } else if (!skb) {
+ return NET_XMIT_DROP;
}
return NET_XMIT_SUCCESS;
}
@@ -777,7 +782,7 @@ static int get_dist_table(struct Qdisc *sch, struct disttable **tbl,
struct disttable *d;
int i;
- if (n > NETEM_DIST_MAX)
+ if (!n || n > NETEM_DIST_MAX)
return -EINVAL;
d = kvmalloc(sizeof(struct disttable) + n * sizeof(s16), GFP_KERNEL);
diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c
index df98a887eb89..915bcdb59a9f 100644
--- a/net/sched/sch_pie.c
+++ b/net/sched/sch_pie.c
@@ -19,134 +19,76 @@
#include <linux/skbuff.h>
#include <net/pkt_sched.h>
#include <net/inet_ecn.h>
-
-#define QUEUE_THRESHOLD 16384
-#define DQCOUNT_INVALID -1
-#define MAX_PROB 0xffffffffffffffff
-#define PIE_SCALE 8
-
-/* parameters used */
-struct pie_params {
- psched_time_t target; /* user specified target delay in pschedtime */
- u32 tupdate; /* timer frequency (in jiffies) */
- u32 limit; /* number of packets that can be enqueued */
- u32 alpha; /* alpha and beta are between 0 and 32 */
- u32 beta; /* and are used for shift relative to 1 */
- bool ecn; /* true if ecn is enabled */
- bool bytemode; /* to scale drop early prob based on pkt size */
-};
-
-/* variables used */
-struct pie_vars {
- u64 prob; /* probability but scaled by u64 limit. */
- psched_time_t burst_time;
- psched_time_t qdelay;
- psched_time_t qdelay_old;
- u64 dq_count; /* measured in bytes */
- psched_time_t dq_tstamp; /* drain rate */
- u64 accu_prob; /* accumulated drop probability */
- u32 avg_dq_rate; /* bytes per pschedtime tick,scaled */
- u32 qlen_old; /* in bytes */
- u8 accu_prob_overflows; /* overflows of accu_prob */
-};
-
-/* statistics gathering */
-struct pie_stats {
- u32 packets_in; /* total number of packets enqueued */
- u32 dropped; /* packets dropped due to pie_action */
- u32 overlimit; /* dropped due to lack of space in queue */
- u32 maxq; /* maximum queue size */
- u32 ecn_mark; /* packets marked with ECN */
-};
+#include <net/pie.h>
/* private data for the Qdisc */
struct pie_sched_data {
- struct pie_params params;
struct pie_vars vars;
+ struct pie_params params;
struct pie_stats stats;
struct timer_list adapt_timer;
struct Qdisc *sch;
};
-static void pie_params_init(struct pie_params *params)
-{
- params->alpha = 2;
- params->beta = 20;
- params->tupdate = usecs_to_jiffies(15 * USEC_PER_MSEC); /* 15 ms */
- params->limit = 1000; /* default of 1000 packets */
- params->target = PSCHED_NS2TICKS(15 * NSEC_PER_MSEC); /* 15 ms */
- params->ecn = false;
- params->bytemode = false;
-}
-
-static void pie_vars_init(struct pie_vars *vars)
+bool pie_drop_early(struct Qdisc *sch, struct pie_params *params,
+ struct pie_vars *vars, u32 qlen, u32 packet_size)
{
- vars->dq_count = DQCOUNT_INVALID;
- vars->accu_prob = 0;
- vars->avg_dq_rate = 0;
- /* default of 150 ms in pschedtime */
- vars->burst_time = PSCHED_NS2TICKS(150 * NSEC_PER_MSEC);
- vars->accu_prob_overflows = 0;
-}
-
-static bool drop_early(struct Qdisc *sch, u32 packet_size)
-{
- struct pie_sched_data *q = qdisc_priv(sch);
u64 rnd;
- u64 local_prob = q->vars.prob;
+ u64 local_prob = vars->prob;
u32 mtu = psched_mtu(qdisc_dev(sch));
/* If there is still burst allowance left skip random early drop */
- if (q->vars.burst_time > 0)
+ if (vars->burst_time > 0)
return false;
/* If current delay is less than half of target, and
* if drop prob is low already, disable early_drop
*/
- if ((q->vars.qdelay < q->params.target / 2) &&
- (q->vars.prob < MAX_PROB / 5))
+ if ((vars->qdelay < params->target / 2) &&
+ (vars->prob < MAX_PROB / 5))
return false;
- /* If we have fewer than 2 mtu-sized packets, disable drop_early,
+ /* If we have fewer than 2 mtu-sized packets, disable pie_drop_early,
* similar to min_th in RED
*/
- if (sch->qstats.backlog < 2 * mtu)
+ if (qlen < 2 * mtu)
return false;
/* If bytemode is turned on, use packet size to compute new
* probablity. Smaller packets will have lower drop prob in this case
*/
- if (q->params.bytemode && packet_size <= mtu)
+ if (params->bytemode && packet_size <= mtu)
local_prob = (u64)packet_size * div_u64(local_prob, mtu);
else
- local_prob = q->vars.prob;
+ local_prob = vars->prob;
if (local_prob == 0) {
- q->vars.accu_prob = 0;
- q->vars.accu_prob_overflows = 0;
+ vars->accu_prob = 0;
+ vars->accu_prob_overflows = 0;
}
- if (local_prob > MAX_PROB - q->vars.accu_prob)
- q->vars.accu_prob_overflows++;
+ if (local_prob > MAX_PROB - vars->accu_prob)
+ vars->accu_prob_overflows++;
- q->vars.accu_prob += local_prob;
+ vars->accu_prob += local_prob;
- if (q->vars.accu_prob_overflows == 0 &&
- q->vars.accu_prob < (MAX_PROB / 100) * 85)
+ if (vars->accu_prob_overflows == 0 &&
+ vars->accu_prob < (MAX_PROB / 100) * 85)
return false;
- if (q->vars.accu_prob_overflows == 8 &&
- q->vars.accu_prob >= MAX_PROB / 2)
+ if (vars->accu_prob_overflows == 8 &&
+ vars->accu_prob >= MAX_PROB / 2)
return true;
prandom_bytes(&rnd, 8);
if (rnd < local_prob) {
- q->vars.accu_prob = 0;
- q->vars.accu_prob_overflows = 0;
+ vars->accu_prob = 0;
+ vars->accu_prob_overflows = 0;
return true;
}
return false;
}
+EXPORT_SYMBOL_GPL(pie_drop_early);
static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
struct sk_buff **to_free)
@@ -159,7 +101,8 @@ static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
goto out;
}
- if (!drop_early(sch, skb->len)) {
+ if (!pie_drop_early(sch, &q->params, &q->vars, sch->qstats.backlog,
+ skb->len)) {
enqueue = true;
} else if (q->params.ecn && (q->vars.prob <= MAX_PROB / 10) &&
INET_ECN_set_ce(skb)) {
@@ -172,6 +115,10 @@ static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
/* we can enqueue the packet */
if (enqueue) {
+ /* Set enqueue time only when dq_rate_estimator is disabled. */
+ if (!q->params.dq_rate_estimator)
+ pie_set_enqueue_time(skb);
+
q->stats.packets_in++;
if (qdisc_qlen(sch) > q->stats.maxq)
q->stats.maxq = qdisc_qlen(sch);
@@ -187,13 +134,14 @@ out:
}
static const struct nla_policy pie_policy[TCA_PIE_MAX + 1] = {
- [TCA_PIE_TARGET] = {.type = NLA_U32},
- [TCA_PIE_LIMIT] = {.type = NLA_U32},
- [TCA_PIE_TUPDATE] = {.type = NLA_U32},
- [TCA_PIE_ALPHA] = {.type = NLA_U32},
- [TCA_PIE_BETA] = {.type = NLA_U32},
- [TCA_PIE_ECN] = {.type = NLA_U32},
- [TCA_PIE_BYTEMODE] = {.type = NLA_U32},
+ [TCA_PIE_TARGET] = {.type = NLA_U32},
+ [TCA_PIE_LIMIT] = {.type = NLA_U32},
+ [TCA_PIE_TUPDATE] = {.type = NLA_U32},
+ [TCA_PIE_ALPHA] = {.type = NLA_U32},
+ [TCA_PIE_BETA] = {.type = NLA_U32},
+ [TCA_PIE_ECN] = {.type = NLA_U32},
+ [TCA_PIE_BYTEMODE] = {.type = NLA_U32},
+ [TCA_PIE_DQ_RATE_ESTIMATOR] = {.type = NLA_U32},
};
static int pie_change(struct Qdisc *sch, struct nlattr *opt,
@@ -247,6 +195,10 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt,
if (tb[TCA_PIE_BYTEMODE])
q->params.bytemode = nla_get_u32(tb[TCA_PIE_BYTEMODE]);
+ if (tb[TCA_PIE_DQ_RATE_ESTIMATOR])
+ q->params.dq_rate_estimator =
+ nla_get_u32(tb[TCA_PIE_DQ_RATE_ESTIMATOR]);
+
/* Drop excess packets if new limit is lower */
qlen = sch->q.qlen;
while (sch->q.qlen > sch->limit) {
@@ -262,48 +214,69 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt,
return 0;
}
-static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb)
+void pie_process_dequeue(struct sk_buff *skb, struct pie_params *params,
+ struct pie_vars *vars, u32 qlen)
{
- struct pie_sched_data *q = qdisc_priv(sch);
- int qlen = sch->qstats.backlog; /* current queue size in bytes */
+ psched_time_t now = psched_get_time();
+ u32 dtime = 0;
+
+ /* If dq_rate_estimator is disabled, calculate qdelay using the
+ * packet timestamp.
+ */
+ if (!params->dq_rate_estimator) {
+ vars->qdelay = now - pie_get_enqueue_time(skb);
+
+ if (vars->dq_tstamp != DTIME_INVALID)
+ dtime = now - vars->dq_tstamp;
+
+ vars->dq_tstamp = now;
+
+ if (qlen == 0)
+ vars->qdelay = 0;
+
+ if (dtime == 0)
+ return;
+
+ goto burst_allowance_reduction;
+ }
/* If current queue is about 10 packets or more and dq_count is unset
* we have enough packets to calculate the drain rate. Save
* current time as dq_tstamp and start measurement cycle.
*/
- if (qlen >= QUEUE_THRESHOLD && q->vars.dq_count == DQCOUNT_INVALID) {
- q->vars.dq_tstamp = psched_get_time();
- q->vars.dq_count = 0;
+ if (qlen >= QUEUE_THRESHOLD && vars->dq_count == DQCOUNT_INVALID) {
+ vars->dq_tstamp = psched_get_time();
+ vars->dq_count = 0;
}
- /* Calculate the average drain rate from this value. If queue length
- * has receded to a small value viz., <= QUEUE_THRESHOLD bytes,reset
+ /* Calculate the average drain rate from this value. If queue length
+ * has receded to a small value viz., <= QUEUE_THRESHOLD bytes, reset
* the dq_count to -1 as we don't have enough packets to calculate the
- * drain rate anymore The following if block is entered only when we
+ * drain rate anymore. The following if block is entered only when we
* have a substantial queue built up (QUEUE_THRESHOLD bytes or more)
* and we calculate the drain rate for the threshold here. dq_count is
* in bytes, time difference in psched_time, hence rate is in
* bytes/psched_time.
*/
- if (q->vars.dq_count != DQCOUNT_INVALID) {
- q->vars.dq_count += skb->len;
+ if (vars->dq_count != DQCOUNT_INVALID) {
+ vars->dq_count += skb->len;
+
+ if (vars->dq_count >= QUEUE_THRESHOLD) {
+ u32 count = vars->dq_count << PIE_SCALE;
- if (q->vars.dq_count >= QUEUE_THRESHOLD) {
- psched_time_t now = psched_get_time();
- u32 dtime = now - q->vars.dq_tstamp;
- u32 count = q->vars.dq_count << PIE_SCALE;
+ dtime = now - vars->dq_tstamp;
if (dtime == 0)
return;
count = count / dtime;
- if (q->vars.avg_dq_rate == 0)
- q->vars.avg_dq_rate = count;
+ if (vars->avg_dq_rate == 0)
+ vars->avg_dq_rate = count;
else
- q->vars.avg_dq_rate =
- (q->vars.avg_dq_rate -
- (q->vars.avg_dq_rate >> 3)) + (count >> 3);
+ vars->avg_dq_rate =
+ (vars->avg_dq_rate -
+ (vars->avg_dq_rate >> 3)) + (count >> 3);
/* If the queue has receded below the threshold, we hold
* on to the last drain rate calculated, else we reset
@@ -311,43 +284,54 @@ static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb)
* packet is dequeued
*/
if (qlen < QUEUE_THRESHOLD) {
- q->vars.dq_count = DQCOUNT_INVALID;
+ vars->dq_count = DQCOUNT_INVALID;
} else {
- q->vars.dq_count = 0;
- q->vars.dq_tstamp = psched_get_time();
+ vars->dq_count = 0;
+ vars->dq_tstamp = psched_get_time();
}
- if (q->vars.burst_time > 0) {
- if (q->vars.burst_time > dtime)
- q->vars.burst_time -= dtime;
- else
- q->vars.burst_time = 0;
- }
+ goto burst_allowance_reduction;
}
}
+
+ return;
+
+burst_allowance_reduction:
+ if (vars->burst_time > 0) {
+ if (vars->burst_time > dtime)
+ vars->burst_time -= dtime;
+ else
+ vars->burst_time = 0;
+ }
}
+EXPORT_SYMBOL_GPL(pie_process_dequeue);
-static void calculate_probability(struct Qdisc *sch)
+void pie_calculate_probability(struct pie_params *params, struct pie_vars *vars,
+ u32 qlen)
{
- struct pie_sched_data *q = qdisc_priv(sch);
- u32 qlen = sch->qstats.backlog; /* queue size in bytes */
psched_time_t qdelay = 0; /* in pschedtime */
- psched_time_t qdelay_old = q->vars.qdelay; /* in pschedtime */
+ psched_time_t qdelay_old = 0; /* in pschedtime */
s64 delta = 0; /* determines the change in probability */
u64 oldprob;
u64 alpha, beta;
u32 power;
bool update_prob = true;
- q->vars.qdelay_old = q->vars.qdelay;
+ if (params->dq_rate_estimator) {
+ qdelay_old = vars->qdelay;
+ vars->qdelay_old = vars->qdelay;
- if (q->vars.avg_dq_rate > 0)
- qdelay = (qlen << PIE_SCALE) / q->vars.avg_dq_rate;
- else
- qdelay = 0;
+ if (vars->avg_dq_rate > 0)
+ qdelay = (qlen << PIE_SCALE) / vars->avg_dq_rate;
+ else
+ qdelay = 0;
+ } else {
+ qdelay = vars->qdelay;
+ qdelay_old = vars->qdelay_old;
+ }
- /* If qdelay is zero and qlen is not, it means qlen is very small, less
- * than dequeue_rate, so we do not update probabilty in this round
+ /* If qdelay is zero and qlen is not, it means qlen is very small,
+ * so we do not update probabilty in this round.
*/
if (qdelay == 0 && qlen != 0)
update_prob = false;
@@ -359,18 +343,18 @@ static void calculate_probability(struct Qdisc *sch)
* probability. alpha/beta are updated locally below by scaling down
* by 16 to come to 0-2 range.
*/
- alpha = ((u64)q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4;
- beta = ((u64)q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4;
+ alpha = ((u64)params->alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4;
+ beta = ((u64)params->beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4;
/* We scale alpha and beta differently depending on how heavy the
* congestion is. Please see RFC 8033 for details.
*/
- if (q->vars.prob < MAX_PROB / 10) {
+ if (vars->prob < MAX_PROB / 10) {
alpha >>= 1;
beta >>= 1;
power = 100;
- while (q->vars.prob < div_u64(MAX_PROB, power) &&
+ while (vars->prob < div_u64(MAX_PROB, power) &&
power <= 1000000) {
alpha >>= 2;
beta >>= 2;
@@ -379,14 +363,14 @@ static void calculate_probability(struct Qdisc *sch)
}
/* alpha and beta should be between 0 and 32, in multiples of 1/16 */
- delta += alpha * (u64)(qdelay - q->params.target);
+ delta += alpha * (u64)(qdelay - params->target);
delta += beta * (u64)(qdelay - qdelay_old);
- oldprob = q->vars.prob;
+ oldprob = vars->prob;
/* to ensure we increase probability in steps of no more than 2% */
if (delta > (s64)(MAX_PROB / (100 / 2)) &&
- q->vars.prob >= MAX_PROB / 10)
+ vars->prob >= MAX_PROB / 10)
delta = (MAX_PROB / 100) * 2;
/* Non-linear drop:
@@ -397,12 +381,12 @@ static void calculate_probability(struct Qdisc *sch)
if (qdelay > (PSCHED_NS2TICKS(250 * NSEC_PER_MSEC)))
delta += MAX_PROB / (100 / 2);
- q->vars.prob += delta;
+ vars->prob += delta;
if (delta > 0) {
/* prevent overflow */
- if (q->vars.prob < oldprob) {
- q->vars.prob = MAX_PROB;
+ if (vars->prob < oldprob) {
+ vars->prob = MAX_PROB;
/* Prevent normalization error. If probability is at
* maximum value already, we normalize it here, and
* skip the check to do a non-linear drop in the next
@@ -412,8 +396,8 @@ static void calculate_probability(struct Qdisc *sch)
}
} else {
/* prevent underflow */
- if (q->vars.prob > oldprob)
- q->vars.prob = 0;
+ if (vars->prob > oldprob)
+ vars->prob = 0;
}
/* Non-linear drop in probability: Reduce drop probability quickly if
@@ -422,23 +406,28 @@ static void calculate_probability(struct Qdisc *sch)
if (qdelay == 0 && qdelay_old == 0 && update_prob)
/* Reduce drop probability to 98.4% */
- q->vars.prob -= q->vars.prob / 64u;
+ vars->prob -= vars->prob / 64;
- q->vars.qdelay = qdelay;
- q->vars.qlen_old = qlen;
+ vars->qdelay = qdelay;
+ vars->qlen_old = qlen;
/* We restart the measurement cycle if the following conditions are met
* 1. If the delay has been low for 2 consecutive Tupdate periods
* 2. Calculated drop probability is zero
- * 3. We have atleast one estimate for the avg_dq_rate ie.,
- * is a non-zero value
+ * 3. If average dq_rate_estimator is enabled, we have atleast one
+ * estimate for the avg_dq_rate ie., is a non-zero value
*/
- if ((q->vars.qdelay < q->params.target / 2) &&
- (q->vars.qdelay_old < q->params.target / 2) &&
- q->vars.prob == 0 &&
- q->vars.avg_dq_rate > 0)
- pie_vars_init(&q->vars);
+ if ((vars->qdelay < params->target / 2) &&
+ (vars->qdelay_old < params->target / 2) &&
+ vars->prob == 0 &&
+ (!params->dq_rate_estimator || vars->avg_dq_rate > 0)) {
+ pie_vars_init(vars);
+ }
+
+ if (!params->dq_rate_estimator)
+ vars->qdelay_old = qdelay;
}
+EXPORT_SYMBOL_GPL(pie_calculate_probability);
static void pie_timer(struct timer_list *t)
{
@@ -447,7 +436,7 @@ static void pie_timer(struct timer_list *t)
spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
spin_lock(root_lock);
- calculate_probability(sch);
+ pie_calculate_probability(&q->params, &q->vars, sch->qstats.backlog);
/* reset the timer to fire after 'tupdate'. tupdate is in jiffies. */
if (q->params.tupdate)
@@ -497,7 +486,9 @@ static int pie_dump(struct Qdisc *sch, struct sk_buff *skb)
nla_put_u32(skb, TCA_PIE_ALPHA, q->params.alpha) ||
nla_put_u32(skb, TCA_PIE_BETA, q->params.beta) ||
nla_put_u32(skb, TCA_PIE_ECN, q->params.ecn) ||
- nla_put_u32(skb, TCA_PIE_BYTEMODE, q->params.bytemode))
+ nla_put_u32(skb, TCA_PIE_BYTEMODE, q->params.bytemode) ||
+ nla_put_u32(skb, TCA_PIE_DQ_RATE_ESTIMATOR,
+ q->params.dq_rate_estimator))
goto nla_put_failure;
return nla_nest_end(skb, opts);
@@ -514,9 +505,6 @@ static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
.prob = q->vars.prob,
.delay = ((u32)PSCHED_TICKS2NS(q->vars.qdelay)) /
NSEC_PER_USEC,
- /* unscale and return dq_rate in bytes per sec */
- .avg_dq_rate = q->vars.avg_dq_rate *
- (PSCHED_TICKS_PER_SEC) >> PIE_SCALE,
.packets_in = q->stats.packets_in,
.overlimit = q->stats.overlimit,
.maxq = q->stats.maxq,
@@ -524,17 +512,26 @@ static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
.ecn_mark = q->stats.ecn_mark,
};
+ /* avg_dq_rate is only valid if dq_rate_estimator is enabled */
+ st.dq_rate_estimating = q->params.dq_rate_estimator;
+
+ /* unscale and return dq_rate in bytes per sec */
+ if (q->params.dq_rate_estimator)
+ st.avg_dq_rate = q->vars.avg_dq_rate *
+ (PSCHED_TICKS_PER_SEC) >> PIE_SCALE;
+
return gnet_stats_copy_app(d, &st, sizeof(st));
}
static struct sk_buff *pie_qdisc_dequeue(struct Qdisc *sch)
{
+ struct pie_sched_data *q = qdisc_priv(sch);
struct sk_buff *skb = qdisc_dequeue_head(sch);
if (!skb)
return NULL;
- pie_process_dequeue(sch, skb);
+ pie_process_dequeue(skb, &q->params, &q->vars, sch->qstats.backlog);
return skb;
}
@@ -555,7 +552,7 @@ static void pie_destroy(struct Qdisc *sch)
}
static struct Qdisc_ops pie_qdisc_ops __read_mostly = {
- .id = "pie",
+ .id = "pie",
.priv_size = sizeof(struct pie_sched_data),
.enqueue = pie_qdisc_enqueue,
.dequeue = pie_qdisc_dequeue,
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index 0f8fedb8809a..647941702f9f 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -292,8 +292,14 @@ static int prio_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
struct tc_prio_qopt_offload graft_offload;
unsigned long band = arg - 1;
- if (new == NULL)
- new = &noop_qdisc;
+ if (!new) {
+ new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+ TC_H_MAKE(sch->handle, arg), extack);
+ if (!new)
+ new = &noop_qdisc;
+ else
+ qdisc_hash_add(new, true);
+ }
*old = qdisc_replace(sch, new, &q->queues[band]);
@@ -356,7 +362,7 @@ static int prio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
cl_q = q->queues[cl - 1];
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
- d, NULL, &cl_q->bstats) < 0 ||
+ d, cl_q->cpu_bstats, &cl_q->bstats) < 0 ||
qdisc_qstats_copy(d, cl_q) < 0)
return -1;
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c
index 1dff8506a715..4074c50ac3d7 100644
--- a/net/sched/sch_sfb.c
+++ b/net/sched/sch_sfb.c
@@ -18,7 +18,7 @@
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/random.h>
-#include <linux/jhash.h>
+#include <linux/siphash.h>
#include <net/ip.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
@@ -45,7 +45,7 @@ struct sfb_bucket {
* (Section 4.4 of SFB reference : moving hash functions)
*/
struct sfb_bins {
- u32 perturbation; /* jhash perturbation */
+ siphash_key_t perturbation; /* siphash key */
struct sfb_bucket bins[SFB_LEVELS][SFB_NUMBUCKETS];
};
@@ -217,7 +217,8 @@ static u32 sfb_compute_qlen(u32 *prob_r, u32 *avgpm_r, const struct sfb_sched_da
static void sfb_init_perturbation(u32 slot, struct sfb_sched_data *q)
{
- q->bins[slot].perturbation = prandom_u32();
+ get_random_bytes(&q->bins[slot].perturbation,
+ sizeof(q->bins[slot].perturbation));
}
static void sfb_swap_slot(struct sfb_sched_data *q)
@@ -314,9 +315,9 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
/* If using external classifiers, get result and record it. */
if (!sfb_classify(skb, fl, &ret, &salt))
goto other_drop;
- sfbhash = jhash_1word(salt, q->bins[slot].perturbation);
+ sfbhash = siphash_1u32(salt, &q->bins[slot].perturbation);
} else {
- sfbhash = skb_get_hash_perturb(skb, q->bins[slot].perturbation);
+ sfbhash = skb_get_hash_perturb(skb, &q->bins[slot].perturbation);
}
@@ -352,7 +353,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
/* Inelastic flow */
if (q->double_buffering) {
sfbhash = skb_get_hash_perturb(skb,
- q->bins[slot].perturbation);
+ &q->bins[slot].perturbation);
if (!sfbhash)
sfbhash = 1;
sfb_skb_cb(skb)->hashes[slot] = sfbhash;
@@ -488,7 +489,7 @@ static int sfb_change(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
struct sfb_sched_data *q = qdisc_priv(sch);
- struct Qdisc *child;
+ struct Qdisc *child, *old;
struct nlattr *tb[TCA_SFB_MAX + 1];
const struct tc_sfb_qopt *ctl = &sfb_default_ops;
u32 limit;
@@ -518,8 +519,8 @@ static int sfb_change(struct Qdisc *sch, struct nlattr *opt,
qdisc_hash_add(child, true);
sch_tree_lock(sch);
- qdisc_tree_flush_backlog(q->qdisc);
- qdisc_put(q->qdisc);
+ qdisc_purge_queue(q->qdisc);
+ old = q->qdisc;
q->qdisc = child;
q->rehash_interval = msecs_to_jiffies(ctl->rehash_interval);
@@ -542,6 +543,7 @@ static int sfb_change(struct Qdisc *sch, struct nlattr *opt,
sfb_init_perturbation(1, q);
sch_tree_unlock(sch);
+ qdisc_put(old);
return 0;
}
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 68404a9d2ce4..c787d4d46017 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -14,7 +14,7 @@
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/skbuff.h>
-#include <linux/jhash.h>
+#include <linux/siphash.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/netlink.h>
@@ -117,7 +117,7 @@ struct sfq_sched_data {
u8 headdrop;
u8 maxdepth; /* limit of packets per flow */
- u32 perturbation;
+ siphash_key_t perturbation;
u8 cur_depth; /* depth of longest slot */
u8 flags;
unsigned short scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
@@ -157,7 +157,7 @@ static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index
static unsigned int sfq_hash(const struct sfq_sched_data *q,
const struct sk_buff *skb)
{
- return skb_get_hash_perturb(skb, q->perturbation) & (q->divisor - 1);
+ return skb_get_hash_perturb(skb, &q->perturbation) & (q->divisor - 1);
}
static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
@@ -607,9 +607,11 @@ static void sfq_perturbation(struct timer_list *t)
struct sfq_sched_data *q = from_timer(q, t, perturb_timer);
struct Qdisc *sch = q->sch;
spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
+ siphash_key_t nkey;
+ get_random_bytes(&nkey, sizeof(nkey));
spin_lock(root_lock);
- q->perturbation = prandom_u32();
+ q->perturbation = nkey;
if (!q->filter_list && q->tail)
sfq_rehash(sch);
spin_unlock(root_lock);
@@ -688,7 +690,7 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
del_timer(&q->perturb_timer);
if (q->perturb_period) {
mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
- q->perturbation = prandom_u32();
+ get_random_bytes(&q->perturbation, sizeof(q->perturbation));
}
sch_tree_unlock(sch);
kfree(p);
@@ -745,7 +747,7 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt,
q->quantum = psched_mtu(qdisc_dev(sch));
q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
q->perturb_period = 0;
- q->perturbation = prandom_u32();
+ get_random_bytes(&q->perturbation, sizeof(q->perturbation));
if (opt) {
int err = sfq_change(sch, opt);
diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
index e25d414ae12f..660fc45ee40f 100644
--- a/net/sched/sch_taprio.c
+++ b/net/sched/sch_taprio.c
@@ -29,8 +29,9 @@ static DEFINE_SPINLOCK(taprio_list_lock);
#define TAPRIO_ALL_GATES_OPEN -1
-#define FLAGS_VALID(flags) (!((flags) & ~TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST))
#define TXTIME_ASSIST_IS_ENABLED(flags) ((flags) & TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST)
+#define FULL_OFFLOAD_IS_ENABLED(flags) ((flags) & TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD)
+#define TAPRIO_FLAGS_INVALID U32_MAX
struct sched_entry {
struct list_head list;
@@ -75,9 +76,16 @@ struct taprio_sched {
struct sched_gate_list __rcu *admin_sched;
struct hrtimer advance_timer;
struct list_head taprio_list;
+ struct sk_buff *(*dequeue)(struct Qdisc *sch);
+ struct sk_buff *(*peek)(struct Qdisc *sch);
u32 txtime_delay;
};
+struct __tc_taprio_qopt_offload {
+ refcount_t users;
+ struct tc_taprio_qopt_offload offload;
+};
+
static ktime_t sched_base_time(const struct sched_gate_list *sched)
{
if (!sched)
@@ -268,6 +276,19 @@ static bool is_valid_interval(struct sk_buff *skb, struct Qdisc *sch)
return entry;
}
+static bool taprio_flags_valid(u32 flags)
+{
+ /* Make sure no other flag bits are set. */
+ if (flags & ~(TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST |
+ TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD))
+ return false;
+ /* txtime-assist and full offload are mutually exclusive */
+ if ((flags & TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST) &&
+ (flags & TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD))
+ return false;
+ return true;
+}
+
/* This returns the tstamp value set by TCP in terms of the set clock. */
static ktime_t get_tcp_tstamp(struct taprio_sched *q, struct sk_buff *skb)
{
@@ -417,7 +438,7 @@ static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch,
return qdisc_enqueue(skb, child, to_free);
}
-static struct sk_buff *taprio_peek(struct Qdisc *sch)
+static struct sk_buff *taprio_peek_soft(struct Qdisc *sch)
{
struct taprio_sched *q = qdisc_priv(sch);
struct net_device *dev = qdisc_dev(sch);
@@ -461,6 +482,36 @@ static struct sk_buff *taprio_peek(struct Qdisc *sch)
return NULL;
}
+static struct sk_buff *taprio_peek_offload(struct Qdisc *sch)
+{
+ struct taprio_sched *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ struct sk_buff *skb;
+ int i;
+
+ for (i = 0; i < dev->num_tx_queues; i++) {
+ struct Qdisc *child = q->qdiscs[i];
+
+ if (unlikely(!child))
+ continue;
+
+ skb = child->ops->peek(child);
+ if (!skb)
+ continue;
+
+ return skb;
+ }
+
+ return NULL;
+}
+
+static struct sk_buff *taprio_peek(struct Qdisc *sch)
+{
+ struct taprio_sched *q = qdisc_priv(sch);
+
+ return q->peek(sch);
+}
+
static void taprio_set_budget(struct taprio_sched *q, struct sched_entry *entry)
{
atomic_set(&entry->budget,
@@ -468,7 +519,7 @@ static void taprio_set_budget(struct taprio_sched *q, struct sched_entry *entry)
atomic64_read(&q->picos_per_byte)));
}
-static struct sk_buff *taprio_dequeue(struct Qdisc *sch)
+static struct sk_buff *taprio_dequeue_soft(struct Qdisc *sch)
{
struct taprio_sched *q = qdisc_priv(sch);
struct net_device *dev = qdisc_dev(sch);
@@ -477,11 +528,6 @@ static struct sk_buff *taprio_dequeue(struct Qdisc *sch)
u32 gate_mask;
int i;
- if (atomic64_read(&q->picos_per_byte) == -1) {
- WARN_ONCE(1, "taprio: dequeue() called with unknown picos per byte.");
- return NULL;
- }
-
rcu_read_lock();
entry = rcu_dereference(q->current_entry);
/* if there's no entry, it means that the schedule didn't
@@ -555,6 +601,40 @@ done:
return skb;
}
+static struct sk_buff *taprio_dequeue_offload(struct Qdisc *sch)
+{
+ struct taprio_sched *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ struct sk_buff *skb;
+ int i;
+
+ for (i = 0; i < dev->num_tx_queues; i++) {
+ struct Qdisc *child = q->qdiscs[i];
+
+ if (unlikely(!child))
+ continue;
+
+ skb = child->ops->dequeue(child);
+ if (unlikely(!skb))
+ continue;
+
+ qdisc_bstats_update(sch, skb);
+ qdisc_qstats_backlog_dec(sch, skb);
+ sch->q.qlen--;
+
+ return skb;
+ }
+
+ return NULL;
+}
+
+static struct sk_buff *taprio_dequeue(struct Qdisc *sch)
+{
+ struct taprio_sched *q = qdisc_priv(sch);
+
+ return q->dequeue(sch);
+}
+
static bool should_restart_cycle(const struct sched_gate_list *oper,
const struct sched_entry *entry)
{
@@ -677,10 +757,6 @@ static const struct nla_policy entry_policy[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = {
[TCA_TAPRIO_SCHED_ENTRY_INTERVAL] = { .type = NLA_U32 },
};
-static const struct nla_policy entry_list_policy[TCA_TAPRIO_SCHED_MAX + 1] = {
- [TCA_TAPRIO_SCHED_ENTRY] = { .type = NLA_NESTED },
-};
-
static const struct nla_policy taprio_policy[TCA_TAPRIO_ATTR_MAX + 1] = {
[TCA_TAPRIO_ATTR_PRIOMAP] = {
.len = sizeof(struct tc_mqprio_qopt)
@@ -691,6 +767,7 @@ static const struct nla_policy taprio_policy[TCA_TAPRIO_ATTR_MAX + 1] = {
[TCA_TAPRIO_ATTR_SCHED_CLOCKID] = { .type = NLA_S32 },
[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME] = { .type = NLA_S64 },
[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION] = { .type = NLA_S64 },
+ [TCA_TAPRIO_ATTR_FLAGS] = { .type = NLA_U32 },
};
static int fill_sched_entry(struct nlattr **tb, struct sched_entry *entry,
@@ -847,7 +924,7 @@ static int taprio_parse_mqprio_opt(struct net_device *dev,
}
/* Verify priority mapping uses valid tcs */
- for (i = 0; i < TC_BITMASK + 1; i++) {
+ for (i = 0; i <= TC_BITMASK; i++) {
if (qopt->prio_tc_map[i] >= qopt->num_tc) {
NL_SET_ERR_MSG(extack, "Invalid traffic class in priority to traffic class mapping");
return -EINVAL;
@@ -941,6 +1018,9 @@ static void taprio_start_sched(struct Qdisc *sch,
struct taprio_sched *q = qdisc_priv(sch);
ktime_t expires;
+ if (FULL_OFFLOAD_IS_ENABLED(q->flags))
+ return;
+
expires = hrtimer_get_expires(&q->advance_timer);
if (expires == 0)
expires = KTIME_MAX;
@@ -958,12 +1038,19 @@ static void taprio_set_picos_per_byte(struct net_device *dev,
struct taprio_sched *q)
{
struct ethtool_link_ksettings ecmd;
- int picos_per_byte = -1;
+ int speed = SPEED_10;
+ int picos_per_byte;
+ int err;
+
+ err = __ethtool_get_link_ksettings(dev, &ecmd);
+ if (err < 0)
+ goto skip;
+
+ if (ecmd.base.speed && ecmd.base.speed != SPEED_UNKNOWN)
+ speed = ecmd.base.speed;
- if (!__ethtool_get_link_ksettings(dev, &ecmd) &&
- ecmd.base.speed != SPEED_UNKNOWN)
- picos_per_byte = div64_s64(NSEC_PER_SEC * 1000LL * 8,
- ecmd.base.speed * 1000 * 1000);
+skip:
+ picos_per_byte = (USEC_PER_SEC * 8) / speed;
atomic64_set(&q->picos_per_byte, picos_per_byte);
netdev_dbg(dev, "taprio: set %s's picos_per_byte to: %lld, linkspeed: %d\n",
@@ -1012,6 +1099,303 @@ static void setup_txtime(struct taprio_sched *q,
}
}
+static struct tc_taprio_qopt_offload *taprio_offload_alloc(int num_entries)
+{
+ size_t size = sizeof(struct tc_taprio_sched_entry) * num_entries +
+ sizeof(struct __tc_taprio_qopt_offload);
+ struct __tc_taprio_qopt_offload *__offload;
+
+ __offload = kzalloc(size, GFP_KERNEL);
+ if (!__offload)
+ return NULL;
+
+ refcount_set(&__offload->users, 1);
+
+ return &__offload->offload;
+}
+
+struct tc_taprio_qopt_offload *taprio_offload_get(struct tc_taprio_qopt_offload
+ *offload)
+{
+ struct __tc_taprio_qopt_offload *__offload;
+
+ __offload = container_of(offload, struct __tc_taprio_qopt_offload,
+ offload);
+
+ refcount_inc(&__offload->users);
+
+ return offload;
+}
+EXPORT_SYMBOL_GPL(taprio_offload_get);
+
+void taprio_offload_free(struct tc_taprio_qopt_offload *offload)
+{
+ struct __tc_taprio_qopt_offload *__offload;
+
+ __offload = container_of(offload, struct __tc_taprio_qopt_offload,
+ offload);
+
+ if (!refcount_dec_and_test(&__offload->users))
+ return;
+
+ kfree(__offload);
+}
+EXPORT_SYMBOL_GPL(taprio_offload_free);
+
+/* The function will only serve to keep the pointers to the "oper" and "admin"
+ * schedules valid in relation to their base times, so when calling dump() the
+ * users looks at the right schedules.
+ * When using full offload, the admin configuration is promoted to oper at the
+ * base_time in the PHC time domain. But because the system time is not
+ * necessarily in sync with that, we can't just trigger a hrtimer to call
+ * switch_schedules at the right hardware time.
+ * At the moment we call this by hand right away from taprio, but in the future
+ * it will be useful to create a mechanism for drivers to notify taprio of the
+ * offload state (PENDING, ACTIVE, INACTIVE) so it can be visible in dump().
+ * This is left as TODO.
+ */
+static void taprio_offload_config_changed(struct taprio_sched *q)
+{
+ struct sched_gate_list *oper, *admin;
+
+ spin_lock(&q->current_entry_lock);
+
+ oper = rcu_dereference_protected(q->oper_sched,
+ lockdep_is_held(&q->current_entry_lock));
+ admin = rcu_dereference_protected(q->admin_sched,
+ lockdep_is_held(&q->current_entry_lock));
+
+ switch_schedules(q, &admin, &oper);
+
+ spin_unlock(&q->current_entry_lock);
+}
+
+static void taprio_sched_to_offload(struct taprio_sched *q,
+ struct sched_gate_list *sched,
+ const struct tc_mqprio_qopt *mqprio,
+ struct tc_taprio_qopt_offload *offload)
+{
+ struct sched_entry *entry;
+ int i = 0;
+
+ offload->base_time = sched->base_time;
+ offload->cycle_time = sched->cycle_time;
+ offload->cycle_time_extension = sched->cycle_time_extension;
+
+ list_for_each_entry(entry, &sched->entries, list) {
+ struct tc_taprio_sched_entry *e = &offload->entries[i];
+
+ e->command = entry->command;
+ e->interval = entry->interval;
+ e->gate_mask = entry->gate_mask;
+ i++;
+ }
+
+ offload->num_entries = i;
+}
+
+static int taprio_enable_offload(struct net_device *dev,
+ struct tc_mqprio_qopt *mqprio,
+ struct taprio_sched *q,
+ struct sched_gate_list *sched,
+ struct netlink_ext_ack *extack)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+ struct tc_taprio_qopt_offload *offload;
+ int err = 0;
+
+ if (!ops->ndo_setup_tc) {
+ NL_SET_ERR_MSG(extack,
+ "Device does not support taprio offload");
+ return -EOPNOTSUPP;
+ }
+
+ offload = taprio_offload_alloc(sched->num_entries);
+ if (!offload) {
+ NL_SET_ERR_MSG(extack,
+ "Not enough memory for enabling offload mode");
+ return -ENOMEM;
+ }
+ offload->enable = 1;
+ taprio_sched_to_offload(q, sched, mqprio, offload);
+
+ err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload);
+ if (err < 0) {
+ NL_SET_ERR_MSG(extack,
+ "Device failed to setup taprio offload");
+ goto done;
+ }
+
+done:
+ taprio_offload_free(offload);
+
+ return err;
+}
+
+static int taprio_disable_offload(struct net_device *dev,
+ struct taprio_sched *q,
+ struct netlink_ext_ack *extack)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+ struct tc_taprio_qopt_offload *offload;
+ int err;
+
+ if (!FULL_OFFLOAD_IS_ENABLED(q->flags))
+ return 0;
+
+ if (!ops->ndo_setup_tc)
+ return -EOPNOTSUPP;
+
+ offload = taprio_offload_alloc(0);
+ if (!offload) {
+ NL_SET_ERR_MSG(extack,
+ "Not enough memory to disable offload mode");
+ return -ENOMEM;
+ }
+ offload->enable = 0;
+
+ err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload);
+ if (err < 0) {
+ NL_SET_ERR_MSG(extack,
+ "Device failed to disable offload");
+ goto out;
+ }
+
+out:
+ taprio_offload_free(offload);
+
+ return err;
+}
+
+/* If full offload is enabled, the only possible clockid is the net device's
+ * PHC. For that reason, specifying a clockid through netlink is incorrect.
+ * For txtime-assist, it is implicitly assumed that the device's PHC is kept
+ * in sync with the specified clockid via a user space daemon such as phc2sys.
+ * For both software taprio and txtime-assist, the clockid is used for the
+ * hrtimer that advances the schedule and hence mandatory.
+ */
+static int taprio_parse_clockid(struct Qdisc *sch, struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{
+ struct taprio_sched *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ int err = -EINVAL;
+
+ if (FULL_OFFLOAD_IS_ENABLED(q->flags)) {
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct ethtool_ts_info info = {
+ .cmd = ETHTOOL_GET_TS_INFO,
+ .phc_index = -1,
+ };
+
+ if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) {
+ NL_SET_ERR_MSG(extack,
+ "The 'clockid' cannot be specified for full offload");
+ goto out;
+ }
+
+ if (ops && ops->get_ts_info)
+ err = ops->get_ts_info(dev, &info);
+
+ if (err || info.phc_index < 0) {
+ NL_SET_ERR_MSG(extack,
+ "Device does not have a PTP clock");
+ err = -ENOTSUPP;
+ goto out;
+ }
+ } else if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) {
+ int clockid = nla_get_s32(tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]);
+
+ /* We only support static clockids and we don't allow
+ * for it to be modified after the first init.
+ */
+ if (clockid < 0 ||
+ (q->clockid != -1 && q->clockid != clockid)) {
+ NL_SET_ERR_MSG(extack,
+ "Changing the 'clockid' of a running schedule is not supported");
+ err = -ENOTSUPP;
+ goto out;
+ }
+
+ switch (clockid) {
+ case CLOCK_REALTIME:
+ q->tk_offset = TK_OFFS_REAL;
+ break;
+ case CLOCK_MONOTONIC:
+ q->tk_offset = TK_OFFS_MAX;
+ break;
+ case CLOCK_BOOTTIME:
+ q->tk_offset = TK_OFFS_BOOT;
+ break;
+ case CLOCK_TAI:
+ q->tk_offset = TK_OFFS_TAI;
+ break;
+ default:
+ NL_SET_ERR_MSG(extack, "Invalid 'clockid'");
+ err = -EINVAL;
+ goto out;
+ }
+
+ q->clockid = clockid;
+ } else {
+ NL_SET_ERR_MSG(extack, "Specifying a 'clockid' is mandatory");
+ goto out;
+ }
+
+ /* Everything went ok, return success. */
+ err = 0;
+
+out:
+ return err;
+}
+
+static int taprio_mqprio_cmp(const struct net_device *dev,
+ const struct tc_mqprio_qopt *mqprio)
+{
+ int i;
+
+ if (!mqprio || mqprio->num_tc != dev->num_tc)
+ return -1;
+
+ for (i = 0; i < mqprio->num_tc; i++)
+ if (dev->tc_to_txq[i].count != mqprio->count[i] ||
+ dev->tc_to_txq[i].offset != mqprio->offset[i])
+ return -1;
+
+ for (i = 0; i <= TC_BITMASK; i++)
+ if (dev->prio_tc_map[i] != mqprio->prio_tc_map[i])
+ return -1;
+
+ return 0;
+}
+
+/* The semantics of the 'flags' argument in relation to 'change()'
+ * requests, are interpreted following two rules (which are applied in
+ * this order): (1) an omitted 'flags' argument is interpreted as
+ * zero; (2) the 'flags' of a "running" taprio instance cannot be
+ * changed.
+ */
+static int taprio_new_flags(const struct nlattr *attr, u32 old,
+ struct netlink_ext_ack *extack)
+{
+ u32 new = 0;
+
+ if (attr)
+ new = nla_get_u32(attr);
+
+ if (old != TAPRIO_FLAGS_INVALID && old != new) {
+ NL_SET_ERR_MSG_MOD(extack, "Changing 'flags' of a running schedule is not supported");
+ return -EOPNOTSUPP;
+ }
+
+ if (!taprio_flags_valid(new)) {
+ NL_SET_ERR_MSG_MOD(extack, "Specified 'flags' are not valid");
+ return -EINVAL;
+ }
+
+ return new;
+}
+
static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
@@ -1020,10 +1404,9 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
struct taprio_sched *q = qdisc_priv(sch);
struct net_device *dev = qdisc_dev(sch);
struct tc_mqprio_qopt *mqprio = NULL;
- u32 taprio_flags = 0;
- int i, err, clockid;
unsigned long flags;
ktime_t start;
+ int i, err;
err = nla_parse_nested_deprecated(tb, TCA_TAPRIO_ATTR_MAX, opt,
taprio_policy, extack);
@@ -1033,21 +1416,14 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
if (tb[TCA_TAPRIO_ATTR_PRIOMAP])
mqprio = nla_data(tb[TCA_TAPRIO_ATTR_PRIOMAP]);
- if (tb[TCA_TAPRIO_ATTR_FLAGS]) {
- taprio_flags = nla_get_u32(tb[TCA_TAPRIO_ATTR_FLAGS]);
-
- if (q->flags != 0 && q->flags != taprio_flags) {
- NL_SET_ERR_MSG_MOD(extack, "Changing 'flags' of a running schedule is not supported");
- return -EOPNOTSUPP;
- } else if (!FLAGS_VALID(taprio_flags)) {
- NL_SET_ERR_MSG_MOD(extack, "Specified 'flags' are not valid");
- return -EINVAL;
- }
+ err = taprio_new_flags(tb[TCA_TAPRIO_ATTR_FLAGS],
+ q->flags, extack);
+ if (err < 0)
+ return err;
- q->flags = taprio_flags;
- }
+ q->flags = err;
- err = taprio_parse_mqprio_opt(dev, mqprio, extack, taprio_flags);
+ err = taprio_parse_mqprio_opt(dev, mqprio, extack, q->flags);
if (err < 0)
return err;
@@ -1063,6 +1439,10 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
admin = rcu_dereference(q->admin_sched);
rcu_read_unlock();
+ /* no changes - no new mqprio settings */
+ if (!taprio_mqprio_cmp(dev, mqprio))
+ mqprio = NULL;
+
if (mqprio && (oper || admin)) {
NL_SET_ERR_MSG(extack, "Changing the traffic mapping of a running schedule is not supported");
err = -ENOTSUPP;
@@ -1079,29 +1459,31 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
goto free_sched;
}
- if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) {
- clockid = nla_get_s32(tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]);
+ err = taprio_parse_clockid(sch, tb, extack);
+ if (err < 0)
+ goto free_sched;
- /* We only support static clockids and we don't allow
- * for it to be modified after the first init.
- */
- if (clockid < 0 ||
- (q->clockid != -1 && q->clockid != clockid)) {
- NL_SET_ERR_MSG(extack, "Changing the 'clockid' of a running schedule is not supported");
- err = -ENOTSUPP;
- goto free_sched;
- }
+ taprio_set_picos_per_byte(dev, q);
- q->clockid = clockid;
- }
+ if (mqprio) {
+ netdev_set_num_tc(dev, mqprio->num_tc);
+ for (i = 0; i < mqprio->num_tc; i++)
+ netdev_set_tc_queue(dev, i,
+ mqprio->count[i],
+ mqprio->offset[i]);
- if (q->clockid == -1 && !tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) {
- NL_SET_ERR_MSG(extack, "Specifying a 'clockid' is mandatory");
- err = -EINVAL;
- goto free_sched;
+ /* Always use supplied priority mappings */
+ for (i = 0; i <= TC_BITMASK; i++)
+ netdev_set_prio_tc_map(dev, i,
+ mqprio->prio_tc_map[i]);
}
- taprio_set_picos_per_byte(dev, q);
+ if (FULL_OFFLOAD_IS_ENABLED(q->flags))
+ err = taprio_enable_offload(dev, mqprio, q, new_admin, extack);
+ else
+ err = taprio_disable_offload(dev, q, extack);
+ if (err)
+ goto free_sched;
/* Protects against enqueue()/dequeue() */
spin_lock_bh(qdisc_lock(sch));
@@ -1116,42 +1498,22 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
q->txtime_delay = nla_get_u32(tb[TCA_TAPRIO_ATTR_TXTIME_DELAY]);
}
- if (!TXTIME_ASSIST_IS_ENABLED(taprio_flags) &&
+ if (!TXTIME_ASSIST_IS_ENABLED(q->flags) &&
+ !FULL_OFFLOAD_IS_ENABLED(q->flags) &&
!hrtimer_active(&q->advance_timer)) {
hrtimer_init(&q->advance_timer, q->clockid, HRTIMER_MODE_ABS);
q->advance_timer.function = advance_sched;
}
- if (mqprio) {
- netdev_set_num_tc(dev, mqprio->num_tc);
- for (i = 0; i < mqprio->num_tc; i++)
- netdev_set_tc_queue(dev, i,
- mqprio->count[i],
- mqprio->offset[i]);
-
- /* Always use supplied priority mappings */
- for (i = 0; i < TC_BITMASK + 1; i++)
- netdev_set_prio_tc_map(dev, i,
- mqprio->prio_tc_map[i]);
- }
-
- switch (q->clockid) {
- case CLOCK_REALTIME:
- q->tk_offset = TK_OFFS_REAL;
- break;
- case CLOCK_MONOTONIC:
- q->tk_offset = TK_OFFS_MAX;
- break;
- case CLOCK_BOOTTIME:
- q->tk_offset = TK_OFFS_BOOT;
- break;
- case CLOCK_TAI:
- q->tk_offset = TK_OFFS_TAI;
- break;
- default:
- NL_SET_ERR_MSG(extack, "Invalid 'clockid'");
- err = -EINVAL;
- goto unlock;
+ if (FULL_OFFLOAD_IS_ENABLED(q->flags)) {
+ q->dequeue = taprio_dequeue_offload;
+ q->peek = taprio_peek_offload;
+ } else {
+ /* Be sure to always keep the function pointers
+ * in a consistent state.
+ */
+ q->dequeue = taprio_dequeue_soft;
+ q->peek = taprio_peek_soft;
}
err = taprio_get_start_time(sch, new_admin, &start);
@@ -1160,9 +1522,9 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
goto unlock;
}
- if (TXTIME_ASSIST_IS_ENABLED(taprio_flags)) {
- setup_txtime(q, new_admin, start);
+ setup_txtime(q, new_admin, start);
+ if (TXTIME_ASSIST_IS_ENABLED(q->flags)) {
if (!oper) {
rcu_assign_pointer(q->oper_sched, new_admin);
err = 0;
@@ -1186,6 +1548,9 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
call_rcu(&admin->rcu, taprio_free_sched_cb);
spin_unlock_irqrestore(&q->current_entry_lock, flags);
+
+ if (FULL_OFFLOAD_IS_ENABLED(q->flags))
+ taprio_offload_config_changed(q);
}
new_admin = NULL;
@@ -1213,6 +1578,8 @@ static void taprio_destroy(struct Qdisc *sch)
hrtimer_cancel(&q->advance_timer);
+ taprio_disable_offload(dev, q, NULL);
+
if (q->qdiscs) {
for (i = 0; i < dev->num_tx_queues && q->qdiscs[i]; i++)
qdisc_put(q->qdiscs[i]);
@@ -1221,7 +1588,7 @@ static void taprio_destroy(struct Qdisc *sch)
}
q->qdiscs = NULL;
- netdev_set_num_tc(dev, 0);
+ netdev_reset_tc(dev);
if (q->oper_sched)
call_rcu(&q->oper_sched->rcu, taprio_free_sched_cb);
@@ -1242,12 +1609,20 @@ static int taprio_init(struct Qdisc *sch, struct nlattr *opt,
hrtimer_init(&q->advance_timer, CLOCK_TAI, HRTIMER_MODE_ABS);
q->advance_timer.function = advance_sched;
+ q->dequeue = taprio_dequeue_soft;
+ q->peek = taprio_peek_soft;
+
q->root = sch;
/* We only support static clockids. Use an invalid value as default
* and get the valid one on taprio_change().
*/
q->clockid = -1;
+ q->flags = TAPRIO_FLAGS_INVALID;
+
+ spin_lock(&taprio_list_lock);
+ list_add(&q->taprio_list, &taprio_list);
+ spin_unlock(&taprio_list_lock);
if (sch->parent != TC_H_ROOT)
return -EOPNOTSUPP;
@@ -1266,10 +1641,6 @@ static int taprio_init(struct Qdisc *sch, struct nlattr *opt,
if (!opt)
return -EINVAL;
- spin_lock(&taprio_list_lock);
- list_add(&q->taprio_list, &taprio_list);
- spin_unlock(&taprio_list_lock);
-
for (i = 0; i < dev->num_tx_queues; i++) {
struct netdev_queue *dev_queue;
struct Qdisc *qdisc;
@@ -1424,7 +1795,8 @@ static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb)
if (nla_put(skb, TCA_TAPRIO_ATTR_PRIOMAP, sizeof(opt), &opt))
goto options_error;
- if (nla_put_s32(skb, TCA_TAPRIO_ATTR_SCHED_CLOCKID, q->clockid))
+ if (!FULL_OFFLOAD_IS_ENABLED(q->flags) &&
+ nla_put_s32(skb, TCA_TAPRIO_ATTR_SCHED_CLOCKID, q->clockid))
goto options_error;
if (q->flags && nla_put_u32(skb, TCA_TAPRIO_ATTR_FLAGS, q->flags))
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index 5f72f3f916a5..78e79029dc63 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -15,6 +15,7 @@
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/sch_generic.h>
+#include <net/pkt_cls.h>
#include <net/pkt_sched.h>
@@ -137,6 +138,52 @@ static u64 psched_ns_t2l(const struct psched_ratecfg *r,
return len;
}
+static void tbf_offload_change(struct Qdisc *sch)
+{
+ struct tbf_sched_data *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ struct tc_tbf_qopt_offload qopt;
+
+ if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
+ return;
+
+ qopt.command = TC_TBF_REPLACE;
+ qopt.handle = sch->handle;
+ qopt.parent = sch->parent;
+ qopt.replace_params.rate = q->rate;
+ qopt.replace_params.max_size = q->max_size;
+ qopt.replace_params.qstats = &sch->qstats;
+
+ dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TBF, &qopt);
+}
+
+static void tbf_offload_destroy(struct Qdisc *sch)
+{
+ struct net_device *dev = qdisc_dev(sch);
+ struct tc_tbf_qopt_offload qopt;
+
+ if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
+ return;
+
+ qopt.command = TC_TBF_DESTROY;
+ qopt.handle = sch->handle;
+ qopt.parent = sch->parent;
+ dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TBF, &qopt);
+}
+
+static int tbf_offload_dump(struct Qdisc *sch)
+{
+ struct tc_tbf_qopt_offload qopt;
+
+ qopt.command = TC_TBF_STATS;
+ qopt.handle = sch->handle;
+ qopt.parent = sch->parent;
+ qopt.stats.bstats = &sch->bstats;
+ qopt.stats.qstats = &sch->qstats;
+
+ return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_TBF, &qopt);
+}
+
/* GSO packet is too big, segment it so that tbf can transmit
* each segment in time
*/
@@ -155,8 +202,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch,
return qdisc_drop(skb, sch, to_free);
nb = 0;
- while (segs) {
- nskb = segs->next;
+ skb_list_walk_safe(segs, segs, nskb) {
skb_mark_not_on_list(segs);
qdisc_skb_cb(segs)->pkt_len = segs->len;
len += segs->len;
@@ -167,7 +213,6 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch,
} else {
nb++;
}
- segs = nskb;
}
sch->q.qlen += nb;
if (nb > 1)
@@ -409,6 +454,8 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt,
sch_tree_unlock(sch);
err = 0;
+
+ tbf_offload_change(sch);
done:
return err;
}
@@ -434,6 +481,7 @@ static void tbf_destroy(struct Qdisc *sch)
struct tbf_sched_data *q = qdisc_priv(sch);
qdisc_watchdog_cancel(&q->watchdog);
+ tbf_offload_destroy(sch);
qdisc_put(q->qdisc);
}
@@ -442,8 +490,12 @@ static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
struct tbf_sched_data *q = qdisc_priv(sch);
struct nlattr *nest;
struct tc_tbf_qopt opt;
+ int err;
+
+ err = tbf_offload_dump(sch);
+ if (err)
+ return err;
- sch->qstats.backlog = q->qdisc->qstats.backlog;
nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 5010cce52c93..437079a4883d 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -54,7 +54,6 @@ static struct sctp_association *sctp_association_init(
const struct sock *sk,
enum sctp_scope scope, gfp_t gfp)
{
- struct net *net = sock_net(sk);
struct sctp_sock *sp;
struct sctp_paramhdr *p;
int i;
@@ -65,6 +64,7 @@ static struct sctp_association *sctp_association_init(
/* Discarding const is appropriate here. */
asoc->ep = (struct sctp_endpoint *)ep;
asoc->base.sk = (struct sock *)sk;
+ asoc->base.net = sock_net(sk);
sctp_endpoint_hold(asoc->ep);
sock_hold(asoc->base.sk);
@@ -87,6 +87,8 @@ static struct sctp_association *sctp_association_init(
*/
asoc->max_retrans = sp->assocparams.sasoc_asocmaxrxt;
asoc->pf_retrans = sp->pf_retrans;
+ asoc->ps_retrans = sp->ps_retrans;
+ asoc->pf_expose = sp->pf_expose;
asoc->rto_initial = msecs_to_jiffies(sp->rtoinfo.srto_initial);
asoc->rto_max = msecs_to_jiffies(sp->rtoinfo.srto_max);
@@ -214,14 +216,6 @@ static struct sctp_association *sctp_association_init(
asoc->peer.sack_needed = 1;
asoc->peer.sack_generation = 1;
- /* Assume that the peer will tell us if he recognizes ASCONF
- * as part of INIT exchange.
- * The sctp_addip_noauth option is there for backward compatibility
- * and will revert old behavior.
- */
- if (net->sctp.addip_noauth)
- asoc->peer.asconf_capable = 1;
-
/* Create an input queue. */
sctp_inq_init(&asoc->base.inqueue);
sctp_inq_set_th_handler(&asoc->base.inqueue, sctp_assoc_bh_rcv);
@@ -333,7 +327,7 @@ void sctp_association_free(struct sctp_association *asoc)
* socket.
*/
if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING))
- sk->sk_ack_backlog--;
+ sk_acceptq_removed(sk);
}
/* Mark as dead, so other users can know this structure is
@@ -438,6 +432,8 @@ void sctp_assoc_set_primary(struct sctp_association *asoc,
changeover = 1 ;
asoc->peer.primary_path = transport;
+ sctp_ulpevent_nofity_peer_addr_change(transport,
+ SCTP_ADDR_MADE_PRIM, 0);
/* Set a default msg_name for events. */
memcpy(&asoc->peer.primary_addr, &transport->ipaddr,
@@ -578,6 +574,7 @@ void sctp_assoc_rm_peer(struct sctp_association *asoc,
asoc->peer.transport_count--;
+ sctp_ulpevent_nofity_peer_addr_change(peer, SCTP_ADDR_REMOVED, 0);
sctp_transport_free(peer);
}
@@ -587,7 +584,6 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
const gfp_t gfp,
const int peer_state)
{
- struct net *net = sock_net(asoc->base.sk);
struct sctp_transport *peer;
struct sctp_sock *sp;
unsigned short port;
@@ -617,7 +613,7 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
return peer;
}
- peer = sctp_transport_new(net, addr, gfp);
+ peer = sctp_transport_new(asoc->base.net, addr, gfp);
if (!peer)
return NULL;
@@ -633,6 +629,8 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
/* And the partial failure retrans threshold */
peer->pf_retrans = asoc->pf_retrans;
+ /* And the primary path switchover retrans threshold */
+ peer->ps_retrans = asoc->ps_retrans;
/* Initialize the peer's SACK delay timeout based on the
* association configured value.
@@ -716,6 +714,8 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
list_add_tail_rcu(&peer->transports, &asoc->peer.transport_addr_list);
asoc->peer.transport_count++;
+ sctp_ulpevent_nofity_peer_addr_change(peer, SCTP_ADDR_ADDED, 0);
+
/* If we do not yet have a primary path, set one. */
if (!asoc->peer.primary_path) {
sctp_assoc_set_primary(asoc, peer);
@@ -790,9 +790,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
enum sctp_transport_cmd command,
sctp_sn_error_t error)
{
- struct sctp_ulpevent *event;
- struct sockaddr_storage addr;
- int spc_state = 0;
+ int spc_state = SCTP_ADDR_AVAILABLE;
bool ulp_notify = true;
/* Record the transition on the transport. */
@@ -802,19 +800,13 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
* to heartbeat success, report the SCTP_ADDR_CONFIRMED
* state to the user, otherwise report SCTP_ADDR_AVAILABLE.
*/
- if (SCTP_UNCONFIRMED == transport->state &&
- SCTP_HEARTBEAT_SUCCESS == error)
- spc_state = SCTP_ADDR_CONFIRMED;
- else
- spc_state = SCTP_ADDR_AVAILABLE;
- /* Don't inform ULP about transition from PF to
- * active state and set cwnd to 1 MTU, see SCTP
- * Quick failover draft section 5.1, point 5
- */
- if (transport->state == SCTP_PF) {
+ if (transport->state == SCTP_PF &&
+ asoc->pf_expose != SCTP_PF_EXPOSE_ENABLE)
ulp_notify = false;
- transport->cwnd = asoc->pathmtu;
- }
+ else if (transport->state == SCTP_UNCONFIRMED &&
+ error == SCTP_HEARTBEAT_SUCCESS)
+ spc_state = SCTP_ADDR_CONFIRMED;
+
transport->state = SCTP_ACTIVE;
break;
@@ -823,19 +815,21 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
* to inactive state. Also, release the cached route since
* there may be a better route next time.
*/
- if (transport->state != SCTP_UNCONFIRMED)
+ if (transport->state != SCTP_UNCONFIRMED) {
transport->state = SCTP_INACTIVE;
- else {
+ spc_state = SCTP_ADDR_UNREACHABLE;
+ } else {
sctp_transport_dst_release(transport);
ulp_notify = false;
}
-
- spc_state = SCTP_ADDR_UNREACHABLE;
break;
case SCTP_TRANSPORT_PF:
transport->state = SCTP_PF;
- ulp_notify = false;
+ if (asoc->pf_expose != SCTP_PF_EXPOSE_ENABLE)
+ ulp_notify = false;
+ else
+ spc_state = SCTP_ADDR_POTENTIALLY_FAILED;
break;
default:
@@ -845,16 +839,9 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
/* Generate and send a SCTP_PEER_ADDR_CHANGE notification
* to the user.
*/
- if (ulp_notify) {
- memset(&addr, 0, sizeof(struct sockaddr_storage));
- memcpy(&addr, &transport->ipaddr,
- transport->af_specific->sockaddr_len);
-
- event = sctp_ulpevent_make_peer_addr_change(asoc, &addr,
- 0, spc_state, error, GFP_ATOMIC);
- if (event)
- asoc->stream.si->enqueue_event(&asoc->ulpq, event);
- }
+ if (ulp_notify)
+ sctp_ulpevent_nofity_peer_addr_change(transport,
+ spc_state, error);
/* Select new active and retran paths. */
sctp_select_active_and_retran_path(asoc);
@@ -986,7 +973,7 @@ static void sctp_assoc_bh_rcv(struct work_struct *work)
struct sctp_association *asoc =
container_of(work, struct sctp_association,
base.inqueue.immediate);
- struct net *net = sock_net(asoc->base.sk);
+ struct net *net = asoc->base.net;
union sctp_subtype subtype;
struct sctp_endpoint *ep;
struct sctp_chunk *chunk;
@@ -1086,7 +1073,7 @@ void sctp_assoc_migrate(struct sctp_association *assoc, struct sock *newsk)
/* Decrement the backlog value for a TCP-style socket. */
if (sctp_style(oldsk, TCP))
- oldsk->sk_ack_backlog--;
+ sk_acceptq_removed(oldsk);
/* Release references to the old endpoint and the sock. */
sctp_endpoint_put(assoc->ep);
@@ -1454,7 +1441,8 @@ void sctp_assoc_sync_pmtu(struct sctp_association *asoc)
/* Should we send a SACK to update our peer? */
static inline bool sctp_peer_needs_update(struct sctp_association *asoc)
{
- struct net *net = sock_net(asoc->base.sk);
+ struct net *net = asoc->base.net;
+
switch (asoc->state) {
case SCTP_STATE_ESTABLISHED:
case SCTP_STATE_SHUTDOWN_PENDING:
@@ -1588,7 +1576,7 @@ int sctp_assoc_set_bind_addr_from_ep(struct sctp_association *asoc,
if (asoc->peer.ipv6_address)
flags |= SCTP_ADDR6_PEERSUPP;
- return sctp_bind_addr_copy(sock_net(asoc->base.sk),
+ return sctp_bind_addr_copy(asoc->base.net,
&asoc->base.bind_addr,
&asoc->ep->base.bind_addr,
scope, gfp, flags);
diff --git a/net/sctp/auth.c b/net/sctp/auth.c
index de4c78d4a21e..4278764d82b8 100644
--- a/net/sctp/auth.c
+++ b/net/sctp/auth.c
@@ -389,7 +389,7 @@ int sctp_auth_asoc_init_active_key(struct sctp_association *asoc, gfp_t gfp)
/* If we don't support AUTH, or peer is not capable
* we don't need to do anything.
*/
- if (!asoc->ep->auth_enable || !asoc->peer.auth_capable)
+ if (!asoc->peer.auth_capable)
return 0;
/* If the key_id is non-zero and we couldn't find an
@@ -675,7 +675,7 @@ int sctp_auth_send_cid(enum sctp_cid chunk, const struct sctp_association *asoc)
if (!asoc)
return 0;
- if (!asoc->ep->auth_enable || !asoc->peer.auth_capable)
+ if (!asoc->peer.auth_capable)
return 0;
return __sctp_auth_cid(chunk, asoc->peer.peer_chunks);
@@ -687,7 +687,7 @@ int sctp_auth_recv_cid(enum sctp_cid chunk, const struct sctp_association *asoc)
if (!asoc)
return 0;
- if (!asoc->ep->auth_enable)
+ if (!asoc->peer.auth_capable)
return 0;
return __sctp_auth_cid(chunk,
@@ -831,10 +831,15 @@ int sctp_auth_set_key(struct sctp_endpoint *ep,
/* Try to find the given key id to see if
* we are doing a replace, or adding a new key
*/
- if (asoc)
+ if (asoc) {
+ if (!asoc->peer.auth_capable)
+ return -EACCES;
sh_keys = &asoc->endpoint_shared_keys;
- else
+ } else {
+ if (!ep->auth_enable)
+ return -EACCES;
sh_keys = &ep->endpoint_shared_keys;
+ }
key_for_each(shkey, sh_keys) {
if (shkey->key_id == auth_key->sca_keynumber) {
@@ -875,10 +880,15 @@ int sctp_auth_set_active_key(struct sctp_endpoint *ep,
int found = 0;
/* The key identifier MUST correst to an existing key */
- if (asoc)
+ if (asoc) {
+ if (!asoc->peer.auth_capable)
+ return -EACCES;
sh_keys = &asoc->endpoint_shared_keys;
- else
+ } else {
+ if (!ep->auth_enable)
+ return -EACCES;
sh_keys = &ep->endpoint_shared_keys;
+ }
key_for_each(key, sh_keys) {
if (key->key_id == key_id) {
@@ -911,11 +921,15 @@ int sctp_auth_del_key_id(struct sctp_endpoint *ep,
* The key identifier MUST correst to an existing key
*/
if (asoc) {
+ if (!asoc->peer.auth_capable)
+ return -EACCES;
if (asoc->active_key_id == key_id)
return -EINVAL;
sh_keys = &asoc->endpoint_shared_keys;
} else {
+ if (!ep->auth_enable)
+ return -EACCES;
if (ep->active_key_id == key_id)
return -EINVAL;
@@ -950,11 +964,15 @@ int sctp_auth_deact_key_id(struct sctp_endpoint *ep,
* The key identifier MUST correst to an existing key
*/
if (asoc) {
+ if (!asoc->peer.auth_capable)
+ return -EACCES;
if (asoc->active_key_id == key_id)
return -EINVAL;
sh_keys = &asoc->endpoint_shared_keys;
} else {
+ if (!ep->auth_enable)
+ return -EACCES;
if (ep->active_key_id == key_id)
return -EINVAL;
@@ -989,3 +1007,72 @@ int sctp_auth_deact_key_id(struct sctp_endpoint *ep,
return 0;
}
+
+int sctp_auth_init(struct sctp_endpoint *ep, gfp_t gfp)
+{
+ int err = -ENOMEM;
+
+ /* Allocate space for HMACS and CHUNKS authentication
+ * variables. There are arrays that we encode directly
+ * into parameters to make the rest of the operations easier.
+ */
+ if (!ep->auth_hmacs_list) {
+ struct sctp_hmac_algo_param *auth_hmacs;
+
+ auth_hmacs = kzalloc(struct_size(auth_hmacs, hmac_ids,
+ SCTP_AUTH_NUM_HMACS), gfp);
+ if (!auth_hmacs)
+ goto nomem;
+ /* Initialize the HMACS parameter.
+ * SCTP-AUTH: Section 3.3
+ * Every endpoint supporting SCTP chunk authentication MUST
+ * support the HMAC based on the SHA-1 algorithm.
+ */
+ auth_hmacs->param_hdr.type = SCTP_PARAM_HMAC_ALGO;
+ auth_hmacs->param_hdr.length =
+ htons(sizeof(struct sctp_paramhdr) + 2);
+ auth_hmacs->hmac_ids[0] = htons(SCTP_AUTH_HMAC_ID_SHA1);
+ ep->auth_hmacs_list = auth_hmacs;
+ }
+
+ if (!ep->auth_chunk_list) {
+ struct sctp_chunks_param *auth_chunks;
+
+ auth_chunks = kzalloc(sizeof(*auth_chunks) +
+ SCTP_NUM_CHUNK_TYPES, gfp);
+ if (!auth_chunks)
+ goto nomem;
+ /* Initialize the CHUNKS parameter */
+ auth_chunks->param_hdr.type = SCTP_PARAM_CHUNKS;
+ auth_chunks->param_hdr.length =
+ htons(sizeof(struct sctp_paramhdr));
+ ep->auth_chunk_list = auth_chunks;
+ }
+
+ /* Allocate and initialize transorms arrays for supported
+ * HMACs.
+ */
+ err = sctp_auth_init_hmacs(ep, gfp);
+ if (err)
+ goto nomem;
+
+ return 0;
+
+nomem:
+ /* Free all allocations */
+ kfree(ep->auth_hmacs_list);
+ kfree(ep->auth_chunk_list);
+ ep->auth_hmacs_list = NULL;
+ ep->auth_chunk_list = NULL;
+ return err;
+}
+
+void sctp_auth_free(struct sctp_endpoint *ep)
+{
+ kfree(ep->auth_hmacs_list);
+ kfree(ep->auth_chunk_list);
+ ep->auth_hmacs_list = NULL;
+ ep->auth_chunk_list = NULL;
+ sctp_auth_destroy_hmacs(ep->auth_hmacs);
+ ep->auth_hmacs = NULL;
+}
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index cc0405c79dfc..ab6a997e222f 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -75,41 +75,39 @@ static void sctp_datamsg_destroy(struct sctp_datamsg *msg)
struct list_head *pos, *temp;
struct sctp_chunk *chunk;
struct sctp_ulpevent *ev;
- int error = 0, notify;
-
- /* If we failed, we may need to notify. */
- notify = msg->send_failed ? -1 : 0;
+ int error, sent;
/* Release all references. */
list_for_each_safe(pos, temp, &msg->chunks) {
list_del_init(pos);
chunk = list_entry(pos, struct sctp_chunk, frag_list);
- /* Check whether we _really_ need to notify. */
- if (notify < 0) {
- asoc = chunk->asoc;
- if (msg->send_error)
- error = msg->send_error;
- else
- error = asoc->outqueue.error;
-
- notify = sctp_ulpevent_type_enabled(asoc->subscribe,
- SCTP_SEND_FAILED);
+
+ if (!msg->send_failed) {
+ sctp_chunk_put(chunk);
+ continue;
}
- /* Generate a SEND FAILED event only if enabled. */
- if (notify > 0) {
- int sent;
- if (chunk->has_tsn)
- sent = SCTP_DATA_SENT;
- else
- sent = SCTP_DATA_UNSENT;
+ asoc = chunk->asoc;
+ error = msg->send_error ?: asoc->outqueue.error;
+ sent = chunk->has_tsn ? SCTP_DATA_SENT : SCTP_DATA_UNSENT;
+ if (sctp_ulpevent_type_enabled(asoc->subscribe,
+ SCTP_SEND_FAILED)) {
ev = sctp_ulpevent_make_send_failed(asoc, chunk, sent,
error, GFP_ATOMIC);
if (ev)
asoc->stream.si->enqueue_event(&asoc->ulpq, ev);
}
+ if (sctp_ulpevent_type_enabled(asoc->subscribe,
+ SCTP_SEND_FAILED_EVENT)) {
+ ev = sctp_ulpevent_make_send_failed_event(asoc, chunk,
+ sent, error,
+ GFP_ATOMIC);
+ if (ev)
+ asoc->stream.si->enqueue_event(&asoc->ulpq, ev);
+ }
+
sctp_chunk_put(chunk);
}
@@ -227,7 +225,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
if (msg_len >= first_len) {
msg->can_delay = 0;
if (msg_len > first_len)
- SCTP_INC_STATS(sock_net(asoc->base.sk),
+ SCTP_INC_STATS(asoc->base.net,
SCTP_MIB_FRAGUSRMSGS);
} else {
/* Which may be the only one... */
diff --git a/net/sctp/diag.c b/net/sctp/diag.c
index fc9a4c6629ce..8a15146faaeb 100644
--- a/net/sctp/diag.c
+++ b/net/sctp/diag.c
@@ -175,7 +175,7 @@ static int inet_sctp_diag_fill(struct sock *sk, struct sctp_association *asoc,
mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
- mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
+ mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
if (nla_put(skb, INET_DIAG_SKMEMINFO, sizeof(mem), &mem) < 0)
@@ -425,8 +425,8 @@ static void sctp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
r->idiag_rqueue = atomic_read(&infox->asoc->rmem_alloc);
r->idiag_wqueue = infox->asoc->sndbuf_used;
} else {
- r->idiag_rqueue = sk->sk_ack_backlog;
- r->idiag_wqueue = sk->sk_max_ack_backlog;
+ r->idiag_rqueue = READ_ONCE(sk->sk_ack_backlog);
+ r->idiag_wqueue = READ_ONCE(sk->sk_max_ack_backlog);
}
if (infox->sctpinfo)
sctp_get_sctp_info(sk, infox->asoc, infox->sctpinfo);
diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c
index 69cebb2c998b..48c9c2c7602f 100644
--- a/net/sctp/endpointola.c
+++ b/net/sctp/endpointola.c
@@ -43,62 +43,21 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
gfp_t gfp)
{
struct net *net = sock_net(sk);
- struct sctp_hmac_algo_param *auth_hmacs = NULL;
- struct sctp_chunks_param *auth_chunks = NULL;
struct sctp_shared_key *null_key;
- int err;
ep->digest = kzalloc(SCTP_SIGNATURE_SIZE, gfp);
if (!ep->digest)
return NULL;
+ ep->asconf_enable = net->sctp.addip_enable;
ep->auth_enable = net->sctp.auth_enable;
if (ep->auth_enable) {
- /* Allocate space for HMACS and CHUNKS authentication
- * variables. There are arrays that we encode directly
- * into parameters to make the rest of the operations easier.
- */
- auth_hmacs = kzalloc(struct_size(auth_hmacs, hmac_ids,
- SCTP_AUTH_NUM_HMACS), gfp);
- if (!auth_hmacs)
- goto nomem;
-
- auth_chunks = kzalloc(sizeof(*auth_chunks) +
- SCTP_NUM_CHUNK_TYPES, gfp);
- if (!auth_chunks)
+ if (sctp_auth_init(ep, gfp))
goto nomem;
-
- /* Initialize the HMACS parameter.
- * SCTP-AUTH: Section 3.3
- * Every endpoint supporting SCTP chunk authentication MUST
- * support the HMAC based on the SHA-1 algorithm.
- */
- auth_hmacs->param_hdr.type = SCTP_PARAM_HMAC_ALGO;
- auth_hmacs->param_hdr.length =
- htons(sizeof(struct sctp_paramhdr) + 2);
- auth_hmacs->hmac_ids[0] = htons(SCTP_AUTH_HMAC_ID_SHA1);
-
- /* Initialize the CHUNKS parameter */
- auth_chunks->param_hdr.type = SCTP_PARAM_CHUNKS;
- auth_chunks->param_hdr.length =
- htons(sizeof(struct sctp_paramhdr));
-
- /* If the Add-IP functionality is enabled, we must
- * authenticate, ASCONF and ASCONF-ACK chunks
- */
- if (net->sctp.addip_enable) {
- auth_chunks->chunks[0] = SCTP_CID_ASCONF;
- auth_chunks->chunks[1] = SCTP_CID_ASCONF_ACK;
- auth_chunks->param_hdr.length =
- htons(sizeof(struct sctp_paramhdr) + 2);
+ if (ep->asconf_enable) {
+ sctp_auth_ep_add_chunkid(ep, SCTP_CID_ASCONF);
+ sctp_auth_ep_add_chunkid(ep, SCTP_CID_ASCONF_ACK);
}
-
- /* Allocate and initialize transorms arrays for supported
- * HMACs.
- */
- err = sctp_auth_init_hmacs(ep, gfp);
- if (err)
- goto nomem;
}
/* Initialize the base structure. */
@@ -145,23 +104,20 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
/* Add the null key to the endpoint shared keys list and
* set the hmcas and chunks pointers.
*/
- ep->auth_hmacs_list = auth_hmacs;
- ep->auth_chunk_list = auth_chunks;
ep->prsctp_enable = net->sctp.prsctp_enable;
ep->reconf_enable = net->sctp.reconf_enable;
+ ep->ecn_enable = net->sctp.ecn_enable;
/* Remember who we are attached to. */
ep->base.sk = sk;
+ ep->base.net = sock_net(sk);
sock_hold(ep->base.sk);
return ep;
nomem_shkey:
- sctp_auth_destroy_hmacs(ep->auth_hmacs);
+ sctp_auth_free(ep);
nomem:
- /* Free all allocations */
- kfree(auth_hmacs);
- kfree(auth_chunks);
kfree(ep->digest);
return NULL;
@@ -209,7 +165,7 @@ void sctp_endpoint_add_asoc(struct sctp_endpoint *ep,
/* Increment the backlog value for a TCP-style listening socket. */
if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING))
- sk->sk_ack_backlog++;
+ sk_acceptq_added(sk);
}
/* Free the endpoint structure. Delay cleanup until
@@ -244,11 +200,7 @@ static void sctp_endpoint_destroy(struct sctp_endpoint *ep)
* chunks and hmacs arrays that were allocated
*/
sctp_auth_destroy_keys(&ep->endpoint_shared_keys);
- kfree(ep->auth_hmacs_list);
- kfree(ep->auth_chunk_list);
-
- /* AUTH - Free any allocated HMAC transform containers */
- sctp_auth_destroy_hmacs(ep->auth_hmacs);
+ sctp_auth_free(ep);
/* Cleanup. */
sctp_inq_free(&ep->base.inqueue);
@@ -292,7 +244,7 @@ struct sctp_endpoint *sctp_endpoint_is_match(struct sctp_endpoint *ep,
struct sctp_endpoint *retval = NULL;
if ((htons(ep->base.bind_addr.port) == laddr->v4.sin_port) &&
- net_eq(sock_net(ep->base.sk), net)) {
+ net_eq(ep->base.net, net)) {
if (sctp_bind_addr_match(&ep->base.bind_addr, laddr,
sctp_sk(ep->base.sk)))
retval = ep;
@@ -340,8 +292,8 @@ bool sctp_endpoint_is_peeled_off(struct sctp_endpoint *ep,
const union sctp_addr *paddr)
{
struct sctp_sockaddr_entry *addr;
+ struct net *net = ep->base.net;
struct sctp_bind_addr *bp;
- struct net *net = sock_net(ep->base.sk);
bp = &ep->base.bind_addr;
/* This function is called with the socket lock held,
@@ -432,7 +384,7 @@ normal:
if (asoc && sctp_chunk_is_data(chunk))
asoc->peer.last_data_from = chunk->transport;
else {
- SCTP_INC_STATS(sock_net(ep->base.sk), SCTP_MIB_INCTRLCHUNKS);
+ SCTP_INC_STATS(ep->base.net, SCTP_MIB_INCTRLCHUNKS);
if (asoc)
asoc->stats.ictrlchunks++;
}
diff --git a/net/sctp/input.c b/net/sctp/input.c
index 1008cdc44dd6..efaaefc3bb1c 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -201,7 +201,7 @@ int sctp_rcv(struct sk_buff *skb)
if (!xfrm_policy_check(sk, XFRM_POLICY_IN, skb, family))
goto discard_release;
- nf_reset(skb);
+ nf_reset_ct(skb);
if (sk_filter(sk, skb))
goto discard_release;
@@ -243,7 +243,7 @@ int sctp_rcv(struct sk_buff *skb)
bh_lock_sock(sk);
}
- if (sock_owned_by_user(sk)) {
+ if (sock_owned_by_user(sk) || !sctp_newsk_ready(sk)) {
if (sctp_add_backlog(sk, skb)) {
bh_unlock_sock(sk);
sctp_chunk_free(chunk);
@@ -321,8 +321,8 @@ int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb)
local_bh_disable();
bh_lock_sock(sk);
- if (sock_owned_by_user(sk)) {
- if (sk_add_backlog(sk, skb, sk->sk_rcvbuf))
+ if (sock_owned_by_user(sk) || !sctp_newsk_ready(sk)) {
+ if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf)))
sctp_chunk_free(chunk);
else
backloged = 1;
@@ -336,7 +336,13 @@ int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb)
if (backloged)
return 0;
} else {
- sctp_inq_push(inqueue, chunk);
+ if (!sctp_newsk_ready(sk)) {
+ if (!sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf)))
+ return 0;
+ sctp_chunk_free(chunk);
+ } else {
+ sctp_inq_push(inqueue, chunk);
+ }
}
done:
@@ -358,7 +364,7 @@ static int sctp_add_backlog(struct sock *sk, struct sk_buff *skb)
struct sctp_ep_common *rcvr = chunk->rcvr;
int ret;
- ret = sk_add_backlog(sk, skb, sk->sk_rcvbuf);
+ ret = sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf));
if (!ret) {
/* Hold the assoc/ep while hanging on the backlog queue.
* This way, we know structures we need will not disappear
@@ -876,7 +882,7 @@ static inline int sctp_hash_cmp(struct rhashtable_compare_arg *arg,
if (!sctp_transport_hold(t))
return err;
- if (!net_eq(sock_net(t->asoc->base.sk), x->net))
+ if (!net_eq(t->asoc->base.net, x->net))
goto out;
if (x->lport != htons(t->asoc->base.bind_addr.port))
goto out;
@@ -891,7 +897,7 @@ static inline __u32 sctp_hash_obj(const void *data, u32 len, u32 seed)
{
const struct sctp_transport *t = data;
- return sctp_hashfn(sock_net(t->asoc->base.sk),
+ return sctp_hashfn(t->asoc->base.net,
htons(t->asoc->base.bind_addr.port),
&t->ipaddr, seed);
}
@@ -931,7 +937,7 @@ int sctp_hash_transport(struct sctp_transport *t)
if (t->asoc->temp)
return 0;
- arg.net = sock_net(t->asoc->base.sk);
+ arg.net = t->asoc->base.net;
arg.paddr = &t->ipaddr;
arg.lport = htons(t->asoc->base.bind_addr.port);
@@ -998,12 +1004,11 @@ struct sctp_transport *sctp_epaddr_lookup_transport(
const struct sctp_endpoint *ep,
const union sctp_addr *paddr)
{
- struct net *net = sock_net(ep->base.sk);
struct rhlist_head *tmp, *list;
struct sctp_transport *t;
struct sctp_hash_cmp_arg arg = {
.paddr = paddr,
- .net = net,
+ .net = ep->base.net,
.lport = htons(ep->base.bind_addr.port),
};
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index e5f2fc726a98..bc734cfaa29e 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -215,7 +215,7 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport)
rcu_read_lock();
res = ip6_xmit(sk, skb, fl6, sk->sk_mark, rcu_dereference(np->opt),
- tclass);
+ tclass, sk->sk_priority);
rcu_read_unlock();
return res;
}
@@ -275,7 +275,7 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final);
rcu_read_unlock();
- dst = ip6_dst_lookup_flow(sk, fl6, final_p);
+ dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p);
if (!asoc || saddr)
goto out;
@@ -328,7 +328,7 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
fl6->saddr = laddr->a.v6.sin6_addr;
fl6->fl6_sport = laddr->a.v6.sin6_port;
final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final);
- bdst = ip6_dst_lookup_flow(sk, fl6, final_p);
+ bdst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p);
if (IS_ERR(bdst))
continue;
diff --git a/net/sctp/output.c b/net/sctp/output.c
index dbda7e7927fd..1441eaf460bb 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -282,7 +282,7 @@ static enum sctp_xmit sctp_packet_bundle_sack(struct sctp_packet *pkt,
sctp_chunk_free(sack);
goto out;
}
- SCTP_INC_STATS(sock_net(asoc->base.sk),
+ SCTP_INC_STATS(asoc->base.net,
SCTP_MIB_OUTCTRLCHUNKS);
asoc->stats.octrlchunks++;
asoc->peer.sack_needed = 0;
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 0dab62b67b9a..577e3bc4ee6f 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -36,6 +36,7 @@
#include <net/sctp/sctp.h>
#include <net/sctp/sm.h>
#include <net/sctp/stream_sched.h>
+#include <trace/events/sctp.h>
/* Declare internal functions here. */
static int sctp_acked(struct sctp_sackhdr *sack, __u32 tsn);
@@ -279,7 +280,7 @@ void sctp_outq_free(struct sctp_outq *q)
/* Put a new chunk in an sctp_outq. */
void sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk, gfp_t gfp)
{
- struct net *net = sock_net(q->asoc->base.sk);
+ struct net *net = q->asoc->base.net;
pr_debug("%s: outq:%p, chunk:%p[%s]\n", __func__, q, chunk,
chunk && chunk->chunk_hdr ?
@@ -533,7 +534,7 @@ void sctp_retransmit_mark(struct sctp_outq *q,
void sctp_retransmit(struct sctp_outq *q, struct sctp_transport *transport,
enum sctp_retransmit_reason reason)
{
- struct net *net = sock_net(q->asoc->base.sk);
+ struct net *net = q->asoc->base.net;
switch (reason) {
case SCTP_RTXR_T3_RTX:
@@ -1238,6 +1239,12 @@ int sctp_outq_sack(struct sctp_outq *q, struct sctp_chunk *chunk)
/* Grab the association's destination address list. */
transport_list = &asoc->peer.transport_addr_list;
+ /* SCTP path tracepoint for congestion control debugging. */
+ if (trace_sctp_probe_path_enabled()) {
+ list_for_each_entry(transport, transport_list, transports)
+ trace_sctp_probe_path(transport, asoc);
+ }
+
sack_ctsn = ntohl(sack->cum_tsn_ack);
gap_ack_blocks = ntohs(sack->num_gap_ack_blocks);
asoc->stats.gapcnt += gap_ack_blocks;
@@ -1884,6 +1891,6 @@ void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 ctsn)
if (ftsn_chunk) {
list_add_tail(&ftsn_chunk->list, &q->control_chunk_list);
- SCTP_INC_STATS(sock_net(asoc->base.sk), SCTP_MIB_OUTCTRLCHUNKS);
+ SCTP_INC_STATS(asoc->base.net, SCTP_MIB_OUTCTRLCHUNKS);
}
}
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 2d47adcb4cbe..78af2fcf90cc 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -227,6 +227,7 @@ static void sctp_v4_from_skb(union sctp_addr *addr, struct sk_buff *skb,
sa->sin_port = sh->dest;
sa->sin_addr.s_addr = ip_hdr(skb)->daddr;
}
+ memset(sa->sin_zero, 0, sizeof(sa->sin_zero));
}
/* Initialize an sctp_addr from a socket. */
@@ -235,6 +236,7 @@ static void sctp_v4_from_sk(union sctp_addr *addr, struct sock *sk)
addr->v4.sin_family = AF_INET;
addr->v4.sin_port = 0;
addr->v4.sin_addr.s_addr = inet_sk(sk)->inet_rcv_saddr;
+ memset(addr->v4.sin_zero, 0, sizeof(addr->v4.sin_zero));
}
/* Initialize sk->sk_rcv_saddr from sctp_addr. */
@@ -257,6 +259,7 @@ static void sctp_v4_from_addr_param(union sctp_addr *addr,
addr->v4.sin_family = AF_INET;
addr->v4.sin_port = port;
addr->v4.sin_addr.s_addr = param->v4.addr.s_addr;
+ memset(addr->v4.sin_zero, 0, sizeof(addr->v4.sin_zero));
}
/* Initialize an address parameter from a sctp_addr and return the length
@@ -281,6 +284,7 @@ static void sctp_v4_dst_saddr(union sctp_addr *saddr, struct flowi4 *fl4,
saddr->v4.sin_family = AF_INET;
saddr->v4.sin_port = port;
saddr->v4.sin_addr.s_addr = fl4->saddr;
+ memset(saddr->v4.sin_zero, 0, sizeof(saddr->v4.sin_zero));
}
/* Compare two addresses exactly. */
@@ -303,6 +307,7 @@ static void sctp_v4_inaddr_any(union sctp_addr *addr, __be16 port)
addr->v4.sin_family = AF_INET;
addr->v4.sin_addr.s_addr = htonl(INADDR_ANY);
addr->v4.sin_port = port;
+ memset(addr->v4.sin_zero, 0, sizeof(addr->v4.sin_zero));
}
/* Is this a wildcard address? */
@@ -1217,9 +1222,15 @@ static int __net_init sctp_defaults_init(struct net *net)
/* Max.Burst - 4 */
net->sctp.max_burst = SCTP_DEFAULT_MAX_BURST;
+ /* Disable of Primary Path Switchover by default */
+ net->sctp.ps_retrans = SCTP_PS_RETRANS_MAX;
+
/* Enable pf state by default */
net->sctp.pf_enable = 1;
+ /* Ignore pf exposure feature by default */
+ net->sctp.pf_expose = SCTP_PF_EXPOSE_UNSET;
+
/* Association.Max.Retrans - 10 attempts
* Path.Max.Retrans - 5 attempts (per destination address)
* Max.Init.Retransmits - 8 attempts
@@ -1254,6 +1265,9 @@ static int __net_init sctp_defaults_init(struct net *net)
/* Disable AUTH by default. */
net->sctp.auth_enable = 0;
+ /* Enable ECN by default. */
+ net->sctp.ecn_enable = 1;
+
/* Set SCOPE policy to enabled */
net->sctp.scope_policy = SCTP_SCOPE_POLICY_ENABLE;
@@ -1336,7 +1350,7 @@ static int __net_init sctp_ctrlsock_init(struct net *net)
return status;
}
-static void __net_init sctp_ctrlsock_exit(struct net *net)
+static void __net_exit sctp_ctrlsock_exit(struct net *net)
{
/* Free the control endpoint. */
inet_ctl_sock_destroy(net->sctp.ctl_sock);
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 36bd8a6e82df..09050c1d5517 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -207,7 +207,6 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
const struct sctp_bind_addr *bp,
gfp_t gfp, int vparam_len)
{
- struct net *net = sock_net(asoc->base.sk);
struct sctp_supported_ext_param ext_param;
struct sctp_adaptation_ind_param aiparam;
struct sctp_paramhdr *auth_chunks = NULL;
@@ -245,7 +244,9 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
chunksize = sizeof(init) + addrs_len;
chunksize += SCTP_PAD4(SCTP_SAT_LEN(num_types));
- chunksize += sizeof(ecap_param);
+
+ if (asoc->ep->ecn_enable)
+ chunksize += sizeof(ecap_param);
if (asoc->ep->prsctp_enable)
chunksize += sizeof(prsctp_param);
@@ -255,7 +256,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
* the ASCONF,the ASCONF-ACK, and the AUTH chunks in its INIT and
* INIT-ACK parameters.
*/
- if (net->sctp.addip_enable) {
+ if (asoc->ep->asconf_enable) {
extensions[num_ext] = SCTP_CID_ASCONF;
extensions[num_ext+1] = SCTP_CID_ASCONF_ACK;
num_ext += 2;
@@ -336,7 +337,8 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
sctp_addto_chunk(retval, sizeof(sat), &sat);
sctp_addto_chunk(retval, num_types * sizeof(__u16), &types);
- sctp_addto_chunk(retval, sizeof(ecap_param), &ecap_param);
+ if (asoc->ep->ecn_enable)
+ sctp_addto_chunk(retval, sizeof(ecap_param), &ecap_param);
/* Add the supported extensions parameter. Be nice and add this
* fist before addiding the parameters for the extensions themselves
@@ -1964,7 +1966,9 @@ static int sctp_process_hn_param(const struct sctp_association *asoc,
return 0;
}
-static int sctp_verify_ext_param(struct net *net, union sctp_params param)
+static int sctp_verify_ext_param(struct net *net,
+ const struct sctp_endpoint *ep,
+ union sctp_params param)
{
__u16 num_ext = ntohs(param.p->length) - sizeof(struct sctp_paramhdr);
int have_asconf = 0;
@@ -1991,7 +1995,7 @@ static int sctp_verify_ext_param(struct net *net, union sctp_params param)
if (net->sctp.addip_noauth)
return 1;
- if (net->sctp.addip_enable && !have_auth && have_asconf)
+ if (ep->asconf_enable && !have_auth && have_asconf)
return 0;
return 1;
@@ -2001,7 +2005,6 @@ static void sctp_process_ext_param(struct sctp_association *asoc,
union sctp_params param)
{
__u16 num_ext = ntohs(param.p->length) - sizeof(struct sctp_paramhdr);
- struct net *net = sock_net(asoc->base.sk);
int i;
for (i = 0; i < num_ext; i++) {
@@ -2023,7 +2026,7 @@ static void sctp_process_ext_param(struct sctp_association *asoc,
break;
case SCTP_CID_ASCONF:
case SCTP_CID_ASCONF_ACK:
- if (net->sctp.addip_enable)
+ if (asoc->ep->asconf_enable)
asoc->peer.asconf_capable = 1;
break;
case SCTP_CID_I_DATA:
@@ -2145,14 +2148,14 @@ static enum sctp_ierror sctp_verify_param(struct net *net,
break;
case SCTP_PARAM_SUPPORTED_EXT:
- if (!sctp_verify_ext_param(net, param))
+ if (!sctp_verify_ext_param(net, ep, param))
return SCTP_IERROR_ABORT;
break;
case SCTP_PARAM_SET_PRIMARY:
- if (net->sctp.addip_enable)
+ if (ep->asconf_enable)
break;
- goto fallthrough;
+ goto unhandled;
case SCTP_PARAM_HOST_NAME_ADDRESS:
/* Tell the peer, we won't support this param. */
@@ -2163,11 +2166,11 @@ static enum sctp_ierror sctp_verify_param(struct net *net,
case SCTP_PARAM_FWD_TSN_SUPPORT:
if (ep->prsctp_enable)
break;
- goto fallthrough;
+ goto unhandled;
case SCTP_PARAM_RANDOM:
if (!ep->auth_enable)
- goto fallthrough;
+ goto unhandled;
/* SCTP-AUTH: Secion 6.1
* If the random number is not 32 byte long the association
@@ -2184,7 +2187,7 @@ static enum sctp_ierror sctp_verify_param(struct net *net,
case SCTP_PARAM_CHUNKS:
if (!ep->auth_enable)
- goto fallthrough;
+ goto unhandled;
/* SCTP-AUTH: Section 3.2
* The CHUNKS parameter MUST be included once in the INIT or
@@ -2200,7 +2203,7 @@ static enum sctp_ierror sctp_verify_param(struct net *net,
case SCTP_PARAM_HMAC_ALGO:
if (!ep->auth_enable)
- goto fallthrough;
+ goto unhandled;
hmacs = (struct sctp_hmac_algo_param *)param.p;
n_elt = (ntohs(param.p->length) -
@@ -2223,7 +2226,7 @@ static enum sctp_ierror sctp_verify_param(struct net *net,
retval = SCTP_IERROR_ABORT;
}
break;
-fallthrough:
+unhandled:
default:
pr_debug("%s: unrecognized param:%d for chunk:%d\n",
__func__, ntohs(param.p->type), cid);
@@ -2304,7 +2307,6 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk,
const union sctp_addr *peer_addr,
struct sctp_init_chunk *peer_init, gfp_t gfp)
{
- struct net *net = sock_net(asoc->base.sk);
struct sctp_transport *transport;
struct list_head *pos, *temp;
union sctp_params param;
@@ -2360,8 +2362,8 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk,
* also give us an option to silently ignore the packet, which
* is what we'll do here.
*/
- if (!net->sctp.addip_noauth &&
- (asoc->peer.asconf_capable && !asoc->peer.auth_capable)) {
+ if (!asoc->base.net->sctp.addip_noauth &&
+ (asoc->peer.asconf_capable && !asoc->peer.auth_capable)) {
asoc->peer.addip_disabled_mask |= (SCTP_PARAM_ADD_IP |
SCTP_PARAM_DEL_IP |
SCTP_PARAM_SET_PRIMARY);
@@ -2488,9 +2490,9 @@ static int sctp_process_param(struct sctp_association *asoc,
const union sctp_addr *peer_addr,
gfp_t gfp)
{
- struct net *net = sock_net(asoc->base.sk);
struct sctp_endpoint *ep = asoc->ep;
union sctp_addr_param *addr_param;
+ struct net *net = asoc->base.net;
struct sctp_transport *t;
enum sctp_scope scope;
union sctp_addr addr;
@@ -2597,15 +2599,20 @@ do_addr_param:
break;
case SCTP_PARAM_ECN_CAPABLE:
- asoc->peer.ecn_capable = 1;
- break;
+ if (asoc->ep->ecn_enable) {
+ asoc->peer.ecn_capable = 1;
+ break;
+ }
+ /* Fall Through */
+ goto fall_through;
+
case SCTP_PARAM_ADAPTATION_LAYER_IND:
asoc->peer.adaptation_ind = ntohl(param.aind->adaptation_ind);
break;
case SCTP_PARAM_SET_PRIMARY:
- if (!net->sctp.addip_enable)
+ if (!ep->asconf_enable)
goto fall_through;
addr_param = param.v + sizeof(struct sctp_addip_param);
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index 1cf5bb5b73c4..2bc29463e1dc 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -516,8 +516,6 @@ static void sctp_do_8_2_transport_strike(struct sctp_cmd_seq *commands,
struct sctp_transport *transport,
int is_hb)
{
- struct net *net = sock_net(asoc->base.sk);
-
/* The check for association's overall error counter exceeding the
* threshold is done in the state function.
*/
@@ -544,10 +542,10 @@ static void sctp_do_8_2_transport_strike(struct sctp_cmd_seq *commands,
* is SCTP_ACTIVE, then mark this transport as Partially Failed,
* see SCTP Quick Failover Draft, section 5.1
*/
- if (net->sctp.pf_enable &&
- (transport->state == SCTP_ACTIVE) &&
- (transport->error_count < transport->pathmaxrxt) &&
- (transport->error_count > asoc->pf_retrans)) {
+ if (asoc->base.net->sctp.pf_enable &&
+ transport->state == SCTP_ACTIVE &&
+ transport->error_count < transport->pathmaxrxt &&
+ transport->error_count > transport->pf_retrans) {
sctp_assoc_control_transport(asoc, transport,
SCTP_TRANSPORT_PF,
@@ -567,6 +565,11 @@ static void sctp_do_8_2_transport_strike(struct sctp_cmd_seq *commands,
SCTP_FAILED_THRESHOLD);
}
+ if (transport->error_count > transport->ps_retrans &&
+ asoc->peer.primary_path == transport &&
+ asoc->peer.active_path != transport)
+ sctp_assoc_set_primary(asoc, asoc->peer.active_path);
+
/* E2) For the destination address for which the timer
* expires, set RTO <- RTO * 2 ("back off the timer"). The
* maximum value discussed in rule C7 above (RTO.max) may be
@@ -793,10 +796,8 @@ static int sctp_cmd_process_sack(struct sctp_cmd_seq *cmds,
int err = 0;
if (sctp_outq_sack(&asoc->outqueue, chunk)) {
- struct net *net = sock_net(asoc->base.sk);
-
/* There are no more TSNs awaiting SACK. */
- err = sctp_do_sm(net, SCTP_EVENT_T_OTHER,
+ err = sctp_do_sm(asoc->base.net, SCTP_EVENT_T_OTHER,
SCTP_ST_OTHER(SCTP_EVENT_NO_PENDING_TSN),
asoc->state, asoc->ep, asoc, NULL,
GFP_ATOMIC);
@@ -829,7 +830,7 @@ static void sctp_cmd_assoc_update(struct sctp_cmd_seq *cmds,
struct sctp_association *asoc,
struct sctp_association *new)
{
- struct net *net = sock_net(asoc->base.sk);
+ struct net *net = asoc->base.net;
struct sctp_chunk *abort;
if (!sctp_assoc_update(asoc, new))
@@ -1358,8 +1359,10 @@ static int sctp_cmd_interpreter(enum sctp_event_type event_type,
/* Generate an INIT ACK chunk. */
new_obj = sctp_make_init_ack(asoc, chunk, GFP_ATOMIC,
0);
- if (!new_obj)
- goto nomem;
+ if (!new_obj) {
+ error = -ENOMEM;
+ break;
+ }
sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
SCTP_CHUNK(new_obj));
@@ -1381,7 +1384,8 @@ static int sctp_cmd_interpreter(enum sctp_event_type event_type,
if (!new_obj) {
if (cmd->obj.chunk)
sctp_chunk_free(cmd->obj.chunk);
- goto nomem;
+ error = -ENOMEM;
+ break;
}
sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
SCTP_CHUNK(new_obj));
@@ -1428,8 +1432,10 @@ static int sctp_cmd_interpreter(enum sctp_event_type event_type,
/* Generate a SHUTDOWN chunk. */
new_obj = sctp_make_shutdown(asoc, chunk);
- if (!new_obj)
- goto nomem;
+ if (!new_obj) {
+ error = -ENOMEM;
+ break;
+ }
sctp_add_cmd_sf(commands, SCTP_CMD_REPLY,
SCTP_CHUNK(new_obj));
break;
@@ -1765,11 +1771,17 @@ static int sctp_cmd_interpreter(enum sctp_event_type event_type,
break;
}
- if (error)
+ if (error) {
+ cmd = sctp_next_cmd(commands);
+ while (cmd) {
+ if (cmd->verb == SCTP_CMD_REPLY)
+ sctp_chunk_free(cmd->obj.chunk);
+ cmd = sctp_next_cmd(commands);
+ }
break;
+ }
}
-out:
/* If this is in response to a received chunk, wait until
* we are done with the packet to open the queue so that we don't
* send multiple packets in response to a single request.
@@ -1784,7 +1796,4 @@ out:
sp->data_ready_signalled = 0;
return error;
-nomem:
- error = -ENOMEM;
- goto out;
}
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 2c244b29a199..748e3b19ec1d 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -1320,7 +1320,7 @@ static int sctp_sf_check_restart_addrs(const struct sctp_association *new_asoc,
struct sctp_chunk *init,
struct sctp_cmd_seq *commands)
{
- struct net *net = sock_net(new_asoc->base.sk);
+ struct net *net = new_asoc->base.net;
struct sctp_transport *new_addr;
int ret = 1;
@@ -2160,8 +2160,10 @@ enum sctp_disposition sctp_sf_do_5_2_4_dupcook(
/* Update socket peer label if first association. */
if (security_sctp_assoc_request((struct sctp_endpoint *)ep,
- chunk->skb))
+ chunk->skb)) {
+ sctp_association_free(new_asoc);
return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+ }
/* Set temp so that it won't be added into hashtable */
new_asoc->temp = 1;
@@ -3279,8 +3281,6 @@ enum sctp_disposition sctp_sf_eat_sack_6_2(struct net *net,
struct sctp_sackhdr *sackh;
__u32 ctsn;
- trace_sctp_probe(ep, asoc, chunk);
-
if (!sctp_vtag_verify(chunk, asoc))
return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
@@ -3297,6 +3297,15 @@ enum sctp_disposition sctp_sf_eat_sack_6_2(struct net *net,
chunk->subh.sack_hdr = sackh;
ctsn = ntohl(sackh->cum_tsn_ack);
+ /* If Cumulative TSN Ack beyond the max tsn currently
+ * send, terminating the association and respond to the
+ * sender with an ABORT.
+ */
+ if (TSN_lte(asoc->next_tsn, ctsn))
+ return sctp_sf_violation_ctsn(net, ep, asoc, type, arg, commands);
+
+ trace_sctp_probe(ep, asoc, chunk);
+
/* i) If Cumulative TSN Ack is less than the Cumulative TSN
* Ack Point, then drop the SACK. Since Cumulative TSN
* Ack is monotonically increasing, a SACK whose
@@ -3310,13 +3319,6 @@ enum sctp_disposition sctp_sf_eat_sack_6_2(struct net *net,
return SCTP_DISPOSITION_DISCARD;
}
- /* If Cumulative TSN Ack beyond the max tsn currently
- * send, terminating the association and respond to the
- * sender with an ABORT.
- */
- if (!TSN_lt(ctsn, asoc->next_tsn))
- return sctp_sf_violation_ctsn(net, ep, asoc, type, arg, commands);
-
/* Return this SACK for further processing. */
sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_SACK, SCTP_CHUNK(chunk));
@@ -3721,7 +3723,8 @@ enum sctp_disposition sctp_sf_do_asconf(struct net *net,
* is received unauthenticated it MUST be silently discarded as
* described in [I-D.ietf-tsvwg-sctp-auth].
*/
- if (!net->sctp.addip_noauth && !chunk->auth)
+ if (!asoc->peer.asconf_capable ||
+ (!net->sctp.addip_noauth && !chunk->auth))
return sctp_sf_discard_chunk(net, ep, asoc, type, arg,
commands);
@@ -3863,7 +3866,8 @@ enum sctp_disposition sctp_sf_do_asconf_ack(struct net *net,
* is received unauthenticated it MUST be silently discarded as
* described in [I-D.ietf-tsvwg-sctp-auth].
*/
- if (!net->sctp.addip_noauth && !asconf_ack->auth)
+ if (!asoc->peer.asconf_capable ||
+ (!net->sctp.addip_noauth && !asconf_ack->auth))
return sctp_sf_discard_chunk(net, ep, asoc, type, arg,
commands);
diff --git a/net/sctp/sm_statetable.c b/net/sctp/sm_statetable.c
index 61ed9c6e3be3..88ea87f4f0e7 100644
--- a/net/sctp/sm_statetable.c
+++ b/net/sctp/sm_statetable.c
@@ -976,26 +976,22 @@ static const struct sctp_sm_table_entry *sctp_chunk_event_lookup(
if (cid <= SCTP_CID_BASE_MAX)
return &chunk_event_table[cid][state];
- if (net->sctp.prsctp_enable) {
- if (cid == SCTP_CID_FWD_TSN || cid == SCTP_CID_I_FWD_TSN)
- return &prsctp_chunk_event_table[0][state];
- }
+ switch ((u16)cid) {
+ case SCTP_CID_FWD_TSN:
+ case SCTP_CID_I_FWD_TSN:
+ return &prsctp_chunk_event_table[0][state];
- if (net->sctp.addip_enable) {
- if (cid == SCTP_CID_ASCONF)
- return &addip_chunk_event_table[0][state];
+ case SCTP_CID_ASCONF:
+ return &addip_chunk_event_table[0][state];
- if (cid == SCTP_CID_ASCONF_ACK)
- return &addip_chunk_event_table[1][state];
- }
+ case SCTP_CID_ASCONF_ACK:
+ return &addip_chunk_event_table[1][state];
- if (net->sctp.reconf_enable)
- if (cid == SCTP_CID_RECONF)
- return &reconf_chunk_event_table[0][state];
+ case SCTP_CID_RECONF:
+ return &reconf_chunk_event_table[0][state];
- if (net->sctp.auth_enable) {
- if (cid == SCTP_CID_AUTH)
- return &auth_chunk_event_table[0][state];
+ case SCTP_CID_AUTH:
+ return &auth_chunk_event_table[0][state];
}
return &chunk_event_table_unknown[state];
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 9d1f83b10c0a..1b56fc440606 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -309,7 +309,7 @@ static int sctp_bind(struct sock *sk, struct sockaddr *addr, int addr_len)
return retval;
}
-static long sctp_get_port_local(struct sock *, union sctp_addr *);
+static int sctp_get_port_local(struct sock *, union sctp_addr *);
/* Verify this is a valid sockaddr. */
static struct sctp_af *sctp_sockaddr_af(struct sctp_sock *opt,
@@ -384,7 +384,7 @@ static int sctp_do_bind(struct sock *sk, union sctp_addr *addr, int len)
}
}
- if (snum && snum < inet_prot_sock(net) &&
+ if (snum && inet_port_requires_bind_service(net, snum) &&
!ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
return -EACCES;
@@ -399,9 +399,8 @@ static int sctp_do_bind(struct sock *sk, union sctp_addr *addr, int len)
* detection.
*/
addr->v4.sin_port = htons(snum);
- if ((ret = sctp_get_port_local(sk, addr))) {
+ if (sctp_get_port_local(sk, addr))
return -EADDRINUSE;
- }
/* Refresh ephemeral port. */
if (!bp->port)
@@ -413,11 +412,13 @@ static int sctp_do_bind(struct sock *sk, union sctp_addr *addr, int len)
ret = sctp_add_bind_addr(bp, addr, af->sockaddr_len,
SCTP_ADDR_SRC, GFP_ATOMIC);
- /* Copy back into socket for getsockname() use. */
- if (!ret) {
- inet_sk(sk)->inet_sport = htons(inet_sk(sk)->inet_num);
- sp->pf->to_sk_saddr(addr, sk);
+ if (ret) {
+ sctp_put_port(sk);
+ return ret;
}
+ /* Copy back into socket for getsockname() use. */
+ inet_sk(sk)->inet_sport = htons(inet_sk(sk)->inet_num);
+ sp->pf->to_sk_saddr(addr, sk);
return ret;
}
@@ -435,8 +436,7 @@ static int sctp_do_bind(struct sock *sk, union sctp_addr *addr, int len)
static int sctp_send_asconf(struct sctp_association *asoc,
struct sctp_chunk *chunk)
{
- struct net *net = sock_net(asoc->base.sk);
- int retval = 0;
+ int retval = 0;
/* If there is an outstanding ASCONF chunk, queue it for later
* transmission.
@@ -448,7 +448,7 @@ static int sctp_send_asconf(struct sctp_association *asoc,
/* Hold the chunk until an ASCONF_ACK is received. */
sctp_chunk_hold(chunk);
- retval = sctp_primitive_ASCONF(net, asoc, chunk);
+ retval = sctp_primitive_ASCONF(asoc->base.net, asoc, chunk);
if (retval)
sctp_chunk_free(chunk);
else
@@ -524,7 +524,6 @@ static int sctp_send_asconf_add_ip(struct sock *sk,
struct sockaddr *addrs,
int addrcnt)
{
- struct net *net = sock_net(sk);
struct sctp_sock *sp;
struct sctp_endpoint *ep;
struct sctp_association *asoc;
@@ -539,12 +538,12 @@ static int sctp_send_asconf_add_ip(struct sock *sk,
int i;
int retval = 0;
- if (!net->sctp.addip_enable)
- return retval;
-
sp = sctp_sk(sk);
ep = sp->ep;
+ if (!ep->asconf_enable)
+ return retval;
+
pr_debug("%s: sk:%p, addrs:%p, addrcnt:%d\n",
__func__, sk, addrs, addrcnt);
@@ -727,7 +726,6 @@ static int sctp_send_asconf_del_ip(struct sock *sk,
struct sockaddr *addrs,
int addrcnt)
{
- struct net *net = sock_net(sk);
struct sctp_sock *sp;
struct sctp_endpoint *ep;
struct sctp_association *asoc;
@@ -743,12 +741,12 @@ static int sctp_send_asconf_del_ip(struct sock *sk,
int stored = 0;
chunk = NULL;
- if (!net->sctp.addip_enable)
- return retval;
-
sp = sctp_sk(sk);
ep = sp->ep;
+ if (!ep->asconf_enable)
+ return retval;
+
pr_debug("%s: sk:%p, addrs:%p, addrcnt:%d\n",
__func__, sk, addrs, addrcnt);
@@ -1044,158 +1042,161 @@ out:
return err;
}
-/* __sctp_connect(struct sock* sk, struct sockaddr *kaddrs, int addrs_size)
- *
- * Common routine for handling connect() and sctp_connectx().
- * Connect will come in with just a single address.
- */
-static int __sctp_connect(struct sock *sk,
- struct sockaddr *kaddrs,
- int addrs_size, int flags,
- sctp_assoc_t *assoc_id)
+static int sctp_connect_new_asoc(struct sctp_endpoint *ep,
+ const union sctp_addr *daddr,
+ const struct sctp_initmsg *init,
+ struct sctp_transport **tp)
{
+ struct sctp_association *asoc;
+ struct sock *sk = ep->base.sk;
struct net *net = sock_net(sk);
- struct sctp_sock *sp;
- struct sctp_endpoint *ep;
- struct sctp_association *asoc = NULL;
- struct sctp_association *asoc2;
- struct sctp_transport *transport;
- union sctp_addr to;
enum sctp_scope scope;
- long timeo;
- int err = 0;
- int addrcnt = 0;
- int walk_size = 0;
- union sctp_addr *sa_addr = NULL;
- void *addr_buf;
- unsigned short port;
+ int err;
- sp = sctp_sk(sk);
- ep = sp->ep;
+ if (sctp_endpoint_is_peeled_off(ep, daddr))
+ return -EADDRNOTAVAIL;
- /* connect() cannot be done on a socket that is already in ESTABLISHED
- * state - UDP-style peeled off socket or a TCP-style socket that
- * is already connected.
- * It cannot be done even on a TCP-style listening socket.
- */
- if (sctp_sstate(sk, ESTABLISHED) || sctp_sstate(sk, CLOSING) ||
- (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING))) {
- err = -EISCONN;
- goto out_free;
+ if (!ep->base.bind_addr.port) {
+ if (sctp_autobind(sk))
+ return -EAGAIN;
+ } else {
+ if (inet_port_requires_bind_service(net, ep->base.bind_addr.port) &&
+ !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
+ return -EACCES;
}
- /* Walk through the addrs buffer and count the number of addresses. */
- addr_buf = kaddrs;
- while (walk_size < addrs_size) {
- struct sctp_af *af;
-
- if (walk_size + sizeof(sa_family_t) > addrs_size) {
- err = -EINVAL;
- goto out_free;
- }
+ scope = sctp_scope(daddr);
+ asoc = sctp_association_new(ep, sk, scope, GFP_KERNEL);
+ if (!asoc)
+ return -ENOMEM;
- sa_addr = addr_buf;
- af = sctp_get_af_specific(sa_addr->sa.sa_family);
+ err = sctp_assoc_set_bind_addr_from_ep(asoc, scope, GFP_KERNEL);
+ if (err < 0)
+ goto free;
- /* If the address family is not supported or if this address
- * causes the address buffer to overflow return EINVAL.
- */
- if (!af || (walk_size + af->sockaddr_len) > addrs_size) {
- err = -EINVAL;
- goto out_free;
- }
+ *tp = sctp_assoc_add_peer(asoc, daddr, GFP_KERNEL, SCTP_UNKNOWN);
+ if (!*tp) {
+ err = -ENOMEM;
+ goto free;
+ }
- port = ntohs(sa_addr->v4.sin_port);
+ if (!init)
+ return 0;
- /* Save current address so we can work with it */
- memcpy(&to, sa_addr, af->sockaddr_len);
+ if (init->sinit_num_ostreams) {
+ __u16 outcnt = init->sinit_num_ostreams;
- err = sctp_verify_addr(sk, &to, af->sockaddr_len);
+ asoc->c.sinit_num_ostreams = outcnt;
+ /* outcnt has been changed, need to re-init stream */
+ err = sctp_stream_init(&asoc->stream, outcnt, 0, GFP_KERNEL);
if (err)
- goto out_free;
+ goto free;
+ }
- /* Make sure the destination port is correctly set
- * in all addresses.
- */
- if (asoc && asoc->peer.port && asoc->peer.port != port) {
- err = -EINVAL;
- goto out_free;
- }
+ if (init->sinit_max_instreams)
+ asoc->c.sinit_max_instreams = init->sinit_max_instreams;
- /* Check if there already is a matching association on the
- * endpoint (other than the one created here).
- */
- asoc2 = sctp_endpoint_lookup_assoc(ep, &to, &transport);
- if (asoc2 && asoc2 != asoc) {
- if (asoc2->state >= SCTP_STATE_ESTABLISHED)
- err = -EISCONN;
- else
- err = -EALREADY;
- goto out_free;
- }
+ if (init->sinit_max_attempts)
+ asoc->max_init_attempts = init->sinit_max_attempts;
- /* If we could not find a matching association on the endpoint,
- * make sure that there is no peeled-off association matching
- * the peer address even on another socket.
- */
- if (sctp_endpoint_is_peeled_off(ep, &to)) {
- err = -EADDRNOTAVAIL;
- goto out_free;
- }
+ if (init->sinit_max_init_timeo)
+ asoc->max_init_timeo =
+ msecs_to_jiffies(init->sinit_max_init_timeo);
- if (!asoc) {
- /* If a bind() or sctp_bindx() is not called prior to
- * an sctp_connectx() call, the system picks an
- * ephemeral port and will choose an address set
- * equivalent to binding with a wildcard address.
- */
- if (!ep->base.bind_addr.port) {
- if (sctp_autobind(sk)) {
- err = -EAGAIN;
- goto out_free;
- }
- } else {
- /*
- * If an unprivileged user inherits a 1-many
- * style socket with open associations on a
- * privileged port, it MAY be permitted to
- * accept new associations, but it SHOULD NOT
- * be permitted to open new associations.
- */
- if (ep->base.bind_addr.port <
- inet_prot_sock(net) &&
- !ns_capable(net->user_ns,
- CAP_NET_BIND_SERVICE)) {
- err = -EACCES;
- goto out_free;
- }
- }
+ return 0;
+free:
+ sctp_association_free(asoc);
+ return err;
+}
- scope = sctp_scope(&to);
- asoc = sctp_association_new(ep, sk, scope, GFP_KERNEL);
- if (!asoc) {
- err = -ENOMEM;
- goto out_free;
- }
+static int sctp_connect_add_peer(struct sctp_association *asoc,
+ union sctp_addr *daddr, int addr_len)
+{
+ struct sctp_endpoint *ep = asoc->ep;
+ struct sctp_association *old;
+ struct sctp_transport *t;
+ int err;
- err = sctp_assoc_set_bind_addr_from_ep(asoc, scope,
- GFP_KERNEL);
- if (err < 0) {
- goto out_free;
- }
+ err = sctp_verify_addr(ep->base.sk, daddr, addr_len);
+ if (err)
+ return err;
- }
+ old = sctp_endpoint_lookup_assoc(ep, daddr, &t);
+ if (old && old != asoc)
+ return old->state >= SCTP_STATE_ESTABLISHED ? -EISCONN
+ : -EALREADY;
+
+ if (sctp_endpoint_is_peeled_off(ep, daddr))
+ return -EADDRNOTAVAIL;
+
+ t = sctp_assoc_add_peer(asoc, daddr, GFP_KERNEL, SCTP_UNKNOWN);
+ if (!t)
+ return -ENOMEM;
+
+ return 0;
+}
+
+/* __sctp_connect(struct sock* sk, struct sockaddr *kaddrs, int addrs_size)
+ *
+ * Common routine for handling connect() and sctp_connectx().
+ * Connect will come in with just a single address.
+ */
+static int __sctp_connect(struct sock *sk, struct sockaddr *kaddrs,
+ int addrs_size, int flags, sctp_assoc_t *assoc_id)
+{
+ struct sctp_sock *sp = sctp_sk(sk);
+ struct sctp_endpoint *ep = sp->ep;
+ struct sctp_transport *transport;
+ struct sctp_association *asoc;
+ void *addr_buf = kaddrs;
+ union sctp_addr *daddr;
+ struct sctp_af *af;
+ int walk_size, err;
+ long timeo;
+
+ if (sctp_sstate(sk, ESTABLISHED) || sctp_sstate(sk, CLOSING) ||
+ (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING)))
+ return -EISCONN;
- /* Prime the peer's transport structures. */
- transport = sctp_assoc_add_peer(asoc, &to, GFP_KERNEL,
- SCTP_UNKNOWN);
- if (!transport) {
- err = -ENOMEM;
+ daddr = addr_buf;
+ af = sctp_get_af_specific(daddr->sa.sa_family);
+ if (!af || af->sockaddr_len > addrs_size)
+ return -EINVAL;
+
+ err = sctp_verify_addr(sk, daddr, af->sockaddr_len);
+ if (err)
+ return err;
+
+ asoc = sctp_endpoint_lookup_assoc(ep, daddr, &transport);
+ if (asoc)
+ return asoc->state >= SCTP_STATE_ESTABLISHED ? -EISCONN
+ : -EALREADY;
+
+ err = sctp_connect_new_asoc(ep, daddr, NULL, &transport);
+ if (err)
+ return err;
+ asoc = transport->asoc;
+
+ addr_buf += af->sockaddr_len;
+ walk_size = af->sockaddr_len;
+ while (walk_size < addrs_size) {
+ err = -EINVAL;
+ if (walk_size + sizeof(sa_family_t) > addrs_size)
goto out_free;
- }
- addrcnt++;
- addr_buf += af->sockaddr_len;
+ daddr = addr_buf;
+ af = sctp_get_af_specific(daddr->sa.sa_family);
+ if (!af || af->sockaddr_len + walk_size > addrs_size)
+ goto out_free;
+
+ if (asoc->peer.port != ntohs(daddr->v4.sin_port))
+ goto out_free;
+
+ err = sctp_connect_add_peer(asoc, daddr, af->sockaddr_len);
+ if (err)
+ goto out_free;
+
+ addr_buf += af->sockaddr_len;
walk_size += af->sockaddr_len;
}
@@ -1208,40 +1209,25 @@ static int __sctp_connect(struct sock *sk,
goto out_free;
}
- err = sctp_primitive_ASSOCIATE(net, asoc, NULL);
- if (err < 0) {
+ err = sctp_primitive_ASSOCIATE(sock_net(sk), asoc, NULL);
+ if (err < 0)
goto out_free;
- }
/* Initialize sk's dport and daddr for getpeername() */
inet_sk(sk)->inet_dport = htons(asoc->peer.port);
- sp->pf->to_sk_daddr(sa_addr, sk);
+ sp->pf->to_sk_daddr(daddr, sk);
sk->sk_err = 0;
- timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
-
if (assoc_id)
*assoc_id = asoc->assoc_id;
- err = sctp_wait_for_connect(asoc, &timeo);
- /* Note: the asoc may be freed after the return of
- * sctp_wait_for_connect.
- */
-
- /* Don't free association on exit. */
- asoc = NULL;
+ timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
+ return sctp_wait_for_connect(asoc, &timeo);
out_free:
pr_debug("%s: took out_free path with asoc:%p kaddrs:%p err:%d\n",
__func__, asoc, kaddrs, err);
-
- if (asoc) {
- /* sctp_primitive_ASSOCIATE may have added this association
- * To the hash table, try to unhash it, just in case, its a noop
- * if it wasn't hashed so we're safe
- */
- sctp_association_free(asoc);
- }
+ sctp_association_free(asoc);
return err;
}
@@ -1311,7 +1297,8 @@ static int __sctp_setsockopt_connectx(struct sock *sk,
pr_debug("%s: sk:%p addrs:%p addrs_size:%d\n",
__func__, sk, addrs, addrs_size);
- if (unlikely(addrs_size <= 0))
+ /* make sure the 1st addr's sa_family is accessible later */
+ if (unlikely(addrs_size < sizeof(sa_family_t)))
return -EINVAL;
kaddrs = memdup_user(addrs, addrs_size);
@@ -1659,9 +1646,7 @@ static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 sflags,
struct sctp_transport **tp)
{
struct sctp_endpoint *ep = sctp_sk(sk)->ep;
- struct net *net = sock_net(sk);
struct sctp_association *asoc;
- enum sctp_scope scope;
struct cmsghdr *cmsg;
__be32 flowinfo = 0;
struct sctp_af *af;
@@ -1676,20 +1661,6 @@ static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 sflags,
sctp_sstate(sk, CLOSING)))
return -EADDRNOTAVAIL;
- if (sctp_endpoint_is_peeled_off(ep, daddr))
- return -EADDRNOTAVAIL;
-
- if (!ep->base.bind_addr.port) {
- if (sctp_autobind(sk))
- return -EAGAIN;
- } else {
- if (ep->base.bind_addr.port < inet_prot_sock(net) &&
- !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
- return -EACCES;
- }
-
- scope = sctp_scope(daddr);
-
/* Label connection socket for first association 1-to-many
* style for client sequence socket()->sendmsg(). This
* needs to be done before sctp_assoc_add_peer() as that will
@@ -1705,45 +1676,10 @@ static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 sflags,
if (err < 0)
return err;
- asoc = sctp_association_new(ep, sk, scope, GFP_KERNEL);
- if (!asoc)
- return -ENOMEM;
-
- if (sctp_assoc_set_bind_addr_from_ep(asoc, scope, GFP_KERNEL) < 0) {
- err = -ENOMEM;
- goto free;
- }
-
- if (cmsgs->init) {
- struct sctp_initmsg *init = cmsgs->init;
-
- if (init->sinit_num_ostreams) {
- __u16 outcnt = init->sinit_num_ostreams;
-
- asoc->c.sinit_num_ostreams = outcnt;
- /* outcnt has been changed, need to re-init stream */
- err = sctp_stream_init(&asoc->stream, outcnt, 0,
- GFP_KERNEL);
- if (err)
- goto free;
- }
-
- if (init->sinit_max_instreams)
- asoc->c.sinit_max_instreams = init->sinit_max_instreams;
-
- if (init->sinit_max_attempts)
- asoc->max_init_attempts = init->sinit_max_attempts;
-
- if (init->sinit_max_init_timeo)
- asoc->max_init_timeo =
- msecs_to_jiffies(init->sinit_max_init_timeo);
- }
-
- *tp = sctp_assoc_add_peer(asoc, daddr, GFP_KERNEL, SCTP_UNKNOWN);
- if (!*tp) {
- err = -ENOMEM;
- goto free;
- }
+ err = sctp_connect_new_asoc(ep, daddr, cmsgs->init, tp);
+ if (err)
+ return err;
+ asoc = (*tp)->asoc;
if (!cmsgs->addrs_msg)
return 0;
@@ -1753,8 +1689,6 @@ static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 sflags,
/* sendv addr list parse */
for_each_cmsghdr(cmsg, cmsgs->addrs_msg) {
- struct sctp_transport *transport;
- struct sctp_association *old;
union sctp_addr _daddr;
int dlen;
@@ -1788,30 +1722,10 @@ static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 sflags,
daddr->v6.sin6_port = htons(asoc->peer.port);
memcpy(&daddr->v6.sin6_addr, CMSG_DATA(cmsg), dlen);
}
- err = sctp_verify_addr(sk, daddr, sizeof(*daddr));
- if (err)
- goto free;
-
- old = sctp_endpoint_lookup_assoc(ep, daddr, &transport);
- if (old && old != asoc) {
- if (old->state >= SCTP_STATE_ESTABLISHED)
- err = -EISCONN;
- else
- err = -EALREADY;
- goto free;
- }
-
- if (sctp_endpoint_is_peeled_off(ep, daddr)) {
- err = -EADDRNOTAVAIL;
- goto free;
- }
- transport = sctp_assoc_add_peer(asoc, daddr, GFP_KERNEL,
- SCTP_UNKNOWN);
- if (!transport) {
- err = -ENOMEM;
+ err = sctp_connect_add_peer(asoc, daddr, sizeof(*daddr));
+ if (err)
goto free;
- }
}
return 0;
@@ -2513,9 +2427,8 @@ static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params,
int error;
if (params->spp_flags & SPP_HB_DEMAND && trans) {
- struct net *net = sock_net(trans->asoc->base.sk);
-
- error = sctp_primitive_REQUESTHEARTBEAT(net, trans->asoc, trans);
+ error = sctp_primitive_REQUESTHEARTBEAT(trans->asoc->base.net,
+ trans->asoc, trans);
if (error)
return error;
}
@@ -3414,7 +3327,6 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned
static int sctp_setsockopt_peer_primary_addr(struct sock *sk, char __user *optval,
unsigned int optlen)
{
- struct net *net = sock_net(sk);
struct sctp_sock *sp;
struct sctp_association *asoc = NULL;
struct sctp_setpeerprim prim;
@@ -3424,7 +3336,7 @@ static int sctp_setsockopt_peer_primary_addr(struct sock *sk, char __user *optva
sp = sctp_sk(sk);
- if (!net->sctp.addip_enable)
+ if (!sp->ep->asconf_enable)
return -EPERM;
if (optlen != sizeof(struct sctp_setpeerprim))
@@ -3774,9 +3686,6 @@ static int sctp_setsockopt_auth_key(struct sock *sk,
struct sctp_association *asoc;
int ret = -EINVAL;
- if (!ep->auth_enable)
- return -EACCES;
-
if (optlen <= sizeof(struct sctp_authkey))
return -EINVAL;
/* authkey->sca_keylength is u16, so optlen can't be bigger than
@@ -3843,9 +3752,6 @@ static int sctp_setsockopt_active_key(struct sock *sk,
struct sctp_authkeyid val;
int ret = 0;
- if (!ep->auth_enable)
- return -EACCES;
-
if (optlen != sizeof(struct sctp_authkeyid))
return -EINVAL;
if (copy_from_user(&val, optval, optlen))
@@ -3897,9 +3803,6 @@ static int sctp_setsockopt_del_key(struct sock *sk,
struct sctp_authkeyid val;
int ret = 0;
- if (!ep->auth_enable)
- return -EACCES;
-
if (optlen != sizeof(struct sctp_authkeyid))
return -EINVAL;
if (copy_from_user(&val, optval, optlen))
@@ -3950,9 +3853,6 @@ static int sctp_setsockopt_deactivate_key(struct sock *sk, char __user *optval,
struct sctp_authkeyid val;
int ret = 0;
- if (!ep->auth_enable)
- return -EACCES;
-
if (optlen != sizeof(struct sctp_authkeyid))
return -EINVAL;
if (copy_from_user(&val, optval, optlen))
@@ -4041,18 +3941,22 @@ static int sctp_setsockopt_auto_asconf(struct sock *sk, char __user *optval,
*/
static int sctp_setsockopt_paddr_thresholds(struct sock *sk,
char __user *optval,
- unsigned int optlen)
+ unsigned int optlen, bool v2)
{
- struct sctp_paddrthlds val;
+ struct sctp_paddrthlds_v2 val;
struct sctp_transport *trans;
struct sctp_association *asoc;
+ int len;
- if (optlen < sizeof(struct sctp_paddrthlds))
+ len = v2 ? sizeof(val) : sizeof(struct sctp_paddrthlds);
+ if (optlen < len)
return -EINVAL;
- if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval,
- sizeof(struct sctp_paddrthlds)))
+ if (copy_from_user(&val, optval, len))
return -EFAULT;
+ if (v2 && val.spt_pathpfthld > val.spt_pathcpthld)
+ return -EINVAL;
+
if (!sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) {
trans = sctp_addr_id2transport(sk, &val.spt_address,
val.spt_assoc_id);
@@ -4061,6 +3965,8 @@ static int sctp_setsockopt_paddr_thresholds(struct sock *sk,
if (val.spt_pathmaxrxt)
trans->pathmaxrxt = val.spt_pathmaxrxt;
+ if (v2)
+ trans->ps_retrans = val.spt_pathcpthld;
trans->pf_retrans = val.spt_pathpfthld;
return 0;
@@ -4076,17 +3982,23 @@ static int sctp_setsockopt_paddr_thresholds(struct sock *sk,
transports) {
if (val.spt_pathmaxrxt)
trans->pathmaxrxt = val.spt_pathmaxrxt;
+ if (v2)
+ trans->ps_retrans = val.spt_pathcpthld;
trans->pf_retrans = val.spt_pathpfthld;
}
if (val.spt_pathmaxrxt)
asoc->pathmaxrxt = val.spt_pathmaxrxt;
+ if (v2)
+ asoc->ps_retrans = val.spt_pathcpthld;
asoc->pf_retrans = val.spt_pathpfthld;
} else {
struct sctp_sock *sp = sctp_sk(sk);
if (val.spt_pathmaxrxt)
sp->pathmaxrxt = val.spt_pathmaxrxt;
+ if (v2)
+ sp->ps_retrans = val.spt_pathcpthld;
sp->pf_retrans = val.spt_pathpfthld;
}
@@ -4583,6 +4495,144 @@ static int sctp_setsockopt_event(struct sock *sk, char __user *optval,
return retval;
}
+static int sctp_setsockopt_asconf_supported(struct sock *sk,
+ char __user *optval,
+ unsigned int optlen)
+{
+ struct sctp_assoc_value params;
+ struct sctp_association *asoc;
+ struct sctp_endpoint *ep;
+ int retval = -EINVAL;
+
+ if (optlen != sizeof(params))
+ goto out;
+
+ if (copy_from_user(&params, optval, optlen)) {
+ retval = -EFAULT;
+ goto out;
+ }
+
+ asoc = sctp_id2assoc(sk, params.assoc_id);
+ if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC &&
+ sctp_style(sk, UDP))
+ goto out;
+
+ ep = sctp_sk(sk)->ep;
+ ep->asconf_enable = !!params.assoc_value;
+
+ if (ep->asconf_enable && ep->auth_enable) {
+ sctp_auth_ep_add_chunkid(ep, SCTP_CID_ASCONF);
+ sctp_auth_ep_add_chunkid(ep, SCTP_CID_ASCONF_ACK);
+ }
+
+ retval = 0;
+
+out:
+ return retval;
+}
+
+static int sctp_setsockopt_auth_supported(struct sock *sk,
+ char __user *optval,
+ unsigned int optlen)
+{
+ struct sctp_assoc_value params;
+ struct sctp_association *asoc;
+ struct sctp_endpoint *ep;
+ int retval = -EINVAL;
+
+ if (optlen != sizeof(params))
+ goto out;
+
+ if (copy_from_user(&params, optval, optlen)) {
+ retval = -EFAULT;
+ goto out;
+ }
+
+ asoc = sctp_id2assoc(sk, params.assoc_id);
+ if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC &&
+ sctp_style(sk, UDP))
+ goto out;
+
+ ep = sctp_sk(sk)->ep;
+ if (params.assoc_value) {
+ retval = sctp_auth_init(ep, GFP_KERNEL);
+ if (retval)
+ goto out;
+ if (ep->asconf_enable) {
+ sctp_auth_ep_add_chunkid(ep, SCTP_CID_ASCONF);
+ sctp_auth_ep_add_chunkid(ep, SCTP_CID_ASCONF_ACK);
+ }
+ }
+
+ ep->auth_enable = !!params.assoc_value;
+ retval = 0;
+
+out:
+ return retval;
+}
+
+static int sctp_setsockopt_ecn_supported(struct sock *sk,
+ char __user *optval,
+ unsigned int optlen)
+{
+ struct sctp_assoc_value params;
+ struct sctp_association *asoc;
+ int retval = -EINVAL;
+
+ if (optlen != sizeof(params))
+ goto out;
+
+ if (copy_from_user(&params, optval, optlen)) {
+ retval = -EFAULT;
+ goto out;
+ }
+
+ asoc = sctp_id2assoc(sk, params.assoc_id);
+ if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC &&
+ sctp_style(sk, UDP))
+ goto out;
+
+ sctp_sk(sk)->ep->ecn_enable = !!params.assoc_value;
+ retval = 0;
+
+out:
+ return retval;
+}
+
+static int sctp_setsockopt_pf_expose(struct sock *sk,
+ char __user *optval,
+ unsigned int optlen)
+{
+ struct sctp_assoc_value params;
+ struct sctp_association *asoc;
+ int retval = -EINVAL;
+
+ if (optlen != sizeof(params))
+ goto out;
+
+ if (copy_from_user(&params, optval, optlen)) {
+ retval = -EFAULT;
+ goto out;
+ }
+
+ if (params.assoc_value > SCTP_PF_EXPOSE_MAX)
+ goto out;
+
+ asoc = sctp_id2assoc(sk, params.assoc_id);
+ if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC &&
+ sctp_style(sk, UDP))
+ goto out;
+
+ if (asoc)
+ asoc->pf_expose = params.assoc_value;
+ else
+ sctp_sk(sk)->pf_expose = params.assoc_value;
+ retval = 0;
+
+out:
+ return retval;
+}
+
/* API 6.2 setsockopt(), getsockopt()
*
* Applications use setsockopt() and getsockopt() to set or retrieve
@@ -4738,7 +4788,12 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname,
retval = sctp_setsockopt_auto_asconf(sk, optval, optlen);
break;
case SCTP_PEER_ADDR_THLDS:
- retval = sctp_setsockopt_paddr_thresholds(sk, optval, optlen);
+ retval = sctp_setsockopt_paddr_thresholds(sk, optval, optlen,
+ false);
+ break;
+ case SCTP_PEER_ADDR_THLDS_V2:
+ retval = sctp_setsockopt_paddr_thresholds(sk, optval, optlen,
+ true);
break;
case SCTP_RECVRCVINFO:
retval = sctp_setsockopt_recvrcvinfo(sk, optval, optlen);
@@ -4783,6 +4838,18 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname,
case SCTP_EVENT:
retval = sctp_setsockopt_event(sk, optval, optlen);
break;
+ case SCTP_ASCONF_SUPPORTED:
+ retval = sctp_setsockopt_asconf_supported(sk, optval, optlen);
+ break;
+ case SCTP_AUTH_SUPPORTED:
+ retval = sctp_setsockopt_auth_supported(sk, optval, optlen);
+ break;
+ case SCTP_ECN_SUPPORTED:
+ retval = sctp_setsockopt_ecn_supported(sk, optval, optlen);
+ break;
+ case SCTP_EXPOSE_POTENTIALLY_FAILED_STATE:
+ retval = sctp_setsockopt_pf_expose(sk, optval, optlen);
+ break;
default:
retval = -ENOPROTOOPT;
break;
@@ -5026,6 +5093,8 @@ static int sctp_init_sock(struct sock *sk)
sp->hbinterval = net->sctp.hb_interval;
sp->pathmaxrxt = net->sctp.max_retrans_path;
sp->pf_retrans = net->sctp.pf_retrans;
+ sp->ps_retrans = net->sctp.ps_retrans;
+ sp->pf_expose = net->sctp.pf_expose;
sp->pathmtu = 0; /* allow default discovery */
sp->sackdelay = net->sctp.sack_timeout;
sp->sackfreq = 2;
@@ -5293,7 +5362,7 @@ struct sctp_transport *sctp_transport_get_next(struct net *net,
if (!sctp_transport_hold(t))
continue;
- if (net_eq(sock_net(t->asoc->base.sk), net) &&
+ if (net_eq(t->asoc->base.net, net) &&
t->asoc->peer.primary_path == t)
break;
@@ -5506,8 +5575,16 @@ static int sctp_getsockopt_peer_addr_info(struct sock *sk, int len,
transport = sctp_addr_id2transport(sk, &pinfo.spinfo_address,
pinfo.spinfo_assoc_id);
- if (!transport)
- return -EINVAL;
+ if (!transport) {
+ retval = -EINVAL;
+ goto out;
+ }
+
+ if (transport->state == SCTP_PF &&
+ transport->asoc->pf_expose == SCTP_PF_EXPOSE_DISABLE) {
+ retval = -EACCES;
+ goto out;
+ }
pinfo.spinfo_assoc_id = sctp_assoc2id(transport->asoc);
pinfo.spinfo_state = transport->state;
@@ -6920,9 +6997,6 @@ static int sctp_getsockopt_active_key(struct sock *sk, int len,
struct sctp_authkeyid val;
struct sctp_association *asoc;
- if (!ep->auth_enable)
- return -EACCES;
-
if (len < sizeof(struct sctp_authkeyid))
return -EINVAL;
@@ -6934,10 +7008,15 @@ static int sctp_getsockopt_active_key(struct sock *sk, int len,
if (!asoc && val.scact_assoc_id && sctp_style(sk, UDP))
return -EINVAL;
- if (asoc)
+ if (asoc) {
+ if (!asoc->peer.auth_capable)
+ return -EACCES;
val.scact_keynumber = asoc->active_key_id;
- else
+ } else {
+ if (!ep->auth_enable)
+ return -EACCES;
val.scact_keynumber = ep->active_key_id;
+ }
if (put_user(len, optlen))
return -EFAULT;
@@ -6950,7 +7029,6 @@ static int sctp_getsockopt_active_key(struct sock *sk, int len,
static int sctp_getsockopt_peer_auth_chunks(struct sock *sk, int len,
char __user *optval, int __user *optlen)
{
- struct sctp_endpoint *ep = sctp_sk(sk)->ep;
struct sctp_authchunks __user *p = (void __user *)optval;
struct sctp_authchunks val;
struct sctp_association *asoc;
@@ -6958,9 +7036,6 @@ static int sctp_getsockopt_peer_auth_chunks(struct sock *sk, int len,
u32 num_chunks = 0;
char __user *to;
- if (!ep->auth_enable)
- return -EACCES;
-
if (len < sizeof(struct sctp_authchunks))
return -EINVAL;
@@ -6972,6 +7047,9 @@ static int sctp_getsockopt_peer_auth_chunks(struct sock *sk, int len,
if (!asoc)
return -EINVAL;
+ if (!asoc->peer.auth_capable)
+ return -EACCES;
+
ch = asoc->peer.peer_chunks;
if (!ch)
goto num;
@@ -7003,9 +7081,6 @@ static int sctp_getsockopt_local_auth_chunks(struct sock *sk, int len,
u32 num_chunks = 0;
char __user *to;
- if (!ep->auth_enable)
- return -EACCES;
-
if (len < sizeof(struct sctp_authchunks))
return -EINVAL;
@@ -7018,8 +7093,15 @@ static int sctp_getsockopt_local_auth_chunks(struct sock *sk, int len,
sctp_style(sk, UDP))
return -EINVAL;
- ch = asoc ? (struct sctp_chunks_param *)asoc->c.auth_chunks
- : ep->auth_chunk_list;
+ if (asoc) {
+ if (!asoc->peer.auth_capable)
+ return -EACCES;
+ ch = (struct sctp_chunks_param *)asoc->c.auth_chunks;
+ } else {
+ if (!ep->auth_enable)
+ return -EACCES;
+ ch = ep->auth_chunk_list;
+ }
if (!ch)
goto num;
@@ -7150,18 +7232,19 @@ static int sctp_getsockopt_assoc_ids(struct sock *sk, int len,
* http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
*/
static int sctp_getsockopt_paddr_thresholds(struct sock *sk,
- char __user *optval,
- int len,
- int __user *optlen)
+ char __user *optval, int len,
+ int __user *optlen, bool v2)
{
- struct sctp_paddrthlds val;
+ struct sctp_paddrthlds_v2 val;
struct sctp_transport *trans;
struct sctp_association *asoc;
+ int min;
- if (len < sizeof(struct sctp_paddrthlds))
+ min = v2 ? sizeof(val) : sizeof(struct sctp_paddrthlds);
+ if (len < min)
return -EINVAL;
- len = sizeof(struct sctp_paddrthlds);
- if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval, len))
+ len = min;
+ if (copy_from_user(&val, optval, len))
return -EFAULT;
if (!sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) {
@@ -7172,8 +7255,9 @@ static int sctp_getsockopt_paddr_thresholds(struct sock *sk,
val.spt_pathmaxrxt = trans->pathmaxrxt;
val.spt_pathpfthld = trans->pf_retrans;
+ val.spt_pathcpthld = trans->ps_retrans;
- return 0;
+ goto out;
}
asoc = sctp_id2assoc(sk, val.spt_assoc_id);
@@ -7184,13 +7268,16 @@ static int sctp_getsockopt_paddr_thresholds(struct sock *sk,
if (asoc) {
val.spt_pathpfthld = asoc->pf_retrans;
val.spt_pathmaxrxt = asoc->pathmaxrxt;
+ val.spt_pathcpthld = asoc->ps_retrans;
} else {
struct sctp_sock *sp = sctp_sk(sk);
val.spt_pathpfthld = sp->pf_retrans;
val.spt_pathmaxrxt = sp->pathmaxrxt;
+ val.spt_pathcpthld = sp->ps_retrans;
}
+out:
if (put_user(len, optlen) || copy_to_user(optval, &val, len))
return -EFAULT;
@@ -7762,6 +7849,162 @@ static int sctp_getsockopt_event(struct sock *sk, int len, char __user *optval,
return 0;
}
+static int sctp_getsockopt_asconf_supported(struct sock *sk, int len,
+ char __user *optval,
+ int __user *optlen)
+{
+ struct sctp_assoc_value params;
+ struct sctp_association *asoc;
+ int retval = -EFAULT;
+
+ if (len < sizeof(params)) {
+ retval = -EINVAL;
+ goto out;
+ }
+
+ len = sizeof(params);
+ if (copy_from_user(&params, optval, len))
+ goto out;
+
+ asoc = sctp_id2assoc(sk, params.assoc_id);
+ if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC &&
+ sctp_style(sk, UDP)) {
+ retval = -EINVAL;
+ goto out;
+ }
+
+ params.assoc_value = asoc ? asoc->peer.asconf_capable
+ : sctp_sk(sk)->ep->asconf_enable;
+
+ if (put_user(len, optlen))
+ goto out;
+
+ if (copy_to_user(optval, &params, len))
+ goto out;
+
+ retval = 0;
+
+out:
+ return retval;
+}
+
+static int sctp_getsockopt_auth_supported(struct sock *sk, int len,
+ char __user *optval,
+ int __user *optlen)
+{
+ struct sctp_assoc_value params;
+ struct sctp_association *asoc;
+ int retval = -EFAULT;
+
+ if (len < sizeof(params)) {
+ retval = -EINVAL;
+ goto out;
+ }
+
+ len = sizeof(params);
+ if (copy_from_user(&params, optval, len))
+ goto out;
+
+ asoc = sctp_id2assoc(sk, params.assoc_id);
+ if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC &&
+ sctp_style(sk, UDP)) {
+ retval = -EINVAL;
+ goto out;
+ }
+
+ params.assoc_value = asoc ? asoc->peer.auth_capable
+ : sctp_sk(sk)->ep->auth_enable;
+
+ if (put_user(len, optlen))
+ goto out;
+
+ if (copy_to_user(optval, &params, len))
+ goto out;
+
+ retval = 0;
+
+out:
+ return retval;
+}
+
+static int sctp_getsockopt_ecn_supported(struct sock *sk, int len,
+ char __user *optval,
+ int __user *optlen)
+{
+ struct sctp_assoc_value params;
+ struct sctp_association *asoc;
+ int retval = -EFAULT;
+
+ if (len < sizeof(params)) {
+ retval = -EINVAL;
+ goto out;
+ }
+
+ len = sizeof(params);
+ if (copy_from_user(&params, optval, len))
+ goto out;
+
+ asoc = sctp_id2assoc(sk, params.assoc_id);
+ if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC &&
+ sctp_style(sk, UDP)) {
+ retval = -EINVAL;
+ goto out;
+ }
+
+ params.assoc_value = asoc ? asoc->peer.ecn_capable
+ : sctp_sk(sk)->ep->ecn_enable;
+
+ if (put_user(len, optlen))
+ goto out;
+
+ if (copy_to_user(optval, &params, len))
+ goto out;
+
+ retval = 0;
+
+out:
+ return retval;
+}
+
+static int sctp_getsockopt_pf_expose(struct sock *sk, int len,
+ char __user *optval,
+ int __user *optlen)
+{
+ struct sctp_assoc_value params;
+ struct sctp_association *asoc;
+ int retval = -EFAULT;
+
+ if (len < sizeof(params)) {
+ retval = -EINVAL;
+ goto out;
+ }
+
+ len = sizeof(params);
+ if (copy_from_user(&params, optval, len))
+ goto out;
+
+ asoc = sctp_id2assoc(sk, params.assoc_id);
+ if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC &&
+ sctp_style(sk, UDP)) {
+ retval = -EINVAL;
+ goto out;
+ }
+
+ params.assoc_value = asoc ? asoc->pf_expose
+ : sctp_sk(sk)->pf_expose;
+
+ if (put_user(len, optlen))
+ goto out;
+
+ if (copy_to_user(optval, &params, len))
+ goto out;
+
+ retval = 0;
+
+out:
+ return retval;
+}
+
static int sctp_getsockopt(struct sock *sk, int level, int optname,
char __user *optval, int __user *optlen)
{
@@ -7911,7 +8154,12 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname,
retval = sctp_getsockopt_auto_asconf(sk, len, optval, optlen);
break;
case SCTP_PEER_ADDR_THLDS:
- retval = sctp_getsockopt_paddr_thresholds(sk, optval, len, optlen);
+ retval = sctp_getsockopt_paddr_thresholds(sk, optval, len,
+ optlen, false);
+ break;
+ case SCTP_PEER_ADDR_THLDS_V2:
+ retval = sctp_getsockopt_paddr_thresholds(sk, optval, len,
+ optlen, true);
break;
case SCTP_GET_ASSOC_STATS:
retval = sctp_getsockopt_assoc_stats(sk, len, optval, optlen);
@@ -7963,6 +8211,20 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname,
case SCTP_EVENT:
retval = sctp_getsockopt_event(sk, len, optval, optlen);
break;
+ case SCTP_ASCONF_SUPPORTED:
+ retval = sctp_getsockopt_asconf_supported(sk, len, optval,
+ optlen);
+ break;
+ case SCTP_AUTH_SUPPORTED:
+ retval = sctp_getsockopt_auth_supported(sk, len, optval,
+ optlen);
+ break;
+ case SCTP_ECN_SUPPORTED:
+ retval = sctp_getsockopt_ecn_supported(sk, len, optval, optlen);
+ break;
+ case SCTP_EXPOSE_POTENTIALLY_FAILED_STATE:
+ retval = sctp_getsockopt_pf_expose(sk, len, optval, optlen);
+ break;
default:
retval = -ENOPROTOOPT;
break;
@@ -7998,11 +8260,12 @@ static void sctp_unhash(struct sock *sk)
static struct sctp_bind_bucket *sctp_bucket_create(
struct sctp_bind_hashbucket *head, struct net *, unsigned short snum);
-static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr)
+static int sctp_get_port_local(struct sock *sk, union sctp_addr *addr)
{
struct sctp_sock *sp = sctp_sk(sk);
bool reuse = (sk->sk_reuse || sp->reuse);
struct sctp_bind_hashbucket *head; /* hash list */
+ struct net *net = sock_net(sk);
kuid_t uid = sock_i_uid(sk);
struct sctp_bind_bucket *pp;
unsigned short snum;
@@ -8018,7 +8281,6 @@ static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr)
/* Search for an available port. */
int low, high, remaining, index;
unsigned int rover;
- struct net *net = sock_net(sk);
inet_get_local_port_range(net, &low, &high);
remaining = (high - low) + 1;
@@ -8030,12 +8292,12 @@ static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr)
rover = low;
if (inet_is_local_reserved_port(net, rover))
continue;
- index = sctp_phashfn(sock_net(sk), rover);
+ index = sctp_phashfn(net, rover);
head = &sctp_port_hashtable[index];
spin_lock(&head->lock);
sctp_for_each_hentry(pp, &head->chain)
if ((pp->port == rover) &&
- net_eq(sock_net(sk), pp->net))
+ net_eq(net, pp->net))
goto next;
break;
next:
@@ -8059,10 +8321,10 @@ static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr)
* to the port number (snum) - we detect that with the
* port iterator, pp being NULL.
*/
- head = &sctp_port_hashtable[sctp_phashfn(sock_net(sk), snum)];
+ head = &sctp_port_hashtable[sctp_phashfn(net, snum)];
spin_lock(&head->lock);
sctp_for_each_hentry(pp, &head->chain) {
- if ((pp->port == snum) && net_eq(pp->net, sock_net(sk)))
+ if ((pp->port == snum) && net_eq(pp->net, net))
goto pp_found;
}
}
@@ -8108,7 +8370,7 @@ pp_found:
if (sctp_bind_addr_conflict(&ep2->base.bind_addr,
addr, sp2, sp)) {
- ret = (long)sk2;
+ ret = 1;
goto fail_unlock;
}
}
@@ -8118,7 +8380,7 @@ pp_found:
pp_not_found:
/* If there was a hash table miss, create a new port. */
ret = 1;
- if (!pp && !(pp = sctp_bucket_create(head, sock_net(sk), snum)))
+ if (!pp && !(pp = sctp_bucket_create(head, net, snum)))
goto fail_unlock;
/* In either case (hit or miss), make sure fastreuse is 1 only
@@ -8180,7 +8442,7 @@ static int sctp_get_port(struct sock *sk, unsigned short snum)
addr.v4.sin_port = htons(snum);
/* Note: sk->sk_num gets filled in if ephemeral port request. */
- return !!sctp_get_port_local(sk, &addr);
+ return sctp_get_port_local(sk, &addr);
}
/*
@@ -8227,7 +8489,7 @@ static int sctp_listen_start(struct sock *sk, int backlog)
}
}
- sk->sk_max_ack_backlog = backlog;
+ WRITE_ONCE(sk->sk_max_ack_backlog, backlog);
return sctp_hash_endpoint(ep);
}
@@ -8281,7 +8543,7 @@ int sctp_inet_listen(struct socket *sock, int backlog)
/* If we are already listening, just update the backlog */
if (sctp_sstate(sk, LISTENING))
- sk->sk_max_ack_backlog = backlog;
+ WRITE_ONCE(sk->sk_max_ack_backlog, backlog);
else {
err = sctp_listen_start(sk, backlog);
if (err)
@@ -8327,7 +8589,7 @@ __poll_t sctp_poll(struct file *file, struct socket *sock, poll_table *wait)
mask = 0;
/* Is there any exceptional events? */
- if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
+ if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
mask |= EPOLLERR |
(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
if (sk->sk_shutdown & RCV_SHUTDOWN)
@@ -8336,7 +8598,7 @@ __poll_t sctp_poll(struct file *file, struct socket *sock, poll_table *wait)
mask |= EPOLLHUP;
/* Is it readable? Reconsider this code with TCP-style support. */
- if (!skb_queue_empty(&sk->sk_receive_queue))
+ if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
mask |= EPOLLIN | EPOLLRDNORM;
/* The association is either gone or not ready. */
@@ -8722,7 +8984,7 @@ struct sk_buff *sctp_skb_recv_datagram(struct sock *sk, int flags,
if (sk_can_busy_loop(sk)) {
sk_busy_loop(sk, noblock);
- if (!skb_queue_empty(&sk->sk_receive_queue))
+ if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
continue;
}
@@ -9157,7 +9419,7 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk,
newinet->inet_rcv_saddr = inet->inet_rcv_saddr;
newinet->inet_dport = htons(asoc->peer.port);
newinet->pmtudisc = inet->pmtudisc;
- newinet->inet_id = asoc->next_tsn ^ jiffies;
+ newinet->inet_id = prandom_u32();
newinet->uc_ttl = inet->uc_ttl;
newinet->mc_loop = 1;
@@ -9351,7 +9613,7 @@ struct proto sctp_prot = {
.backlog_rcv = sctp_backlog_rcv,
.hash = sctp_hash,
.unhash = sctp_unhash,
- .get_port = sctp_get_port,
+ .no_autobind = true,
.obj_size = sizeof(struct sctp_sock),
.useroffset = offsetof(struct sctp_sock, subscribe),
.usersize = offsetof(struct sctp_sock, initmsg) -
@@ -9393,7 +9655,7 @@ struct proto sctpv6_prot = {
.backlog_rcv = sctp_backlog_rcv,
.hash = sctp_hash,
.unhash = sctp_unhash,
- .get_port = sctp_get_port,
+ .no_autobind = true,
.obj_size = sizeof(struct sctp6_sock),
.useroffset = offsetof(struct sctp6_sock, sctp.subscribe),
.usersize = offsetof(struct sctp6_sock, sctp.initmsg) -
diff --git a/net/sctp/stream.c b/net/sctp/stream.c
index e83cdaa2ab76..67f7e71f9129 100644
--- a/net/sctp/stream.c
+++ b/net/sctp/stream.c
@@ -119,7 +119,7 @@ int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt,
* a new one with new outcnt to save memory if needed.
*/
if (outcnt == stream->outcnt)
- goto in;
+ goto handle_in;
/* Filter out chunks queued on streams that won't exist anymore */
sched->unsched_all(stream);
@@ -128,24 +128,28 @@ int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt,
ret = sctp_stream_alloc_out(stream, outcnt, gfp);
if (ret)
- goto out;
+ goto out_err;
for (i = 0; i < stream->outcnt; i++)
SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN;
-in:
+handle_in:
sctp_stream_interleave_init(stream);
if (!incnt)
goto out;
ret = sctp_stream_alloc_in(stream, incnt, gfp);
- if (ret) {
- sched->free(stream);
- genradix_free(&stream->out);
- stream->outcnt = 0;
- goto out;
- }
+ if (ret)
+ goto in_err;
+ goto out;
+
+in_err:
+ sched->free(stream);
+ genradix_free(&stream->in);
+out_err:
+ genradix_free(&stream->out);
+ stream->outcnt = 0;
out:
return ret;
}
@@ -218,10 +222,9 @@ void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new)
static int sctp_send_reconf(struct sctp_association *asoc,
struct sctp_chunk *chunk)
{
- struct net *net = sock_net(asoc->base.sk);
int retval = 0;
- retval = sctp_primitive_RECONF(net, asoc, chunk);
+ retval = sctp_primitive_RECONF(asoc->base.net, asoc, chunk);
if (retval)
sctp_chunk_free(chunk);
diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c
index 40c40be23fcb..6b13f737ebf2 100644
--- a/net/sctp/stream_interleave.c
+++ b/net/sctp/stream_interleave.c
@@ -241,9 +241,8 @@ out:
if (!first_frag)
return NULL;
- retval = sctp_make_reassembled_event(sock_net(ulpq->asoc->base.sk),
- &ulpq->reasm, first_frag,
- last_frag);
+ retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm,
+ first_frag, last_frag);
if (retval) {
sin->fsn = next_fsn;
if (is_last) {
@@ -326,7 +325,7 @@ static struct sctp_ulpevent *sctp_intl_retrieve_reassembled(
pd_point = sctp_sk(asoc->base.sk)->pd_point;
if (pd_point && pd_point <= pd_len) {
- retval = sctp_make_reassembled_event(sock_net(asoc->base.sk),
+ retval = sctp_make_reassembled_event(asoc->base.net,
&ulpq->reasm,
pd_first, pd_last);
if (retval) {
@@ -337,8 +336,7 @@ static struct sctp_ulpevent *sctp_intl_retrieve_reassembled(
goto out;
found:
- retval = sctp_make_reassembled_event(sock_net(asoc->base.sk),
- &ulpq->reasm,
+ retval = sctp_make_reassembled_event(asoc->base.net, &ulpq->reasm,
first_frag, pos);
if (retval)
retval->msg_flags |= MSG_EOR;
@@ -630,7 +628,7 @@ out:
if (!first_frag)
return NULL;
- retval = sctp_make_reassembled_event(sock_net(ulpq->asoc->base.sk),
+ retval = sctp_make_reassembled_event(ulpq->asoc->base.net,
&ulpq->reasm_uo, first_frag,
last_frag);
if (retval) {
@@ -716,7 +714,7 @@ static struct sctp_ulpevent *sctp_intl_retrieve_reassembled_uo(
pd_point = sctp_sk(asoc->base.sk)->pd_point;
if (pd_point && pd_point <= pd_len) {
- retval = sctp_make_reassembled_event(sock_net(asoc->base.sk),
+ retval = sctp_make_reassembled_event(asoc->base.net,
&ulpq->reasm_uo,
pd_first, pd_last);
if (retval) {
@@ -727,8 +725,7 @@ static struct sctp_ulpevent *sctp_intl_retrieve_reassembled_uo(
goto out;
found:
- retval = sctp_make_reassembled_event(sock_net(asoc->base.sk),
- &ulpq->reasm_uo,
+ retval = sctp_make_reassembled_event(asoc->base.net, &ulpq->reasm_uo,
first_frag, pos);
if (retval)
retval->msg_flags |= MSG_EOR;
@@ -814,7 +811,7 @@ static struct sctp_ulpevent *sctp_intl_retrieve_first_uo(struct sctp_ulpq *ulpq)
return NULL;
out:
- retval = sctp_make_reassembled_event(sock_net(ulpq->asoc->base.sk),
+ retval = sctp_make_reassembled_event(ulpq->asoc->base.net,
&ulpq->reasm_uo, first_frag,
last_frag);
if (retval) {
@@ -921,7 +918,7 @@ static struct sctp_ulpevent *sctp_intl_retrieve_first(struct sctp_ulpq *ulpq)
return NULL;
out:
- retval = sctp_make_reassembled_event(sock_net(ulpq->asoc->base.sk),
+ retval = sctp_make_reassembled_event(ulpq->asoc->base.net,
&ulpq->reasm, first_frag,
last_frag);
if (retval) {
@@ -1159,7 +1156,7 @@ static void sctp_generate_iftsn(struct sctp_outq *q, __u32 ctsn)
if (ftsn_chunk) {
list_add_tail(&ftsn_chunk->list, &q->control_chunk_list);
- SCTP_INC_STATS(sock_net(asoc->base.sk), SCTP_MIB_OUTCTRLCHUNKS);
+ SCTP_INC_STATS(asoc->base.net, SCTP_MIB_OUTCTRLCHUNKS);
}
}
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index 1250751bca1b..4740aa70e652 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -34,6 +34,8 @@ static int rto_alpha_min = 0;
static int rto_beta_min = 0;
static int rto_alpha_max = 1000;
static int rto_beta_max = 1000;
+static int pf_expose_max = SCTP_PF_EXPOSE_MAX;
+static int ps_retrans_max = SCTP_PS_RETRANS_MAX;
static unsigned long max_autoclose_min = 0;
static unsigned long max_autoclose_max =
@@ -212,7 +214,16 @@ static struct ctl_table sctp_net_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_INT_MAX,
+ .extra2 = &init_net.sctp.ps_retrans,
+ },
+ {
+ .procname = "ps_retrans",
+ .data = &init_net.sctp.ps_retrans,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &init_net.sctp.pf_retrans,
+ .extra2 = &ps_retrans_max,
},
{
.procname = "sndbuf_policy",
@@ -278,6 +289,13 @@ static struct ctl_table sctp_net_table[] = {
.proc_handler = proc_dointvec,
},
{
+ .procname = "ecn_enable",
+ .data = &init_net.sctp.ecn_enable,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
.procname = "addr_scope_policy",
.data = &init_net.sctp.scope_policy,
.maxlen = sizeof(int),
@@ -311,6 +329,15 @@ static struct ctl_table sctp_net_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+ {
+ .procname = "pf_expose",
+ .data = &init_net.sctp.pf_expose,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = &pf_expose_max,
+ },
{ /* sentinel */ }
};
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index e2f8e369cd08..806af58f4375 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -43,8 +43,8 @@ static struct sctp_transport *sctp_transport_init(struct net *net,
gfp_t gfp)
{
/* Copy in the address. */
- peer->ipaddr = *addr;
peer->af_specific = sctp_get_af_specific(addr->sa.sa_family);
+ memcpy(&peer->ipaddr, addr, peer->af_specific->sockaddr_len);
memset(&peer->saddr, 0, sizeof(union sctp_addr));
peer->sack_generation = 0;
@@ -263,7 +263,7 @@ bool sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu)
pf->af->from_sk(&addr, sk);
pf->to_sk_daddr(&t->ipaddr, sk);
- dst->ops->update_pmtu(dst, sk, NULL, pmtu);
+ dst->ops->update_pmtu(dst, sk, NULL, pmtu, true);
pf->to_sk_daddr(&addr, sk);
dst = sctp_transport_dst_check(t);
@@ -334,7 +334,7 @@ void sctp_transport_update_rto(struct sctp_transport *tp, __u32 rtt)
pr_debug("%s: rto_pending not set on transport %p!\n", __func__, tp);
if (tp->rttvar || tp->srtt) {
- struct net *net = sock_net(tp->asoc->base.sk);
+ struct net *net = tp->asoc->base.net;
/* 6.3.1 C3) When a new RTT measurement R' is made, set
* RTTVAR <- (1 - RTO.Beta) * RTTVAR + RTO.Beta * |SRTT - R'|
* SRTT <- (1 - RTO.Alpha) * SRTT + RTO.Alpha * R'
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index e0cc1edf49a0..c82dbdcf13f2 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -238,7 +238,7 @@ fail:
* When a destination address on a multi-homed peer encounters a change
* an interface details event is sent.
*/
-struct sctp_ulpevent *sctp_ulpevent_make_peer_addr_change(
+static struct sctp_ulpevent *sctp_ulpevent_make_peer_addr_change(
const struct sctp_association *asoc,
const struct sockaddr_storage *aaddr,
int flags, int state, int error, gfp_t gfp)
@@ -336,6 +336,22 @@ fail:
return NULL;
}
+void sctp_ulpevent_nofity_peer_addr_change(struct sctp_transport *transport,
+ int state, int error)
+{
+ struct sctp_association *asoc = transport->asoc;
+ struct sockaddr_storage addr;
+ struct sctp_ulpevent *event;
+
+ memset(&addr, 0, sizeof(struct sockaddr_storage));
+ memcpy(&addr, &transport->ipaddr, transport->af_specific->sockaddr_len);
+
+ event = sctp_ulpevent_make_peer_addr_change(asoc, &addr, 0, state,
+ error, GFP_ATOMIC);
+ if (event)
+ asoc->stream.si->enqueue_event(&asoc->ulpq, event);
+}
+
/* Create and initialize an SCTP_REMOTE_ERROR notification.
*
* Note: This assumes that the chunk->skb->data already points to the
@@ -511,6 +527,45 @@ fail:
return NULL;
}
+struct sctp_ulpevent *sctp_ulpevent_make_send_failed_event(
+ const struct sctp_association *asoc, struct sctp_chunk *chunk,
+ __u16 flags, __u32 error, gfp_t gfp)
+{
+ struct sctp_send_failed_event *ssf;
+ struct sctp_ulpevent *event;
+ struct sk_buff *skb;
+ int len;
+
+ skb = skb_copy_expand(chunk->skb, sizeof(*ssf), 0, gfp);
+ if (!skb)
+ return NULL;
+
+ len = ntohs(chunk->chunk_hdr->length);
+ len -= sctp_datachk_len(&asoc->stream);
+
+ skb_pull(skb, sctp_datachk_len(&asoc->stream));
+ event = sctp_skb2event(skb);
+ sctp_ulpevent_init(event, MSG_NOTIFICATION, skb->truesize);
+
+ ssf = skb_push(skb, sizeof(*ssf));
+ ssf->ssf_type = SCTP_SEND_FAILED_EVENT;
+ ssf->ssf_flags = flags;
+ ssf->ssf_length = sizeof(*ssf) + len;
+ skb_trim(skb, ssf->ssf_length);
+ ssf->ssf_error = error;
+
+ ssf->ssfe_info.snd_sid = chunk->sinfo.sinfo_stream;
+ ssf->ssfe_info.snd_ppid = chunk->sinfo.sinfo_ppid;
+ ssf->ssfe_info.snd_context = chunk->sinfo.sinfo_context;
+ ssf->ssfe_info.snd_assoc_id = chunk->sinfo.sinfo_assoc_id;
+ ssf->ssfe_info.snd_flags = chunk->chunk_hdr->flags;
+
+ sctp_ulpevent_set_owner(event, asoc);
+ ssf->ssf_assoc_id = sctp_assoc2id(asoc);
+
+ return event;
+}
+
/* Create and initialize a SCTP_SHUTDOWN_EVENT notification.
*
* Socket Extensions for SCTP - draft-01
diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
index b6536b7f14c0..1c6c640607c5 100644
--- a/net/sctp/ulpqueue.c
+++ b/net/sctp/ulpqueue.c
@@ -486,10 +486,9 @@ static struct sctp_ulpevent *sctp_ulpq_retrieve_reassembled(struct sctp_ulpq *ul
cevent = sctp_skb2event(pd_first);
pd_point = sctp_sk(asoc->base.sk)->pd_point;
if (pd_point && pd_point <= pd_len) {
- retval = sctp_make_reassembled_event(sock_net(asoc->base.sk),
+ retval = sctp_make_reassembled_event(asoc->base.net,
&ulpq->reasm,
- pd_first,
- pd_last);
+ pd_first, pd_last);
if (retval)
sctp_ulpq_set_pd(ulpq);
}
@@ -497,7 +496,7 @@ static struct sctp_ulpevent *sctp_ulpq_retrieve_reassembled(struct sctp_ulpq *ul
done:
return retval;
found:
- retval = sctp_make_reassembled_event(sock_net(ulpq->asoc->base.sk),
+ retval = sctp_make_reassembled_event(ulpq->asoc->base.net,
&ulpq->reasm, first_frag, pos);
if (retval)
retval->msg_flags |= MSG_EOR;
@@ -563,8 +562,8 @@ static struct sctp_ulpevent *sctp_ulpq_retrieve_partial(struct sctp_ulpq *ulpq)
* further.
*/
done:
- retval = sctp_make_reassembled_event(sock_net(ulpq->asoc->base.sk),
- &ulpq->reasm, first_frag, last_frag);
+ retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm,
+ first_frag, last_frag);
if (retval && is_last)
retval->msg_flags |= MSG_EOR;
@@ -664,8 +663,8 @@ static struct sctp_ulpevent *sctp_ulpq_retrieve_first(struct sctp_ulpq *ulpq)
* further.
*/
done:
- retval = sctp_make_reassembled_event(sock_net(ulpq->asoc->base.sk),
- &ulpq->reasm, first_frag, last_frag);
+ retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm,
+ first_frag, last_frag);
return retval;
}
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 5b932583e407..90988a511cd5 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -25,6 +25,7 @@
#include <linux/in.h>
#include <linux/sched/signal.h>
#include <linux/if_vlan.h>
+#include <linux/rcupdate_wait.h>
#include <net/sock.h>
#include <net/tcp.h>
@@ -123,6 +124,12 @@ struct proto smc_proto6 = {
};
EXPORT_SYMBOL_GPL(smc_proto6);
+static void smc_restore_fallback_changes(struct smc_sock *smc)
+{
+ smc->clcsock->file->private_data = smc->sk.sk_socket;
+ smc->clcsock->file = NULL;
+}
+
static int __smc_release(struct smc_sock *smc)
{
struct sock *sk = &smc->sk;
@@ -141,6 +148,7 @@ static int __smc_release(struct smc_sock *smc)
}
sk->sk_state = SMC_CLOSED;
sk->sk_state_change(sk);
+ smc_restore_fallback_changes(smc);
}
sk->sk_prot->unhash(sk);
@@ -167,6 +175,7 @@ static int smc_release(struct socket *sock)
if (!sk)
goto out;
+ sock_hold(sk); /* sock_put below */
smc = smc_sk(sk);
/* cleanup for a dangling non-blocking connect */
@@ -189,6 +198,7 @@ static int smc_release(struct socket *sock)
sock->sk = NULL;
release_sock(sk);
+ sock_put(sk); /* sock_hold above */
sock_put(sk); /* final sock_put */
out:
return rc;
@@ -460,6 +470,8 @@ static void smc_switch_to_fallback(struct smc_sock *smc)
if (smc->sk.sk_socket && smc->sk.sk_socket->file) {
smc->clcsock->file = smc->sk.sk_socket->file;
smc->clcsock->file->private_data = smc->clcsock;
+ smc->clcsock->wq.fasync_list =
+ smc->sk.sk_socket->wq.fasync_list;
}
}
@@ -700,8 +712,6 @@ static int __smc_connect(struct smc_sock *smc)
int smc_type;
int rc = 0;
- sock_hold(&smc->sk); /* sock put in passive closing */
-
if (smc->use_fallback)
return smc_connect_fallback(smc, smc->fallback_rsn);
@@ -791,6 +801,7 @@ static void smc_connect_work(struct work_struct *work)
smc->sk.sk_err = EPIPE;
else if (signal_pending(current))
smc->sk.sk_err = -sock_intr_errno(timeo);
+ sock_put(&smc->sk); /* passive closing */
goto out;
}
@@ -846,6 +857,10 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr,
rc = kernel_connect(smc->clcsock, addr, alen, flags);
if (rc && rc != -EINPROGRESS)
goto out;
+
+ sock_hold(&smc->sk); /* sock put in passive closing */
+ if (smc->use_fallback)
+ goto out;
if (flags & O_NONBLOCK) {
if (schedule_work(&smc->connect_work))
smc->connect_nonblock = 1;
@@ -970,12 +985,14 @@ void smc_close_non_accepted(struct sock *sk)
{
struct smc_sock *smc = smc_sk(sk);
+ sock_hold(sk); /* sock_put below */
lock_sock(sk);
if (!sk->sk_lingertime)
/* wait for peer closing */
sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
__smc_release(smc);
release_sock(sk);
+ sock_put(sk); /* sock_hold above */
sock_put(sk); /* final sock_put */
}
@@ -1291,8 +1308,8 @@ static void smc_listen_work(struct work_struct *work)
/* check if RDMA is available */
if (!ism_supported) { /* SMC_TYPE_R or SMC_TYPE_B */
/* prepare RDMA check */
- memset(&ini, 0, sizeof(ini));
ini.is_smcd = false;
+ ini.ism_dev = NULL;
ini.ib_lcl = &pclc->lcl;
rc = smc_find_rdma_device(new_smc, &ini);
if (rc) {
@@ -1708,8 +1725,6 @@ static int smc_setsockopt(struct socket *sock, int level, int optname,
sk->sk_err = smc->clcsock->sk->sk_err;
sk->sk_error_report(sk);
}
- if (rc)
- return rc;
if (optlen < sizeof(int))
return -EINVAL;
@@ -1717,6 +1732,8 @@ static int smc_setsockopt(struct socket *sock, int level, int optname,
return -EFAULT;
lock_sock(sk);
+ if (rc || smc->use_fallback)
+ goto out;
switch (optname) {
case TCP_ULP:
case TCP_FASTOPEN:
@@ -1724,19 +1741,18 @@ static int smc_setsockopt(struct socket *sock, int level, int optname,
case TCP_FASTOPEN_KEY:
case TCP_FASTOPEN_NO_COOKIE:
/* option not supported by SMC */
- if (sk->sk_state == SMC_INIT) {
+ if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) {
smc_switch_to_fallback(smc);
smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP;
} else {
- if (!smc->use_fallback)
- rc = -EINVAL;
+ rc = -EINVAL;
}
break;
case TCP_NODELAY:
if (sk->sk_state != SMC_INIT &&
sk->sk_state != SMC_LISTEN &&
sk->sk_state != SMC_CLOSED) {
- if (val && !smc->use_fallback)
+ if (val)
mod_delayed_work(system_wq, &smc->conn.tx_work,
0);
}
@@ -1745,7 +1761,7 @@ static int smc_setsockopt(struct socket *sock, int level, int optname,
if (sk->sk_state != SMC_INIT &&
sk->sk_state != SMC_LISTEN &&
sk->sk_state != SMC_CLOSED) {
- if (!val && !smc->use_fallback)
+ if (!val)
mod_delayed_work(system_wq, &smc->conn.tx_work,
0);
}
@@ -1756,6 +1772,7 @@ static int smc_setsockopt(struct socket *sock, int level, int optname,
default:
break;
}
+out:
release_sock(sk);
return rc;
@@ -2027,22 +2044,28 @@ static int __init smc_init(void)
if (rc)
goto out_pernet_subsys;
+ rc = smc_core_init();
+ if (rc) {
+ pr_err("%s: smc_core_init fails with %d\n", __func__, rc);
+ goto out_pnet;
+ }
+
rc = smc_llc_init();
if (rc) {
pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
- goto out_pnet;
+ goto out_core;
}
rc = smc_cdc_init();
if (rc) {
pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
- goto out_pnet;
+ goto out_core;
}
rc = proto_register(&smc_proto, 1);
if (rc) {
pr_err("%s: proto_register(v4) fails with %d\n", __func__, rc);
- goto out_pnet;
+ goto out_core;
}
rc = proto_register(&smc_proto6, 1);
@@ -2074,6 +2097,8 @@ out_proto6:
proto_unregister(&smc_proto6);
out_proto:
proto_unregister(&smc_proto);
+out_core:
+ smc_core_exit();
out_pnet:
smc_pnet_exit();
out_pernet_subsys:
@@ -2084,14 +2109,15 @@ out_pernet_subsys:
static void __exit smc_exit(void)
{
- smc_core_exit();
static_branch_disable(&tcp_have_smc);
- smc_ib_unregister_client();
sock_unregister(PF_SMC);
+ smc_core_exit();
+ smc_ib_unregister_client();
proto_unregister(&smc_proto6);
proto_unregister(&smc_proto);
smc_pnet_exit();
unregister_pernet_subsys(&smc_net_ops);
+ rcu_barrier();
}
module_init(smc_init);
diff --git a/net/smc/smc.h b/net/smc/smc.h
index 878313f8d6c1..be11ba41190f 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -188,6 +188,7 @@ struct smc_connection {
* 0 for SMC-R, 32 for SMC-D
*/
u64 peer_token; /* SMC-D token of peer */
+ u8 killed : 1; /* abnormal termination */
};
struct smc_sock { /* smc sock container */
diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c
index d0b0f4c865b4..164f1584861b 100644
--- a/net/smc/smc_cdc.c
+++ b/net/smc/smc_cdc.c
@@ -63,7 +63,7 @@ int smc_cdc_get_free_slot(struct smc_connection *conn,
rc = smc_wr_tx_get_free_slot(link, smc_cdc_tx_handler, wr_buf,
wr_rdma_buf,
(struct smc_wr_tx_pend_priv **)pend);
- if (!conn->alert_token_local)
+ if (conn->killed)
/* abnormal termination */
rc = -EPIPE;
return rc;
@@ -131,6 +131,9 @@ int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn)
{
int rc;
+ if (!conn->lgr || (conn->lgr->is_smcd && conn->lgr->peer_shutdown))
+ return -EPIPE;
+
if (conn->lgr->is_smcd) {
spin_lock_bh(&conn->send_lock);
rc = smcd_cdc_msg_send(conn);
@@ -328,7 +331,7 @@ static void smcd_cdc_rx_tsklet(unsigned long data)
struct smcd_cdc_msg cdc;
struct smc_sock *smc;
- if (!conn)
+ if (!conn || conn->killed)
return;
data_cdc = (struct smcd_cdc_msg *)conn->rmb_desc->cpu_addr;
diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c
index 49bcebff6378..86cccc24e52e 100644
--- a/net/smc/smc_clc.c
+++ b/net/smc/smc_clc.c
@@ -349,7 +349,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
smc->peer_diagnosis = ntohl(dclc->peer_diagnosis);
if (((struct smc_clc_msg_decline *)buf)->hdr.flag) {
smc->conn.lgr->sync_err = 1;
- smc_lgr_terminate(smc->conn.lgr);
+ smc_lgr_terminate(smc->conn.lgr, true);
}
}
@@ -372,7 +372,9 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info)
dclc.hdr.length = htons(sizeof(struct smc_clc_msg_decline));
dclc.hdr.version = SMC_CLC_V1;
dclc.hdr.flag = (peer_diag_info == SMC_CLC_DECL_SYNCERR) ? 1 : 0;
- memcpy(dclc.id_for_peer, local_systemid, sizeof(local_systemid));
+ if (smc->conn.lgr && !smc->conn.lgr->is_smcd)
+ memcpy(dclc.id_for_peer, local_systemid,
+ sizeof(local_systemid));
dclc.peer_diagnosis = htonl(peer_diag_info);
memcpy(dclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c
index fc06720b53c1..290270c821ca 100644
--- a/net/smc/smc_close.c
+++ b/net/smc/smc_close.c
@@ -13,14 +13,13 @@
#include <linux/sched/signal.h>
#include <net/sock.h>
+#include <net/tcp.h>
#include "smc.h"
#include "smc_tx.h"
#include "smc_cdc.h"
#include "smc_close.h"
-#define SMC_CLOSE_WAIT_LISTEN_CLCSOCK_TIME (5 * HZ)
-
/* release the clcsock that is assigned to the smc_sock */
void smc_clcsock_release(struct smc_sock *smc)
{
@@ -65,8 +64,9 @@ static void smc_close_stream_wait(struct smc_sock *smc, long timeout)
rc = sk_wait_event(sk, &timeout,
!smc_tx_prepared_sends(&smc->conn) ||
- (sk->sk_err == ECONNABORTED) ||
- (sk->sk_err == ECONNRESET),
+ sk->sk_err == ECONNABORTED ||
+ sk->sk_err == ECONNRESET ||
+ smc->conn.killed,
&wait);
if (rc)
break;
@@ -95,68 +95,73 @@ static int smc_close_final(struct smc_connection *conn)
conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
else
conn->local_tx_ctrl.conn_state_flags.peer_conn_closed = 1;
+ if (conn->killed)
+ return -EPIPE;
return smc_cdc_get_slot_and_msg_send(conn);
}
-static int smc_close_abort(struct smc_connection *conn)
+int smc_close_abort(struct smc_connection *conn)
{
conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
return smc_cdc_get_slot_and_msg_send(conn);
}
+static void smc_close_cancel_work(struct smc_sock *smc)
+{
+ struct sock *sk = &smc->sk;
+
+ release_sock(sk);
+ cancel_work_sync(&smc->conn.close_work);
+ cancel_delayed_work_sync(&smc->conn.tx_work);
+ lock_sock(sk);
+ sk->sk_state = SMC_CLOSED;
+}
+
/* terminate smc socket abnormally - active abort
* link group is terminated, i.e. RDMA communication no longer possible
*/
-static void smc_close_active_abort(struct smc_sock *smc)
+void smc_close_active_abort(struct smc_sock *smc)
{
struct sock *sk = &smc->sk;
-
- struct smc_cdc_conn_state_flags *txflags =
- &smc->conn.local_tx_ctrl.conn_state_flags;
+ bool release_clcsock = false;
if (sk->sk_state != SMC_INIT && smc->clcsock && smc->clcsock->sk) {
sk->sk_err = ECONNABORTED;
- if (smc->clcsock && smc->clcsock->sk) {
- smc->clcsock->sk->sk_err = ECONNABORTED;
- smc->clcsock->sk->sk_state_change(smc->clcsock->sk);
- }
+ if (smc->clcsock && smc->clcsock->sk)
+ tcp_abort(smc->clcsock->sk, ECONNABORTED);
}
switch (sk->sk_state) {
case SMC_ACTIVE:
sk->sk_state = SMC_PEERABORTWAIT;
- release_sock(sk);
- cancel_delayed_work_sync(&smc->conn.tx_work);
- lock_sock(sk);
+ smc_close_cancel_work(smc);
+ sk->sk_state = SMC_CLOSED;
sock_put(sk); /* passive closing */
break;
case SMC_APPCLOSEWAIT1:
case SMC_APPCLOSEWAIT2:
- if (!smc_cdc_rxed_any_close(&smc->conn))
- sk->sk_state = SMC_PEERABORTWAIT;
- else
- sk->sk_state = SMC_CLOSED;
- release_sock(sk);
- cancel_delayed_work_sync(&smc->conn.tx_work);
- lock_sock(sk);
+ smc_close_cancel_work(smc);
+ sk->sk_state = SMC_CLOSED;
+ sock_put(sk); /* postponed passive closing */
break;
case SMC_PEERCLOSEWAIT1:
case SMC_PEERCLOSEWAIT2:
- if (!txflags->peer_conn_closed) {
- /* just SHUTDOWN_SEND done */
- sk->sk_state = SMC_PEERABORTWAIT;
- } else {
- sk->sk_state = SMC_CLOSED;
- }
+ case SMC_PEERFINCLOSEWAIT:
+ sk->sk_state = SMC_PEERABORTWAIT;
+ smc_close_cancel_work(smc);
+ sk->sk_state = SMC_CLOSED;
+ smc_conn_free(&smc->conn);
+ release_clcsock = true;
sock_put(sk); /* passive closing */
break;
case SMC_PROCESSABORT:
case SMC_APPFINCLOSEWAIT:
+ sk->sk_state = SMC_PEERABORTWAIT;
+ smc_close_cancel_work(smc);
sk->sk_state = SMC_CLOSED;
- break;
- case SMC_PEERFINCLOSEWAIT:
- sock_put(sk); /* passive closing */
+ smc_conn_free(&smc->conn);
+ release_clcsock = true;
break;
case SMC_INIT:
case SMC_PEERABORTWAIT:
@@ -166,6 +171,12 @@ static void smc_close_active_abort(struct smc_sock *smc)
sock_set_flag(sk, SOCK_DEAD);
sk->sk_state_change(sk);
+
+ if (release_clcsock) {
+ release_sock(sk);
+ smc_clcsock_release(smc);
+ lock_sock(sk);
+ }
}
static inline bool smc_close_sent_any_close(struct smc_connection *conn)
@@ -215,8 +226,6 @@ again:
if (sk->sk_state == SMC_ACTIVE) {
/* send close request */
rc = smc_close_final(conn);
- if (rc)
- break;
sk->sk_state = SMC_PEERCLOSEWAIT1;
} else {
/* peer event has changed the state */
@@ -229,8 +238,6 @@ again:
!smc_close_sent_any_close(conn)) {
/* just shutdown wr done, send close request */
rc = smc_close_final(conn);
- if (rc)
- break;
}
sk->sk_state = SMC_CLOSED;
break;
@@ -246,8 +253,6 @@ again:
goto again;
/* confirm close from peer */
rc = smc_close_final(conn);
- if (rc)
- break;
if (smc_cdc_rxed_any_close(conn)) {
/* peer has closed the socket already */
sk->sk_state = SMC_CLOSED;
@@ -263,8 +268,6 @@ again:
!smc_close_sent_any_close(conn)) {
/* just shutdown wr done, send close request */
rc = smc_close_final(conn);
- if (rc)
- break;
}
/* peer sending PeerConnectionClosed will cause transition */
break;
@@ -272,10 +275,12 @@ again:
/* peer sending PeerConnectionClosed will cause transition */
break;
case SMC_PROCESSABORT:
- smc_close_abort(conn);
+ rc = smc_close_abort(conn);
sk->sk_state = SMC_CLOSED;
break;
case SMC_PEERABORTWAIT:
+ sk->sk_state = SMC_CLOSED;
+ break;
case SMC_CLOSED:
/* nothing to do, add tracing in future patch */
break;
@@ -344,12 +349,6 @@ static void smc_close_passive_work(struct work_struct *work)
lock_sock(sk);
old_state = sk->sk_state;
- if (!conn->alert_token_local) {
- /* abnormal termination */
- smc_close_active_abort(smc);
- goto wakeup;
- }
-
rxflags = &conn->local_rx_ctrl.conn_state_flags;
if (rxflags->peer_conn_abort) {
/* peer has not received all data */
@@ -451,8 +450,6 @@ again:
goto again;
/* send close wr request */
rc = smc_close_wr(conn);
- if (rc)
- break;
sk->sk_state = SMC_PEERCLOSEWAIT1;
break;
case SMC_APPCLOSEWAIT1:
@@ -466,8 +463,6 @@ again:
goto again;
/* confirm close from peer */
rc = smc_close_wr(conn);
- if (rc)
- break;
sk->sk_state = SMC_APPCLOSEWAIT2;
break;
case SMC_APPCLOSEWAIT2:
diff --git a/net/smc/smc_close.h b/net/smc/smc_close.h
index e0e3b5df25d2..634fea2b7c95 100644
--- a/net/smc/smc_close.h
+++ b/net/smc/smc_close.h
@@ -24,5 +24,7 @@ int smc_close_active(struct smc_sock *smc);
int smc_close_shutdown_write(struct smc_sock *smc);
void smc_close_init(struct smc_sock *smc);
void smc_clcsock_release(struct smc_sock *smc);
+int smc_close_abort(struct smc_connection *conn);
+void smc_close_active_abort(struct smc_sock *smc);
#endif /* SMC_CLOSE_H */
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 4ca50ddf8d16..2249de5379ee 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -13,6 +13,8 @@
#include <linux/if_vlan.h>
#include <linux/random.h>
#include <linux/workqueue.h>
+#include <linux/wait.h>
+#include <linux/reboot.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <rdma/ib_verbs.h>
@@ -39,23 +41,46 @@ static struct smc_lgr_list smc_lgr_list = { /* established link groups */
.num = 0,
};
+static atomic_t lgr_cnt = ATOMIC_INIT(0); /* number of existing link groups */
+static DECLARE_WAIT_QUEUE_HEAD(lgrs_deleted);
+
static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb,
struct smc_buf_desc *buf_desc);
+/* return head of link group list and its lock for a given link group */
+static inline struct list_head *smc_lgr_list_head(struct smc_link_group *lgr,
+ spinlock_t **lgr_lock)
+{
+ if (lgr->is_smcd) {
+ *lgr_lock = &lgr->smcd->lgr_lock;
+ return &lgr->smcd->lgr_list;
+ }
+
+ *lgr_lock = &smc_lgr_list.lock;
+ return &smc_lgr_list.list;
+}
+
static void smc_lgr_schedule_free_work(struct smc_link_group *lgr)
{
/* client link group creation always follows the server link group
* creation. For client use a somewhat higher removal delay time,
* otherwise there is a risk of out-of-sync link groups.
*/
- mod_delayed_work(system_wq, &lgr->free_work,
- (!lgr->is_smcd && lgr->role == SMC_CLNT) ?
- SMC_LGR_FREE_DELAY_CLNT : SMC_LGR_FREE_DELAY_SERV);
+ if (!lgr->freeing && !lgr->freefast) {
+ mod_delayed_work(system_wq, &lgr->free_work,
+ (!lgr->is_smcd && lgr->role == SMC_CLNT) ?
+ SMC_LGR_FREE_DELAY_CLNT :
+ SMC_LGR_FREE_DELAY_SERV);
+ }
}
void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr)
{
- mod_delayed_work(system_wq, &lgr->free_work, SMC_LGR_FREE_DELAY_FAST);
+ if (!lgr->freeing && !lgr->freefast) {
+ lgr->freefast = 1;
+ mod_delayed_work(system_wq, &lgr->free_work,
+ SMC_LGR_FREE_DELAY_FAST);
+ }
}
/* Register connection's alert token in our lookup structure.
@@ -134,16 +159,17 @@ static void smc_lgr_unregister_conn(struct smc_connection *conn)
__smc_lgr_unregister_conn(conn);
}
write_unlock_bh(&lgr->conns_lock);
+ conn->lgr = NULL;
}
/* Send delete link, either as client to request the initiation
* of the DELETE LINK sequence from server; or as server to
* initiate the delete processing. See smc_llc_rx_delete_link().
*/
-static int smc_link_send_delete(struct smc_link *lnk)
+static int smc_link_send_delete(struct smc_link *lnk, bool orderly)
{
if (lnk->state == SMC_LNK_ACTIVE &&
- !smc_llc_send_delete_link(lnk, SMC_LLC_REQ, true)) {
+ !smc_llc_send_delete_link(lnk, SMC_LLC_REQ, orderly)) {
smc_llc_link_deleting(lnk);
return 0;
}
@@ -157,48 +183,62 @@ static void smc_lgr_free_work(struct work_struct *work)
struct smc_link_group *lgr = container_of(to_delayed_work(work),
struct smc_link_group,
free_work);
+ spinlock_t *lgr_lock;
+ struct smc_link *lnk;
bool conns;
- spin_lock_bh(&smc_lgr_list.lock);
+ smc_lgr_list_head(lgr, &lgr_lock);
+ spin_lock_bh(lgr_lock);
+ if (lgr->freeing) {
+ spin_unlock_bh(lgr_lock);
+ return;
+ }
read_lock_bh(&lgr->conns_lock);
conns = RB_EMPTY_ROOT(&lgr->conns_all);
read_unlock_bh(&lgr->conns_lock);
if (!conns) { /* number of lgr connections is no longer zero */
- spin_unlock_bh(&smc_lgr_list.lock);
+ spin_unlock_bh(lgr_lock);
return;
}
- if (!list_empty(&lgr->list))
- list_del_init(&lgr->list); /* remove from smc_lgr_list */
- spin_unlock_bh(&smc_lgr_list.lock);
+ list_del_init(&lgr->list); /* remove from smc_lgr_list */
+ lnk = &lgr->lnk[SMC_SINGLE_LINK];
if (!lgr->is_smcd && !lgr->terminating) {
- struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
-
/* try to send del link msg, on error free lgr immediately */
if (lnk->state == SMC_LNK_ACTIVE &&
- !smc_link_send_delete(lnk)) {
+ !smc_link_send_delete(lnk, true)) {
/* reschedule in case we never receive a response */
smc_lgr_schedule_free_work(lgr);
+ spin_unlock_bh(lgr_lock);
return;
}
}
+ lgr->freeing = 1; /* this instance does the freeing, no new schedule */
+ spin_unlock_bh(lgr_lock);
+ cancel_delayed_work(&lgr->free_work);
+
+ if (!lgr->is_smcd && lnk->state != SMC_LNK_INACTIVE)
+ smc_llc_link_inactive(lnk);
+ if (lgr->is_smcd && !lgr->terminating)
+ smc_ism_signal_shutdown(lgr);
+ smc_lgr_free(lgr);
+}
- if (!delayed_work_pending(&lgr->free_work)) {
- struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
+static void smc_lgr_terminate_work(struct work_struct *work)
+{
+ struct smc_link_group *lgr = container_of(work, struct smc_link_group,
+ terminate_work);
- if (!lgr->is_smcd && lnk->state != SMC_LNK_INACTIVE)
- smc_llc_link_inactive(lnk);
- if (lgr->is_smcd)
- smc_ism_signal_shutdown(lgr);
- smc_lgr_free(lgr);
- }
+ smc_lgr_terminate(lgr, true);
}
/* create a new SMC link group */
static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini)
{
struct smc_link_group *lgr;
+ struct list_head *lgr_list;
struct smc_link *lnk;
+ spinlock_t *lgr_lock;
u8 rndvec[3];
int rc = 0;
int i;
@@ -213,10 +253,13 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini)
lgr = kzalloc(sizeof(*lgr), GFP_KERNEL);
if (!lgr) {
rc = SMC_CLC_DECL_MEM;
- goto out;
+ goto ism_put_vlan;
}
lgr->is_smcd = ini->is_smcd;
lgr->sync_err = 0;
+ lgr->terminating = 0;
+ lgr->freefast = 0;
+ lgr->freeing = 0;
lgr->vlan_id = ini->vlan_id;
rwlock_init(&lgr->sndbufs_lock);
rwlock_init(&lgr->rmbs_lock);
@@ -228,13 +271,20 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini)
smc_lgr_list.num += SMC_LGR_NUM_INCR;
memcpy(&lgr->id, (u8 *)&smc_lgr_list.num, SMC_LGR_ID_SIZE);
INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work);
+ INIT_WORK(&lgr->terminate_work, smc_lgr_terminate_work);
lgr->conns_all = RB_ROOT;
if (ini->is_smcd) {
/* SMC-D specific settings */
+ get_device(&ini->ism_dev->dev);
lgr->peer_gid = ini->ism_gid;
lgr->smcd = ini->ism_dev;
+ lgr_list = &ini->ism_dev->lgr_list;
+ lgr_lock = &lgr->smcd->lgr_lock;
+ lgr->peer_shutdown = 0;
+ atomic_inc(&ini->ism_dev->lgr_cnt);
} else {
/* SMC-R specific settings */
+ get_device(&ini->ib_dev->ibdev->dev);
lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
memcpy(lgr->peer_systemid, ini->ib_lcl->id_for_peer,
SMC_SYSTEMID_LEN);
@@ -245,6 +295,8 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini)
lnk->link_id = SMC_SINGLE_LINK;
lnk->smcibdev = ini->ib_dev;
lnk->ibport = ini->ib_port;
+ lgr_list = &smc_lgr_list.list;
+ lgr_lock = &smc_lgr_list.lock;
lnk->path_mtu =
ini->ib_dev->pattr[ini->ib_port - 1].active_mtu;
if (!ini->ib_dev->initialized)
@@ -272,11 +324,13 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini)
rc = smc_wr_create_link(lnk);
if (rc)
goto destroy_qp;
+ atomic_inc(&lgr_cnt);
+ atomic_inc(&ini->ib_dev->lnk_cnt);
}
smc->conn.lgr = lgr;
- spin_lock_bh(&smc_lgr_list.lock);
- list_add(&lgr->list, &smc_lgr_list.list);
- spin_unlock_bh(&smc_lgr_list.lock);
+ spin_lock_bh(lgr_lock);
+ list_add(&lgr->list, lgr_list);
+ spin_unlock_bh(lgr_lock);
return 0;
destroy_qp:
@@ -289,6 +343,9 @@ clear_llc_lnk:
smc_llc_link_clear(lnk);
free_lgr:
kfree(lgr);
+ism_put_vlan:
+ if (ini->is_smcd && ini->vlan_id)
+ smc_ism_put_vlan(ini->ism_dev, ini->vlan_id);
out:
if (rc < 0) {
if (rc == -ENOMEM)
@@ -306,7 +363,7 @@ static void smc_buf_unuse(struct smc_connection *conn,
conn->sndbuf_desc->used = 0;
if (conn->rmb_desc) {
if (!conn->rmb_desc->regerr) {
- if (!lgr->is_smcd) {
+ if (!lgr->is_smcd && !list_empty(&lgr->list)) {
/* unregister rmb with peer */
smc_llc_do_delete_rkey(
&lgr->lnk[SMC_SINGLE_LINK],
@@ -332,14 +389,16 @@ void smc_conn_free(struct smc_connection *conn)
if (!lgr)
return;
if (lgr->is_smcd) {
- smc_ism_unset_conn(conn);
+ if (!list_empty(&lgr->list))
+ smc_ism_unset_conn(conn);
tasklet_kill(&conn->rx_tsklet);
} else {
smc_cdc_tx_dismiss_slots(conn);
}
- smc_lgr_unregister_conn(conn);
- smc_buf_unuse(conn, lgr); /* allow buffer reuse */
- conn->lgr = NULL;
+ if (!list_empty(&lgr->list)) {
+ smc_lgr_unregister_conn(conn);
+ smc_buf_unuse(conn, lgr); /* allow buffer reuse */
+ }
if (!lgr->conns_num)
smc_lgr_schedule_free_work(lgr);
@@ -354,6 +413,8 @@ static void smc_link_clear(struct smc_link *lnk)
smc_ib_destroy_queue_pair(lnk);
smc_ib_dealloc_protection_domain(lnk);
smc_wr_free_link_mem(lnk);
+ if (!atomic_dec_return(&lnk->smcibdev->lnk_cnt))
+ wake_up(&lnk->smcibdev->lnks_deleted);
}
static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb,
@@ -430,24 +491,101 @@ static void smc_lgr_free_bufs(struct smc_link_group *lgr)
static void smc_lgr_free(struct smc_link_group *lgr)
{
smc_lgr_free_bufs(lgr);
- if (lgr->is_smcd)
- smc_ism_put_vlan(lgr->smcd, lgr->vlan_id);
- else
+ if (lgr->is_smcd) {
+ if (!lgr->terminating) {
+ smc_ism_put_vlan(lgr->smcd, lgr->vlan_id);
+ put_device(&lgr->smcd->dev);
+ }
+ if (!atomic_dec_return(&lgr->smcd->lgr_cnt))
+ wake_up(&lgr->smcd->lgrs_deleted);
+ } else {
smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]);
+ put_device(&lgr->lnk[SMC_SINGLE_LINK].smcibdev->ibdev->dev);
+ if (!atomic_dec_return(&lgr_cnt))
+ wake_up(&lgrs_deleted);
+ }
kfree(lgr);
}
void smc_lgr_forget(struct smc_link_group *lgr)
{
- spin_lock_bh(&smc_lgr_list.lock);
+ struct list_head *lgr_list;
+ spinlock_t *lgr_lock;
+
+ lgr_list = smc_lgr_list_head(lgr, &lgr_lock);
+ spin_lock_bh(lgr_lock);
/* do not use this link group for new connections */
- if (!list_empty(&lgr->list))
- list_del_init(&lgr->list);
- spin_unlock_bh(&smc_lgr_list.lock);
+ if (!list_empty(lgr_list))
+ list_del_init(lgr_list);
+ spin_unlock_bh(lgr_lock);
+}
+
+static void smcd_unregister_all_dmbs(struct smc_link_group *lgr)
+{
+ int i;
+
+ for (i = 0; i < SMC_RMBE_SIZES; i++) {
+ struct smc_buf_desc *buf_desc;
+
+ list_for_each_entry(buf_desc, &lgr->rmbs[i], list) {
+ buf_desc->len += sizeof(struct smcd_cdc_msg);
+ smc_ism_unregister_dmb(lgr->smcd, buf_desc);
+ }
+ }
+}
+
+static void smc_sk_wake_ups(struct smc_sock *smc)
+{
+ smc->sk.sk_write_space(&smc->sk);
+ smc->sk.sk_data_ready(&smc->sk);
+ smc->sk.sk_state_change(&smc->sk);
}
-/* terminate linkgroup abnormally */
-static void __smc_lgr_terminate(struct smc_link_group *lgr)
+/* kill a connection */
+static void smc_conn_kill(struct smc_connection *conn, bool soft)
+{
+ struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
+
+ if (conn->lgr->is_smcd && conn->lgr->peer_shutdown)
+ conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
+ else
+ smc_close_abort(conn);
+ conn->killed = 1;
+ smc->sk.sk_err = ECONNABORTED;
+ smc_sk_wake_ups(smc);
+ if (conn->lgr->is_smcd) {
+ smc_ism_unset_conn(conn);
+ if (soft)
+ tasklet_kill(&conn->rx_tsklet);
+ else
+ tasklet_unlock_wait(&conn->rx_tsklet);
+ } else {
+ smc_cdc_tx_dismiss_slots(conn);
+ }
+ smc_lgr_unregister_conn(conn);
+ smc_close_active_abort(smc);
+}
+
+static void smc_lgr_cleanup(struct smc_link_group *lgr)
+{
+ if (lgr->is_smcd) {
+ smc_ism_signal_shutdown(lgr);
+ smcd_unregister_all_dmbs(lgr);
+ smc_ism_put_vlan(lgr->smcd, lgr->vlan_id);
+ put_device(&lgr->smcd->dev);
+ } else {
+ struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
+
+ wake_up(&lnk->wr_reg_wait);
+ if (lnk->state != SMC_LNK_INACTIVE) {
+ smc_link_send_delete(lnk, false);
+ smc_llc_link_inactive(lnk);
+ }
+ }
+}
+
+/* terminate link group */
+static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft)
{
struct smc_connection *conn;
struct smc_sock *smc;
@@ -455,80 +593,161 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr)
if (lgr->terminating)
return; /* lgr already terminating */
+ if (!soft)
+ cancel_delayed_work_sync(&lgr->free_work);
lgr->terminating = 1;
- if (!list_empty(&lgr->list)) /* forget lgr */
- list_del_init(&lgr->list);
if (!lgr->is_smcd)
smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);
- write_lock_bh(&lgr->conns_lock);
+ /* kill remaining link group connections */
+ read_lock_bh(&lgr->conns_lock);
node = rb_first(&lgr->conns_all);
while (node) {
+ read_unlock_bh(&lgr->conns_lock);
conn = rb_entry(node, struct smc_connection, alert_node);
smc = container_of(conn, struct smc_sock, conn);
- sock_hold(&smc->sk); /* sock_put in close work */
- conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
- __smc_lgr_unregister_conn(conn);
- conn->lgr = NULL;
- write_unlock_bh(&lgr->conns_lock);
- if (!schedule_work(&conn->close_work))
- sock_put(&smc->sk);
- write_lock_bh(&lgr->conns_lock);
+ sock_hold(&smc->sk); /* sock_put below */
+ lock_sock(&smc->sk);
+ smc_conn_kill(conn, soft);
+ release_sock(&smc->sk);
+ sock_put(&smc->sk); /* sock_hold above */
+ read_lock_bh(&lgr->conns_lock);
node = rb_first(&lgr->conns_all);
}
- write_unlock_bh(&lgr->conns_lock);
- if (!lgr->is_smcd)
- wake_up(&lgr->lnk[SMC_SINGLE_LINK].wr_reg_wait);
- smc_lgr_schedule_free_work(lgr);
+ read_unlock_bh(&lgr->conns_lock);
+ smc_lgr_cleanup(lgr);
+ if (soft)
+ smc_lgr_schedule_free_work_fast(lgr);
+ else
+ smc_lgr_free(lgr);
}
-void smc_lgr_terminate(struct smc_link_group *lgr)
+/* unlink and terminate link group
+ * @soft: true if link group shutdown can take its time
+ * false if immediate link group shutdown is required
+ */
+void smc_lgr_terminate(struct smc_link_group *lgr, bool soft)
{
- spin_lock_bh(&smc_lgr_list.lock);
- __smc_lgr_terminate(lgr);
- spin_unlock_bh(&smc_lgr_list.lock);
+ spinlock_t *lgr_lock;
+
+ smc_lgr_list_head(lgr, &lgr_lock);
+ spin_lock_bh(lgr_lock);
+ if (lgr->terminating) {
+ spin_unlock_bh(lgr_lock);
+ return; /* lgr already terminating */
+ }
+ if (!soft)
+ lgr->freeing = 1;
+ list_del_init(&lgr->list);
+ spin_unlock_bh(lgr_lock);
+ __smc_lgr_terminate(lgr, soft);
}
/* Called when IB port is terminated */
void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport)
{
struct smc_link_group *lgr, *l;
+ LIST_HEAD(lgr_free_list);
spin_lock_bh(&smc_lgr_list.lock);
list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) {
if (!lgr->is_smcd &&
lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev &&
- lgr->lnk[SMC_SINGLE_LINK].ibport == ibport)
- __smc_lgr_terminate(lgr);
+ lgr->lnk[SMC_SINGLE_LINK].ibport == ibport) {
+ list_move(&lgr->list, &lgr_free_list);
+ lgr->freeing = 1;
+ }
}
spin_unlock_bh(&smc_lgr_list.lock);
+
+ list_for_each_entry_safe(lgr, l, &lgr_free_list, list) {
+ list_del_init(&lgr->list);
+ __smc_lgr_terminate(lgr, false);
+ }
}
-/* Called when SMC-D device is terminated or peer is lost */
+/* Called when peer lgr shutdown (regularly or abnormally) is received */
void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, unsigned short vlan)
{
struct smc_link_group *lgr, *l;
LIST_HEAD(lgr_free_list);
/* run common cleanup function and build free list */
- spin_lock_bh(&smc_lgr_list.lock);
- list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) {
- if (lgr->is_smcd && lgr->smcd == dev &&
- (!peer_gid || lgr->peer_gid == peer_gid) &&
+ spin_lock_bh(&dev->lgr_lock);
+ list_for_each_entry_safe(lgr, l, &dev->lgr_list, list) {
+ if ((!peer_gid || lgr->peer_gid == peer_gid) &&
(vlan == VLAN_VID_MASK || lgr->vlan_id == vlan)) {
- __smc_lgr_terminate(lgr);
+ if (peer_gid) /* peer triggered termination */
+ lgr->peer_shutdown = 1;
list_move(&lgr->list, &lgr_free_list);
}
}
- spin_unlock_bh(&smc_lgr_list.lock);
+ spin_unlock_bh(&dev->lgr_lock);
/* cancel the regular free workers and actually free lgrs */
list_for_each_entry_safe(lgr, l, &lgr_free_list, list) {
list_del_init(&lgr->list);
- cancel_delayed_work_sync(&lgr->free_work);
- if (!peer_gid && vlan == VLAN_VID_MASK) /* dev terminated? */
- smc_ism_signal_shutdown(lgr);
- smc_lgr_free(lgr);
+ schedule_work(&lgr->terminate_work);
+ }
+}
+
+/* Called when an SMCD device is removed or the smc module is unloaded */
+void smc_smcd_terminate_all(struct smcd_dev *smcd)
+{
+ struct smc_link_group *lgr, *lg;
+ LIST_HEAD(lgr_free_list);
+
+ spin_lock_bh(&smcd->lgr_lock);
+ list_splice_init(&smcd->lgr_list, &lgr_free_list);
+ list_for_each_entry(lgr, &lgr_free_list, list)
+ lgr->freeing = 1;
+ spin_unlock_bh(&smcd->lgr_lock);
+
+ list_for_each_entry_safe(lgr, lg, &lgr_free_list, list) {
+ list_del_init(&lgr->list);
+ __smc_lgr_terminate(lgr, false);
+ }
+
+ if (atomic_read(&smcd->lgr_cnt))
+ wait_event(smcd->lgrs_deleted, !atomic_read(&smcd->lgr_cnt));
+}
+
+/* Called when an SMCR device is removed or the smc module is unloaded.
+ * If smcibdev is given, all SMCR link groups using this device are terminated.
+ * If smcibdev is NULL, all SMCR link groups are terminated.
+ */
+void smc_smcr_terminate_all(struct smc_ib_device *smcibdev)
+{
+ struct smc_link_group *lgr, *lg;
+ LIST_HEAD(lgr_free_list);
+
+ spin_lock_bh(&smc_lgr_list.lock);
+ if (!smcibdev) {
+ list_splice_init(&smc_lgr_list.list, &lgr_free_list);
+ list_for_each_entry(lgr, &lgr_free_list, list)
+ lgr->freeing = 1;
+ } else {
+ list_for_each_entry_safe(lgr, lg, &smc_lgr_list.list, list) {
+ if (lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev) {
+ list_move(&lgr->list, &lgr_free_list);
+ lgr->freeing = 1;
+ }
+ }
+ }
+ spin_unlock_bh(&smc_lgr_list.lock);
+
+ list_for_each_entry_safe(lgr, lg, &lgr_free_list, list) {
+ list_del_init(&lgr->list);
+ __smc_lgr_terminate(lgr, false);
+ }
+
+ if (smcibdev) {
+ if (atomic_read(&smcibdev->lnk_cnt))
+ wait_event(smcibdev->lnks_deleted,
+ !atomic_read(&smcibdev->lnk_cnt));
+ } else {
+ if (atomic_read(&lgr_cnt))
+ wait_event(lgrs_deleted, !atomic_read(&lgr_cnt));
}
}
@@ -558,7 +777,7 @@ int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini)
}
rtnl_lock();
- nest_lvl = dev_get_nest_level(ndev);
+ nest_lvl = ndev->lower_level;
for (i = 0; i < nest_lvl; i++) {
struct list_head *lower = &ndev->adj_list.lower;
@@ -604,10 +823,14 @@ static bool smcd_lgr_match(struct smc_link_group *lgr,
int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini)
{
struct smc_connection *conn = &smc->conn;
+ struct list_head *lgr_list;
struct smc_link_group *lgr;
enum smc_lgr_role role;
+ spinlock_t *lgr_lock;
int rc = 0;
+ lgr_list = ini->is_smcd ? &ini->ism_dev->lgr_list : &smc_lgr_list.list;
+ lgr_lock = ini->is_smcd ? &ini->ism_dev->lgr_lock : &smc_lgr_list.lock;
ini->cln_first_contact = SMC_FIRST_CONTACT;
role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
if (role == SMC_CLNT && ini->srv_first_contact)
@@ -615,8 +838,8 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini)
goto create;
/* determine if an existing link group can be reused */
- spin_lock_bh(&smc_lgr_list.lock);
- list_for_each_entry(lgr, &smc_lgr_list.list, list) {
+ spin_lock_bh(lgr_lock);
+ list_for_each_entry(lgr, lgr_list, list) {
write_lock_bh(&lgr->conns_lock);
if ((ini->is_smcd ?
smcd_lgr_match(lgr, ini->ism_dev, ini->ism_gid) :
@@ -636,7 +859,7 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini)
}
write_unlock_bh(&lgr->conns_lock);
}
- spin_unlock_bh(&smc_lgr_list.lock);
+ spin_unlock_bh(lgr_lock);
if (role == SMC_CLNT && !ini->srv_first_contact &&
ini->cln_first_contact == SMC_FIRST_CONTACT) {
@@ -1024,29 +1247,62 @@ int smc_rmb_rtoken_handling(struct smc_connection *conn,
return 0;
}
-/* Called (from smc_exit) when module is removed */
-void smc_core_exit(void)
+static void smc_core_going_away(void)
{
- struct smc_link_group *lgr, *lg;
- LIST_HEAD(lgr_freeing_list);
+ struct smc_ib_device *smcibdev;
+ struct smcd_dev *smcd;
- spin_lock_bh(&smc_lgr_list.lock);
- if (!list_empty(&smc_lgr_list.list))
- list_splice_init(&smc_lgr_list.list, &lgr_freeing_list);
- spin_unlock_bh(&smc_lgr_list.lock);
- list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) {
- list_del_init(&lgr->list);
- if (!lgr->is_smcd) {
- struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
+ spin_lock(&smc_ib_devices.lock);
+ list_for_each_entry(smcibdev, &smc_ib_devices.list, list) {
+ int i;
- if (lnk->state == SMC_LNK_ACTIVE)
- smc_llc_send_delete_link(lnk, SMC_LLC_REQ,
- false);
- smc_llc_link_inactive(lnk);
- }
- cancel_delayed_work_sync(&lgr->free_work);
- if (lgr->is_smcd)
- smc_ism_signal_shutdown(lgr);
- smc_lgr_free(lgr); /* free link group */
+ for (i = 0; i < SMC_MAX_PORTS; i++)
+ set_bit(i, smcibdev->ports_going_away);
+ }
+ spin_unlock(&smc_ib_devices.lock);
+
+ spin_lock(&smcd_dev_list.lock);
+ list_for_each_entry(smcd, &smcd_dev_list.list, list) {
+ smcd->going_away = 1;
}
+ spin_unlock(&smcd_dev_list.lock);
+}
+
+/* Clean up all SMC link groups */
+static void smc_lgrs_shutdown(void)
+{
+ struct smcd_dev *smcd;
+
+ smc_core_going_away();
+
+ smc_smcr_terminate_all(NULL);
+
+ spin_lock(&smcd_dev_list.lock);
+ list_for_each_entry(smcd, &smcd_dev_list.list, list)
+ smc_smcd_terminate_all(smcd);
+ spin_unlock(&smcd_dev_list.lock);
+}
+
+static int smc_core_reboot_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ smc_lgrs_shutdown();
+ smc_ib_unregister_client();
+ return 0;
+}
+
+static struct notifier_block smc_reboot_notifier = {
+ .notifier_call = smc_core_reboot_event,
+};
+
+int __init smc_core_init(void)
+{
+ return register_reboot_notifier(&smc_reboot_notifier);
+}
+
+/* Called (from smc_exit) when module is removed */
+void smc_core_exit(void)
+{
+ unregister_reboot_notifier(&smc_reboot_notifier);
+ smc_lgrs_shutdown();
}
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index c00ac61dc129..c472e12951d1 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -202,8 +202,11 @@ struct smc_link_group {
u8 id[SMC_LGR_ID_SIZE]; /* unique lgr id */
struct delayed_work free_work; /* delayed freeing of an lgr */
+ struct work_struct terminate_work; /* abnormal lgr termination */
u8 sync_err : 1; /* lgr no longer fits to peer */
u8 terminating : 1;/* lgr is terminating */
+ u8 freefast : 1; /* free worker scheduled fast */
+ u8 freeing : 1; /* lgr is being freed */
bool is_smcd; /* SMC-R or SMC-D */
union {
@@ -225,6 +228,8 @@ struct smc_link_group {
/* Peer GID (remote) */
struct smcd_dev *smcd;
/* ISM device for VLAN reg. */
+ u8 peer_shutdown : 1;
+ /* peer triggered shutdownn */
};
};
};
@@ -280,15 +285,23 @@ static inline struct smc_connection *smc_lgr_find_conn(
return res;
}
+static inline void smc_lgr_terminate_sched(struct smc_link_group *lgr)
+{
+ if (!lgr->terminating && !lgr->freeing)
+ schedule_work(&lgr->terminate_work);
+}
+
struct smc_sock;
struct smc_clc_msg_accept_confirm;
struct smc_clc_msg_local;
void smc_lgr_forget(struct smc_link_group *lgr);
-void smc_lgr_terminate(struct smc_link_group *lgr);
+void smc_lgr_terminate(struct smc_link_group *lgr, bool soft);
void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport);
void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid,
unsigned short vlan);
+void smc_smcd_terminate_all(struct smcd_dev *dev);
+void smc_smcr_terminate_all(struct smc_ib_device *smcibdev);
int smc_buf_create(struct smc_sock *smc, bool is_smcd);
int smc_uncompress_bufsize(u8 compressed);
int smc_rmb_rtoken_handling(struct smc_connection *conn,
@@ -305,6 +318,7 @@ void smc_conn_free(struct smc_connection *conn);
int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini);
void smcd_conn_free(struct smc_connection *conn);
void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr);
+int smc_core_init(void);
void smc_core_exit(void);
static inline struct smc_link_group *smc_get_lgr(struct smc_link *link)
diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c
index f38727ecf8b2..e1f64f4ba236 100644
--- a/net/smc/smc_diag.c
+++ b/net/smc/smc_diag.c
@@ -39,16 +39,15 @@ static void smc_diag_msg_common_fill(struct smc_diag_msg *r, struct sock *sk)
{
struct smc_sock *smc = smc_sk(sk);
+ memset(r, 0, sizeof(*r));
r->diag_family = sk->sk_family;
+ sock_diag_save_cookie(sk, r->id.idiag_cookie);
if (!smc->clcsock)
return;
r->id.idiag_sport = htons(smc->clcsock->sk->sk_num);
r->id.idiag_dport = smc->clcsock->sk->sk_dport;
r->id.idiag_if = smc->clcsock->sk->sk_bound_dev_if;
- sock_diag_save_cookie(sk, r->id.idiag_cookie);
if (sk->sk_protocol == SMCPROTO_SMC) {
- memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src));
- memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));
r->id.idiag_src[0] = smc->clcsock->sk->sk_rcv_saddr;
r->id.idiag_dst[0] = smc->clcsock->sk->sk_daddr;
#if IS_ENABLED(CONFIG_IPV6)
diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
index d14ca4af6f94..548632621f4b 100644
--- a/net/smc/smc_ib.c
+++ b/net/smc/smc_ib.c
@@ -15,6 +15,7 @@
#include <linux/random.h>
#include <linux/workqueue.h>
#include <linux/scatterlist.h>
+#include <linux/wait.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_cache.h>
@@ -242,8 +243,12 @@ static void smc_ib_port_event_work(struct work_struct *work)
for_each_set_bit(port_idx, &smcibdev->port_event_mask, SMC_MAX_PORTS) {
smc_ib_remember_port_attr(smcibdev, port_idx + 1);
clear_bit(port_idx, &smcibdev->port_event_mask);
- if (!smc_ib_port_active(smcibdev, port_idx + 1))
+ if (!smc_ib_port_active(smcibdev, port_idx + 1)) {
+ set_bit(port_idx, smcibdev->ports_going_away);
smc_port_terminate(smcibdev, port_idx + 1);
+ } else {
+ clear_bit(port_idx, smcibdev->ports_going_away);
+ }
}
}
@@ -259,8 +264,10 @@ static void smc_ib_global_event_handler(struct ib_event_handler *handler,
switch (ibevent->event) {
case IB_EVENT_DEVICE_FATAL:
/* terminate all ports on device */
- for (port_idx = 0; port_idx < SMC_MAX_PORTS; port_idx++)
+ for (port_idx = 0; port_idx < SMC_MAX_PORTS; port_idx++) {
set_bit(port_idx, &smcibdev->port_event_mask);
+ set_bit(port_idx, smcibdev->ports_going_away);
+ }
schedule_work(&smcibdev->port_event_work);
break;
case IB_EVENT_PORT_ERR:
@@ -269,6 +276,10 @@ static void smc_ib_global_event_handler(struct ib_event_handler *handler,
port_idx = ibevent->element.port_num - 1;
if (port_idx < SMC_MAX_PORTS) {
set_bit(port_idx, &smcibdev->port_event_mask);
+ if (ibevent->event == IB_EVENT_PORT_ERR)
+ set_bit(port_idx, smcibdev->ports_going_away);
+ else if (ibevent->event == IB_EVENT_PORT_ACTIVE)
+ clear_bit(port_idx, smcibdev->ports_going_away);
schedule_work(&smcibdev->port_event_work);
}
break;
@@ -307,6 +318,7 @@ static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv)
port_idx = ibevent->element.qp->port - 1;
if (port_idx < SMC_MAX_PORTS) {
set_bit(port_idx, &smcibdev->port_event_mask);
+ set_bit(port_idx, smcibdev->ports_going_away);
schedule_work(&smcibdev->port_event_work);
}
break;
@@ -509,9 +521,9 @@ static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev)
if (!smcibdev->initialized)
return;
smcibdev->initialized = 0;
- smc_wr_remove_dev(smcibdev);
ib_destroy_cq(smcibdev->roce_cq_recv);
ib_destroy_cq(smcibdev->roce_cq_send);
+ smc_wr_remove_dev(smcibdev);
}
static struct ib_client smc_ib_client;
@@ -532,7 +544,8 @@ static void smc_ib_add_dev(struct ib_device *ibdev)
smcibdev->ibdev = ibdev;
INIT_WORK(&smcibdev->port_event_work, smc_ib_port_event_work);
-
+ atomic_set(&smcibdev->lnk_cnt, 0);
+ init_waitqueue_head(&smcibdev->lnks_deleted);
spin_lock(&smc_ib_devices.lock);
list_add_tail(&smcibdev->list, &smc_ib_devices.list);
spin_unlock(&smc_ib_devices.lock);
@@ -554,7 +567,7 @@ static void smc_ib_add_dev(struct ib_device *ibdev)
schedule_work(&smcibdev->port_event_work);
}
-/* callback function for ib_register_client() */
+/* callback function for ib_unregister_client() */
static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data)
{
struct smc_ib_device *smcibdev;
@@ -564,6 +577,7 @@ static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data)
spin_lock(&smc_ib_devices.lock);
list_del_init(&smcibdev->list); /* remove from smc_ib_devices */
spin_unlock(&smc_ib_devices.lock);
+ smc_smcr_terminate_all(smcibdev);
smc_ib_cleanup_per_ibdev(smcibdev);
ib_unregister_event_handler(&smcibdev->event_handler);
kfree(smcibdev);
diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h
index da60ab9e8d70..255db87547d3 100644
--- a/net/smc/smc_ib.h
+++ b/net/smc/smc_ib.h
@@ -14,6 +14,7 @@
#include <linux/interrupt.h>
#include <linux/if_ether.h>
+#include <linux/wait.h>
#include <rdma/ib_verbs.h>
#include <net/smc.h>
@@ -47,6 +48,9 @@ struct smc_ib_device { /* ib-device infos for smc */
u8 initialized : 1; /* ib dev CQ, evthdl done */
struct work_struct port_event_work;
unsigned long port_event_mask;
+ DECLARE_BITMAP(ports_going_away, SMC_MAX_PORTS);
+ atomic_t lnk_cnt; /* number of links on ibdev */
+ wait_queue_head_t lnks_deleted; /* wait 4 removal of all links*/
};
struct smc_buf_desc;
diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c
index e89e918b88e0..5c4727d5066e 100644
--- a/net/smc/smc_ism.c
+++ b/net/smc/smc_ism.c
@@ -146,6 +146,10 @@ out:
int smc_ism_unregister_dmb(struct smcd_dev *smcd, struct smc_buf_desc *dmb_desc)
{
struct smcd_dmb dmb;
+ int rc = 0;
+
+ if (!dmb_desc->dma_addr)
+ return rc;
memset(&dmb, 0, sizeof(dmb));
dmb.dmb_tok = dmb_desc->token;
@@ -153,7 +157,13 @@ int smc_ism_unregister_dmb(struct smcd_dev *smcd, struct smc_buf_desc *dmb_desc)
dmb.cpu_addr = dmb_desc->cpu_addr;
dmb.dma_addr = dmb_desc->dma_addr;
dmb.dmb_len = dmb_desc->len;
- return smcd->ops->unregister_dmb(smcd, &dmb);
+ rc = smcd->ops->unregister_dmb(smcd, &dmb);
+ if (!rc || rc == ISM_ERROR) {
+ dmb_desc->cpu_addr = NULL;
+ dmb_desc->dma_addr = 0;
+ }
+
+ return rc;
}
int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len,
@@ -226,6 +236,9 @@ int smc_ism_signal_shutdown(struct smc_link_group *lgr)
int rc;
union smcd_sw_event_info ev_info;
+ if (lgr->peer_shutdown)
+ return 0;
+
memcpy(ev_info.uid, lgr->id, SMC_LGR_ID_SIZE);
ev_info.vlan_id = lgr->vlan_id;
ev_info.code = ISM_EVENT_REQUEST;
@@ -286,7 +299,10 @@ struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name,
smc_pnetid_by_dev_port(parent, 0, smcd->pnetid);
spin_lock_init(&smcd->lock);
+ spin_lock_init(&smcd->lgr_lock);
INIT_LIST_HEAD(&smcd->vlan);
+ INIT_LIST_HEAD(&smcd->lgr_list);
+ init_waitqueue_head(&smcd->lgrs_deleted);
smcd->event_wq = alloc_ordered_workqueue("ism_evt_wq-%s)",
WQ_MEM_RECLAIM, name);
if (!smcd->event_wq) {
@@ -311,11 +327,12 @@ EXPORT_SYMBOL_GPL(smcd_register_dev);
void smcd_unregister_dev(struct smcd_dev *smcd)
{
spin_lock(&smcd_dev_list.lock);
- list_del(&smcd->list);
+ list_del_init(&smcd->list);
spin_unlock(&smcd_dev_list.lock);
+ smcd->going_away = 1;
+ smc_smcd_terminate_all(smcd);
flush_workqueue(smcd->event_wq);
destroy_workqueue(smcd->event_wq);
- smc_smcd_terminate(smcd, 0, VLAN_VID_MASK);
device_del(&smcd->dev);
}
@@ -342,6 +359,8 @@ void smcd_handle_event(struct smcd_dev *smcd, struct smcd_event *event)
{
struct smc_ism_event_work *wrk;
+ if (smcd->going_away)
+ return;
/* copy event to event work queue, and let it be handled there */
wrk = kmalloc(sizeof(*wrk), GFP_ATOMIC);
if (!wrk)
@@ -367,7 +386,7 @@ void smcd_handle_irq(struct smcd_dev *smcd, unsigned int dmbno)
spin_lock_irqsave(&smcd->lock, flags);
conn = smcd->conn[dmbno];
- if (conn)
+ if (conn && !conn->killed)
tasklet_schedule(&conn->rx_tsklet);
spin_unlock_irqrestore(&smcd->lock, flags);
}
diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c
index 4fd60c522802..a9f6431dd69a 100644
--- a/net/smc/smc_llc.c
+++ b/net/smc/smc_llc.c
@@ -475,7 +475,7 @@ static void smc_llc_rx_delete_link(struct smc_link *link,
smc_llc_prep_delete_link(llc, link, SMC_LLC_RESP, true);
}
smc_llc_send_message(link, llc, sizeof(*llc));
- smc_lgr_schedule_free_work_fast(lgr);
+ smc_lgr_terminate_sched(lgr);
}
}
@@ -614,7 +614,7 @@ static void smc_llc_testlink_work(struct work_struct *work)
rc = wait_for_completion_interruptible_timeout(&link->llc_testlink_resp,
SMC_LLC_WAIT_TIME);
if (rc <= 0) {
- smc_lgr_terminate(smc_get_lgr(link));
+ smc_lgr_terminate(smc_get_lgr(link), true);
return;
}
next_interval = link->llc_testlink_time;
@@ -656,6 +656,7 @@ void smc_llc_link_active(struct smc_link *link, int testlink_time)
void smc_llc_link_deleting(struct smc_link *link)
{
link->state = SMC_LNK_DELETING;
+ smc_wr_wakeup_tx_wait(link);
}
/* called in tasklet context */
@@ -663,6 +664,8 @@ void smc_llc_link_inactive(struct smc_link *link)
{
link->state = SMC_LNK_INACTIVE;
cancel_delayed_work(&link->llc_testlink_wrk);
+ smc_wr_wakeup_reg_wait(link);
+ smc_wr_wakeup_tx_wait(link);
}
/* called in worker context */
@@ -695,9 +698,11 @@ int smc_llc_do_confirm_rkey(struct smc_link *link,
int smc_llc_do_delete_rkey(struct smc_link *link,
struct smc_buf_desc *rmb_desc)
{
- int rc;
+ int rc = 0;
mutex_lock(&link->llc_delete_rkey_mutex);
+ if (link->state != SMC_LNK_ACTIVE)
+ goto out;
reinit_completion(&link->llc_delete_rkey);
rc = smc_llc_send_delete_rkey(link, rmb_desc);
if (rc)
diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c
index bab2da8cf17a..2a5ed47c3e08 100644
--- a/net/smc/smc_pnet.c
+++ b/net/smc/smc_pnet.c
@@ -376,8 +376,6 @@ static int smc_pnet_fill_entry(struct net *net,
return 0;
error:
- if (pnetelem->ndev)
- dev_put(pnetelem->ndev);
return rc;
}
@@ -613,7 +611,7 @@ static const struct genl_ops smc_pnet_ops[] = {
{
.cmd = SMC_PNETID_GET,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
- .flags = GENL_ADMIN_PERM,
+ /* can be retrieved by unprivileged users */
.doit = smc_pnet_get,
.dumpit = smc_pnet_dump,
.start = smc_pnet_dump_start
@@ -718,7 +716,7 @@ static struct net_device *pnet_find_base_ndev(struct net_device *ndev)
int i, nest_lvl;
rtnl_lock();
- nest_lvl = dev_get_nest_level(ndev);
+ nest_lvl = ndev->lower_level;
for (i = 0; i < nest_lvl; i++) {
struct list_head *lower = &ndev->adj_list.lower;
@@ -781,6 +779,7 @@ static void smc_pnet_find_rdma_dev(struct net_device *netdev,
dev_put(ndev);
if (netdev == ndev &&
smc_ib_port_active(ibdev, i) &&
+ !test_bit(i - 1, ibdev->ports_going_away) &&
!smc_ib_determine_gid(ibdev, i, ini->vlan_id,
ini->ib_gid, NULL)) {
ini->ib_dev = ibdev;
@@ -820,6 +819,7 @@ static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev,
continue;
if (smc_pnet_match(ibdev->pnetid[i - 1], ndev_pnetid) &&
smc_ib_port_active(ibdev, i) &&
+ !test_bit(i - 1, ibdev->ports_going_away) &&
!smc_ib_determine_gid(ibdev, i, ini->vlan_id,
ini->ib_gid, NULL)) {
ini->ib_dev = ibdev;
@@ -846,7 +846,8 @@ static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev,
spin_lock(&smcd_dev_list.lock);
list_for_each_entry(ismdev, &smcd_dev_list.list, list) {
- if (smc_pnet_match(ismdev->pnetid, ndev_pnetid)) {
+ if (smc_pnet_match(ismdev->pnetid, ndev_pnetid) &&
+ !ismdev->going_away) {
ini->ism_dev = ismdev;
break;
}
diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c
index 413a6abf227e..39d7b34d06d2 100644
--- a/net/smc/smc_rx.c
+++ b/net/smc/smc_rx.c
@@ -201,6 +201,8 @@ int smc_rx_wait(struct smc_sock *smc, long *timeo,
{
DEFINE_WAIT_FUNC(wait, woken_wake_function);
struct smc_connection *conn = &smc->conn;
+ struct smc_cdc_conn_state_flags *cflags =
+ &conn->local_tx_ctrl.conn_state_flags;
struct sock *sk = &smc->sk;
int rc;
@@ -210,9 +212,10 @@ int smc_rx_wait(struct smc_sock *smc, long *timeo,
add_wait_queue(sk_sleep(sk), &wait);
rc = sk_wait_event(sk, timeo,
sk->sk_err ||
+ cflags->peer_conn_abort ||
sk->sk_shutdown & RCV_SHUTDOWN ||
- fcrit(conn) ||
- smc_cdc_rxed_any_close_or_senddone(conn),
+ conn->killed ||
+ fcrit(conn),
&wait);
remove_wait_queue(sk_sleep(sk), &wait);
sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
@@ -262,6 +265,18 @@ static int smc_rx_recv_urg(struct smc_sock *smc, struct msghdr *msg, int len,
return -EAGAIN;
}
+static bool smc_rx_recvmsg_data_available(struct smc_sock *smc)
+{
+ struct smc_connection *conn = &smc->conn;
+
+ if (smc_rx_data_available(conn))
+ return true;
+ else if (conn->urg_state == SMC_URG_VALID)
+ /* we received a single urgent Byte - skip */
+ smc_rx_update_cons(smc, 0);
+ return false;
+}
+
/* smc_rx_recvmsg - receive data from RMBE
* @msg: copy data to receive buffer
* @pipe: copy data to pipe if set - indicates splice() call
@@ -303,16 +318,20 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg,
if (read_done >= target || (pipe && read_done))
break;
- if (atomic_read(&conn->bytes_to_rcv))
+ if (conn->killed)
+ break;
+
+ if (smc_rx_recvmsg_data_available(smc))
goto copy;
- else if (conn->urg_state == SMC_URG_VALID)
- /* we received a single urgent Byte - skip */
- smc_rx_update_cons(smc, 0);
- if (sk->sk_shutdown & RCV_SHUTDOWN ||
- smc_cdc_rxed_any_close_or_senddone(conn) ||
- conn->local_tx_ctrl.conn_state_flags.peer_conn_abort)
+ if (sk->sk_shutdown & RCV_SHUTDOWN) {
+ /* smc_cdc_msg_recv_action() could have run after
+ * above smc_rx_recvmsg_data_available()
+ */
+ if (smc_rx_recvmsg_data_available(smc))
+ goto copy;
break;
+ }
if (read_done) {
if (sk->sk_err ||
diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
index f0de323d15d6..0d42e7716b91 100644
--- a/net/smc/smc_tx.c
+++ b/net/smc/smc_tx.c
@@ -76,18 +76,17 @@ static int smc_tx_wait(struct smc_sock *smc, int flags)
DEFINE_WAIT_FUNC(wait, woken_wake_function);
struct smc_connection *conn = &smc->conn;
struct sock *sk = &smc->sk;
- bool noblock;
long timeo;
int rc = 0;
/* similar to sk_stream_wait_memory */
timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
- noblock = timeo ? false : true;
add_wait_queue(sk_sleep(sk), &wait);
while (1) {
sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
if (sk->sk_err ||
(sk->sk_shutdown & SEND_SHUTDOWN) ||
+ conn->killed ||
conn->local_tx_ctrl.conn_state_flags.peer_done_writing) {
rc = -EPIPE;
break;
@@ -97,8 +96,8 @@ static int smc_tx_wait(struct smc_sock *smc, int flags)
break;
}
if (!timeo) {
- if (noblock)
- set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ /* ensure EPOLLOUT is subsequently generated */
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
rc = -EAGAIN;
break;
}
@@ -157,7 +156,7 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
return -ENOTCONN;
if (smc->sk.sk_shutdown & SEND_SHUTDOWN ||
(smc->sk.sk_err == ECONNABORTED) ||
- conn->local_tx_ctrl.conn_state_flags.peer_conn_abort)
+ conn->killed)
return -EPIPE;
if (smc_cdc_rxed_any_close(conn))
return send_done ?: -ECONNRESET;
@@ -284,10 +283,8 @@ static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset,
peer_rmbe_offset;
rdma_wr->rkey = lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey;
rc = ib_post_send(link->roce_qp, &rdma_wr->wr, NULL);
- if (rc) {
- conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
- smc_lgr_terminate(lgr);
- }
+ if (rc)
+ smc_lgr_terminate(lgr, true);
return rc;
}
@@ -497,10 +494,11 @@ static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn)
if (smc->sk.sk_err == ECONNABORTED)
return sock_error(&smc->sk);
+ if (conn->killed)
+ return -EPIPE;
rc = 0;
- if (conn->alert_token_local) /* connection healthy */
- mod_delayed_work(system_wq, &conn->tx_work,
- SMC_TX_WORK_DELAY);
+ mod_delayed_work(system_wq, &conn->tx_work,
+ SMC_TX_WORK_DELAY);
}
return rc;
}
@@ -549,6 +547,9 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
{
int rc;
+ if (conn->killed ||
+ conn->local_rx_ctrl.conn_state_flags.peer_conn_abort)
+ return -EPIPE; /* connection being aborted */
if (conn->lgr->is_smcd)
rc = smcd_tx_sndbuf_nonempty(conn);
else
@@ -575,9 +576,7 @@ void smc_tx_work(struct work_struct *work)
int rc;
lock_sock(&smc->sk);
- if (smc->sk.sk_err ||
- !conn->alert_token_local ||
- conn->local_rx_ctrl.conn_state_flags.peer_conn_abort)
+ if (smc->sk.sk_err)
goto out;
rc = smc_tx_sndbuf_nonempty(conn);
@@ -610,8 +609,11 @@ void smc_tx_consumer_update(struct smc_connection *conn, bool force)
((to_confirm > conn->rmbe_update_limit) &&
((sender_free <= (conn->rmb_desc->len / 2)) ||
conn->local_rx_ctrl.prod_flags.write_blocked))) {
+ if (conn->killed ||
+ conn->local_rx_ctrl.conn_state_flags.peer_conn_abort)
+ return;
if ((smc_cdc_get_slot_and_msg_send(conn) < 0) &&
- conn->alert_token_local) { /* connection healthy */
+ !conn->killed) {
schedule_delayed_work(&conn->tx_work,
SMC_TX_WORK_DELAY);
return;
diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
index 253aa75dc2b6..337ee52ad3d3 100644
--- a/net/smc/smc_wr.c
+++ b/net/smc/smc_wr.c
@@ -50,6 +50,26 @@ struct smc_wr_tx_pend { /* control data for a pending send request */
/*------------------------------- completion --------------------------------*/
+/* returns true if at least one tx work request is pending on the given link */
+static inline bool smc_wr_is_tx_pend(struct smc_link *link)
+{
+ if (find_first_bit(link->wr_tx_mask, link->wr_tx_cnt) !=
+ link->wr_tx_cnt) {
+ return true;
+ }
+ return false;
+}
+
+/* wait till all pending tx work requests on the given link are completed */
+static inline int smc_wr_tx_wait_no_pending_sends(struct smc_link *link)
+{
+ if (wait_event_timeout(link->wr_tx_wait, !smc_wr_is_tx_pend(link),
+ SMC_WR_TX_WAIT_PENDING_TIME))
+ return 0;
+ else /* timeout */
+ return -EPIPE;
+}
+
static inline int smc_wr_tx_find_pending_index(struct smc_link *link, u64 wr_id)
{
u32 i;
@@ -75,7 +95,7 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc)
link->wr_reg_state = FAILED;
else
link->wr_reg_state = CONFIRMED;
- wake_up(&link->wr_reg_wait);
+ smc_wr_wakeup_reg_wait(link);
return;
}
@@ -101,7 +121,7 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc)
clear_bit(i, link->wr_tx_mask);
}
/* terminate connections of this link group abnormally */
- smc_lgr_terminate(smc_get_lgr(link));
+ smc_lgr_terminate_sched(smc_get_lgr(link));
}
if (pnd_snd.handler)
pnd_snd.handler(&pnd_snd.priv, link, wc->status);
@@ -171,6 +191,7 @@ int smc_wr_tx_get_free_slot(struct smc_link *link,
struct smc_rdma_wr **wr_rdma_buf,
struct smc_wr_tx_pend_priv **wr_pend_priv)
{
+ struct smc_link_group *lgr = smc_get_lgr(link);
struct smc_wr_tx_pend *wr_pend;
u32 idx = link->wr_tx_cnt;
struct ib_send_wr *wr_ib;
@@ -179,19 +200,20 @@ int smc_wr_tx_get_free_slot(struct smc_link *link,
*wr_buf = NULL;
*wr_pend_priv = NULL;
- if (in_softirq()) {
+ if (in_softirq() || lgr->terminating) {
rc = smc_wr_tx_get_free_slot_index(link, &idx);
if (rc)
return rc;
} else {
- rc = wait_event_timeout(
+ rc = wait_event_interruptible_timeout(
link->wr_tx_wait,
link->state == SMC_LNK_INACTIVE ||
+ lgr->terminating ||
(smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY),
SMC_WR_TX_WAIT_FREE_SLOT_TIME);
if (!rc) {
/* timeout - terminate connections */
- smc_lgr_terminate(smc_get_lgr(link));
+ smc_lgr_terminate_sched(lgr);
return -EPIPE;
}
if (idx == link->wr_tx_cnt)
@@ -227,6 +249,7 @@ int smc_wr_tx_put_slot(struct smc_link *link,
memset(&link->wr_tx_bufs[idx], 0,
sizeof(link->wr_tx_bufs[idx]));
test_and_clear_bit(idx, link->wr_tx_mask);
+ wake_up(&link->wr_tx_wait);
return 1;
}
@@ -247,7 +270,7 @@ int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv)
rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx], NULL);
if (rc) {
smc_wr_tx_put_slot(link, priv);
- smc_lgr_terminate(smc_get_lgr(link));
+ smc_lgr_terminate_sched(smc_get_lgr(link));
}
return rc;
}
@@ -272,7 +295,7 @@ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr)
SMC_WR_REG_MR_WAIT_TIME);
if (!rc) {
/* timeout - terminate connections */
- smc_lgr_terminate(smc_get_lgr(link));
+ smc_lgr_terminate_sched(smc_get_lgr(link));
return -EPIPE;
}
if (rc == -ERESTARTSYS)
@@ -373,7 +396,7 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
/* terminate connections of this link group
* abnormally
*/
- smc_lgr_terminate(smc_get_lgr(link));
+ smc_lgr_terminate_sched(smc_get_lgr(link));
break;
default:
smc_wr_rx_post(link); /* refill WR RX */
@@ -510,8 +533,10 @@ void smc_wr_free_link(struct smc_link *lnk)
{
struct ib_device *ibdev;
- memset(lnk->wr_tx_mask, 0,
- BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*lnk->wr_tx_mask));
+ if (smc_wr_tx_wait_no_pending_sends(lnk))
+ memset(lnk->wr_tx_mask, 0,
+ BITS_TO_LONGS(SMC_WR_BUF_CNT) *
+ sizeof(*lnk->wr_tx_mask));
if (!lnk->smcibdev)
return;
diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h
index 09bf32fd3959..3ac99c898418 100644
--- a/net/smc/smc_wr.h
+++ b/net/smc/smc_wr.h
@@ -60,6 +60,16 @@ static inline void smc_wr_tx_set_wr_id(atomic_long_t *wr_tx_id, long val)
atomic_long_set(wr_tx_id, val);
}
+static inline void smc_wr_wakeup_tx_wait(struct smc_link *lnk)
+{
+ wake_up_all(&lnk->wr_tx_wait);
+}
+
+static inline void smc_wr_wakeup_reg_wait(struct smc_link *lnk)
+{
+ wake_up(&lnk->wr_reg_wait);
+}
+
/* post a new receive work request to fill a completed old work request entry */
static inline int smc_wr_rx_post(struct smc_link *link)
{
diff --git a/net/socket.c b/net/socket.c
index 6a9ab7a8b1d2..b79a05de7c6e 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -100,6 +100,7 @@
#include <linux/if_tun.h>
#include <linux/ipv6_route.h>
#include <linux/route.h>
+#include <linux/termios.h>
#include <linux/sockios.h>
#include <net/busy_poll.h>
#include <linux/errqueue.h>
@@ -128,6 +129,18 @@ static ssize_t sock_splice_read(struct file *file, loff_t *ppos,
struct pipe_inode_info *pipe, size_t len,
unsigned int flags);
+#ifdef CONFIG_PROC_FS
+static void sock_show_fdinfo(struct seq_file *m, struct file *f)
+{
+ struct socket *sock = f->private_data;
+
+ if (sock->ops->show_fdinfo)
+ sock->ops->show_fdinfo(m, sock);
+}
+#else
+#define sock_show_fdinfo NULL
+#endif
+
/*
* Socket files have a set of 'special' operations as well as the generic file ones. These don't appear
* in the operation structures but are done directly via the socketcall() multiplexor.
@@ -149,6 +162,7 @@ static const struct file_operations socket_file_ops = {
.sendpage = sock_sendpage,
.splice_write = generic_splice_sendpage,
.splice_read = sock_splice_read,
+ .show_fdinfo = sock_show_fdinfo,
};
/*
@@ -404,6 +418,7 @@ struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname)
sock->file = file;
file->private_data = sock;
+ stream_open(SOCK_INODE(sock), file);
return file;
}
EXPORT_SYMBOL(sock_alloc_file);
@@ -793,7 +808,7 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_NEW,
sizeof(ts), &ts);
} else {
- struct timespec ts;
+ struct __kernel_old_timespec ts;
skb_get_timestampns(skb, &ts);
put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD,
@@ -955,7 +970,7 @@ static ssize_t sock_read_iter(struct kiocb *iocb, struct iov_iter *to)
.msg_iocb = iocb};
ssize_t res;
- if (file->f_flags & O_NONBLOCK)
+ if (file->f_flags & O_NONBLOCK || (iocb->ki_flags & IOCB_NOWAIT))
msg.msg_flags = MSG_DONTWAIT;
if (iocb->ki_pos != 0)
@@ -980,7 +995,7 @@ static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (iocb->ki_pos != 0)
return -ESPIPE;
- if (file->f_flags & O_NONBLOCK)
+ if (file->f_flags & O_NONBLOCK || (iocb->ki_flags & IOCB_NOWAIT))
msg.msg_flags = MSG_DONTWAIT;
if (sock->type == SOCK_SEQPACKET)
@@ -1690,24 +1705,13 @@ SYSCALL_DEFINE2(listen, int, fd, int, backlog)
return __sys_listen(fd, backlog);
}
-/*
- * For accept, we attempt to create a new socket, set up the link
- * with the client, wake up the client, then return the new
- * connected fd. We collect the address of the connector in kernel
- * space and move it to user at the very end. This is unclean because
- * we open the socket then return an error.
- *
- * 1003.1g adds the ability to recvmsg() to query connection pending
- * status to recvmsg. We need to add that support in a way thats
- * clean when we restructure accept also.
- */
-
-int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr,
- int __user *upeer_addrlen, int flags)
+int __sys_accept4_file(struct file *file, unsigned file_flags,
+ struct sockaddr __user *upeer_sockaddr,
+ int __user *upeer_addrlen, int flags)
{
struct socket *sock, *newsock;
struct file *newfile;
- int err, len, newfd, fput_needed;
+ int err, len, newfd;
struct sockaddr_storage address;
if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
@@ -1716,14 +1720,14 @@ int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr,
if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
- sock = sockfd_lookup_light(fd, &err, &fput_needed);
+ sock = sock_from_file(file, &err);
if (!sock)
goto out;
err = -ENFILE;
newsock = sock_alloc();
if (!newsock)
- goto out_put;
+ goto out;
newsock->type = sock->type;
newsock->ops = sock->ops;
@@ -1738,20 +1742,21 @@ int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr,
if (unlikely(newfd < 0)) {
err = newfd;
sock_release(newsock);
- goto out_put;
+ goto out;
}
newfile = sock_alloc_file(newsock, flags, sock->sk->sk_prot_creator->name);
if (IS_ERR(newfile)) {
err = PTR_ERR(newfile);
put_unused_fd(newfd);
- goto out_put;
+ goto out;
}
err = security_socket_accept(sock, newsock);
if (err)
goto out_fd;
- err = sock->ops->accept(sock, newsock, sock->file->f_flags, false);
+ err = sock->ops->accept(sock, newsock, sock->file->f_flags | file_flags,
+ false);
if (err < 0)
goto out_fd;
@@ -1772,15 +1777,42 @@ int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr,
fd_install(newfd, newfile);
err = newfd;
-
-out_put:
- fput_light(sock->file, fput_needed);
out:
return err;
out_fd:
fput(newfile);
put_unused_fd(newfd);
- goto out_put;
+ goto out;
+
+}
+
+/*
+ * For accept, we attempt to create a new socket, set up the link
+ * with the client, wake up the client, then return the new
+ * connected fd. We collect the address of the connector in kernel
+ * space and move it to user at the very end. This is unclean because
+ * we open the socket then return an error.
+ *
+ * 1003.1g adds the ability to recvmsg() to query connection pending
+ * status to recvmsg. We need to add that support in a way thats
+ * clean when we restructure accept also.
+ */
+
+int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr,
+ int __user *upeer_addrlen, int flags)
+{
+ int ret = -EBADF;
+ struct fd f;
+
+ f = fdget(fd);
+ if (f.file) {
+ ret = __sys_accept4_file(f.file, 0, upeer_sockaddr,
+ upeer_addrlen, flags);
+ if (f.flags)
+ fput(f.file);
+ }
+
+ return ret;
}
SYSCALL_DEFINE4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr,
@@ -1807,32 +1839,46 @@ SYSCALL_DEFINE3(accept, int, fd, struct sockaddr __user *, upeer_sockaddr,
* include the -EINPROGRESS status for such sockets.
*/
-int __sys_connect(int fd, struct sockaddr __user *uservaddr, int addrlen)
+int __sys_connect_file(struct file *file, struct sockaddr_storage *address,
+ int addrlen, int file_flags)
{
struct socket *sock;
- struct sockaddr_storage address;
- int err, fput_needed;
+ int err;
- sock = sockfd_lookup_light(fd, &err, &fput_needed);
+ sock = sock_from_file(file, &err);
if (!sock)
goto out;
- err = move_addr_to_kernel(uservaddr, addrlen, &address);
- if (err < 0)
- goto out_put;
err =
- security_socket_connect(sock, (struct sockaddr *)&address, addrlen);
+ security_socket_connect(sock, (struct sockaddr *)address, addrlen);
if (err)
- goto out_put;
+ goto out;
- err = sock->ops->connect(sock, (struct sockaddr *)&address, addrlen,
- sock->file->f_flags);
-out_put:
- fput_light(sock->file, fput_needed);
+ err = sock->ops->connect(sock, (struct sockaddr *)address, addrlen,
+ sock->file->f_flags | file_flags);
out:
return err;
}
+int __sys_connect(int fd, struct sockaddr __user *uservaddr, int addrlen)
+{
+ int ret = -EBADF;
+ struct fd f;
+
+ f = fdget(fd);
+ if (f.file) {
+ struct sockaddr_storage address;
+
+ ret = move_addr_to_kernel(uservaddr, addrlen, &address);
+ if (!ret)
+ ret = __sys_connect_file(f.file, &address, addrlen, 0);
+ if (f.flags)
+ fput(f.file);
+ }
+
+ return ret;
+}
+
SYSCALL_DEFINE3(connect, int, fd, struct sockaddr __user *, uservaddr,
int, addrlen)
{
@@ -2232,15 +2278,10 @@ static int copy_msghdr_from_user(struct msghdr *kmsg,
return err < 0 ? err : 0;
}
-static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg,
- struct msghdr *msg_sys, unsigned int flags,
- struct used_address *used_address,
- unsigned int allowed_msghdr_flags)
+static int ____sys_sendmsg(struct socket *sock, struct msghdr *msg_sys,
+ unsigned int flags, struct used_address *used_address,
+ unsigned int allowed_msghdr_flags)
{
- struct compat_msghdr __user *msg_compat =
- (struct compat_msghdr __user *)msg;
- struct sockaddr_storage address;
- struct iovec iovstack[UIO_FASTIOV], *iov = iovstack;
unsigned char ctl[sizeof(struct cmsghdr) + 20]
__aligned(sizeof(__kernel_size_t));
/* 20 is size of ipv6_pktinfo */
@@ -2248,19 +2289,10 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg,
int ctl_len;
ssize_t err;
- msg_sys->msg_name = &address;
-
- if (MSG_CMSG_COMPAT & flags)
- err = get_compat_msghdr(msg_sys, msg_compat, NULL, &iov);
- else
- err = copy_msghdr_from_user(msg_sys, msg, NULL, &iov);
- if (err < 0)
- return err;
-
err = -ENOBUFS;
if (msg_sys->msg_controllen > INT_MAX)
- goto out_freeiov;
+ goto out;
flags |= (msg_sys->msg_flags & allowed_msghdr_flags);
ctl_len = msg_sys->msg_controllen;
if ((MSG_CMSG_COMPAT & flags) && ctl_len) {
@@ -2268,7 +2300,7 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg,
cmsghdr_from_user_compat_to_kern(msg_sys, sock->sk, ctl,
sizeof(ctl));
if (err)
- goto out_freeiov;
+ goto out;
ctl_buf = msg_sys->msg_control;
ctl_len = msg_sys->msg_controllen;
} else if (ctl_len) {
@@ -2277,7 +2309,7 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg,
if (ctl_len > sizeof(ctl)) {
ctl_buf = sock_kmalloc(sock->sk, ctl_len, GFP_KERNEL);
if (ctl_buf == NULL)
- goto out_freeiov;
+ goto out;
}
err = -EFAULT;
/*
@@ -2323,7 +2355,47 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg,
out_freectl:
if (ctl_buf != ctl)
sock_kfree_s(sock->sk, ctl_buf, ctl_len);
-out_freeiov:
+out:
+ return err;
+}
+
+int sendmsg_copy_msghdr(struct msghdr *msg,
+ struct user_msghdr __user *umsg, unsigned flags,
+ struct iovec **iov)
+{
+ int err;
+
+ if (flags & MSG_CMSG_COMPAT) {
+ struct compat_msghdr __user *msg_compat;
+
+ msg_compat = (struct compat_msghdr __user *) umsg;
+ err = get_compat_msghdr(msg, msg_compat, NULL, iov);
+ } else {
+ err = copy_msghdr_from_user(msg, umsg, NULL, iov);
+ }
+ if (err < 0)
+ return err;
+
+ return 0;
+}
+
+static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg,
+ struct msghdr *msg_sys, unsigned int flags,
+ struct used_address *used_address,
+ unsigned int allowed_msghdr_flags)
+{
+ struct sockaddr_storage address;
+ struct iovec iovstack[UIO_FASTIOV], *iov = iovstack;
+ ssize_t err;
+
+ msg_sys->msg_name = &address;
+
+ err = sendmsg_copy_msghdr(msg_sys, msg, flags, &iov);
+ if (err < 0)
+ return err;
+
+ err = ____sys_sendmsg(sock, msg_sys, flags, used_address,
+ allowed_msghdr_flags);
kfree(iov);
return err;
}
@@ -2331,12 +2403,14 @@ out_freeiov:
/*
* BSD sendmsg interface
*/
-long __sys_sendmsg_sock(struct socket *sock, struct user_msghdr __user *msg,
+long __sys_sendmsg_sock(struct socket *sock, struct msghdr *msg,
unsigned int flags)
{
- struct msghdr msg_sys;
+ /* disallow ancillary data requests from this path */
+ if (msg->msg_control || msg->msg_controllen)
+ return -EINVAL;
- return ___sys_sendmsg(sock, msg, &msg_sys, flags, NULL, 0);
+ return ____sys_sendmsg(sock, msg, flags, NULL, 0);
}
long __sys_sendmsg(int fd, struct user_msghdr __user *msg, unsigned int flags,
@@ -2442,33 +2516,41 @@ SYSCALL_DEFINE4(sendmmsg, int, fd, struct mmsghdr __user *, mmsg,
return __sys_sendmmsg(fd, mmsg, vlen, flags, true);
}
-static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg,
- struct msghdr *msg_sys, unsigned int flags, int nosec)
+int recvmsg_copy_msghdr(struct msghdr *msg,
+ struct user_msghdr __user *umsg, unsigned flags,
+ struct sockaddr __user **uaddr,
+ struct iovec **iov)
{
- struct compat_msghdr __user *msg_compat =
- (struct compat_msghdr __user *)msg;
- struct iovec iovstack[UIO_FASTIOV];
- struct iovec *iov = iovstack;
- unsigned long cmsg_ptr;
- int len;
ssize_t err;
- /* kernel mode address */
- struct sockaddr_storage addr;
-
- /* user mode address pointers */
- struct sockaddr __user *uaddr;
- int __user *uaddr_len = COMPAT_NAMELEN(msg);
-
- msg_sys->msg_name = &addr;
+ if (MSG_CMSG_COMPAT & flags) {
+ struct compat_msghdr __user *msg_compat;
- if (MSG_CMSG_COMPAT & flags)
- err = get_compat_msghdr(msg_sys, msg_compat, &uaddr, &iov);
- else
- err = copy_msghdr_from_user(msg_sys, msg, &uaddr, &iov);
+ msg_compat = (struct compat_msghdr __user *) umsg;
+ err = get_compat_msghdr(msg, msg_compat, uaddr, iov);
+ } else {
+ err = copy_msghdr_from_user(msg, umsg, uaddr, iov);
+ }
if (err < 0)
return err;
+ return 0;
+}
+
+static int ____sys_recvmsg(struct socket *sock, struct msghdr *msg_sys,
+ struct user_msghdr __user *msg,
+ struct sockaddr __user *uaddr,
+ unsigned int flags, int nosec)
+{
+ struct compat_msghdr __user *msg_compat =
+ (struct compat_msghdr __user *) msg;
+ int __user *uaddr_len = COMPAT_NAMELEN(msg);
+ struct sockaddr_storage addr;
+ unsigned long cmsg_ptr;
+ int len;
+ ssize_t err;
+
+ msg_sys->msg_name = &addr;
cmsg_ptr = (unsigned long)msg_sys->msg_control;
msg_sys->msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT);
@@ -2477,9 +2559,14 @@ static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg,
if (sock->file->f_flags & O_NONBLOCK)
flags |= MSG_DONTWAIT;
- err = (nosec ? sock_recvmsg_nosec : sock_recvmsg)(sock, msg_sys, flags);
+
+ if (unlikely(nosec))
+ err = sock_recvmsg_nosec(sock, msg_sys, flags);
+ else
+ err = sock_recvmsg(sock, msg_sys, flags);
+
if (err < 0)
- goto out_freeiov;
+ goto out;
len = err;
if (uaddr != NULL) {
@@ -2487,12 +2574,12 @@ static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg,
msg_sys->msg_namelen, uaddr,
uaddr_len);
if (err < 0)
- goto out_freeiov;
+ goto out;
}
err = __put_user((msg_sys->msg_flags & ~MSG_CMSG_COMPAT),
COMPAT_FLAGS(msg));
if (err)
- goto out_freeiov;
+ goto out;
if (MSG_CMSG_COMPAT & flags)
err = __put_user((unsigned long)msg_sys->msg_control - cmsg_ptr,
&msg_compat->msg_controllen);
@@ -2500,10 +2587,25 @@ static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg,
err = __put_user((unsigned long)msg_sys->msg_control - cmsg_ptr,
&msg->msg_controllen);
if (err)
- goto out_freeiov;
+ goto out;
err = len;
+out:
+ return err;
+}
-out_freeiov:
+static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg,
+ struct msghdr *msg_sys, unsigned int flags, int nosec)
+{
+ struct iovec iovstack[UIO_FASTIOV], *iov = iovstack;
+ /* user mode address pointers */
+ struct sockaddr __user *uaddr;
+ ssize_t err;
+
+ err = recvmsg_copy_msghdr(msg_sys, msg, flags, &uaddr, &iov);
+ if (err < 0)
+ return err;
+
+ err = ____sys_recvmsg(sock, msg_sys, msg, uaddr, flags, nosec);
kfree(iov);
return err;
}
@@ -2512,12 +2614,15 @@ out_freeiov:
* BSD recvmsg interface
*/
-long __sys_recvmsg_sock(struct socket *sock, struct user_msghdr __user *msg,
- unsigned int flags)
+long __sys_recvmsg_sock(struct socket *sock, struct msghdr *msg,
+ struct user_msghdr __user *umsg,
+ struct sockaddr __user *uaddr, unsigned int flags)
{
- struct msghdr msg_sys;
+ /* disallow ancillary data requests from this path */
+ if (msg->msg_control || msg->msg_controllen)
+ return -EINVAL;
- return ___sys_recvmsg(sock, msg, &msg_sys, flags, 0);
+ return ____sys_recvmsg(sock, msg, umsg, uaddr, flags, 0);
}
long __sys_recvmsg(int fd, struct user_msghdr __user *msg, unsigned int flags,
@@ -2833,7 +2938,7 @@ SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args)
a[2], true);
break;
case SYS_RECVMMSG:
- if (IS_ENABLED(CONFIG_64BIT) || !IS_ENABLED(CONFIG_64BIT_TIME))
+ if (IS_ENABLED(CONFIG_64BIT))
err = __sys_recvmmsg(a0, (struct mmsghdr __user *)a1,
a[2], a[3],
(struct __kernel_timespec __user *)a[4],
@@ -3452,6 +3557,8 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock,
case SIOCSARP:
case SIOCGARP:
case SIOCDARP:
+ case SIOCOUTQ:
+ case SIOCOUTQNSD:
case SIOCATMARK:
return sock_do_ioctl(net, sock, cmd, arg);
}
diff --git a/net/sunrpc/addr.c b/net/sunrpc/addr.c
index d024af4be85e..8b4d72b1a066 100644
--- a/net/sunrpc/addr.c
+++ b/net/sunrpc/addr.c
@@ -175,7 +175,7 @@ static int rpc_parse_scope_id(struct net *net, const char *buf,
return 0;
len = (buf + buflen) - delim - 1;
- p = kstrndup(delim + 1, len, GFP_KERNEL);
+ p = kmemdup_nul(delim + 1, len, GFP_KERNEL);
if (p) {
u32 scope_id = 0;
struct net_device *dev;
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index cdb05b48de44..5748ad0ba1bd 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -221,55 +221,6 @@ rpcauth_get_gssinfo(rpc_authflavor_t pseudoflavor, struct rpcsec_gss_info *info)
}
EXPORT_SYMBOL_GPL(rpcauth_get_gssinfo);
-/**
- * rpcauth_list_flavors - discover registered flavors and pseudoflavors
- * @array: array to fill in
- * @size: size of "array"
- *
- * Returns the number of array items filled in, or a negative errno.
- *
- * The returned array is not sorted by any policy. Callers should not
- * rely on the order of the items in the returned array.
- */
-int
-rpcauth_list_flavors(rpc_authflavor_t *array, int size)
-{
- const struct rpc_authops *ops;
- rpc_authflavor_t flavor, pseudos[4];
- int i, len, result = 0;
-
- rcu_read_lock();
- for (flavor = 0; flavor < RPC_AUTH_MAXFLAVOR; flavor++) {
- ops = rcu_dereference(auth_flavors[flavor]);
- if (result >= size) {
- result = -ENOMEM;
- break;
- }
-
- if (ops == NULL)
- continue;
- if (ops->list_pseudoflavors == NULL) {
- array[result++] = ops->au_flavor;
- continue;
- }
- len = ops->list_pseudoflavors(pseudos, ARRAY_SIZE(pseudos));
- if (len < 0) {
- result = len;
- break;
- }
- for (i = 0; i < len; i++) {
- if (result >= size) {
- result = -ENOMEM;
- break;
- }
- array[result++] = pseudos[i];
- }
- }
- rcu_read_unlock();
- return result;
-}
-EXPORT_SYMBOL_GPL(rpcauth_list_flavors);
-
struct rpc_auth *
rpcauth_create(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
{
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 4ce42c62458e..24ca861815b1 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -1960,7 +1960,7 @@ gss_unwrap_resp_integ(struct rpc_task *task, struct rpc_cred *cred,
if (xdr_buf_subsegment(rcv_buf, &integ_buf, data_offset, integ_len))
goto unwrap_failed;
- if (xdr_buf_read_netobj(rcv_buf, &mic, mic_offset))
+ if (xdr_buf_read_mic(rcv_buf, &mic, mic_offset))
goto unwrap_failed;
maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &integ_buf, &mic);
if (maj_stat == GSS_S_CONTEXT_EXPIRED)
@@ -2118,7 +2118,6 @@ static const struct rpc_authops authgss_ops = {
.hash_cred = gss_hash_cred,
.lookup_cred = gss_lookup_cred,
.crcreate = gss_create_cred,
- .list_pseudoflavors = gss_mech_list_pseudoflavors,
.info2flavor = gss_mech_info2flavor,
.flavor2info = gss_mech_flavor2info,
};
diff --git a/net/sunrpc/auth_gss/gss_krb5_keys.c b/net/sunrpc/auth_gss/gss_krb5_keys.c
index 550fdf18d3b3..3b7f721c023b 100644
--- a/net/sunrpc/auth_gss/gss_krb5_keys.c
+++ b/net/sunrpc/auth_gss/gss_krb5_keys.c
@@ -228,14 +228,11 @@ u32 krb5_derive_key(const struct gss_krb5_enctype *gk5e,
ret = 0;
err_free_raw:
- memset(rawkey, 0, keybytes);
- kfree(rawkey);
+ kzfree(rawkey);
err_free_out:
- memset(outblockdata, 0, blocksize);
- kfree(outblockdata);
+ kzfree(outblockdata);
err_free_in:
- memset(inblockdata, 0, blocksize);
- kfree(inblockdata);
+ kzfree(inblockdata);
err_free_cipher:
crypto_free_sync_skcipher(cipher);
err_return:
diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c
index 6e5d6d240215..75b3c2e9e8f8 100644
--- a/net/sunrpc/auth_gss/gss_krb5_mech.c
+++ b/net/sunrpc/auth_gss/gss_krb5_mech.c
@@ -253,6 +253,7 @@ gss_import_v1_context(const void *p, const void *end, struct krb5_ctx *ctx)
{
u32 seq_send;
int tmp;
+ u32 time32;
p = simple_get_bytes(p, end, &ctx->initiate, sizeof(ctx->initiate));
if (IS_ERR(p))
@@ -290,9 +291,11 @@ gss_import_v1_context(const void *p, const void *end, struct krb5_ctx *ctx)
p = ERR_PTR(-ENOSYS);
goto out_err;
}
- p = simple_get_bytes(p, end, &ctx->endtime, sizeof(ctx->endtime));
+ p = simple_get_bytes(p, end, &time32, sizeof(time32));
if (IS_ERR(p))
goto out_err;
+ /* unsigned 32-bit time overflows in year 2106 */
+ ctx->endtime = (time64_t)time32;
p = simple_get_bytes(p, end, &seq_send, sizeof(seq_send));
if (IS_ERR(p))
goto out_err;
@@ -587,15 +590,18 @@ gss_import_v2_context(const void *p, const void *end, struct krb5_ctx *ctx,
{
u64 seq_send64;
int keylen;
+ u32 time32;
p = simple_get_bytes(p, end, &ctx->flags, sizeof(ctx->flags));
if (IS_ERR(p))
goto out_err;
ctx->initiate = ctx->flags & KRB5_CTX_FLAG_INITIATOR;
- p = simple_get_bytes(p, end, &ctx->endtime, sizeof(ctx->endtime));
+ p = simple_get_bytes(p, end, &time32, sizeof(time32));
if (IS_ERR(p))
goto out_err;
+ /* unsigned 32-bit time overflows in year 2106 */
+ ctx->endtime = (time64_t)time32;
p = simple_get_bytes(p, end, &seq_send64, sizeof(seq_send64));
if (IS_ERR(p))
goto out_err;
@@ -659,7 +665,7 @@ out_err:
static int
gss_import_sec_context_kerberos(const void *p, size_t len,
struct gss_ctx *ctx_id,
- time_t *endtime,
+ time64_t *endtime,
gfp_t gfp_mask)
{
const void *end = (const void *)((const char *)p + len);
diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c
index 48fe4a591b54..f1d280accf43 100644
--- a/net/sunrpc/auth_gss/gss_krb5_seal.c
+++ b/net/sunrpc/auth_gss/gss_krb5_seal.c
@@ -131,14 +131,14 @@ gss_get_mic_v1(struct krb5_ctx *ctx, struct xdr_buf *text,
struct xdr_netobj md5cksum = {.len = sizeof(cksumdata),
.data = cksumdata};
void *ptr;
- s32 now;
+ time64_t now;
u32 seq_send;
u8 *cksumkey;
dprintk("RPC: %s\n", __func__);
BUG_ON(ctx == NULL);
- now = get_seconds();
+ now = ktime_get_real_seconds();
ptr = setup_token(ctx, token);
@@ -170,7 +170,7 @@ gss_get_mic_v2(struct krb5_ctx *ctx, struct xdr_buf *text,
struct xdr_netobj cksumobj = { .len = sizeof(cksumdata),
.data = cksumdata};
void *krb5_hdr;
- s32 now;
+ time64_t now;
u8 *cksumkey;
unsigned int cksum_usage;
__be64 seq_send_be64;
@@ -198,7 +198,7 @@ gss_get_mic_v2(struct krb5_ctx *ctx, struct xdr_buf *text,
memcpy(krb5_hdr + GSS_KRB5_TOK_HDR_LEN, cksumobj.data, cksumobj.len);
- now = get_seconds();
+ now = ktime_get_real_seconds();
return (ctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE;
}
diff --git a/net/sunrpc/auth_gss/gss_krb5_unseal.c b/net/sunrpc/auth_gss/gss_krb5_unseal.c
index ef2b25b86d2f..aaab91cf24c8 100644
--- a/net/sunrpc/auth_gss/gss_krb5_unseal.c
+++ b/net/sunrpc/auth_gss/gss_krb5_unseal.c
@@ -124,7 +124,7 @@ gss_verify_mic_v1(struct krb5_ctx *ctx,
/* it got through unscathed. Make sure the context is unexpired */
- now = get_seconds();
+ now = ktime_get_real_seconds();
if (now > ctx->endtime)
return GSS_S_CONTEXT_EXPIRED;
@@ -149,7 +149,7 @@ gss_verify_mic_v2(struct krb5_ctx *ctx,
char cksumdata[GSS_KRB5_MAX_CKSUM_LEN];
struct xdr_netobj cksumobj = {.len = sizeof(cksumdata),
.data = cksumdata};
- s32 now;
+ time64_t now;
u8 *ptr = read_token->data;
u8 *cksumkey;
u8 flags;
@@ -194,7 +194,7 @@ gss_verify_mic_v2(struct krb5_ctx *ctx,
return GSS_S_BAD_SIG;
/* it got through unscathed. Make sure the context is unexpired */
- now = get_seconds();
+ now = ktime_get_real_seconds();
if (now > ctx->endtime)
return GSS_S_CONTEXT_EXPIRED;
diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c
index 14a0aff0cd84..6c1920eed771 100644
--- a/net/sunrpc/auth_gss/gss_krb5_wrap.c
+++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c
@@ -163,7 +163,7 @@ gss_wrap_kerberos_v1(struct krb5_ctx *kctx, int offset,
.data = cksumdata};
int blocksize = 0, plainlen;
unsigned char *ptr, *msg_start;
- s32 now;
+ time64_t now;
int headlen;
struct page **tmp_pages;
u32 seq_send;
@@ -172,7 +172,7 @@ gss_wrap_kerberos_v1(struct krb5_ctx *kctx, int offset,
dprintk("RPC: %s\n", __func__);
- now = get_seconds();
+ now = ktime_get_real_seconds();
blocksize = crypto_sync_skcipher_blocksize(kctx->enc);
gss_krb5_add_padding(buf, offset, blocksize);
@@ -268,7 +268,7 @@ gss_unwrap_kerberos_v1(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf)
char cksumdata[GSS_KRB5_MAX_CKSUM_LEN];
struct xdr_netobj md5cksum = {.len = sizeof(cksumdata),
.data = cksumdata};
- s32 now;
+ time64_t now;
int direction;
s32 seqnum;
unsigned char *ptr;
@@ -359,7 +359,7 @@ gss_unwrap_kerberos_v1(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf)
/* it got through unscathed. Make sure the context is unexpired */
- now = get_seconds();
+ now = ktime_get_real_seconds();
if (now > kctx->endtime)
return GSS_S_CONTEXT_EXPIRED;
@@ -439,7 +439,7 @@ gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset,
struct xdr_buf *buf, struct page **pages)
{
u8 *ptr, *plainhdr;
- s32 now;
+ time64_t now;
u8 flags = 0x00;
__be16 *be16ptr;
__be64 *be64ptr;
@@ -481,14 +481,14 @@ gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset,
if (err)
return err;
- now = get_seconds();
+ now = ktime_get_real_seconds();
return (kctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE;
}
static u32
gss_unwrap_kerberos_v2(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf)
{
- s32 now;
+ time64_t now;
u8 *ptr;
u8 flags = 0x00;
u16 ec, rrc;
@@ -557,7 +557,7 @@ gss_unwrap_kerberos_v2(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf)
/* do sequencing checks */
/* it got through unscathed. Make sure the context is unexpired */
- now = get_seconds();
+ now = ktime_get_real_seconds();
if (now > kctx->endtime)
return GSS_S_CONTEXT_EXPIRED;
diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c
index 82060099a429..db550bfc2642 100644
--- a/net/sunrpc/auth_gss/gss_mech_switch.c
+++ b/net/sunrpc/auth_gss/gss_mech_switch.c
@@ -20,6 +20,7 @@
#include <linux/sunrpc/sched.h>
#include <linux/sunrpc/gss_api.h>
#include <linux/sunrpc/clnt.h>
+#include <trace/events/rpcgss.h>
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
# define RPCDBG_FACILITY RPCDBG_AUTH
@@ -158,7 +159,6 @@ struct gss_api_mech *gss_mech_get_by_OID(struct rpcsec_gss_oid *obj)
if (sprint_oid(obj->data, obj->len, buf, sizeof(buf)) < 0)
return NULL;
- dprintk("RPC: %s(%s)\n", __func__, buf);
request_module("rpc-auth-gss-%s", buf);
rcu_read_lock();
@@ -172,6 +172,8 @@ struct gss_api_mech *gss_mech_get_by_OID(struct rpcsec_gss_oid *obj)
}
}
rcu_read_unlock();
+ if (!gm)
+ trace_rpcgss_oid_to_mech(buf);
return gm;
}
@@ -218,35 +220,6 @@ gss_mech_get_by_pseudoflavor(u32 pseudoflavor)
}
/**
- * gss_mech_list_pseudoflavors - Discover registered GSS pseudoflavors
- * @array_ptr: array to fill in
- * @size: size of "array"
- *
- * Returns the number of array items filled in, or a negative errno.
- *
- * The returned array is not sorted by any policy. Callers should not
- * rely on the order of the items in the returned array.
- */
-int gss_mech_list_pseudoflavors(rpc_authflavor_t *array_ptr, int size)
-{
- struct gss_api_mech *pos = NULL;
- int j, i = 0;
-
- rcu_read_lock();
- list_for_each_entry_rcu(pos, &registered_mechs, gm_list) {
- for (j = 0; j < pos->gm_pf_num; j++) {
- if (i >= size) {
- spin_unlock(&registered_mechs_lock);
- return -ENOMEM;
- }
- array_ptr[i++] = pos->gm_pfs[j].pseudoflavor;
- }
- }
- rcu_read_unlock();
- return i;
-}
-
-/**
* gss_svc_to_pseudoflavor - map a GSS service number to a pseudoflavor
* @gm: GSS mechanism handle
* @qop: GSS quality-of-protection value
@@ -374,7 +347,7 @@ int
gss_import_sec_context(const void *input_token, size_t bufsize,
struct gss_api_mech *mech,
struct gss_ctx **ctx_id,
- time_t *endtime,
+ time64_t *endtime,
gfp_t gfp_mask)
{
if (!(*ctx_id = kzalloc(sizeof(**ctx_id), gfp_mask)))
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 8be2f209982b..65b67b257302 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -49,6 +49,9 @@
#include <linux/sunrpc/svcauth.h>
#include <linux/sunrpc/svcauth_gss.h>
#include <linux/sunrpc/cache.h>
+
+#include <trace/events/rpcgss.h>
+
#include "gss_rpc_upcall.h"
@@ -200,7 +203,7 @@ static int rsi_parse(struct cache_detail *cd,
char *ep;
int len;
struct rsi rsii, *rsip = NULL;
- time_t expiry;
+ time64_t expiry;
int status = -EINVAL;
memset(&rsii, 0, sizeof(rsii));
@@ -433,7 +436,7 @@ static int rsc_parse(struct cache_detail *cd,
int id;
int len, rv;
struct rsc rsci, *rscp = NULL;
- time_t expiry;
+ time64_t expiry;
int status = -EINVAL;
struct gss_api_mech *gm = NULL;
@@ -1075,24 +1078,32 @@ gss_read_verf(struct rpc_gss_wire_cred *gc,
return 0;
}
-/* Ok this is really heavily depending on a set of semantics in
- * how rqstp is set up by svc_recv and pages laid down by the
- * server when reading a request. We are basically guaranteed that
- * the token lays all down linearly across a set of pages, starting
- * at iov_base in rq_arg.head[0] which happens to be the first of a
- * set of pages stored in rq_pages[].
- * rq_arg.head[0].iov_base will provide us the page_base to pass
- * to the upcall.
- */
-static inline int
-gss_read_proxy_verf(struct svc_rqst *rqstp,
- struct rpc_gss_wire_cred *gc, __be32 *authp,
- struct xdr_netobj *in_handle,
- struct gssp_in_token *in_token)
+static void gss_free_in_token_pages(struct gssp_in_token *in_token)
{
- struct kvec *argv = &rqstp->rq_arg.head[0];
u32 inlen;
- int res;
+ int i;
+
+ i = 0;
+ inlen = in_token->page_len;
+ while (inlen) {
+ if (in_token->pages[i])
+ put_page(in_token->pages[i]);
+ inlen -= inlen > PAGE_SIZE ? PAGE_SIZE : inlen;
+ }
+
+ kfree(in_token->pages);
+ in_token->pages = NULL;
+}
+
+static int gss_read_proxy_verf(struct svc_rqst *rqstp,
+ struct rpc_gss_wire_cred *gc, __be32 *authp,
+ struct xdr_netobj *in_handle,
+ struct gssp_in_token *in_token)
+{
+ struct kvec *argv = &rqstp->rq_arg.head[0];
+ unsigned int page_base, length;
+ int pages, i, res;
+ size_t inlen;
res = gss_read_common_verf(gc, argv, authp, in_handle);
if (res)
@@ -1102,10 +1113,36 @@ gss_read_proxy_verf(struct svc_rqst *rqstp,
if (inlen > (argv->iov_len + rqstp->rq_arg.page_len))
return SVC_DENIED;
- in_token->pages = rqstp->rq_pages;
- in_token->page_base = (ulong)argv->iov_base & ~PAGE_MASK;
+ pages = DIV_ROUND_UP(inlen, PAGE_SIZE);
+ in_token->pages = kcalloc(pages, sizeof(struct page *), GFP_KERNEL);
+ if (!in_token->pages)
+ return SVC_DENIED;
+ in_token->page_base = 0;
in_token->page_len = inlen;
+ for (i = 0; i < pages; i++) {
+ in_token->pages[i] = alloc_page(GFP_KERNEL);
+ if (!in_token->pages[i]) {
+ gss_free_in_token_pages(in_token);
+ return SVC_DENIED;
+ }
+ }
+ length = min_t(unsigned int, inlen, argv->iov_len);
+ memcpy(page_address(in_token->pages[0]), argv->iov_base, length);
+ inlen -= length;
+
+ i = 1;
+ page_base = rqstp->rq_arg.page_base;
+ while (inlen) {
+ length = min_t(unsigned int, inlen, PAGE_SIZE);
+ memcpy(page_address(in_token->pages[i]),
+ page_address(rqstp->rq_arg.pages[i]) + page_base,
+ length);
+
+ inlen -= length;
+ page_base = 0;
+ i++;
+ }
return 0;
}
@@ -1184,7 +1221,7 @@ static int gss_proxy_save_rsc(struct cache_detail *cd,
static atomic64_t ctxhctr;
long long ctxh;
struct gss_api_mech *gm = NULL;
- time_t expiry;
+ time64_t expiry;
int status = -EINVAL;
memset(&rsci, 0, sizeof(rsci));
@@ -1211,6 +1248,7 @@ static int gss_proxy_save_rsc(struct cache_detail *cd,
dprintk("RPC: No creds found!\n");
goto out;
} else {
+ struct timespec64 boot;
/* steal creds */
rsci.cred = ud->creds;
@@ -1231,6 +1269,9 @@ static int gss_proxy_save_rsc(struct cache_detail *cd,
&expiry, GFP_KERNEL);
if (status)
goto out;
+
+ getboottime64(&boot);
+ expiry -= boot.tv_sec;
}
rsci.h.expiry_time = expiry;
@@ -1270,9 +1311,8 @@ static int svcauth_gss_proxy_init(struct svc_rqst *rqstp,
if (status)
goto out;
- dprintk("RPC: svcauth_gss: gss major status = %d "
- "minor status = %d\n",
- ud.major_status, ud.minor_status);
+ trace_rpcgss_accept_upcall(rqstp->rq_xid, ud.major_status,
+ ud.minor_status);
switch (ud.major_status) {
case GSS_S_CONTINUE_NEEDED:
@@ -1280,8 +1320,11 @@ static int svcauth_gss_proxy_init(struct svc_rqst *rqstp,
break;
case GSS_S_COMPLETE:
status = gss_proxy_save_rsc(sn->rsc_cache, &ud, &handle);
- if (status)
+ if (status) {
+ pr_info("%s: gss_proxy_save_rsc failed (%d)\n",
+ __func__, status);
goto out;
+ }
cli_handle.data = (u8 *)&handle;
cli_handle.len = sizeof(handle);
break;
@@ -1292,15 +1335,20 @@ static int svcauth_gss_proxy_init(struct svc_rqst *rqstp,
/* Got an answer to the upcall; use it: */
if (gss_write_init_verf(sn->rsc_cache, rqstp,
- &cli_handle, &ud.major_status))
+ &cli_handle, &ud.major_status)) {
+ pr_info("%s: gss_write_init_verf failed\n", __func__);
goto out;
+ }
if (gss_write_resv(resv, PAGE_SIZE,
&cli_handle, &ud.out_token,
- ud.major_status, ud.minor_status))
+ ud.major_status, ud.minor_status)) {
+ pr_info("%s: gss_write_resv failed\n", __func__);
goto out;
+ }
ret = SVC_COMPLETE;
out:
+ gss_free_in_token_pages(&ud.in_token);
gssp_free_upcall_data(&ud);
return ret;
}
@@ -1384,10 +1432,10 @@ static ssize_t read_gssp(struct file *file, char __user *buf,
return len;
}
-static const struct file_operations use_gss_proxy_ops = {
- .open = nonseekable_open,
- .write = write_gssp,
- .read = read_gssp,
+static const struct proc_ops use_gss_proxy_proc_ops = {
+ .proc_open = nonseekable_open,
+ .proc_write = write_gssp,
+ .proc_read = read_gssp,
};
static int create_use_gss_proxy_proc_entry(struct net *net)
@@ -1398,7 +1446,7 @@ static int create_use_gss_proxy_proc_entry(struct net *net)
sn->use_gss_proxy = -1;
*p = proc_create_data("use-gss-proxy", S_IFREG | 0600,
sn->proc_net_rpc,
- &use_gss_proxy_ops, net);
+ &use_gss_proxy_proc_ops, net);
if (!*p)
return -ENOMEM;
init_gssp_clnt(sn);
diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c
index 339e8c077c2d..195b40c5dae4 100644
--- a/net/sunrpc/backchannel_rqst.c
+++ b/net/sunrpc/backchannel_rqst.c
@@ -220,7 +220,7 @@ void xprt_destroy_bc(struct rpc_xprt *xprt, unsigned int max_reqs)
goto out;
spin_lock_bh(&xprt->bc_pa_lock);
- xprt->bc_alloc_max -= max_reqs;
+ xprt->bc_alloc_max -= min(max_reqs, xprt->bc_alloc_max);
list_for_each_entry_safe(req, tmp, &xprt->bc_pa_list, rq_bc_pa_list) {
dprintk("RPC: req=%p\n", req);
list_del(&req->rq_bc_pa_list);
@@ -307,8 +307,8 @@ void xprt_free_bc_rqst(struct rpc_rqst *req)
*/
dprintk("RPC: Last session removed req=%p\n", req);
xprt_free_allocation(req);
- return;
}
+ xprt_put(xprt);
}
/*
@@ -339,7 +339,7 @@ found:
spin_unlock(&xprt->bc_pa_lock);
if (new) {
if (req != new)
- xprt_free_bc_rqst(new);
+ xprt_free_allocation(new);
break;
} else if (req)
break;
@@ -368,6 +368,7 @@ void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied)
set_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state);
dprintk("RPC: add callback request to list\n");
+ xprt_get(xprt);
spin_lock(&bc_serv->sv_cb_lock);
list_add(&req->rq_bc_list, &bc_serv->sv_cb_list);
wake_up(&bc_serv->sv_cb_waitq);
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 6f1528f271ee..bd843a81afa0 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -42,7 +42,7 @@ static bool cache_listeners_exist(struct cache_detail *detail);
static void cache_init(struct cache_head *h, struct cache_detail *detail)
{
- time_t now = seconds_since_boot();
+ time64_t now = seconds_since_boot();
INIT_HLIST_NODE(&h->cache_list);
h->flags = 0;
kref_init(&h->ref);
@@ -53,9 +53,6 @@ static void cache_init(struct cache_head *h, struct cache_detail *detail)
h->last_refresh = now;
}
-static inline int cache_is_valid(struct cache_head *h);
-static void cache_fresh_locked(struct cache_head *head, time_t expiry,
- struct cache_detail *detail);
static void cache_fresh_unlocked(struct cache_head *head,
struct cache_detail *detail);
@@ -80,6 +77,22 @@ static struct cache_head *sunrpc_cache_find_rcu(struct cache_detail *detail,
return NULL;
}
+static void sunrpc_begin_cache_remove_entry(struct cache_head *ch,
+ struct cache_detail *cd)
+{
+ /* Must be called under cd->hash_lock */
+ hlist_del_init_rcu(&ch->cache_list);
+ set_bit(CACHE_CLEANED, &ch->flags);
+ cd->entries --;
+}
+
+static void sunrpc_end_cache_remove_entry(struct cache_head *ch,
+ struct cache_detail *cd)
+{
+ cache_fresh_unlocked(ch, cd);
+ cache_put(ch, cd);
+}
+
static struct cache_head *sunrpc_cache_add_entry(struct cache_detail *detail,
struct cache_head *key,
int hash)
@@ -103,11 +116,7 @@ static struct cache_head *sunrpc_cache_add_entry(struct cache_detail *detail,
hlist_for_each_entry_rcu(tmp, head, cache_list) {
if (detail->match(tmp, key)) {
if (cache_is_expired(detail, tmp)) {
- hlist_del_init_rcu(&tmp->cache_list);
- detail->entries --;
- if (cache_is_valid(tmp) == -EAGAIN)
- set_bit(CACHE_NEGATIVE, &tmp->flags);
- cache_fresh_locked(tmp, 0, detail);
+ sunrpc_begin_cache_remove_entry(tmp, detail);
freeme = tmp;
break;
}
@@ -123,10 +132,8 @@ static struct cache_head *sunrpc_cache_add_entry(struct cache_detail *detail,
cache_get(new);
spin_unlock(&detail->hash_lock);
- if (freeme) {
- cache_fresh_unlocked(freeme, detail);
- cache_put(freeme, detail);
- }
+ if (freeme)
+ sunrpc_end_cache_remove_entry(freeme, detail);
return new;
}
@@ -145,10 +152,10 @@ EXPORT_SYMBOL_GPL(sunrpc_cache_lookup_rcu);
static void cache_dequeue(struct cache_detail *detail, struct cache_head *ch);
-static void cache_fresh_locked(struct cache_head *head, time_t expiry,
+static void cache_fresh_locked(struct cache_head *head, time64_t expiry,
struct cache_detail *detail)
{
- time_t now = seconds_since_boot();
+ time64_t now = seconds_since_boot();
if (now <= detail->flush_time)
/* ensure it isn't immediately treated as expired */
now = detail->flush_time + 1;
@@ -280,7 +287,7 @@ int cache_check(struct cache_detail *detail,
struct cache_head *h, struct cache_req *rqstp)
{
int rv;
- long refresh_age, age;
+ time64_t refresh_age, age;
/* First decide return status as best we can */
rv = cache_is_valid(h);
@@ -294,7 +301,7 @@ int cache_check(struct cache_detail *detail,
rv = -ENOENT;
} else if (rv == -EAGAIN ||
(h->expiry_time != 0 && age > refresh_age/2)) {
- dprintk("RPC: Want update, refage=%ld, age=%ld\n",
+ dprintk("RPC: Want update, refage=%lld, age=%lld\n",
refresh_age, age);
if (!test_and_set_bit(CACHE_PENDING, &h->flags)) {
switch (cache_make_upcall(detail, h)) {
@@ -373,7 +380,7 @@ void sunrpc_init_cache_detail(struct cache_detail *cd)
spin_lock(&cache_list_lock);
cd->nextcheck = 0;
cd->entries = 0;
- atomic_set(&cd->readers, 0);
+ atomic_set(&cd->writers, 0);
cd->last_close = 0;
cd->last_warn = -1;
list_add(&cd->others, &cache_list);
@@ -460,8 +467,7 @@ static int cache_clean(void)
if (!cache_is_expired(current_detail, ch))
continue;
- hlist_del_init_rcu(&ch->cache_list);
- current_detail->entries--;
+ sunrpc_begin_cache_remove_entry(ch, current_detail);
rv = 1;
break;
}
@@ -471,11 +477,8 @@ static int cache_clean(void)
if (!ch)
current_index ++;
spin_unlock(&cache_list_lock);
- if (ch) {
- set_bit(CACHE_CLEANED, &ch->flags);
- cache_fresh_unlocked(ch, d);
- cache_put(ch, d);
- }
+ if (ch)
+ sunrpc_end_cache_remove_entry(ch, d);
} else
spin_unlock(&cache_list_lock);
@@ -531,13 +534,9 @@ void cache_purge(struct cache_detail *detail)
for (i = 0; i < detail->hash_size; i++) {
head = &detail->hash_table[i];
hlist_for_each_entry_safe(ch, tmp, head, cache_list) {
- hlist_del_init_rcu(&ch->cache_list);
- detail->entries--;
-
- set_bit(CACHE_CLEANED, &ch->flags);
+ sunrpc_begin_cache_remove_entry(ch, detail);
spin_unlock(&detail->hash_lock);
- cache_fresh_unlocked(ch, detail);
- cache_put(ch, detail);
+ sunrpc_end_cache_remove_entry(ch, detail);
spin_lock(&detail->hash_lock);
}
}
@@ -1029,11 +1028,13 @@ static int cache_open(struct inode *inode, struct file *filp,
}
rp->offset = 0;
rp->q.reader = 1;
- atomic_inc(&cd->readers);
+
spin_lock(&queue_lock);
list_add(&rp->q.list, &cd->queue);
spin_unlock(&queue_lock);
}
+ if (filp->f_mode & FMODE_WRITE)
+ atomic_inc(&cd->writers);
filp->private_data = rp;
return 0;
}
@@ -1062,8 +1063,10 @@ static int cache_release(struct inode *inode, struct file *filp,
filp->private_data = NULL;
kfree(rp);
+ }
+ if (filp->f_mode & FMODE_WRITE) {
+ atomic_dec(&cd->writers);
cd->last_close = seconds_since_boot();
- atomic_dec(&cd->readers);
}
module_put(cd->owner);
return 0;
@@ -1171,7 +1174,7 @@ static void warn_no_listener(struct cache_detail *detail)
static bool cache_listeners_exist(struct cache_detail *detail)
{
- if (atomic_read(&detail->readers))
+ if (atomic_read(&detail->writers))
return true;
if (detail->last_close == 0)
/* This cache was never opened */
@@ -1406,7 +1409,7 @@ static int c_show(struct seq_file *m, void *p)
return cd->cache_show(m, cd, NULL);
ifdebug(CACHE)
- seq_printf(m, "# expiry=%ld refcnt=%d flags=%lx\n",
+ seq_printf(m, "# expiry=%lld refcnt=%d flags=%lx\n",
convert_to_wallclock(cp->expiry_time),
kref_read(&cp->ref), cp->flags);
cache_get(cp);
@@ -1479,7 +1482,7 @@ static ssize_t read_flush(struct file *file, char __user *buf,
char tbuf[22];
size_t len;
- len = snprintf(tbuf, sizeof(tbuf), "%lu\n",
+ len = snprintf(tbuf, sizeof(tbuf), "%llu\n",
convert_to_wallclock(cd->flush_time));
return simple_read_from_buffer(buf, count, ppos, tbuf, len);
}
@@ -1490,7 +1493,7 @@ static ssize_t write_flush(struct file *file, const char __user *buf,
{
char tbuf[20];
char *ep;
- time_t now;
+ time64_t now;
if (*ppos || count > sizeof(tbuf)-1)
return -EINVAL;
@@ -1520,6 +1523,9 @@ static ssize_t write_flush(struct file *file, const char __user *buf,
cd->nextcheck = now;
cache_flush();
+ if (cd->flush)
+ cd->flush();
+
*ppos += count;
return count;
}
@@ -1570,15 +1576,14 @@ static int cache_release_procfs(struct inode *inode, struct file *filp)
return cache_release(inode, filp, cd);
}
-static const struct file_operations cache_file_operations_procfs = {
- .owner = THIS_MODULE,
- .llseek = no_llseek,
- .read = cache_read_procfs,
- .write = cache_write_procfs,
- .poll = cache_poll_procfs,
- .unlocked_ioctl = cache_ioctl_procfs, /* for FIONREAD */
- .open = cache_open_procfs,
- .release = cache_release_procfs,
+static const struct proc_ops cache_channel_proc_ops = {
+ .proc_lseek = no_llseek,
+ .proc_read = cache_read_procfs,
+ .proc_write = cache_write_procfs,
+ .proc_poll = cache_poll_procfs,
+ .proc_ioctl = cache_ioctl_procfs, /* for FIONREAD */
+ .proc_open = cache_open_procfs,
+ .proc_release = cache_release_procfs,
};
static int content_open_procfs(struct inode *inode, struct file *filp)
@@ -1595,11 +1600,11 @@ static int content_release_procfs(struct inode *inode, struct file *filp)
return content_release(inode, filp, cd);
}
-static const struct file_operations content_file_operations_procfs = {
- .open = content_open_procfs,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = content_release_procfs,
+static const struct proc_ops content_proc_ops = {
+ .proc_open = content_open_procfs,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = content_release_procfs,
};
static int open_flush_procfs(struct inode *inode, struct file *filp)
@@ -1633,12 +1638,12 @@ static ssize_t write_flush_procfs(struct file *filp,
return write_flush(filp, buf, count, ppos, cd);
}
-static const struct file_operations cache_flush_operations_procfs = {
- .open = open_flush_procfs,
- .read = read_flush_procfs,
- .write = write_flush_procfs,
- .release = release_flush_procfs,
- .llseek = no_llseek,
+static const struct proc_ops cache_flush_proc_ops = {
+ .proc_open = open_flush_procfs,
+ .proc_read = read_flush_procfs,
+ .proc_write = write_flush_procfs,
+ .proc_release = release_flush_procfs,
+ .proc_lseek = no_llseek,
};
static void remove_cache_proc_entries(struct cache_detail *cd)
@@ -1661,19 +1666,19 @@ static int create_cache_proc_entries(struct cache_detail *cd, struct net *net)
goto out_nomem;
p = proc_create_data("flush", S_IFREG | 0600,
- cd->procfs, &cache_flush_operations_procfs, cd);
+ cd->procfs, &cache_flush_proc_ops, cd);
if (p == NULL)
goto out_nomem;
if (cd->cache_request || cd->cache_parse) {
p = proc_create_data("channel", S_IFREG | 0600, cd->procfs,
- &cache_file_operations_procfs, cd);
+ &cache_channel_proc_ops, cd);
if (p == NULL)
goto out_nomem;
}
if (cd->cache_show) {
p = proc_create_data("content", S_IFREG | 0400, cd->procfs,
- &content_file_operations_procfs, cd);
+ &content_proc_ops, cd);
if (p == NULL)
goto out_nomem;
}
@@ -1885,10 +1890,9 @@ void sunrpc_cache_unhash(struct cache_detail *cd, struct cache_head *h)
{
spin_lock(&cd->hash_lock);
if (!hlist_unhashed(&h->cache_list)){
- hlist_del_init_rcu(&h->cache_list);
- cd->entries--;
+ sunrpc_begin_cache_remove_entry(h, cd);
spin_unlock(&cd->hash_lock);
- cache_put(h, cd);
+ sunrpc_end_cache_remove_entry(h, cd);
} else
spin_unlock(&cd->hash_lock);
}
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index d8679b6027e9..7324b21f923e 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -591,6 +591,9 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args)
xprt->resvport = 1;
if (args->flags & RPC_CLNT_CREATE_NONPRIVPORT)
xprt->resvport = 0;
+ xprt->reuseport = 0;
+ if (args->flags & RPC_CLNT_CREATE_REUSEPORT)
+ xprt->reuseport = 1;
clnt = rpc_create_xprt(args, xprt);
if (IS_ERR(clnt) || args->nconnect <= 1)
@@ -1676,8 +1679,6 @@ call_reserveresult(struct rpc_task *task)
return;
}
- printk(KERN_ERR "%s: status=%d, but no request slot, exiting\n",
- __func__, status);
rpc_call_rpcerror(task, -EIO);
return;
}
@@ -1686,11 +1687,8 @@ call_reserveresult(struct rpc_task *task)
* Even though there was an error, we may have acquired
* a request slot somehow. Make sure not to leak it.
*/
- if (task->tk_rqstp) {
- printk(KERN_ERR "%s: status=%d, request allocated anyway\n",
- __func__, status);
+ if (task->tk_rqstp)
xprt_release(task);
- }
switch (status) {
case -ENOMEM:
@@ -1699,14 +1697,9 @@ call_reserveresult(struct rpc_task *task)
case -EAGAIN: /* woken up; retry */
task->tk_action = call_retry_reserve;
return;
- case -EIO: /* probably a shutdown */
- break;
default:
- printk(KERN_ERR "%s: unrecognized error %d, exiting\n",
- __func__, status);
- break;
+ rpc_call_rpcerror(task, status);
}
- rpc_call_rpcerror(task, status);
}
/*
@@ -1837,7 +1830,7 @@ call_allocate(struct rpc_task *task)
return;
}
- rpc_exit(task, -ERESTARTSYS);
+ rpc_call_rpcerror(task, -ERESTARTSYS);
}
static int
@@ -1862,6 +1855,7 @@ rpc_xdr_encode(struct rpc_task *task)
req->rq_rbuffer,
req->rq_rcvsize);
+ req->rq_reply_bytes_recvd = 0;
req->rq_snd_buf.head[0].iov_len = 0;
xdr_init_encode(&xdr, &req->rq_snd_buf,
req->rq_snd_buf.head[0].iov_base, req);
@@ -1881,6 +1875,8 @@ call_encode(struct rpc_task *task)
if (!rpc_task_need_encode(task))
goto out;
dprint_status(task);
+ /* Dequeue task from the receive queue while we're encoding */
+ xprt_request_dequeue_xprt(task);
/* Encode here so that rpcsec_gss can use correct sequence number. */
rpc_xdr_encode(task);
/* Did the encode result in an error condition? */
@@ -1970,6 +1966,7 @@ call_bind(struct rpc_task *task)
static void
call_bind_status(struct rpc_task *task)
{
+ struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt;
int status = -EIO;
if (rpc_task_transmitted(task)) {
@@ -1977,14 +1974,15 @@ call_bind_status(struct rpc_task *task)
return;
}
- if (task->tk_status >= 0) {
- dprint_status(task);
+ dprint_status(task);
+ trace_rpc_bind_status(task);
+ if (task->tk_status >= 0)
+ goto out_next;
+ if (xprt_bound(xprt)) {
task->tk_status = 0;
- task->tk_action = call_connect;
- return;
+ goto out_next;
}
- trace_rpc_bind_status(task);
switch (task->tk_status) {
case -ENOMEM:
dprintk("RPC: %5u rpcbind out of memory\n", task->tk_pid);
@@ -2003,6 +2001,9 @@ call_bind_status(struct rpc_task *task)
task->tk_rebind_retry--;
rpc_delay(task, 3*HZ);
goto retry_timeout;
+ case -ENOBUFS:
+ rpc_delay(task, HZ >> 2);
+ goto retry_timeout;
case -EAGAIN:
goto retry_timeout;
case -ETIMEDOUT:
@@ -2026,7 +2027,6 @@ call_bind_status(struct rpc_task *task)
case -ENETDOWN:
case -EHOSTUNREACH:
case -ENETUNREACH:
- case -ENOBUFS:
case -EPIPE:
dprintk("RPC: %5u remote rpcbind unreachable: %d\n",
task->tk_pid, task->tk_status);
@@ -2043,7 +2043,9 @@ call_bind_status(struct rpc_task *task)
rpc_call_rpcerror(task, status);
return;
-
+out_next:
+ task->tk_action = call_connect;
+ return;
retry_timeout:
task->tk_status = 0;
task->tk_action = call_bind;
@@ -2090,6 +2092,7 @@ call_connect(struct rpc_task *task)
static void
call_connect_status(struct rpc_task *task)
{
+ struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt;
struct rpc_clnt *clnt = task->tk_client;
int status = task->tk_status;
@@ -2099,8 +2102,17 @@ call_connect_status(struct rpc_task *task)
}
dprint_status(task);
-
trace_rpc_connect_status(task);
+
+ if (task->tk_status == 0) {
+ clnt->cl_stats->netreconn++;
+ goto out_next;
+ }
+ if (xprt_connected(xprt)) {
+ task->tk_status = 0;
+ goto out_next;
+ }
+
task->tk_status = 0;
switch (status) {
case -ECONNREFUSED:
@@ -2117,9 +2129,8 @@ call_connect_status(struct rpc_task *task)
case -ENETDOWN:
case -ENETUNREACH:
case -EHOSTUNREACH:
- case -EADDRINUSE:
- case -ENOBUFS:
case -EPIPE:
+ case -EPROTO:
xprt_conditional_disconnect(task->tk_rqstp->rq_xprt,
task->tk_rqstp->rq_connect_cookie);
if (RPC_IS_SOFTCONN(task))
@@ -2127,17 +2138,20 @@ call_connect_status(struct rpc_task *task)
/* retry with existing socket, after a delay */
rpc_delay(task, 3*HZ);
/* fall through */
+ case -EADDRINUSE:
case -ENOTCONN:
case -EAGAIN:
case -ETIMEDOUT:
goto out_retry;
- case 0:
- clnt->cl_stats->netreconn++;
- task->tk_action = call_transmit;
- return;
+ case -ENOBUFS:
+ rpc_delay(task, HZ >> 2);
+ goto out_retry;
}
rpc_call_rpcerror(task, status);
return;
+out_next:
+ task->tk_action = call_transmit;
+ return;
out_retry:
/* Check for timeouts before looping back to call_bind */
task->tk_action = call_bind;
@@ -2365,7 +2379,7 @@ call_status(struct rpc_task *task)
case -ECONNABORTED:
case -ENOTCONN:
rpc_force_rebind(clnt);
- /* fall through */
+ break;
case -EADDRINUSE:
rpc_delay(task, 3*HZ);
/* fall through */
@@ -2462,6 +2476,7 @@ call_decode(struct rpc_task *task)
struct rpc_clnt *clnt = task->tk_client;
struct rpc_rqst *req = task->tk_rqstp;
struct xdr_stream xdr;
+ int err;
dprint_status(task);
@@ -2484,6 +2499,15 @@ call_decode(struct rpc_task *task)
* before it changed req->rq_reply_bytes_recvd.
*/
smp_rmb();
+
+ /*
+ * Did we ever call xprt_complete_rqst()? If not, we should assume
+ * the message is incomplete.
+ */
+ err = -EAGAIN;
+ if (!req->rq_reply_bytes_recvd)
+ goto out;
+
req->rq_rcv_buf.len = req->rq_private_buf.len;
/* Check that the softirq receive buffer is valid */
@@ -2492,7 +2516,9 @@ call_decode(struct rpc_task *task)
xdr_init_decode(&xdr, &req->rq_rcv_buf,
req->rq_rcv_buf.head[0].iov_base, req);
- switch (rpc_decode_header(task, &xdr)) {
+ err = rpc_decode_header(task, &xdr);
+out:
+ switch (err) {
case 0:
task->tk_action = rpc_exit_task;
task->tk_status = rpcauth_unwrap_resp(task, &xdr);
@@ -2501,9 +2527,6 @@ call_decode(struct rpc_task *task)
return;
case -EAGAIN:
task->tk_status = 0;
- xdr_free_bvec(&req->rq_rcv_buf);
- req->rq_reply_bytes_recvd = 0;
- req->rq_rcv_buf.len = 0;
if (task->tk_client->cl_discrtry)
xprt_conditional_disconnect(req->rq_xprt,
req->rq_connect_cookie);
@@ -2544,7 +2567,7 @@ rpc_encode_header(struct rpc_task *task, struct xdr_stream *xdr)
return 0;
out_fail:
trace_rpc_bad_callhdr(task);
- rpc_exit(task, error);
+ rpc_call_rpcerror(task, error);
return error;
}
@@ -2611,7 +2634,7 @@ out_garbage:
return -EAGAIN;
}
out_err:
- rpc_exit(task, error);
+ rpc_call_rpcerror(task, error);
return error;
out_unparsable:
@@ -2877,7 +2900,7 @@ int rpc_clnt_add_xprt(struct rpc_clnt *clnt,
struct rpc_xprt *xprt;
unsigned long connect_timeout;
unsigned long reconnect_timeout;
- unsigned char resvport;
+ unsigned char resvport, reuseport;
int ret = 0;
rcu_read_lock();
@@ -2889,6 +2912,7 @@ int rpc_clnt_add_xprt(struct rpc_clnt *clnt,
return -EAGAIN;
}
resvport = xprt->resvport;
+ reuseport = xprt->reuseport;
connect_timeout = xprt->connect_timeout;
reconnect_timeout = xprt->max_reconnect_timeout;
rcu_read_unlock();
@@ -2899,6 +2923,7 @@ int rpc_clnt_add_xprt(struct rpc_clnt *clnt,
goto out_put_switch;
}
xprt->resvport = resvport;
+ xprt->reuseport = reuseport;
if (xprt->ops->set_connect_timeout != NULL)
xprt->ops->set_connect_timeout(xprt,
connect_timeout,
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 748bac601e47..39e14d5edaf1 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -51,7 +51,7 @@ static BLOCKING_NOTIFIER_HEAD(rpc_pipefs_notifier_list);
int rpc_pipefs_notifier_register(struct notifier_block *nb)
{
- return blocking_notifier_chain_cond_register(&rpc_pipefs_notifier_list, nb);
+ return blocking_notifier_chain_register(&rpc_pipefs_notifier_list, nb);
}
EXPORT_SYMBOL_GPL(rpc_pipefs_notifier_register);
@@ -1416,8 +1416,7 @@ EXPORT_SYMBOL_GPL(gssd_running);
static int rpc_fs_get_tree(struct fs_context *fc)
{
- fc->s_fs_info = get_net(fc->net_ns);
- return vfs_get_super(fc, vfs_get_keyed_super, rpc_fill_super);
+ return get_tree_keyed(fc, rpc_fill_super, get_net(fc->net_ns));
}
static void rpc_fs_free_fc(struct fs_context *fc)
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 1f275aba786f..55e900255b0c 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -260,7 +260,7 @@ static void __rpc_init_priority_wait_queue(struct rpc_wait_queue *queue, const c
rpc_reset_waitqueue_priority(queue);
queue->qlen = 0;
queue->timer_list.expires = 0;
- INIT_DEFERRABLE_WORK(&queue->timer_list.dwork, __rpc_queue_timer_fn);
+ INIT_DELAYED_WORK(&queue->timer_list.dwork, __rpc_queue_timer_fn);
INIT_LIST_HEAD(&queue->timer_list.list);
rpc_assign_waitqueue_name(queue, qname);
}
@@ -541,33 +541,14 @@ rpc_wake_up_task_on_wq_queue_action_locked(struct workqueue_struct *wq,
return NULL;
}
-static void
-rpc_wake_up_task_on_wq_queue_locked(struct workqueue_struct *wq,
- struct rpc_wait_queue *queue, struct rpc_task *task)
-{
- rpc_wake_up_task_on_wq_queue_action_locked(wq, queue, task, NULL, NULL);
-}
-
/*
* Wake up a queued task while the queue lock is being held
*/
-static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, struct rpc_task *task)
+static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue,
+ struct rpc_task *task)
{
- rpc_wake_up_task_on_wq_queue_locked(rpciod_workqueue, queue, task);
-}
-
-/*
- * Wake up a task on a specific queue
- */
-void rpc_wake_up_queued_task_on_wq(struct workqueue_struct *wq,
- struct rpc_wait_queue *queue,
- struct rpc_task *task)
-{
- if (!RPC_IS_QUEUED(task))
- return;
- spin_lock(&queue->lock);
- rpc_wake_up_task_on_wq_queue_locked(wq, queue, task);
- spin_unlock(&queue->lock);
+ rpc_wake_up_task_on_wq_queue_action_locked(rpciod_workqueue, queue,
+ task, NULL, NULL);
}
/*
@@ -843,6 +824,7 @@ rpc_reset_task_statistics(struct rpc_task *task)
*/
void rpc_exit_task(struct rpc_task *task)
{
+ trace_rpc_task_end(task, task->tk_action);
task->tk_action = NULL;
if (task->tk_ops->rpc_count_stats)
task->tk_ops->rpc_count_stats(task, task->tk_calldata);
@@ -864,6 +846,8 @@ void rpc_signal_task(struct rpc_task *task)
if (!RPC_IS_ACTIVATED(task))
return;
+
+ trace_rpc_task_signalled(task, task->tk_action);
set_bit(RPC_TASK_SIGNALLED, &task->tk_runstate);
smp_mb__after_atomic();
queue = READ_ONCE(task->tk_waitqueue);
@@ -930,8 +914,10 @@ static void __rpc_execute(struct rpc_task *task)
/*
* Signalled tasks should exit rather than sleep.
*/
- if (RPC_SIGNALLED(task))
+ if (RPC_SIGNALLED(task)) {
+ task->tk_rpc_status = -ERESTARTSYS;
rpc_exit(task, -ERESTARTSYS);
+ }
/*
* The queue->lock protects against races with
@@ -965,8 +951,9 @@ static void __rpc_execute(struct rpc_task *task)
* clean up after sleeping on some queue, we don't
* break the loop here, but go around once more.
*/
- dprintk("RPC: %5u got signal\n", task->tk_pid);
+ trace_rpc_task_signalled(task, task->tk_action);
set_bit(RPC_TASK_SIGNALLED, &task->tk_runstate);
+ task->tk_rpc_status = -ERESTARTSYS;
rpc_exit(task, -ERESTARTSYS);
}
dprintk("RPC: %5u sync task resuming\n", task->tk_pid);
diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c
index 7c74197c2ecf..c964b48eaaba 100644
--- a/net/sunrpc/stats.c
+++ b/net/sunrpc/stats.c
@@ -69,12 +69,11 @@ static int rpc_proc_open(struct inode *inode, struct file *file)
return single_open(file, rpc_proc_show, PDE_DATA(inode));
}
-static const struct file_operations rpc_proc_fops = {
- .owner = THIS_MODULE,
- .open = rpc_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
+static const struct proc_ops rpc_proc_ops = {
+ .proc_open = rpc_proc_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = single_release,
};
/*
@@ -281,19 +280,19 @@ EXPORT_SYMBOL_GPL(rpc_clnt_show_stats);
*/
static inline struct proc_dir_entry *
do_register(struct net *net, const char *name, void *data,
- const struct file_operations *fops)
+ const struct proc_ops *proc_ops)
{
struct sunrpc_net *sn;
dprintk("RPC: registering /proc/net/rpc/%s\n", name);
sn = net_generic(net, sunrpc_net_id);
- return proc_create_data(name, 0, sn->proc_net_rpc, fops, data);
+ return proc_create_data(name, 0, sn->proc_net_rpc, proc_ops, data);
}
struct proc_dir_entry *
rpc_proc_register(struct net *net, struct rpc_stat *statp)
{
- return do_register(net, statp->program->name, statp, &rpc_proc_fops);
+ return do_register(net, statp->program->name, statp, &rpc_proc_ops);
}
EXPORT_SYMBOL_GPL(rpc_proc_register);
@@ -308,9 +307,9 @@ rpc_proc_unregister(struct net *net, const char *name)
EXPORT_SYMBOL_GPL(rpc_proc_unregister);
struct proc_dir_entry *
-svc_proc_register(struct net *net, struct svc_stat *statp, const struct file_operations *fops)
+svc_proc_register(struct net *net, struct svc_stat *statp, const struct proc_ops *proc_ops)
{
- return do_register(net, statp->program->pg_name, statp, fops);
+ return do_register(net, statp->program->pg_name, statp, proc_ops);
}
EXPORT_SYMBOL_GPL(svc_proc_register);
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 220b79988000..187dd4e73d64 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1233,8 +1233,8 @@ svc_generic_init_request(struct svc_rqst *rqstp,
if (rqstp->rq_vers >= progp->pg_nvers )
goto err_bad_vers;
- versp = progp->pg_vers[rqstp->rq_vers];
- if (!versp)
+ versp = progp->pg_vers[rqstp->rq_vers];
+ if (!versp)
goto err_bad_vers;
/*
@@ -1337,6 +1337,8 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
auth_stat = rpc_autherr_badcred;
auth_res = progp->pg_authenticate(rqstp);
}
+ if (auth_res != SVC_OK)
+ trace_svc_authenticate(rqstp, auth_res, auth_stat);
switch (auth_res) {
case SVC_OK:
break;
diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c
index 550b214cb001..552617e3467b 100644
--- a/net/sunrpc/svcauth.c
+++ b/net/sunrpc/svcauth.c
@@ -19,6 +19,8 @@
#include <linux/err.h>
#include <linux/hash.h>
+#include <trace/events/sunrpc.h>
+
#define RPCDBG_FACILITY RPCDBG_AUTH
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index 5c04ba7d456b..04aa80a2d752 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -166,7 +166,7 @@ static void ip_map_request(struct cache_detail *cd,
}
static struct ip_map *__ip_map_lookup(struct cache_detail *cd, char *class, struct in6_addr *addr);
-static int __ip_map_update(struct cache_detail *cd, struct ip_map *ipm, struct unix_domain *udom, time_t expiry);
+static int __ip_map_update(struct cache_detail *cd, struct ip_map *ipm, struct unix_domain *udom, time64_t expiry);
static int ip_map_parse(struct cache_detail *cd,
char *mesg, int mlen)
@@ -187,7 +187,7 @@ static int ip_map_parse(struct cache_detail *cd,
struct ip_map *ipmp;
struct auth_domain *dom;
- time_t expiry;
+ time64_t expiry;
if (mesg[mlen-1] != '\n')
return -EINVAL;
@@ -308,7 +308,7 @@ static inline struct ip_map *ip_map_lookup(struct net *net, char *class,
}
static int __ip_map_update(struct cache_detail *cd, struct ip_map *ipm,
- struct unix_domain *udom, time_t expiry)
+ struct unix_domain *udom, time64_t expiry)
{
struct ip_map ip;
struct cache_head *ch;
@@ -328,7 +328,7 @@ static int __ip_map_update(struct cache_detail *cd, struct ip_map *ipm,
}
static inline int ip_map_update(struct net *net, struct ip_map *ipm,
- struct unix_domain *udom, time_t expiry)
+ struct unix_domain *udom, time64_t expiry)
{
struct sunrpc_net *sn;
@@ -491,7 +491,7 @@ static int unix_gid_parse(struct cache_detail *cd,
int rv;
int i;
int err;
- time_t expiry;
+ time64_t expiry;
struct unix_gid ug, *ugp;
if (mesg[mlen - 1] != '\n')
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 48c93b9e525e..e5497dc2475b 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -436,13 +436,12 @@ xdr_shrink_bufhead(struct xdr_buf *buf, size_t len)
}
/**
- * xdr_shrink_pagelen
+ * xdr_shrink_pagelen - shrinks buf->pages by up to @len bytes
* @buf: xdr_buf
* @len: bytes to remove from buf->pages
*
- * Shrinks XDR buffer's page array buf->pages by
- * 'len' bytes. The extra data is not lost, but is instead
- * moved into the tail.
+ * The extra data is not lost, but is instead moved into buf->tail.
+ * Returns the actual number of bytes moved.
*/
static unsigned int
xdr_shrink_pagelen(struct xdr_buf *buf, size_t len)
@@ -455,8 +454,8 @@ xdr_shrink_pagelen(struct xdr_buf *buf, size_t len)
result = 0;
tail = buf->tail;
- BUG_ON (len > pglen);
-
+ if (len > buf->page_len)
+ len = buf-> page_len;
tailbuf_len = buf->buflen - buf->head->iov_len - buf->page_len;
/* Shift the tail first */
@@ -560,7 +559,7 @@ EXPORT_SYMBOL_GPL(xdr_init_encode);
* required at the end of encoding, or any other time when the xdr_buf
* data might be read.
*/
-void xdr_commit_encode(struct xdr_stream *xdr)
+inline void xdr_commit_encode(struct xdr_stream *xdr)
{
int shift = xdr->scratch.iov_len;
void *page;
@@ -1080,7 +1079,7 @@ void xdr_enter_page(struct xdr_stream *xdr, unsigned int len)
}
EXPORT_SYMBOL_GPL(xdr_enter_page);
-static struct kvec empty_iov = {.iov_base = NULL, .iov_len = 0};
+static const struct kvec empty_iov = {.iov_base = NULL, .iov_len = 0};
void
xdr_buf_from_iov(struct kvec *iov, struct xdr_buf *buf)
@@ -1236,43 +1235,60 @@ xdr_encode_word(struct xdr_buf *buf, unsigned int base, u32 obj)
}
EXPORT_SYMBOL_GPL(xdr_encode_word);
-/* If the netobj starting offset bytes from the start of xdr_buf is contained
- * entirely in the head or the tail, set object to point to it; otherwise
- * try to find space for it at the end of the tail, copy it there, and
- * set obj to point to it. */
-int xdr_buf_read_netobj(struct xdr_buf *buf, struct xdr_netobj *obj, unsigned int offset)
+/**
+ * xdr_buf_read_mic() - obtain the address of the GSS mic from xdr buf
+ * @buf: pointer to buffer containing a mic
+ * @mic: on success, returns the address of the mic
+ * @offset: the offset in buf where mic may be found
+ *
+ * This function may modify the xdr buf if the mic is found to be straddling
+ * a boundary between head, pages, and tail. On success the mic can be read
+ * from the address returned. There is no need to free the mic.
+ *
+ * Return: Success returns 0, otherwise an integer error.
+ */
+int xdr_buf_read_mic(struct xdr_buf *buf, struct xdr_netobj *mic, unsigned int offset)
{
struct xdr_buf subbuf;
+ unsigned int boundary;
- if (xdr_decode_word(buf, offset, &obj->len))
+ if (xdr_decode_word(buf, offset, &mic->len))
return -EFAULT;
- if (xdr_buf_subsegment(buf, &subbuf, offset + 4, obj->len))
+ offset += 4;
+
+ /* Is the mic partially in the head? */
+ boundary = buf->head[0].iov_len;
+ if (offset < boundary && (offset + mic->len) > boundary)
+ xdr_shift_buf(buf, boundary - offset);
+
+ /* Is the mic partially in the pages? */
+ boundary += buf->page_len;
+ if (offset < boundary && (offset + mic->len) > boundary)
+ xdr_shrink_pagelen(buf, boundary - offset);
+
+ if (xdr_buf_subsegment(buf, &subbuf, offset, mic->len))
return -EFAULT;
- /* Is the obj contained entirely in the head? */
- obj->data = subbuf.head[0].iov_base;
- if (subbuf.head[0].iov_len == obj->len)
+ /* Is the mic contained entirely in the head? */
+ mic->data = subbuf.head[0].iov_base;
+ if (subbuf.head[0].iov_len == mic->len)
return 0;
- /* ..or is the obj contained entirely in the tail? */
- obj->data = subbuf.tail[0].iov_base;
- if (subbuf.tail[0].iov_len == obj->len)
+ /* ..or is the mic contained entirely in the tail? */
+ mic->data = subbuf.tail[0].iov_base;
+ if (subbuf.tail[0].iov_len == mic->len)
return 0;
- /* use end of tail as storage for obj:
- * (We don't copy to the beginning because then we'd have
- * to worry about doing a potentially overlapping copy.
- * This assumes the object is at most half the length of the
- * tail.) */
- if (obj->len > buf->buflen - buf->len)
+ /* Find a contiguous area in @buf to hold all of @mic */
+ if (mic->len > buf->buflen - buf->len)
return -ENOMEM;
if (buf->tail[0].iov_len != 0)
- obj->data = buf->tail[0].iov_base + buf->tail[0].iov_len;
+ mic->data = buf->tail[0].iov_base + buf->tail[0].iov_len;
else
- obj->data = buf->head[0].iov_base + buf->head[0].iov_len;
- __read_bytes_from_xdr_buf(&subbuf, obj->data, obj->len);
+ mic->data = buf->head[0].iov_base + buf->head[0].iov_len;
+ __read_bytes_from_xdr_buf(&subbuf, mic->data, mic->len);
return 0;
}
-EXPORT_SYMBOL_GPL(xdr_buf_read_netobj);
+EXPORT_SYMBOL_GPL(xdr_buf_read_mic);
/* Returns 0 on success, or else a negative error code. */
static int
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 783748dc5e6f..1aafe8d3f3f4 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -205,20 +205,20 @@ int xprt_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task)
if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) {
if (task == xprt->snd_task)
- return 1;
+ goto out_locked;
goto out_sleep;
}
if (test_bit(XPRT_WRITE_SPACE, &xprt->state))
goto out_unlock;
xprt->snd_task = task;
+out_locked:
+ trace_xprt_reserve_xprt(xprt, task);
return 1;
out_unlock:
xprt_clear_locked(xprt);
out_sleep:
- dprintk("RPC: %5u failed to lock transport %p\n",
- task->tk_pid, xprt);
task->tk_status = -EAGAIN;
if (RPC_IS_SOFT(task))
rpc_sleep_on_timeout(&xprt->sending, task, NULL,
@@ -269,23 +269,22 @@ int xprt_reserve_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task)
if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) {
if (task == xprt->snd_task)
- return 1;
+ goto out_locked;
goto out_sleep;
}
if (req == NULL) {
xprt->snd_task = task;
- return 1;
+ goto out_locked;
}
if (test_bit(XPRT_WRITE_SPACE, &xprt->state))
goto out_unlock;
if (!xprt_need_congestion_window_wait(xprt)) {
xprt->snd_task = task;
- return 1;
+ goto out_locked;
}
out_unlock:
xprt_clear_locked(xprt);
out_sleep:
- dprintk("RPC: %5u failed to lock transport %p\n", task->tk_pid, xprt);
task->tk_status = -EAGAIN;
if (RPC_IS_SOFT(task))
rpc_sleep_on_timeout(&xprt->sending, task, NULL,
@@ -293,6 +292,9 @@ out_sleep:
else
rpc_sleep_on(&xprt->sending, task, NULL);
return 0;
+out_locked:
+ trace_xprt_reserve_cong(xprt, task);
+ return 1;
}
EXPORT_SYMBOL_GPL(xprt_reserve_xprt_cong);
@@ -357,6 +359,7 @@ void xprt_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task)
xprt_clear_locked(xprt);
__xprt_lock_write_next(xprt);
}
+ trace_xprt_release_xprt(xprt, task);
}
EXPORT_SYMBOL_GPL(xprt_release_xprt);
@@ -374,6 +377,7 @@ void xprt_release_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task)
xprt_clear_locked(xprt);
__xprt_lock_write_next_cong(xprt);
}
+ trace_xprt_release_cong(xprt, task);
}
EXPORT_SYMBOL_GPL(xprt_release_xprt_cong);
@@ -395,8 +399,7 @@ __xprt_get_cong(struct rpc_xprt *xprt, struct rpc_rqst *req)
{
if (req->rq_cong)
return 1;
- dprintk("RPC: %5u xprt_cwnd_limited cong = %lu cwnd = %lu\n",
- req->rq_task->tk_pid, xprt->cong, xprt->cwnd);
+ trace_xprt_get_cong(xprt, req->rq_task);
if (RPCXPRT_CONGESTED(xprt)) {
xprt_set_congestion_window_wait(xprt);
return 0;
@@ -418,6 +421,7 @@ __xprt_put_cong(struct rpc_xprt *xprt, struct rpc_rqst *req)
req->rq_cong = 0;
xprt->cong -= RPC_CWNDSCALE;
xprt_test_and_clear_congestion_window_wait(xprt);
+ trace_xprt_put_cong(xprt, req->rq_task);
__xprt_lock_write_next_cong(xprt);
}
@@ -456,6 +460,12 @@ void xprt_release_rqst_cong(struct rpc_task *task)
}
EXPORT_SYMBOL_GPL(xprt_release_rqst_cong);
+static void xprt_clear_congestion_window_wait_locked(struct rpc_xprt *xprt)
+{
+ if (test_and_clear_bit(XPRT_CWND_WAIT, &xprt->state))
+ __xprt_lock_write_next_cong(xprt);
+}
+
/*
* Clear the congestion window wait flag and wake up the next
* entry on xprt->sending
@@ -671,6 +681,7 @@ void xprt_disconnect_done(struct rpc_xprt *xprt)
spin_lock(&xprt->transport_lock);
xprt_clear_connected(xprt);
xprt_clear_write_space_locked(xprt);
+ xprt_clear_congestion_window_wait_locked(xprt);
xprt_wake_pending_tasks(xprt, -ENOTCONN);
spin_unlock(&xprt->transport_lock);
}
@@ -1324,6 +1335,36 @@ xprt_request_dequeue_transmit(struct rpc_task *task)
}
/**
+ * xprt_request_dequeue_xprt - remove a task from the transmit+receive queue
+ * @task: pointer to rpc_task
+ *
+ * Remove a task from the transmit and receive queues, and ensure that
+ * it is not pinned by the receive work item.
+ */
+void
+xprt_request_dequeue_xprt(struct rpc_task *task)
+{
+ struct rpc_rqst *req = task->tk_rqstp;
+ struct rpc_xprt *xprt = req->rq_xprt;
+
+ if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate) ||
+ test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate) ||
+ xprt_is_pinned_rqst(req)) {
+ spin_lock(&xprt->queue_lock);
+ xprt_request_dequeue_transmit_locked(task);
+ xprt_request_dequeue_receive_locked(task);
+ while (xprt_is_pinned_rqst(req)) {
+ set_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate);
+ spin_unlock(&xprt->queue_lock);
+ xprt_wait_on_pinned_rqst(req);
+ spin_lock(&xprt->queue_lock);
+ clear_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate);
+ }
+ spin_unlock(&xprt->queue_lock);
+ }
+}
+
+/**
* xprt_request_prepare - prepare an encoded request for transport
* @req: pointer to rpc_rqst
*
@@ -1408,13 +1449,6 @@ xprt_request_transmit(struct rpc_rqst *req, struct rpc_task *snd_task)
status = -EBADMSG;
goto out_dequeue;
}
- if (task->tk_ops->rpc_call_prepare_transmit) {
- task->tk_ops->rpc_call_prepare_transmit(task,
- task->tk_calldata);
- status = task->tk_status;
- if (status < 0)
- goto out_dequeue;
- }
if (RPC_SIGNALLED(task)) {
status = -ERESTARTSYS;
goto out_dequeue;
@@ -1754,28 +1788,6 @@ void xprt_retry_reserve(struct rpc_task *task)
xprt_do_reserve(xprt, task);
}
-static void
-xprt_request_dequeue_all(struct rpc_task *task, struct rpc_rqst *req)
-{
- struct rpc_xprt *xprt = req->rq_xprt;
-
- if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate) ||
- test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate) ||
- xprt_is_pinned_rqst(req)) {
- spin_lock(&xprt->queue_lock);
- xprt_request_dequeue_transmit_locked(task);
- xprt_request_dequeue_receive_locked(task);
- while (xprt_is_pinned_rqst(req)) {
- set_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate);
- spin_unlock(&xprt->queue_lock);
- xprt_wait_on_pinned_rqst(req);
- spin_lock(&xprt->queue_lock);
- clear_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate);
- }
- spin_unlock(&xprt->queue_lock);
- }
-}
-
/**
* xprt_release - release an RPC request slot
* @task: task which is finished with the slot
@@ -1795,7 +1807,7 @@ void xprt_release(struct rpc_task *task)
}
xprt = req->rq_xprt;
- xprt_request_dequeue_all(task, req);
+ xprt_request_dequeue_xprt(task);
spin_lock(&xprt->transport_lock);
xprt->ops->release_xprt(xprt, task);
if (xprt->ops->release_request)
@@ -1935,6 +1947,11 @@ static void xprt_destroy_cb(struct work_struct *work)
rpc_destroy_wait_queue(&xprt->backlog);
kfree(xprt->servername);
/*
+ * Destroy any existing back channel
+ */
+ xprt_destroy_backchannel(xprt, UINT_MAX);
+
+ /*
* Tear down transport state and free the rpc_xprt
*/
xprt->ops->destroy(xprt);
diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index 59e624b1d7a0..1a0ae0c61353 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -54,9 +54,7 @@ size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *xprt)
unsigned int xprt_rdma_bc_max_slots(struct rpc_xprt *xprt)
{
- struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
-
- return r_xprt->rx_buf.rb_bc_srv_max_requests;
+ return RPCRDMA_BACKWARD_WRS >> 1;
}
static int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst)
@@ -81,7 +79,7 @@ static int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst)
*p = xdr_zero;
if (rpcrdma_prepare_send_sges(r_xprt, req, RPCRDMA_HDRLEN_MIN,
- &rqst->rq_snd_buf, rpcrdma_noch))
+ &rqst->rq_snd_buf, rpcrdma_noch_pullup))
return -EIO;
trace_xprtrdma_cb_reply(rqst);
@@ -165,6 +163,7 @@ void xprt_rdma_bc_free_rqst(struct rpc_rqst *rqst)
spin_lock(&xprt->bc_pa_lock);
list_add_tail(&rqst->rq_bc_pa_list, &xprt->bc_pa_list);
spin_unlock(&xprt->bc_pa_lock);
+ xprt_put(xprt);
}
static struct rpc_rqst *rpcrdma_bc_rqst_get(struct rpcrdma_xprt *r_xprt)
@@ -195,6 +194,10 @@ create_req:
req = rpcrdma_req_create(r_xprt, size, GFP_KERNEL);
if (!req)
return NULL;
+ if (rpcrdma_req_setup(r_xprt, req)) {
+ rpcrdma_req_destroy(req);
+ return NULL;
+ }
xprt->bc_alloc_count++;
rqst = &req->rl_slot;
@@ -261,6 +264,7 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt,
/* Queue rqst for ULP's callback service */
bc_serv = xprt->bc_serv;
+ xprt_get(xprt);
spin_lock(&bc_serv->sv_cb_lock);
list_add(&rqst->rq_bc_list, &bc_serv->sv_cb_list);
spin_unlock(&bc_serv->sv_cb_lock);
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 0b6dad7580a1..125297c9aa3e 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -7,67 +7,37 @@
/* Lightweight memory registration using Fast Registration Work
* Requests (FRWR).
*
- * FRWR features ordered asynchronous registration and deregistration
- * of arbitrarily sized memory regions. This is the fastest and safest
+ * FRWR features ordered asynchronous registration and invalidation
+ * of arbitrarily-sized memory regions. This is the fastest and safest
* but most complex memory registration mode.
*/
/* Normal operation
*
- * A Memory Region is prepared for RDMA READ or WRITE using a FAST_REG
+ * A Memory Region is prepared for RDMA Read or Write using a FAST_REG
* Work Request (frwr_map). When the RDMA operation is finished, this
* Memory Region is invalidated using a LOCAL_INV Work Request
- * (frwr_unmap_sync).
+ * (frwr_unmap_async and frwr_unmap_sync).
*
- * Typically these Work Requests are not signaled, and neither are RDMA
- * SEND Work Requests (with the exception of signaling occasionally to
- * prevent provider work queue overflows). This greatly reduces HCA
+ * Typically FAST_REG Work Requests are not signaled, and neither are
+ * RDMA Send Work Requests (with the exception of signaling occasionally
+ * to prevent provider work queue overflows). This greatly reduces HCA
* interrupt workload.
- *
- * As an optimization, frwr_unmap marks MRs INVALID before the
- * LOCAL_INV WR is posted. If posting succeeds, the MR is placed on
- * rb_mrs immediately so that no work (like managing a linked list
- * under a spinlock) is needed in the completion upcall.
- *
- * But this means that frwr_map() can occasionally encounter an MR
- * that is INVALID but the LOCAL_INV WR has not completed. Work Queue
- * ordering prevents a subsequent FAST_REG WR from executing against
- * that MR while it is still being invalidated.
*/
/* Transport recovery
*
- * ->op_map and the transport connect worker cannot run at the same
- * time, but ->op_unmap can fire while the transport connect worker
- * is running. Thus MR recovery is handled in ->op_map, to guarantee
- * that recovered MRs are owned by a sending RPC, and not one where
- * ->op_unmap could fire at the same time transport reconnect is
- * being done.
- *
- * When the underlying transport disconnects, MRs are left in one of
- * four states:
- *
- * INVALID: The MR was not in use before the QP entered ERROR state.
- *
- * VALID: The MR was registered before the QP entered ERROR state.
- *
- * FLUSHED_FR: The MR was being registered when the QP entered ERROR
- * state, and the pending WR was flushed.
- *
- * FLUSHED_LI: The MR was being invalidated when the QP entered ERROR
- * state, and the pending WR was flushed.
- *
- * When frwr_map encounters FLUSHED and VALID MRs, they are recovered
- * with ib_dereg_mr and then are re-initialized. Because MR recovery
- * allocates fresh resources, it is deferred to a workqueue, and the
- * recovered MRs are placed back on the rb_mrs list when recovery is
- * complete. frwr_map allocates another MR for the current RPC while
- * the broken MR is reset.
- *
- * To ensure that frwr_map doesn't encounter an MR that is marked
- * INVALID but that is about to be flushed due to a previous transport
- * disconnect, the transport connect worker attempts to drain all
- * pending send queue WRs before the transport is reconnected.
+ * frwr_map and frwr_unmap_* cannot run at the same time the transport
+ * connect worker is running. The connect worker holds the transport
+ * send lock, just as ->send_request does. This prevents frwr_map and
+ * the connect worker from running concurrently. When a connection is
+ * closed, the Receive completion queue is drained before the allowing
+ * the connect worker to get control. This prevents frwr_unmap and the
+ * connect worker from running concurrently.
+ *
+ * When the underlying transport disconnects, MRs that are in flight
+ * are flushed and are likely unusable. Thus all MRs are destroyed.
+ * New MRs are created on demand.
*/
#include <linux/sunrpc/rpc_rdma.h>
@@ -81,28 +51,6 @@
#endif
/**
- * frwr_is_supported - Check if device supports FRWR
- * @device: interface adapter to check
- *
- * Returns true if device supports FRWR, otherwise false
- */
-bool frwr_is_supported(struct ib_device *device)
-{
- struct ib_device_attr *attrs = &device->attrs;
-
- if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS))
- goto out_not_supported;
- if (attrs->max_fast_reg_page_list_len == 0)
- goto out_not_supported;
- return true;
-
-out_not_supported:
- pr_info("rpcrdma: 'frwr' mode is not supported by device %s\n",
- device->name);
- return false;
-}
-
-/**
* frwr_release_mr - Destroy one MR
* @mr: MR allocated by frwr_init_mr
*
@@ -118,13 +66,8 @@ void frwr_release_mr(struct rpcrdma_mr *mr)
kfree(mr);
}
-/* MRs are dynamically allocated, so simply clean up and release the MR.
- * A replacement MR will subsequently be allocated on demand.
- */
-static void
-frwr_mr_recycle_worker(struct work_struct *work)
+static void frwr_mr_recycle(struct rpcrdma_mr *mr)
{
- struct rpcrdma_mr *mr = container_of(work, struct rpcrdma_mr, mr_recycle);
struct rpcrdma_xprt *r_xprt = mr->mr_xprt;
trace_xprtrdma_mr_recycle(mr);
@@ -136,10 +79,10 @@ frwr_mr_recycle_worker(struct work_struct *work)
mr->mr_dir = DMA_NONE;
}
- spin_lock(&r_xprt->rx_buf.rb_mrlock);
+ spin_lock(&r_xprt->rx_buf.rb_lock);
list_del(&mr->mr_all);
r_xprt->rx_stats.mrs_recycled++;
- spin_unlock(&r_xprt->rx_buf.rb_mrlock);
+ spin_unlock(&r_xprt->rx_buf.rb_lock);
frwr_release_mr(mr);
}
@@ -156,12 +99,10 @@ frwr_mr_recycle_worker(struct work_struct *work)
*/
void frwr_reset(struct rpcrdma_req *req)
{
- while (!list_empty(&req->rl_registered)) {
- struct rpcrdma_mr *mr;
+ struct rpcrdma_mr *mr;
- mr = rpcrdma_mr_pop(&req->rl_registered);
- rpcrdma_mr_unmap_and_put(mr);
- }
+ while ((mr = rpcrdma_mr_pop(&req->rl_registered)))
+ rpcrdma_mr_put(mr);
}
/**
@@ -183,14 +124,13 @@ int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr)
if (IS_ERR(frmr))
goto out_mr_err;
- sg = kcalloc(depth, sizeof(*sg), GFP_KERNEL);
+ sg = kcalloc(depth, sizeof(*sg), GFP_NOFS);
if (!sg)
goto out_list_err;
mr->frwr.fr_mr = frmr;
mr->mr_dir = DMA_NONE;
INIT_LIST_HEAD(&mr->mr_list);
- INIT_WORK(&mr->mr_recycle, frwr_mr_recycle_worker);
init_completion(&mr->frwr.fr_linv_done);
sg_init_table(sg, depth);
@@ -203,33 +143,53 @@ out_mr_err:
return rc;
out_list_err:
- dprintk("RPC: %s: sg allocation failure\n",
- __func__);
ib_dereg_mr(frmr);
return -ENOMEM;
}
/**
- * frwr_open - Prepare an endpoint for use with FRWR
- * @ia: interface adapter this endpoint will use
- * @ep: endpoint to prepare
+ * frwr_query_device - Prepare a transport for use with FRWR
+ * @r_xprt: controlling transport instance
+ * @device: RDMA device to query
*
* On success, sets:
- * ep->rep_attr.cap.max_send_wr
- * ep->rep_attr.cap.max_recv_wr
+ * ep->rep_attr
* ep->rep_max_requests
- * ia->ri_max_segs
+ * ia->ri_max_rdma_segs
*
* And these FRWR-related fields:
* ia->ri_max_frwr_depth
* ia->ri_mrtype
*
- * On failure, a negative errno is returned.
+ * Return values:
+ * On success, returns zero.
+ * %-EINVAL - the device does not support FRWR memory registration
+ * %-ENOMEM - the device is not sufficiently capable for NFS/RDMA
*/
-int frwr_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep)
+int frwr_query_device(struct rpcrdma_xprt *r_xprt,
+ const struct ib_device *device)
{
- struct ib_device_attr *attrs = &ia->ri_id->device->attrs;
+ const struct ib_device_attr *attrs = &device->attrs;
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+ struct rpcrdma_ep *ep = &r_xprt->rx_ep;
int max_qp_wr, depth, delta;
+ unsigned int max_sge;
+
+ if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) ||
+ attrs->max_fast_reg_page_list_len == 0) {
+ pr_err("rpcrdma: 'frwr' mode is not supported by device %s\n",
+ device->name);
+ return -EINVAL;
+ }
+
+ max_sge = min_t(unsigned int, attrs->max_send_sge,
+ RPCRDMA_MAX_SEND_SGES);
+ if (max_sge < RPCRDMA_MIN_SEND_SGES) {
+ pr_err("rpcrdma: HCA provides only %u send SGEs\n", max_sge);
+ return -ENOMEM;
+ }
+ ep->rep_attr.cap.max_send_sge = max_sge;
+ ep->rep_attr.cap.max_recv_sge = 1;
ia->ri_mrtype = IB_MR_TYPE_MEM_REG;
if (attrs->device_cap_flags & IB_DEVICE_SG_GAPS_REG)
@@ -239,14 +199,12 @@ int frwr_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep)
* capability, but perform optimally when the MRs are not larger
* than a page.
*/
- if (attrs->max_sge_rd > 1)
+ if (attrs->max_sge_rd > RPCRDMA_MAX_HDR_SEGS)
ia->ri_max_frwr_depth = attrs->max_sge_rd;
else
ia->ri_max_frwr_depth = attrs->max_fast_reg_page_list_len;
if (ia->ri_max_frwr_depth > RPCRDMA_MAX_DATA_SEGS)
ia->ri_max_frwr_depth = RPCRDMA_MAX_DATA_SEGS;
- dprintk("RPC: %s: max FR page list depth = %u\n",
- __func__, ia->ri_max_frwr_depth);
/* Add room for frwr register and invalidate WRs.
* 1. FRWR reg WR for head
@@ -270,7 +228,7 @@ int frwr_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep)
} while (delta > 0);
}
- max_qp_wr = ia->ri_id->device->attrs.max_qp_wr;
+ max_qp_wr = attrs->max_qp_wr;
max_qp_wr -= RPCRDMA_BACKWARD_WRS;
max_qp_wr -= 1;
if (max_qp_wr < RPCRDMA_MIN_SLOT_TABLE)
@@ -281,7 +239,7 @@ int frwr_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep)
if (ep->rep_attr.cap.max_send_wr > max_qp_wr) {
ep->rep_max_requests = max_qp_wr / depth;
if (!ep->rep_max_requests)
- return -EINVAL;
+ return -ENOMEM;
ep->rep_attr.cap.max_send_wr = ep->rep_max_requests * depth;
}
ep->rep_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS;
@@ -290,30 +248,22 @@ int frwr_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep)
ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
ep->rep_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */
- ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS /
- ia->ri_max_frwr_depth);
+ ia->ri_max_rdma_segs =
+ DIV_ROUND_UP(RPCRDMA_MAX_DATA_SEGS, ia->ri_max_frwr_depth);
/* Reply chunks require segments for head and tail buffers */
- ia->ri_max_segs += 2;
- if (ia->ri_max_segs > RPCRDMA_MAX_HDR_SEGS)
- ia->ri_max_segs = RPCRDMA_MAX_HDR_SEGS;
- return 0;
-}
-
-/**
- * frwr_maxpages - Compute size of largest payload
- * @r_xprt: transport
- *
- * Returns maximum size of an RPC message, in pages.
- *
- * FRWR mode conveys a list of pages per chunk segment. The
- * maximum length of that list is the FRWR page list depth.
- */
-size_t frwr_maxpages(struct rpcrdma_xprt *r_xprt)
-{
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+ ia->ri_max_rdma_segs += 2;
+ if (ia->ri_max_rdma_segs > RPCRDMA_MAX_HDR_SEGS)
+ ia->ri_max_rdma_segs = RPCRDMA_MAX_HDR_SEGS;
+
+ /* Ensure the underlying device is capable of conveying the
+ * largest r/wsize NFS will ask for. This guarantees that
+ * failing over from one RDMA device to another will not
+ * break NFS I/O.
+ */
+ if ((ia->ri_max_rdma_segs * ia->ri_max_frwr_depth) < RPCRDMA_MAX_SEGS)
+ return -ENOMEM;
- return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
- (ia->ri_max_segs - 2) * ia->ri_max_frwr_depth);
+ return 0;
}
/**
@@ -323,31 +273,25 @@ size_t frwr_maxpages(struct rpcrdma_xprt *r_xprt)
* @nsegs: number of segments remaining
* @writing: true when RDMA Write will be used
* @xid: XID of RPC using the registered memory
- * @out: initialized MR
+ * @mr: MR to fill in
*
* Prepare a REG_MR Work Request to register a memory region
* for remote access via RDMA READ or RDMA WRITE.
*
* Returns the next segment or a negative errno pointer.
- * On success, the prepared MR is planted in @out.
+ * On success, @mr is filled in.
*/
struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
struct rpcrdma_mr_seg *seg,
int nsegs, bool writing, __be32 xid,
- struct rpcrdma_mr **out)
+ struct rpcrdma_mr *mr)
{
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- bool holes_ok = ia->ri_mrtype == IB_MR_TYPE_SG_GAPS;
- struct rpcrdma_mr *mr;
- struct ib_mr *ibmr;
struct ib_reg_wr *reg_wr;
- int i, n;
+ int i, n, dma_nents;
+ struct ib_mr *ibmr;
u8 key;
- mr = rpcrdma_mr_get(r_xprt);
- if (!mr)
- goto out_getmr_err;
-
if (nsegs > ia->ri_max_frwr_depth)
nsegs = ia->ri_max_frwr_depth;
for (i = 0; i < nsegs;) {
@@ -362,22 +306,23 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
++seg;
++i;
- if (holes_ok)
+ if (ia->ri_mrtype == IB_MR_TYPE_SG_GAPS)
continue;
if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
break;
}
mr->mr_dir = rpcrdma_data_dir(writing);
+ mr->mr_nents = i;
- mr->mr_nents =
- ib_dma_map_sg(ia->ri_id->device, mr->mr_sg, i, mr->mr_dir);
- if (!mr->mr_nents)
+ dma_nents = ib_dma_map_sg(ia->ri_id->device, mr->mr_sg, mr->mr_nents,
+ mr->mr_dir);
+ if (!dma_nents)
goto out_dmamap_err;
ibmr = mr->frwr.fr_mr;
- n = ib_map_mr_sg(ibmr, mr->mr_sg, mr->mr_nents, NULL, PAGE_SIZE);
- if (unlikely(n != mr->mr_nents))
+ n = ib_map_mr_sg(ibmr, mr->mr_sg, dma_nents, NULL, PAGE_SIZE);
+ if (n != dma_nents)
goto out_mapmr_err;
ibmr->iova &= 0x00000000ffffffff;
@@ -397,22 +342,15 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
mr->mr_offset = ibmr->iova;
trace_xprtrdma_mr_map(mr);
- *out = mr;
return seg;
-out_getmr_err:
- xprt_wait_for_buffer_space(&r_xprt->rx_xprt);
- return ERR_PTR(-EAGAIN);
-
out_dmamap_err:
mr->mr_dir = DMA_NONE;
trace_xprtrdma_frwr_sgerr(mr, i);
- rpcrdma_mr_put(mr);
return ERR_PTR(-EIO);
out_mapmr_err:
trace_xprtrdma_frwr_maperr(mr, n);
- rpcrdma_mr_recycle(mr);
return ERR_PTR(-EIO);
}
@@ -449,7 +387,7 @@ int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
struct ib_send_wr *post_wr;
struct rpcrdma_mr *mr;
- post_wr = &req->rl_sendctx->sc_wr;
+ post_wr = &req->rl_wr;
list_for_each_entry(mr, &req->rl_registered, mr_list) {
struct rpcrdma_frwr *frwr;
@@ -465,9 +403,6 @@ int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
post_wr = &frwr->fr_regwr.wr;
}
- /* If ib_post_send fails, the next ->send_request for
- * @req will queue these MRs for recovery.
- */
return ib_post_send(ia->ri_id->qp, post_wr, NULL);
}
@@ -485,7 +420,7 @@ void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs)
if (mr->mr_handle == rep->rr_inv_rkey) {
list_del_init(&mr->mr_list);
trace_xprtrdma_mr_remoteinv(mr);
- rpcrdma_mr_unmap_and_put(mr);
+ rpcrdma_mr_put(mr);
break; /* only one invalidated MR per RPC */
}
}
@@ -493,9 +428,9 @@ void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs)
static void __frwr_release_mr(struct ib_wc *wc, struct rpcrdma_mr *mr)
{
if (wc->status != IB_WC_SUCCESS)
- rpcrdma_mr_recycle(mr);
+ frwr_mr_recycle(mr);
else
- rpcrdma_mr_unmap_and_put(mr);
+ rpcrdma_mr_put(mr);
}
/**
@@ -532,8 +467,8 @@ static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc)
/* WARNING: Only wr_cqe and status are reliable at this point */
trace_xprtrdma_wc_li_wake(wc, frwr);
- complete(&frwr->fr_linv_done);
__frwr_release_mr(wc, mr);
+ complete(&frwr->fr_linv_done);
}
/**
@@ -562,8 +497,7 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
*/
frwr = NULL;
prev = &first;
- while (!list_empty(&req->rl_registered)) {
- mr = rpcrdma_mr_pop(&req->rl_registered);
+ while ((mr = rpcrdma_mr_pop(&req->rl_registered))) {
trace_xprtrdma_mr_localinv(mr);
r_xprt->rx_stats.local_inv_needed++;
@@ -596,7 +530,6 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
*/
bad_wr = NULL;
rc = ib_post_send(r_xprt->rx_ia.ri_id->qp, first, &bad_wr);
- trace_xprtrdma_post_send(req, rc);
/* The final LOCAL_INV WR in the chain is supposed to
* do the wake. If it was never posted, the wake will
@@ -609,6 +542,7 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
/* Recycle MRs in the LOCAL_INV chain that did not get posted.
*/
+ trace_xprtrdma_post_linv(req, rc);
while (bad_wr) {
frwr = container_of(bad_wr, struct rpcrdma_frwr,
fr_invwr);
@@ -616,7 +550,7 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
bad_wr = bad_wr->next;
list_del_init(&mr->mr_list);
- rpcrdma_mr_recycle(mr);
+ frwr_mr_recycle(mr);
}
}
@@ -632,11 +566,15 @@ static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc)
struct rpcrdma_frwr *frwr =
container_of(cqe, struct rpcrdma_frwr, fr_cqe);
struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr);
+ struct rpcrdma_rep *rep = mr->mr_req->rl_reply;
/* WARNING: Only wr_cqe and status are reliable at this point */
trace_xprtrdma_wc_li_done(wc, frwr);
- rpcrdma_complete_rqst(frwr->fr_req->rl_reply);
__frwr_release_mr(wc, mr);
+
+ /* Ensure @rep is generated before __frwr_release_mr */
+ smp_rmb();
+ rpcrdma_complete_rqst(rep);
}
/**
@@ -662,15 +600,13 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
*/
frwr = NULL;
prev = &first;
- while (!list_empty(&req->rl_registered)) {
- mr = rpcrdma_mr_pop(&req->rl_registered);
+ while ((mr = rpcrdma_mr_pop(&req->rl_registered))) {
trace_xprtrdma_mr_localinv(mr);
r_xprt->rx_stats.local_inv_needed++;
frwr = &mr->frwr;
frwr->fr_cqe.done = frwr_wc_localinv;
- frwr->fr_req = req;
last = &frwr->fr_invwr;
last->next = NULL;
last->wr_cqe = &frwr->fr_cqe;
@@ -697,18 +633,18 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
*/
bad_wr = NULL;
rc = ib_post_send(r_xprt->rx_ia.ri_id->qp, first, &bad_wr);
- trace_xprtrdma_post_send(req, rc);
if (!rc)
return;
/* Recycle MRs in the LOCAL_INV chain that did not get posted.
*/
+ trace_xprtrdma_post_linv(req, rc);
while (bad_wr) {
frwr = container_of(bad_wr, struct rpcrdma_frwr, fr_invwr);
mr = container_of(frwr, struct rpcrdma_mr, frwr);
bad_wr = bad_wr->next;
- rpcrdma_mr_recycle(mr);
+ frwr_mr_recycle(mr);
}
/* The final LOCAL_INV WR in the chain is supposed to
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 4345e6912392..28020ec104d4 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -78,8 +78,6 @@ static unsigned int rpcrdma_max_call_header_size(unsigned int maxsegs)
size += rpcrdma_segment_maxsz * sizeof(__be32);
size += sizeof(__be32); /* list discriminator */
- dprintk("RPC: %s: max call header size = %u\n",
- __func__, size);
return size;
}
@@ -100,8 +98,6 @@ static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs)
size += maxsegs * rpcrdma_segment_maxsz * sizeof(__be32);
size += sizeof(__be32); /* list discriminator */
- dprintk("RPC: %s: max reply header size = %u\n",
- __func__, size);
return size;
}
@@ -115,7 +111,7 @@ static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs)
*/
void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *r_xprt)
{
- unsigned int maxsegs = r_xprt->rx_ia.ri_max_segs;
+ unsigned int maxsegs = r_xprt->rx_ia.ri_max_rdma_segs;
struct rpcrdma_ep *ep = &r_xprt->rx_ep;
ep->rep_max_inline_send =
@@ -149,7 +145,7 @@ static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt,
remaining -= min_t(unsigned int,
PAGE_SIZE - offset, remaining);
offset = 0;
- if (++count > r_xprt->rx_ia.ri_max_send_sges)
+ if (++count > r_xprt->rx_ep.rep_attr.cap.max_send_sge)
return false;
}
}
@@ -342,6 +338,31 @@ encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr,
return 0;
}
+static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_req *req,
+ struct rpcrdma_mr_seg *seg,
+ int nsegs, bool writing,
+ struct rpcrdma_mr **mr)
+{
+ *mr = rpcrdma_mr_pop(&req->rl_free_mrs);
+ if (!*mr) {
+ *mr = rpcrdma_mr_get(r_xprt);
+ if (!*mr)
+ goto out_getmr_err;
+ trace_xprtrdma_mr_get(req);
+ (*mr)->mr_req = req;
+ }
+
+ rpcrdma_mr_push(*mr, &req->rl_registered);
+ return frwr_map(r_xprt, seg, nsegs, writing, req->rl_slot.rq_xid, *mr);
+
+out_getmr_err:
+ trace_xprtrdma_nomrs(req);
+ xprt_wait_for_buffer_space(&r_xprt->rx_xprt);
+ rpcrdma_mrs_refresh(r_xprt);
+ return ERR_PTR(-EAGAIN);
+}
+
/* Register and XDR encode the Read list. Supports encoding a list of read
* segments that belong to a single read chunk.
*
@@ -356,9 +377,10 @@ encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr,
*
* Only a single @pos value is currently supported.
*/
-static noinline int
-rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
- struct rpc_rqst *rqst, enum rpcrdma_chunktype rtype)
+static int rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_req *req,
+ struct rpc_rqst *rqst,
+ enum rpcrdma_chunktype rtype)
{
struct xdr_stream *xdr = &req->rl_stream;
struct rpcrdma_mr_seg *seg;
@@ -366,7 +388,7 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
unsigned int pos;
int nsegs;
- if (rtype == rpcrdma_noch)
+ if (rtype == rpcrdma_noch_pullup || rtype == rpcrdma_noch_mapped)
goto done;
pos = rqst->rq_snd_buf.head[0].iov_len;
@@ -379,10 +401,9 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
return nsegs;
do {
- seg = frwr_map(r_xprt, seg, nsegs, false, rqst->rq_xid, &mr);
+ seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, false, &mr);
if (IS_ERR(seg))
return PTR_ERR(seg);
- rpcrdma_mr_push(mr, &req->rl_registered);
if (encode_read_segment(xdr, mr, pos) < 0)
return -EMSGSIZE;
@@ -411,9 +432,10 @@ done:
*
* Only a single Write chunk is currently supported.
*/
-static noinline int
-rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
- struct rpc_rqst *rqst, enum rpcrdma_chunktype wtype)
+static int rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_req *req,
+ struct rpc_rqst *rqst,
+ enum rpcrdma_chunktype wtype)
{
struct xdr_stream *xdr = &req->rl_stream;
struct rpcrdma_mr_seg *seg;
@@ -440,10 +462,9 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
nchunks = 0;
do {
- seg = frwr_map(r_xprt, seg, nsegs, true, rqst->rq_xid, &mr);
+ seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, true, &mr);
if (IS_ERR(seg))
return PTR_ERR(seg);
- rpcrdma_mr_push(mr, &req->rl_registered);
if (encode_rdma_segment(xdr, mr) < 0)
return -EMSGSIZE;
@@ -474,9 +495,10 @@ done:
* Returns zero on success, or a negative errno if a failure occurred.
* @xdr is advanced to the next position in the stream.
*/
-static noinline int
-rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
- struct rpc_rqst *rqst, enum rpcrdma_chunktype wtype)
+static int rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_req *req,
+ struct rpc_rqst *rqst,
+ enum rpcrdma_chunktype wtype)
{
struct xdr_stream *xdr = &req->rl_stream;
struct rpcrdma_mr_seg *seg;
@@ -501,10 +523,9 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
nchunks = 0;
do {
- seg = frwr_map(r_xprt, seg, nsegs, true, rqst->rq_xid, &mr);
+ seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, true, &mr);
if (IS_ERR(seg))
return PTR_ERR(seg);
- rpcrdma_mr_push(mr, &req->rl_registered);
if (encode_rdma_segment(xdr, mr) < 0)
return -EMSGSIZE;
@@ -539,6 +560,7 @@ static void rpcrdma_sendctx_done(struct kref *kref)
*/
void rpcrdma_sendctx_unmap(struct rpcrdma_sendctx *sc)
{
+ struct rpcrdma_regbuf *rb = sc->sc_req->rl_sendbuf;
struct ib_sge *sge;
if (!sc->sc_unmap_count)
@@ -550,7 +572,7 @@ void rpcrdma_sendctx_unmap(struct rpcrdma_sendctx *sc)
*/
for (sge = &sc->sc_sges[2]; sc->sc_unmap_count;
++sge, --sc->sc_unmap_count)
- ib_dma_unmap_page(sc->sc_device, sge->addr, sge->length,
+ ib_dma_unmap_page(rdmab_device(rb), sge->addr, sge->length,
DMA_TO_DEVICE);
kref_put(&sc->sc_req->rl_kref, rpcrdma_sendctx_done);
@@ -558,152 +580,228 @@ void rpcrdma_sendctx_unmap(struct rpcrdma_sendctx *sc)
/* Prepare an SGE for the RPC-over-RDMA transport header.
*/
-static bool rpcrdma_prepare_hdr_sge(struct rpcrdma_xprt *r_xprt,
+static void rpcrdma_prepare_hdr_sge(struct rpcrdma_xprt *r_xprt,
struct rpcrdma_req *req, u32 len)
{
struct rpcrdma_sendctx *sc = req->rl_sendctx;
struct rpcrdma_regbuf *rb = req->rl_rdmabuf;
- struct ib_sge *sge = sc->sc_sges;
+ struct ib_sge *sge = &sc->sc_sges[req->rl_wr.num_sge++];
+
+ sge->addr = rdmab_addr(rb);
+ sge->length = len;
+ sge->lkey = rdmab_lkey(rb);
+
+ ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr, sge->length,
+ DMA_TO_DEVICE);
+}
+
+/* The head iovec is straightforward, as it is usually already
+ * DMA-mapped. Sync the content that has changed.
+ */
+static bool rpcrdma_prepare_head_iov(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_req *req, unsigned int len)
+{
+ struct rpcrdma_sendctx *sc = req->rl_sendctx;
+ struct ib_sge *sge = &sc->sc_sges[req->rl_wr.num_sge++];
+ struct rpcrdma_regbuf *rb = req->rl_sendbuf;
if (!rpcrdma_regbuf_dma_map(r_xprt, rb))
- goto out_regbuf;
+ return false;
+
sge->addr = rdmab_addr(rb);
sge->length = len;
sge->lkey = rdmab_lkey(rb);
ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr, sge->length,
DMA_TO_DEVICE);
- sc->sc_wr.num_sge++;
return true;
+}
-out_regbuf:
- pr_err("rpcrdma: failed to DMA map a Send buffer\n");
+/* If there is a page list present, DMA map and prepare an
+ * SGE for each page to be sent.
+ */
+static bool rpcrdma_prepare_pagelist(struct rpcrdma_req *req,
+ struct xdr_buf *xdr)
+{
+ struct rpcrdma_sendctx *sc = req->rl_sendctx;
+ struct rpcrdma_regbuf *rb = req->rl_sendbuf;
+ unsigned int page_base, len, remaining;
+ struct page **ppages;
+ struct ib_sge *sge;
+
+ ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
+ page_base = offset_in_page(xdr->page_base);
+ remaining = xdr->page_len;
+ while (remaining) {
+ sge = &sc->sc_sges[req->rl_wr.num_sge++];
+ len = min_t(unsigned int, PAGE_SIZE - page_base, remaining);
+ sge->addr = ib_dma_map_page(rdmab_device(rb), *ppages,
+ page_base, len, DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(rdmab_device(rb), sge->addr))
+ goto out_mapping_err;
+
+ sge->length = len;
+ sge->lkey = rdmab_lkey(rb);
+
+ sc->sc_unmap_count++;
+ ppages++;
+ remaining -= len;
+ page_base = 0;
+ }
+
+ return true;
+
+out_mapping_err:
+ trace_xprtrdma_dma_maperr(sge->addr);
return false;
}
-/* Prepare the Send SGEs. The head and tail iovec, and each entry
- * in the page list, gets its own SGE.
+/* The tail iovec may include an XDR pad for the page list,
+ * as well as additional content, and may not reside in the
+ * same page as the head iovec.
*/
-static bool rpcrdma_prepare_msg_sges(struct rpcrdma_xprt *r_xprt,
- struct rpcrdma_req *req,
+static bool rpcrdma_prepare_tail_iov(struct rpcrdma_req *req,
struct xdr_buf *xdr,
- enum rpcrdma_chunktype rtype)
+ unsigned int page_base, unsigned int len)
{
struct rpcrdma_sendctx *sc = req->rl_sendctx;
- unsigned int sge_no, page_base, len, remaining;
+ struct ib_sge *sge = &sc->sc_sges[req->rl_wr.num_sge++];
struct rpcrdma_regbuf *rb = req->rl_sendbuf;
- struct ib_sge *sge = sc->sc_sges;
- struct page *page, **ppages;
+ struct page *page = virt_to_page(xdr->tail[0].iov_base);
- /* The head iovec is straightforward, as it is already
- * DMA-mapped. Sync the content that has changed.
- */
- if (!rpcrdma_regbuf_dma_map(r_xprt, rb))
- goto out_regbuf;
- sc->sc_device = rdmab_device(rb);
- sge_no = 1;
- sge[sge_no].addr = rdmab_addr(rb);
- sge[sge_no].length = xdr->head[0].iov_len;
- sge[sge_no].lkey = rdmab_lkey(rb);
- ib_dma_sync_single_for_device(rdmab_device(rb), sge[sge_no].addr,
- sge[sge_no].length, DMA_TO_DEVICE);
+ sge->addr = ib_dma_map_page(rdmab_device(rb), page, page_base, len,
+ DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(rdmab_device(rb), sge->addr))
+ goto out_mapping_err;
- /* If there is a Read chunk, the page list is being handled
- * via explicit RDMA, and thus is skipped here. However, the
- * tail iovec may include an XDR pad for the page list, as
- * well as additional content, and may not reside in the
- * same page as the head iovec.
- */
- if (rtype == rpcrdma_readch) {
- len = xdr->tail[0].iov_len;
+ sge->length = len;
+ sge->lkey = rdmab_lkey(rb);
+ ++sc->sc_unmap_count;
+ return true;
- /* Do not include the tail if it is only an XDR pad */
- if (len < 4)
- goto out;
+out_mapping_err:
+ trace_xprtrdma_dma_maperr(sge->addr);
+ return false;
+}
- page = virt_to_page(xdr->tail[0].iov_base);
- page_base = offset_in_page(xdr->tail[0].iov_base);
+/* Copy the tail to the end of the head buffer.
+ */
+static void rpcrdma_pullup_tail_iov(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_req *req,
+ struct xdr_buf *xdr)
+{
+ unsigned char *dst;
- /* If the content in the page list is an odd length,
- * xdr_write_pages() has added a pad at the beginning
- * of the tail iovec. Force the tail's non-pad content
- * to land at the next XDR position in the Send message.
- */
- page_base += len & 3;
- len -= len & 3;
- goto map_tail;
- }
+ dst = (unsigned char *)xdr->head[0].iov_base;
+ dst += xdr->head[0].iov_len + xdr->page_len;
+ memmove(dst, xdr->tail[0].iov_base, xdr->tail[0].iov_len);
+ r_xprt->rx_stats.pullup_copy_count += xdr->tail[0].iov_len;
+}
- /* If there is a page list present, temporarily DMA map
- * and prepare an SGE for each page to be sent.
- */
- if (xdr->page_len) {
- ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
- page_base = offset_in_page(xdr->page_base);
- remaining = xdr->page_len;
- while (remaining) {
- sge_no++;
- if (sge_no > RPCRDMA_MAX_SEND_SGES - 2)
- goto out_mapping_overflow;
-
- len = min_t(u32, PAGE_SIZE - page_base, remaining);
- sge[sge_no].addr =
- ib_dma_map_page(rdmab_device(rb), *ppages,
- page_base, len, DMA_TO_DEVICE);
- if (ib_dma_mapping_error(rdmab_device(rb),
- sge[sge_no].addr))
- goto out_mapping_err;
- sge[sge_no].length = len;
- sge[sge_no].lkey = rdmab_lkey(rb);
-
- sc->sc_unmap_count++;
- ppages++;
- remaining -= len;
- page_base = 0;
- }
+/* Copy pagelist content into the head buffer.
+ */
+static void rpcrdma_pullup_pagelist(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_req *req,
+ struct xdr_buf *xdr)
+{
+ unsigned int len, page_base, remaining;
+ struct page **ppages;
+ unsigned char *src, *dst;
+
+ dst = (unsigned char *)xdr->head[0].iov_base;
+ dst += xdr->head[0].iov_len;
+ ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
+ page_base = offset_in_page(xdr->page_base);
+ remaining = xdr->page_len;
+ while (remaining) {
+ src = page_address(*ppages);
+ src += page_base;
+ len = min_t(unsigned int, PAGE_SIZE - page_base, remaining);
+ memcpy(dst, src, len);
+ r_xprt->rx_stats.pullup_copy_count += len;
+
+ ppages++;
+ dst += len;
+ remaining -= len;
+ page_base = 0;
}
+}
- /* The tail iovec is not always constructed in the same
- * page where the head iovec resides (see, for example,
- * gss_wrap_req_priv). To neatly accommodate that case,
- * DMA map it separately.
- */
- if (xdr->tail[0].iov_len) {
- page = virt_to_page(xdr->tail[0].iov_base);
- page_base = offset_in_page(xdr->tail[0].iov_base);
- len = xdr->tail[0].iov_len;
+/* Copy the contents of @xdr into @rl_sendbuf and DMA sync it.
+ * When the head, pagelist, and tail are small, a pull-up copy
+ * is considerably less costly than DMA mapping the components
+ * of @xdr.
+ *
+ * Assumptions:
+ * - the caller has already verified that the total length
+ * of the RPC Call body will fit into @rl_sendbuf.
+ */
+static bool rpcrdma_prepare_noch_pullup(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_req *req,
+ struct xdr_buf *xdr)
+{
+ if (unlikely(xdr->tail[0].iov_len))
+ rpcrdma_pullup_tail_iov(r_xprt, req, xdr);
-map_tail:
- sge_no++;
- sge[sge_no].addr =
- ib_dma_map_page(rdmab_device(rb), page, page_base, len,
- DMA_TO_DEVICE);
- if (ib_dma_mapping_error(rdmab_device(rb), sge[sge_no].addr))
- goto out_mapping_err;
- sge[sge_no].length = len;
- sge[sge_no].lkey = rdmab_lkey(rb);
- sc->sc_unmap_count++;
- }
+ if (unlikely(xdr->page_len))
+ rpcrdma_pullup_pagelist(r_xprt, req, xdr);
-out:
- sc->sc_wr.num_sge += sge_no;
- if (sc->sc_unmap_count)
+ /* The whole RPC message resides in the head iovec now */
+ return rpcrdma_prepare_head_iov(r_xprt, req, xdr->len);
+}
+
+static bool rpcrdma_prepare_noch_mapped(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_req *req,
+ struct xdr_buf *xdr)
+{
+ struct kvec *tail = &xdr->tail[0];
+
+ if (!rpcrdma_prepare_head_iov(r_xprt, req, xdr->head[0].iov_len))
+ return false;
+ if (xdr->page_len)
+ if (!rpcrdma_prepare_pagelist(req, xdr))
+ return false;
+ if (tail->iov_len)
+ if (!rpcrdma_prepare_tail_iov(req, xdr,
+ offset_in_page(tail->iov_base),
+ tail->iov_len))
+ return false;
+
+ if (req->rl_sendctx->sc_unmap_count)
kref_get(&req->rl_kref);
return true;
+}
-out_regbuf:
- pr_err("rpcrdma: failed to DMA map a Send buffer\n");
- return false;
+static bool rpcrdma_prepare_readch(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_req *req,
+ struct xdr_buf *xdr)
+{
+ if (!rpcrdma_prepare_head_iov(r_xprt, req, xdr->head[0].iov_len))
+ return false;
-out_mapping_overflow:
- rpcrdma_sendctx_unmap(sc);
- pr_err("rpcrdma: too many Send SGEs (%u)\n", sge_no);
- return false;
+ /* If there is a Read chunk, the page list is being handled
+ * via explicit RDMA, and thus is skipped here.
+ */
-out_mapping_err:
- rpcrdma_sendctx_unmap(sc);
- trace_xprtrdma_dma_maperr(sge[sge_no].addr);
- return false;
+ /* Do not include the tail if it is only an XDR pad */
+ if (xdr->tail[0].iov_len > 3) {
+ unsigned int page_base, len;
+
+ /* If the content in the page list is an odd length,
+ * xdr_write_pages() adds a pad at the beginning of
+ * the tail iovec. Force the tail's non-pad content to
+ * land at the next XDR position in the Send message.
+ */
+ page_base = offset_in_page(xdr->tail[0].iov_base);
+ len = xdr->tail[0].iov_len;
+ page_base += len & 3;
+ len -= len & 3;
+ if (!rpcrdma_prepare_tail_iov(req, xdr, page_base, len))
+ return false;
+ kref_get(&req->rl_kref);
+ }
+
+ return true;
}
/**
@@ -716,31 +814,52 @@ out_mapping_err:
*
* Returns 0 on success; otherwise a negative errno is returned.
*/
-int
-rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt,
- struct rpcrdma_req *req, u32 hdrlen,
- struct xdr_buf *xdr, enum rpcrdma_chunktype rtype)
+inline int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_req *req, u32 hdrlen,
+ struct xdr_buf *xdr,
+ enum rpcrdma_chunktype rtype)
{
int ret;
ret = -EAGAIN;
req->rl_sendctx = rpcrdma_sendctx_get_locked(r_xprt);
if (!req->rl_sendctx)
- goto err;
- req->rl_sendctx->sc_wr.num_sge = 0;
+ goto out_nosc;
req->rl_sendctx->sc_unmap_count = 0;
req->rl_sendctx->sc_req = req;
kref_init(&req->rl_kref);
+ req->rl_wr.wr_cqe = &req->rl_sendctx->sc_cqe;
+ req->rl_wr.sg_list = req->rl_sendctx->sc_sges;
+ req->rl_wr.num_sge = 0;
+ req->rl_wr.opcode = IB_WR_SEND;
+
+ rpcrdma_prepare_hdr_sge(r_xprt, req, hdrlen);
ret = -EIO;
- if (!rpcrdma_prepare_hdr_sge(r_xprt, req, hdrlen))
- goto err;
- if (rtype != rpcrdma_areadch)
- if (!rpcrdma_prepare_msg_sges(r_xprt, req, xdr, rtype))
- goto err;
+ switch (rtype) {
+ case rpcrdma_noch_pullup:
+ if (!rpcrdma_prepare_noch_pullup(r_xprt, req, xdr))
+ goto out_unmap;
+ break;
+ case rpcrdma_noch_mapped:
+ if (!rpcrdma_prepare_noch_mapped(r_xprt, req, xdr))
+ goto out_unmap;
+ break;
+ case rpcrdma_readch:
+ if (!rpcrdma_prepare_readch(r_xprt, req, xdr))
+ goto out_unmap;
+ break;
+ case rpcrdma_areadch:
+ break;
+ default:
+ goto out_unmap;
+ }
+
return 0;
-err:
+out_unmap:
+ rpcrdma_sendctx_unmap(req->rl_sendctx);
+out_nosc:
trace_xprtrdma_prepsend_failed(&req->rl_slot, ret);
return ret;
}
@@ -770,6 +889,7 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst)
struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
struct xdr_stream *xdr = &req->rl_stream;
enum rpcrdma_chunktype rtype, wtype;
+ struct xdr_buf *buf = &rqst->rq_snd_buf;
bool ddp_allowed;
__be32 *p;
int ret;
@@ -785,7 +905,7 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst)
goto out_err;
*p++ = rqst->rq_xid;
*p++ = rpcrdma_version;
- *p++ = cpu_to_be32(r_xprt->rx_buf.rb_max_requests);
+ *p++ = r_xprt->rx_buf.rb_max_requests;
/* When the ULP employs a GSS flavor that guarantees integrity
* or privacy, direct data placement of individual data items
@@ -827,8 +947,9 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst)
*/
if (rpcrdma_args_inline(r_xprt, rqst)) {
*p++ = rdma_msg;
- rtype = rpcrdma_noch;
- } else if (ddp_allowed && rqst->rq_snd_buf.flags & XDRBUF_WRITE) {
+ rtype = buf->len < rdmab_length(req->rl_sendbuf) ?
+ rpcrdma_noch_pullup : rpcrdma_noch_mapped;
+ } else if (ddp_allowed && buf->flags & XDRBUF_WRITE) {
*p++ = rdma_msg;
rtype = rpcrdma_readch;
} else {
@@ -837,17 +958,6 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst)
rtype = rpcrdma_areadch;
}
- /* If this is a retransmit, discard previously registered
- * chunks. Very likely the connection has been replaced,
- * so these registrations are invalid and unusable.
- */
- while (unlikely(!list_empty(&req->rl_registered))) {
- struct rpcrdma_mr *mr;
-
- mr = rpcrdma_mr_pop(&req->rl_registered);
- rpcrdma_mr_recycle(mr);
- }
-
/* This implementation supports the following combinations
* of chunk lists in one RPC-over-RDMA Call message:
*
@@ -881,7 +991,7 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst)
goto out_err;
ret = rpcrdma_prepare_send_sges(r_xprt, req, req->rl_hdrbuf.len,
- &rqst->rq_snd_buf, rtype);
+ buf, rtype);
if (ret)
goto out_err;
@@ -895,6 +1005,40 @@ out_err:
return ret;
}
+static void __rpcrdma_update_cwnd_locked(struct rpc_xprt *xprt,
+ struct rpcrdma_buffer *buf,
+ u32 grant)
+{
+ buf->rb_credits = grant;
+ xprt->cwnd = grant << RPC_CWNDSHIFT;
+}
+
+static void rpcrdma_update_cwnd(struct rpcrdma_xprt *r_xprt, u32 grant)
+{
+ struct rpc_xprt *xprt = &r_xprt->rx_xprt;
+
+ spin_lock(&xprt->transport_lock);
+ __rpcrdma_update_cwnd_locked(xprt, &r_xprt->rx_buf, grant);
+ spin_unlock(&xprt->transport_lock);
+}
+
+/**
+ * rpcrdma_reset_cwnd - Reset the xprt's congestion window
+ * @r_xprt: controlling transport instance
+ *
+ * Prepare @r_xprt for the next connection by reinitializing
+ * its credit grant to one (see RFC 8166, Section 3.3.3).
+ */
+void rpcrdma_reset_cwnd(struct rpcrdma_xprt *r_xprt)
+{
+ struct rpc_xprt *xprt = &r_xprt->rx_xprt;
+
+ spin_lock(&xprt->transport_lock);
+ xprt->cong = 0;
+ __rpcrdma_update_cwnd_locked(xprt, &r_xprt->rx_buf, 1);
+ spin_unlock(&xprt->transport_lock);
+}
+
/**
* rpcrdma_inline_fixup - Scatter inline received data into rqst's iovecs
* @rqst: controlling RPC request
@@ -934,7 +1078,6 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
curlen = rqst->rq_rcv_buf.head[0].iov_len;
if (curlen > copy_len)
curlen = copy_len;
- trace_xprtrdma_fixup(rqst, copy_len, curlen);
srcp += curlen;
copy_len -= curlen;
@@ -954,8 +1097,6 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
if (curlen > pagelist_len)
curlen = pagelist_len;
- trace_xprtrdma_fixup_pg(rqst, i, srcp,
- copy_len, curlen);
destp = kmap_atomic(ppages[i]);
memcpy(destp + page_base, srcp, curlen);
flush_dcache_page(ppages[i]);
@@ -987,6 +1128,8 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
rqst->rq_private_buf.tail[0].iov_base = srcp;
}
+ if (fixup_copy_count)
+ trace_xprtrdma_fixup(rqst, fixup_copy_count);
return fixup_copy_count;
}
@@ -1240,8 +1383,6 @@ void rpcrdma_complete_rqst(struct rpcrdma_rep *rep)
struct rpc_rqst *rqst = rep->rr_rqst;
int status;
- xprt->reestablish_timeout = 0;
-
switch (rep->rr_proc) {
case rdma_msg:
status = rpcrdma_decode_msg(r_xprt, rep, rqst);
@@ -1300,6 +1441,12 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
u32 credits;
__be32 *p;
+ /* Any data means we had a useful conversation, so
+ * then we don't need to delay the next reconnect.
+ */
+ if (xprt->reestablish_timeout)
+ xprt->reestablish_timeout = 0;
+
/* Fixed transport header fields */
xdr_init_decode(&rep->rr_stream, &rep->rr_hdrbuf,
rep->rr_hdrbuf.head[0].iov_base, NULL);
@@ -1329,14 +1476,11 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
if (credits == 0)
credits = 1; /* don't deadlock */
- else if (credits > buf->rb_max_requests)
- credits = buf->rb_max_requests;
- if (buf->rb_credits != credits) {
- spin_lock(&xprt->transport_lock);
- buf->rb_credits = credits;
- xprt->cwnd = credits << RPC_CWNDSHIFT;
- spin_unlock(&xprt->transport_lock);
- }
+ else if (credits > r_xprt->rx_ep.rep_max_requests)
+ credits = r_xprt->rx_ep.rep_max_requests;
+ if (buf->rb_credits != credits)
+ rpcrdma_update_cwnd(r_xprt, credits);
+ rpcrdma_post_recvs(r_xprt, false);
req = rpcr_to_rdmar(rqst);
if (req->rl_reply) {
diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c
index abdb3004a1e3..97bca509a391 100644
--- a/net/sunrpc/xprtrdma/svc_rdma.c
+++ b/net/sunrpc/xprtrdma/svc_rdma.c
@@ -73,8 +73,6 @@ atomic_t rdma_stat_rq_prod;
atomic_t rdma_stat_sq_poll;
atomic_t rdma_stat_sq_prod;
-struct workqueue_struct *svc_rdma_wq;
-
/*
* This function implements reading and resetting an atomic_t stat
* variable through read/write to a proc file. Any write to the file
@@ -230,7 +228,6 @@ static struct ctl_table svcrdma_root_table[] = {
void svc_rdma_cleanup(void)
{
dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n");
- destroy_workqueue(svc_rdma_wq);
if (svcrdma_table_header) {
unregister_sysctl_table(svcrdma_table_header);
svcrdma_table_header = NULL;
@@ -246,10 +243,6 @@ int svc_rdma_init(void)
dprintk("\tmax_bc_requests : %u\n", svcrdma_max_bc_requests);
dprintk("\tmax_inline : %d\n", svcrdma_max_req_size);
- svc_rdma_wq = alloc_workqueue("svc_rdma", 0, 0);
- if (!svc_rdma_wq)
- return -ENOMEM;
-
if (!svcrdma_table_header)
svcrdma_table_header =
register_sysctl_table(svcrdma_root_table);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
index d1fcc41d5eb5..908e78bb87c6 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
@@ -195,6 +195,7 @@ rpcrdma_bc_send_request(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst)
pr_info("%s: %*ph\n", __func__, 64, rqst->rq_buffer);
#endif
+ rqst->rq_xtime = ktime_get();
rc = svc_rdma_bc_sendto(rdma, rqst, ctxt);
if (rc) {
svc_rdma_send_ctxt_put(rdma, ctxt);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 65e2fb9aac65..96bccd398469 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -172,9 +172,10 @@ static void svc_rdma_recv_ctxt_destroy(struct svcxprt_rdma *rdma,
void svc_rdma_recv_ctxts_destroy(struct svcxprt_rdma *rdma)
{
struct svc_rdma_recv_ctxt *ctxt;
+ struct llist_node *node;
- while ((ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_recv_ctxts))) {
- list_del(&ctxt->rc_list);
+ while ((node = llist_del_first(&rdma->sc_recv_ctxts))) {
+ ctxt = llist_entry(node, struct svc_rdma_recv_ctxt, rc_node);
svc_rdma_recv_ctxt_destroy(rdma, ctxt);
}
}
@@ -183,21 +184,18 @@ static struct svc_rdma_recv_ctxt *
svc_rdma_recv_ctxt_get(struct svcxprt_rdma *rdma)
{
struct svc_rdma_recv_ctxt *ctxt;
+ struct llist_node *node;
- spin_lock(&rdma->sc_recv_lock);
- ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_recv_ctxts);
- if (!ctxt)
+ node = llist_del_first(&rdma->sc_recv_ctxts);
+ if (!node)
goto out_empty;
- list_del(&ctxt->rc_list);
- spin_unlock(&rdma->sc_recv_lock);
+ ctxt = llist_entry(node, struct svc_rdma_recv_ctxt, rc_node);
out:
ctxt->rc_page_count = 0;
return ctxt;
out_empty:
- spin_unlock(&rdma->sc_recv_lock);
-
ctxt = svc_rdma_recv_ctxt_alloc(rdma);
if (!ctxt)
return NULL;
@@ -218,11 +216,9 @@ void svc_rdma_recv_ctxt_put(struct svcxprt_rdma *rdma,
for (i = 0; i < ctxt->rc_page_count; i++)
put_page(ctxt->rc_pages[i]);
- if (!ctxt->rc_temp) {
- spin_lock(&rdma->sc_recv_lock);
- list_add(&ctxt->rc_list, &rdma->sc_recv_ctxts);
- spin_unlock(&rdma->sc_recv_lock);
- } else
+ if (!ctxt->rc_temp)
+ llist_add(&ctxt->rc_node, &rdma->sc_recv_ctxts);
+ else
svc_rdma_recv_ctxt_destroy(rdma, ctxt);
}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 6fdba72f89f4..f3f108090aa4 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -233,11 +233,15 @@ void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma,
/* The first SGE contains the transport header, which
* remains mapped until @ctxt is destroyed.
*/
- for (i = 1; i < ctxt->sc_send_wr.num_sge; i++)
+ for (i = 1; i < ctxt->sc_send_wr.num_sge; i++) {
ib_dma_unmap_page(device,
ctxt->sc_sges[i].addr,
ctxt->sc_sges[i].length,
DMA_TO_DEVICE);
+ trace_svcrdma_dma_unmap_page(rdma,
+ ctxt->sc_sges[i].addr,
+ ctxt->sc_sges[i].length);
+ }
for (i = 0; i < ctxt->sc_page_count; ++i)
put_page(ctxt->sc_pages[i]);
@@ -490,6 +494,7 @@ static int svc_rdma_dma_map_page(struct svcxprt_rdma *rdma,
dma_addr_t dma_addr;
dma_addr = ib_dma_map_page(dev, page, offset, len, DMA_TO_DEVICE);
+ trace_svcrdma_dma_map_page(rdma, dma_addr, len);
if (ib_dma_mapping_error(dev, dma_addr))
goto out_maperr;
@@ -499,7 +504,6 @@ static int svc_rdma_dma_map_page(struct svcxprt_rdma *rdma,
return 0;
out_maperr:
- trace_svcrdma_dma_map_page(rdma, page);
return -EIO;
}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 3fe665152d95..145a3615c319 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -140,14 +140,13 @@ static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv,
INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q);
INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q);
INIT_LIST_HEAD(&cma_xprt->sc_send_ctxts);
- INIT_LIST_HEAD(&cma_xprt->sc_recv_ctxts);
+ init_llist_head(&cma_xprt->sc_recv_ctxts);
INIT_LIST_HEAD(&cma_xprt->sc_rw_ctxts);
init_waitqueue_head(&cma_xprt->sc_send_wait);
spin_lock_init(&cma_xprt->sc_lock);
spin_lock_init(&cma_xprt->sc_rq_dto_lock);
spin_lock_init(&cma_xprt->sc_send_lock);
- spin_lock_init(&cma_xprt->sc_recv_lock);
spin_lock_init(&cma_xprt->sc_rw_ctxt_lock);
/*
@@ -454,14 +453,14 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
dprintk("svcrdma: error creating PD for connect request\n");
goto errout;
}
- newxprt->sc_sq_cq = ib_alloc_cq(dev, newxprt, newxprt->sc_sq_depth,
- 0, IB_POLL_WORKQUEUE);
+ newxprt->sc_sq_cq = ib_alloc_cq_any(dev, newxprt, newxprt->sc_sq_depth,
+ IB_POLL_WORKQUEUE);
if (IS_ERR(newxprt->sc_sq_cq)) {
dprintk("svcrdma: error creating SQ CQ for connect request\n");
goto errout;
}
- newxprt->sc_rq_cq = ib_alloc_cq(dev, newxprt, rq_depth,
- 0, IB_POLL_WORKQUEUE);
+ newxprt->sc_rq_cq =
+ ib_alloc_cq_any(dev, newxprt, rq_depth, IB_POLL_WORKQUEUE);
if (IS_ERR(newxprt->sc_rq_cq)) {
dprintk("svcrdma: error creating RQ CQ for connect request\n");
goto errout;
@@ -630,8 +629,9 @@ static void svc_rdma_free(struct svc_xprt *xprt)
{
struct svcxprt_rdma *rdma =
container_of(xprt, struct svcxprt_rdma, sc_xprt);
+
INIT_WORK(&rdma->sc_work, __svc_rdma_free);
- queue_work(svc_rdma_wq, &rdma->sc_work);
+ schedule_work(&rdma->sc_work);
}
static int svc_rdma_has_wspace(struct svc_xprt *xprt)
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 2ec349ed4770..3cfeba68ee9a 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -243,16 +243,13 @@ xprt_rdma_connect_worker(struct work_struct *work)
rc = rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia);
xprt_clear_connecting(xprt);
if (r_xprt->rx_ep.rep_connected > 0) {
- if (!xprt_test_and_set_connected(xprt)) {
- xprt->stat.connect_count++;
- xprt->stat.connect_time += (long)jiffies -
- xprt->stat.connect_start;
- xprt_wake_pending_tasks(xprt, -EAGAIN);
- }
- } else {
- if (xprt_test_and_clear_connected(xprt))
- xprt_wake_pending_tasks(xprt, rc);
+ xprt->stat.connect_count++;
+ xprt->stat.connect_time += (long)jiffies -
+ xprt->stat.connect_start;
+ xprt_set_connected(xprt);
+ rc = -EAGAIN;
}
+ xprt_wake_pending_tasks(xprt, rc);
}
/**
@@ -319,7 +316,8 @@ xprt_setup_rdma(struct xprt_create *args)
if (args->addrlen > sizeof(xprt->addr))
return ERR_PTR(-EBADF);
- xprt = xprt_alloc(args->net, sizeof(struct rpcrdma_xprt), 0, 0);
+ xprt = xprt_alloc(args->net, sizeof(struct rpcrdma_xprt), 0,
+ xprt_rdma_slot_table_entries);
if (!xprt)
return ERR_PTR(-ENOMEM);
@@ -361,19 +359,13 @@ xprt_setup_rdma(struct xprt_create *args)
if (rc)
goto out3;
- INIT_DELAYED_WORK(&new_xprt->rx_connect_worker,
- xprt_rdma_connect_worker);
-
- xprt->max_payload = frwr_maxpages(new_xprt);
- if (xprt->max_payload == 0)
- goto out4;
- xprt->max_payload <<= PAGE_SHIFT;
- dprintk("RPC: %s: transport data payload maximum: %zu bytes\n",
- __func__, xprt->max_payload);
-
if (!try_module_get(THIS_MODULE))
goto out4;
+ INIT_DELAYED_WORK(&new_xprt->rx_connect_worker,
+ xprt_rdma_connect_worker);
+ xprt->max_payload = RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT;
+
dprintk("RPC: %s: %s:%s\n", __func__,
xprt->address_strings[RPC_DISPLAY_ADDR],
xprt->address_strings[RPC_DISPLAY_PORT]);
@@ -423,17 +415,10 @@ void xprt_rdma_close(struct rpc_xprt *xprt)
if (ep->rep_connected == -ENODEV)
return;
- if (ep->rep_connected > 0)
- xprt->reestablish_timeout = 0;
rpcrdma_ep_disconnect(ep, ia);
- /* Prepare @xprt for the next connection by reinitializing
- * its credit grant to one (see RFC 8166, Section 3.3.3).
- */
- r_xprt->rx_buf.rb_credits = 1;
- xprt->cwnd = RPC_CWNDSHIFT;
-
out:
+ xprt->reestablish_timeout = 0;
++xprt->connect_cookie;
xprt_disconnect_done(xprt);
}
@@ -451,12 +436,6 @@ xprt_rdma_set_port(struct rpc_xprt *xprt, u16 port)
struct sockaddr *sap = (struct sockaddr *)&xprt->addr;
char buf[8];
- dprintk("RPC: %s: setting port for xprt %p (%s:%s) to %u\n",
- __func__, xprt,
- xprt->address_strings[RPC_DISPLAY_ADDR],
- xprt->address_strings[RPC_DISPLAY_PORT],
- port);
-
rpc_set_port(sap, port);
kfree(xprt->address_strings[RPC_DISPLAY_PORT]);
@@ -466,6 +445,9 @@ xprt_rdma_set_port(struct rpc_xprt *xprt, u16 port)
kfree(xprt->address_strings[RPC_DISPLAY_HEX_PORT]);
snprintf(buf, sizeof(buf), "%4hx", port);
xprt->address_strings[RPC_DISPLAY_HEX_PORT] = kstrdup(buf, GFP_KERNEL);
+
+ trace_xprtrdma_op_setport(container_of(xprt, struct rpcrdma_xprt,
+ rx_xprt));
}
/**
@@ -494,9 +476,9 @@ xprt_rdma_timer(struct rpc_xprt *xprt, struct rpc_task *task)
* @reconnect_timeout: reconnect timeout after server disconnects
*
*/
-static void xprt_rdma_tcp_set_connect_timeout(struct rpc_xprt *xprt,
- unsigned long connect_timeout,
- unsigned long reconnect_timeout)
+static void xprt_rdma_set_connect_timeout(struct rpc_xprt *xprt,
+ unsigned long connect_timeout,
+ unsigned long reconnect_timeout)
{
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
@@ -537,13 +519,12 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task)
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
unsigned long delay;
- trace_xprtrdma_op_connect(r_xprt);
-
delay = 0;
if (r_xprt->rx_ep.rep_connected != 0) {
delay = xprt_reconnect_delay(xprt);
xprt_reconnect_backoff(xprt, RPCRDMA_INIT_REEST_TO);
}
+ trace_xprtrdma_op_connect(r_xprt, delay);
queue_delayed_work(xprtiod_workqueue, &r_xprt->rx_connect_worker,
delay);
}
@@ -571,6 +552,7 @@ xprt_rdma_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task)
return;
out_sleep:
+ set_bit(XPRT_CONGESTED, &xprt->state);
rpc_sleep_on(&xprt->backlog, task, NULL);
task->tk_status = -EAGAIN;
}
@@ -589,7 +571,8 @@ xprt_rdma_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *rqst)
memset(rqst, 0, sizeof(*rqst));
rpcrdma_buffer_put(&r_xprt->rx_buf, rpcr_to_rdmar(rqst));
- rpc_wake_up_next(&xprt->backlog);
+ if (unlikely(!rpc_wake_up_next(&xprt->backlog)))
+ clear_bit(XPRT_CONGESTED, &xprt->state);
}
static bool rpcrdma_check_regbuf(struct rpcrdma_xprt *r_xprt,
@@ -803,7 +786,7 @@ static const struct rpc_xprt_ops xprt_rdma_procs = {
.send_request = xprt_rdma_send_request,
.close = xprt_rdma_close,
.destroy = xprt_rdma_destroy,
- .set_connect_timeout = xprt_rdma_tcp_set_connect_timeout,
+ .set_connect_timeout = xprt_rdma_set_connect_timeout,
.print_stats = xprt_rdma_print_stats,
.enable_swap = xprt_rdma_enable_swap,
.disable_swap = xprt_rdma_disable_swap,
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 805b1f35e1ca..353f61ac8d51 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -53,6 +53,7 @@
#include <linux/slab.h>
#include <linux/sunrpc/addr.h>
#include <linux/sunrpc/svc_rdma.h>
+#include <linux/log2.h>
#include <asm-generic/barrier.h>
#include <asm/bitops.h>
@@ -73,15 +74,21 @@
/*
* internal functions
*/
-static void rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc);
+static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt);
+static void rpcrdma_sendctxs_destroy(struct rpcrdma_xprt *r_xprt);
+static void rpcrdma_sendctx_put_locked(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_sendctx *sc);
+static int rpcrdma_reqs_setup(struct rpcrdma_xprt *r_xprt);
+static void rpcrdma_reqs_reset(struct rpcrdma_xprt *r_xprt);
+static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep);
+static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt);
static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt);
-static void rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf);
+static void rpcrdma_mrs_destroy(struct rpcrdma_xprt *r_xprt);
static struct rpcrdma_regbuf *
rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction,
gfp_t flags);
static void rpcrdma_regbuf_dma_unmap(struct rpcrdma_regbuf *rb);
static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb);
-static void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp);
/* Wait for outstanding transport work to finish. ib_drain_qp
* handles the drains in the wrong order for us, so open code
@@ -122,7 +129,7 @@ rpcrdma_qp_event_handler(struct ib_event *event, void *context)
/**
* rpcrdma_wc_send - Invoked by RDMA provider for each polled Send WC
- * @cq: completion queue (ignored)
+ * @cq: completion queue
* @wc: completed WR
*
*/
@@ -135,7 +142,7 @@ rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
/* WARNING: Only wr_cqe and status are reliable at this point */
trace_xprtrdma_wc_send(sc, wc);
- rpcrdma_sendctx_put_locked(sc);
+ rpcrdma_sendctx_put_locked((struct rpcrdma_xprt *)cq->cq_context, sc);
}
/**
@@ -167,19 +174,18 @@ rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
rdmab_addr(rep->rr_rdmabuf),
wc->byte_len, DMA_FROM_DEVICE);
- rpcrdma_post_recvs(r_xprt, false);
rpcrdma_reply_handler(rep);
return;
out_flushed:
- rpcrdma_recv_buffer_put(rep);
+ rpcrdma_rep_destroy(rep);
}
-static void
-rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt,
- struct rdma_conn_param *param)
+static void rpcrdma_update_cm_private(struct rpcrdma_xprt *r_xprt,
+ struct rdma_conn_param *param)
{
const struct rpcrdma_connect_private *pmsg = param->private_data;
+ struct rpcrdma_ep *ep = &r_xprt->rx_ep;
unsigned int rsize, wsize;
/* Default settings for RPC-over-RDMA Version One */
@@ -195,13 +201,11 @@ rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt,
wsize = rpcrdma_decode_buffer_size(pmsg->cp_recv_size);
}
- if (rsize < r_xprt->rx_ep.rep_inline_recv)
- r_xprt->rx_ep.rep_inline_recv = rsize;
- if (wsize < r_xprt->rx_ep.rep_inline_send)
- r_xprt->rx_ep.rep_inline_send = wsize;
- dprintk("RPC: %s: max send %u, max recv %u\n", __func__,
- r_xprt->rx_ep.rep_inline_send,
- r_xprt->rx_ep.rep_inline_recv);
+ if (rsize < ep->rep_inline_recv)
+ ep->rep_inline_recv = rsize;
+ if (wsize < ep->rep_inline_send)
+ ep->rep_inline_send = wsize;
+
rpcrdma_set_max_header_sizes(r_xprt);
}
@@ -244,6 +248,7 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
ia->ri_id->device->name,
rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt));
#endif
+ init_completion(&ia->ri_remove_done);
set_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags);
ep->rep_connected = -ENODEV;
xprt_force_disconnect(xprt);
@@ -255,7 +260,8 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
case RDMA_CM_EVENT_ESTABLISHED:
++xprt->connect_cookie;
ep->rep_connected = 1;
- rpcrdma_update_connect_private(r_xprt, &event->param.conn);
+ rpcrdma_update_cm_private(r_xprt, &event->param.conn);
+ trace_xprtrdma_inline_thresh(r_xprt);
wake_up_all(&ep->rep_connect_wait);
break;
case RDMA_CM_EVENT_CONNECT_ERROR:
@@ -295,10 +301,7 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rpcrdma_ia *ia)
struct rdma_cm_id *id;
int rc;
- trace_xprtrdma_conn_start(xprt);
-
init_completion(&ia->ri_done);
- init_completion(&ia->ri_remove_done);
id = rdma_create_id(xprt->rx_xprt.xprt_net, rpcrdma_cm_event_handler,
xprt, RDMA_PS_TCP, IB_QPT_RC);
@@ -312,10 +315,8 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rpcrdma_ia *ia)
if (rc)
goto out;
rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout);
- if (rc < 0) {
- trace_xprtrdma_conn_tout(xprt);
+ if (rc < 0)
goto out;
- }
rc = ia->ri_async_rc;
if (rc)
@@ -326,10 +327,8 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rpcrdma_ia *ia)
if (rc)
goto out;
rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout);
- if (rc < 0) {
- trace_xprtrdma_conn_tout(xprt);
+ if (rc < 0)
goto out;
- }
rc = ia->ri_async_rc;
if (rc)
goto out;
@@ -371,18 +370,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt)
goto out_err;
}
- switch (xprt_rdma_memreg_strategy) {
- case RPCRDMA_FRWR:
- if (frwr_is_supported(ia->ri_id->device))
- break;
- /*FALLTHROUGH*/
- default:
- pr_err("rpcrdma: Device %s does not support memreg mode %d\n",
- ia->ri_id->device->name, xprt_rdma_memreg_strategy);
- rc = -EINVAL;
- goto out_err;
- }
-
return 0;
out_err:
@@ -396,6 +383,8 @@ out_err:
*
* Divest transport H/W resources associated with this adapter,
* but allow it to be restored later.
+ *
+ * Caller must hold the transport send lock.
*/
void
rpcrdma_ia_remove(struct rpcrdma_ia *ia)
@@ -403,11 +392,6 @@ rpcrdma_ia_remove(struct rpcrdma_ia *ia)
struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt,
rx_ia);
struct rpcrdma_ep *ep = &r_xprt->rx_ep;
- struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
- struct rpcrdma_req *req;
- struct rpcrdma_rep *rep;
-
- cancel_delayed_work_sync(&buf->rb_refresh_worker);
/* This is similar to rpcrdma_ep_destroy, but:
* - Don't cancel the connect worker.
@@ -429,14 +413,10 @@ rpcrdma_ia_remove(struct rpcrdma_ia *ia)
/* The ULP is responsible for ensuring all DMA
* mappings and MRs are gone.
*/
- list_for_each_entry(rep, &buf->rb_recv_bufs, rr_list)
- rpcrdma_regbuf_dma_unmap(rep->rr_rdmabuf);
- list_for_each_entry(req, &buf->rb_allreqs, rl_all) {
- rpcrdma_regbuf_dma_unmap(req->rl_rdmabuf);
- rpcrdma_regbuf_dma_unmap(req->rl_sendbuf);
- rpcrdma_regbuf_dma_unmap(req->rl_recvbuf);
- }
- rpcrdma_mrs_destroy(buf);
+ rpcrdma_reps_unmap(r_xprt);
+ rpcrdma_reqs_reset(r_xprt);
+ rpcrdma_mrs_destroy(r_xprt);
+ rpcrdma_sendctxs_destroy(r_xprt);
ib_dealloc_pd(ia->ri_pd);
ia->ri_pd = NULL;
@@ -479,30 +459,20 @@ int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt)
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
struct rpcrdma_connect_private *pmsg = &ep->rep_cm_private;
struct ib_cq *sendcq, *recvcq;
- unsigned int max_sge;
int rc;
- ep->rep_max_requests = xprt_rdma_slot_table_entries;
+ ep->rep_max_requests = r_xprt->rx_xprt.max_reqs;
ep->rep_inline_send = xprt_rdma_max_inline_write;
ep->rep_inline_recv = xprt_rdma_max_inline_read;
- max_sge = min_t(unsigned int, ia->ri_id->device->attrs.max_send_sge,
- RPCRDMA_MAX_SEND_SGES);
- if (max_sge < RPCRDMA_MIN_SEND_SGES) {
- pr_warn("rpcrdma: HCA provides only %d send SGEs\n", max_sge);
- return -ENOMEM;
- }
- ia->ri_max_send_sges = max_sge;
-
- rc = frwr_open(ia, ep);
+ rc = frwr_query_device(r_xprt, ia->ri_id->device);
if (rc)
return rc;
+ r_xprt->rx_buf.rb_max_requests = cpu_to_be32(ep->rep_max_requests);
ep->rep_attr.event_handler = rpcrdma_qp_event_handler;
ep->rep_attr.qp_context = ep;
ep->rep_attr.srq = NULL;
- ep->rep_attr.cap.max_send_sge = max_sge;
- ep->rep_attr.cap.max_recv_sge = 1;
ep->rep_attr.cap.max_inline_data = 0;
ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
ep->rep_attr.qp_type = IB_QPT_RC;
@@ -521,18 +491,17 @@ int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt)
init_waitqueue_head(&ep->rep_connect_wait);
ep->rep_receive_count = 0;
- sendcq = ib_alloc_cq(ia->ri_id->device, NULL,
- ep->rep_attr.cap.max_send_wr + 1,
- ia->ri_id->device->num_comp_vectors > 1 ? 1 : 0,
- IB_POLL_WORKQUEUE);
+ sendcq = ib_alloc_cq_any(ia->ri_id->device, r_xprt,
+ ep->rep_attr.cap.max_send_wr + 1,
+ IB_POLL_WORKQUEUE);
if (IS_ERR(sendcq)) {
rc = PTR_ERR(sendcq);
goto out1;
}
- recvcq = ib_alloc_cq(ia->ri_id->device, NULL,
- ep->rep_attr.cap.max_recv_wr + 1,
- 0, IB_POLL_WORKQUEUE);
+ recvcq = ib_alloc_cq_any(ia->ri_id->device, NULL,
+ ep->rep_attr.cap.max_recv_wr + 1,
+ IB_POLL_WORKQUEUE);
if (IS_ERR(recvcq)) {
rc = PTR_ERR(recvcq);
goto out2;
@@ -605,10 +574,11 @@ void rpcrdma_ep_destroy(struct rpcrdma_xprt *r_xprt)
* Unlike a normal reconnection, a fresh PD and a new set
* of MRs and buffers is needed.
*/
-static int
-rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt,
- struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
+static int rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt,
+ struct ib_qp_init_attr *qp_init_attr)
{
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+ struct rpcrdma_ep *ep = &r_xprt->rx_ep;
int rc, err;
trace_xprtrdma_reinsert(r_xprt);
@@ -623,15 +593,14 @@ rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt,
pr_err("rpcrdma: rpcrdma_ep_create returned %d\n", err);
goto out2;
}
+ memcpy(qp_init_attr, &ep->rep_attr, sizeof(*qp_init_attr));
rc = -ENETUNREACH;
- err = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
+ err = rdma_create_qp(ia->ri_id, ia->ri_pd, qp_init_attr);
if (err) {
pr_err("rpcrdma: rdma_create_qp returned %d\n", err);
goto out3;
}
-
- rpcrdma_mrs_create(r_xprt);
return 0;
out3:
@@ -642,16 +611,14 @@ out1:
return rc;
}
-static int
-rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt, struct rpcrdma_ep *ep,
- struct rpcrdma_ia *ia)
+static int rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt,
+ struct ib_qp_init_attr *qp_init_attr)
{
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
struct rdma_cm_id *id, *old;
int err, rc;
- trace_xprtrdma_reconnect(r_xprt);
-
- rpcrdma_ep_disconnect(ep, ia);
+ rpcrdma_ep_disconnect(&r_xprt->rx_ep, ia);
rc = -EHOSTUNREACH;
id = rpcrdma_create_id(r_xprt, ia);
@@ -673,7 +640,7 @@ rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt, struct rpcrdma_ep *ep,
goto out_destroy;
}
- err = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr);
+ err = rdma_create_qp(id, ia->ri_pd, qp_init_attr);
if (err)
goto out_destroy;
@@ -698,25 +665,26 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt,
rx_ia);
struct rpc_xprt *xprt = &r_xprt->rx_xprt;
+ struct ib_qp_init_attr qp_init_attr;
int rc;
retry:
+ memcpy(&qp_init_attr, &ep->rep_attr, sizeof(qp_init_attr));
switch (ep->rep_connected) {
case 0:
- dprintk("RPC: %s: connecting...\n", __func__);
- rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
+ rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &qp_init_attr);
if (rc) {
rc = -ENETUNREACH;
goto out_noupdate;
}
break;
case -ENODEV:
- rc = rpcrdma_ep_recreate_xprt(r_xprt, ep, ia);
+ rc = rpcrdma_ep_recreate_xprt(r_xprt, &qp_init_attr);
if (rc)
goto out_noupdate;
break;
default:
- rc = rpcrdma_ep_reconnect(r_xprt, ep, ia);
+ rc = rpcrdma_ep_reconnect(r_xprt, &qp_init_attr);
if (rc)
goto out;
}
@@ -724,12 +692,19 @@ retry:
ep->rep_connected = 0;
xprt_clear_connected(xprt);
+ rpcrdma_reset_cwnd(r_xprt);
rpcrdma_post_recvs(r_xprt, true);
+ rc = rpcrdma_sendctxs_create(r_xprt);
+ if (rc)
+ goto out;
+
rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
if (rc)
goto out;
+ if (xprt->reestablish_timeout < RPCRDMA_INIT_REEST_TO)
+ xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO;
wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
if (ep->rep_connected <= 0) {
if (ep->rep_connected == -EAGAIN)
@@ -738,13 +713,19 @@ retry:
goto out;
}
- dprintk("RPC: %s: connected\n", __func__);
+ rc = rpcrdma_reqs_setup(r_xprt);
+ if (rc) {
+ rpcrdma_ep_disconnect(ep, ia);
+ goto out;
+ }
+ rpcrdma_mrs_create(r_xprt);
out:
if (rc)
ep->rep_connected = rc;
out_noupdate:
+ trace_xprtrdma_connect(r_xprt, rc);
return rc;
}
@@ -753,11 +734,8 @@ out_noupdate:
* @ep: endpoint to disconnect
* @ia: associated interface adapter
*
- * This is separate from destroy to facilitate the ability
- * to reconnect without recreating the endpoint.
- *
- * This call is not reentrant, and must not be made in parallel
- * on the same endpoint.
+ * Caller serializes. Either the transport send lock is held,
+ * or we're being called to destroy the transport.
*/
void
rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
@@ -776,6 +754,9 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
trace_xprtrdma_disconnect(r_xprt, rc);
rpcrdma_xprt_drain(r_xprt);
+ rpcrdma_reqs_reset(r_xprt);
+ rpcrdma_mrs_destroy(r_xprt);
+ rpcrdma_sendctxs_destroy(r_xprt);
}
/* Fixed-size circular FIFO queue. This implementation is wait-free and
@@ -795,27 +776,28 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
* queue activity, and rpcrdma_xprt_drain has flushed all remaining
* Send requests.
*/
-static void rpcrdma_sendctxs_destroy(struct rpcrdma_buffer *buf)
+static void rpcrdma_sendctxs_destroy(struct rpcrdma_xprt *r_xprt)
{
+ struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
unsigned long i;
+ if (!buf->rb_sc_ctxs)
+ return;
for (i = 0; i <= buf->rb_sc_last; i++)
kfree(buf->rb_sc_ctxs[i]);
kfree(buf->rb_sc_ctxs);
+ buf->rb_sc_ctxs = NULL;
}
-static struct rpcrdma_sendctx *rpcrdma_sendctx_create(struct rpcrdma_ia *ia)
+static struct rpcrdma_sendctx *rpcrdma_sendctx_create(struct rpcrdma_ep *ep)
{
struct rpcrdma_sendctx *sc;
- sc = kzalloc(struct_size(sc, sc_sges, ia->ri_max_send_sges),
+ sc = kzalloc(struct_size(sc, sc_sges, ep->rep_attr.cap.max_send_sge),
GFP_KERNEL);
if (!sc)
return NULL;
- sc->sc_wr.wr_cqe = &sc->sc_cqe;
- sc->sc_wr.sg_list = sc->sc_sges;
- sc->sc_wr.opcode = IB_WR_SEND;
sc->sc_cqe.done = rpcrdma_wc_send;
return sc;
}
@@ -831,22 +813,22 @@ static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt)
* the ->send_request call to fail temporarily before too many
* Sends are posted.
*/
- i = buf->rb_max_requests + RPCRDMA_MAX_BC_REQUESTS;
- dprintk("RPC: %s: allocating %lu send_ctxs\n", __func__, i);
+ i = r_xprt->rx_ep.rep_max_requests + RPCRDMA_MAX_BC_REQUESTS;
buf->rb_sc_ctxs = kcalloc(i, sizeof(sc), GFP_KERNEL);
if (!buf->rb_sc_ctxs)
return -ENOMEM;
buf->rb_sc_last = i - 1;
for (i = 0; i <= buf->rb_sc_last; i++) {
- sc = rpcrdma_sendctx_create(&r_xprt->rx_ia);
+ sc = rpcrdma_sendctx_create(&r_xprt->rx_ep);
if (!sc)
return -ENOMEM;
- sc->sc_xprt = r_xprt;
buf->rb_sc_ctxs[i] = sc;
}
+ buf->rb_sc_head = 0;
+ buf->rb_sc_tail = 0;
return 0;
}
@@ -906,6 +888,7 @@ out_emptyq:
/**
* rpcrdma_sendctx_put_locked - Release a send context
+ * @r_xprt: controlling transport instance
* @sc: send context to release
*
* Usage: Called from Send completion to return a sendctxt
@@ -913,10 +896,10 @@ out_emptyq:
*
* The caller serializes calls to this function (per transport).
*/
-static void
-rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc)
+static void rpcrdma_sendctx_put_locked(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_sendctx *sc)
{
- struct rpcrdma_buffer *buf = &sc->sc_xprt->rx_buf;
+ struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
unsigned long next_tail;
/* Unmap SGEs of previously completed but unsignaled
@@ -934,7 +917,7 @@ rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc)
/* Paired with READ_ONCE */
smp_store_release(&buf->rb_sc_tail, next_tail);
- xprt_write_space(&sc->sc_xprt->rx_xprt);
+ xprt_write_space(&r_xprt->rx_xprt);
}
static void
@@ -943,14 +926,12 @@ rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt)
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
unsigned int count;
- LIST_HEAD(free);
- LIST_HEAD(all);
- for (count = 0; count < ia->ri_max_segs; count++) {
+ for (count = 0; count < ia->ri_max_rdma_segs; count++) {
struct rpcrdma_mr *mr;
int rc;
- mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+ mr = kzalloc(sizeof(*mr), GFP_NOFS);
if (!mr)
break;
@@ -962,15 +943,13 @@ rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt)
mr->mr_xprt = r_xprt;
- list_add(&mr->mr_list, &free);
- list_add(&mr->mr_all, &all);
+ spin_lock(&buf->rb_lock);
+ rpcrdma_mr_push(mr, &buf->rb_mrs);
+ list_add(&mr->mr_all, &buf->rb_all_mrs);
+ spin_unlock(&buf->rb_lock);
}
- spin_lock(&buf->rb_mrlock);
- list_splice(&free, &buf->rb_mrs);
- list_splice(&all, &buf->rb_all);
r_xprt->rx_stats.mrs_allocated += count;
- spin_unlock(&buf->rb_mrlock);
trace_xprtrdma_createmrs(r_xprt, count);
}
@@ -978,7 +957,7 @@ static void
rpcrdma_mr_refresh_worker(struct work_struct *work)
{
struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer,
- rb_refresh_worker.work);
+ rb_refresh_worker);
struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt,
rx_buf);
@@ -987,6 +966,28 @@ rpcrdma_mr_refresh_worker(struct work_struct *work)
}
/**
+ * rpcrdma_mrs_refresh - Wake the MR refresh worker
+ * @r_xprt: controlling transport instance
+ *
+ */
+void rpcrdma_mrs_refresh(struct rpcrdma_xprt *r_xprt)
+{
+ struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+ struct rpcrdma_ep *ep = &r_xprt->rx_ep;
+
+ /* If there is no underlying device, it's no use to
+ * wake the refresh worker.
+ */
+ if (ep->rep_connected != -ENODEV) {
+ /* The work is scheduled on a WQ_MEM_RECLAIM
+ * workqueue in order to prevent MR allocation
+ * from recursing into NFS during direct reclaim.
+ */
+ queue_work(xprtiod_workqueue, &buf->rb_refresh_worker);
+ }
+}
+
+/**
* rpcrdma_req_create - Allocate an rpcrdma_req object
* @r_xprt: controlling r_xprt
* @size: initial size, in bytes, of send and receive buffers
@@ -998,45 +999,120 @@ struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, size_t size,
gfp_t flags)
{
struct rpcrdma_buffer *buffer = &r_xprt->rx_buf;
- struct rpcrdma_regbuf *rb;
struct rpcrdma_req *req;
req = kzalloc(sizeof(*req), flags);
if (req == NULL)
goto out1;
- rb = rpcrdma_regbuf_alloc(RPCRDMA_HDRBUF_SIZE, DMA_TO_DEVICE, flags);
- if (!rb)
- goto out2;
- req->rl_rdmabuf = rb;
- xdr_buf_init(&req->rl_hdrbuf, rdmab_data(rb), rdmab_length(rb));
-
req->rl_sendbuf = rpcrdma_regbuf_alloc(size, DMA_TO_DEVICE, flags);
if (!req->rl_sendbuf)
- goto out3;
+ goto out2;
req->rl_recvbuf = rpcrdma_regbuf_alloc(size, DMA_NONE, flags);
if (!req->rl_recvbuf)
- goto out4;
+ goto out3;
+ INIT_LIST_HEAD(&req->rl_free_mrs);
INIT_LIST_HEAD(&req->rl_registered);
spin_lock(&buffer->rb_lock);
list_add(&req->rl_all, &buffer->rb_allreqs);
spin_unlock(&buffer->rb_lock);
return req;
-out4:
- kfree(req->rl_sendbuf);
out3:
- kfree(req->rl_rdmabuf);
+ kfree(req->rl_sendbuf);
out2:
kfree(req);
out1:
return NULL;
}
-static struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt,
- bool temp)
+/**
+ * rpcrdma_req_setup - Per-connection instance setup of an rpcrdma_req object
+ * @r_xprt: controlling transport instance
+ * @req: rpcrdma_req object to set up
+ *
+ * Returns zero on success, and a negative errno on failure.
+ */
+int rpcrdma_req_setup(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
+{
+ struct rpcrdma_regbuf *rb;
+ size_t maxhdrsize;
+
+ /* Compute maximum header buffer size in bytes */
+ maxhdrsize = rpcrdma_fixed_maxsz + 3 +
+ r_xprt->rx_ia.ri_max_rdma_segs * rpcrdma_readchunk_maxsz;
+ maxhdrsize *= sizeof(__be32);
+ rb = rpcrdma_regbuf_alloc(__roundup_pow_of_two(maxhdrsize),
+ DMA_TO_DEVICE, GFP_KERNEL);
+ if (!rb)
+ goto out;
+
+ if (!__rpcrdma_regbuf_dma_map(r_xprt, rb))
+ goto out_free;
+
+ req->rl_rdmabuf = rb;
+ xdr_buf_init(&req->rl_hdrbuf, rdmab_data(rb), rdmab_length(rb));
+ return 0;
+
+out_free:
+ rpcrdma_regbuf_free(rb);
+out:
+ return -ENOMEM;
+}
+
+/* ASSUMPTION: the rb_allreqs list is stable for the duration,
+ * and thus can be walked without holding rb_lock. Eg. the
+ * caller is holding the transport send lock to exclude
+ * device removal or disconnection.
+ */
+static int rpcrdma_reqs_setup(struct rpcrdma_xprt *r_xprt)
+{
+ struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+ struct rpcrdma_req *req;
+ int rc;
+
+ list_for_each_entry(req, &buf->rb_allreqs, rl_all) {
+ rc = rpcrdma_req_setup(r_xprt, req);
+ if (rc)
+ return rc;
+ }
+ return 0;
+}
+
+static void rpcrdma_req_reset(struct rpcrdma_req *req)
+{
+ /* Credits are valid for only one connection */
+ req->rl_slot.rq_cong = 0;
+
+ rpcrdma_regbuf_free(req->rl_rdmabuf);
+ req->rl_rdmabuf = NULL;
+
+ rpcrdma_regbuf_dma_unmap(req->rl_sendbuf);
+ rpcrdma_regbuf_dma_unmap(req->rl_recvbuf);
+}
+
+/* ASSUMPTION: the rb_allreqs list is stable for the duration,
+ * and thus can be walked without holding rb_lock. Eg. the
+ * caller is holding the transport send lock to exclude
+ * device removal or disconnection.
+ */
+static void rpcrdma_reqs_reset(struct rpcrdma_xprt *r_xprt)
+{
+ struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+ struct rpcrdma_req *req;
+
+ list_for_each_entry(req, &buf->rb_allreqs, rl_all)
+ rpcrdma_req_reset(req);
+}
+
+/* No locking needed here. This function is called only by the
+ * Receive completion handler.
+ */
+static noinline
+struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt,
+ bool temp)
{
struct rpcrdma_rep *rep;
@@ -1049,6 +1125,9 @@ static struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt,
if (!rep->rr_rdmabuf)
goto out_free;
+ if (!rpcrdma_regbuf_dma_map(r_xprt, rep->rr_rdmabuf))
+ goto out_free_regbuf;
+
xdr_buf_init(&rep->rr_hdrbuf, rdmab_data(rep->rr_rdmabuf),
rdmab_length(rep->rr_rdmabuf));
rep->rr_cqe.done = rpcrdma_wc_receive;
@@ -1058,14 +1137,63 @@ static struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt,
rep->rr_recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
rep->rr_recv_wr.num_sge = 1;
rep->rr_temp = temp;
+ list_add(&rep->rr_all, &r_xprt->rx_buf.rb_all_reps);
return rep;
+out_free_regbuf:
+ rpcrdma_regbuf_free(rep->rr_rdmabuf);
out_free:
kfree(rep);
out:
return NULL;
}
+/* No locking needed here. This function is invoked only by the
+ * Receive completion handler, or during transport shutdown.
+ */
+static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep)
+{
+ list_del(&rep->rr_all);
+ rpcrdma_regbuf_free(rep->rr_rdmabuf);
+ kfree(rep);
+}
+
+static struct rpcrdma_rep *rpcrdma_rep_get_locked(struct rpcrdma_buffer *buf)
+{
+ struct llist_node *node;
+
+ /* Calls to llist_del_first are required to be serialized */
+ node = llist_del_first(&buf->rb_free_reps);
+ if (!node)
+ return NULL;
+ return llist_entry(node, struct rpcrdma_rep, rr_node);
+}
+
+static void rpcrdma_rep_put(struct rpcrdma_buffer *buf,
+ struct rpcrdma_rep *rep)
+{
+ llist_add(&rep->rr_node, &buf->rb_free_reps);
+}
+
+static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt)
+{
+ struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+ struct rpcrdma_rep *rep;
+
+ list_for_each_entry(rep, &buf->rb_all_reps, rr_all) {
+ rpcrdma_regbuf_dma_unmap(rep->rr_rdmabuf);
+ rep->rr_temp = true;
+ }
+}
+
+static void rpcrdma_reps_destroy(struct rpcrdma_buffer *buf)
+{
+ struct rpcrdma_rep *rep;
+
+ while ((rep = rpcrdma_rep_get_locked(buf)) != NULL)
+ rpcrdma_rep_destroy(rep);
+}
+
/**
* rpcrdma_buffer_create - Create initial set of req/rep objects
* @r_xprt: transport instance to (re)initialize
@@ -1077,37 +1205,28 @@ int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
int i, rc;
- buf->rb_max_requests = r_xprt->rx_ep.rep_max_requests;
buf->rb_bc_srv_max_requests = 0;
- spin_lock_init(&buf->rb_mrlock);
spin_lock_init(&buf->rb_lock);
INIT_LIST_HEAD(&buf->rb_mrs);
- INIT_LIST_HEAD(&buf->rb_all);
- INIT_DELAYED_WORK(&buf->rb_refresh_worker,
- rpcrdma_mr_refresh_worker);
-
- rpcrdma_mrs_create(r_xprt);
+ INIT_LIST_HEAD(&buf->rb_all_mrs);
+ INIT_WORK(&buf->rb_refresh_worker, rpcrdma_mr_refresh_worker);
INIT_LIST_HEAD(&buf->rb_send_bufs);
INIT_LIST_HEAD(&buf->rb_allreqs);
+ INIT_LIST_HEAD(&buf->rb_all_reps);
rc = -ENOMEM;
- for (i = 0; i < buf->rb_max_requests; i++) {
+ for (i = 0; i < r_xprt->rx_xprt.max_reqs; i++) {
struct rpcrdma_req *req;
- req = rpcrdma_req_create(r_xprt, RPCRDMA_V1_DEF_INLINE_SIZE,
+ req = rpcrdma_req_create(r_xprt, RPCRDMA_V1_DEF_INLINE_SIZE * 2,
GFP_KERNEL);
if (!req)
goto out;
list_add(&req->rl_list, &buf->rb_send_bufs);
}
- buf->rb_credits = 1;
- INIT_LIST_HEAD(&buf->rb_recv_bufs);
-
- rc = rpcrdma_sendctxs_create(r_xprt);
- if (rc)
- goto out;
+ init_llist_head(&buf->rb_free_reps);
return 0;
out:
@@ -1115,58 +1234,62 @@ out:
return rc;
}
-static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep)
-{
- rpcrdma_regbuf_free(rep->rr_rdmabuf);
- kfree(rep);
-}
-
/**
* rpcrdma_req_destroy - Destroy an rpcrdma_req object
* @req: unused object to be destroyed
*
- * This function assumes that the caller prevents concurrent device
- * unload and transport tear-down.
+ * Relies on caller holding the transport send lock to protect
+ * removing req->rl_all from buf->rb_all_reqs safely.
*/
-void
-rpcrdma_req_destroy(struct rpcrdma_req *req)
+void rpcrdma_req_destroy(struct rpcrdma_req *req)
{
+ struct rpcrdma_mr *mr;
+
list_del(&req->rl_all);
+ while ((mr = rpcrdma_mr_pop(&req->rl_free_mrs))) {
+ struct rpcrdma_buffer *buf = &mr->mr_xprt->rx_buf;
+
+ spin_lock(&buf->rb_lock);
+ list_del(&mr->mr_all);
+ spin_unlock(&buf->rb_lock);
+
+ frwr_release_mr(mr);
+ }
+
rpcrdma_regbuf_free(req->rl_recvbuf);
rpcrdma_regbuf_free(req->rl_sendbuf);
rpcrdma_regbuf_free(req->rl_rdmabuf);
kfree(req);
}
-static void
-rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf)
+/**
+ * rpcrdma_mrs_destroy - Release all of a transport's MRs
+ * @r_xprt: controlling transport instance
+ *
+ * Relies on caller holding the transport send lock to protect
+ * removing mr->mr_list from req->rl_free_mrs safely.
+ */
+static void rpcrdma_mrs_destroy(struct rpcrdma_xprt *r_xprt)
{
- struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt,
- rx_buf);
+ struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
struct rpcrdma_mr *mr;
- unsigned int count;
- count = 0;
- spin_lock(&buf->rb_mrlock);
- while (!list_empty(&buf->rb_all)) {
- mr = list_entry(buf->rb_all.next, struct rpcrdma_mr, mr_all);
- list_del(&mr->mr_all);
+ cancel_work_sync(&buf->rb_refresh_worker);
- spin_unlock(&buf->rb_mrlock);
-
- /* Ensure MW is not on any rl_registered list */
- if (!list_empty(&mr->mr_list))
- list_del(&mr->mr_list);
+ spin_lock(&buf->rb_lock);
+ while ((mr = list_first_entry_or_null(&buf->rb_all_mrs,
+ struct rpcrdma_mr,
+ mr_all)) != NULL) {
+ list_del(&mr->mr_list);
+ list_del(&mr->mr_all);
+ spin_unlock(&buf->rb_lock);
frwr_release_mr(mr);
- count++;
- spin_lock(&buf->rb_mrlock);
- }
- spin_unlock(&buf->rb_mrlock);
- r_xprt->rx_stats.mrs_allocated = 0;
- dprintk("RPC: %s: released %u MRs\n", __func__, count);
+ spin_lock(&buf->rb_lock);
+ }
+ spin_unlock(&buf->rb_lock);
}
/**
@@ -1180,18 +1303,7 @@ rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf)
void
rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
{
- cancel_delayed_work_sync(&buf->rb_refresh_worker);
-
- rpcrdma_sendctxs_destroy(buf);
-
- while (!list_empty(&buf->rb_recv_bufs)) {
- struct rpcrdma_rep *rep;
-
- rep = list_first_entry(&buf->rb_recv_bufs,
- struct rpcrdma_rep, rr_list);
- list_del(&rep->rr_list);
- rpcrdma_rep_destroy(rep);
- }
+ rpcrdma_reps_destroy(buf);
while (!list_empty(&buf->rb_send_bufs)) {
struct rpcrdma_req *req;
@@ -1201,8 +1313,6 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
list_del(&req->rl_list);
rpcrdma_req_destroy(req);
}
-
- rpcrdma_mrs_destroy(buf);
}
/**
@@ -1216,54 +1326,20 @@ struct rpcrdma_mr *
rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt)
{
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
- struct rpcrdma_mr *mr = NULL;
-
- spin_lock(&buf->rb_mrlock);
- if (!list_empty(&buf->rb_mrs))
- mr = rpcrdma_mr_pop(&buf->rb_mrs);
- spin_unlock(&buf->rb_mrlock);
+ struct rpcrdma_mr *mr;
- if (!mr)
- goto out_nomrs;
+ spin_lock(&buf->rb_lock);
+ mr = rpcrdma_mr_pop(&buf->rb_mrs);
+ spin_unlock(&buf->rb_lock);
return mr;
-
-out_nomrs:
- trace_xprtrdma_nomrs(r_xprt);
- if (r_xprt->rx_ep.rep_connected != -ENODEV)
- schedule_delayed_work(&buf->rb_refresh_worker, 0);
-
- /* Allow the reply handler and refresh worker to run */
- cond_resched();
-
- return NULL;
-}
-
-static void
-__rpcrdma_mr_put(struct rpcrdma_buffer *buf, struct rpcrdma_mr *mr)
-{
- spin_lock(&buf->rb_mrlock);
- rpcrdma_mr_push(mr, &buf->rb_mrs);
- spin_unlock(&buf->rb_mrlock);
}
/**
- * rpcrdma_mr_put - Release an rpcrdma_mr object
- * @mr: object to release
+ * rpcrdma_mr_put - DMA unmap an MR and release it
+ * @mr: MR to release
*
*/
-void
-rpcrdma_mr_put(struct rpcrdma_mr *mr)
-{
- __rpcrdma_mr_put(&mr->mr_xprt->rx_buf, mr);
-}
-
-/**
- * rpcrdma_mr_unmap_and_put - DMA unmap an MR and release it
- * @mr: object to release
- *
- */
-void
-rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr)
+void rpcrdma_mr_put(struct rpcrdma_mr *mr)
{
struct rpcrdma_xprt *r_xprt = mr->mr_xprt;
@@ -1273,7 +1349,8 @@ rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr)
mr->mr_sg, mr->mr_nents, mr->mr_dir);
mr->mr_dir = DMA_NONE;
}
- __rpcrdma_mr_put(&r_xprt->rx_buf, mr);
+
+ rpcrdma_mr_push(mr, &mr->mr_req->rl_free_mrs);
}
/**
@@ -1304,39 +1381,24 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
*/
void rpcrdma_buffer_put(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req)
{
- struct rpcrdma_rep *rep = req->rl_reply;
-
+ if (req->rl_reply)
+ rpcrdma_rep_put(buffers, req->rl_reply);
req->rl_reply = NULL;
spin_lock(&buffers->rb_lock);
list_add(&req->rl_list, &buffers->rb_send_bufs);
- if (rep) {
- if (!rep->rr_temp) {
- list_add(&rep->rr_list, &buffers->rb_recv_bufs);
- rep = NULL;
- }
- }
spin_unlock(&buffers->rb_lock);
- if (rep)
- rpcrdma_rep_destroy(rep);
}
-/*
- * Put reply buffers back into pool when not attached to
- * request. This happens in error conditions.
+/**
+ * rpcrdma_recv_buffer_put - Release rpcrdma_rep back to free list
+ * @rep: rep to release
+ *
+ * Used after error conditions.
*/
-void
-rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
+void rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
{
- struct rpcrdma_buffer *buffers = &rep->rr_rxprt->rx_buf;
-
- if (!rep->rr_temp) {
- spin_lock(&buffers->rb_lock);
- list_add(&rep->rr_list, &buffers->rb_recv_bufs);
- spin_unlock(&buffers->rb_lock);
- } else {
- rpcrdma_rep_destroy(rep);
- }
+ rpcrdma_rep_put(&rep->rr_rxprt->rx_buf, rep);
}
/* Returns a pointer to a rpcrdma_regbuf object, or NULL.
@@ -1453,7 +1515,7 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
struct rpcrdma_ep *ep,
struct rpcrdma_req *req)
{
- struct ib_send_wr *send_wr = &req->rl_sendctx->sc_wr;
+ struct ib_send_wr *send_wr = &req->rl_wr;
int rc;
if (!ep->rep_send_count || kref_read(&req->rl_kref) > 1) {
@@ -1471,12 +1533,17 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
return 0;
}
-static void
-rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp)
+/**
+ * rpcrdma_post_recvs - Refill the Receive Queue
+ * @r_xprt: controlling transport instance
+ * @temp: mark Receive buffers to be deleted after use
+ *
+ */
+void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp)
{
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
struct rpcrdma_ep *ep = &r_xprt->rx_ep;
- struct ib_recv_wr *i, *wr, *bad_wr;
+ struct ib_recv_wr *wr, *bad_wr;
struct rpcrdma_rep *rep;
int needed, count, rc;
@@ -1484,7 +1551,7 @@ rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp)
count = 0;
needed = buf->rb_credits + (buf->rb_bc_srv_max_requests << 1);
- if (ep->rep_receive_count > needed)
+ if (likely(ep->rep_receive_count > needed))
goto out;
needed -= ep->rep_receive_count;
if (!temp)
@@ -1492,42 +1559,26 @@ rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp)
/* fast path: all needed reps can be found on the free list */
wr = NULL;
- spin_lock(&buf->rb_lock);
while (needed) {
- rep = list_first_entry_or_null(&buf->rb_recv_bufs,
- struct rpcrdma_rep, rr_list);
+ rep = rpcrdma_rep_get_locked(buf);
+ if (rep && rep->rr_temp) {
+ rpcrdma_rep_destroy(rep);
+ continue;
+ }
if (!rep)
- break;
-
- list_del(&rep->rr_list);
- rep->rr_recv_wr.next = wr;
- wr = &rep->rr_recv_wr;
- --needed;
- }
- spin_unlock(&buf->rb_lock);
-
- while (needed) {
- rep = rpcrdma_rep_create(r_xprt, temp);
+ rep = rpcrdma_rep_create(r_xprt, temp);
if (!rep)
break;
+ trace_xprtrdma_post_recv(rep);
rep->rr_recv_wr.next = wr;
wr = &rep->rr_recv_wr;
--needed;
+ ++count;
}
if (!wr)
goto out;
- for (i = wr; i; i = i->next) {
- rep = container_of(i, struct rpcrdma_rep, rr_recv_wr);
-
- if (!rpcrdma_regbuf_dma_map(r_xprt, rep->rr_rdmabuf))
- goto release_wrs;
-
- trace_xprtrdma_post_recv(rep->rr_recv_wr.wr_cqe);
- ++count;
- }
-
rc = ib_post_recv(r_xprt->rx_ia.ri_id->qp, wr,
(const struct ib_recv_wr **)&bad_wr);
out:
@@ -1544,11 +1595,4 @@ out:
}
ep->rep_receive_count += count;
return;
-
-release_wrs:
- for (i = wr; i;) {
- rep = container_of(i, struct rpcrdma_rep, rr_recv_wr);
- i = i->next;
- rpcrdma_recv_buffer_put(rep);
- }
}
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 92ce09fcea74..37d5080c250b 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -47,6 +47,7 @@
#include <linux/atomic.h> /* atomic_t, etc */
#include <linux/kref.h> /* struct kref */
#include <linux/workqueue.h> /* struct work_struct */
+#include <linux/llist.h>
#include <rdma/rdma_cm.h> /* RDMA connection api */
#include <rdma/ib_verbs.h> /* RDMA verbs api */
@@ -70,9 +71,8 @@ struct rpcrdma_ia {
struct rdma_cm_id *ri_id;
struct ib_pd *ri_pd;
int ri_async_rc;
- unsigned int ri_max_segs;
+ unsigned int ri_max_rdma_segs;
unsigned int ri_max_frwr_depth;
- unsigned int ri_max_send_sges;
bool ri_implicit_roundup;
enum ib_mr_type ri_mrtype;
unsigned long ri_flags;
@@ -98,7 +98,7 @@ struct rpcrdma_ep {
wait_queue_head_t rep_connect_wait;
struct rpcrdma_connect_private rep_cm_private;
struct rdma_conn_param rep_remote_cma;
- unsigned int rep_max_requests; /* set by /proc */
+ unsigned int rep_max_requests; /* depends on device */
unsigned int rep_inline_send; /* negotiated */
unsigned int rep_inline_recv; /* negotiated */
int rep_receive_count;
@@ -117,9 +117,6 @@ struct rpcrdma_ep {
#endif
/* Registered buffer -- registered kmalloc'd memory for RDMA SEND/RECV
- *
- * The below structure appears at the front of a large region of kmalloc'd
- * memory, which always starts on a good alignment boundary.
*/
struct rpcrdma_regbuf {
@@ -158,25 +155,22 @@ static inline void *rdmab_data(const struct rpcrdma_regbuf *rb)
/* To ensure a transport can always make forward progress,
* the number of RDMA segments allowed in header chunk lists
- * is capped at 8. This prevents less-capable devices and
- * memory registrations from overrunning the Send buffer
- * while building chunk lists.
+ * is capped at 16. This prevents less-capable devices from
+ * overrunning the Send buffer while building chunk lists.
*
* Elements of the Read list take up more room than the
- * Write list or Reply chunk. 8 read segments means the Read
- * list (or Write list or Reply chunk) cannot consume more
- * than
- *
- * ((8 + 2) * read segment size) + 1 XDR words, or 244 bytes.
+ * Write list or Reply chunk. 16 read segments means the
+ * chunk lists cannot consume more than
*
- * And the fixed part of the header is another 24 bytes.
+ * ((16 + 2) * read segment size) + 1 XDR words,
*
- * The smallest inline threshold is 1024 bytes, ensuring that
- * at least 750 bytes are available for RPC messages.
+ * or about 400 bytes. The fixed part of the header is
+ * another 24 bytes. Thus when the inline threshold is
+ * 1024 bytes, at least 600 bytes are available for RPC
+ * message bodies.
*/
enum {
- RPCRDMA_MAX_HDR_SEGS = 8,
- RPCRDMA_HDRBUF_SIZE = 256,
+ RPCRDMA_MAX_HDR_SEGS = 16,
};
/*
@@ -206,8 +200,9 @@ struct rpcrdma_rep {
struct rpc_rqst *rr_rqst;
struct xdr_buf rr_hdrbuf;
struct xdr_stream rr_stream;
- struct list_head rr_list;
+ struct llist_node rr_node;
struct ib_recv_wr rr_recv_wr;
+ struct list_head rr_all;
};
/* To reduce the rate at which a transport invokes ib_post_recv
@@ -223,12 +218,8 @@ enum {
/* struct rpcrdma_sendctx - DMA mapped SGEs to unmap after Send completes
*/
struct rpcrdma_req;
-struct rpcrdma_xprt;
struct rpcrdma_sendctx {
- struct ib_send_wr sc_wr;
struct ib_cqe sc_cqe;
- struct ib_device *sc_device;
- struct rpcrdma_xprt *sc_xprt;
struct rpcrdma_req *sc_req;
unsigned int sc_unmap_count;
struct ib_sge sc_sges[];
@@ -240,20 +231,20 @@ struct rpcrdma_sendctx {
* An external memory region is any buffer or page that is registered
* on the fly (ie, not pre-registered).
*/
-struct rpcrdma_req;
struct rpcrdma_frwr {
struct ib_mr *fr_mr;
struct ib_cqe fr_cqe;
struct completion fr_linv_done;
- struct rpcrdma_req *fr_req;
union {
struct ib_reg_wr fr_regwr;
struct ib_send_wr fr_invwr;
};
};
+struct rpcrdma_req;
struct rpcrdma_mr {
struct list_head mr_list;
+ struct rpcrdma_req *mr_req;
struct scatterlist *mr_sg;
int mr_nents;
enum dma_data_direction mr_dir;
@@ -262,7 +253,6 @@ struct rpcrdma_mr {
u32 mr_handle;
u32 mr_length;
u64 mr_offset;
- struct work_struct mr_recycle;
struct list_head mr_all;
};
@@ -323,6 +313,7 @@ struct rpcrdma_req {
struct rpcrdma_rep *rl_reply;
struct xdr_stream rl_stream;
struct xdr_buf rl_hdrbuf;
+ struct ib_send_wr rl_wr;
struct rpcrdma_sendctx *rl_sendctx;
struct rpcrdma_regbuf *rl_rdmabuf; /* xprt header */
struct rpcrdma_regbuf *rl_sendbuf; /* rq_snd_buf */
@@ -331,7 +322,8 @@ struct rpcrdma_req {
struct list_head rl_all;
struct kref rl_kref;
- struct list_head rl_registered; /* registered segments */
+ struct list_head rl_free_mrs;
+ struct list_head rl_registered;
struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];
};
@@ -344,7 +336,7 @@ rpcr_to_rdmar(const struct rpc_rqst *rqst)
static inline void
rpcrdma_mr_push(struct rpcrdma_mr *mr, struct list_head *list)
{
- list_add_tail(&mr->mr_list, list);
+ list_add(&mr->mr_list, list);
}
static inline struct rpcrdma_mr *
@@ -352,8 +344,9 @@ rpcrdma_mr_pop(struct list_head *list)
{
struct rpcrdma_mr *mr;
- mr = list_first_entry(list, struct rpcrdma_mr, mr_list);
- list_del_init(&mr->mr_list);
+ mr = list_first_entry_or_null(list, struct rpcrdma_mr, mr_list);
+ if (mr)
+ list_del_init(&mr->mr_list);
return mr;
}
@@ -364,27 +357,28 @@ rpcrdma_mr_pop(struct list_head *list)
* One of these is associated with a transport instance
*/
struct rpcrdma_buffer {
- spinlock_t rb_mrlock; /* protect rb_mrs list */
+ spinlock_t rb_lock;
+ struct list_head rb_send_bufs;
struct list_head rb_mrs;
- struct list_head rb_all;
unsigned long rb_sc_head;
unsigned long rb_sc_tail;
unsigned long rb_sc_last;
struct rpcrdma_sendctx **rb_sc_ctxs;
- spinlock_t rb_lock; /* protect buf lists */
- struct list_head rb_send_bufs;
- struct list_head rb_recv_bufs;
struct list_head rb_allreqs;
+ struct list_head rb_all_mrs;
+ struct list_head rb_all_reps;
- u32 rb_max_requests;
+ struct llist_head rb_free_reps;
+
+ __be32 rb_max_requests;
u32 rb_credits; /* most recent credit grant */
u32 rb_bc_srv_max_requests;
u32 rb_bc_max_requests;
- struct delayed_work rb_refresh_worker;
+ struct work_struct rb_refresh_worker;
};
/*
@@ -477,12 +471,14 @@ void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *);
int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *,
struct rpcrdma_req *);
+void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp);
/*
* Buffer calls - xprtrdma/verbs.c
*/
struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, size_t size,
gfp_t flags);
+int rpcrdma_req_setup(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
void rpcrdma_req_destroy(struct rpcrdma_req *req);
int rpcrdma_buffer_create(struct rpcrdma_xprt *);
void rpcrdma_buffer_destroy(struct rpcrdma_buffer *);
@@ -490,13 +486,7 @@ struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_xprt *r_xprt);
struct rpcrdma_mr *rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt);
void rpcrdma_mr_put(struct rpcrdma_mr *mr);
-void rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr);
-
-static inline void
-rpcrdma_mr_recycle(struct rpcrdma_mr *mr)
-{
- schedule_work(&mr->mr_recycle);
-}
+void rpcrdma_mrs_refresh(struct rpcrdma_xprt *r_xprt);
struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *);
void rpcrdma_buffer_put(struct rpcrdma_buffer *buffers,
@@ -545,16 +535,15 @@ rpcrdma_data_dir(bool writing)
/* Memory registration calls xprtrdma/frwr_ops.c
*/
-bool frwr_is_supported(struct ib_device *device);
void frwr_reset(struct rpcrdma_req *req);
-int frwr_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep);
+int frwr_query_device(struct rpcrdma_xprt *r_xprt,
+ const struct ib_device *device);
int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr);
void frwr_release_mr(struct rpcrdma_mr *mr);
-size_t frwr_maxpages(struct rpcrdma_xprt *r_xprt);
struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
struct rpcrdma_mr_seg *seg,
int nsegs, bool writing, __be32 xid,
- struct rpcrdma_mr **mr);
+ struct rpcrdma_mr *mr);
int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req);
void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs);
void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
@@ -566,6 +555,8 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
enum rpcrdma_chunktype {
rpcrdma_noch = 0,
+ rpcrdma_noch_pullup,
+ rpcrdma_noch_mapped,
rpcrdma_readch,
rpcrdma_areadch,
rpcrdma_writech,
@@ -579,6 +570,7 @@ int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt,
void rpcrdma_sendctx_unmap(struct rpcrdma_sendctx *sc);
int rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst);
void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *);
+void rpcrdma_reset_cwnd(struct rpcrdma_xprt *r_xprt);
void rpcrdma_complete_rqst(struct rpcrdma_rep *rep);
void rpcrdma_reply_handler(struct rpcrdma_rep *rep);
@@ -590,7 +582,6 @@ static inline void rpcrdma_set_xdrlen(struct xdr_buf *xdr, size_t len)
/* RPC/RDMA module init - xprtrdma/transport.c
*/
-extern unsigned int xprt_rdma_slot_table_entries;
extern unsigned int xprt_rdma_max_inline_read;
extern unsigned int xprt_rdma_max_inline_write;
void xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap);
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index e2176c167a57..d86c664ea6af 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -562,10 +562,14 @@ xs_read_stream_call(struct sock_xprt *transport, struct msghdr *msg, int flags)
printk(KERN_WARNING "Callback slot table overflowed\n");
return -ESHUTDOWN;
}
+ if (transport->recv.copied && !req->rq_private_buf.len)
+ return -ESHUTDOWN;
ret = xs_read_stream_request(transport, msg, flags, req);
if (msg->msg_flags & (MSG_EOR|MSG_TRUNC))
xprt_complete_bc_request(req, transport->recv.copied);
+ else
+ req->rq_private_buf.len = transport->recv.copied;
return ret;
}
@@ -587,7 +591,7 @@ xs_read_stream_reply(struct sock_xprt *transport, struct msghdr *msg, int flags)
/* Look up and lock the request corresponding to the given XID */
spin_lock(&xprt->queue_lock);
req = xprt_lookup_rqst(xprt, transport->recv.xid);
- if (!req) {
+ if (!req || (transport->recv.copied && !req->rq_private_buf.len)) {
msg->msg_flags |= MSG_TRUNC;
goto out;
}
@@ -599,6 +603,8 @@ xs_read_stream_reply(struct sock_xprt *transport, struct msghdr *msg, int flags)
spin_lock(&xprt->queue_lock);
if (msg->msg_flags & (MSG_EOR|MSG_TRUNC))
xprt_complete_rqst(req->rq_task, transport->recv.copied);
+ else
+ req->rq_private_buf.len = transport->recv.copied;
xprt_unpin_rqst(req);
out:
spin_unlock(&xprt->queue_lock);
@@ -1243,19 +1249,21 @@ static void xs_error_report(struct sock *sk)
{
struct sock_xprt *transport;
struct rpc_xprt *xprt;
- int err;
read_lock_bh(&sk->sk_callback_lock);
if (!(xprt = xprt_from_sock(sk)))
goto out;
transport = container_of(xprt, struct sock_xprt, xprt);
- err = -sk->sk_err;
- if (err == 0)
+ transport->xprt_err = -sk->sk_err;
+ if (transport->xprt_err == 0)
goto out;
dprintk("RPC: xs_error_report client %p, error=%d...\n",
- xprt, -err);
- trace_rpc_socket_error(xprt, sk->sk_socket, err);
+ xprt, -transport->xprt_err);
+ trace_rpc_socket_error(xprt, sk->sk_socket, transport->xprt_err);
+
+ /* barrier ensures xprt_err is set before XPRT_SOCK_WAKE_ERROR */
+ smp_mb__before_atomic();
xs_run_error_worker(transport, XPRT_SOCK_WAKE_ERROR);
out:
read_unlock_bh(&sk->sk_callback_lock);
@@ -1744,7 +1752,7 @@ static void xs_set_port(struct rpc_xprt *xprt, unsigned short port)
static void xs_set_srcport(struct sock_xprt *transport, struct socket *sock)
{
- if (transport->srcport == 0)
+ if (transport->srcport == 0 && transport->xprt.reuseport)
transport->srcport = xs_sock_getport(sock);
}
@@ -2470,7 +2478,6 @@ static void xs_wake_write(struct sock_xprt *transport)
static void xs_wake_error(struct sock_xprt *transport)
{
int sockerr;
- int sockerr_len = sizeof(sockerr);
if (!test_bit(XPRT_SOCK_WAKE_ERROR, &transport->sock_state))
return;
@@ -2479,9 +2486,7 @@ static void xs_wake_error(struct sock_xprt *transport)
goto out;
if (!test_and_clear_bit(XPRT_SOCK_WAKE_ERROR, &transport->sock_state))
goto out;
- if (kernel_getsockopt(transport->sock, SOL_SOCKET, SO_ERROR,
- (char *)&sockerr, &sockerr_len) != 0)
- goto out;
+ sockerr = xchg(&transport->xprt_err, 0);
if (sockerr < 0)
xprt_wake_pending_tasks(&transport->xprt, sockerr);
out:
@@ -2654,6 +2659,8 @@ static int bc_sendto(struct rpc_rqst *req)
.iov_len = sizeof(marker),
};
+ req->rq_xtime = ktime_get();
+
len = kernel_sendmsg(transport->sock, &msg, &iov, 1, iov.iov_len);
if (len != iov.iov_len)
return -EAGAIN;
@@ -2679,7 +2686,6 @@ static int bc_send_request(struct rpc_rqst *req)
struct svc_xprt *xprt;
int len;
- dprintk("sending request with xid: %08x\n", ntohl(req->rq_xid));
/*
* Get the server socket associated with this callback xprt
*/
diff --git a/net/tipc/Kconfig b/net/tipc/Kconfig
index b83e16ade4d2..716b61a701a8 100644
--- a/net/tipc/Kconfig
+++ b/net/tipc/Kconfig
@@ -35,6 +35,21 @@ config TIPC_MEDIA_UDP
Saying Y here will enable support for running TIPC over IP/UDP
bool
default y
+config TIPC_CRYPTO
+ bool "TIPC encryption support"
+ depends on TIPC
+ select CRYPTO
+ select CRYPTO_AES
+ select CRYPTO_GCM
+ help
+ Saying Y here will enable support for TIPC encryption.
+ All TIPC messages will be encrypted/decrypted by using the currently most
+ advanced algorithm: AEAD AES-GCM (like IPSec or TLS) before leaving/
+ entering the TIPC stack.
+ Key setting from user-space is performed via netlink by a user program
+ (e.g. the iproute2 'tipc' tool).
+ bool
+ default y
config TIPC_DIAG
tristate "TIPC: socket monitoring interface"
diff --git a/net/tipc/Makefile b/net/tipc/Makefile
index c86aba0282af..ee49a9f1dd4f 100644
--- a/net/tipc/Makefile
+++ b/net/tipc/Makefile
@@ -9,15 +9,14 @@ tipc-y += addr.o bcast.o bearer.o \
core.o link.o discover.o msg.o \
name_distr.o subscr.o monitor.o name_table.o net.o \
netlink.o netlink_compat.o node.o socket.o eth_media.o \
- topsrv.o socket.o group.o trace.o
+ topsrv.o group.o trace.o
CFLAGS_trace.o += -I$(src)
tipc-$(CONFIG_TIPC_MEDIA_UDP) += udp_media.o
tipc-$(CONFIG_TIPC_MEDIA_IB) += ib_media.o
tipc-$(CONFIG_SYSCTL) += sysctl.o
+tipc-$(CONFIG_TIPC_CRYPTO) += crypto.o
obj-$(CONFIG_TIPC_DIAG) += diag.o
-
-tipc_diag-y := diag.o
diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
index 1336f3cdad38..4c20be08b9c4 100644
--- a/net/tipc/bcast.c
+++ b/net/tipc/bcast.c
@@ -84,12 +84,12 @@ static struct tipc_bc_base *tipc_bc_base(struct net *net)
*/
int tipc_bcast_get_mtu(struct net *net)
{
- return tipc_link_mtu(tipc_bc_sndlink(net)) - INT_H_SIZE;
+ return tipc_link_mss(tipc_bc_sndlink(net));
}
-void tipc_bcast_disable_rcast(struct net *net)
+void tipc_bcast_toggle_rcast(struct net *net, bool supp)
{
- tipc_bc_base(net)->rcast_support = false;
+ tipc_bc_base(net)->rcast_support = supp;
}
static void tipc_bcbase_calc_bc_threshold(struct net *net)
@@ -185,7 +185,7 @@ static void tipc_bcbase_xmit(struct net *net, struct sk_buff_head *xmitq)
}
/* We have to transmit across all bearers */
- skb_queue_head_init(&_xmitq);
+ __skb_queue_head_init(&_xmitq);
for (bearer_id = 0; bearer_id < MAX_BEARERS; bearer_id++) {
if (!bb->dests[bearer_id])
continue;
@@ -256,7 +256,7 @@ static int tipc_bcast_xmit(struct net *net, struct sk_buff_head *pkts,
struct sk_buff_head xmitq;
int rc = 0;
- skb_queue_head_init(&xmitq);
+ __skb_queue_head_init(&xmitq);
tipc_bcast_lock(net);
if (tipc_link_bc_peers(l))
rc = tipc_link_xmit(l, pkts, &xmitq);
@@ -286,7 +286,7 @@ static int tipc_rcast_xmit(struct net *net, struct sk_buff_head *pkts,
u32 dnode, selector;
selector = msg_link_selector(buf_msg(skb_peek(pkts)));
- skb_queue_head_init(&_pkts);
+ __skb_queue_head_init(&_pkts);
list_for_each_entry_safe(dst, tmp, &dests->list, list) {
dnode = dst->node;
@@ -305,17 +305,17 @@ static int tipc_rcast_xmit(struct net *net, struct sk_buff_head *pkts,
* @skb: socket buffer to copy
* @method: send method to be used
* @dests: destination nodes for message.
- * @cong_link_cnt: returns number of encountered congested destination links
* Returns 0 if success, otherwise errno
*/
static int tipc_mcast_send_sync(struct net *net, struct sk_buff *skb,
struct tipc_mc_method *method,
- struct tipc_nlist *dests,
- u16 *cong_link_cnt)
+ struct tipc_nlist *dests)
{
struct tipc_msg *hdr, *_hdr;
struct sk_buff_head tmpq;
struct sk_buff *_skb;
+ u16 cong_link_cnt;
+ int rc = 0;
/* Is a cluster supporting with new capabilities ? */
if (!(tipc_net(net)->capabilities & TIPC_MCAST_RBCTL))
@@ -343,18 +343,19 @@ static int tipc_mcast_send_sync(struct net *net, struct sk_buff *skb,
_hdr = buf_msg(_skb);
msg_set_size(_hdr, MCAST_H_SIZE);
msg_set_is_rcast(_hdr, !msg_is_rcast(hdr));
+ msg_set_errcode(_hdr, TIPC_ERR_NO_PORT);
- skb_queue_head_init(&tmpq);
+ __skb_queue_head_init(&tmpq);
__skb_queue_tail(&tmpq, _skb);
if (method->rcast)
- tipc_bcast_xmit(net, &tmpq, cong_link_cnt);
+ rc = tipc_bcast_xmit(net, &tmpq, &cong_link_cnt);
else
- tipc_rcast_xmit(net, &tmpq, dests, cong_link_cnt);
+ rc = tipc_rcast_xmit(net, &tmpq, dests, &cong_link_cnt);
/* This queue should normally be empty by now */
__skb_queue_purge(&tmpq);
- return 0;
+ return rc;
}
/* tipc_mcast_xmit - deliver message to indicated destination nodes
@@ -378,7 +379,7 @@ int tipc_mcast_xmit(struct net *net, struct sk_buff_head *pkts,
int rc = 0;
skb_queue_head_init(&inputq);
- skb_queue_head_init(&localq);
+ __skb_queue_head_init(&localq);
/* Clone packets before they are consumed by next call */
if (dests->local && !tipc_msg_reassemble(pkts, &localq)) {
@@ -396,9 +397,14 @@ int tipc_mcast_xmit(struct net *net, struct sk_buff_head *pkts,
msg_set_is_rcast(hdr, method->rcast);
/* Switch method ? */
- if (rcast != method->rcast)
- tipc_mcast_send_sync(net, skb, method,
- dests, cong_link_cnt);
+ if (rcast != method->rcast) {
+ rc = tipc_mcast_send_sync(net, skb, method, dests);
+ if (unlikely(rc)) {
+ pr_err("Unable to send SYN: method %d, rc %d\n",
+ rcast, rc);
+ goto exit;
+ }
+ }
if (method->rcast)
rc = tipc_rcast_xmit(net, pkts, dests, cong_link_cnt);
@@ -406,8 +412,10 @@ int tipc_mcast_xmit(struct net *net, struct sk_buff_head *pkts,
rc = tipc_bcast_xmit(net, pkts, cong_link_cnt);
}
- if (dests->local)
+ if (dests->local) {
+ tipc_loopback_trace(net, &localq);
tipc_sk_mcast_rcv(net, &localq, &inputq);
+ }
exit:
/* This queue should normally be empty by now */
__skb_queue_purge(pkts);
@@ -560,18 +568,18 @@ int tipc_bclink_reset_stats(struct net *net)
return 0;
}
-static int tipc_bc_link_set_queue_limits(struct net *net, u32 limit)
+static int tipc_bc_link_set_queue_limits(struct net *net, u32 max_win)
{
struct tipc_link *l = tipc_bc_sndlink(net);
if (!l)
return -ENOPROTOOPT;
- if (limit < BCLINK_WIN_MIN)
- limit = BCLINK_WIN_MIN;
- if (limit > TIPC_MAX_LINK_WIN)
+ if (max_win < BCLINK_WIN_MIN)
+ max_win = BCLINK_WIN_MIN;
+ if (max_win > TIPC_MAX_LINK_WIN)
return -EINVAL;
tipc_bcast_lock(net);
- tipc_link_set_queue_limits(l, limit);
+ tipc_link_set_queue_limits(l, BCLINK_WIN_MIN, max_win);
tipc_bcast_unlock(net);
return 0;
}
@@ -681,6 +689,7 @@ int tipc_bcast_init(struct net *net)
if (!tipc_link_bc_create(net, 0, 0,
FB_MTU,
BCLINK_WIN_DEFAULT,
+ BCLINK_WIN_DEFAULT,
0,
&bb->inputq,
NULL,
diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h
index dadad953e2be..9e847d9617d3 100644
--- a/net/tipc/bcast.h
+++ b/net/tipc/bcast.h
@@ -85,7 +85,7 @@ void tipc_bcast_remove_peer(struct net *net, struct tipc_link *rcv_bcl);
void tipc_bcast_inc_bearer_dst_cnt(struct net *net, int bearer_id);
void tipc_bcast_dec_bearer_dst_cnt(struct net *net, int bearer_id);
int tipc_bcast_get_mtu(struct net *net);
-void tipc_bcast_disable_rcast(struct net *net);
+void tipc_bcast_toggle_rcast(struct net *net, bool supp);
int tipc_mcast_xmit(struct net *net, struct sk_buff_head *pkts,
struct tipc_mc_method *method, struct tipc_nlist *dests,
u16 *cong_link_cnt);
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index a809c0ec8d15..34ca7b789eba 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -44,6 +44,7 @@
#include "netlink.h"
#include "udp_media.h"
#include "trace.h"
+#include "crypto.h"
#define MAX_ADDR_STR 60
@@ -310,11 +311,13 @@ static int tipc_enable_bearer(struct net *net, const char *name,
b->identity = bearer_id;
b->tolerance = m->tolerance;
- b->window = m->window;
+ b->min_win = m->min_win;
+ b->max_win = m->max_win;
b->domain = disc_domain;
b->net_plane = bearer_id + 'A';
b->priority = prio;
test_and_set_bit_lock(0, &b->up);
+ refcount_set(&b->refcnt, 1);
res = tipc_disc_create(net, b, &b->bcast_addr, &skb);
if (res) {
@@ -351,6 +354,17 @@ static int tipc_reset_bearer(struct net *net, struct tipc_bearer *b)
return 0;
}
+bool tipc_bearer_hold(struct tipc_bearer *b)
+{
+ return (b && refcount_inc_not_zero(&b->refcnt));
+}
+
+void tipc_bearer_put(struct tipc_bearer *b)
+{
+ if (b && refcount_dec_and_test(&b->refcnt))
+ kfree_rcu(b, rcu);
+}
+
/**
* bearer_disable
*
@@ -369,7 +383,7 @@ static void bearer_disable(struct net *net, struct tipc_bearer *b)
if (b->disc)
tipc_disc_delete(b->disc);
RCU_INIT_POINTER(tn->bearer_list[bearer_id], NULL);
- kfree_rcu(b, rcu);
+ tipc_bearer_put(b);
tipc_mon_delete(net, bearer_id);
}
@@ -389,6 +403,11 @@ int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b,
dev_put(dev);
return -EINVAL;
}
+ if (dev == net->loopback_dev) {
+ dev_put(dev);
+ pr_info("Enabling <%s> not permitted\n", b->name);
+ return -EINVAL;
+ }
/* Autoconfigure own node identity if needed */
if (!tipc_own_id(net) && hwaddr_len <= NODE_ID_LEN) {
@@ -499,10 +518,15 @@ void tipc_bearer_xmit_skb(struct net *net, u32 bearer_id,
rcu_read_lock();
b = bearer_get(net, bearer_id);
- if (likely(b && (test_bit(0, &b->up) || msg_is_reset(hdr))))
- b->media->send_msg(net, skb, b, dest);
- else
+ if (likely(b && (test_bit(0, &b->up) || msg_is_reset(hdr)))) {
+#ifdef CONFIG_TIPC_CRYPTO
+ tipc_crypto_xmit(net, &skb, b, dest, NULL);
+ if (skb)
+#endif
+ b->media->send_msg(net, skb, b, dest);
+ } else {
kfree_skb(skb);
+ }
rcu_read_unlock();
}
@@ -510,7 +534,8 @@ void tipc_bearer_xmit_skb(struct net *net, u32 bearer_id,
*/
void tipc_bearer_xmit(struct net *net, u32 bearer_id,
struct sk_buff_head *xmitq,
- struct tipc_media_addr *dst)
+ struct tipc_media_addr *dst,
+ struct tipc_node *__dnode)
{
struct tipc_bearer *b;
struct sk_buff *skb, *tmp;
@@ -524,10 +549,15 @@ void tipc_bearer_xmit(struct net *net, u32 bearer_id,
__skb_queue_purge(xmitq);
skb_queue_walk_safe(xmitq, skb, tmp) {
__skb_dequeue(xmitq);
- if (likely(test_bit(0, &b->up) || msg_is_reset(buf_msg(skb))))
- b->media->send_msg(net, skb, b, dst);
- else
+ if (likely(test_bit(0, &b->up) || msg_is_reset(buf_msg(skb)))) {
+#ifdef CONFIG_TIPC_CRYPTO
+ tipc_crypto_xmit(net, &skb, b, dst, __dnode);
+ if (skb)
+#endif
+ b->media->send_msg(net, skb, b, dst);
+ } else {
kfree_skb(skb);
+ }
}
rcu_read_unlock();
}
@@ -538,6 +568,7 @@ void tipc_bearer_bc_xmit(struct net *net, u32 bearer_id,
struct sk_buff_head *xmitq)
{
struct tipc_net *tn = tipc_net(net);
+ struct tipc_media_addr *dst;
int net_id = tn->net_id;
struct tipc_bearer *b;
struct sk_buff *skb, *tmp;
@@ -552,7 +583,12 @@ void tipc_bearer_bc_xmit(struct net *net, u32 bearer_id,
msg_set_non_seq(hdr, 1);
msg_set_mc_netid(hdr, net_id);
__skb_dequeue(xmitq);
- b->media->send_msg(net, skb, b, &b->bcast_addr);
+ dst = &b->bcast_addr;
+#ifdef CONFIG_TIPC_CRYPTO
+ tipc_crypto_xmit(net, &skb, b, dst, NULL);
+ if (skb)
+#endif
+ b->media->send_msg(net, skb, b, dst);
}
rcu_read_unlock();
}
@@ -579,6 +615,7 @@ static int tipc_l2_rcv_msg(struct sk_buff *skb, struct net_device *dev,
if (likely(b && test_bit(0, &b->up) &&
(skb->pkt_type <= PACKET_MULTICAST))) {
skb_mark_not_on_list(skb);
+ TIPC_SKB_CB(skb)->flags = 0;
tipc_rcv(dev_net(b->pt.dev), skb, b);
rcu_read_unlock();
return NET_RX_SUCCESS;
@@ -674,6 +711,65 @@ void tipc_bearer_stop(struct net *net)
}
}
+void tipc_clone_to_loopback(struct net *net, struct sk_buff_head *pkts)
+{
+ struct net_device *dev = net->loopback_dev;
+ struct sk_buff *skb, *_skb;
+ int exp;
+
+ skb_queue_walk(pkts, _skb) {
+ skb = pskb_copy(_skb, GFP_ATOMIC);
+ if (!skb)
+ continue;
+
+ exp = SKB_DATA_ALIGN(dev->hard_header_len - skb_headroom(skb));
+ if (exp > 0 && pskb_expand_head(skb, exp, 0, GFP_ATOMIC)) {
+ kfree_skb(skb);
+ continue;
+ }
+
+ skb_reset_network_header(skb);
+ dev_hard_header(skb, dev, ETH_P_TIPC, dev->dev_addr,
+ dev->dev_addr, skb->len);
+ skb->dev = dev;
+ skb->pkt_type = PACKET_HOST;
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ skb->protocol = eth_type_trans(skb, dev);
+ netif_rx_ni(skb);
+ }
+}
+
+static int tipc_loopback_rcv_pkt(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt, struct net_device *od)
+{
+ consume_skb(skb);
+ return NET_RX_SUCCESS;
+}
+
+int tipc_attach_loopback(struct net *net)
+{
+ struct net_device *dev = net->loopback_dev;
+ struct tipc_net *tn = tipc_net(net);
+
+ if (!dev)
+ return -ENODEV;
+
+ dev_hold(dev);
+ tn->loopback_pt.dev = dev;
+ tn->loopback_pt.type = htons(ETH_P_TIPC);
+ tn->loopback_pt.func = tipc_loopback_rcv_pkt;
+ dev_add_pack(&tn->loopback_pt);
+ return 0;
+}
+
+void tipc_detach_loopback(struct net *net)
+{
+ struct tipc_net *tn = tipc_net(net);
+
+ dev_remove_pack(&tn->loopback_pt);
+ dev_put(net->loopback_dev);
+}
+
/* Caller should hold rtnl_lock to protect the bearer */
static int __tipc_nl_add_bearer(struct tipc_nl_msg *msg,
struct tipc_bearer *bearer, int nlflags)
@@ -701,7 +797,7 @@ static int __tipc_nl_add_bearer(struct tipc_nl_msg *msg,
goto prop_msg_full;
if (nla_put_u32(msg->skb, TIPC_NLA_PROP_TOL, bearer->tolerance))
goto prop_msg_full;
- if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, bearer->window))
+ if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, bearer->max_win))
goto prop_msg_full;
if (bearer->media->type_id == TIPC_MEDIA_TYPE_UDP)
if (nla_put_u32(msg->skb, TIPC_NLA_PROP_MTU, bearer->mtu))
@@ -993,7 +1089,7 @@ int __tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info)
if (props[TIPC_NLA_PROP_PRIO])
b->priority = nla_get_u32(props[TIPC_NLA_PROP_PRIO]);
if (props[TIPC_NLA_PROP_WIN])
- b->window = nla_get_u32(props[TIPC_NLA_PROP_WIN]);
+ b->max_win = nla_get_u32(props[TIPC_NLA_PROP_WIN]);
if (props[TIPC_NLA_PROP_MTU]) {
if (b->media->type_id != TIPC_MEDIA_TYPE_UDP)
return -EINVAL;
@@ -1047,7 +1143,7 @@ static int __tipc_nl_add_media(struct tipc_nl_msg *msg,
goto prop_msg_full;
if (nla_put_u32(msg->skb, TIPC_NLA_PROP_TOL, media->tolerance))
goto prop_msg_full;
- if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, media->window))
+ if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, media->max_win))
goto prop_msg_full;
if (media->type_id == TIPC_MEDIA_TYPE_UDP)
if (nla_put_u32(msg->skb, TIPC_NLA_PROP_MTU, media->mtu))
@@ -1180,7 +1276,7 @@ int __tipc_nl_media_set(struct sk_buff *skb, struct genl_info *info)
if (props[TIPC_NLA_PROP_PRIO])
m->priority = nla_get_u32(props[TIPC_NLA_PROP_PRIO]);
if (props[TIPC_NLA_PROP_WIN])
- m->window = nla_get_u32(props[TIPC_NLA_PROP_WIN]);
+ m->max_win = nla_get_u32(props[TIPC_NLA_PROP_WIN]);
if (props[TIPC_NLA_PROP_MTU]) {
if (m->type_id != TIPC_MEDIA_TYPE_UDP)
return -EINVAL;
diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h
index 7f4c569594a5..bc0023119da2 100644
--- a/net/tipc/bearer.h
+++ b/net/tipc/bearer.h
@@ -119,7 +119,8 @@ struct tipc_media {
char *raw);
u32 priority;
u32 tolerance;
- u32 window;
+ u32 min_win;
+ u32 max_win;
u32 mtu;
u32 type_id;
u32 hwaddr_len;
@@ -158,13 +159,15 @@ struct tipc_bearer {
struct packet_type pt;
struct rcu_head rcu;
u32 priority;
- u32 window;
+ u32 min_win;
+ u32 max_win;
u32 tolerance;
u32 domain;
u32 identity;
struct tipc_discoverer *disc;
char net_plane;
unsigned long up;
+ refcount_t refcnt;
};
struct tipc_bearer_names {
@@ -210,6 +213,8 @@ int tipc_media_set_window(const char *name, u32 new_value);
int tipc_media_addr_printf(char *buf, int len, struct tipc_media_addr *a);
int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b,
struct nlattr *attrs[]);
+bool tipc_bearer_hold(struct tipc_bearer *b);
+void tipc_bearer_put(struct tipc_bearer *b);
void tipc_disable_l2_media(struct tipc_bearer *b);
int tipc_l2_send_msg(struct net *net, struct sk_buff *buf,
struct tipc_bearer *b, struct tipc_media_addr *dest);
@@ -229,9 +234,20 @@ void tipc_bearer_xmit_skb(struct net *net, u32 bearer_id,
struct tipc_media_addr *dest);
void tipc_bearer_xmit(struct net *net, u32 bearer_id,
struct sk_buff_head *xmitq,
- struct tipc_media_addr *dst);
+ struct tipc_media_addr *dst,
+ struct tipc_node *__dnode);
void tipc_bearer_bc_xmit(struct net *net, u32 bearer_id,
struct sk_buff_head *xmitq);
+void tipc_clone_to_loopback(struct net *net, struct sk_buff_head *pkts);
+int tipc_attach_loopback(struct net *net);
+void tipc_detach_loopback(struct net *net);
+
+static inline void tipc_loopback_trace(struct net *net,
+ struct sk_buff_head *pkts)
+{
+ if (unlikely(dev_nit_active(net->loopback_dev)))
+ tipc_clone_to_loopback(net, pkts);
+}
/* check if device MTU is too low for tipc headers */
static inline bool tipc_mtu_bad(struct net_device *dev, unsigned int reserve)
diff --git a/net/tipc/core.c b/net/tipc/core.c
index c8370722f0bb..4f6dc74adf45 100644
--- a/net/tipc/core.c
+++ b/net/tipc/core.c
@@ -34,8 +34,6 @@
* POSSIBILITY OF SUCH DAMAGE.
*/
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
#include "core.h"
#include "name_table.h"
#include "subscr.h"
@@ -44,6 +42,7 @@
#include "socket.h"
#include "bcast.h"
#include "node.h"
+#include "crypto.h"
#include <linux/module.h>
@@ -68,6 +67,11 @@ static int __net_init tipc_init_net(struct net *net)
INIT_LIST_HEAD(&tn->node_list);
spin_lock_init(&tn->node_list_lock);
+#ifdef CONFIG_TIPC_CRYPTO
+ err = tipc_crypto_start(&tn->crypto_tx, net, NULL);
+ if (err)
+ goto out_crypto;
+#endif
err = tipc_sk_rht_init(net);
if (err)
goto out_sk_rht;
@@ -82,6 +86,10 @@ static int __net_init tipc_init_net(struct net *net)
if (err)
goto out_bclink;
+ err = tipc_attach_loopback(net);
+ if (err)
+ goto out_bclink;
+
return 0;
out_bclink:
@@ -89,17 +97,35 @@ out_bclink:
out_nametbl:
tipc_sk_rht_destroy(net);
out_sk_rht:
+
+#ifdef CONFIG_TIPC_CRYPTO
+ tipc_crypto_stop(&tn->crypto_tx);
+out_crypto:
+#endif
return err;
}
static void __net_exit tipc_exit_net(struct net *net)
{
+ tipc_detach_loopback(net);
tipc_net_stop(net);
tipc_bcast_stop(net);
tipc_nametbl_stop(net);
tipc_sk_rht_destroy(net);
+#ifdef CONFIG_TIPC_CRYPTO
+ tipc_crypto_stop(&tipc_net(net)->crypto_tx);
+#endif
}
+static void __net_exit tipc_pernet_pre_exit(struct net *net)
+{
+ tipc_node_pre_cleanup_net(net);
+}
+
+static struct pernet_operations tipc_pernet_pre_exit_ops = {
+ .pre_exit = tipc_pernet_pre_exit,
+};
+
static struct pernet_operations tipc_net_ops = {
.init = tipc_init_net,
.exit = tipc_exit_net,
@@ -122,14 +148,6 @@ static int __init tipc_init(void)
sysctl_tipc_rmem[1] = RCVBUF_DEF;
sysctl_tipc_rmem[2] = RCVBUF_MAX;
- err = tipc_netlink_start();
- if (err)
- goto out_netlink;
-
- err = tipc_netlink_compat_start();
- if (err)
- goto out_netlink_compat;
-
err = tipc_register_sysctl();
if (err)
goto out_sysctl;
@@ -146,13 +164,32 @@ static int __init tipc_init(void)
if (err)
goto out_pernet_topsrv;
+ err = register_pernet_subsys(&tipc_pernet_pre_exit_ops);
+ if (err)
+ goto out_register_pernet_subsys;
+
err = tipc_bearer_setup();
if (err)
goto out_bearer;
+ err = tipc_netlink_start();
+ if (err)
+ goto out_netlink;
+
+ err = tipc_netlink_compat_start();
+ if (err)
+ goto out_netlink_compat;
+
pr_info("Started in single node mode\n");
return 0;
+
+out_netlink_compat:
+ tipc_netlink_stop();
+out_netlink:
+ tipc_bearer_cleanup();
out_bearer:
+ unregister_pernet_subsys(&tipc_pernet_pre_exit_ops);
+out_register_pernet_subsys:
unregister_pernet_device(&tipc_topsrv_net_ops);
out_pernet_topsrv:
tipc_socket_stop();
@@ -161,22 +198,19 @@ out_socket:
out_pernet:
tipc_unregister_sysctl();
out_sysctl:
- tipc_netlink_compat_stop();
-out_netlink_compat:
- tipc_netlink_stop();
-out_netlink:
pr_err("Unable to start in single node mode\n");
return err;
}
static void __exit tipc_exit(void)
{
+ tipc_netlink_compat_stop();
+ tipc_netlink_stop();
tipc_bearer_cleanup();
+ unregister_pernet_subsys(&tipc_pernet_pre_exit_ops);
unregister_pernet_device(&tipc_topsrv_net_ops);
tipc_socket_stop();
unregister_pernet_device(&tipc_net_ops);
- tipc_netlink_stop();
- tipc_netlink_compat_stop();
tipc_unregister_sysctl();
pr_info("Deactivated\n");
diff --git a/net/tipc/core.h b/net/tipc/core.h
index 7a68e1b6a066..631d83c9705f 100644
--- a/net/tipc/core.h
+++ b/net/tipc/core.h
@@ -59,6 +59,13 @@
#include <net/netns/generic.h>
#include <linux/rhashtable.h>
#include <net/genetlink.h>
+#include <net/netns/hash.h>
+
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
struct tipc_node;
struct tipc_bearer;
@@ -67,6 +74,9 @@ struct tipc_link;
struct tipc_name_table;
struct tipc_topsrv;
struct tipc_monitor;
+#ifdef CONFIG_TIPC_CRYPTO
+struct tipc_crypto;
+#endif
#define TIPC_MOD_VER "2.0.0"
@@ -125,6 +135,14 @@ struct tipc_net {
/* Cluster capabilities */
u16 capabilities;
+
+ /* Tracing of node internal messages */
+ struct packet_type loopback_pt;
+
+#ifdef CONFIG_TIPC_CRYPTO
+ /* TX crypto handler */
+ struct tipc_crypto *crypto_tx;
+#endif
};
static inline struct tipc_net *tipc_net(struct net *net)
@@ -182,6 +200,11 @@ static inline int in_range(u16 val, u16 min, u16 max)
return !less(val, min) && !more(val, max);
}
+static inline u32 tipc_net_hash_mixes(struct net *net, int tn_rand)
+{
+ return net_hash_mix(&init_net) ^ net_hash_mix(net) ^ tn_rand;
+}
+
#ifdef CONFIG_SYSCTL
int tipc_register_sysctl(void);
void tipc_unregister_sysctl(void);
diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c
new file mode 100644
index 000000000000..c8c47fc72653
--- /dev/null
+++ b/net/tipc/crypto.c
@@ -0,0 +1,1983 @@
+// SPDX-License-Identifier: GPL-2.0
+/**
+ * net/tipc/crypto.c: TIPC crypto for key handling & packet en/decryption
+ *
+ * Copyright (c) 2019, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <crypto/aead.h>
+#include <crypto/aes.h>
+#include "crypto.h"
+
+#define TIPC_TX_PROBE_LIM msecs_to_jiffies(1000) /* > 1s */
+#define TIPC_TX_LASTING_LIM msecs_to_jiffies(120000) /* 2 mins */
+#define TIPC_RX_ACTIVE_LIM msecs_to_jiffies(3000) /* 3s */
+#define TIPC_RX_PASSIVE_LIM msecs_to_jiffies(180000) /* 3 mins */
+#define TIPC_MAX_TFMS_DEF 10
+#define TIPC_MAX_TFMS_LIM 1000
+
+/**
+ * TIPC Key ids
+ */
+enum {
+ KEY_UNUSED = 0,
+ KEY_MIN,
+ KEY_1 = KEY_MIN,
+ KEY_2,
+ KEY_3,
+ KEY_MAX = KEY_3,
+};
+
+/**
+ * TIPC Crypto statistics
+ */
+enum {
+ STAT_OK,
+ STAT_NOK,
+ STAT_ASYNC,
+ STAT_ASYNC_OK,
+ STAT_ASYNC_NOK,
+ STAT_BADKEYS, /* tx only */
+ STAT_BADMSGS = STAT_BADKEYS, /* rx only */
+ STAT_NOKEYS,
+ STAT_SWITCHES,
+
+ MAX_STATS,
+};
+
+/* TIPC crypto statistics' header */
+static const char *hstats[MAX_STATS] = {"ok", "nok", "async", "async_ok",
+ "async_nok", "badmsgs", "nokeys",
+ "switches"};
+
+/* Max TFMs number per key */
+int sysctl_tipc_max_tfms __read_mostly = TIPC_MAX_TFMS_DEF;
+
+/**
+ * struct tipc_key - TIPC keys' status indicator
+ *
+ * 7 6 5 4 3 2 1 0
+ * +-----+-----+-----+-----+-----+-----+-----+-----+
+ * key: | (reserved)|passive idx| active idx|pending idx|
+ * +-----+-----+-----+-----+-----+-----+-----+-----+
+ */
+struct tipc_key {
+#define KEY_BITS (2)
+#define KEY_MASK ((1 << KEY_BITS) - 1)
+ union {
+ struct {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ u8 pending:2,
+ active:2,
+ passive:2, /* rx only */
+ reserved:2;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+ u8 reserved:2,
+ passive:2, /* rx only */
+ active:2,
+ pending:2;
+#else
+#error "Please fix <asm/byteorder.h>"
+#endif
+ } __packed;
+ u8 keys;
+ };
+};
+
+/**
+ * struct tipc_tfm - TIPC TFM structure to form a list of TFMs
+ */
+struct tipc_tfm {
+ struct crypto_aead *tfm;
+ struct list_head list;
+};
+
+/**
+ * struct tipc_aead - TIPC AEAD key structure
+ * @tfm_entry: per-cpu pointer to one entry in TFM list
+ * @crypto: TIPC crypto owns this key
+ * @cloned: reference to the source key in case cloning
+ * @users: the number of the key users (TX/RX)
+ * @salt: the key's SALT value
+ * @authsize: authentication tag size (max = 16)
+ * @mode: crypto mode is applied to the key
+ * @hint[]: a hint for user key
+ * @rcu: struct rcu_head
+ * @seqno: the key seqno (cluster scope)
+ * @refcnt: the key reference counter
+ */
+struct tipc_aead {
+#define TIPC_AEAD_HINT_LEN (5)
+ struct tipc_tfm * __percpu *tfm_entry;
+ struct tipc_crypto *crypto;
+ struct tipc_aead *cloned;
+ atomic_t users;
+ u32 salt;
+ u8 authsize;
+ u8 mode;
+ char hint[TIPC_AEAD_HINT_LEN + 1];
+ struct rcu_head rcu;
+
+ atomic64_t seqno ____cacheline_aligned;
+ refcount_t refcnt ____cacheline_aligned;
+
+} ____cacheline_aligned;
+
+/**
+ * struct tipc_crypto_stats - TIPC Crypto statistics
+ */
+struct tipc_crypto_stats {
+ unsigned int stat[MAX_STATS];
+};
+
+/**
+ * struct tipc_crypto - TIPC TX/RX crypto structure
+ * @net: struct net
+ * @node: TIPC node (RX)
+ * @aead: array of pointers to AEAD keys for encryption/decryption
+ * @peer_rx_active: replicated peer RX active key index
+ * @key: the key states
+ * @working: the crypto is working or not
+ * @stats: the crypto statistics
+ * @sndnxt: the per-peer sndnxt (TX)
+ * @timer1: general timer 1 (jiffies)
+ * @timer2: general timer 1 (jiffies)
+ * @lock: tipc_key lock
+ */
+struct tipc_crypto {
+ struct net *net;
+ struct tipc_node *node;
+ struct tipc_aead __rcu *aead[KEY_MAX + 1]; /* key[0] is UNUSED */
+ atomic_t peer_rx_active;
+ struct tipc_key key;
+ u8 working:1;
+ struct tipc_crypto_stats __percpu *stats;
+
+ atomic64_t sndnxt ____cacheline_aligned;
+ unsigned long timer1;
+ unsigned long timer2;
+ spinlock_t lock; /* crypto lock */
+
+} ____cacheline_aligned;
+
+/* struct tipc_crypto_tx_ctx - TX context for callbacks */
+struct tipc_crypto_tx_ctx {
+ struct tipc_aead *aead;
+ struct tipc_bearer *bearer;
+ struct tipc_media_addr dst;
+};
+
+/* struct tipc_crypto_rx_ctx - RX context for callbacks */
+struct tipc_crypto_rx_ctx {
+ struct tipc_aead *aead;
+ struct tipc_bearer *bearer;
+};
+
+static struct tipc_aead *tipc_aead_get(struct tipc_aead __rcu *aead);
+static inline void tipc_aead_put(struct tipc_aead *aead);
+static void tipc_aead_free(struct rcu_head *rp);
+static int tipc_aead_users(struct tipc_aead __rcu *aead);
+static void tipc_aead_users_inc(struct tipc_aead __rcu *aead, int lim);
+static void tipc_aead_users_dec(struct tipc_aead __rcu *aead, int lim);
+static void tipc_aead_users_set(struct tipc_aead __rcu *aead, int val);
+static struct crypto_aead *tipc_aead_tfm_next(struct tipc_aead *aead);
+static int tipc_aead_init(struct tipc_aead **aead, struct tipc_aead_key *ukey,
+ u8 mode);
+static int tipc_aead_clone(struct tipc_aead **dst, struct tipc_aead *src);
+static void *tipc_aead_mem_alloc(struct crypto_aead *tfm,
+ unsigned int crypto_ctx_size,
+ u8 **iv, struct aead_request **req,
+ struct scatterlist **sg, int nsg);
+static int tipc_aead_encrypt(struct tipc_aead *aead, struct sk_buff *skb,
+ struct tipc_bearer *b,
+ struct tipc_media_addr *dst,
+ struct tipc_node *__dnode);
+static void tipc_aead_encrypt_done(struct crypto_async_request *base, int err);
+static int tipc_aead_decrypt(struct net *net, struct tipc_aead *aead,
+ struct sk_buff *skb, struct tipc_bearer *b);
+static void tipc_aead_decrypt_done(struct crypto_async_request *base, int err);
+static inline int tipc_ehdr_size(struct tipc_ehdr *ehdr);
+static int tipc_ehdr_build(struct net *net, struct tipc_aead *aead,
+ u8 tx_key, struct sk_buff *skb,
+ struct tipc_crypto *__rx);
+static inline void tipc_crypto_key_set_state(struct tipc_crypto *c,
+ u8 new_passive,
+ u8 new_active,
+ u8 new_pending);
+static int tipc_crypto_key_attach(struct tipc_crypto *c,
+ struct tipc_aead *aead, u8 pos);
+static bool tipc_crypto_key_try_align(struct tipc_crypto *rx, u8 new_pending);
+static struct tipc_aead *tipc_crypto_key_pick_tx(struct tipc_crypto *tx,
+ struct tipc_crypto *rx,
+ struct sk_buff *skb);
+static void tipc_crypto_key_synch(struct tipc_crypto *rx, u8 new_rx_active,
+ struct tipc_msg *hdr);
+static int tipc_crypto_key_revoke(struct net *net, u8 tx_key);
+static void tipc_crypto_rcv_complete(struct net *net, struct tipc_aead *aead,
+ struct tipc_bearer *b,
+ struct sk_buff **skb, int err);
+static void tipc_crypto_do_cmd(struct net *net, int cmd);
+static char *tipc_crypto_key_dump(struct tipc_crypto *c, char *buf);
+#ifdef TIPC_CRYPTO_DEBUG
+static char *tipc_key_change_dump(struct tipc_key old, struct tipc_key new,
+ char *buf);
+#endif
+
+#define key_next(cur) ((cur) % KEY_MAX + 1)
+
+#define tipc_aead_rcu_ptr(rcu_ptr, lock) \
+ rcu_dereference_protected((rcu_ptr), lockdep_is_held(lock))
+
+#define tipc_aead_rcu_replace(rcu_ptr, ptr, lock) \
+do { \
+ typeof(rcu_ptr) __tmp = rcu_dereference_protected((rcu_ptr), \
+ lockdep_is_held(lock)); \
+ rcu_assign_pointer((rcu_ptr), (ptr)); \
+ tipc_aead_put(__tmp); \
+} while (0)
+
+#define tipc_crypto_key_detach(rcu_ptr, lock) \
+ tipc_aead_rcu_replace((rcu_ptr), NULL, lock)
+
+/**
+ * tipc_aead_key_validate - Validate a AEAD user key
+ */
+int tipc_aead_key_validate(struct tipc_aead_key *ukey)
+{
+ int keylen;
+
+ /* Check if algorithm exists */
+ if (unlikely(!crypto_has_alg(ukey->alg_name, 0, 0))) {
+ pr_info("Not found cipher: \"%s\"!\n", ukey->alg_name);
+ return -ENODEV;
+ }
+
+ /* Currently, we only support the "gcm(aes)" cipher algorithm */
+ if (strcmp(ukey->alg_name, "gcm(aes)"))
+ return -ENOTSUPP;
+
+ /* Check if key size is correct */
+ keylen = ukey->keylen - TIPC_AES_GCM_SALT_SIZE;
+ if (unlikely(keylen != TIPC_AES_GCM_KEY_SIZE_128 &&
+ keylen != TIPC_AES_GCM_KEY_SIZE_192 &&
+ keylen != TIPC_AES_GCM_KEY_SIZE_256))
+ return -EINVAL;
+
+ return 0;
+}
+
+static struct tipc_aead *tipc_aead_get(struct tipc_aead __rcu *aead)
+{
+ struct tipc_aead *tmp;
+
+ rcu_read_lock();
+ tmp = rcu_dereference(aead);
+ if (unlikely(!tmp || !refcount_inc_not_zero(&tmp->refcnt)))
+ tmp = NULL;
+ rcu_read_unlock();
+
+ return tmp;
+}
+
+static inline void tipc_aead_put(struct tipc_aead *aead)
+{
+ if (aead && refcount_dec_and_test(&aead->refcnt))
+ call_rcu(&aead->rcu, tipc_aead_free);
+}
+
+/**
+ * tipc_aead_free - Release AEAD key incl. all the TFMs in the list
+ * @rp: rcu head pointer
+ */
+static void tipc_aead_free(struct rcu_head *rp)
+{
+ struct tipc_aead *aead = container_of(rp, struct tipc_aead, rcu);
+ struct tipc_tfm *tfm_entry, *head, *tmp;
+
+ if (aead->cloned) {
+ tipc_aead_put(aead->cloned);
+ } else {
+ head = *this_cpu_ptr(aead->tfm_entry);
+ list_for_each_entry_safe(tfm_entry, tmp, &head->list, list) {
+ crypto_free_aead(tfm_entry->tfm);
+ list_del(&tfm_entry->list);
+ kfree(tfm_entry);
+ }
+ /* Free the head */
+ crypto_free_aead(head->tfm);
+ list_del(&head->list);
+ kfree(head);
+ }
+ free_percpu(aead->tfm_entry);
+ kfree(aead);
+}
+
+static int tipc_aead_users(struct tipc_aead __rcu *aead)
+{
+ struct tipc_aead *tmp;
+ int users = 0;
+
+ rcu_read_lock();
+ tmp = rcu_dereference(aead);
+ if (tmp)
+ users = atomic_read(&tmp->users);
+ rcu_read_unlock();
+
+ return users;
+}
+
+static void tipc_aead_users_inc(struct tipc_aead __rcu *aead, int lim)
+{
+ struct tipc_aead *tmp;
+
+ rcu_read_lock();
+ tmp = rcu_dereference(aead);
+ if (tmp)
+ atomic_add_unless(&tmp->users, 1, lim);
+ rcu_read_unlock();
+}
+
+static void tipc_aead_users_dec(struct tipc_aead __rcu *aead, int lim)
+{
+ struct tipc_aead *tmp;
+
+ rcu_read_lock();
+ tmp = rcu_dereference(aead);
+ if (tmp)
+ atomic_add_unless(&rcu_dereference(aead)->users, -1, lim);
+ rcu_read_unlock();
+}
+
+static void tipc_aead_users_set(struct tipc_aead __rcu *aead, int val)
+{
+ struct tipc_aead *tmp;
+ int cur;
+
+ rcu_read_lock();
+ tmp = rcu_dereference(aead);
+ if (tmp) {
+ do {
+ cur = atomic_read(&tmp->users);
+ if (cur == val)
+ break;
+ } while (atomic_cmpxchg(&tmp->users, cur, val) != cur);
+ }
+ rcu_read_unlock();
+}
+
+/**
+ * tipc_aead_tfm_next - Move TFM entry to the next one in list and return it
+ */
+static struct crypto_aead *tipc_aead_tfm_next(struct tipc_aead *aead)
+{
+ struct tipc_tfm **tfm_entry = this_cpu_ptr(aead->tfm_entry);
+
+ *tfm_entry = list_next_entry(*tfm_entry, list);
+ return (*tfm_entry)->tfm;
+}
+
+/**
+ * tipc_aead_init - Initiate TIPC AEAD
+ * @aead: returned new TIPC AEAD key handle pointer
+ * @ukey: pointer to user key data
+ * @mode: the key mode
+ *
+ * Allocate a (list of) new cipher transformation (TFM) with the specific user
+ * key data if valid. The number of the allocated TFMs can be set via the sysfs
+ * "net/tipc/max_tfms" first.
+ * Also, all the other AEAD data are also initialized.
+ *
+ * Return: 0 if the initiation is successful, otherwise: < 0
+ */
+static int tipc_aead_init(struct tipc_aead **aead, struct tipc_aead_key *ukey,
+ u8 mode)
+{
+ struct tipc_tfm *tfm_entry, *head;
+ struct crypto_aead *tfm;
+ struct tipc_aead *tmp;
+ int keylen, err, cpu;
+ int tfm_cnt = 0;
+
+ if (unlikely(*aead))
+ return -EEXIST;
+
+ /* Allocate a new AEAD */
+ tmp = kzalloc(sizeof(*tmp), GFP_ATOMIC);
+ if (unlikely(!tmp))
+ return -ENOMEM;
+
+ /* The key consists of two parts: [AES-KEY][SALT] */
+ keylen = ukey->keylen - TIPC_AES_GCM_SALT_SIZE;
+
+ /* Allocate per-cpu TFM entry pointer */
+ tmp->tfm_entry = alloc_percpu(struct tipc_tfm *);
+ if (!tmp->tfm_entry) {
+ kzfree(tmp);
+ return -ENOMEM;
+ }
+
+ /* Make a list of TFMs with the user key data */
+ do {
+ tfm = crypto_alloc_aead(ukey->alg_name, 0, 0);
+ if (IS_ERR(tfm)) {
+ err = PTR_ERR(tfm);
+ break;
+ }
+
+ if (unlikely(!tfm_cnt &&
+ crypto_aead_ivsize(tfm) != TIPC_AES_GCM_IV_SIZE)) {
+ crypto_free_aead(tfm);
+ err = -ENOTSUPP;
+ break;
+ }
+
+ err = crypto_aead_setauthsize(tfm, TIPC_AES_GCM_TAG_SIZE);
+ err |= crypto_aead_setkey(tfm, ukey->key, keylen);
+ if (unlikely(err)) {
+ crypto_free_aead(tfm);
+ break;
+ }
+
+ tfm_entry = kmalloc(sizeof(*tfm_entry), GFP_KERNEL);
+ if (unlikely(!tfm_entry)) {
+ crypto_free_aead(tfm);
+ err = -ENOMEM;
+ break;
+ }
+ INIT_LIST_HEAD(&tfm_entry->list);
+ tfm_entry->tfm = tfm;
+
+ /* First entry? */
+ if (!tfm_cnt) {
+ head = tfm_entry;
+ for_each_possible_cpu(cpu) {
+ *per_cpu_ptr(tmp->tfm_entry, cpu) = head;
+ }
+ } else {
+ list_add_tail(&tfm_entry->list, &head->list);
+ }
+
+ } while (++tfm_cnt < sysctl_tipc_max_tfms);
+
+ /* Not any TFM is allocated? */
+ if (!tfm_cnt) {
+ free_percpu(tmp->tfm_entry);
+ kzfree(tmp);
+ return err;
+ }
+
+ /* Copy some chars from the user key as a hint */
+ memcpy(tmp->hint, ukey->key, TIPC_AEAD_HINT_LEN);
+ tmp->hint[TIPC_AEAD_HINT_LEN] = '\0';
+
+ /* Initialize the other data */
+ tmp->mode = mode;
+ tmp->cloned = NULL;
+ tmp->authsize = TIPC_AES_GCM_TAG_SIZE;
+ memcpy(&tmp->salt, ukey->key + keylen, TIPC_AES_GCM_SALT_SIZE);
+ atomic_set(&tmp->users, 0);
+ atomic64_set(&tmp->seqno, 0);
+ refcount_set(&tmp->refcnt, 1);
+
+ *aead = tmp;
+ return 0;
+}
+
+/**
+ * tipc_aead_clone - Clone a TIPC AEAD key
+ * @dst: dest key for the cloning
+ * @src: source key to clone from
+ *
+ * Make a "copy" of the source AEAD key data to the dest, the TFMs list is
+ * common for the keys.
+ * A reference to the source is hold in the "cloned" pointer for the later
+ * freeing purposes.
+ *
+ * Note: this must be done in cluster-key mode only!
+ * Return: 0 in case of success, otherwise < 0
+ */
+static int tipc_aead_clone(struct tipc_aead **dst, struct tipc_aead *src)
+{
+ struct tipc_aead *aead;
+ int cpu;
+
+ if (!src)
+ return -ENOKEY;
+
+ if (src->mode != CLUSTER_KEY)
+ return -EINVAL;
+
+ if (unlikely(*dst))
+ return -EEXIST;
+
+ aead = kzalloc(sizeof(*aead), GFP_ATOMIC);
+ if (unlikely(!aead))
+ return -ENOMEM;
+
+ aead->tfm_entry = alloc_percpu_gfp(struct tipc_tfm *, GFP_ATOMIC);
+ if (unlikely(!aead->tfm_entry)) {
+ kzfree(aead);
+ return -ENOMEM;
+ }
+
+ for_each_possible_cpu(cpu) {
+ *per_cpu_ptr(aead->tfm_entry, cpu) =
+ *per_cpu_ptr(src->tfm_entry, cpu);
+ }
+
+ memcpy(aead->hint, src->hint, sizeof(src->hint));
+ aead->mode = src->mode;
+ aead->salt = src->salt;
+ aead->authsize = src->authsize;
+ atomic_set(&aead->users, 0);
+ atomic64_set(&aead->seqno, 0);
+ refcount_set(&aead->refcnt, 1);
+
+ WARN_ON(!refcount_inc_not_zero(&src->refcnt));
+ aead->cloned = src;
+
+ *dst = aead;
+ return 0;
+}
+
+/**
+ * tipc_aead_mem_alloc - Allocate memory for AEAD request operations
+ * @tfm: cipher handle to be registered with the request
+ * @crypto_ctx_size: size of crypto context for callback
+ * @iv: returned pointer to IV data
+ * @req: returned pointer to AEAD request data
+ * @sg: returned pointer to SG lists
+ * @nsg: number of SG lists to be allocated
+ *
+ * Allocate memory to store the crypto context data, AEAD request, IV and SG
+ * lists, the memory layout is as follows:
+ * crypto_ctx || iv || aead_req || sg[]
+ *
+ * Return: the pointer to the memory areas in case of success, otherwise NULL
+ */
+static void *tipc_aead_mem_alloc(struct crypto_aead *tfm,
+ unsigned int crypto_ctx_size,
+ u8 **iv, struct aead_request **req,
+ struct scatterlist **sg, int nsg)
+{
+ unsigned int iv_size, req_size;
+ unsigned int len;
+ u8 *mem;
+
+ iv_size = crypto_aead_ivsize(tfm);
+ req_size = sizeof(**req) + crypto_aead_reqsize(tfm);
+
+ len = crypto_ctx_size;
+ len += iv_size;
+ len += crypto_aead_alignmask(tfm) & ~(crypto_tfm_ctx_alignment() - 1);
+ len = ALIGN(len, crypto_tfm_ctx_alignment());
+ len += req_size;
+ len = ALIGN(len, __alignof__(struct scatterlist));
+ len += nsg * sizeof(**sg);
+
+ mem = kmalloc(len, GFP_ATOMIC);
+ if (!mem)
+ return NULL;
+
+ *iv = (u8 *)PTR_ALIGN(mem + crypto_ctx_size,
+ crypto_aead_alignmask(tfm) + 1);
+ *req = (struct aead_request *)PTR_ALIGN(*iv + iv_size,
+ crypto_tfm_ctx_alignment());
+ *sg = (struct scatterlist *)PTR_ALIGN((u8 *)*req + req_size,
+ __alignof__(struct scatterlist));
+
+ return (void *)mem;
+}
+
+/**
+ * tipc_aead_encrypt - Encrypt a message
+ * @aead: TIPC AEAD key for the message encryption
+ * @skb: the input/output skb
+ * @b: TIPC bearer where the message will be delivered after the encryption
+ * @dst: the destination media address
+ * @__dnode: TIPC dest node if "known"
+ *
+ * Return:
+ * 0 : if the encryption has completed
+ * -EINPROGRESS/-EBUSY : if a callback will be performed
+ * < 0 : the encryption has failed
+ */
+static int tipc_aead_encrypt(struct tipc_aead *aead, struct sk_buff *skb,
+ struct tipc_bearer *b,
+ struct tipc_media_addr *dst,
+ struct tipc_node *__dnode)
+{
+ struct crypto_aead *tfm = tipc_aead_tfm_next(aead);
+ struct tipc_crypto_tx_ctx *tx_ctx;
+ struct aead_request *req;
+ struct sk_buff *trailer;
+ struct scatterlist *sg;
+ struct tipc_ehdr *ehdr;
+ int ehsz, len, tailen, nsg, rc;
+ void *ctx;
+ u32 salt;
+ u8 *iv;
+
+ /* Make sure message len at least 4-byte aligned */
+ len = ALIGN(skb->len, 4);
+ tailen = len - skb->len + aead->authsize;
+
+ /* Expand skb tail for authentication tag:
+ * As for simplicity, we'd have made sure skb having enough tailroom
+ * for authentication tag @skb allocation. Even when skb is nonlinear
+ * but there is no frag_list, it should be still fine!
+ * Otherwise, we must cow it to be a writable buffer with the tailroom.
+ */
+#ifdef TIPC_CRYPTO_DEBUG
+ SKB_LINEAR_ASSERT(skb);
+ if (tailen > skb_tailroom(skb)) {
+ pr_warn("TX: skb tailroom is not enough: %d, requires: %d\n",
+ skb_tailroom(skb), tailen);
+ }
+#endif
+
+ if (unlikely(!skb_cloned(skb) && tailen <= skb_tailroom(skb))) {
+ nsg = 1;
+ trailer = skb;
+ } else {
+ /* TODO: We could avoid skb_cow_data() if skb has no frag_list
+ * e.g. by skb_fill_page_desc() to add another page to the skb
+ * with the wanted tailen... However, page skbs look not often,
+ * so take it easy now!
+ * Cloned skbs e.g. from link_xmit() seems no choice though :(
+ */
+ nsg = skb_cow_data(skb, tailen, &trailer);
+ if (unlikely(nsg < 0)) {
+ pr_err("TX: skb_cow_data() returned %d\n", nsg);
+ return nsg;
+ }
+ }
+
+ pskb_put(skb, trailer, tailen);
+
+ /* Allocate memory for the AEAD operation */
+ ctx = tipc_aead_mem_alloc(tfm, sizeof(*tx_ctx), &iv, &req, &sg, nsg);
+ if (unlikely(!ctx))
+ return -ENOMEM;
+ TIPC_SKB_CB(skb)->crypto_ctx = ctx;
+
+ /* Map skb to the sg lists */
+ sg_init_table(sg, nsg);
+ rc = skb_to_sgvec(skb, sg, 0, skb->len);
+ if (unlikely(rc < 0)) {
+ pr_err("TX: skb_to_sgvec() returned %d, nsg %d!\n", rc, nsg);
+ goto exit;
+ }
+
+ /* Prepare IV: [SALT (4 octets)][SEQNO (8 octets)]
+ * In case we're in cluster-key mode, SALT is varied by xor-ing with
+ * the source address (or w0 of id), otherwise with the dest address
+ * if dest is known.
+ */
+ ehdr = (struct tipc_ehdr *)skb->data;
+ salt = aead->salt;
+ if (aead->mode == CLUSTER_KEY)
+ salt ^= ehdr->addr; /* __be32 */
+ else if (__dnode)
+ salt ^= tipc_node_get_addr(__dnode);
+ memcpy(iv, &salt, 4);
+ memcpy(iv + 4, (u8 *)&ehdr->seqno, 8);
+
+ /* Prepare request */
+ ehsz = tipc_ehdr_size(ehdr);
+ aead_request_set_tfm(req, tfm);
+ aead_request_set_ad(req, ehsz);
+ aead_request_set_crypt(req, sg, sg, len - ehsz, iv);
+
+ /* Set callback function & data */
+ aead_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+ tipc_aead_encrypt_done, skb);
+ tx_ctx = (struct tipc_crypto_tx_ctx *)ctx;
+ tx_ctx->aead = aead;
+ tx_ctx->bearer = b;
+ memcpy(&tx_ctx->dst, dst, sizeof(*dst));
+
+ /* Hold bearer */
+ if (unlikely(!tipc_bearer_hold(b))) {
+ rc = -ENODEV;
+ goto exit;
+ }
+
+ /* Now, do encrypt */
+ rc = crypto_aead_encrypt(req);
+ if (rc == -EINPROGRESS || rc == -EBUSY)
+ return rc;
+
+ tipc_bearer_put(b);
+
+exit:
+ kfree(ctx);
+ TIPC_SKB_CB(skb)->crypto_ctx = NULL;
+ return rc;
+}
+
+static void tipc_aead_encrypt_done(struct crypto_async_request *base, int err)
+{
+ struct sk_buff *skb = base->data;
+ struct tipc_crypto_tx_ctx *tx_ctx = TIPC_SKB_CB(skb)->crypto_ctx;
+ struct tipc_bearer *b = tx_ctx->bearer;
+ struct tipc_aead *aead = tx_ctx->aead;
+ struct tipc_crypto *tx = aead->crypto;
+ struct net *net = tx->net;
+
+ switch (err) {
+ case 0:
+ this_cpu_inc(tx->stats->stat[STAT_ASYNC_OK]);
+ if (likely(test_bit(0, &b->up)))
+ b->media->send_msg(net, skb, b, &tx_ctx->dst);
+ else
+ kfree_skb(skb);
+ break;
+ case -EINPROGRESS:
+ return;
+ default:
+ this_cpu_inc(tx->stats->stat[STAT_ASYNC_NOK]);
+ kfree_skb(skb);
+ break;
+ }
+
+ kfree(tx_ctx);
+ tipc_bearer_put(b);
+ tipc_aead_put(aead);
+}
+
+/**
+ * tipc_aead_decrypt - Decrypt an encrypted message
+ * @net: struct net
+ * @aead: TIPC AEAD for the message decryption
+ * @skb: the input/output skb
+ * @b: TIPC bearer where the message has been received
+ *
+ * Return:
+ * 0 : if the decryption has completed
+ * -EINPROGRESS/-EBUSY : if a callback will be performed
+ * < 0 : the decryption has failed
+ */
+static int tipc_aead_decrypt(struct net *net, struct tipc_aead *aead,
+ struct sk_buff *skb, struct tipc_bearer *b)
+{
+ struct tipc_crypto_rx_ctx *rx_ctx;
+ struct aead_request *req;
+ struct crypto_aead *tfm;
+ struct sk_buff *unused;
+ struct scatterlist *sg;
+ struct tipc_ehdr *ehdr;
+ int ehsz, nsg, rc;
+ void *ctx;
+ u32 salt;
+ u8 *iv;
+
+ if (unlikely(!aead))
+ return -ENOKEY;
+
+ /* Cow skb data if needed */
+ if (likely(!skb_cloned(skb) &&
+ (!skb_is_nonlinear(skb) || !skb_has_frag_list(skb)))) {
+ nsg = 1 + skb_shinfo(skb)->nr_frags;
+ } else {
+ nsg = skb_cow_data(skb, 0, &unused);
+ if (unlikely(nsg < 0)) {
+ pr_err("RX: skb_cow_data() returned %d\n", nsg);
+ return nsg;
+ }
+ }
+
+ /* Allocate memory for the AEAD operation */
+ tfm = tipc_aead_tfm_next(aead);
+ ctx = tipc_aead_mem_alloc(tfm, sizeof(*rx_ctx), &iv, &req, &sg, nsg);
+ if (unlikely(!ctx))
+ return -ENOMEM;
+ TIPC_SKB_CB(skb)->crypto_ctx = ctx;
+
+ /* Map skb to the sg lists */
+ sg_init_table(sg, nsg);
+ rc = skb_to_sgvec(skb, sg, 0, skb->len);
+ if (unlikely(rc < 0)) {
+ pr_err("RX: skb_to_sgvec() returned %d, nsg %d\n", rc, nsg);
+ goto exit;
+ }
+
+ /* Reconstruct IV: */
+ ehdr = (struct tipc_ehdr *)skb->data;
+ salt = aead->salt;
+ if (aead->mode == CLUSTER_KEY)
+ salt ^= ehdr->addr; /* __be32 */
+ else if (ehdr->destined)
+ salt ^= tipc_own_addr(net);
+ memcpy(iv, &salt, 4);
+ memcpy(iv + 4, (u8 *)&ehdr->seqno, 8);
+
+ /* Prepare request */
+ ehsz = tipc_ehdr_size(ehdr);
+ aead_request_set_tfm(req, tfm);
+ aead_request_set_ad(req, ehsz);
+ aead_request_set_crypt(req, sg, sg, skb->len - ehsz, iv);
+
+ /* Set callback function & data */
+ aead_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+ tipc_aead_decrypt_done, skb);
+ rx_ctx = (struct tipc_crypto_rx_ctx *)ctx;
+ rx_ctx->aead = aead;
+ rx_ctx->bearer = b;
+
+ /* Hold bearer */
+ if (unlikely(!tipc_bearer_hold(b))) {
+ rc = -ENODEV;
+ goto exit;
+ }
+
+ /* Now, do decrypt */
+ rc = crypto_aead_decrypt(req);
+ if (rc == -EINPROGRESS || rc == -EBUSY)
+ return rc;
+
+ tipc_bearer_put(b);
+
+exit:
+ kfree(ctx);
+ TIPC_SKB_CB(skb)->crypto_ctx = NULL;
+ return rc;
+}
+
+static void tipc_aead_decrypt_done(struct crypto_async_request *base, int err)
+{
+ struct sk_buff *skb = base->data;
+ struct tipc_crypto_rx_ctx *rx_ctx = TIPC_SKB_CB(skb)->crypto_ctx;
+ struct tipc_bearer *b = rx_ctx->bearer;
+ struct tipc_aead *aead = rx_ctx->aead;
+ struct tipc_crypto_stats __percpu *stats = aead->crypto->stats;
+ struct net *net = aead->crypto->net;
+
+ switch (err) {
+ case 0:
+ this_cpu_inc(stats->stat[STAT_ASYNC_OK]);
+ break;
+ case -EINPROGRESS:
+ return;
+ default:
+ this_cpu_inc(stats->stat[STAT_ASYNC_NOK]);
+ break;
+ }
+
+ kfree(rx_ctx);
+ tipc_crypto_rcv_complete(net, aead, b, &skb, err);
+ if (likely(skb)) {
+ if (likely(test_bit(0, &b->up)))
+ tipc_rcv(net, skb, b);
+ else
+ kfree_skb(skb);
+ }
+
+ tipc_bearer_put(b);
+}
+
+static inline int tipc_ehdr_size(struct tipc_ehdr *ehdr)
+{
+ return (ehdr->user != LINK_CONFIG) ? EHDR_SIZE : EHDR_CFG_SIZE;
+}
+
+/**
+ * tipc_ehdr_validate - Validate an encryption message
+ * @skb: the message buffer
+ *
+ * Returns "true" if this is a valid encryption message, otherwise "false"
+ */
+bool tipc_ehdr_validate(struct sk_buff *skb)
+{
+ struct tipc_ehdr *ehdr;
+ int ehsz;
+
+ if (unlikely(!pskb_may_pull(skb, EHDR_MIN_SIZE)))
+ return false;
+
+ ehdr = (struct tipc_ehdr *)skb->data;
+ if (unlikely(ehdr->version != TIPC_EVERSION))
+ return false;
+ ehsz = tipc_ehdr_size(ehdr);
+ if (unlikely(!pskb_may_pull(skb, ehsz)))
+ return false;
+ if (unlikely(skb->len <= ehsz + TIPC_AES_GCM_TAG_SIZE))
+ return false;
+ if (unlikely(!ehdr->tx_key))
+ return false;
+
+ return true;
+}
+
+/**
+ * tipc_ehdr_build - Build TIPC encryption message header
+ * @net: struct net
+ * @aead: TX AEAD key to be used for the message encryption
+ * @tx_key: key id used for the message encryption
+ * @skb: input/output message skb
+ * @__rx: RX crypto handle if dest is "known"
+ *
+ * Return: the header size if the building is successful, otherwise < 0
+ */
+static int tipc_ehdr_build(struct net *net, struct tipc_aead *aead,
+ u8 tx_key, struct sk_buff *skb,
+ struct tipc_crypto *__rx)
+{
+ struct tipc_msg *hdr = buf_msg(skb);
+ struct tipc_ehdr *ehdr;
+ u32 user = msg_user(hdr);
+ u64 seqno;
+ int ehsz;
+
+ /* Make room for encryption header */
+ ehsz = (user != LINK_CONFIG) ? EHDR_SIZE : EHDR_CFG_SIZE;
+ WARN_ON(skb_headroom(skb) < ehsz);
+ ehdr = (struct tipc_ehdr *)skb_push(skb, ehsz);
+
+ /* Obtain a seqno first:
+ * Use the key seqno (= cluster wise) if dest is unknown or we're in
+ * cluster key mode, otherwise it's better for a per-peer seqno!
+ */
+ if (!__rx || aead->mode == CLUSTER_KEY)
+ seqno = atomic64_inc_return(&aead->seqno);
+ else
+ seqno = atomic64_inc_return(&__rx->sndnxt);
+
+ /* Revoke the key if seqno is wrapped around */
+ if (unlikely(!seqno))
+ return tipc_crypto_key_revoke(net, tx_key);
+
+ /* Word 1-2 */
+ ehdr->seqno = cpu_to_be64(seqno);
+
+ /* Words 0, 3- */
+ ehdr->version = TIPC_EVERSION;
+ ehdr->user = 0;
+ ehdr->keepalive = 0;
+ ehdr->tx_key = tx_key;
+ ehdr->destined = (__rx) ? 1 : 0;
+ ehdr->rx_key_active = (__rx) ? __rx->key.active : 0;
+ ehdr->reserved_1 = 0;
+ ehdr->reserved_2 = 0;
+
+ switch (user) {
+ case LINK_CONFIG:
+ ehdr->user = LINK_CONFIG;
+ memcpy(ehdr->id, tipc_own_id(net), NODE_ID_LEN);
+ break;
+ default:
+ if (user == LINK_PROTOCOL && msg_type(hdr) == STATE_MSG) {
+ ehdr->user = LINK_PROTOCOL;
+ ehdr->keepalive = msg_is_keepalive(hdr);
+ }
+ ehdr->addr = hdr->hdr[3];
+ break;
+ }
+
+ return ehsz;
+}
+
+static inline void tipc_crypto_key_set_state(struct tipc_crypto *c,
+ u8 new_passive,
+ u8 new_active,
+ u8 new_pending)
+{
+#ifdef TIPC_CRYPTO_DEBUG
+ struct tipc_key old = c->key;
+ char buf[32];
+#endif
+
+ c->key.keys = ((new_passive & KEY_MASK) << (KEY_BITS * 2)) |
+ ((new_active & KEY_MASK) << (KEY_BITS)) |
+ ((new_pending & KEY_MASK));
+
+#ifdef TIPC_CRYPTO_DEBUG
+ pr_info("%s(%s): key changing %s ::%pS\n",
+ (c->node) ? "RX" : "TX",
+ (c->node) ? tipc_node_get_id_str(c->node) :
+ tipc_own_id_string(c->net),
+ tipc_key_change_dump(old, c->key, buf),
+ __builtin_return_address(0));
+#endif
+}
+
+/**
+ * tipc_crypto_key_init - Initiate a new user / AEAD key
+ * @c: TIPC crypto to which new key is attached
+ * @ukey: the user key
+ * @mode: the key mode (CLUSTER_KEY or PER_NODE_KEY)
+ *
+ * A new TIPC AEAD key will be allocated and initiated with the specified user
+ * key, then attached to the TIPC crypto.
+ *
+ * Return: new key id in case of success, otherwise: < 0
+ */
+int tipc_crypto_key_init(struct tipc_crypto *c, struct tipc_aead_key *ukey,
+ u8 mode)
+{
+ struct tipc_aead *aead = NULL;
+ int rc = 0;
+
+ /* Initiate with the new user key */
+ rc = tipc_aead_init(&aead, ukey, mode);
+
+ /* Attach it to the crypto */
+ if (likely(!rc)) {
+ rc = tipc_crypto_key_attach(c, aead, 0);
+ if (rc < 0)
+ tipc_aead_free(&aead->rcu);
+ }
+
+ pr_info("%s(%s): key initiating, rc %d!\n",
+ (c->node) ? "RX" : "TX",
+ (c->node) ? tipc_node_get_id_str(c->node) :
+ tipc_own_id_string(c->net),
+ rc);
+
+ return rc;
+}
+
+/**
+ * tipc_crypto_key_attach - Attach a new AEAD key to TIPC crypto
+ * @c: TIPC crypto to which the new AEAD key is attached
+ * @aead: the new AEAD key pointer
+ * @pos: desired slot in the crypto key array, = 0 if any!
+ *
+ * Return: new key id in case of success, otherwise: -EBUSY
+ */
+static int tipc_crypto_key_attach(struct tipc_crypto *c,
+ struct tipc_aead *aead, u8 pos)
+{
+ u8 new_pending, new_passive, new_key;
+ struct tipc_key key;
+ int rc = -EBUSY;
+
+ spin_lock_bh(&c->lock);
+ key = c->key;
+ if (key.active && key.passive)
+ goto exit;
+ if (key.passive && !tipc_aead_users(c->aead[key.passive]))
+ goto exit;
+ if (key.pending) {
+ if (pos)
+ goto exit;
+ if (tipc_aead_users(c->aead[key.pending]) > 0)
+ goto exit;
+ /* Replace it */
+ new_pending = key.pending;
+ new_passive = key.passive;
+ new_key = new_pending;
+ } else {
+ if (pos) {
+ if (key.active && pos != key_next(key.active)) {
+ new_pending = key.pending;
+ new_passive = pos;
+ new_key = new_passive;
+ goto attach;
+ } else if (!key.active && !key.passive) {
+ new_pending = pos;
+ new_passive = key.passive;
+ new_key = new_pending;
+ goto attach;
+ }
+ }
+ new_pending = key_next(key.active ?: key.passive);
+ new_passive = key.passive;
+ new_key = new_pending;
+ }
+
+attach:
+ aead->crypto = c;
+ tipc_crypto_key_set_state(c, new_passive, key.active, new_pending);
+ tipc_aead_rcu_replace(c->aead[new_key], aead, &c->lock);
+
+ c->working = 1;
+ c->timer1 = jiffies;
+ c->timer2 = jiffies;
+ rc = new_key;
+
+exit:
+ spin_unlock_bh(&c->lock);
+ return rc;
+}
+
+void tipc_crypto_key_flush(struct tipc_crypto *c)
+{
+ int k;
+
+ spin_lock_bh(&c->lock);
+ c->working = 0;
+ tipc_crypto_key_set_state(c, 0, 0, 0);
+ for (k = KEY_MIN; k <= KEY_MAX; k++)
+ tipc_crypto_key_detach(c->aead[k], &c->lock);
+ atomic_set(&c->peer_rx_active, 0);
+ atomic64_set(&c->sndnxt, 0);
+ spin_unlock_bh(&c->lock);
+}
+
+/**
+ * tipc_crypto_key_try_align - Align RX keys if possible
+ * @rx: RX crypto handle
+ * @new_pending: new pending slot if aligned (= TX key from peer)
+ *
+ * Peer has used an unknown key slot, this only happens when peer has left and
+ * rejoned, or we are newcomer.
+ * That means, there must be no active key but a pending key at unaligned slot.
+ * If so, we try to move the pending key to the new slot.
+ * Note: A potential passive key can exist, it will be shifted correspondingly!
+ *
+ * Return: "true" if key is successfully aligned, otherwise "false"
+ */
+static bool tipc_crypto_key_try_align(struct tipc_crypto *rx, u8 new_pending)
+{
+ struct tipc_aead *tmp1, *tmp2 = NULL;
+ struct tipc_key key;
+ bool aligned = false;
+ u8 new_passive = 0;
+ int x;
+
+ spin_lock(&rx->lock);
+ key = rx->key;
+ if (key.pending == new_pending) {
+ aligned = true;
+ goto exit;
+ }
+ if (key.active)
+ goto exit;
+ if (!key.pending)
+ goto exit;
+ if (tipc_aead_users(rx->aead[key.pending]) > 0)
+ goto exit;
+
+ /* Try to "isolate" this pending key first */
+ tmp1 = tipc_aead_rcu_ptr(rx->aead[key.pending], &rx->lock);
+ if (!refcount_dec_if_one(&tmp1->refcnt))
+ goto exit;
+ rcu_assign_pointer(rx->aead[key.pending], NULL);
+
+ /* Move passive key if any */
+ if (key.passive) {
+ tmp2 = rcu_replace_pointer(rx->aead[key.passive], tmp2, lockdep_is_held(&rx->lock));
+ x = (key.passive - key.pending + new_pending) % KEY_MAX;
+ new_passive = (x <= 0) ? x + KEY_MAX : x;
+ }
+
+ /* Re-allocate the key(s) */
+ tipc_crypto_key_set_state(rx, new_passive, 0, new_pending);
+ rcu_assign_pointer(rx->aead[new_pending], tmp1);
+ if (new_passive)
+ rcu_assign_pointer(rx->aead[new_passive], tmp2);
+ refcount_set(&tmp1->refcnt, 1);
+ aligned = true;
+ pr_info("RX(%s): key is aligned!\n", tipc_node_get_id_str(rx->node));
+
+exit:
+ spin_unlock(&rx->lock);
+ return aligned;
+}
+
+/**
+ * tipc_crypto_key_pick_tx - Pick one TX key for message decryption
+ * @tx: TX crypto handle
+ * @rx: RX crypto handle (can be NULL)
+ * @skb: the message skb which will be decrypted later
+ *
+ * This function looks up the existing TX keys and pick one which is suitable
+ * for the message decryption, that must be a cluster key and not used before
+ * on the same message (i.e. recursive).
+ *
+ * Return: the TX AEAD key handle in case of success, otherwise NULL
+ */
+static struct tipc_aead *tipc_crypto_key_pick_tx(struct tipc_crypto *tx,
+ struct tipc_crypto *rx,
+ struct sk_buff *skb)
+{
+ struct tipc_skb_cb *skb_cb = TIPC_SKB_CB(skb);
+ struct tipc_aead *aead = NULL;
+ struct tipc_key key = tx->key;
+ u8 k, i = 0;
+
+ /* Initialize data if not yet */
+ if (!skb_cb->tx_clone_deferred) {
+ skb_cb->tx_clone_deferred = 1;
+ memset(&skb_cb->tx_clone_ctx, 0, sizeof(skb_cb->tx_clone_ctx));
+ }
+
+ skb_cb->tx_clone_ctx.rx = rx;
+ if (++skb_cb->tx_clone_ctx.recurs > 2)
+ return NULL;
+
+ /* Pick one TX key */
+ spin_lock(&tx->lock);
+ do {
+ k = (i == 0) ? key.pending :
+ ((i == 1) ? key.active : key.passive);
+ if (!k)
+ continue;
+ aead = tipc_aead_rcu_ptr(tx->aead[k], &tx->lock);
+ if (!aead)
+ continue;
+ if (aead->mode != CLUSTER_KEY ||
+ aead == skb_cb->tx_clone_ctx.last) {
+ aead = NULL;
+ continue;
+ }
+ /* Ok, found one cluster key */
+ skb_cb->tx_clone_ctx.last = aead;
+ WARN_ON(skb->next);
+ skb->next = skb_clone(skb, GFP_ATOMIC);
+ if (unlikely(!skb->next))
+ pr_warn("Failed to clone skb for next round if any\n");
+ WARN_ON(!refcount_inc_not_zero(&aead->refcnt));
+ break;
+ } while (++i < 3);
+ spin_unlock(&tx->lock);
+
+ return aead;
+}
+
+/**
+ * tipc_crypto_key_synch: Synch own key data according to peer key status
+ * @rx: RX crypto handle
+ * @new_rx_active: latest RX active key from peer
+ * @hdr: TIPCv2 message
+ *
+ * This function updates the peer node related data as the peer RX active key
+ * has changed, so the number of TX keys' users on this node are increased and
+ * decreased correspondingly.
+ *
+ * The "per-peer" sndnxt is also reset when the peer key has switched.
+ */
+static void tipc_crypto_key_synch(struct tipc_crypto *rx, u8 new_rx_active,
+ struct tipc_msg *hdr)
+{
+ struct net *net = rx->net;
+ struct tipc_crypto *tx = tipc_net(net)->crypto_tx;
+ u8 cur_rx_active;
+
+ /* TX might be even not ready yet */
+ if (unlikely(!tx->key.active && !tx->key.pending))
+ return;
+
+ cur_rx_active = atomic_read(&rx->peer_rx_active);
+ if (likely(cur_rx_active == new_rx_active))
+ return;
+
+ /* Make sure this message destined for this node */
+ if (unlikely(msg_short(hdr) ||
+ msg_destnode(hdr) != tipc_own_addr(net)))
+ return;
+
+ /* Peer RX active key has changed, try to update owns' & TX users */
+ if (atomic_cmpxchg(&rx->peer_rx_active,
+ cur_rx_active,
+ new_rx_active) == cur_rx_active) {
+ if (new_rx_active)
+ tipc_aead_users_inc(tx->aead[new_rx_active], INT_MAX);
+ if (cur_rx_active)
+ tipc_aead_users_dec(tx->aead[cur_rx_active], 0);
+
+ atomic64_set(&rx->sndnxt, 0);
+ /* Mark the point TX key users changed */
+ tx->timer1 = jiffies;
+
+#ifdef TIPC_CRYPTO_DEBUG
+ pr_info("TX(%s): key users changed %d-- %d++, peer RX(%s)\n",
+ tipc_own_id_string(net), cur_rx_active,
+ new_rx_active, tipc_node_get_id_str(rx->node));
+#endif
+ }
+}
+
+static int tipc_crypto_key_revoke(struct net *net, u8 tx_key)
+{
+ struct tipc_crypto *tx = tipc_net(net)->crypto_tx;
+ struct tipc_key key;
+
+ spin_lock(&tx->lock);
+ key = tx->key;
+ WARN_ON(!key.active || tx_key != key.active);
+
+ /* Free the active key */
+ tipc_crypto_key_set_state(tx, key.passive, 0, key.pending);
+ tipc_crypto_key_detach(tx->aead[key.active], &tx->lock);
+ spin_unlock(&tx->lock);
+
+ pr_warn("TX(%s): key is revoked!\n", tipc_own_id_string(net));
+ return -EKEYREVOKED;
+}
+
+int tipc_crypto_start(struct tipc_crypto **crypto, struct net *net,
+ struct tipc_node *node)
+{
+ struct tipc_crypto *c;
+
+ if (*crypto)
+ return -EEXIST;
+
+ /* Allocate crypto */
+ c = kzalloc(sizeof(*c), GFP_ATOMIC);
+ if (!c)
+ return -ENOMEM;
+
+ /* Allocate statistic structure */
+ c->stats = alloc_percpu_gfp(struct tipc_crypto_stats, GFP_ATOMIC);
+ if (!c->stats) {
+ kzfree(c);
+ return -ENOMEM;
+ }
+
+ c->working = 0;
+ c->net = net;
+ c->node = node;
+ tipc_crypto_key_set_state(c, 0, 0, 0);
+ atomic_set(&c->peer_rx_active, 0);
+ atomic64_set(&c->sndnxt, 0);
+ c->timer1 = jiffies;
+ c->timer2 = jiffies;
+ spin_lock_init(&c->lock);
+ *crypto = c;
+
+ return 0;
+}
+
+void tipc_crypto_stop(struct tipc_crypto **crypto)
+{
+ struct tipc_crypto *c, *tx, *rx;
+ bool is_rx;
+ u8 k;
+
+ if (!*crypto)
+ return;
+
+ rcu_read_lock();
+ /* RX stopping? => decrease TX key users if any */
+ is_rx = !!((*crypto)->node);
+ if (is_rx) {
+ rx = *crypto;
+ tx = tipc_net(rx->net)->crypto_tx;
+ k = atomic_read(&rx->peer_rx_active);
+ if (k) {
+ tipc_aead_users_dec(tx->aead[k], 0);
+ /* Mark the point TX key users changed */
+ tx->timer1 = jiffies;
+ }
+ }
+
+ /* Release AEAD keys */
+ c = *crypto;
+ for (k = KEY_MIN; k <= KEY_MAX; k++)
+ tipc_aead_put(rcu_dereference(c->aead[k]));
+ rcu_read_unlock();
+
+ pr_warn("%s(%s) has been purged, node left!\n",
+ (is_rx) ? "RX" : "TX",
+ (is_rx) ? tipc_node_get_id_str((*crypto)->node) :
+ tipc_own_id_string((*crypto)->net));
+
+ /* Free this crypto statistics */
+ free_percpu(c->stats);
+
+ *crypto = NULL;
+ kzfree(c);
+}
+
+void tipc_crypto_timeout(struct tipc_crypto *rx)
+{
+ struct tipc_net *tn = tipc_net(rx->net);
+ struct tipc_crypto *tx = tn->crypto_tx;
+ struct tipc_key key;
+ u8 new_pending, new_passive;
+ int cmd;
+
+ /* TX key activating:
+ * The pending key (users > 0) -> active
+ * The active key if any (users == 0) -> free
+ */
+ spin_lock(&tx->lock);
+ key = tx->key;
+ if (key.active && tipc_aead_users(tx->aead[key.active]) > 0)
+ goto s1;
+ if (!key.pending || tipc_aead_users(tx->aead[key.pending]) <= 0)
+ goto s1;
+ if (time_before(jiffies, tx->timer1 + TIPC_TX_LASTING_LIM))
+ goto s1;
+
+ tipc_crypto_key_set_state(tx, key.passive, key.pending, 0);
+ if (key.active)
+ tipc_crypto_key_detach(tx->aead[key.active], &tx->lock);
+ this_cpu_inc(tx->stats->stat[STAT_SWITCHES]);
+ pr_info("TX(%s): key %d is activated!\n", tipc_own_id_string(tx->net),
+ key.pending);
+
+s1:
+ spin_unlock(&tx->lock);
+
+ /* RX key activating:
+ * The pending key (users > 0) -> active
+ * The active key if any -> passive, freed later
+ */
+ spin_lock(&rx->lock);
+ key = rx->key;
+ if (!key.pending || tipc_aead_users(rx->aead[key.pending]) <= 0)
+ goto s2;
+
+ new_pending = (key.passive &&
+ !tipc_aead_users(rx->aead[key.passive])) ?
+ key.passive : 0;
+ new_passive = (key.active) ?: ((new_pending) ? 0 : key.passive);
+ tipc_crypto_key_set_state(rx, new_passive, key.pending, new_pending);
+ this_cpu_inc(rx->stats->stat[STAT_SWITCHES]);
+ pr_info("RX(%s): key %d is activated!\n",
+ tipc_node_get_id_str(rx->node), key.pending);
+ goto s5;
+
+s2:
+ /* RX key "faulty" switching:
+ * The faulty pending key (users < -30) -> passive
+ * The passive key (users = 0) -> pending
+ * Note: This only happens after RX deactivated - s3!
+ */
+ key = rx->key;
+ if (!key.pending || tipc_aead_users(rx->aead[key.pending]) > -30)
+ goto s3;
+ if (!key.passive || tipc_aead_users(rx->aead[key.passive]) != 0)
+ goto s3;
+
+ new_pending = key.passive;
+ new_passive = key.pending;
+ tipc_crypto_key_set_state(rx, new_passive, key.active, new_pending);
+ goto s5;
+
+s3:
+ /* RX key deactivating:
+ * The passive key if any -> pending
+ * The active key -> passive (users = 0) / pending
+ * The pending key if any -> passive (users = 0)
+ */
+ key = rx->key;
+ if (!key.active)
+ goto s4;
+ if (time_before(jiffies, rx->timer1 + TIPC_RX_ACTIVE_LIM))
+ goto s4;
+
+ new_pending = (key.passive) ?: key.active;
+ new_passive = (key.passive) ? key.active : key.pending;
+ tipc_aead_users_set(rx->aead[new_pending], 0);
+ if (new_passive)
+ tipc_aead_users_set(rx->aead[new_passive], 0);
+ tipc_crypto_key_set_state(rx, new_passive, 0, new_pending);
+ pr_info("RX(%s): key %d is deactivated!\n",
+ tipc_node_get_id_str(rx->node), key.active);
+ goto s5;
+
+s4:
+ /* RX key passive -> freed: */
+ key = rx->key;
+ if (!key.passive || !tipc_aead_users(rx->aead[key.passive]))
+ goto s5;
+ if (time_before(jiffies, rx->timer2 + TIPC_RX_PASSIVE_LIM))
+ goto s5;
+
+ tipc_crypto_key_set_state(rx, 0, key.active, key.pending);
+ tipc_crypto_key_detach(rx->aead[key.passive], &rx->lock);
+ pr_info("RX(%s): key %d is freed!\n", tipc_node_get_id_str(rx->node),
+ key.passive);
+
+s5:
+ spin_unlock(&rx->lock);
+
+ /* Limit max_tfms & do debug commands if needed */
+ if (likely(sysctl_tipc_max_tfms <= TIPC_MAX_TFMS_LIM))
+ return;
+
+ cmd = sysctl_tipc_max_tfms;
+ sysctl_tipc_max_tfms = TIPC_MAX_TFMS_DEF;
+ tipc_crypto_do_cmd(rx->net, cmd);
+}
+
+/**
+ * tipc_crypto_xmit - Build & encrypt TIPC message for xmit
+ * @net: struct net
+ * @skb: input/output message skb pointer
+ * @b: bearer used for xmit later
+ * @dst: destination media address
+ * @__dnode: destination node for reference if any
+ *
+ * First, build an encryption message header on the top of the message, then
+ * encrypt the original TIPC message by using the active or pending TX key.
+ * If the encryption is successful, the encrypted skb is returned directly or
+ * via the callback.
+ * Otherwise, the skb is freed!
+ *
+ * Return:
+ * 0 : the encryption has succeeded (or no encryption)
+ * -EINPROGRESS/-EBUSY : the encryption is ongoing, a callback will be made
+ * -ENOKEK : the encryption has failed due to no key
+ * -EKEYREVOKED : the encryption has failed due to key revoked
+ * -ENOMEM : the encryption has failed due to no memory
+ * < 0 : the encryption has failed due to other reasons
+ */
+int tipc_crypto_xmit(struct net *net, struct sk_buff **skb,
+ struct tipc_bearer *b, struct tipc_media_addr *dst,
+ struct tipc_node *__dnode)
+{
+ struct tipc_crypto *__rx = tipc_node_crypto_rx(__dnode);
+ struct tipc_crypto *tx = tipc_net(net)->crypto_tx;
+ struct tipc_crypto_stats __percpu *stats = tx->stats;
+ struct tipc_key key = tx->key;
+ struct tipc_aead *aead = NULL;
+ struct sk_buff *probe;
+ int rc = -ENOKEY;
+ u8 tx_key;
+
+ /* No encryption? */
+ if (!tx->working)
+ return 0;
+
+ /* Try with the pending key if available and:
+ * 1) This is the only choice (i.e. no active key) or;
+ * 2) Peer has switched to this key (unicast only) or;
+ * 3) It is time to do a pending key probe;
+ */
+ if (unlikely(key.pending)) {
+ tx_key = key.pending;
+ if (!key.active)
+ goto encrypt;
+ if (__rx && atomic_read(&__rx->peer_rx_active) == tx_key)
+ goto encrypt;
+ if (TIPC_SKB_CB(*skb)->probe)
+ goto encrypt;
+ if (!__rx &&
+ time_after(jiffies, tx->timer2 + TIPC_TX_PROBE_LIM)) {
+ tx->timer2 = jiffies;
+ probe = skb_clone(*skb, GFP_ATOMIC);
+ if (probe) {
+ TIPC_SKB_CB(probe)->probe = 1;
+ tipc_crypto_xmit(net, &probe, b, dst, __dnode);
+ if (probe)
+ b->media->send_msg(net, probe, b, dst);
+ }
+ }
+ }
+ /* Else, use the active key if any */
+ if (likely(key.active)) {
+ tx_key = key.active;
+ goto encrypt;
+ }
+ goto exit;
+
+encrypt:
+ aead = tipc_aead_get(tx->aead[tx_key]);
+ if (unlikely(!aead))
+ goto exit;
+ rc = tipc_ehdr_build(net, aead, tx_key, *skb, __rx);
+ if (likely(rc > 0))
+ rc = tipc_aead_encrypt(aead, *skb, b, dst, __dnode);
+
+exit:
+ switch (rc) {
+ case 0:
+ this_cpu_inc(stats->stat[STAT_OK]);
+ break;
+ case -EINPROGRESS:
+ case -EBUSY:
+ this_cpu_inc(stats->stat[STAT_ASYNC]);
+ *skb = NULL;
+ return rc;
+ default:
+ this_cpu_inc(stats->stat[STAT_NOK]);
+ if (rc == -ENOKEY)
+ this_cpu_inc(stats->stat[STAT_NOKEYS]);
+ else if (rc == -EKEYREVOKED)
+ this_cpu_inc(stats->stat[STAT_BADKEYS]);
+ kfree_skb(*skb);
+ *skb = NULL;
+ break;
+ }
+
+ tipc_aead_put(aead);
+ return rc;
+}
+
+/**
+ * tipc_crypto_rcv - Decrypt an encrypted TIPC message from peer
+ * @net: struct net
+ * @rx: RX crypto handle
+ * @skb: input/output message skb pointer
+ * @b: bearer where the message has been received
+ *
+ * If the decryption is successful, the decrypted skb is returned directly or
+ * as the callback, the encryption header and auth tag will be trimed out
+ * before forwarding to tipc_rcv() via the tipc_crypto_rcv_complete().
+ * Otherwise, the skb will be freed!
+ * Note: RX key(s) can be re-aligned, or in case of no key suitable, TX
+ * cluster key(s) can be taken for decryption (- recursive).
+ *
+ * Return:
+ * 0 : the decryption has successfully completed
+ * -EINPROGRESS/-EBUSY : the decryption is ongoing, a callback will be made
+ * -ENOKEY : the decryption has failed due to no key
+ * -EBADMSG : the decryption has failed due to bad message
+ * -ENOMEM : the decryption has failed due to no memory
+ * < 0 : the decryption has failed due to other reasons
+ */
+int tipc_crypto_rcv(struct net *net, struct tipc_crypto *rx,
+ struct sk_buff **skb, struct tipc_bearer *b)
+{
+ struct tipc_crypto *tx = tipc_net(net)->crypto_tx;
+ struct tipc_crypto_stats __percpu *stats;
+ struct tipc_aead *aead = NULL;
+ struct tipc_key key;
+ int rc = -ENOKEY;
+ u8 tx_key = 0;
+
+ /* New peer?
+ * Let's try with TX key (i.e. cluster mode) & verify the skb first!
+ */
+ if (unlikely(!rx))
+ goto pick_tx;
+
+ /* Pick RX key according to TX key, three cases are possible:
+ * 1) The current active key (likely) or;
+ * 2) The pending (new or deactivated) key (if any) or;
+ * 3) The passive or old active key (i.e. users > 0);
+ */
+ tx_key = ((struct tipc_ehdr *)(*skb)->data)->tx_key;
+ key = rx->key;
+ if (likely(tx_key == key.active))
+ goto decrypt;
+ if (tx_key == key.pending)
+ goto decrypt;
+ if (tx_key == key.passive) {
+ rx->timer2 = jiffies;
+ if (tipc_aead_users(rx->aead[key.passive]) > 0)
+ goto decrypt;
+ }
+
+ /* Unknown key, let's try to align RX key(s) */
+ if (tipc_crypto_key_try_align(rx, tx_key))
+ goto decrypt;
+
+pick_tx:
+ /* No key suitable? Try to pick one from TX... */
+ aead = tipc_crypto_key_pick_tx(tx, rx, *skb);
+ if (aead)
+ goto decrypt;
+ goto exit;
+
+decrypt:
+ rcu_read_lock();
+ if (!aead)
+ aead = tipc_aead_get(rx->aead[tx_key]);
+ rc = tipc_aead_decrypt(net, aead, *skb, b);
+ rcu_read_unlock();
+
+exit:
+ stats = ((rx) ?: tx)->stats;
+ switch (rc) {
+ case 0:
+ this_cpu_inc(stats->stat[STAT_OK]);
+ break;
+ case -EINPROGRESS:
+ case -EBUSY:
+ this_cpu_inc(stats->stat[STAT_ASYNC]);
+ *skb = NULL;
+ return rc;
+ default:
+ this_cpu_inc(stats->stat[STAT_NOK]);
+ if (rc == -ENOKEY) {
+ kfree_skb(*skb);
+ *skb = NULL;
+ if (rx)
+ tipc_node_put(rx->node);
+ this_cpu_inc(stats->stat[STAT_NOKEYS]);
+ return rc;
+ } else if (rc == -EBADMSG) {
+ this_cpu_inc(stats->stat[STAT_BADMSGS]);
+ }
+ break;
+ }
+
+ tipc_crypto_rcv_complete(net, aead, b, skb, rc);
+ return rc;
+}
+
+static void tipc_crypto_rcv_complete(struct net *net, struct tipc_aead *aead,
+ struct tipc_bearer *b,
+ struct sk_buff **skb, int err)
+{
+ struct tipc_skb_cb *skb_cb = TIPC_SKB_CB(*skb);
+ struct tipc_crypto *rx = aead->crypto;
+ struct tipc_aead *tmp = NULL;
+ struct tipc_ehdr *ehdr;
+ struct tipc_node *n;
+ u8 rx_key_active;
+ bool destined;
+
+ /* Is this completed by TX? */
+ if (unlikely(!rx->node)) {
+ rx = skb_cb->tx_clone_ctx.rx;
+#ifdef TIPC_CRYPTO_DEBUG
+ pr_info("TX->RX(%s): err %d, aead %p, skb->next %p, flags %x\n",
+ (rx) ? tipc_node_get_id_str(rx->node) : "-", err, aead,
+ (*skb)->next, skb_cb->flags);
+ pr_info("skb_cb [recurs %d, last %p], tx->aead [%p %p %p]\n",
+ skb_cb->tx_clone_ctx.recurs, skb_cb->tx_clone_ctx.last,
+ aead->crypto->aead[1], aead->crypto->aead[2],
+ aead->crypto->aead[3]);
+#endif
+ if (unlikely(err)) {
+ if (err == -EBADMSG && (*skb)->next)
+ tipc_rcv(net, (*skb)->next, b);
+ goto free_skb;
+ }
+
+ if (likely((*skb)->next)) {
+ kfree_skb((*skb)->next);
+ (*skb)->next = NULL;
+ }
+ ehdr = (struct tipc_ehdr *)(*skb)->data;
+ if (!rx) {
+ WARN_ON(ehdr->user != LINK_CONFIG);
+ n = tipc_node_create(net, 0, ehdr->id, 0xffffu, 0,
+ true);
+ rx = tipc_node_crypto_rx(n);
+ if (unlikely(!rx))
+ goto free_skb;
+ }
+
+ /* Skip cloning this time as we had a RX pending key */
+ if (rx->key.pending)
+ goto rcv;
+ if (tipc_aead_clone(&tmp, aead) < 0)
+ goto rcv;
+ if (tipc_crypto_key_attach(rx, tmp, ehdr->tx_key) < 0) {
+ tipc_aead_free(&tmp->rcu);
+ goto rcv;
+ }
+ tipc_aead_put(aead);
+ aead = tipc_aead_get(tmp);
+ }
+
+ if (unlikely(err)) {
+ tipc_aead_users_dec(aead, INT_MIN);
+ goto free_skb;
+ }
+
+ /* Set the RX key's user */
+ tipc_aead_users_set(aead, 1);
+
+rcv:
+ /* Mark this point, RX works */
+ rx->timer1 = jiffies;
+
+ /* Remove ehdr & auth. tag prior to tipc_rcv() */
+ ehdr = (struct tipc_ehdr *)(*skb)->data;
+ destined = ehdr->destined;
+ rx_key_active = ehdr->rx_key_active;
+ skb_pull(*skb, tipc_ehdr_size(ehdr));
+ pskb_trim(*skb, (*skb)->len - aead->authsize);
+
+ /* Validate TIPCv2 message */
+ if (unlikely(!tipc_msg_validate(skb))) {
+ pr_err_ratelimited("Packet dropped after decryption!\n");
+ goto free_skb;
+ }
+
+ /* Update peer RX active key & TX users */
+ if (destined)
+ tipc_crypto_key_synch(rx, rx_key_active, buf_msg(*skb));
+
+ /* Mark skb decrypted */
+ skb_cb->decrypted = 1;
+
+ /* Clear clone cxt if any */
+ if (likely(!skb_cb->tx_clone_deferred))
+ goto exit;
+ skb_cb->tx_clone_deferred = 0;
+ memset(&skb_cb->tx_clone_ctx, 0, sizeof(skb_cb->tx_clone_ctx));
+ goto exit;
+
+free_skb:
+ kfree_skb(*skb);
+ *skb = NULL;
+
+exit:
+ tipc_aead_put(aead);
+ if (rx)
+ tipc_node_put(rx->node);
+}
+
+static void tipc_crypto_do_cmd(struct net *net, int cmd)
+{
+ struct tipc_net *tn = tipc_net(net);
+ struct tipc_crypto *tx = tn->crypto_tx, *rx;
+ struct list_head *p;
+ unsigned int stat;
+ int i, j, cpu;
+ char buf[200];
+
+ /* Currently only one command is supported */
+ switch (cmd) {
+ case 0xfff1:
+ goto print_stats;
+ default:
+ return;
+ }
+
+print_stats:
+ /* Print a header */
+ pr_info("\n=============== TIPC Crypto Statistics ===============\n\n");
+
+ /* Print key status */
+ pr_info("Key status:\n");
+ pr_info("TX(%7.7s)\n%s", tipc_own_id_string(net),
+ tipc_crypto_key_dump(tx, buf));
+
+ rcu_read_lock();
+ for (p = tn->node_list.next; p != &tn->node_list; p = p->next) {
+ rx = tipc_node_crypto_rx_by_list(p);
+ pr_info("RX(%7.7s)\n%s", tipc_node_get_id_str(rx->node),
+ tipc_crypto_key_dump(rx, buf));
+ }
+ rcu_read_unlock();
+
+ /* Print crypto statistics */
+ for (i = 0, j = 0; i < MAX_STATS; i++)
+ j += scnprintf(buf + j, 200 - j, "|%11s ", hstats[i]);
+ pr_info("\nCounter %s", buf);
+
+ memset(buf, '-', 115);
+ buf[115] = '\0';
+ pr_info("%s\n", buf);
+
+ j = scnprintf(buf, 200, "TX(%7.7s) ", tipc_own_id_string(net));
+ for_each_possible_cpu(cpu) {
+ for (i = 0; i < MAX_STATS; i++) {
+ stat = per_cpu_ptr(tx->stats, cpu)->stat[i];
+ j += scnprintf(buf + j, 200 - j, "|%11d ", stat);
+ }
+ pr_info("%s", buf);
+ j = scnprintf(buf, 200, "%12s", " ");
+ }
+
+ rcu_read_lock();
+ for (p = tn->node_list.next; p != &tn->node_list; p = p->next) {
+ rx = tipc_node_crypto_rx_by_list(p);
+ j = scnprintf(buf, 200, "RX(%7.7s) ",
+ tipc_node_get_id_str(rx->node));
+ for_each_possible_cpu(cpu) {
+ for (i = 0; i < MAX_STATS; i++) {
+ stat = per_cpu_ptr(rx->stats, cpu)->stat[i];
+ j += scnprintf(buf + j, 200 - j, "|%11d ",
+ stat);
+ }
+ pr_info("%s", buf);
+ j = scnprintf(buf, 200, "%12s", " ");
+ }
+ }
+ rcu_read_unlock();
+
+ pr_info("\n======================== Done ========================\n");
+}
+
+static char *tipc_crypto_key_dump(struct tipc_crypto *c, char *buf)
+{
+ struct tipc_key key = c->key;
+ struct tipc_aead *aead;
+ int k, i = 0;
+ char *s;
+
+ for (k = KEY_MIN; k <= KEY_MAX; k++) {
+ if (k == key.passive)
+ s = "PAS";
+ else if (k == key.active)
+ s = "ACT";
+ else if (k == key.pending)
+ s = "PEN";
+ else
+ s = "-";
+ i += scnprintf(buf + i, 200 - i, "\tKey%d: %s", k, s);
+
+ rcu_read_lock();
+ aead = rcu_dereference(c->aead[k]);
+ if (aead)
+ i += scnprintf(buf + i, 200 - i,
+ "{\"%s...\", \"%s\"}/%d:%d",
+ aead->hint,
+ (aead->mode == CLUSTER_KEY) ? "c" : "p",
+ atomic_read(&aead->users),
+ refcount_read(&aead->refcnt));
+ rcu_read_unlock();
+ i += scnprintf(buf + i, 200 - i, "\n");
+ }
+
+ if (c->node)
+ i += scnprintf(buf + i, 200 - i, "\tPeer RX active: %d\n",
+ atomic_read(&c->peer_rx_active));
+
+ return buf;
+}
+
+#ifdef TIPC_CRYPTO_DEBUG
+static char *tipc_key_change_dump(struct tipc_key old, struct tipc_key new,
+ char *buf)
+{
+ struct tipc_key *key = &old;
+ int k, i = 0;
+ char *s;
+
+ /* Output format: "[%s %s %s] -> [%s %s %s]", max len = 32 */
+again:
+ i += scnprintf(buf + i, 32 - i, "[");
+ for (k = KEY_MIN; k <= KEY_MAX; k++) {
+ if (k == key->passive)
+ s = "pas";
+ else if (k == key->active)
+ s = "act";
+ else if (k == key->pending)
+ s = "pen";
+ else
+ s = "-";
+ i += scnprintf(buf + i, 32 - i,
+ (k != KEY_MAX) ? "%s " : "%s", s);
+ }
+ if (key != &new) {
+ i += scnprintf(buf + i, 32 - i, "] -> ");
+ key = &new;
+ goto again;
+ }
+ i += scnprintf(buf + i, 32 - i, "]");
+ return buf;
+}
+#endif
diff --git a/net/tipc/crypto.h b/net/tipc/crypto.h
new file mode 100644
index 000000000000..c3de769f49e8
--- /dev/null
+++ b/net/tipc/crypto.h
@@ -0,0 +1,167 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/**
+ * net/tipc/crypto.h: Include file for TIPC crypto
+ *
+ * Copyright (c) 2019, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifdef CONFIG_TIPC_CRYPTO
+#ifndef _TIPC_CRYPTO_H
+#define _TIPC_CRYPTO_H
+
+#include "core.h"
+#include "node.h"
+#include "msg.h"
+#include "bearer.h"
+
+#define TIPC_EVERSION 7
+
+/* AEAD aes(gcm) */
+#define TIPC_AES_GCM_KEY_SIZE_128 16
+#define TIPC_AES_GCM_KEY_SIZE_192 24
+#define TIPC_AES_GCM_KEY_SIZE_256 32
+
+#define TIPC_AES_GCM_SALT_SIZE 4
+#define TIPC_AES_GCM_IV_SIZE 12
+#define TIPC_AES_GCM_TAG_SIZE 16
+
+/**
+ * TIPC crypto modes:
+ * - CLUSTER_KEY:
+ * One single key is used for both TX & RX in all nodes in the cluster.
+ * - PER_NODE_KEY:
+ * Each nodes in the cluster has one TX key, for RX a node needs to know
+ * its peers' TX key for the decryption of messages from those nodes.
+ */
+enum {
+ CLUSTER_KEY = 1,
+ PER_NODE_KEY = (1 << 1),
+};
+
+extern int sysctl_tipc_max_tfms __read_mostly;
+
+/**
+ * TIPC encryption message format:
+ *
+ * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0
+ * 1 0 9 8 7 6 5 4|3 2 1 0 9 8 7 6|5 4 3 2 1 0 9 8|7 6 5 4 3 2 1 0
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * w0:|Ver=7| User |D|TX |RX |K| Rsvd |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * w1:| Seqno |
+ * w2:| (8 octets) |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * w3:\ Prevnode \
+ * / (4 or 16 octets) /
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * \ \
+ * / Encrypted complete TIPC V2 header and user data /
+ * \ \
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | |
+ * | AuthTag |
+ * | (16 octets) |
+ * | |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * Word0:
+ * Ver : = 7 i.e. TIPC encryption message version
+ * User : = 7 (for LINK_PROTOCOL); = 13 (for LINK_CONFIG) or = 0
+ * D : The destined bit i.e. the message's destination node is
+ * "known" or not at the message encryption
+ * TX : TX key used for the message encryption
+ * RX : Currently RX active key corresponding to the destination
+ * node's TX key (when the "D" bit is set)
+ * K : Keep-alive bit (for RPS, LINK_PROTOCOL/STATE_MSG only)
+ * Rsvd : Reserved bit, field
+ * Word1-2:
+ * Seqno : The 64-bit sequence number of the encrypted message, also
+ * part of the nonce used for the message encryption/decryption
+ * Word3-:
+ * Prevnode: The source node address, or ID in case LINK_CONFIG only
+ * AuthTag : The authentication tag for the message integrity checking
+ * generated by the message encryption
+ */
+struct tipc_ehdr {
+ union {
+ struct {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u8 destined:1,
+ user:4,
+ version:3;
+ __u8 reserved_1:3,
+ keepalive:1,
+ rx_key_active:2,
+ tx_key:2;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+ __u8 version:3,
+ user:4,
+ destined:1;
+ __u8 tx_key:2,
+ rx_key_active:2,
+ keepalive:1,
+ reserved_1:3;
+#else
+#error "Please fix <asm/byteorder.h>"
+#endif
+ __be16 reserved_2;
+ } __packed;
+ __be32 w0;
+ };
+ __be64 seqno;
+ union {
+ __be32 addr;
+ __u8 id[NODE_ID_LEN]; /* For a LINK_CONFIG message only! */
+ };
+#define EHDR_SIZE (offsetof(struct tipc_ehdr, addr) + sizeof(__be32))
+#define EHDR_CFG_SIZE (sizeof(struct tipc_ehdr))
+#define EHDR_MIN_SIZE (EHDR_SIZE)
+#define EHDR_MAX_SIZE (EHDR_CFG_SIZE)
+#define EMSG_OVERHEAD (EHDR_SIZE + TIPC_AES_GCM_TAG_SIZE)
+} __packed;
+
+int tipc_crypto_start(struct tipc_crypto **crypto, struct net *net,
+ struct tipc_node *node);
+void tipc_crypto_stop(struct tipc_crypto **crypto);
+void tipc_crypto_timeout(struct tipc_crypto *rx);
+int tipc_crypto_xmit(struct net *net, struct sk_buff **skb,
+ struct tipc_bearer *b, struct tipc_media_addr *dst,
+ struct tipc_node *__dnode);
+int tipc_crypto_rcv(struct net *net, struct tipc_crypto *rx,
+ struct sk_buff **skb, struct tipc_bearer *b);
+int tipc_crypto_key_init(struct tipc_crypto *c, struct tipc_aead_key *ukey,
+ u8 mode);
+void tipc_crypto_key_flush(struct tipc_crypto *c);
+int tipc_aead_key_validate(struct tipc_aead_key *ukey);
+bool tipc_ehdr_validate(struct sk_buff *skb);
+
+#endif /* _TIPC_CRYPTO_H */
+#endif
diff --git a/net/tipc/discover.c b/net/tipc/discover.c
index c138d68e8a69..bfe43da127c0 100644
--- a/net/tipc/discover.c
+++ b/net/tipc/discover.c
@@ -94,6 +94,7 @@ static void tipc_disc_init_msg(struct net *net, struct sk_buff *skb,
msg_set_dest_domain(hdr, dest_domain);
msg_set_bc_netid(hdr, tn->net_id);
b->media->addr2msg(msg_media_addr(hdr), &b->addr);
+ msg_set_peer_net_hash(hdr, tipc_net_hash_mixes(net, tn->random));
msg_set_node_id(hdr, tipc_own_id(net));
}
@@ -193,6 +194,7 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *skb,
{
struct tipc_net *tn = tipc_net(net);
struct tipc_msg *hdr = buf_msg(skb);
+ u32 pnet_hash = msg_peer_net_hash(hdr);
u16 caps = msg_node_capabilities(hdr);
bool legacy = tn->legacy_addr_format;
u32 sugg = msg_sugg_node_addr(hdr);
@@ -241,7 +243,7 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *skb,
return;
if (!tipc_in_scope(legacy, b->domain, src))
return;
- tipc_node_check_dest(net, src, peer_id, b, caps, signature,
+ tipc_node_check_dest(net, src, peer_id, b, caps, signature, pnet_hash,
&maddr, &respond, &dupl_addr);
if (dupl_addr)
disc_dupl_alert(b, src, &maddr);
diff --git a/net/tipc/eth_media.c b/net/tipc/eth_media.c
index f69a2fde9f4a..8b0bb600602d 100644
--- a/net/tipc/eth_media.c
+++ b/net/tipc/eth_media.c
@@ -92,7 +92,8 @@ struct tipc_media eth_media_info = {
.raw2addr = tipc_eth_raw2addr,
.priority = TIPC_DEF_LINK_PRI,
.tolerance = TIPC_DEF_LINK_TOL,
- .window = TIPC_DEF_LINK_WIN,
+ .min_win = TIPC_DEF_LINK_WIN,
+ .max_win = TIPC_MAX_LINK_WIN,
.type_id = TIPC_MEDIA_TYPE_ETH,
.hwaddr_len = ETH_ALEN,
.name = "eth"
diff --git a/net/tipc/group.c b/net/tipc/group.c
index 5f98d38bcf08..89257e2a980d 100644
--- a/net/tipc/group.c
+++ b/net/tipc/group.c
@@ -199,7 +199,7 @@ void tipc_group_join(struct net *net, struct tipc_group *grp, int *sk_rcvbuf)
struct tipc_member *m, *tmp;
struct sk_buff_head xmitq;
- skb_queue_head_init(&xmitq);
+ __skb_queue_head_init(&xmitq);
rbtree_postorder_for_each_entry_safe(m, tmp, tree, tree_node) {
tipc_group_proto_xmit(grp, m, GRP_JOIN_MSG, &xmitq);
tipc_group_update_member(m, 0);
@@ -435,7 +435,7 @@ bool tipc_group_cong(struct tipc_group *grp, u32 dnode, u32 dport,
return true;
if (state == MBR_PENDING && adv == ADV_IDLE)
return true;
- skb_queue_head_init(&xmitq);
+ __skb_queue_head_init(&xmitq);
tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, &xmitq);
tipc_node_distr_xmit(grp->net, &xmitq);
return true;
diff --git a/net/tipc/ib_media.c b/net/tipc/ib_media.c
index e8c16718e3fa..7aa9ff88458d 100644
--- a/net/tipc/ib_media.c
+++ b/net/tipc/ib_media.c
@@ -42,6 +42,8 @@
#include "core.h"
#include "bearer.h"
+#define TIPC_MAX_IB_LINK_WIN 500
+
/* convert InfiniBand address (media address format) media address to string */
static int tipc_ib_addr2str(struct tipc_media_addr *a, char *str_buf,
int str_size)
@@ -94,7 +96,8 @@ struct tipc_media ib_media_info = {
.raw2addr = tipc_ib_raw2addr,
.priority = TIPC_DEF_LINK_PRI,
.tolerance = TIPC_DEF_LINK_TOL,
- .window = TIPC_DEF_LINK_WIN,
+ .min_win = TIPC_DEF_LINK_WIN,
+ .max_win = TIPC_MAX_IB_LINK_WIN,
.type_id = TIPC_MEDIA_TYPE_IB,
.hwaddr_len = INFINIBAND_ALEN,
.name = "ib"
diff --git a/net/tipc/link.c b/net/tipc/link.c
index c2c5c53cad22..467c53a1fb5c 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -44,6 +44,7 @@
#include "netlink.h"
#include "monitor.h"
#include "trace.h"
+#include "crypto.h"
#include <linux/pkt_sched.h>
@@ -160,9 +161,9 @@ struct tipc_link {
struct {
u16 len;
u16 limit;
+ struct sk_buff *target_bskb;
} backlog[5];
u16 snd_nxt;
- u16 window;
/* Reception */
u16 rcv_nxt;
@@ -173,9 +174,16 @@ struct tipc_link {
/* Congestion handling */
struct sk_buff_head wakeupq;
+ u16 window;
+ u16 min_win;
+ u16 ssthresh;
+ u16 max_win;
+ u16 cong_acks;
+ u16 checkpoint;
/* Fragmentation/reassembly */
struct sk_buff *reasm_buf;
+ struct sk_buff *reasm_tnlmsg;
/* Broadcast */
u16 ackers;
@@ -241,12 +249,13 @@ static int tipc_link_build_nack_msg(struct tipc_link *l,
struct sk_buff_head *xmitq);
static void tipc_link_build_bc_init_msg(struct tipc_link *l,
struct sk_buff_head *xmitq);
-static bool tipc_link_release_pkts(struct tipc_link *l, u16 to);
-static u16 tipc_build_gap_ack_blks(struct tipc_link *l, void *data);
+static int tipc_link_release_pkts(struct tipc_link *l, u16 to);
+static u16 tipc_build_gap_ack_blks(struct tipc_link *l, void *data, u16 gap);
static int tipc_link_advance_transmq(struct tipc_link *l, u16 acked, u16 gap,
struct tipc_gap_ack_blks *ga,
struct sk_buff_head *xmitq);
-
+static void tipc_link_update_cwin(struct tipc_link *l, int released,
+ bool retransmitted);
/*
* Simple non-static link routines (i.e. referenced outside this file)
*/
@@ -305,9 +314,14 @@ u32 tipc_link_id(struct tipc_link *l)
return l->peer_bearer_id << 16 | l->bearer_id;
}
-int tipc_link_window(struct tipc_link *l)
+int tipc_link_min_win(struct tipc_link *l)
+{
+ return l->min_win;
+}
+
+int tipc_link_max_win(struct tipc_link *l)
{
- return l->window;
+ return l->max_win;
}
int tipc_link_prio(struct tipc_link *l)
@@ -395,6 +409,15 @@ int tipc_link_mtu(struct tipc_link *l)
return l->mtu;
}
+int tipc_link_mss(struct tipc_link *l)
+{
+#ifdef CONFIG_TIPC_CRYPTO
+ return l->mtu - INT_H_SIZE - EMSG_OVERHEAD;
+#else
+ return l->mtu - INT_H_SIZE;
+#endif
+}
+
u16 tipc_link_rcv_nxt(struct tipc_link *l)
{
return l->rcv_nxt;
@@ -424,7 +447,8 @@ u32 tipc_link_state(struct tipc_link *l)
* @net_plane: network plane (A,B,c..) this link belongs to
* @mtu: mtu to be advertised by link
* @priority: priority to be used by link
- * @window: send window to be used by link
+ * @min_win: minimal send window to be used by link
+ * @max_win: maximal send window to be used by link
* @session: session to be used by link
* @ownnode: identity of own node
* @peer: node id of peer node
@@ -439,7 +463,7 @@ u32 tipc_link_state(struct tipc_link *l)
*/
bool tipc_link_create(struct net *net, char *if_name, int bearer_id,
int tolerance, char net_plane, u32 mtu, int priority,
- int window, u32 session, u32 self,
+ u32 min_win, u32 max_win, u32 session, u32 self,
u32 peer, u8 *peer_id, u16 peer_caps,
struct tipc_link *bc_sndlink,
struct tipc_link *bc_rcvlink,
@@ -483,7 +507,7 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id,
l->advertised_mtu = mtu;
l->mtu = mtu;
l->priority = priority;
- tipc_link_set_queue_limits(l, window);
+ tipc_link_set_queue_limits(l, min_win, max_win);
l->ackers = 1;
l->bc_sndlink = bc_sndlink;
l->bc_rcvlink = bc_rcvlink;
@@ -511,7 +535,7 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id,
* Returns true if link was created, otherwise false
*/
bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer,
- int mtu, int window, u16 peer_caps,
+ int mtu, u32 min_win, u32 max_win, u16 peer_caps,
struct sk_buff_head *inputq,
struct sk_buff_head *namedq,
struct tipc_link *bc_sndlink,
@@ -519,9 +543,9 @@ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer,
{
struct tipc_link *l;
- if (!tipc_link_create(net, "", MAX_BEARERS, 0, 'Z', mtu, 0, window,
- 0, ownnode, peer, NULL, peer_caps, bc_sndlink,
- NULL, inputq, namedq, link))
+ if (!tipc_link_create(net, "", MAX_BEARERS, 0, 'Z', mtu, 0, min_win,
+ max_win, 0, ownnode, peer, NULL, peer_caps,
+ bc_sndlink, NULL, inputq, namedq, link))
return false;
l = *link;
@@ -538,7 +562,7 @@ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer,
/* Disable replicast if even a single peer doesn't support it */
if (link_is_bc_rcvlink(l) && !(peer_caps & TIPC_BCAST_RCAST))
- tipc_bcast_disable_rcast(net);
+ tipc_bcast_toggle_rcast(net, false);
return true;
}
@@ -760,6 +784,8 @@ bool tipc_link_too_silent(struct tipc_link *l)
return (l->silent_intv_cnt + 2 > l->abort_limit);
}
+static int tipc_link_bc_retrans(struct tipc_link *l, struct tipc_link *r,
+ u16 from, u16 to, struct sk_buff_head *xmitq);
/* tipc_link_timeout - perform periodic task as instructed from node timeout
*/
int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq)
@@ -792,6 +818,11 @@ int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq)
probe |= l->silent_intv_cnt;
if (probe || mstate->monitoring)
l->silent_intv_cnt++;
+ if (l->snd_nxt == l->checkpoint) {
+ tipc_link_update_cwin(l, 0, 0);
+ probe = true;
+ }
+ l->checkpoint = l->snd_nxt;
break;
case LINK_RESET:
setup = l->rst_cnt++ <= 4;
@@ -849,23 +880,37 @@ static int link_schedule_user(struct tipc_link *l, struct tipc_msg *hdr)
*/
static void link_prepare_wakeup(struct tipc_link *l)
{
+ struct sk_buff_head *wakeupq = &l->wakeupq;
+ struct sk_buff_head *inputq = l->inputq;
struct sk_buff *skb, *tmp;
- int imp, i = 0;
+ struct sk_buff_head tmpq;
+ int avail[5] = {0,};
+ int imp = 0;
- skb_queue_walk_safe(&l->wakeupq, skb, tmp) {
+ __skb_queue_head_init(&tmpq);
+
+ for (; imp <= TIPC_SYSTEM_IMPORTANCE; imp++)
+ avail[imp] = l->backlog[imp].limit - l->backlog[imp].len;
+
+ skb_queue_walk_safe(wakeupq, skb, tmp) {
imp = TIPC_SKB_CB(skb)->chain_imp;
- if (l->backlog[imp].len < l->backlog[imp].limit) {
- skb_unlink(skb, &l->wakeupq);
- skb_queue_tail(l->inputq, skb);
- } else if (i++ > 10) {
- break;
- }
+ if (avail[imp] <= 0)
+ continue;
+ avail[imp]--;
+ __skb_unlink(skb, wakeupq);
+ __skb_queue_tail(&tmpq, skb);
}
+
+ spin_lock_bh(&inputq->lock);
+ skb_queue_splice_tail(&tmpq, inputq);
+ spin_unlock_bh(&inputq->lock);
+
}
void tipc_link_reset(struct tipc_link *l)
{
struct sk_buff_head list;
+ u32 imp;
__skb_queue_head_init(&list);
@@ -887,14 +932,15 @@ void tipc_link_reset(struct tipc_link *l)
__skb_queue_purge(&l->deferdq);
__skb_queue_purge(&l->backlogq);
__skb_queue_purge(&l->failover_deferdq);
- l->backlog[TIPC_LOW_IMPORTANCE].len = 0;
- l->backlog[TIPC_MEDIUM_IMPORTANCE].len = 0;
- l->backlog[TIPC_HIGH_IMPORTANCE].len = 0;
- l->backlog[TIPC_CRITICAL_IMPORTANCE].len = 0;
- l->backlog[TIPC_SYSTEM_IMPORTANCE].len = 0;
+ for (imp = 0; imp <= TIPC_SYSTEM_IMPORTANCE; imp++) {
+ l->backlog[imp].len = 0;
+ l->backlog[imp].target_bskb = NULL;
+ }
kfree_skb(l->reasm_buf);
+ kfree_skb(l->reasm_tnlmsg);
kfree_skb(l->failover_reasm_skb);
l->reasm_buf = NULL;
+ l->reasm_tnlmsg = NULL;
l->failover_reasm_skb = NULL;
l->rcv_unacked = 0;
l->snd_nxt = 1;
@@ -923,20 +969,25 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list,
struct sk_buff_head *xmitq)
{
struct tipc_msg *hdr = buf_msg(skb_peek(list));
- unsigned int maxwin = l->window;
- int imp = msg_importance(hdr);
- unsigned int mtu = l->mtu;
+ struct sk_buff_head *backlogq = &l->backlogq;
+ struct sk_buff_head *transmq = &l->transmq;
+ struct sk_buff *skb, *_skb;
+ u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1;
u16 ack = l->rcv_nxt - 1;
u16 seqno = l->snd_nxt;
- u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1;
- struct sk_buff_head *transmq = &l->transmq;
- struct sk_buff_head *backlogq = &l->backlogq;
- struct sk_buff *skb, *_skb, *bskb;
int pkt_cnt = skb_queue_len(list);
+ int imp = msg_importance(hdr);
+ unsigned int mss = tipc_link_mss(l);
+ unsigned int cwin = l->window;
+ unsigned int mtu = l->mtu;
+ bool new_bundle;
int rc = 0;
if (unlikely(msg_size(hdr) > mtu)) {
- skb_queue_purge(list);
+ pr_warn("Too large msg, purging xmit list %d %d %d %d %d!\n",
+ skb_queue_len(list), msg_user(hdr),
+ msg_type(hdr), msg_size(hdr), mtu);
+ __skb_queue_purge(list);
return -EMSGSIZE;
}
@@ -955,20 +1006,18 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list,
}
/* Prepare each packet for sending, and add to relevant queue: */
- while (skb_queue_len(list)) {
- skb = skb_peek(list);
- hdr = buf_msg(skb);
- msg_set_seqno(hdr, seqno);
- msg_set_ack(hdr, ack);
- msg_set_bcast_ack(hdr, bc_ack);
-
- if (likely(skb_queue_len(transmq) < maxwin)) {
+ while ((skb = __skb_dequeue(list))) {
+ if (likely(skb_queue_len(transmq) < cwin)) {
+ hdr = buf_msg(skb);
+ msg_set_seqno(hdr, seqno);
+ msg_set_ack(hdr, ack);
+ msg_set_bcast_ack(hdr, bc_ack);
_skb = skb_clone(skb, GFP_ATOMIC);
if (!_skb) {
- skb_queue_purge(list);
+ kfree_skb(skb);
+ __skb_queue_purge(list);
return -ENOBUFS;
}
- __skb_dequeue(list);
__skb_queue_tail(transmq, skb);
/* next retransmit attempt */
if (link_is_bc_sndlink(l))
@@ -980,36 +1029,86 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list,
seqno++;
continue;
}
- if (tipc_msg_bundle(skb_peek_tail(backlogq), hdr, mtu)) {
- kfree_skb(__skb_dequeue(list));
- l->stats.sent_bundled++;
- continue;
- }
- if (tipc_msg_make_bundle(&bskb, hdr, mtu, l->addr)) {
- kfree_skb(__skb_dequeue(list));
- __skb_queue_tail(backlogq, bskb);
- l->backlog[msg_importance(buf_msg(bskb))].len++;
- l->stats.sent_bundled++;
- l->stats.sent_bundles++;
+ if (tipc_msg_try_bundle(l->backlog[imp].target_bskb, &skb,
+ mss, l->addr, &new_bundle)) {
+ if (skb) {
+ /* Keep a ref. to the skb for next try */
+ l->backlog[imp].target_bskb = skb;
+ l->backlog[imp].len++;
+ __skb_queue_tail(backlogq, skb);
+ } else {
+ if (new_bundle) {
+ l->stats.sent_bundles++;
+ l->stats.sent_bundled++;
+ }
+ l->stats.sent_bundled++;
+ }
continue;
}
- l->backlog[imp].len += skb_queue_len(list);
+ l->backlog[imp].target_bskb = NULL;
+ l->backlog[imp].len += (1 + skb_queue_len(list));
+ __skb_queue_tail(backlogq, skb);
skb_queue_splice_tail_init(list, backlogq);
}
l->snd_nxt = seqno;
return rc;
}
+static void tipc_link_update_cwin(struct tipc_link *l, int released,
+ bool retransmitted)
+{
+ int bklog_len = skb_queue_len(&l->backlogq);
+ struct sk_buff_head *txq = &l->transmq;
+ int txq_len = skb_queue_len(txq);
+ u16 cwin = l->window;
+
+ /* Enter fast recovery */
+ if (unlikely(retransmitted)) {
+ l->ssthresh = max_t(u16, l->window / 2, 300);
+ l->window = l->ssthresh;
+ return;
+ }
+ /* Enter slow start */
+ if (unlikely(!released)) {
+ l->ssthresh = max_t(u16, l->window / 2, 300);
+ l->window = l->min_win;
+ return;
+ }
+ /* Don't increase window if no pressure on the transmit queue */
+ if (txq_len + bklog_len < cwin)
+ return;
+
+ /* Don't increase window if there are holes the transmit queue */
+ if (txq_len && l->snd_nxt - buf_seqno(skb_peek(txq)) != txq_len)
+ return;
+
+ l->cong_acks += released;
+
+ /* Slow start */
+ if (cwin <= l->ssthresh) {
+ l->window = min_t(u16, cwin + released, l->max_win);
+ return;
+ }
+ /* Congestion avoidance */
+ if (l->cong_acks < cwin)
+ return;
+ l->window = min_t(u16, ++cwin, l->max_win);
+ l->cong_acks = 0;
+}
+
static void tipc_link_advance_backlog(struct tipc_link *l,
struct sk_buff_head *xmitq)
{
+ u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1;
+ struct sk_buff_head *txq = &l->transmq;
struct sk_buff *skb, *_skb;
- struct tipc_msg *hdr;
- u16 seqno = l->snd_nxt;
u16 ack = l->rcv_nxt - 1;
- u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1;
+ u16 seqno = l->snd_nxt;
+ struct tipc_msg *hdr;
+ u16 cwin = l->window;
+ u32 imp;
- while (skb_queue_len(&l->transmq) < l->window) {
+ while (skb_queue_len(txq) < cwin) {
skb = skb_peek(&l->backlogq);
if (!skb)
break;
@@ -1018,7 +1117,10 @@ static void tipc_link_advance_backlog(struct tipc_link *l,
break;
__skb_dequeue(&l->backlogq);
hdr = buf_msg(skb);
- l->backlog[msg_importance(hdr)].len--;
+ imp = msg_importance(hdr);
+ l->backlog[imp].len--;
+ if (unlikely(skb == l->backlog[imp].target_bskb))
+ l->backlog[imp].target_bskb = NULL;
__skb_queue_tail(&l->transmq, skb);
/* next retransmit attempt */
if (link_is_bc_sndlink(l))
@@ -1058,7 +1160,7 @@ static bool link_retransmit_failure(struct tipc_link *l, struct tipc_link *r,
return false;
if (!time_after(jiffies, TIPC_SKB_CB(skb)->retr_stamp +
- msecs_to_jiffies(r->tolerance)))
+ msecs_to_jiffies(r->tolerance * 10)))
return false;
hdr = buf_msg(skb);
@@ -1102,6 +1204,7 @@ static int tipc_link_bc_retrans(struct tipc_link *l, struct tipc_link *r,
struct sk_buff *_skb, *skb = skb_peek(&l->transmq);
u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1;
u16 ack = l->rcv_nxt - 1;
+ int retransmitted = 0;
struct tipc_msg *hdr;
int rc = 0;
@@ -1121,11 +1224,10 @@ static int tipc_link_bc_retrans(struct tipc_link *l, struct tipc_link *r,
continue;
if (more(msg_seqno(hdr), to))
break;
-
if (time_before(jiffies, TIPC_SKB_CB(skb)->nxt_retr))
continue;
TIPC_SKB_CB(skb)->nxt_retr = TIPC_BC_RETR_LIM;
- _skb = __pskb_copy(skb, LL_MAX_HEADER + MIN_H_SIZE, GFP_ATOMIC);
+ _skb = pskb_copy(skb, GFP_ATOMIC);
if (!_skb)
return 0;
hdr = buf_msg(_skb);
@@ -1134,11 +1236,12 @@ static int tipc_link_bc_retrans(struct tipc_link *l, struct tipc_link *r,
_skb->priority = TC_PRIO_CONTROL;
__skb_queue_tail(xmitq, _skb);
l->stats.retransmitted++;
-
+ retransmitted++;
/* Increase actual retrans counter & mark first time */
if (!TIPC_SKB_CB(skb)->retr_cnt++)
TIPC_SKB_CB(skb)->retr_stamp = jiffies;
}
+ tipc_link_update_cwin(l, 0, retransmitted);
return 0;
}
@@ -1238,6 +1341,7 @@ static int tipc_link_tnl_rcv(struct tipc_link *l, struct sk_buff *skb,
struct sk_buff_head *inputq)
{
struct sk_buff **reasm_skb = &l->failover_reasm_skb;
+ struct sk_buff **reasm_tnlmsg = &l->reasm_tnlmsg;
struct sk_buff_head *fdefq = &l->failover_deferdq;
struct tipc_msg *hdr = buf_msg(skb);
struct sk_buff *iskb;
@@ -1245,46 +1349,62 @@ static int tipc_link_tnl_rcv(struct tipc_link *l, struct sk_buff *skb,
int rc = 0;
u16 seqno;
- /* SYNCH_MSG */
- if (msg_type(hdr) == SYNCH_MSG)
- goto drop;
+ if (msg_type(hdr) == SYNCH_MSG) {
+ kfree_skb(skb);
+ return 0;
+ }
- /* FAILOVER_MSG */
- if (!tipc_msg_extract(skb, &iskb, &ipos)) {
- pr_warn_ratelimited("Cannot extract FAILOVER_MSG, defq: %d\n",
- skb_queue_len(fdefq));
- return rc;
+ /* Not a fragment? */
+ if (likely(!msg_nof_fragms(hdr))) {
+ if (unlikely(!tipc_msg_extract(skb, &iskb, &ipos))) {
+ pr_warn_ratelimited("Unable to extract msg, defq: %d\n",
+ skb_queue_len(fdefq));
+ return 0;
+ }
+ kfree_skb(skb);
+ } else {
+ /* Set fragment type for buf_append */
+ if (msg_fragm_no(hdr) == 1)
+ msg_set_type(hdr, FIRST_FRAGMENT);
+ else if (msg_fragm_no(hdr) < msg_nof_fragms(hdr))
+ msg_set_type(hdr, FRAGMENT);
+ else
+ msg_set_type(hdr, LAST_FRAGMENT);
+
+ if (!tipc_buf_append(reasm_tnlmsg, &skb)) {
+ /* Successful but non-complete reassembly? */
+ if (*reasm_tnlmsg || link_is_bc_rcvlink(l))
+ return 0;
+ pr_warn_ratelimited("Unable to reassemble tunnel msg\n");
+ return tipc_link_fsm_evt(l, LINK_FAILURE_EVT);
+ }
+ iskb = skb;
}
do {
seqno = buf_seqno(iskb);
-
if (unlikely(less(seqno, l->drop_point))) {
kfree_skb(iskb);
continue;
}
-
if (unlikely(seqno != l->drop_point)) {
__tipc_skb_queue_sorted(fdefq, seqno, iskb);
continue;
}
l->drop_point++;
-
if (!tipc_data_input(l, iskb, inputq))
rc |= tipc_link_input(l, iskb, inputq, reasm_skb);
if (unlikely(rc))
break;
} while ((iskb = __tipc_skb_dequeue(fdefq, l->drop_point)));
-drop:
- kfree_skb(skb);
return rc;
}
-static bool tipc_link_release_pkts(struct tipc_link *l, u16 acked)
+static int tipc_link_release_pkts(struct tipc_link *l, u16 acked)
{
- bool released = false;
+ int released = 0;
struct sk_buff *skb, *tmp;
skb_queue_walk_safe(&l->transmq, skb, tmp) {
@@ -1292,7 +1412,7 @@ static bool tipc_link_release_pkts(struct tipc_link *l, u16 acked)
break;
__skb_unlink(skb, &l->transmq);
kfree_skb(skb);
- released = true;
+ released++;
}
return released;
}
@@ -1303,14 +1423,14 @@ static bool tipc_link_release_pkts(struct tipc_link *l, u16 acked)
*
* returns the actual allocated memory size
*/
-static u16 tipc_build_gap_ack_blks(struct tipc_link *l, void *data)
+static u16 tipc_build_gap_ack_blks(struct tipc_link *l, void *data, u16 gap)
{
struct sk_buff *skb = skb_peek(&l->deferdq);
struct tipc_gap_ack_blks *ga = data;
u16 len, expect, seqno = 0;
u8 n = 0;
- if (!skb)
+ if (!skb || !gap)
goto exit;
expect = buf_seqno(skb);
@@ -1361,8 +1481,10 @@ static int tipc_link_advance_transmq(struct tipc_link *l, u16 acked, u16 gap,
struct sk_buff *skb, *_skb, *tmp;
struct tipc_msg *hdr;
u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1;
+ bool retransmitted = false;
u16 ack = l->rcv_nxt - 1;
bool passed = false;
+ u16 released = 0;
u16 seqno, n = 0;
int rc = 0;
@@ -1374,6 +1496,7 @@ next_gap_ack:
/* release skb */
__skb_unlink(skb, &l->transmq);
kfree_skb(skb);
+ released++;
} else if (less_eq(seqno, acked + gap)) {
/* First, check if repeated retrans failures occurs? */
if (!passed && link_retransmit_failure(l, l, &rc))
@@ -1384,8 +1507,7 @@ next_gap_ack:
if (time_before(jiffies, TIPC_SKB_CB(skb)->nxt_retr))
continue;
TIPC_SKB_CB(skb)->nxt_retr = TIPC_UC_RETR_TIME;
- _skb = __pskb_copy(skb, LL_MAX_HEADER + MIN_H_SIZE,
- GFP_ATOMIC);
+ _skb = pskb_copy(skb, GFP_ATOMIC);
if (!_skb)
continue;
hdr = buf_msg(_skb);
@@ -1394,7 +1516,7 @@ next_gap_ack:
_skb->priority = TC_PRIO_CONTROL;
__skb_queue_tail(xmitq, _skb);
l->stats.retransmitted++;
-
+ retransmitted = true;
/* Increase actual retrans counter & mark first time */
if (!TIPC_SKB_CB(skb)->retr_cnt++)
TIPC_SKB_CB(skb)->retr_stamp = jiffies;
@@ -1408,7 +1530,10 @@ next_gap_ack:
goto next_gap_ack;
}
}
-
+ if (released || retransmitted)
+ tipc_link_update_cwin(l, released, retransmitted);
+ if (released)
+ tipc_link_advance_backlog(l, xmitq);
return 0;
}
@@ -1432,7 +1557,6 @@ int tipc_link_build_state_msg(struct tipc_link *l, struct sk_buff_head *xmitq)
l->snd_nxt = l->rcv_nxt;
return TIPC_LINK_SND_STATE;
}
-
/* Unicast ACK */
l->rcv_unacked = 0;
l->stats.sent_acks++;
@@ -1466,7 +1590,8 @@ static int tipc_link_build_nack_msg(struct tipc_link *l,
struct sk_buff_head *xmitq)
{
u32 def_cnt = ++l->stats.deferred_recv;
- u32 defq_len = skb_queue_len(&l->deferdq);
+ struct sk_buff_head *dfq = &l->deferdq;
+ u32 defq_len = skb_queue_len(dfq);
int match1, match2;
if (link_is_bc_rcvlink(l)) {
@@ -1477,8 +1602,12 @@ static int tipc_link_build_nack_msg(struct tipc_link *l,
return 0;
}
- if (defq_len >= 3 && !((defq_len - 3) % 16))
- tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, 0, 0, xmitq);
+ if (defq_len >= 3 && !((defq_len - 3) % 16)) {
+ u16 rcvgap = buf_seqno(skb_peek(dfq)) - l->rcv_nxt;
+
+ tipc_link_build_proto_msg(l, STATE_MSG, 0, 0,
+ rcvgap, 0, 0, xmitq);
+ }
return 0;
}
@@ -1493,6 +1622,7 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb,
struct sk_buff_head *defq = &l->deferdq;
struct tipc_msg *hdr = buf_msg(skb);
u16 seqno, rcv_nxt, win_lim;
+ int released = 0;
int rc = 0;
/* Verify and update link state */
@@ -1511,21 +1641,17 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb,
if (unlikely(!link_is_up(l))) {
if (l->state == LINK_ESTABLISHING)
rc = TIPC_LINK_UP_EVT;
- goto drop;
+ kfree_skb(skb);
+ break;
}
/* Drop if outside receive window */
if (unlikely(less(seqno, rcv_nxt) || more(seqno, win_lim))) {
l->stats.duplicates++;
- goto drop;
- }
-
- /* Forward queues and wake up waiting users */
- if (likely(tipc_link_release_pkts(l, msg_ack(hdr)))) {
- tipc_link_advance_backlog(l, xmitq);
- if (unlikely(!skb_queue_empty(&l->wakeupq)))
- link_prepare_wakeup(l);
+ kfree_skb(skb);
+ break;
}
+ released += tipc_link_release_pkts(l, msg_ack(hdr));
/* Defer delivery if sequence gap */
if (unlikely(seqno != rcv_nxt)) {
@@ -1548,9 +1674,13 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb,
break;
} while ((skb = __tipc_skb_dequeue(defq, l->rcv_nxt)));
- return rc;
-drop:
- kfree_skb(skb);
+ /* Forward queues and wake up waiting users */
+ if (released) {
+ tipc_link_update_cwin(l, released, 0);
+ tipc_link_advance_backlog(l, xmitq);
+ if (unlikely(!skb_queue_empty(&l->wakeupq)))
+ link_prepare_wakeup(l);
+ }
return rc;
}
@@ -1576,7 +1706,7 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe,
if (!tipc_link_is_up(l) && (mtyp == STATE_MSG))
return;
- if (!skb_queue_empty(dfq))
+ if ((probe || probe_reply) && !skb_queue_empty(dfq))
rcvgap = buf_seqno(skb_peek(dfq)) - l->rcv_nxt;
skb = tipc_msg_create(LINK_PROTOCOL, mtyp, INT_H_SIZE,
@@ -1609,7 +1739,7 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe,
msg_set_probe(hdr, probe);
msg_set_is_keepalive(hdr, probe || probe_reply);
if (l->peer_caps & TIPC_GAP_ACK_BLOCK)
- glen = tipc_build_gap_ack_blks(l, data);
+ glen = tipc_build_gap_ack_blks(l, data, rcvgap);
tipc_mon_prep(l->net, data + glen, &dlen, mstate, l->bearer_id);
msg_set_size(hdr, INT_H_SIZE + glen + dlen);
skb_trim(skb, INT_H_SIZE + glen + dlen);
@@ -1644,7 +1774,7 @@ void tipc_link_create_dummy_tnl_msg(struct tipc_link *l,
struct sk_buff *skb;
u32 dnode = l->addr;
- skb_queue_head_init(&tnlq);
+ __skb_queue_head_init(&tnlq);
skb = tipc_msg_create(TUNNEL_PROTOCOL, FAILOVER_MSG,
INT_H_SIZE, BASIC_H_SIZE,
dnode, onode, 0, 0, 0);
@@ -1675,15 +1805,43 @@ void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl,
struct sk_buff *skb, *tnlskb;
struct tipc_msg *hdr, tnlhdr;
struct sk_buff_head *queue = &l->transmq;
- struct sk_buff_head tmpxq, tnlq;
+ struct sk_buff_head tmpxq, tnlq, frags;
u16 pktlen, pktcnt, seqno = l->snd_nxt;
+ bool pktcnt_need_update = false;
+ u16 syncpt;
+ int rc;
if (!tnl)
return;
- skb_queue_head_init(&tnlq);
- skb_queue_head_init(&tmpxq);
+ __skb_queue_head_init(&tnlq);
+ /* Link Synching:
+ * From now on, send only one single ("dummy") SYNCH message
+ * to peer. The SYNCH message does not contain any data, just
+ * a header conveying the synch point to the peer.
+ */
+ if (mtyp == SYNCH_MSG && (tnl->peer_caps & TIPC_TUNNEL_ENHANCED)) {
+ tnlskb = tipc_msg_create(TUNNEL_PROTOCOL, SYNCH_MSG,
+ INT_H_SIZE, 0, l->addr,
+ tipc_own_addr(l->net),
+ 0, 0, 0);
+ if (!tnlskb) {
+ pr_warn("%sunable to create dummy SYNCH_MSG\n",
+ link_co_err);
+ return;
+ }
+
+ hdr = buf_msg(tnlskb);
+ syncpt = l->snd_nxt + skb_queue_len(&l->backlogq) - 1;
+ msg_set_syncpt(hdr, syncpt);
+ msg_set_bearer_id(hdr, l->peer_bearer_id);
+ __skb_queue_tail(&tnlq, tnlskb);
+ tipc_link_xmit(tnl, &tnlq, xmitq);
+ return;
+ }
+ __skb_queue_head_init(&tmpxq);
+ __skb_queue_head_init(&frags);
/* At least one packet required for safe algorithm => add dummy */
skb = tipc_msg_create(TIPC_LOW_IMPORTANCE, TIPC_DIRECT_MSG,
BASIC_H_SIZE, 0, l->addr, tipc_own_addr(l->net),
@@ -1692,7 +1850,7 @@ void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl,
pr_warn("%sunable to create tunnel packet\n", link_co_err);
return;
}
- skb_queue_tail(&tnlq, skb);
+ __skb_queue_tail(&tnlq, skb);
tipc_link_xmit(l, &tnlq, &tmpxq);
__skb_queue_purge(&tmpxq);
@@ -1713,6 +1871,39 @@ tnl:
if (queue == &l->backlogq)
msg_set_seqno(hdr, seqno++);
pktlen = msg_size(hdr);
+
+ /* Tunnel link MTU is not large enough? This could be
+ * due to:
+ * 1) Link MTU has just changed or set differently;
+ * 2) Or FAILOVER on the top of a SYNCH message
+ *
+ * The 2nd case should not happen if peer supports
+ * TIPC_TUNNEL_ENHANCED
+ */
+ if (pktlen > tnl->mtu - INT_H_SIZE) {
+ if (mtyp == FAILOVER_MSG &&
+ (tnl->peer_caps & TIPC_TUNNEL_ENHANCED)) {
+ rc = tipc_msg_fragment(skb, &tnlhdr, tnl->mtu,
+ &frags);
+ if (rc) {
+ pr_warn("%sunable to frag msg: rc %d\n",
+ link_co_err, rc);
+ return;
+ }
+ pktcnt += skb_queue_len(&frags) - 1;
+ pktcnt_need_update = true;
+ skb_queue_splice_tail_init(&frags, &tnlq);
+ continue;
+ }
+ /* Unluckily, peer doesn't have TIPC_TUNNEL_ENHANCED
+ * => Just warn it and return!
+ */
+ pr_warn_ratelimited("%stoo large msg <%d, %d>: %d!\n",
+ link_co_err, msg_user(hdr),
+ msg_type(hdr), msg_size(hdr));
+ return;
+ }
+
msg_set_size(&tnlhdr, pktlen + INT_H_SIZE);
tnlskb = tipc_buf_acquire(pktlen + INT_H_SIZE, GFP_ATOMIC);
if (!tnlskb) {
@@ -1728,6 +1919,12 @@ tnl:
goto tnl;
}
+ if (pktcnt_need_update)
+ skb_queue_walk(&tnlq, skb) {
+ hdr = buf_msg(skb);
+ msg_set_msgcnt(hdr, pktcnt);
+ }
+
tipc_link_xmit(tnl, &tnlq, xmitq);
if (mtyp == FAILOVER_MSG) {
@@ -1762,7 +1959,7 @@ void tipc_link_failover_prepare(struct tipc_link *l, struct tipc_link *tnl,
tipc_link_create_dummy_tnl_msg(tnl, xmitq);
- /* This failover link enpoint was never established before,
+ /* This failover link endpoint was never established before,
* so it has not received anything from peer.
* Otherwise, it must be a normal failover situation or the
* node has entered SELF_DOWN_PEER_LEAVING and both peer nodes
@@ -1952,19 +2149,18 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb,
&l->mon_state, l->bearer_id);
/* Send NACK if peer has sent pkts we haven't received yet */
- if (more(peers_snd_nxt, rcv_nxt) && !tipc_link_is_synching(l))
+ if ((reply || msg_is_keepalive(hdr)) &&
+ more(peers_snd_nxt, rcv_nxt) &&
+ !tipc_link_is_synching(l) &&
+ skb_queue_empty(&l->deferdq))
rcvgap = peers_snd_nxt - l->rcv_nxt;
if (rcvgap || reply)
tipc_link_build_proto_msg(l, STATE_MSG, 0, reply,
rcvgap, 0, 0, xmitq);
rc |= tipc_link_advance_transmq(l, ack, gap, ga, xmitq);
-
- /* If NACK, retransmit will now start at right position */
if (gap)
l->stats.recv_nacks++;
-
- tipc_link_advance_backlog(l, xmitq);
if (unlikely(!skb_queue_empty(&l->wakeupq)))
link_prepare_wakeup(l);
}
@@ -2183,15 +2379,18 @@ int tipc_link_bc_nack_rcv(struct tipc_link *l, struct sk_buff *skb,
return 0;
}
-void tipc_link_set_queue_limits(struct tipc_link *l, u32 win)
+void tipc_link_set_queue_limits(struct tipc_link *l, u32 min_win, u32 max_win)
{
int max_bulk = TIPC_MAX_PUBL / (l->mtu / ITEM_SIZE);
- l->window = win;
- l->backlog[TIPC_LOW_IMPORTANCE].limit = max_t(u16, 50, win);
- l->backlog[TIPC_MEDIUM_IMPORTANCE].limit = max_t(u16, 100, win * 2);
- l->backlog[TIPC_HIGH_IMPORTANCE].limit = max_t(u16, 150, win * 3);
- l->backlog[TIPC_CRITICAL_IMPORTANCE].limit = max_t(u16, 200, win * 4);
+ l->min_win = min_win;
+ l->ssthresh = max_win;
+ l->max_win = max_win;
+ l->window = min_win;
+ l->backlog[TIPC_LOW_IMPORTANCE].limit = min_win * 2;
+ l->backlog[TIPC_MEDIUM_IMPORTANCE].limit = min_win * 4;
+ l->backlog[TIPC_HIGH_IMPORTANCE].limit = min_win * 6;
+ l->backlog[TIPC_CRITICAL_IMPORTANCE].limit = min_win * 8;
l->backlog[TIPC_SYSTEM_IMPORTANCE].limit = max_bulk;
}
@@ -2244,10 +2443,10 @@ int tipc_nl_parse_link_prop(struct nlattr *prop, struct nlattr *props[])
}
if (props[TIPC_NLA_PROP_WIN]) {
- u32 win;
+ u32 max_win;
- win = nla_get_u32(props[TIPC_NLA_PROP_WIN]);
- if ((win < TIPC_MIN_LINK_WIN) || (win > TIPC_MAX_LINK_WIN))
+ max_win = nla_get_u32(props[TIPC_NLA_PROP_WIN]);
+ if (max_win < TIPC_DEF_LINK_WIN || max_win > TIPC_MAX_LINK_WIN)
return -EINVAL;
}
@@ -2483,7 +2682,7 @@ int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg)
prop = nla_nest_start_noflag(msg->skb, TIPC_NLA_LINK_PROP);
if (!prop)
goto attr_msg_full;
- if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, bcl->window))
+ if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, bcl->max_win))
goto prop_msg_full;
if (nla_put_u32(msg->skb, TIPC_NLA_PROP_BROADCAST, bc_mode))
goto prop_msg_full;
diff --git a/net/tipc/link.h b/net/tipc/link.h
index adcad65e761c..d3c1c3fc1659 100644
--- a/net/tipc/link.h
+++ b/net/tipc/link.h
@@ -73,7 +73,7 @@ enum {
bool tipc_link_create(struct net *net, char *if_name, int bearer_id,
int tolerance, char net_plane, u32 mtu, int priority,
- int window, u32 session, u32 ownnode,
+ u32 min_win, u32 max_win, u32 session, u32 ownnode,
u32 peer, u8 *peer_id, u16 peer_caps,
struct tipc_link *bc_sndlink,
struct tipc_link *bc_rcvlink,
@@ -81,7 +81,7 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id,
struct sk_buff_head *namedq,
struct tipc_link **link);
bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer,
- int mtu, int window, u16 peer_caps,
+ int mtu, u32 min_win, u32 max_win, u16 peer_caps,
struct sk_buff_head *inputq,
struct sk_buff_head *namedq,
struct tipc_link *bc_sndlink,
@@ -115,7 +115,8 @@ char *tipc_link_name_ext(struct tipc_link *l, char *buf);
u32 tipc_link_state(struct tipc_link *l);
char tipc_link_plane(struct tipc_link *l);
int tipc_link_prio(struct tipc_link *l);
-int tipc_link_window(struct tipc_link *l);
+int tipc_link_min_win(struct tipc_link *l);
+int tipc_link_max_win(struct tipc_link *l);
void tipc_link_update_caps(struct tipc_link *l, u16 capabilities);
bool tipc_link_validate_msg(struct tipc_link *l, struct tipc_msg *hdr);
unsigned long tipc_link_tolerance(struct tipc_link *l);
@@ -124,7 +125,7 @@ void tipc_link_set_tolerance(struct tipc_link *l, u32 tol,
void tipc_link_set_prio(struct tipc_link *l, u32 prio,
struct sk_buff_head *xmitq);
void tipc_link_set_abort_limit(struct tipc_link *l, u32 limit);
-void tipc_link_set_queue_limits(struct tipc_link *l, u32 window);
+void tipc_link_set_queue_limits(struct tipc_link *l, u32 min_win, u32 max_win);
int __tipc_nl_add_link(struct net *net, struct tipc_nl_msg *msg,
struct tipc_link *link, int nlflags);
int tipc_nl_parse_link_prop(struct nlattr *prop, struct nlattr *props[]);
@@ -141,6 +142,7 @@ void tipc_link_remove_bc_peer(struct tipc_link *snd_l,
int tipc_link_bc_peers(struct tipc_link *l);
void tipc_link_set_mtu(struct tipc_link *l, int mtu);
int tipc_link_mtu(struct tipc_link *l);
+int tipc_link_mss(struct tipc_link *l);
void tipc_link_bc_ack_rcv(struct tipc_link *l, u16 acked,
struct sk_buff_head *xmitq);
void tipc_link_build_bc_sync_msg(struct tipc_link *l,
diff --git a/net/tipc/monitor.c b/net/tipc/monitor.c
index 6a6eae88442f..58708b4c7719 100644
--- a/net/tipc/monitor.c
+++ b/net/tipc/monitor.c
@@ -665,6 +665,21 @@ void tipc_mon_delete(struct net *net, int bearer_id)
kfree(mon);
}
+void tipc_mon_reinit_self(struct net *net)
+{
+ struct tipc_monitor *mon;
+ int bearer_id;
+
+ for (bearer_id = 0; bearer_id < MAX_BEARERS; bearer_id++) {
+ mon = tipc_monitor(net, bearer_id);
+ if (!mon)
+ continue;
+ write_lock_bh(&mon->lock);
+ mon->self->addr = tipc_own_addr(net);
+ write_unlock_bh(&mon->lock);
+ }
+}
+
int tipc_nl_monitor_set_threshold(struct net *net, u32 cluster_size)
{
struct tipc_net *tn = tipc_net(net);
diff --git a/net/tipc/monitor.h b/net/tipc/monitor.h
index 2a21b93e0d04..ed63d2e650b0 100644
--- a/net/tipc/monitor.h
+++ b/net/tipc/monitor.h
@@ -77,6 +77,7 @@ int __tipc_nl_add_monitor(struct net *net, struct tipc_nl_msg *msg,
u32 bearer_id);
int tipc_nl_add_monitor_peer(struct net *net, struct tipc_nl_msg *msg,
u32 bearer_id, u32 *prev_node);
+void tipc_mon_reinit_self(struct net *net);
extern const int tipc_max_domain_size;
#endif
diff --git a/net/tipc/msg.c b/net/tipc/msg.c
index f48e5857210f..0d515d20b056 100644
--- a/net/tipc/msg.c
+++ b/net/tipc/msg.c
@@ -39,10 +39,16 @@
#include "msg.h"
#include "addr.h"
#include "name_table.h"
+#include "crypto.h"
#define MAX_FORWARD_SIZE 1024
+#ifdef CONFIG_TIPC_CRYPTO
+#define BUF_HEADROOM ALIGN(((LL_MAX_HEADER + 48) + EHDR_MAX_SIZE), 16)
+#define BUF_TAILROOM (TIPC_AES_GCM_TAG_SIZE)
+#else
#define BUF_HEADROOM (LL_MAX_HEADER + 48)
#define BUF_TAILROOM 16
+#endif
static unsigned int align(unsigned int i)
{
@@ -61,7 +67,11 @@ static unsigned int align(unsigned int i)
struct sk_buff *tipc_buf_acquire(u32 size, gfp_t gfp)
{
struct sk_buff *skb;
+#ifdef CONFIG_TIPC_CRYPTO
+ unsigned int buf_size = (BUF_HEADROOM + size + BUF_TAILROOM + 3) & ~3u;
+#else
unsigned int buf_size = (BUF_HEADROOM + size + 3) & ~3u;
+#endif
skb = alloc_skb_fclone(buf_size, gfp);
if (skb) {
@@ -173,7 +183,7 @@ int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf)
}
if (fragid == LAST_FRAGMENT) {
- TIPC_SKB_CB(head)->validated = false;
+ TIPC_SKB_CB(head)->validated = 0;
if (unlikely(!tipc_msg_validate(&head)))
goto err;
*buf = head;
@@ -190,6 +200,59 @@ err:
return 0;
}
+/**
+ * tipc_msg_append(): Append data to tail of an existing buffer queue
+ * @hdr: header to be used
+ * @m: the data to be appended
+ * @mss: max allowable size of buffer
+ * @dlen: size of data to be appended
+ * @txq: queue to appand to
+ * Returns the number og 1k blocks appended or errno value
+ */
+int tipc_msg_append(struct tipc_msg *_hdr, struct msghdr *m, int dlen,
+ int mss, struct sk_buff_head *txq)
+{
+ struct sk_buff *skb, *prev;
+ int accounted, total, curr;
+ int mlen, cpy, rem = dlen;
+ struct tipc_msg *hdr;
+
+ skb = skb_peek_tail(txq);
+ accounted = skb ? msg_blocks(buf_msg(skb)) : 0;
+ total = accounted;
+
+ while (rem) {
+ if (!skb || skb->len >= mss) {
+ prev = skb;
+ skb = tipc_buf_acquire(mss, GFP_KERNEL);
+ if (unlikely(!skb))
+ return -ENOMEM;
+ skb_orphan(skb);
+ skb_trim(skb, MIN_H_SIZE);
+ hdr = buf_msg(skb);
+ skb_copy_to_linear_data(skb, _hdr, MIN_H_SIZE);
+ msg_set_hdr_sz(hdr, MIN_H_SIZE);
+ msg_set_size(hdr, MIN_H_SIZE);
+ __skb_queue_tail(txq, skb);
+ total += 1;
+ if (prev)
+ msg_set_ack_required(buf_msg(prev), 0);
+ msg_set_ack_required(hdr, 1);
+ }
+ hdr = buf_msg(skb);
+ curr = msg_blocks(hdr);
+ mlen = msg_size(hdr);
+ cpy = min_t(int, rem, mss - mlen);
+ if (cpy != copy_from_iter(skb->data + mlen, cpy, &m->msg_iter))
+ return -EFAULT;
+ msg_set_size(hdr, mlen + cpy);
+ skb_put(skb, cpy);
+ rem -= cpy;
+ total += msg_blocks(hdr) - curr;
+ }
+ return total - accounted;
+}
+
/* tipc_msg_validate - validate basic format of received message
*
* This routine ensures a TIPC message has an acceptable header, and at least
@@ -218,6 +281,7 @@ bool tipc_msg_validate(struct sk_buff **_skb)
if (unlikely(TIPC_SKB_CB(skb)->validated))
return true;
+
if (unlikely(!pskb_may_pull(skb, MIN_H_SIZE)))
return false;
@@ -239,11 +303,70 @@ bool tipc_msg_validate(struct sk_buff **_skb)
if (unlikely(skb->len < msz))
return false;
- TIPC_SKB_CB(skb)->validated = true;
+ TIPC_SKB_CB(skb)->validated = 1;
return true;
}
/**
+ * tipc_msg_fragment - build a fragment skb list for TIPC message
+ *
+ * @skb: TIPC message skb
+ * @hdr: internal msg header to be put on the top of the fragments
+ * @pktmax: max size of a fragment incl. the header
+ * @frags: returned fragment skb list
+ *
+ * Returns 0 if the fragmentation is successful, otherwise: -EINVAL
+ * or -ENOMEM
+ */
+int tipc_msg_fragment(struct sk_buff *skb, const struct tipc_msg *hdr,
+ int pktmax, struct sk_buff_head *frags)
+{
+ int pktno, nof_fragms, dsz, dmax, eat;
+ struct tipc_msg *_hdr;
+ struct sk_buff *_skb;
+ u8 *data;
+
+ /* Non-linear buffer? */
+ if (skb_linearize(skb))
+ return -ENOMEM;
+
+ data = (u8 *)skb->data;
+ dsz = msg_size(buf_msg(skb));
+ dmax = pktmax - INT_H_SIZE;
+ if (dsz <= dmax || !dmax)
+ return -EINVAL;
+
+ nof_fragms = dsz / dmax + 1;
+ for (pktno = 1; pktno <= nof_fragms; pktno++) {
+ if (pktno < nof_fragms)
+ eat = dmax;
+ else
+ eat = dsz % dmax;
+ /* Allocate a new fragment */
+ _skb = tipc_buf_acquire(INT_H_SIZE + eat, GFP_ATOMIC);
+ if (!_skb)
+ goto error;
+ skb_orphan(_skb);
+ __skb_queue_tail(frags, _skb);
+ /* Copy header & data to the fragment */
+ skb_copy_to_linear_data(_skb, hdr, INT_H_SIZE);
+ skb_copy_to_linear_data_offset(_skb, INT_H_SIZE, data, eat);
+ data += eat;
+ /* Update the fragment's header */
+ _hdr = buf_msg(_skb);
+ msg_set_fragm_no(_hdr, pktno);
+ msg_set_nof_fragms(_hdr, nof_fragms);
+ msg_set_size(_hdr, INT_H_SIZE + eat);
+ }
+ return 0;
+
+error:
+ __skb_queue_purge(frags);
+ __skb_queue_head_init(frags);
+ return -ENOMEM;
+}
+
+/**
* tipc_msg_build - create buffer chain containing specified header and data
* @mhdr: Message header, to be prepended to data
* @m: User message
@@ -360,48 +483,98 @@ error:
}
/**
- * tipc_msg_bundle(): Append contents of a buffer to tail of an existing one
- * @skb: the buffer to append to ("bundle")
- * @msg: message to be appended
- * @mtu: max allowable size for the bundle buffer
- * Consumes buffer if successful
- * Returns true if bundling could be performed, otherwise false
+ * tipc_msg_bundle - Append contents of a buffer to tail of an existing one
+ * @bskb: the bundle buffer to append to
+ * @msg: message to be appended
+ * @max: max allowable size for the bundle buffer
+ *
+ * Returns "true" if bundling has been performed, otherwise "false"
*/
-bool tipc_msg_bundle(struct sk_buff *skb, struct tipc_msg *msg, u32 mtu)
+static bool tipc_msg_bundle(struct sk_buff *bskb, struct tipc_msg *msg,
+ u32 max)
{
- struct tipc_msg *bmsg;
- unsigned int bsz;
- unsigned int msz = msg_size(msg);
- u32 start, pad;
- u32 max = mtu - INT_H_SIZE;
+ struct tipc_msg *bmsg = buf_msg(bskb);
+ u32 msz, bsz, offset, pad;
- if (likely(msg_user(msg) == MSG_FRAGMENTER))
- return false;
- if (!skb)
- return false;
- bmsg = buf_msg(skb);
+ msz = msg_size(msg);
bsz = msg_size(bmsg);
- start = align(bsz);
- pad = start - bsz;
+ offset = align(bsz);
+ pad = offset - bsz;
- if (unlikely(msg_user(msg) == TUNNEL_PROTOCOL))
+ if (unlikely(skb_tailroom(bskb) < (pad + msz)))
return false;
- if (unlikely(msg_user(msg) == BCAST_PROTOCOL))
+ if (unlikely(max < (offset + msz)))
return false;
- if (unlikely(msg_user(bmsg) != MSG_BUNDLER))
+
+ skb_put(bskb, pad + msz);
+ skb_copy_to_linear_data_offset(bskb, offset, msg, msz);
+ msg_set_size(bmsg, offset + msz);
+ msg_set_msgcnt(bmsg, msg_msgcnt(bmsg) + 1);
+ return true;
+}
+
+/**
+ * tipc_msg_try_bundle - Try to bundle a new message to the last one
+ * @tskb: the last/target message to which the new one will be appended
+ * @skb: the new message skb pointer
+ * @mss: max message size (header inclusive)
+ * @dnode: destination node for the message
+ * @new_bundle: if this call made a new bundle or not
+ *
+ * Return: "true" if the new message skb is potential for bundling this time or
+ * later, in the case a bundling has been done this time, the skb is consumed
+ * (the skb pointer = NULL).
+ * Otherwise, "false" if the skb cannot be bundled at all.
+ */
+bool tipc_msg_try_bundle(struct sk_buff *tskb, struct sk_buff **skb, u32 mss,
+ u32 dnode, bool *new_bundle)
+{
+ struct tipc_msg *msg, *inner, *outer;
+ u32 tsz;
+
+ /* First, check if the new buffer is suitable for bundling */
+ msg = buf_msg(*skb);
+ if (msg_user(msg) == MSG_FRAGMENTER)
return false;
- if (unlikely(skb_tailroom(skb) < (pad + msz)))
+ if (msg_user(msg) == TUNNEL_PROTOCOL)
return false;
- if (unlikely(max < (start + msz)))
+ if (msg_user(msg) == BCAST_PROTOCOL)
return false;
- if ((msg_importance(msg) < TIPC_SYSTEM_IMPORTANCE) &&
- (msg_importance(bmsg) == TIPC_SYSTEM_IMPORTANCE))
+ if (mss <= INT_H_SIZE + msg_size(msg))
return false;
- skb_put(skb, pad + msz);
- skb_copy_to_linear_data_offset(skb, start, msg, msz);
- msg_set_size(bmsg, start + msz);
- msg_set_msgcnt(bmsg, msg_msgcnt(bmsg) + 1);
+ /* Ok, but the last/target buffer can be empty? */
+ if (unlikely(!tskb))
+ return true;
+
+ /* Is it a bundle already? Try to bundle the new message to it */
+ if (msg_user(buf_msg(tskb)) == MSG_BUNDLER) {
+ *new_bundle = false;
+ goto bundle;
+ }
+
+ /* Make a new bundle of the two messages if possible */
+ tsz = msg_size(buf_msg(tskb));
+ if (unlikely(mss < align(INT_H_SIZE + tsz) + msg_size(msg)))
+ return true;
+ if (unlikely(pskb_expand_head(tskb, INT_H_SIZE, mss - tsz - INT_H_SIZE,
+ GFP_ATOMIC)))
+ return true;
+ inner = buf_msg(tskb);
+ skb_push(tskb, INT_H_SIZE);
+ outer = buf_msg(tskb);
+ tipc_msg_init(msg_prevnode(inner), outer, MSG_BUNDLER, 0, INT_H_SIZE,
+ dnode);
+ msg_set_importance(outer, msg_importance(inner));
+ msg_set_size(outer, INT_H_SIZE + tsz);
+ msg_set_msgcnt(outer, 1);
+ *new_bundle = true;
+
+bundle:
+ if (likely(tipc_msg_bundle(tskb, msg, mss))) {
+ consume_skb(*skb);
+ *skb = NULL;
+ }
return true;
}
@@ -451,52 +624,6 @@ none:
}
/**
- * tipc_msg_make_bundle(): Create bundle buf and append message to its tail
- * @list: the buffer chain, where head is the buffer to replace/append
- * @skb: buffer to be created, appended to and returned in case of success
- * @msg: message to be appended
- * @mtu: max allowable size for the bundle buffer, inclusive header
- * @dnode: destination node for message. (Not always present in header)
- * Returns true if success, otherwise false
- */
-bool tipc_msg_make_bundle(struct sk_buff **skb, struct tipc_msg *msg,
- u32 mtu, u32 dnode)
-{
- struct sk_buff *_skb;
- struct tipc_msg *bmsg;
- u32 msz = msg_size(msg);
- u32 max = mtu - INT_H_SIZE;
-
- if (msg_user(msg) == MSG_FRAGMENTER)
- return false;
- if (msg_user(msg) == TUNNEL_PROTOCOL)
- return false;
- if (msg_user(msg) == BCAST_PROTOCOL)
- return false;
- if (msz > (max / 2))
- return false;
-
- _skb = tipc_buf_acquire(max, GFP_ATOMIC);
- if (!_skb)
- return false;
-
- skb_trim(_skb, INT_H_SIZE);
- bmsg = buf_msg(_skb);
- tipc_msg_init(msg_prevnode(msg), bmsg, MSG_BUNDLER, 0,
- INT_H_SIZE, dnode);
- if (msg_isdata(msg))
- msg_set_importance(bmsg, TIPC_CRITICAL_IMPORTANCE);
- else
- msg_set_importance(bmsg, TIPC_SYSTEM_IMPORTANCE);
- msg_set_seqno(bmsg, msg_seqno(msg));
- msg_set_ack(bmsg, msg_ack(msg));
- msg_set_bcast_ack(bmsg, msg_bcast_ack(msg));
- tipc_msg_bundle(_skb, msg, mtu);
- *skb = _skb;
- return true;
-}
-
-/**
* tipc_msg_reverse(): swap source and destination addresses and add error code
* @own_node: originating node id for reversed message
* @skb: buffer containing message to be reversed; will be consumed
diff --git a/net/tipc/msg.h b/net/tipc/msg.h
index d7ebc9e955f6..6d466ebdb64f 100644
--- a/net/tipc/msg.h
+++ b/net/tipc/msg.h
@@ -102,16 +102,42 @@ struct plist;
#define TIPC_MEDIA_INFO_OFFSET 5
struct tipc_skb_cb {
- struct sk_buff *tail;
- unsigned long nxt_retr;
- unsigned long retr_stamp;
- u32 bytes_read;
- u32 orig_member;
- u16 chain_imp;
- u16 ackers;
- u16 retr_cnt;
- bool validated;
-};
+ union {
+ struct {
+ struct sk_buff *tail;
+ unsigned long nxt_retr;
+ unsigned long retr_stamp;
+ u32 bytes_read;
+ u32 orig_member;
+ u16 chain_imp;
+ u16 ackers;
+ u16 retr_cnt;
+ } __packed;
+#ifdef CONFIG_TIPC_CRYPTO
+ struct {
+ struct tipc_crypto *rx;
+ struct tipc_aead *last;
+ u8 recurs;
+ } tx_clone_ctx __packed;
+#endif
+ } __packed;
+ union {
+ struct {
+ u8 validated:1;
+#ifdef CONFIG_TIPC_CRYPTO
+ u8 encrypted:1;
+ u8 decrypted:1;
+ u8 probe:1;
+ u8 tx_clone_deferred:1;
+#endif
+ };
+ u8 flags;
+ };
+ u8 reserved;
+#ifdef CONFIG_TIPC_CRYPTO
+ void *crypto_ctx;
+#endif
+} __packed;
#define TIPC_SKB_CB(__skb) ((struct tipc_skb_cb *)&((__skb)->cb[0]))
@@ -290,6 +316,16 @@ static inline void msg_set_src_droppable(struct tipc_msg *m, u32 d)
msg_set_bits(m, 0, 18, 1, d);
}
+static inline int msg_ack_required(struct tipc_msg *m)
+{
+ return msg_bits(m, 0, 18, 1);
+}
+
+static inline void msg_set_ack_required(struct tipc_msg *m, u32 d)
+{
+ msg_set_bits(m, 0, 18, 1, d);
+}
+
static inline bool msg_is_rcast(struct tipc_msg *m)
{
return msg_bits(m, 0, 18, 0x1);
@@ -723,12 +759,26 @@ static inline void msg_set_last_bcast(struct tipc_msg *m, u32 n)
msg_set_bits(m, 4, 16, 0xffff, n);
}
+static inline u32 msg_nof_fragms(struct tipc_msg *m)
+{
+ return msg_bits(m, 4, 0, 0xffff);
+}
+
+static inline void msg_set_nof_fragms(struct tipc_msg *m, u32 n)
+{
+ msg_set_bits(m, 4, 0, 0xffff, n);
+}
+
+static inline u32 msg_fragm_no(struct tipc_msg *m)
+{
+ return msg_bits(m, 4, 16, 0xffff);
+}
+
static inline void msg_set_fragm_no(struct tipc_msg *m, u32 n)
{
msg_set_bits(m, 4, 16, 0xffff, n);
}
-
static inline u16 msg_next_sent(struct tipc_msg *m)
{
return msg_bits(m, 4, 0, 0xffff);
@@ -879,6 +929,16 @@ static inline void msg_set_msgcnt(struct tipc_msg *m, u16 n)
msg_set_bits(m, 9, 16, 0xffff, n);
}
+static inline u16 msg_syncpt(struct tipc_msg *m)
+{
+ return msg_bits(m, 9, 16, 0xffff);
+}
+
+static inline void msg_set_syncpt(struct tipc_msg *m, u16 n)
+{
+ msg_set_bits(m, 9, 16, 0xffff, n);
+}
+
static inline u32 msg_conn_ack(struct tipc_msg *m)
{
return msg_bits(m, 9, 16, 0xffff);
@@ -1002,6 +1062,20 @@ static inline bool msg_is_reset(struct tipc_msg *hdr)
return (msg_user(hdr) == LINK_PROTOCOL) && (msg_type(hdr) == RESET_MSG);
}
+/* Word 13
+ */
+static inline void msg_set_peer_net_hash(struct tipc_msg *m, u32 n)
+{
+ msg_set_word(m, 13, n);
+}
+
+static inline u32 msg_peer_net_hash(struct tipc_msg *m)
+{
+ return msg_word(m, 13);
+}
+
+/* Word 14
+ */
static inline u32 msg_sugg_node_addr(struct tipc_msg *m)
{
return msg_word(m, 14);
@@ -1033,12 +1107,15 @@ struct sk_buff *tipc_msg_create(uint user, uint type, uint hdr_sz,
uint data_sz, u32 dnode, u32 onode,
u32 dport, u32 oport, int errcode);
int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf);
-bool tipc_msg_bundle(struct sk_buff *skb, struct tipc_msg *msg, u32 mtu);
-bool tipc_msg_make_bundle(struct sk_buff **skb, struct tipc_msg *msg,
- u32 mtu, u32 dnode);
+bool tipc_msg_try_bundle(struct sk_buff *tskb, struct sk_buff **skb, u32 mss,
+ u32 dnode, bool *new_bundle);
bool tipc_msg_extract(struct sk_buff *skb, struct sk_buff **iskb, int *pos);
+int tipc_msg_fragment(struct sk_buff *skb, const struct tipc_msg *hdr,
+ int pktmax, struct sk_buff_head *frags);
int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m,
int offset, int dsz, int mtu, struct sk_buff_head *list);
+int tipc_msg_append(struct tipc_msg *hdr, struct msghdr *m, int dlen,
+ int mss, struct sk_buff_head *txq);
bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err);
bool tipc_msg_assemble(struct sk_buff_head *list);
bool tipc_msg_reassemble(struct sk_buff_head *list, struct sk_buff_head *rcvq);
diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c
index 44abc8e9c990..5feaf3b67380 100644
--- a/net/tipc/name_distr.c
+++ b/net/tipc/name_distr.c
@@ -146,7 +146,7 @@ static void named_distribute(struct net *net, struct sk_buff_head *list,
struct publication *publ;
struct sk_buff *skb = NULL;
struct distr_item *item = NULL;
- u32 msg_dsz = ((tipc_node_get_mtu(net, dnode, 0) - INT_H_SIZE) /
+ u32 msg_dsz = ((tipc_node_get_mtu(net, dnode, 0, false) - INT_H_SIZE) /
ITEM_SIZE) * ITEM_SIZE;
u32 msg_rem = msg_dsz;
@@ -190,7 +190,7 @@ void tipc_named_node_up(struct net *net, u32 dnode)
struct name_table *nt = tipc_name_table(net);
struct sk_buff_head head;
- skb_queue_head_init(&head);
+ __skb_queue_head_init(&head);
read_lock_bh(&nt->cluster_scope_lock);
named_distribute(net, &head, dnode, &nt->cluster_scope);
@@ -223,7 +223,8 @@ static void tipc_publ_purge(struct net *net, struct publication *publ, u32 addr)
publ->key);
}
- kfree_rcu(p, rcu);
+ if (p)
+ kfree_rcu(p, rcu);
}
/**
diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index 66a65c2cdb23..359b2bc888cf 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -35,6 +35,8 @@
*/
#include <net/sock.h>
+#include <linux/list_sort.h>
+#include <linux/rbtree_augmented.h>
#include "core.h"
#include "netlink.h"
#include "name_table.h"
@@ -50,6 +52,7 @@
* @lower: service range lower bound
* @upper: service range upper bound
* @tree_node: member of service range RB tree
+ * @max: largest 'upper' in this node subtree
* @local_publ: list of identical publications made from this node
* Used by closest_first lookup and multicast lookup algorithm
* @all_publ: all publications identical to this one, whatever node and scope
@@ -59,6 +62,7 @@ struct service_range {
u32 lower;
u32 upper;
struct rb_node tree_node;
+ u32 max;
struct list_head local_publ;
struct list_head all_publ;
};
@@ -66,6 +70,7 @@ struct service_range {
/**
* struct tipc_service - container for all published instances of a service type
* @type: 32 bit 'type' value for service
+ * @publ_cnt: increasing counter for publications in this service
* @ranges: rb tree containing all service ranges for this service
* @service_list: links to adjacent name ranges in hash chain
* @subscriptions: list of subscriptions for this service type
@@ -74,6 +79,7 @@ struct service_range {
*/
struct tipc_service {
u32 type;
+ u32 publ_cnt;
struct rb_root ranges;
struct hlist_node service_list;
struct list_head subscriptions;
@@ -81,6 +87,130 @@ struct tipc_service {
struct rcu_head rcu;
};
+#define service_range_upper(sr) ((sr)->upper)
+RB_DECLARE_CALLBACKS_MAX(static, sr_callbacks,
+ struct service_range, tree_node, u32, max,
+ service_range_upper)
+
+#define service_range_entry(rbtree_node) \
+ (container_of(rbtree_node, struct service_range, tree_node))
+
+#define service_range_overlap(sr, start, end) \
+ ((sr)->lower <= (end) && (sr)->upper >= (start))
+
+/**
+ * service_range_foreach_match - iterate over tipc service rbtree for each
+ * range match
+ * @sr: the service range pointer as a loop cursor
+ * @sc: the pointer to tipc service which holds the service range rbtree
+ * @start, end: the range (end >= start) for matching
+ */
+#define service_range_foreach_match(sr, sc, start, end) \
+ for (sr = service_range_match_first((sc)->ranges.rb_node, \
+ start, \
+ end); \
+ sr; \
+ sr = service_range_match_next(&(sr)->tree_node, \
+ start, \
+ end))
+
+/**
+ * service_range_match_first - find first service range matching a range
+ * @n: the root node of service range rbtree for searching
+ * @start, end: the range (end >= start) for matching
+ *
+ * Return: the leftmost service range node in the rbtree that overlaps the
+ * specific range if any. Otherwise, returns NULL.
+ */
+static struct service_range *service_range_match_first(struct rb_node *n,
+ u32 start, u32 end)
+{
+ struct service_range *sr;
+ struct rb_node *l, *r;
+
+ /* Non overlaps in tree at all? */
+ if (!n || service_range_entry(n)->max < start)
+ return NULL;
+
+ while (n) {
+ l = n->rb_left;
+ if (l && service_range_entry(l)->max >= start) {
+ /* A leftmost overlap range node must be one in the left
+ * subtree. If not, it has lower > end, then nodes on
+ * the right side cannot satisfy the condition either.
+ */
+ n = l;
+ continue;
+ }
+
+ /* No one in the left subtree can match, return if this node is
+ * an overlap i.e. leftmost.
+ */
+ sr = service_range_entry(n);
+ if (service_range_overlap(sr, start, end))
+ return sr;
+
+ /* Ok, try to lookup on the right side */
+ r = n->rb_right;
+ if (sr->lower <= end &&
+ r && service_range_entry(r)->max >= start) {
+ n = r;
+ continue;
+ }
+ break;
+ }
+
+ return NULL;
+}
+
+/**
+ * service_range_match_next - find next service range matching a range
+ * @n: a node in service range rbtree from which the searching starts
+ * @start, end: the range (end >= start) for matching
+ *
+ * Return: the next service range node to the given node in the rbtree that
+ * overlaps the specific range if any. Otherwise, returns NULL.
+ */
+static struct service_range *service_range_match_next(struct rb_node *n,
+ u32 start, u32 end)
+{
+ struct service_range *sr;
+ struct rb_node *p, *r;
+
+ while (n) {
+ r = n->rb_right;
+ if (r && service_range_entry(r)->max >= start)
+ /* A next overlap range node must be one in the right
+ * subtree. If not, it has lower > end, then any next
+ * successor (- an ancestor) of this node cannot
+ * satisfy the condition either.
+ */
+ return service_range_match_first(r, start, end);
+
+ /* No one in the right subtree can match, go up to find an
+ * ancestor of this node which is parent of a left-hand child.
+ */
+ while ((p = rb_parent(n)) && n == p->rb_right)
+ n = p;
+ if (!p)
+ break;
+
+ /* Return if this ancestor is an overlap */
+ sr = service_range_entry(p);
+ if (service_range_overlap(sr, start, end))
+ return sr;
+
+ /* Ok, try to lookup more from this ancestor */
+ if (sr->lower <= end) {
+ n = p;
+ continue;
+ }
+ break;
+ }
+
+ return NULL;
+}
+
static int hash(int x)
{
return x & (TIPC_NAMETBL_SIZE - 1);
@@ -109,6 +239,7 @@ static struct publication *tipc_publ_create(u32 type, u32 lower, u32 upper,
INIT_LIST_HEAD(&publ->binding_node);
INIT_LIST_HEAD(&publ->local_publ);
INIT_LIST_HEAD(&publ->all_publ);
+ INIT_LIST_HEAD(&publ->list);
return publ;
}
@@ -135,84 +266,51 @@ static struct tipc_service *tipc_service_create(u32 type, struct hlist_head *hd)
return service;
}
-/**
- * tipc_service_first_range - find first service range in tree matching instance
- *
- * Very time-critical, so binary search through range rb tree
- */
-static struct service_range *tipc_service_first_range(struct tipc_service *sc,
- u32 instance)
-{
- struct rb_node *n = sc->ranges.rb_node;
- struct service_range *sr;
-
- while (n) {
- sr = container_of(n, struct service_range, tree_node);
- if (sr->lower > instance)
- n = n->rb_left;
- else if (sr->upper < instance)
- n = n->rb_right;
- else
- return sr;
- }
- return NULL;
-}
-
/* tipc_service_find_range - find service range matching publication parameters
*/
static struct service_range *tipc_service_find_range(struct tipc_service *sc,
u32 lower, u32 upper)
{
- struct rb_node *n = sc->ranges.rb_node;
struct service_range *sr;
- sr = tipc_service_first_range(sc, lower);
- if (!sr)
- return NULL;
-
- /* Look for exact match */
- for (n = &sr->tree_node; n; n = rb_next(n)) {
- sr = container_of(n, struct service_range, tree_node);
- if (sr->upper == upper)
- break;
+ service_range_foreach_match(sr, sc, lower, upper) {
+ /* Look for exact match */
+ if (sr->lower == lower && sr->upper == upper)
+ return sr;
}
- if (!n || sr->lower != lower || sr->upper != upper)
- return NULL;
- return sr;
+ return NULL;
}
static struct service_range *tipc_service_create_range(struct tipc_service *sc,
u32 lower, u32 upper)
{
struct rb_node **n, *parent = NULL;
- struct service_range *sr, *tmp;
+ struct service_range *sr;
n = &sc->ranges.rb_node;
while (*n) {
- tmp = container_of(*n, struct service_range, tree_node);
parent = *n;
- tmp = container_of(parent, struct service_range, tree_node);
- if (lower < tmp->lower)
- n = &(*n)->rb_left;
- else if (lower > tmp->lower)
- n = &(*n)->rb_right;
- else if (upper < tmp->upper)
- n = &(*n)->rb_left;
- else if (upper > tmp->upper)
- n = &(*n)->rb_right;
+ sr = service_range_entry(parent);
+ if (lower == sr->lower && upper == sr->upper)
+ return sr;
+ if (sr->max < upper)
+ sr->max = upper;
+ if (lower <= sr->lower)
+ n = &parent->rb_left;
else
- return tmp;
+ n = &parent->rb_right;
}
sr = kzalloc(sizeof(*sr), GFP_ATOMIC);
if (!sr)
return NULL;
sr->lower = lower;
sr->upper = upper;
+ sr->max = upper;
INIT_LIST_HEAD(&sr->local_publ);
INIT_LIST_HEAD(&sr->all_publ);
rb_link_node(&sr->tree_node, parent, n);
- rb_insert_color(&sr->tree_node, &sc->ranges);
+ rb_insert_augmented(&sr->tree_node, &sc->ranges, &sr_callbacks);
return sr;
}
@@ -244,6 +342,8 @@ static struct publication *tipc_service_insert_publ(struct net *net,
p = tipc_publ_create(type, lower, upper, scope, node, port, key);
if (!p)
goto err;
+ /* Suppose there shouldn't be a huge gap btw publs i.e. >INT_MAX */
+ p->id = sc->publ_cnt++;
if (in_own_node(net, node))
list_add(&p->local_publ, &sr->local_publ);
list_add(&p->all_publ, &sr->all_publ);
@@ -278,6 +378,20 @@ static struct publication *tipc_service_remove_publ(struct service_range *sr,
}
/**
+ * Code reused: time_after32() for the same purpose
+ */
+#define publication_after(pa, pb) time_after32((pa)->id, (pb)->id)
+static int tipc_publ_sort(void *priv, struct list_head *a,
+ struct list_head *b)
+{
+ struct publication *pa, *pb;
+
+ pa = container_of(a, struct publication, list);
+ pb = container_of(b, struct publication, list);
+ return publication_after(pa, pb);
+}
+
+/**
* tipc_service_subscribe - attach a subscription, and optionally
* issue the prescribed number of events if there is any service
* range overlapping with the requested range
@@ -286,36 +400,44 @@ static void tipc_service_subscribe(struct tipc_service *service,
struct tipc_subscription *sub)
{
struct tipc_subscr *sb = &sub->evt.s;
+ struct publication *p, *first, *tmp;
+ struct list_head publ_list;
struct service_range *sr;
struct tipc_name_seq ns;
- struct publication *p;
- struct rb_node *n;
- bool first;
+ u32 filter;
ns.type = tipc_sub_read(sb, seq.type);
ns.lower = tipc_sub_read(sb, seq.lower);
ns.upper = tipc_sub_read(sb, seq.upper);
+ filter = tipc_sub_read(sb, filter);
tipc_sub_get(sub);
list_add(&sub->service_list, &service->subscriptions);
- if (tipc_sub_read(sb, filter) & TIPC_SUB_NO_STATUS)
+ if (filter & TIPC_SUB_NO_STATUS)
return;
- for (n = rb_first(&service->ranges); n; n = rb_next(n)) {
- sr = container_of(n, struct service_range, tree_node);
- if (sr->lower > ns.upper)
- break;
- if (!tipc_sub_check_overlap(&ns, sr->lower, sr->upper))
- continue;
- first = true;
-
+ INIT_LIST_HEAD(&publ_list);
+ service_range_foreach_match(sr, service, ns.lower, ns.upper) {
+ first = NULL;
list_for_each_entry(p, &sr->all_publ, all_publ) {
- tipc_sub_report_overlap(sub, sr->lower, sr->upper,
- TIPC_PUBLISHED, p->port,
- p->node, p->scope, first);
- first = false;
+ if (filter & TIPC_SUB_PORTS)
+ list_add_tail(&p->list, &publ_list);
+ else if (!first || publication_after(first, p))
+ /* Pick this range's *first* publication */
+ first = p;
}
+ if (first)
+ list_add_tail(&first->list, &publ_list);
+ }
+
+ /* Sort the publications before reporting */
+ list_sort(NULL, &publ_list, tipc_publ_sort);
+ list_for_each_entry_safe(p, tmp, &publ_list, list) {
+ tipc_sub_report_overlap(sub, p->lower, p->upper,
+ TIPC_PUBLISHED, p->port, p->node,
+ p->scope, true);
+ list_del_init(&p->list);
}
}
@@ -390,7 +512,7 @@ struct publication *tipc_nametbl_remove_publ(struct net *net, u32 type,
/* Remove service range item if this was its last publication */
if (list_empty(&sr->all_publ)) {
- rb_erase(&sr->tree_node, &sc->ranges);
+ rb_erase_augmented(&sr->tree_node, &sc->ranges, &sr_callbacks);
kfree(sr);
}
@@ -438,34 +560,39 @@ u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, u32 *dnode)
rcu_read_lock();
sc = tipc_service_find(net, type);
if (unlikely(!sc))
- goto not_found;
+ goto exit;
spin_lock_bh(&sc->lock);
- sr = tipc_service_first_range(sc, instance);
- if (unlikely(!sr))
- goto no_match;
-
- /* Select lookup algorithm: local, closest-first or round-robin */
- if (*dnode == self) {
- list = &sr->local_publ;
- if (list_empty(list))
- goto no_match;
- p = list_first_entry(list, struct publication, local_publ);
- list_move_tail(&p->local_publ, &sr->local_publ);
- } else if (legacy && !*dnode && !list_empty(&sr->local_publ)) {
- list = &sr->local_publ;
- p = list_first_entry(list, struct publication, local_publ);
- list_move_tail(&p->local_publ, &sr->local_publ);
- } else {
- list = &sr->all_publ;
- p = list_first_entry(list, struct publication, all_publ);
- list_move_tail(&p->all_publ, &sr->all_publ);
+ service_range_foreach_match(sr, sc, instance, instance) {
+ /* Select lookup algo: local, closest-first or round-robin */
+ if (*dnode == self) {
+ list = &sr->local_publ;
+ if (list_empty(list))
+ continue;
+ p = list_first_entry(list, struct publication,
+ local_publ);
+ list_move_tail(&p->local_publ, &sr->local_publ);
+ } else if (legacy && !*dnode && !list_empty(&sr->local_publ)) {
+ list = &sr->local_publ;
+ p = list_first_entry(list, struct publication,
+ local_publ);
+ list_move_tail(&p->local_publ, &sr->local_publ);
+ } else {
+ list = &sr->all_publ;
+ p = list_first_entry(list, struct publication,
+ all_publ);
+ list_move_tail(&p->all_publ, &sr->all_publ);
+ }
+ port = p->port;
+ node = p->node;
+ /* Todo: as for legacy, pick the first matching range only, a
+ * "true" round-robin will be performed as needed.
+ */
+ break;
}
- port = p->port;
- node = p->node;
-no_match:
spin_unlock_bh(&sc->lock);
-not_found:
+
+exit:
rcu_read_unlock();
*dnode = node;
return port;
@@ -488,7 +615,8 @@ bool tipc_nametbl_lookup(struct net *net, u32 type, u32 instance, u32 scope,
spin_lock_bh(&sc->lock);
- sr = tipc_service_first_range(sc, instance);
+ /* Todo: a full search i.e. service_range_foreach_match() instead? */
+ sr = service_range_match_first(sc->ranges.rb_node, instance, instance);
if (!sr)
goto no_match;
@@ -517,7 +645,6 @@ void tipc_nametbl_mc_lookup(struct net *net, u32 type, u32 lower, u32 upper,
struct service_range *sr;
struct tipc_service *sc;
struct publication *p;
- struct rb_node *n;
rcu_read_lock();
sc = tipc_service_find(net, type);
@@ -525,13 +652,7 @@ void tipc_nametbl_mc_lookup(struct net *net, u32 type, u32 lower, u32 upper,
goto exit;
spin_lock_bh(&sc->lock);
-
- for (n = rb_first(&sc->ranges); n; n = rb_next(n)) {
- sr = container_of(n, struct service_range, tree_node);
- if (sr->upper < lower)
- continue;
- if (sr->lower > upper)
- break;
+ service_range_foreach_match(sr, sc, lower, upper) {
list_for_each_entry(p, &sr->local_publ, local_publ) {
if (p->scope == scope || (!exact && p->scope < scope))
tipc_dest_push(dports, 0, p->port);
@@ -552,7 +673,6 @@ void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower,
struct service_range *sr;
struct tipc_service *sc;
struct publication *p;
- struct rb_node *n;
rcu_read_lock();
sc = tipc_service_find(net, type);
@@ -560,13 +680,7 @@ void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower,
goto exit;
spin_lock_bh(&sc->lock);
-
- for (n = rb_first(&sc->ranges); n; n = rb_next(n)) {
- sr = container_of(n, struct service_range, tree_node);
- if (sr->upper < lower)
- continue;
- if (sr->lower > upper)
- break;
+ service_range_foreach_match(sr, sc, lower, upper) {
list_for_each_entry(p, &sr->all_publ, all_publ) {
tipc_nlist_add(nodes, p->node);
}
@@ -764,7 +878,7 @@ static void tipc_service_delete(struct net *net, struct tipc_service *sc)
tipc_service_remove_publ(sr, p->node, p->key);
kfree_rcu(p, rcu);
}
- rb_erase(&sr->tree_node, &sc->ranges);
+ rb_erase_augmented(&sr->tree_node, &sc->ranges, &sr_callbacks);
kfree(sr);
}
hlist_del_init_rcu(&sc->service_list);
diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h
index f79066334cc8..728bc7016c38 100644
--- a/net/tipc/name_table.h
+++ b/net/tipc/name_table.h
@@ -58,6 +58,7 @@ struct tipc_group;
* @node: network address of publishing socket's node
* @port: publishing port
* @key: publication key, unique across the cluster
+ * @id: publication id
* @binding_node: all publications from the same node which bound this one
* - Remote publications: in node->publ_list
* Used by node/name distr to withdraw publications when node is lost
@@ -69,6 +70,7 @@ struct tipc_group;
* Used by closest_first and multicast receive lookup algorithms
* @all_publ: all publications identical to this one, whatever node and scope
* Used by round-robin lookup algorithm
+ * @list: to form a list of publications in temporal order
* @rcu: RCU callback head used for deferred freeing
*/
struct publication {
@@ -79,10 +81,12 @@ struct publication {
u32 node;
u32 port;
u32 key;
+ u32 id;
struct list_head binding_node;
struct list_head binding_sock;
struct list_head local_publ;
struct list_head all_publ;
+ struct list_head list;
struct rcu_head rcu;
};
diff --git a/net/tipc/net.c b/net/tipc/net.c
index 85707c185360..85400e4242de 100644
--- a/net/tipc/net.c
+++ b/net/tipc/net.c
@@ -42,6 +42,7 @@
#include "node.h"
#include "bcast.h"
#include "netlink.h"
+#include "monitor.h"
/*
* The TIPC locking policy is designed to ensure a very fine locking
@@ -136,6 +137,7 @@ static void tipc_net_finalize(struct net *net, u32 addr)
tipc_set_node_addr(net, addr);
tipc_named_reinit(net);
tipc_sk_reinit(net);
+ tipc_mon_reinit_self(net);
tipc_nametbl_publish(net, TIPC_CFG_SRV, addr, addr,
TIPC_CLUSTER_SCOPE, 0, addr);
}
@@ -300,3 +302,59 @@ int tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info)
return err;
}
+
+static int __tipc_nl_addr_legacy_get(struct net *net, struct tipc_nl_msg *msg)
+{
+ struct tipc_net *tn = tipc_net(net);
+ struct nlattr *attrs;
+ void *hdr;
+
+ hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family,
+ 0, TIPC_NL_ADDR_LEGACY_GET);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ attrs = nla_nest_start(msg->skb, TIPC_NLA_NET);
+ if (!attrs)
+ goto msg_full;
+
+ if (tn->legacy_addr_format)
+ if (nla_put_flag(msg->skb, TIPC_NLA_NET_ADDR_LEGACY))
+ goto attr_msg_full;
+
+ nla_nest_end(msg->skb, attrs);
+ genlmsg_end(msg->skb, hdr);
+
+ return 0;
+
+attr_msg_full:
+ nla_nest_cancel(msg->skb, attrs);
+msg_full:
+ genlmsg_cancel(msg->skb, hdr);
+
+ return -EMSGSIZE;
+}
+
+int tipc_nl_net_addr_legacy_get(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net *net = sock_net(skb->sk);
+ struct tipc_nl_msg msg;
+ struct sk_buff *rep;
+ int err;
+
+ rep = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!rep)
+ return -ENOMEM;
+
+ msg.skb = rep;
+ msg.portid = info->snd_portid;
+ msg.seq = info->snd_seq;
+
+ err = __tipc_nl_addr_legacy_get(net, &msg);
+ if (err) {
+ nlmsg_free(msg.skb);
+ return err;
+ }
+
+ return genlmsg_reply(msg.skb, info);
+}
diff --git a/net/tipc/net.h b/net/tipc/net.h
index b7f2e364eb99..6740d97c706e 100644
--- a/net/tipc/net.h
+++ b/net/tipc/net.h
@@ -47,5 +47,6 @@ void tipc_net_stop(struct net *net);
int tipc_nl_net_dump(struct sk_buff *skb, struct netlink_callback *cb);
int tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info);
int __tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info);
+int tipc_nl_net_addr_legacy_get(struct sk_buff *skb, struct genl_info *info);
#endif
diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c
index d6165ad384c0..7c35094c20b8 100644
--- a/net/tipc/netlink.c
+++ b/net/tipc/netlink.c
@@ -83,6 +83,7 @@ const struct nla_policy tipc_nl_net_policy[TIPC_NLA_NET_MAX + 1] = {
[TIPC_NLA_NET_ADDR] = { .type = NLA_U32 },
[TIPC_NLA_NET_NODEID] = { .type = NLA_U64 },
[TIPC_NLA_NET_NODEID_W1] = { .type = NLA_U64 },
+ [TIPC_NLA_NET_ADDR_LEGACY] = { .type = NLA_FLAG }
};
const struct nla_policy tipc_nl_link_policy[TIPC_NLA_LINK_MAX + 1] = {
@@ -102,7 +103,11 @@ const struct nla_policy tipc_nl_link_policy[TIPC_NLA_LINK_MAX + 1] = {
const struct nla_policy tipc_nl_node_policy[TIPC_NLA_NODE_MAX + 1] = {
[TIPC_NLA_NODE_UNSPEC] = { .type = NLA_UNSPEC },
[TIPC_NLA_NODE_ADDR] = { .type = NLA_U32 },
- [TIPC_NLA_NODE_UP] = { .type = NLA_FLAG }
+ [TIPC_NLA_NODE_UP] = { .type = NLA_FLAG },
+ [TIPC_NLA_NODE_ID] = { .type = NLA_BINARY,
+ .len = TIPC_NODEID_LEN},
+ [TIPC_NLA_NODE_KEY] = { .type = NLA_BINARY,
+ .len = TIPC_AEAD_KEY_SIZE_MAX},
};
/* Properties valid for media, bearer and link */
@@ -176,7 +181,8 @@ static const struct genl_ops tipc_genl_v2_ops[] = {
},
{
.cmd = TIPC_NL_PUBL_GET,
- .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .validate = GENL_DONT_VALIDATE_STRICT |
+ GENL_DONT_VALIDATE_DUMP_STRICT,
.dumpit = tipc_nl_publ_dump,
},
{
@@ -239,7 +245,8 @@ static const struct genl_ops tipc_genl_v2_ops[] = {
},
{
.cmd = TIPC_NL_MON_PEER_GET,
- .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .validate = GENL_DONT_VALIDATE_STRICT |
+ GENL_DONT_VALIDATE_DUMP_STRICT,
.dumpit = tipc_nl_node_dump_monitor_peer,
},
{
@@ -250,10 +257,28 @@ static const struct genl_ops tipc_genl_v2_ops[] = {
#ifdef CONFIG_TIPC_MEDIA_UDP
{
.cmd = TIPC_NL_UDP_GET_REMOTEIP,
- .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .validate = GENL_DONT_VALIDATE_STRICT |
+ GENL_DONT_VALIDATE_DUMP_STRICT,
.dumpit = tipc_udp_nl_dump_remoteip,
},
#endif
+#ifdef CONFIG_TIPC_CRYPTO
+ {
+ .cmd = TIPC_NL_KEY_SET,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = tipc_nl_node_set_key,
+ },
+ {
+ .cmd = TIPC_NL_KEY_FLUSH,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = tipc_nl_node_flush_key,
+ },
+#endif
+ {
+ .cmd = TIPC_NL_ADDR_LEGACY_GET,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = tipc_nl_net_addr_legacy_get,
+ },
};
struct genl_family tipc_genl_family __ro_after_init = {
@@ -268,18 +293,6 @@ struct genl_family tipc_genl_family __ro_after_init = {
.n_ops = ARRAY_SIZE(tipc_genl_v2_ops),
};
-int tipc_nlmsg_parse(const struct nlmsghdr *nlh, struct nlattr ***attr)
-{
- u32 maxattr = tipc_genl_family.maxattr;
-
- *attr = genl_family_attrbuf(&tipc_genl_family);
- if (!*attr)
- return -EOPNOTSUPP;
-
- return nlmsg_parse_deprecated(nlh, GENL_HDRLEN, *attr, maxattr,
- tipc_nl_policy, NULL);
-}
-
int __init tipc_netlink_start(void)
{
int res;
diff --git a/net/tipc/netlink.h b/net/tipc/netlink.h
index 4ba0ad422110..7cf777723e3e 100644
--- a/net/tipc/netlink.h
+++ b/net/tipc/netlink.h
@@ -38,7 +38,6 @@
#include <net/netlink.h>
extern struct genl_family tipc_genl_family;
-int tipc_nlmsg_parse(const struct nlmsghdr *nlh, struct nlattr ***buf);
struct tipc_nl_msg {
struct sk_buff *skb;
diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c
index e135d4e11231..217516357ef2 100644
--- a/net/tipc/netlink_compat.c
+++ b/net/tipc/netlink_compat.c
@@ -181,15 +181,18 @@ static int __tipc_nl_compat_dumpit(struct tipc_nl_compat_cmd_dump *cmd,
struct tipc_nl_compat_msg *msg,
struct sk_buff *arg)
{
+ struct genl_dumpit_info info;
int len = 0;
int err;
struct sk_buff *buf;
struct nlmsghdr *nlmsg;
struct netlink_callback cb;
+ struct nlattr **attrbuf;
memset(&cb, 0, sizeof(cb));
cb.nlh = (struct nlmsghdr *)arg->data;
cb.skb = arg;
+ cb.data = &info;
buf = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
if (!buf)
@@ -201,19 +204,35 @@ static int __tipc_nl_compat_dumpit(struct tipc_nl_compat_cmd_dump *cmd,
return -ENOMEM;
}
+ attrbuf = kcalloc(tipc_genl_family.maxattr + 1,
+ sizeof(struct nlattr *), GFP_KERNEL);
+ if (!attrbuf) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+
+ info.attrs = attrbuf;
+ err = nlmsg_parse_deprecated(cb.nlh, GENL_HDRLEN, attrbuf,
+ tipc_genl_family.maxattr,
+ tipc_genl_family.policy, NULL);
+ if (err)
+ goto err_out;
+
do {
int rem;
len = (*cmd->dumpit)(buf, &cb);
nlmsg_for_each_msg(nlmsg, nlmsg_hdr(buf), len, rem) {
- struct nlattr **attrs;
-
- err = tipc_nlmsg_parse(nlmsg, &attrs);
+ err = nlmsg_parse_deprecated(nlmsg, GENL_HDRLEN,
+ attrbuf,
+ tipc_genl_family.maxattr,
+ tipc_genl_family.policy,
+ NULL);
if (err)
goto err_out;
- err = (*cmd->format)(msg, attrs);
+ err = (*cmd->format)(msg, attrbuf);
if (err)
goto err_out;
@@ -231,6 +250,7 @@ static int __tipc_nl_compat_dumpit(struct tipc_nl_compat_cmd_dump *cmd,
err = 0;
err_out:
+ kfree(attrbuf);
tipc_dump_done(&cb);
kfree_skb(buf);
@@ -550,7 +570,7 @@ static int tipc_nl_compat_link_stat_dump(struct tipc_nl_compat_msg *msg,
if (len <= 0)
return -EINVAL;
- len = min_t(int, len, TIPC_MAX_BEARER_NAME);
+ len = min_t(int, len, TIPC_MAX_LINK_NAME);
if (!string_is_valid(name, len))
return -EINVAL;
@@ -822,7 +842,7 @@ static int tipc_nl_compat_link_reset_stats(struct tipc_nl_compat_cmd_doit *cmd,
if (len <= 0)
return -EINVAL;
- len = min_t(int, len, TIPC_MAX_BEARER_NAME);
+ len = min_t(int, len, TIPC_MAX_LINK_NAME);
if (!string_is_valid(name, len))
return -EINVAL;
diff --git a/net/tipc/node.c b/net/tipc/node.c
index 3a5be1d7e572..0c88778c88b5 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -44,6 +44,7 @@
#include "discover.h"
#include "netlink.h"
#include "trace.h"
+#include "crypto.h"
#define INVALID_NODE_SIG 0x10000
#define NODE_CLEANUP_AFTER 300000
@@ -89,6 +90,7 @@ struct tipc_bclink_entry {
* @links: array containing references to all links to node
* @action_flags: bit mask of different types of node actions
* @state: connectivity state vs peer node
+ * @preliminary: a preliminary node or not
* @sync_point: sequence number where synch/failover is finished
* @list: links to adjacent nodes in sorted list of cluster's nodes
* @working_links: number of working links to node (both active and standby)
@@ -99,6 +101,7 @@ struct tipc_bclink_entry {
* @publ_list: list of publications
* @rcu: rcu struct for tipc_node
* @delete_at: indicates the time for deleting a down node
+ * @crypto_rx: RX crypto handler
*/
struct tipc_node {
u32 addr;
@@ -112,6 +115,7 @@ struct tipc_node {
int action_flags;
struct list_head list;
int state;
+ bool preliminary;
bool failover_sent;
u16 sync_point;
int link_cnt;
@@ -120,12 +124,18 @@ struct tipc_node {
u32 signature;
u32 link_id;
u8 peer_id[16];
+ char peer_id_string[NODE_ID_STR_LEN];
struct list_head publ_list;
struct list_head conn_sks;
unsigned long keepalive_intv;
struct timer_list timer;
struct rcu_head rcu;
unsigned long delete_at;
+ struct net *peer_net;
+ u32 peer_hash_mix;
+#ifdef CONFIG_TIPC_CRYPTO
+ struct tipc_crypto *crypto_rx;
+#endif
};
/* Node FSM states and events:
@@ -163,7 +173,6 @@ static void tipc_node_timeout(struct timer_list *t);
static void tipc_node_fsm_evt(struct tipc_node *n, int evt);
static struct tipc_node *tipc_node_find(struct net *net, u32 addr);
static struct tipc_node *tipc_node_find_by_id(struct net *net, u8 *id);
-static void tipc_node_put(struct tipc_node *node);
static bool node_is_up(struct tipc_node *n);
static void tipc_node_delete_from_list(struct tipc_node *node);
@@ -184,7 +193,7 @@ static struct tipc_link *node_active_link(struct tipc_node *n, int sel)
return n->links[bearer_id].link;
}
-int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel)
+int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel, bool connected)
{
struct tipc_node *n;
int bearer_id;
@@ -194,6 +203,14 @@ int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel)
if (unlikely(!n))
return mtu;
+ /* Allow MAX_MSG_SIZE when building connection oriented message
+ * if they are in the same core network
+ */
+ if (n->peer_net && connected) {
+ tipc_node_put(n);
+ return mtu;
+ }
+
bearer_id = n->active_links[sel & 1];
if (likely(bearer_id != INVALID_BEARER_ID))
mtu = n->links[bearer_id].mtu;
@@ -235,15 +252,51 @@ u16 tipc_node_get_capabilities(struct net *net, u32 addr)
return caps;
}
+u32 tipc_node_get_addr(struct tipc_node *node)
+{
+ return (node) ? node->addr : 0;
+}
+
+char *tipc_node_get_id_str(struct tipc_node *node)
+{
+ return node->peer_id_string;
+}
+
+#ifdef CONFIG_TIPC_CRYPTO
+/**
+ * tipc_node_crypto_rx - Retrieve crypto RX handle from node
+ * Note: node ref counter must be held first!
+ */
+struct tipc_crypto *tipc_node_crypto_rx(struct tipc_node *__n)
+{
+ return (__n) ? __n->crypto_rx : NULL;
+}
+
+struct tipc_crypto *tipc_node_crypto_rx_by_list(struct list_head *pos)
+{
+ return container_of(pos, struct tipc_node, list)->crypto_rx;
+}
+#endif
+
+static void tipc_node_free(struct rcu_head *rp)
+{
+ struct tipc_node *n = container_of(rp, struct tipc_node, rcu);
+
+#ifdef CONFIG_TIPC_CRYPTO
+ tipc_crypto_stop(&n->crypto_rx);
+#endif
+ kfree(n);
+}
+
static void tipc_node_kref_release(struct kref *kref)
{
struct tipc_node *n = container_of(kref, struct tipc_node, kref);
kfree(n->bc_entry.link);
- kfree_rcu(n, rcu);
+ call_rcu(&n->rcu, tipc_node_free);
}
-static void tipc_node_put(struct tipc_node *node)
+void tipc_node_put(struct tipc_node *node)
{
kref_put(&node->kref, tipc_node_kref_release);
}
@@ -264,7 +317,7 @@ static struct tipc_node *tipc_node_find(struct net *net, u32 addr)
rcu_read_lock();
hlist_for_each_entry_rcu(node, &tn->node_htable[thash], hash) {
- if (node->addr != addr)
+ if (node->addr != addr || node->preliminary)
continue;
if (!kref_get_unless_zero(&node->kref))
node = NULL;
@@ -360,18 +413,71 @@ static void tipc_node_write_unlock(struct tipc_node *n)
}
}
-static struct tipc_node *tipc_node_create(struct net *net, u32 addr,
- u8 *peer_id, u16 capabilities)
+static void tipc_node_assign_peer_net(struct tipc_node *n, u32 hash_mixes)
+{
+ int net_id = tipc_netid(n->net);
+ struct tipc_net *tn_peer;
+ struct net *tmp;
+ u32 hash_chk;
+
+ if (n->peer_net)
+ return;
+
+ for_each_net_rcu(tmp) {
+ tn_peer = tipc_net(tmp);
+ if (!tn_peer)
+ continue;
+ /* Integrity checking whether node exists in namespace or not */
+ if (tn_peer->net_id != net_id)
+ continue;
+ if (memcmp(n->peer_id, tn_peer->node_id, NODE_ID_LEN))
+ continue;
+ hash_chk = tipc_net_hash_mixes(tmp, tn_peer->random);
+ if (hash_mixes ^ hash_chk)
+ continue;
+ n->peer_net = tmp;
+ n->peer_hash_mix = hash_mixes;
+ break;
+ }
+}
+
+struct tipc_node *tipc_node_create(struct net *net, u32 addr, u8 *peer_id,
+ u16 capabilities, u32 hash_mixes,
+ bool preliminary)
{
struct tipc_net *tn = net_generic(net, tipc_net_id);
struct tipc_node *n, *temp_node;
struct tipc_link *l;
+ unsigned long intv;
int bearer_id;
int i;
spin_lock_bh(&tn->node_list_lock);
- n = tipc_node_find(net, addr);
+ n = tipc_node_find(net, addr) ?:
+ tipc_node_find_by_id(net, peer_id);
if (n) {
+ if (!n->preliminary)
+ goto update;
+ if (preliminary)
+ goto exit;
+ /* A preliminary node becomes "real" now, refresh its data */
+ tipc_node_write_lock(n);
+ n->preliminary = false;
+ n->addr = addr;
+ hlist_del_rcu(&n->hash);
+ hlist_add_head_rcu(&n->hash,
+ &tn->node_htable[tipc_hashfn(addr)]);
+ list_del_rcu(&n->list);
+ list_for_each_entry_rcu(temp_node, &tn->node_list, list) {
+ if (n->addr < temp_node->addr)
+ break;
+ }
+ list_add_tail_rcu(&n->list, &temp_node->list);
+ tipc_node_write_unlock_fast(n);
+
+update:
+ if (n->peer_hash_mix ^ hash_mixes)
+ tipc_node_assign_peer_net(n, hash_mixes);
if (n->capabilities == capabilities)
goto exit;
/* Same node may come back with new capabilities */
@@ -389,6 +495,10 @@ static struct tipc_node *tipc_node_create(struct net *net, u32 addr,
list_for_each_entry_rcu(temp_node, &tn->node_list, list) {
tn->capabilities &= temp_node->capabilities;
}
+
+ tipc_bcast_toggle_rcast(net,
+ (tn->capabilities & TIPC_BCAST_RCAST));
+
goto exit;
}
n = kzalloc(sizeof(*n), GFP_ATOMIC);
@@ -396,9 +506,23 @@ static struct tipc_node *tipc_node_create(struct net *net, u32 addr,
pr_warn("Node creation failed, no memory\n");
goto exit;
}
+ tipc_nodeid2string(n->peer_id_string, peer_id);
+#ifdef CONFIG_TIPC_CRYPTO
+ if (unlikely(tipc_crypto_start(&n->crypto_rx, net, n))) {
+ pr_warn("Failed to start crypto RX(%s)!\n", n->peer_id_string);
+ kfree(n);
+ n = NULL;
+ goto exit;
+ }
+#endif
n->addr = addr;
+ n->preliminary = preliminary;
memcpy(&n->peer_id, peer_id, 16);
n->net = net;
+ n->peer_net = NULL;
+ n->peer_hash_mix = 0;
+ /* Assign kernel local namespace if exists */
+ tipc_node_assign_peer_net(n, hash_mixes);
n->capabilities = capabilities;
kref_init(&n->kref);
rwlock_init(&n->lock);
@@ -417,22 +541,14 @@ static struct tipc_node *tipc_node_create(struct net *net, u32 addr,
n->signature = INVALID_NODE_SIG;
n->active_links[0] = INVALID_BEARER_ID;
n->active_links[1] = INVALID_BEARER_ID;
- if (!tipc_link_bc_create(net, tipc_own_addr(net),
- addr, U16_MAX,
- tipc_link_window(tipc_bc_sndlink(net)),
- n->capabilities,
- &n->bc_entry.inputq1,
- &n->bc_entry.namedq,
- tipc_bc_sndlink(net),
- &n->bc_entry.link)) {
- pr_warn("Broadcast rcv link creation failed, no memory\n");
- kfree(n);
- n = NULL;
- goto exit;
- }
+ n->bc_entry.link = NULL;
tipc_node_get(n);
timer_setup(&n->timer, tipc_node_timeout, 0);
- n->keepalive_intv = U32_MAX;
+ /* Start a slow timer anyway, crypto needs it */
+ n->keepalive_intv = 10000;
+ intv = jiffies + msecs_to_jiffies(n->keepalive_intv);
+ if (!mod_timer(&n->timer, intv))
+ tipc_node_get(n);
hlist_add_head_rcu(&n->hash, &tn->node_htable[tipc_hashfn(addr)]);
list_for_each_entry_rcu(temp_node, &tn->node_list, list) {
if (n->addr < temp_node->addr)
@@ -444,6 +560,7 @@ static struct tipc_node *tipc_node_create(struct net *net, u32 addr,
list_for_each_entry_rcu(temp_node, &tn->node_list, list) {
tn->capabilities &= temp_node->capabilities;
}
+ tipc_bcast_toggle_rcast(net, (tn->capabilities & TIPC_BCAST_RCAST));
trace_tipc_node_create(n, true, " ");
exit:
spin_unlock_bh(&tn->node_list_lock);
@@ -617,12 +734,18 @@ static bool tipc_node_cleanup(struct tipc_node *peer)
}
tipc_node_write_unlock(peer);
+ if (!deleted) {
+ spin_unlock_bh(&tn->node_list_lock);
+ return deleted;
+ }
+
/* Calculate cluster capabilities */
tn->capabilities = TIPC_NODE_CAPABILITIES;
list_for_each_entry_rcu(temp_node, &tn->node_list, list) {
tn->capabilities &= temp_node->capabilities;
}
-
+ tipc_bcast_toggle_rcast(peer->net,
+ (tn->capabilities & TIPC_BCAST_RCAST));
spin_unlock_bh(&tn->node_list_lock);
return deleted;
}
@@ -645,6 +768,10 @@ static void tipc_node_timeout(struct timer_list *t)
return;
}
+#ifdef CONFIG_TIPC_CRYPTO
+ /* Take any crypto key related actions first */
+ tipc_crypto_timeout(n->crypto_rx);
+#endif
__skb_queue_head_init(&xmitq);
/* Initial node interval to value larger (10 seconds), then it will be
@@ -665,7 +792,7 @@ static void tipc_node_timeout(struct timer_list *t)
remains--;
}
tipc_node_read_unlock(n);
- tipc_bearer_xmit(n->net, bearer_id, &xmitq, &le->maddr);
+ tipc_bearer_xmit(n->net, bearer_id, &xmitq, &le->maddr, n);
if (rc & TIPC_LINK_DOWN_EVT)
tipc_node_link_down(n, bearer_id, false);
}
@@ -697,7 +824,7 @@ static void __tipc_node_link_up(struct tipc_node *n, int bearer_id,
n->link_id = tipc_link_id(nl);
/* Leave room for tunnel header when returning 'mtu' to users: */
- n->links[bearer_id].mtu = tipc_link_mtu(nl) - INT_H_SIZE;
+ n->links[bearer_id].mtu = tipc_link_mss(nl);
tipc_bearer_add_dest(n->net, bearer_id, n->addr);
tipc_bcast_inc_bearer_dst_cnt(n->net, bearer_id);
@@ -751,7 +878,7 @@ static void tipc_node_link_up(struct tipc_node *n, int bearer_id,
tipc_node_write_lock(n);
__tipc_node_link_up(n, bearer_id, xmitq);
maddr = &n->links[bearer_id].maddr;
- tipc_bearer_xmit(n->net, bearer_id, xmitq, maddr);
+ tipc_bearer_xmit(n->net, bearer_id, xmitq, maddr, n);
tipc_node_write_unlock(n);
}
@@ -906,7 +1033,7 @@ static void tipc_node_link_down(struct tipc_node *n, int bearer_id, bool delete)
if (delete)
tipc_mon_remove_peer(n->net, n->addr, old_bearer_id);
if (!skb_queue_empty(&xmitq))
- tipc_bearer_xmit(n->net, bearer_id, &xmitq, maddr);
+ tipc_bearer_xmit(n->net, bearer_id, &xmitq, maddr, n);
tipc_sk_rcv(n->net, &le->inputq);
}
@@ -950,6 +1077,8 @@ u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr)
{
struct tipc_net *tn = tipc_net(net);
struct tipc_node *n;
+ bool preliminary;
+ u32 sugg_addr;
/* Suggest new address if some other peer is using this one */
n = tipc_node_find(net, addr);
@@ -965,9 +1094,11 @@ u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr)
/* Suggest previously used address if peer is known */
n = tipc_node_find_by_id(net, id);
if (n) {
- addr = n->addr;
+ sugg_addr = n->addr;
+ preliminary = n->preliminary;
tipc_node_put(n);
- return addr;
+ if (!preliminary)
+ return sugg_addr;
}
/* Even this node may be in conflict */
@@ -979,12 +1110,12 @@ u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr)
void tipc_node_check_dest(struct net *net, u32 addr,
u8 *peer_id, struct tipc_bearer *b,
- u16 capabilities, u32 signature,
+ u16 capabilities, u32 signature, u32 hash_mixes,
struct tipc_media_addr *maddr,
bool *respond, bool *dupl_addr)
{
struct tipc_node *n;
- struct tipc_link *l;
+ struct tipc_link *l, *snd_l;
struct tipc_link_entry *le;
bool addr_match = false;
bool sign_match = false;
@@ -998,11 +1129,28 @@ void tipc_node_check_dest(struct net *net, u32 addr,
*dupl_addr = false;
*respond = false;
- n = tipc_node_create(net, addr, peer_id, capabilities);
+ n = tipc_node_create(net, addr, peer_id, capabilities, hash_mixes,
+ false);
if (!n)
return;
tipc_node_write_lock(n);
+ if (unlikely(!n->bc_entry.link)) {
+ snd_l = tipc_bc_sndlink(net);
+ if (!tipc_link_bc_create(net, tipc_own_addr(net),
+ addr, U16_MAX,
+ tipc_link_min_win(snd_l),
+ tipc_link_max_win(snd_l),
+ n->capabilities,
+ &n->bc_entry.inputq1,
+ &n->bc_entry.namedq, snd_l,
+ &n->bc_entry.link)) {
+ pr_warn("Broadcast rcv link creation failed, no mem\n");
+ tipc_node_write_unlock_fast(n);
+ tipc_node_put(n);
+ return;
+ }
+ }
le = &n->links[b->identity];
@@ -1017,6 +1165,9 @@ void tipc_node_check_dest(struct net *net, u32 addr,
if (sign_match && addr_match && link_up) {
/* All is fine. Do nothing. */
reset = false;
+ /* Peer node is not a container/local namespace */
+ if (!n->peer_hash_mix)
+ n->peer_hash_mix = hash_mixes;
} else if (sign_match && addr_match && !link_up) {
/* Respond. The link will come up in due time */
*respond = true;
@@ -1083,7 +1234,7 @@ void tipc_node_check_dest(struct net *net, u32 addr,
get_random_bytes(&session, sizeof(u16));
if (!tipc_link_create(net, if_name, b->identity, b->tolerance,
b->net_plane, b->mtu, b->priority,
- b->window, session,
+ b->min_win, b->max_win, session,
tipc_own_addr(net), addr, peer_id,
n->capabilities,
tipc_bc_sndlink(n->net), n->bc_entry.link,
@@ -1342,7 +1493,8 @@ static void node_lost_contact(struct tipc_node *n,
/* Notify publications from this node */
n->action_flags |= TIPC_NOTIFY_NODE_DOWN;
-
+ n->peer_net = NULL;
+ n->peer_hash_mix = 0;
/* Notify sockets connected to node */
list_for_each_entry_safe(conn, safe, conns, list) {
skb = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, TIPC_CONN_MSG,
@@ -1424,6 +1576,56 @@ msg_full:
return -EMSGSIZE;
}
+static void tipc_lxc_xmit(struct net *peer_net, struct sk_buff_head *list)
+{
+ struct tipc_msg *hdr = buf_msg(skb_peek(list));
+ struct sk_buff_head inputq;
+
+ switch (msg_user(hdr)) {
+ case TIPC_LOW_IMPORTANCE:
+ case TIPC_MEDIUM_IMPORTANCE:
+ case TIPC_HIGH_IMPORTANCE:
+ case TIPC_CRITICAL_IMPORTANCE:
+ if (msg_connected(hdr) || msg_named(hdr)) {
+ tipc_loopback_trace(peer_net, list);
+ spin_lock_init(&list->lock);
+ tipc_sk_rcv(peer_net, list);
+ return;
+ }
+ if (msg_mcast(hdr)) {
+ tipc_loopback_trace(peer_net, list);
+ skb_queue_head_init(&inputq);
+ tipc_sk_mcast_rcv(peer_net, list, &inputq);
+ __skb_queue_purge(list);
+ skb_queue_purge(&inputq);
+ return;
+ }
+ return;
+ case MSG_FRAGMENTER:
+ if (tipc_msg_assemble(list)) {
+ tipc_loopback_trace(peer_net, list);
+ skb_queue_head_init(&inputq);
+ tipc_sk_mcast_rcv(peer_net, list, &inputq);
+ __skb_queue_purge(list);
+ skb_queue_purge(&inputq);
+ }
+ return;
+ case GROUP_PROTOCOL:
+ case CONN_MANAGER:
+ tipc_loopback_trace(peer_net, list);
+ spin_lock_init(&list->lock);
+ tipc_sk_rcv(peer_net, list);
+ return;
+ case LINK_PROTOCOL:
+ case NAME_DISTRIBUTOR:
+ case TUNNEL_PROTOCOL:
+ case BCAST_PROTOCOL:
+ return;
+ default:
+ return;
+ };
+}
+
/**
* tipc_node_xmit() is the general link level function for message sending
* @net: the applicable net namespace
@@ -1439,26 +1641,40 @@ int tipc_node_xmit(struct net *net, struct sk_buff_head *list,
struct tipc_link_entry *le = NULL;
struct tipc_node *n;
struct sk_buff_head xmitq;
+ bool node_up = false;
int bearer_id;
int rc;
if (in_own_node(net, dnode)) {
+ tipc_loopback_trace(net, list);
+ spin_lock_init(&list->lock);
tipc_sk_rcv(net, list);
return 0;
}
n = tipc_node_find(net, dnode);
if (unlikely(!n)) {
- skb_queue_purge(list);
+ __skb_queue_purge(list);
return -EHOSTUNREACH;
}
tipc_node_read_lock(n);
+ node_up = node_is_up(n);
+ if (node_up && n->peer_net && check_net(n->peer_net)) {
+ /* xmit inner linux container */
+ tipc_lxc_xmit(n->peer_net, list);
+ if (likely(skb_queue_empty(list))) {
+ tipc_node_read_unlock(n);
+ tipc_node_put(n);
+ return 0;
+ }
+ }
+
bearer_id = n->active_links[selector & 1];
if (unlikely(bearer_id == INVALID_BEARER_ID)) {
tipc_node_read_unlock(n);
tipc_node_put(n);
- skb_queue_purge(list);
+ __skb_queue_purge(list);
return -EHOSTUNREACH;
}
@@ -1472,7 +1688,7 @@ int tipc_node_xmit(struct net *net, struct sk_buff_head *list,
if (unlikely(rc == -ENOBUFS))
tipc_node_link_down(n, bearer_id, false);
else
- tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr);
+ tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr, n);
tipc_node_put(n);
@@ -1490,7 +1706,7 @@ int tipc_node_xmit_skb(struct net *net, struct sk_buff *skb, u32 dnode,
{
struct sk_buff_head head;
- skb_queue_head_init(&head);
+ __skb_queue_head_init(&head);
__skb_queue_tail(&head, skb);
tipc_node_xmit(net, &head, dnode, selector);
return 0;
@@ -1620,7 +1836,7 @@ static void tipc_node_bc_rcv(struct net *net, struct sk_buff *skb, int bearer_id
}
if (!skb_queue_empty(&xmitq))
- tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr);
+ tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr, n);
if (!skb_queue_empty(&be->inputq1))
tipc_node_mcast_rcv(n);
@@ -1649,7 +1865,6 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb,
int usr = msg_user(hdr);
int mtyp = msg_type(hdr);
u16 oseqno = msg_seqno(hdr);
- u16 iseqno = msg_seqno(msg_inner_hdr(hdr));
u16 exp_pkts = msg_msgcnt(hdr);
u16 rcv_nxt, syncpt, dlv_nxt, inputq_len;
int state = n->state;
@@ -1748,7 +1963,10 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb,
/* Initiate synch mode if applicable */
if ((usr == TUNNEL_PROTOCOL) && (mtyp == SYNCH_MSG) && (oseqno == 1)) {
- syncpt = iseqno + exp_pkts - 1;
+ if (n->capabilities & TIPC_TUNNEL_ENHANCED)
+ syncpt = msg_syncpt(hdr);
+ else
+ syncpt = msg_seqno(msg_inner_hdr(hdr)) + exp_pkts - 1;
if (!tipc_link_is_up(l))
__tipc_node_link_up(n, bearer_id, xmitq);
if (n->state == SELF_UP_PEER_UP) {
@@ -1796,20 +2014,38 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb,
void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b)
{
struct sk_buff_head xmitq;
- struct tipc_node *n;
+ struct tipc_link_entry *le;
struct tipc_msg *hdr;
+ struct tipc_node *n;
int bearer_id = b->identity;
- struct tipc_link_entry *le;
u32 self = tipc_own_addr(net);
int usr, rc = 0;
u16 bc_ack;
+#ifdef CONFIG_TIPC_CRYPTO
+ struct tipc_ehdr *ehdr;
- __skb_queue_head_init(&xmitq);
+ /* Check if message must be decrypted first */
+ if (TIPC_SKB_CB(skb)->decrypted || !tipc_ehdr_validate(skb))
+ goto rcv;
+
+ ehdr = (struct tipc_ehdr *)skb->data;
+ if (likely(ehdr->user != LINK_CONFIG)) {
+ n = tipc_node_find(net, ntohl(ehdr->addr));
+ if (unlikely(!n))
+ goto discard;
+ } else {
+ n = tipc_node_find_by_id(net, ehdr->id);
+ }
+ tipc_crypto_rcv(net, (n) ? n->crypto_rx : NULL, &skb, b);
+ if (!skb)
+ return;
+rcv:
+#endif
/* Ensure message is well-formed before touching the header */
- TIPC_SKB_CB(skb)->validated = false;
if (unlikely(!tipc_msg_validate(&skb)))
goto discard;
+ __skb_queue_head_init(&xmitq);
hdr = buf_msg(skb);
usr = msg_user(hdr);
bc_ack = msg_bcast_ack(hdr);
@@ -1880,7 +2116,7 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b)
tipc_sk_rcv(net, &le->inputq);
if (!skb_queue_empty(&xmitq))
- tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr);
+ tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr, n);
tipc_node_put(n);
discard:
@@ -1911,7 +2147,7 @@ void tipc_node_apply_property(struct net *net, struct tipc_bearer *b,
tipc_link_set_mtu(e->link, b->mtu);
}
tipc_node_write_unlock(n);
- tipc_bearer_xmit(net, bearer_id, &xmitq, &e->maddr);
+ tipc_bearer_xmit(net, bearer_id, &xmitq, &e->maddr, NULL);
}
rcu_read_unlock();
@@ -1922,7 +2158,7 @@ int tipc_nl_peer_rm(struct sk_buff *skb, struct genl_info *info)
struct net *net = sock_net(skb->sk);
struct tipc_net *tn = net_generic(net, tipc_net_id);
struct nlattr *attrs[TIPC_NLA_NET_MAX + 1];
- struct tipc_node *peer;
+ struct tipc_node *peer, *temp_node;
u32 addr;
int err;
@@ -1963,6 +2199,12 @@ int tipc_nl_peer_rm(struct sk_buff *skb, struct genl_info *info)
tipc_node_write_unlock(peer);
tipc_node_delete(peer);
+ /* Calculate cluster capabilities */
+ tn->capabilities = TIPC_NODE_CAPABILITIES;
+ list_for_each_entry_rcu(temp_node, &tn->node_list, list) {
+ tn->capabilities &= temp_node->capabilities;
+ }
+ tipc_bcast_toggle_rcast(net, (tn->capabilities & TIPC_BCAST_RCAST));
err = 0;
err_out:
tipc_node_put(peer);
@@ -2007,6 +2249,8 @@ int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb)
}
list_for_each_entry_rcu(node, &tn->node_list, list) {
+ if (node->preliminary)
+ continue;
if (last_addr) {
if (node->addr == last_addr)
last_addr = 0;
@@ -2117,8 +2361,7 @@ int tipc_nl_node_set_link(struct sk_buff *skb, struct genl_info *info)
if (attrs[TIPC_NLA_LINK_PROP]) {
struct nlattr *props[TIPC_NLA_PROP_MAX + 1];
- err = tipc_nl_parse_link_prop(attrs[TIPC_NLA_LINK_PROP],
- props);
+ err = tipc_nl_parse_link_prop(attrs[TIPC_NLA_LINK_PROP], props);
if (err) {
res = err;
goto out;
@@ -2137,16 +2380,19 @@ int tipc_nl_node_set_link(struct sk_buff *skb, struct genl_info *info)
tipc_link_set_prio(link, prio, &xmitq);
}
if (props[TIPC_NLA_PROP_WIN]) {
- u32 win;
+ u32 max_win;
- win = nla_get_u32(props[TIPC_NLA_PROP_WIN]);
- tipc_link_set_queue_limits(link, win);
+ max_win = nla_get_u32(props[TIPC_NLA_PROP_WIN]);
+ tipc_link_set_queue_limits(link,
+ tipc_link_min_win(link),
+ max_win);
}
}
out:
tipc_node_read_unlock(node);
- tipc_bearer_xmit(net, bearer_id, &xmitq, &node->links[bearer_id].maddr);
+ tipc_bearer_xmit(net, bearer_id, &xmitq, &node->links[bearer_id].maddr,
+ NULL);
return res;
}
@@ -2480,13 +2726,9 @@ int tipc_nl_node_dump_monitor_peer(struct sk_buff *skb,
int err;
if (!prev_node) {
- struct nlattr **attrs;
+ struct nlattr **attrs = genl_dumpit_info(cb)->attrs;
struct nlattr *mon[TIPC_NLA_MON_MAX + 1];
- err = tipc_nlmsg_parse(cb->nlh, &attrs);
- if (err)
- return err;
-
if (!attrs[TIPC_NLA_MON])
return -EINVAL;
@@ -2526,11 +2768,142 @@ int tipc_nl_node_dump_monitor_peer(struct sk_buff *skb,
return skb->len;
}
-u32 tipc_node_get_addr(struct tipc_node *node)
+#ifdef CONFIG_TIPC_CRYPTO
+static int tipc_nl_retrieve_key(struct nlattr **attrs,
+ struct tipc_aead_key **key)
{
- return (node) ? node->addr : 0;
+ struct nlattr *attr = attrs[TIPC_NLA_NODE_KEY];
+
+ if (!attr)
+ return -ENODATA;
+
+ *key = (struct tipc_aead_key *)nla_data(attr);
+ if (nla_len(attr) < tipc_aead_key_size(*key))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int tipc_nl_retrieve_nodeid(struct nlattr **attrs, u8 **node_id)
+{
+ struct nlattr *attr = attrs[TIPC_NLA_NODE_ID];
+
+ if (!attr)
+ return -ENODATA;
+
+ if (nla_len(attr) < TIPC_NODEID_LEN)
+ return -EINVAL;
+
+ *node_id = (u8 *)nla_data(attr);
+ return 0;
+}
+
+static int __tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr *attrs[TIPC_NLA_NODE_MAX + 1];
+ struct net *net = sock_net(skb->sk);
+ struct tipc_net *tn = tipc_net(net);
+ struct tipc_node *n = NULL;
+ struct tipc_aead_key *ukey;
+ struct tipc_crypto *c;
+ u8 *id, *own_id;
+ int rc = 0;
+
+ if (!info->attrs[TIPC_NLA_NODE])
+ return -EINVAL;
+
+ rc = nla_parse_nested(attrs, TIPC_NLA_NODE_MAX,
+ info->attrs[TIPC_NLA_NODE],
+ tipc_nl_node_policy, info->extack);
+ if (rc)
+ goto exit;
+
+ own_id = tipc_own_id(net);
+ if (!own_id) {
+ rc = -EPERM;
+ goto exit;
+ }
+
+ rc = tipc_nl_retrieve_key(attrs, &ukey);
+ if (rc)
+ goto exit;
+
+ rc = tipc_aead_key_validate(ukey);
+ if (rc)
+ goto exit;
+
+ rc = tipc_nl_retrieve_nodeid(attrs, &id);
+ switch (rc) {
+ case -ENODATA:
+ /* Cluster key mode */
+ rc = tipc_crypto_key_init(tn->crypto_tx, ukey, CLUSTER_KEY);
+ break;
+ case 0:
+ /* Per-node key mode */
+ if (!memcmp(id, own_id, NODE_ID_LEN)) {
+ c = tn->crypto_tx;
+ } else {
+ n = tipc_node_find_by_id(net, id) ?:
+ tipc_node_create(net, 0, id, 0xffffu, 0, true);
+ if (unlikely(!n)) {
+ rc = -ENOMEM;
+ break;
+ }
+ c = n->crypto_rx;
+ }
+
+ rc = tipc_crypto_key_init(c, ukey, PER_NODE_KEY);
+ if (n)
+ tipc_node_put(n);
+ break;
+ default:
+ break;
+ }
+
+exit:
+ return (rc < 0) ? rc : 0;
+}
+
+int tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info)
+{
+ int err;
+
+ rtnl_lock();
+ err = __tipc_nl_node_set_key(skb, info);
+ rtnl_unlock();
+
+ return err;
+}
+
+static int __tipc_nl_node_flush_key(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct net *net = sock_net(skb->sk);
+ struct tipc_net *tn = tipc_net(net);
+ struct tipc_node *n;
+
+ tipc_crypto_key_flush(tn->crypto_tx);
+ rcu_read_lock();
+ list_for_each_entry_rcu(n, &tn->node_list, list)
+ tipc_crypto_key_flush(n->crypto_rx);
+ rcu_read_unlock();
+
+ pr_info("All keys are flushed!\n");
+ return 0;
}
+int tipc_nl_node_flush_key(struct sk_buff *skb, struct genl_info *info)
+{
+ int err;
+
+ rtnl_lock();
+ err = __tipc_nl_node_flush_key(skb, info);
+ rtnl_unlock();
+
+ return err;
+}
+#endif
+
/**
* tipc_node_dump - dump TIPC node data
* @n: tipc node to be dumped
@@ -2587,3 +2960,33 @@ int tipc_node_dump(struct tipc_node *n, bool more, char *buf)
return i;
}
+
+void tipc_node_pre_cleanup_net(struct net *exit_net)
+{
+ struct tipc_node *n;
+ struct tipc_net *tn;
+ struct net *tmp;
+
+ rcu_read_lock();
+ for_each_net_rcu(tmp) {
+ if (tmp == exit_net)
+ continue;
+ tn = tipc_net(tmp);
+ if (!tn)
+ continue;
+ spin_lock_bh(&tn->node_list_lock);
+ list_for_each_entry_rcu(n, &tn->node_list, list) {
+ if (!n->peer_net)
+ continue;
+ if (n->peer_net != exit_net)
+ continue;
+ tipc_node_write_lock(n);
+ n->peer_net = NULL;
+ n->peer_hash_mix = 0;
+ tipc_node_write_unlock_fast(n);
+ break;
+ }
+ spin_unlock_bh(&tn->node_list_lock);
+ }
+ rcu_read_unlock();
+}
diff --git a/net/tipc/node.h b/net/tipc/node.h
index c0bf49ea3de4..a6803b449a2c 100644
--- a/net/tipc/node.h
+++ b/net/tipc/node.h
@@ -53,7 +53,9 @@ enum {
TIPC_NODE_ID128 = (1 << 5),
TIPC_LINK_PROTO_SEQNO = (1 << 6),
TIPC_MCAST_RBCTL = (1 << 7),
- TIPC_GAP_ACK_BLOCK = (1 << 8)
+ TIPC_GAP_ACK_BLOCK = (1 << 8),
+ TIPC_TUNNEL_ENHANCED = (1 << 9),
+ TIPC_NAGLE = (1 << 10)
};
#define TIPC_NODE_CAPABILITIES (TIPC_SYN_BIT | \
@@ -64,16 +66,28 @@ enum {
TIPC_NODE_ID128 | \
TIPC_LINK_PROTO_SEQNO | \
TIPC_MCAST_RBCTL | \
- TIPC_GAP_ACK_BLOCK)
+ TIPC_GAP_ACK_BLOCK | \
+ TIPC_TUNNEL_ENHANCED | \
+ TIPC_NAGLE)
+
#define INVALID_BEARER_ID -1
void tipc_node_stop(struct net *net);
bool tipc_node_get_id(struct net *net, u32 addr, u8 *id);
u32 tipc_node_get_addr(struct tipc_node *node);
+char *tipc_node_get_id_str(struct tipc_node *node);
+void tipc_node_put(struct tipc_node *node);
+struct tipc_node *tipc_node_create(struct net *net, u32 addr, u8 *peer_id,
+ u16 capabilities, u32 hash_mixes,
+ bool preliminary);
+#ifdef CONFIG_TIPC_CRYPTO
+struct tipc_crypto *tipc_node_crypto_rx(struct tipc_node *__n);
+struct tipc_crypto *tipc_node_crypto_rx_by_list(struct list_head *pos);
+#endif
u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr);
void tipc_node_check_dest(struct net *net, u32 onode, u8 *peer_id128,
struct tipc_bearer *bearer,
- u16 capabilities, u32 signature,
+ u16 capabilities, u32 signature, u32 hash_mixes,
struct tipc_media_addr *maddr,
bool *respond, bool *dupl_addr);
void tipc_node_delete_links(struct net *net, int bearer_id);
@@ -90,7 +104,7 @@ void tipc_node_unsubscribe(struct net *net, struct list_head *subscr, u32 addr);
void tipc_node_broadcast(struct net *net, struct sk_buff *skb);
int tipc_node_add_conn(struct net *net, u32 dnode, u32 port, u32 peer_port);
void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port);
-int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel);
+int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel, bool connected);
bool tipc_node_is_up(struct net *net, u32 addr);
u16 tipc_node_get_capabilities(struct net *net, u32 addr);
int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb);
@@ -105,4 +119,9 @@ int tipc_nl_node_get_monitor(struct sk_buff *skb, struct genl_info *info);
int tipc_nl_node_dump_monitor(struct sk_buff *skb, struct netlink_callback *cb);
int tipc_nl_node_dump_monitor_peer(struct sk_buff *skb,
struct netlink_callback *cb);
+#ifdef CONFIG_TIPC_CRYPTO
+int tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info);
+int tipc_nl_node_flush_key(struct sk_buff *skb, struct genl_info *info);
+#endif
+void tipc_node_pre_cleanup_net(struct net *exit_net);
#endif
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 83ae41d7e554..693e8902161e 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -75,6 +75,7 @@ struct sockaddr_pair {
* @conn_instance: TIPC instance used when connection was established
* @published: non-zero if port has one or more associated names
* @max_pkt: maximum packet size "hint" used when building messages sent by port
+ * @maxnagle: maximum size of msg which can be subject to nagle
* @portid: unique port identity in TIPC socket hash table
* @phdr: preformatted message header used when sending messages
* #cong_links: list of congested links
@@ -97,6 +98,7 @@ struct tipc_sock {
u32 conn_instance;
int published;
u32 max_pkt;
+ u32 maxnagle;
u32 portid;
struct tipc_msg phdr;
struct list_head cong_links;
@@ -116,6 +118,10 @@ struct tipc_sock {
struct tipc_mc_method mc_method;
struct rcu_head rcu;
struct tipc_group *group;
+ u32 oneway;
+ u16 snd_backlog;
+ bool expect_ack;
+ bool nodelay;
bool group_is_open;
};
@@ -137,6 +143,7 @@ static int tipc_sk_insert(struct tipc_sock *tsk);
static void tipc_sk_remove(struct tipc_sock *tsk);
static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dsz);
static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz);
+static void tipc_sk_push_backlog(struct tipc_sock *tsk);
static const struct proto_ops packet_ops;
static const struct proto_ops stream_ops;
@@ -227,6 +234,26 @@ static u16 tsk_inc(struct tipc_sock *tsk, int msglen)
return 1;
}
+/* tsk_set_nagle - enable/disable nagle property by manipulating maxnagle
+ */
+static void tsk_set_nagle(struct tipc_sock *tsk)
+{
+ struct sock *sk = &tsk->sk;
+
+ tsk->maxnagle = 0;
+ if (sk->sk_type != SOCK_STREAM)
+ return;
+ if (tsk->nodelay)
+ return;
+ if (!(tsk->peer_caps & TIPC_NAGLE))
+ return;
+ /* Limit node local buffer size to avoid receive queue overflow */
+ if (tsk->max_pkt == MAX_MSG_SIZE)
+ tsk->maxnagle = 1500;
+ else
+ tsk->maxnagle = tsk->max_pkt;
+}
+
/**
* tsk_advance_rx_queue - discard first buffer in socket receive queue
*
@@ -260,12 +287,12 @@ static void tipc_sk_respond(struct sock *sk, struct sk_buff *skb, int err)
*
* Caller must hold socket lock
*/
-static void tsk_rej_rx_queue(struct sock *sk)
+static void tsk_rej_rx_queue(struct sock *sk, int error)
{
struct sk_buff *skb;
while ((skb = __skb_dequeue(&sk->sk_receive_queue)))
- tipc_sk_respond(sk, skb, TIPC_ERR_NO_PORT);
+ tipc_sk_respond(sk, skb, error);
}
static bool tipc_sk_connected(struct sock *sk)
@@ -446,6 +473,7 @@ static int tipc_sk_create(struct net *net, struct socket *sock,
tsk = tipc_sk(sk);
tsk->max_pkt = MAX_PKT_DEFAULT;
+ tsk->maxnagle = 0;
INIT_LIST_HEAD(&tsk->publications);
INIT_LIST_HEAD(&tsk->cong_links);
msg = &tsk->phdr;
@@ -504,7 +532,7 @@ static void __tipc_shutdown(struct socket *sock, int error)
struct sock *sk = sock->sk;
struct tipc_sock *tsk = tipc_sk(sk);
struct net *net = sock_net(sk);
- long timeout = CONN_TIMEOUT_DEFAULT;
+ long timeout = msecs_to_jiffies(CONN_TIMEOUT_DEFAULT);
u32 dnode = tsk_peer_node(tsk);
struct sk_buff *skb;
@@ -512,37 +540,50 @@ static void __tipc_shutdown(struct socket *sock, int error)
tipc_wait_for_cond(sock, &timeout, (!tsk->cong_link_cnt &&
!tsk_conn_cong(tsk)));
- /* Remove any pending SYN message */
+ /* Push out delayed messages if in Nagle mode */
+ tipc_sk_push_backlog(tsk);
+ /* Remove pending SYN */
__skb_queue_purge(&sk->sk_write_queue);
- /* Reject all unreceived messages, except on an active connection
- * (which disconnects locally & sends a 'FIN+' to peer).
- */
- while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
- if (TIPC_SKB_CB(skb)->bytes_read) {
- kfree_skb(skb);
- continue;
- }
- if (!tipc_sk_type_connectionless(sk) &&
- sk->sk_state != TIPC_DISCONNECTING) {
- tipc_set_sk_state(sk, TIPC_DISCONNECTING);
- tipc_node_remove_conn(net, dnode, tsk->portid);
- }
- tipc_sk_respond(sk, skb, error);
+ /* Remove partially received buffer if any */
+ skb = skb_peek(&sk->sk_receive_queue);
+ if (skb && TIPC_SKB_CB(skb)->bytes_read) {
+ __skb_unlink(skb, &sk->sk_receive_queue);
+ kfree_skb(skb);
}
- if (tipc_sk_type_connectionless(sk))
+ /* Reject all unreceived messages if connectionless */
+ if (tipc_sk_type_connectionless(sk)) {
+ tsk_rej_rx_queue(sk, error);
return;
+ }
- if (sk->sk_state != TIPC_DISCONNECTING) {
+ switch (sk->sk_state) {
+ case TIPC_CONNECTING:
+ case TIPC_ESTABLISHED:
+ tipc_set_sk_state(sk, TIPC_DISCONNECTING);
+ tipc_node_remove_conn(net, dnode, tsk->portid);
+ /* Send a FIN+/- to its peer */
+ skb = __skb_dequeue(&sk->sk_receive_queue);
+ if (skb) {
+ __skb_queue_purge(&sk->sk_receive_queue);
+ tipc_sk_respond(sk, skb, error);
+ break;
+ }
skb = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE,
TIPC_CONN_MSG, SHORT_H_SIZE, 0, dnode,
tsk_own_node(tsk), tsk_peer_port(tsk),
tsk->portid, error);
if (skb)
tipc_node_xmit_skb(net, skb, dnode, tsk->portid);
- tipc_node_remove_conn(net, dnode, tsk->portid);
- tipc_set_sk_state(sk, TIPC_DISCONNECTING);
+ break;
+ case TIPC_LISTEN:
+ /* Reject all SYN messages */
+ tsk_rej_rx_queue(sk, error);
+ break;
+ default:
+ __skb_queue_purge(&sk->sk_receive_queue);
+ break;
}
}
@@ -740,7 +781,7 @@ static __poll_t tipc_poll(struct file *file, struct socket *sock,
/* fall through */
case TIPC_LISTEN:
case TIPC_CONNECTING:
- if (!skb_queue_empty(&sk->sk_receive_queue))
+ if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
revents |= EPOLLIN | EPOLLRDNORM;
break;
case TIPC_OPEN:
@@ -748,7 +789,7 @@ static __poll_t tipc_poll(struct file *file, struct socket *sock,
revents |= EPOLLOUT;
if (!tipc_sk_type_connectionless(sk))
break;
- if (skb_queue_empty(&sk->sk_receive_queue))
+ if (skb_queue_empty_lockless(&sk->sk_receive_queue))
break;
revents |= EPOLLIN | EPOLLRDNORM;
break;
@@ -809,7 +850,7 @@ static int tipc_sendmcast(struct socket *sock, struct tipc_name_seq *seq,
msg_set_nameupper(hdr, seq->upper);
/* Build message as chain of buffers */
- skb_queue_head_init(&pkts);
+ __skb_queue_head_init(&pkts);
rc = tipc_msg_build(hdr, msg, 0, dlen, mtu, &pkts);
/* Send message if build was successful */
@@ -853,8 +894,8 @@ static int tipc_send_group_msg(struct net *net, struct tipc_sock *tsk,
msg_set_grp_bc_seqno(hdr, bc_snd_nxt);
/* Build message as chain of buffers */
- skb_queue_head_init(&pkts);
- mtu = tipc_node_get_mtu(net, dnode, tsk->portid);
+ __skb_queue_head_init(&pkts);
+ mtu = tipc_node_get_mtu(net, dnode, tsk->portid, false);
rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts);
if (unlikely(rc != dlen))
return rc;
@@ -1058,7 +1099,7 @@ static int tipc_send_group_bcast(struct socket *sock, struct msghdr *m,
msg_set_grp_bc_ack_req(hdr, ack);
/* Build message as chain of buffers */
- skb_queue_head_init(&pkts);
+ __skb_queue_head_init(&pkts);
rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts);
if (unlikely(rc != dlen))
return rc;
@@ -1208,6 +1249,32 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq,
tipc_sk_rcv(net, inputq);
}
+/* tipc_sk_push_backlog(): send accumulated buffers in socket write queue
+ * when socket is in Nagle mode
+ */
+static void tipc_sk_push_backlog(struct tipc_sock *tsk)
+{
+ struct sk_buff_head *txq = &tsk->sk.sk_write_queue;
+ struct net *net = sock_net(&tsk->sk);
+ u32 dnode = tsk_peer_node(tsk);
+ struct sk_buff *skb = skb_peek(txq);
+ int rc;
+
+ if (!skb || tsk->cong_link_cnt)
+ return;
+
+ /* Do not send SYN again after congestion */
+ if (msg_is_syn(buf_msg(skb)))
+ return;
+
+ tsk->snt_unacked += tsk->snd_backlog;
+ tsk->snd_backlog = 0;
+ tsk->expect_ack = true;
+ rc = tipc_node_xmit(net, txq, dnode, tsk->portid);
+ if (rc == -ELINKCONG)
+ tsk->cong_link_cnt = 1;
+}
+
/**
* tipc_sk_conn_proto_rcv - receive a connection mng protocol message
* @tsk: receiving socket
@@ -1221,7 +1288,7 @@ static void tipc_sk_conn_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb,
u32 onode = tsk_own_node(tsk);
struct sock *sk = &tsk->sk;
int mtyp = msg_type(hdr);
- bool conn_cong;
+ bool was_cong;
/* Ignore if connection cannot be validated: */
if (!tsk_peer_msg(tsk, hdr)) {
@@ -1254,11 +1321,13 @@ static void tipc_sk_conn_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb,
__skb_queue_tail(xmitq, skb);
return;
} else if (mtyp == CONN_ACK) {
- conn_cong = tsk_conn_cong(tsk);
+ was_cong = tsk_conn_cong(tsk);
+ tsk->expect_ack = false;
+ tipc_sk_push_backlog(tsk);
tsk->snt_unacked -= msg_conn_ack(hdr);
if (tsk->peer_caps & TIPC_BLOCK_FLOWCTL)
tsk->snd_win = msg_adv_win(hdr);
- if (conn_cong)
+ if (was_cong && !tsk_conn_cong(tsk))
sk->sk_write_space(sk);
} else if (mtyp != CONN_PROBE_REPLY) {
pr_warn("Received unknown CONN_PROTO msg\n");
@@ -1306,8 +1375,8 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen)
struct tipc_msg *hdr = &tsk->phdr;
struct tipc_name_seq *seq;
struct sk_buff_head pkts;
- u32 dport, dnode = 0;
- u32 type, inst;
+ u32 dport = 0, dnode = 0;
+ u32 type = 0, inst = 0;
int mtu, rc;
if (unlikely(dlen > TIPC_MAX_USER_MSG_SIZE))
@@ -1360,23 +1429,11 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen)
type = dest->addr.name.name.type;
inst = dest->addr.name.name.instance;
dnode = dest->addr.name.domain;
- msg_set_type(hdr, TIPC_NAMED_MSG);
- msg_set_hdr_sz(hdr, NAMED_H_SIZE);
- msg_set_nametype(hdr, type);
- msg_set_nameinst(hdr, inst);
- msg_set_lookup_scope(hdr, tipc_node2scope(dnode));
dport = tipc_nametbl_translate(net, type, inst, &dnode);
- msg_set_destnode(hdr, dnode);
- msg_set_destport(hdr, dport);
if (unlikely(!dport && !dnode))
return -EHOSTUNREACH;
} else if (dest->addrtype == TIPC_ADDR_ID) {
dnode = dest->addr.id.node;
- msg_set_type(hdr, TIPC_DIRECT_MSG);
- msg_set_lookup_scope(hdr, 0);
- msg_set_destnode(hdr, dnode);
- msg_set_destport(hdr, dest->addr.id.ref);
- msg_set_hdr_sz(hdr, BASIC_H_SIZE);
} else {
return -EINVAL;
}
@@ -1387,13 +1444,31 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen)
if (unlikely(rc))
return rc;
- skb_queue_head_init(&pkts);
- mtu = tipc_node_get_mtu(net, dnode, tsk->portid);
+ if (dest->addrtype == TIPC_ADDR_NAME) {
+ msg_set_type(hdr, TIPC_NAMED_MSG);
+ msg_set_hdr_sz(hdr, NAMED_H_SIZE);
+ msg_set_nametype(hdr, type);
+ msg_set_nameinst(hdr, inst);
+ msg_set_lookup_scope(hdr, tipc_node2scope(dnode));
+ msg_set_destnode(hdr, dnode);
+ msg_set_destport(hdr, dport);
+ } else { /* TIPC_ADDR_ID */
+ msg_set_type(hdr, TIPC_DIRECT_MSG);
+ msg_set_lookup_scope(hdr, 0);
+ msg_set_destnode(hdr, dnode);
+ msg_set_destport(hdr, dest->addr.id.ref);
+ msg_set_hdr_sz(hdr, BASIC_H_SIZE);
+ }
+
+ __skb_queue_head_init(&pkts);
+ mtu = tipc_node_get_mtu(net, dnode, tsk->portid, false);
rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts);
if (unlikely(rc != dlen))
return rc;
- if (unlikely(syn && !tipc_msg_skb_clone(&pkts, &sk->sk_write_queue)))
+ if (unlikely(syn && !tipc_msg_skb_clone(&pkts, &sk->sk_write_queue))) {
+ __skb_queue_purge(&pkts);
return -ENOMEM;
+ }
trace_tipc_sk_sendmsg(sk, skb_peek(&pkts), TIPC_DUMP_SK_SNDQ, " ");
rc = tipc_node_xmit(net, &pkts, dnode, tsk->portid);
@@ -1437,15 +1512,15 @@ static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dlen)
struct sock *sk = sock->sk;
DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name);
long timeout = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT);
+ struct sk_buff_head *txq = &sk->sk_write_queue;
struct tipc_sock *tsk = tipc_sk(sk);
struct tipc_msg *hdr = &tsk->phdr;
struct net *net = sock_net(sk);
- struct sk_buff_head pkts;
u32 dnode = tsk_peer_node(tsk);
+ int maxnagle = tsk->maxnagle;
+ int maxpkt = tsk->max_pkt;
int send, sent = 0;
- int rc = 0;
-
- skb_queue_head_init(&pkts);
+ int blocks, rc = 0;
if (unlikely(dlen > INT_MAX))
return -EMSGSIZE;
@@ -1467,21 +1542,35 @@ static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dlen)
tipc_sk_connected(sk)));
if (unlikely(rc))
break;
-
send = min_t(size_t, dlen - sent, TIPC_MAX_USER_MSG_SIZE);
- rc = tipc_msg_build(hdr, m, sent, send, tsk->max_pkt, &pkts);
- if (unlikely(rc != send))
- break;
-
- trace_tipc_sk_sendstream(sk, skb_peek(&pkts),
+ blocks = tsk->snd_backlog;
+ if (tsk->oneway++ >= 4 && send <= maxnagle) {
+ rc = tipc_msg_append(hdr, m, send, maxnagle, txq);
+ if (unlikely(rc < 0))
+ break;
+ blocks += rc;
+ if (blocks <= 64 && tsk->expect_ack) {
+ tsk->snd_backlog = blocks;
+ sent += send;
+ break;
+ }
+ tsk->expect_ack = true;
+ } else {
+ rc = tipc_msg_build(hdr, m, sent, send, maxpkt, txq);
+ if (unlikely(rc != send))
+ break;
+ blocks += tsk_inc(tsk, send + MIN_H_SIZE);
+ }
+ trace_tipc_sk_sendstream(sk, skb_peek(txq),
TIPC_DUMP_SK_SNDQ, " ");
- rc = tipc_node_xmit(net, &pkts, dnode, tsk->portid);
+ rc = tipc_node_xmit(net, txq, dnode, tsk->portid);
if (unlikely(rc == -ELINKCONG)) {
tsk->cong_link_cnt = 1;
rc = 0;
}
if (likely(!rc)) {
- tsk->snt_unacked += tsk_inc(tsk, send + MIN_H_SIZE);
+ tsk->snt_unacked += blocks;
+ tsk->snd_backlog = 0;
sent += send;
}
} while (sent < dlen && !rc);
@@ -1526,8 +1615,9 @@ static void tipc_sk_finish_conn(struct tipc_sock *tsk, u32 peer_port,
sk_reset_timer(sk, &sk->sk_timer, jiffies + CONN_PROBING_INTV);
tipc_set_sk_state(sk, TIPC_ESTABLISHED);
tipc_node_add_conn(net, peer_node, tsk->portid, peer_port);
- tsk->max_pkt = tipc_node_get_mtu(net, peer_node, tsk->portid);
+ tsk->max_pkt = tipc_node_get_mtu(net, peer_node, tsk->portid, true);
tsk->peer_caps = tipc_node_get_capabilities(net, peer_node);
+ tsk_set_nagle(tsk);
__skb_queue_purge(&sk->sk_write_queue);
if (tsk->peer_caps & TIPC_BLOCK_FLOWCTL)
return;
@@ -1805,7 +1895,7 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m,
/* Send group flow control advertisement when applicable */
if (tsk->group && msg_in_group(hdr) && !grp_evt) {
- skb_queue_head_init(&xmitq);
+ __skb_queue_head_init(&xmitq);
tipc_group_update_rcv_win(tsk->group, tsk_blocks(hlen + dlen),
msg_orignode(hdr), msg_origport(hdr),
&xmitq);
@@ -1848,6 +1938,7 @@ static int tipc_recvstream(struct socket *sock, struct msghdr *m,
bool peek = flags & MSG_PEEK;
int offset, required, copy, copied = 0;
int hlen, dlen, err, rc;
+ bool ack = false;
long timeout;
/* Catch invalid receive attempts */
@@ -1892,6 +1983,7 @@ static int tipc_recvstream(struct socket *sock, struct msghdr *m,
/* Copy data if msg ok, otherwise return error/partial data */
if (likely(!err)) {
+ ack = msg_ack_required(hdr);
offset = skb_cb->bytes_read;
copy = min_t(int, dlen - offset, buflen - copied);
rc = skb_copy_datagram_msg(skb, hlen + offset, m, copy);
@@ -1919,7 +2011,7 @@ static int tipc_recvstream(struct socket *sock, struct msghdr *m,
/* Send connection flow control advertisement when applicable */
tsk->rcv_unacked += tsk_inc(tsk, hlen + dlen);
- if (unlikely(tsk->rcv_unacked >= tsk->rcv_win / TIPC_ACK_RATE))
+ if (ack || tsk->rcv_unacked >= tsk->rcv_win / TIPC_ACK_RATE)
tipc_sk_send_ack(tsk);
/* Exit if all requested data or FIN/error received */
@@ -1990,6 +2082,7 @@ static void tipc_sk_proto_rcv(struct sock *sk,
smp_wmb();
tsk->cong_link_cnt--;
wakeup = true;
+ tipc_sk_push_backlog(tsk);
break;
case GROUP_PROTOCOL:
tipc_group_proto_rcv(grp, &wakeup, hdr, inputq, xmitq);
@@ -2029,6 +2122,7 @@ static bool tipc_sk_filter_connect(struct tipc_sock *tsk, struct sk_buff *skb)
if (unlikely(msg_mcast(hdr)))
return false;
+ tsk->oneway = 0;
switch (sk->sk_state) {
case TIPC_CONNECTING:
@@ -2074,6 +2168,8 @@ static bool tipc_sk_filter_connect(struct tipc_sock *tsk, struct sk_buff *skb)
return true;
return false;
case TIPC_ESTABLISHED:
+ if (!skb_queue_empty(&sk->sk_write_queue))
+ tipc_sk_push_backlog(tsk);
/* Accept only connection-based messages sent by peer */
if (likely(con_msg && !err && pport == oport && pnode == onode))
return true;
@@ -2119,13 +2215,13 @@ static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *skb)
struct tipc_msg *hdr = buf_msg(skb);
if (unlikely(msg_in_group(hdr)))
- return sk->sk_rcvbuf;
+ return READ_ONCE(sk->sk_rcvbuf);
if (unlikely(!msg_connected(hdr)))
- return sk->sk_rcvbuf << msg_importance(hdr);
+ return READ_ONCE(sk->sk_rcvbuf) << msg_importance(hdr);
if (likely(tsk->peer_caps & TIPC_BLOCK_FLOWCTL))
- return sk->sk_rcvbuf;
+ return READ_ONCE(sk->sk_rcvbuf);
return FLOWCTL_MSG_LIM;
}
@@ -2345,10 +2441,12 @@ static int tipc_wait_for_connect(struct socket *sock, long *timeo_p)
return -ETIMEDOUT;
if (signal_pending(current))
return sock_intr_errno(*timeo_p);
+ if (sk->sk_state == TIPC_DISCONNECTING)
+ break;
add_wait_queue(sk_sleep(sk), &wait);
- done = sk_wait_event(sk, timeo_p,
- sk->sk_state != TIPC_CONNECTING, &wait);
+ done = sk_wait_event(sk, timeo_p, tipc_sk_connected(sk),
+ &wait);
remove_wait_queue(sk_sleep(sk), &wait);
} while (!done);
return 0;
@@ -2558,7 +2656,7 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags,
* Reject any stray messages received by new socket
* before the socket lock was taken (very, very unlikely)
*/
- tsk_rej_rx_queue(new_sk);
+ tsk_rej_rx_queue(new_sk, TIPC_ERR_NO_PORT);
/* Connect new socket to it's peer */
tipc_sk_finish_conn(new_tsock, msg_origport(msg), msg_orignode(msg));
@@ -2674,13 +2772,14 @@ static void tipc_sk_timeout(struct timer_list *t)
struct sk_buff_head list;
int rc = 0;
- skb_queue_head_init(&list);
+ __skb_queue_head_init(&list);
bh_lock_sock(sk);
/* Try again later if socket is busy */
if (sock_owned_by_user(sk)) {
sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ / 20);
bh_unlock_sock(sk);
+ sock_put(sk);
return;
}
@@ -2804,7 +2903,7 @@ static struct tipc_sock *tipc_sk_lookup(struct net *net, u32 portid)
struct tipc_sock *tsk;
rcu_read_lock();
- tsk = rhashtable_lookup_fast(&tn->sk_rht, &portid, tsk_rht_params);
+ tsk = rhashtable_lookup(&tn->sk_rht, &portid, tsk_rht_params);
if (tsk)
sock_hold(&tsk->sk);
rcu_read_unlock();
@@ -2959,6 +3058,7 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt,
case TIPC_SRC_DROPPABLE:
case TIPC_DEST_DROPPABLE:
case TIPC_CONN_TIMEOUT:
+ case TIPC_NODELAY:
if (ol < sizeof(value))
return -EINVAL;
if (get_user(value, (u32 __user *)ov))
@@ -3007,6 +3107,10 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt,
case TIPC_GROUP_LEAVE:
res = tipc_sk_leave(tsk);
break;
+ case TIPC_NODELAY:
+ tsk->nodelay = !!value;
+ tsk_set_nagle(tsk);
+ break;
default:
res = -EINVAL;
}
@@ -3588,13 +3692,9 @@ int tipc_nl_publ_dump(struct sk_buff *skb, struct netlink_callback *cb)
struct tipc_sock *tsk;
if (!tsk_portid) {
- struct nlattr **attrs;
+ struct nlattr **attrs = genl_dumpit_info(cb)->attrs;
struct nlattr *sock[TIPC_NLA_SOCK_MAX + 1];
- err = tipc_nlmsg_parse(cb->nlh, &attrs);
- if (err)
- return err;
-
if (!attrs[TIPC_NLA_SOCK])
return -EINVAL;
@@ -3790,7 +3890,7 @@ int tipc_sk_dump(struct sock *sk, u16 dqueues, char *buf)
i += scnprintf(buf + i, sz - i, " %d", sk->sk_sndbuf);
i += scnprintf(buf + i, sz - i, " | %d", sk_rmem_alloc_get(sk));
i += scnprintf(buf + i, sz - i, " %d", sk->sk_rcvbuf);
- i += scnprintf(buf + i, sz - i, " | %d\n", sk->sk_backlog.len);
+ i += scnprintf(buf + i, sz - i, " | %d\n", READ_ONCE(sk->sk_backlog.len));
if (dqueues & TIPC_DUMP_SK_SNDQ) {
i += scnprintf(buf + i, sz - i, "sk_write_queue: ");
diff --git a/net/tipc/sysctl.c b/net/tipc/sysctl.c
index 6159d327db76..58ab3d6dcdce 100644
--- a/net/tipc/sysctl.c
+++ b/net/tipc/sysctl.c
@@ -35,6 +35,7 @@
#include "core.h"
#include "trace.h"
+#include "crypto.h"
#include <linux/sysctl.h>
@@ -64,6 +65,16 @@ static struct ctl_table tipc_table[] = {
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
},
+#ifdef CONFIG_TIPC_CRYPTO
+ {
+ .procname = "max_tfms",
+ .data = &sysctl_tipc_max_tfms,
+ .maxlen = sizeof(sysctl_tipc_max_tfms),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE,
+ },
+#endif
{}
};
diff --git a/net/tipc/topsrv.c b/net/tipc/topsrv.c
index ca8ac96d22a9..3a12fc18239b 100644
--- a/net/tipc/topsrv.c
+++ b/net/tipc/topsrv.c
@@ -40,6 +40,7 @@
#include "socket.h"
#include "addr.h"
#include "msg.h"
+#include "bearer.h"
#include <net/sock.h>
#include <linux/module.h>
@@ -608,6 +609,7 @@ static void tipc_topsrv_kern_evt(struct net *net, struct tipc_event *evt)
memcpy(msg_data(buf_msg(skb)), evt, sizeof(*evt));
skb_queue_head_init(&evtq);
__skb_queue_tail(&evtq, skb);
+ tipc_loopback_trace(net, &evtq);
tipc_sk_rcv(net, &evtq);
}
diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c
index 287df68721df..d6620ad53546 100644
--- a/net/tipc/udp_media.c
+++ b/net/tipc/udp_media.c
@@ -195,10 +195,13 @@ static int tipc_udp_xmit(struct net *net, struct sk_buff *skb,
.saddr = src->ipv6,
.flowi6_proto = IPPROTO_UDP
};
- err = ipv6_stub->ipv6_dst_lookup(net, ub->ubsock->sk,
- &ndst, &fl6);
- if (err)
+ ndst = ipv6_stub->ipv6_dst_lookup_flow(net,
+ ub->ubsock->sk,
+ &fl6, NULL);
+ if (IS_ERR(ndst)) {
+ err = PTR_ERR(ndst);
goto tx_error;
+ }
dst_cache_set_ip6(cache, ndst, &fl6.saddr);
}
ttl = ip6_dst_hoplimit(ndst);
@@ -372,6 +375,7 @@ static int tipc_udp_recv(struct sock *sk, struct sk_buff *skb)
goto out;
if (b && test_bit(0, &b->up)) {
+ TIPC_SKB_CB(skb)->flags = 0;
tipc_rcv(sock_net(sk), skb, b);
return 0;
}
@@ -448,15 +452,11 @@ int tipc_udp_nl_dump_remoteip(struct sk_buff *skb, struct netlink_callback *cb)
int i;
if (!bid && !skip_cnt) {
+ struct nlattr **attrs = genl_dumpit_info(cb)->attrs;
struct net *net = sock_net(skb->sk);
struct nlattr *battrs[TIPC_NLA_BEARER_MAX + 1];
- struct nlattr **attrs;
char *bname;
- err = tipc_nlmsg_parse(cb->nlh, &attrs);
- if (err)
- return err;
-
if (!attrs[TIPC_NLA_BEARER])
return -EINVAL;
@@ -828,7 +828,8 @@ struct tipc_media udp_media_info = {
.msg2addr = tipc_udp_msg2addr,
.priority = TIPC_DEF_LINK_PRI,
.tolerance = TIPC_DEF_LINK_TOL,
- .window = TIPC_DEF_LINK_WIN,
+ .min_win = TIPC_DEF_LINK_WIN,
+ .max_win = TIPC_DEF_LINK_WIN,
.mtu = TIPC_DEF_LINK_UDP_MTU,
.type_id = TIPC_MEDIA_TYPE_UDP,
.hwaddr_len = 0,
diff --git a/net/tls/Kconfig b/net/tls/Kconfig
index e4328b3b72eb..61ec78521a60 100644
--- a/net/tls/Kconfig
+++ b/net/tls/Kconfig
@@ -26,3 +26,13 @@ config TLS_DEVICE
Enable kernel support for HW offload of the TLS protocol.
If unsure, say N.
+
+config TLS_TOE
+ bool "Transport Layer Security TCP stack bypass"
+ depends on TLS
+ default n
+ help
+ Enable kernel support for legacy HW offload of the TLS protocol,
+ which is incompatible with the Linux networking stack semantics.
+
+ If unsure, say N.
diff --git a/net/tls/Makefile b/net/tls/Makefile
index ef0dc74ce8f9..f1ffbfe8968d 100644
--- a/net/tls/Makefile
+++ b/net/tls/Makefile
@@ -3,8 +3,11 @@
# Makefile for the TLS subsystem.
#
+CFLAGS_trace.o := -I$(src)
+
obj-$(CONFIG_TLS) += tls.o
-tls-y := tls_main.o tls_sw.o
+tls-y := tls_main.o tls_sw.o tls_proc.o trace.o
+tls-$(CONFIG_TLS_TOE) += tls_toe.o
tls-$(CONFIG_TLS_DEVICE) += tls_device.o tls_device_fallback.o
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index 43922d86e510..1ba5a92832bb 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -38,6 +38,8 @@
#include <net/tcp.h>
#include <net/tls.h>
+#include "trace.h"
+
/* device_offload_lock is used to synchronize tls_dev_add
* against NETDEV_DOWN notifications.
*/
@@ -61,7 +63,7 @@ static void tls_device_free_ctx(struct tls_context *ctx)
if (ctx->rx_conf == TLS_HW)
kfree(tls_offload_ctx_rx(ctx));
- tls_ctx_free(ctx);
+ tls_ctx_free(NULL, ctx);
}
static void tls_device_gc_task(struct work_struct *work)
@@ -122,13 +124,10 @@ static struct net_device *get_netdev_for_sock(struct sock *sk)
static void destroy_record(struct tls_record_info *record)
{
- int nr_frags = record->num_frags;
- skb_frag_t *frag;
+ int i;
- while (nr_frags-- > 0) {
- frag = &record->frags[nr_frags];
- __skb_frag_unref(frag);
- }
+ for (i = 0; i < record->num_frags; i++)
+ __skb_frag_unref(&record->frags[i]);
kfree(record);
}
@@ -159,12 +158,8 @@ static void tls_icsk_clean_acked(struct sock *sk, u32 acked_seq)
spin_lock_irqsave(&ctx->lock, flags);
info = ctx->retransmit_hint;
- if (info && !before(acked_seq, info->end_seq)) {
+ if (info && !before(acked_seq, info->end_seq))
ctx->retransmit_hint = NULL;
- list_del(&info->list);
- destroy_record(info);
- deleted_records++;
- }
list_for_each_entry_safe(info, temp, &ctx->records_list, list) {
if (before(acked_seq, info->end_seq))
@@ -183,7 +178,7 @@ static void tls_icsk_clean_acked(struct sock *sk, u32 acked_seq)
* socket and no in-flight SKBs associated with this
* socket, so it is safe to free all the resources.
*/
-static void tls_device_sk_destruct(struct sock *sk)
+void tls_device_sk_destruct(struct sock *sk)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx);
@@ -201,6 +196,7 @@ static void tls_device_sk_destruct(struct sock *sk)
if (refcount_dec_and_test(&tls_ctx->refcount))
tls_device_queue_ctx_destruction(tls_ctx);
}
+EXPORT_SYMBOL_GPL(tls_device_sk_destruct);
void tls_device_free_resources_tx(struct sock *sk)
{
@@ -209,6 +205,15 @@ void tls_device_free_resources_tx(struct sock *sk)
tls_free_partial_record(sk, tls_ctx);
}
+void tls_offload_tx_resync_request(struct sock *sk, u32 got_seq, u32 exp_seq)
+{
+ struct tls_context *tls_ctx = tls_get_ctx(sk);
+
+ trace_tls_device_tx_resync_req(sk, got_seq, exp_seq);
+ WARN_ON(test_and_set_bit(TLS_TX_SYNC_SCHED, &tls_ctx->flags));
+}
+EXPORT_SYMBOL_GPL(tls_offload_tx_resync_request);
+
static void tls_device_resync_tx(struct sock *sk, struct tls_context *tls_ctx,
u32 seq)
{
@@ -223,6 +228,7 @@ static void tls_device_resync_tx(struct sock *sk, struct tls_context *tls_ctx,
rcd_sn = tls_ctx->tx.rec_seq;
+ trace_tls_device_tx_resync_send(sk, seq, rcd_sn);
down_read(&device_offload_lock);
netdev = tls_ctx->netdev;
if (netdev)
@@ -243,14 +249,14 @@ static void tls_append_frag(struct tls_record_info *record,
skb_frag_t *frag;
frag = &record->frags[record->num_frags - 1];
- if (frag->page.p == pfrag->page &&
- frag->page_offset + frag->size == pfrag->offset) {
- frag->size += size;
+ if (skb_frag_page(frag) == pfrag->page &&
+ skb_frag_off(frag) + skb_frag_size(frag) == pfrag->offset) {
+ skb_frag_size_add(frag, size);
} else {
++frag;
- frag->page.p = pfrag->page;
- frag->page_offset = pfrag->offset;
- frag->size = size;
+ __skb_frag_set_page(frag, pfrag->page);
+ skb_frag_off_set(frag, pfrag->offset);
+ skb_frag_size_set(frag, size);
++record->num_frags;
get_page(pfrag->page);
}
@@ -263,33 +269,15 @@ static int tls_push_record(struct sock *sk,
struct tls_context *ctx,
struct tls_offload_context_tx *offload_ctx,
struct tls_record_info *record,
- struct page_frag *pfrag,
- int flags,
- unsigned char record_type)
+ int flags)
{
struct tls_prot_info *prot = &ctx->prot_info;
struct tcp_sock *tp = tcp_sk(sk);
- struct page_frag dummy_tag_frag;
skb_frag_t *frag;
int i;
- /* fill prepend */
- frag = &record->frags[0];
- tls_fill_prepend(ctx,
- skb_frag_address(frag),
- record->len - prot->prepend_size,
- record_type,
- prot->version);
-
- /* HW doesn't care about the data in the tag, because it fills it. */
- dummy_tag_frag.page = skb_frag_page(frag);
- dummy_tag_frag.offset = 0;
-
- tls_append_frag(record, &dummy_tag_frag, prot->tag_size);
record->end_seq = tp->write_seq + record->len;
- spin_lock_irq(&offload_ctx->lock);
- list_add_tail(&record->list, &offload_ctx->records_list);
- spin_unlock_irq(&offload_ctx->lock);
+ list_add_tail_rcu(&record->list, &offload_ctx->records_list);
offload_ctx->open_record = NULL;
if (test_bit(TLS_TX_SYNC_SCHED, &ctx->flags))
@@ -301,8 +289,8 @@ static int tls_push_record(struct sock *sk,
frag = &record->frags[i];
sg_unmark_end(&offload_ctx->sg_tx_data[i]);
sg_set_page(&offload_ctx->sg_tx_data[i], skb_frag_page(frag),
- frag->size, frag->page_offset);
- sk_mem_charge(sk, frag->size);
+ skb_frag_size(frag), skb_frag_off(frag));
+ sk_mem_charge(sk, skb_frag_size(frag));
get_page(skb_frag_page(frag));
}
sg_mark_end(&offload_ctx->sg_tx_data[record->num_frags - 1]);
@@ -311,6 +299,38 @@ static int tls_push_record(struct sock *sk,
return tls_push_sg(sk, ctx, offload_ctx->sg_tx_data, 0, flags);
}
+static int tls_device_record_close(struct sock *sk,
+ struct tls_context *ctx,
+ struct tls_record_info *record,
+ struct page_frag *pfrag,
+ unsigned char record_type)
+{
+ struct tls_prot_info *prot = &ctx->prot_info;
+ int ret;
+
+ /* append tag
+ * device will fill in the tag, we just need to append a placeholder
+ * use socket memory to improve coalescing (re-using a single buffer
+ * increases frag count)
+ * if we can't allocate memory now, steal some back from data
+ */
+ if (likely(skb_page_frag_refill(prot->tag_size, pfrag,
+ sk->sk_allocation))) {
+ ret = 0;
+ tls_append_frag(record, pfrag, prot->tag_size);
+ } else {
+ ret = prot->tag_size;
+ if (record->len <= prot->overhead_size)
+ return -ENOMEM;
+ }
+
+ /* fill prepend */
+ tls_fill_prepend(ctx, skb_frag_address(&record->frags[0]),
+ record->len - prot->overhead_size,
+ record_type, prot->version);
+ return ret;
+}
+
static int tls_create_new_record(struct tls_offload_context_tx *offload_ctx,
struct page_frag *pfrag,
size_t prepend_size)
@@ -324,7 +344,7 @@ static int tls_create_new_record(struct tls_offload_context_tx *offload_ctx,
frag = &record->frags[0];
__skb_frag_set_page(frag, pfrag->page);
- frag->page_offset = pfrag->offset;
+ skb_frag_off_set(frag, pfrag->offset);
skb_frag_size_set(frag, prepend_size);
get_page(pfrag->page);
@@ -365,6 +385,31 @@ static int tls_do_allocation(struct sock *sk,
return 0;
}
+static int tls_device_copy_data(void *addr, size_t bytes, struct iov_iter *i)
+{
+ size_t pre_copy, nocache;
+
+ pre_copy = ~((unsigned long)addr - 1) & (SMP_CACHE_BYTES - 1);
+ if (pre_copy) {
+ pre_copy = min(pre_copy, bytes);
+ if (copy_from_iter(addr, pre_copy, i) != pre_copy)
+ return -EFAULT;
+ bytes -= pre_copy;
+ addr += pre_copy;
+ }
+
+ nocache = round_down(bytes, SMP_CACHE_BYTES);
+ if (copy_from_iter_nocache(addr, nocache, i) != nocache)
+ return -EFAULT;
+ bytes -= nocache;
+ addr += nocache;
+
+ if (bytes && copy_from_iter(addr, bytes, i) != bytes)
+ return -EFAULT;
+
+ return 0;
+}
+
static int tls_push_data(struct sock *sk,
struct iov_iter *msg_iter,
size_t size, int flags,
@@ -385,9 +430,9 @@ static int tls_push_data(struct sock *sk,
if (flags &
~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | MSG_SENDPAGE_NOTLAST))
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
- if (sk->sk_err)
+ if (unlikely(sk->sk_err))
return -sk->sk_err;
flags |= MSG_SENDPAGE_DECRYPTED;
@@ -408,9 +453,8 @@ static int tls_push_data(struct sock *sk,
max_open_record_len = TLS_MAX_PAYLOAD_SIZE +
prot->prepend_size;
do {
- rc = tls_do_allocation(sk, ctx, pfrag,
- prot->prepend_size);
- if (rc) {
+ rc = tls_do_allocation(sk, ctx, pfrag, prot->prepend_size);
+ if (unlikely(rc)) {
rc = sk_stream_wait_memory(sk, &timeo);
if (!rc)
continue;
@@ -438,12 +482,10 @@ handle_error:
copy = min_t(size_t, size, (pfrag->size - pfrag->offset));
copy = min_t(size_t, copy, (max_open_record_len - record->len));
- if (copy_from_iter_nocache(page_address(pfrag->page) +
- pfrag->offset,
- copy, msg_iter) != copy) {
- rc = -EFAULT;
+ rc = tls_device_copy_data(page_address(pfrag->page) +
+ pfrag->offset, copy, msg_iter);
+ if (rc)
goto handle_error;
- }
tls_append_frag(record, pfrag, copy);
size -= copy;
@@ -461,13 +503,24 @@ last_record:
if (done || record->len >= max_open_record_len ||
(record->num_frags >= MAX_SKB_FRAGS - 1)) {
+ rc = tls_device_record_close(sk, tls_ctx, record,
+ pfrag, record_type);
+ if (rc) {
+ if (rc > 0) {
+ size += rc;
+ } else {
+ size = orig_size;
+ destroy_record(record);
+ ctx->open_record = NULL;
+ break;
+ }
+ }
+
rc = tls_push_record(sk,
tls_ctx,
ctx,
record,
- pfrag,
- tls_push_record_flags,
- record_type);
+ tls_push_record_flags);
if (rc < 0)
break;
}
@@ -482,8 +535,10 @@ last_record:
int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
{
unsigned char record_type = TLS_RECORD_TYPE_DATA;
+ struct tls_context *tls_ctx = tls_get_ctx(sk);
int rc;
+ mutex_lock(&tls_ctx->tx_lock);
lock_sock(sk);
if (unlikely(msg->msg_controllen)) {
@@ -497,12 +552,14 @@ int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
out:
release_sock(sk);
+ mutex_unlock(&tls_ctx->tx_lock);
return rc;
}
int tls_device_sendpage(struct sock *sk, struct page *page,
int offset, size_t size, int flags)
{
+ struct tls_context *tls_ctx = tls_get_ctx(sk);
struct iov_iter msg_iter;
char *kaddr = kmap(page);
struct kvec iov;
@@ -511,10 +568,11 @@ int tls_device_sendpage(struct sock *sk, struct page *page,
if (flags & MSG_SENDPAGE_NOTLAST)
flags |= MSG_MORE;
+ mutex_lock(&tls_ctx->tx_lock);
lock_sock(sk);
if (flags & MSG_OOB) {
- rc = -ENOTSUPP;
+ rc = -EOPNOTSUPP;
goto out;
}
@@ -527,6 +585,7 @@ int tls_device_sendpage(struct sock *sk, struct page *page,
out:
release_sock(sk);
+ mutex_unlock(&tls_ctx->tx_lock);
return rc;
}
@@ -542,12 +601,16 @@ struct tls_record_info *tls_get_record(struct tls_offload_context_tx *context,
/* if retransmit_hint is irrelevant start
* from the beggining of the list
*/
- info = list_first_entry(&context->records_list,
- struct tls_record_info, list);
+ info = list_first_entry_or_null(&context->records_list,
+ struct tls_record_info, list);
+ if (!info)
+ return NULL;
record_sn = context->unacked_record_sn;
}
- list_for_each_entry_from(info, &context->records_list, list) {
+ /* We just need the _rcu for the READ_ONCE() */
+ rcu_read_lock();
+ list_for_each_entry_from_rcu(info, &context->records_list, list) {
if (before(seq, info->end_seq)) {
if (!context->retransmit_hint ||
after(info->end_seq,
@@ -556,12 +619,15 @@ struct tls_record_info *tls_get_record(struct tls_offload_context_tx *context,
context->retransmit_hint = info;
}
*p_record_sn = record_sn;
- return info;
+ goto exit_rcu_unlock;
}
record_sn++;
}
+ info = NULL;
- return NULL;
+exit_rcu_unlock:
+ rcu_read_unlock();
+ return info;
}
EXPORT_SYMBOL(tls_get_record);
@@ -575,9 +641,11 @@ static int tls_device_push_pending_record(struct sock *sk, int flags)
void tls_device_write_space(struct sock *sk, struct tls_context *ctx)
{
- if (!sk->sk_write_pending && tls_is_partially_sent_record(ctx)) {
+ if (tls_is_partially_sent_record(ctx)) {
gfp_t sk_allocation = sk->sk_allocation;
+ WARN_ON_ONCE(sk->sk_write_pending);
+
sk->sk_allocation = GFP_ATOMIC;
tls_push_partial_record(sk, ctx,
MSG_DONTWAIT | MSG_NOSIGNAL |
@@ -589,15 +657,19 @@ void tls_device_write_space(struct sock *sk, struct tls_context *ctx)
static void tls_device_resync_rx(struct tls_context *tls_ctx,
struct sock *sk, u32 seq, u8 *rcd_sn)
{
+ struct tls_offload_context_rx *rx_ctx = tls_offload_ctx_rx(tls_ctx);
struct net_device *netdev;
if (WARN_ON(test_and_set_bit(TLS_RX_SYNC_RUNNING, &tls_ctx->flags)))
return;
+
+ trace_tls_device_rx_resync_send(sk, seq, rcd_sn, rx_ctx->resync_type);
netdev = READ_ONCE(tls_ctx->netdev);
if (netdev)
netdev->tlsdev_ops->tls_dev_resync(netdev, sk, seq, rcd_sn,
TLS_OFFLOAD_CTX_DIR_RX);
clear_bit_unlock(TLS_RX_SYNC_RUNNING, &tls_ctx->flags);
+ TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSRXDEVICERESYNC);
}
void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq)
@@ -605,8 +677,8 @@ void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq)
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_offload_context_rx *rx_ctx;
u8 rcd_sn[TLS_MAX_REC_SEQ_SIZE];
+ u32 sock_data, is_req_pending;
struct tls_prot_info *prot;
- u32 is_req_pending;
s64 resync_req;
u32 req_seq;
@@ -635,8 +707,12 @@ void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq)
/* head of next rec is already in, note that the sock_inq will
* include the currently parsed message when called from parser
*/
- if (tcp_inq(sk) > rcd_len)
+ sock_data = tcp_inq(sk);
+ if (sock_data > rcd_len) {
+ trace_tls_device_rx_resync_nh_delay(sk, sock_data,
+ rcd_len);
return;
+ }
rx_ctx->resync_nh_do_now = 0;
seq += rcd_len;
@@ -680,6 +756,7 @@ static void tls_device_core_ctrl_rx_resync(struct tls_context *tls_ctx,
/* head of next rec is already in, parser will sync for us */
if (tcp_inq(sk) > rxm->full_len) {
+ trace_tls_device_rx_resync_nh_schedule(sk);
ctx->resync_nh_do_now = 1;
} else {
struct tls_prot_info *prot = &tls_ctx->prot_info;
@@ -778,9 +855,9 @@ free_buf:
return err;
}
-int tls_device_decrypted(struct sock *sk, struct sk_buff *skb)
+int tls_device_decrypted(struct sock *sk, struct tls_context *tls_ctx,
+ struct sk_buff *skb, struct strp_msg *rxm)
{
- struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_offload_context_rx *ctx = tls_offload_ctx_rx(tls_ctx);
int is_decrypted = skb->decrypted;
int is_encrypted = !is_decrypted;
@@ -792,6 +869,10 @@ int tls_device_decrypted(struct sock *sk, struct sk_buff *skb)
is_encrypted &= !skb_iter->decrypted;
}
+ trace_tls_device_decrypted(sk, tcp_sk(sk)->copied_seq - rxm->full_len,
+ tls_ctx->rx.rec_seq, rxm->full_len,
+ is_encrypted, is_decrypted);
+
ctx->sw.decrypted |= is_decrypted;
/* Return immediately if the record is either entirely plaintext or
@@ -823,7 +904,7 @@ static void tls_device_attach(struct tls_context *ctx, struct sock *sk,
spin_unlock_irq(&tls_device_lock);
ctx->sk_destruct = sk->sk_destruct;
- sk->sk_destruct = tls_device_sk_destruct;
+ smp_store_release(&sk->sk_destruct, tls_device_sk_destruct);
}
}
@@ -838,22 +919,18 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
struct net_device *netdev;
char *iv, *rec_seq;
struct sk_buff *skb;
- int rc = -EINVAL;
__be64 rcd_sn;
+ int rc;
if (!ctx)
- goto out;
+ return -EINVAL;
- if (ctx->priv_ctx_tx) {
- rc = -EEXIST;
- goto out;
- }
+ if (ctx->priv_ctx_tx)
+ return -EEXIST;
start_marker_record = kmalloc(sizeof(*start_marker_record), GFP_KERNEL);
- if (!start_marker_record) {
- rc = -ENOMEM;
- goto out;
- }
+ if (!start_marker_record)
+ return -ENOMEM;
offload_ctx = kzalloc(TLS_OFFLOAD_CONTEXT_SIZE_TX, GFP_KERNEL);
if (!offload_ctx) {
@@ -939,41 +1016,43 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
if (skb)
TCP_SKB_CB(skb)->eor = 1;
- /* We support starting offload on multiple sockets
- * concurrently, so we only need a read lock here.
- * This lock must precede get_netdev_for_sock to prevent races between
- * NETDEV_DOWN and setsockopt.
- */
- down_read(&device_offload_lock);
netdev = get_netdev_for_sock(sk);
if (!netdev) {
pr_err_ratelimited("%s: netdev not found\n", __func__);
rc = -EINVAL;
- goto release_lock;
+ goto disable_cad;
}
if (!(netdev->features & NETIF_F_HW_TLS_TX)) {
- rc = -ENOTSUPP;
+ rc = -EOPNOTSUPP;
goto release_netdev;
}
/* Avoid offloading if the device is down
* We don't want to offload new flows after
* the NETDEV_DOWN event
+ *
+ * device_offload_lock is taken in tls_devices's NETDEV_DOWN
+ * handler thus protecting from the device going down before
+ * ctx was added to tls_device_list.
*/
+ down_read(&device_offload_lock);
if (!(netdev->flags & IFF_UP)) {
rc = -EINVAL;
- goto release_netdev;
+ goto release_lock;
}
ctx->priv_ctx_tx = offload_ctx;
rc = netdev->tlsdev_ops->tls_dev_add(netdev, sk, TLS_OFFLOAD_CTX_DIR_TX,
&ctx->crypto_send.info,
tcp_sk(sk)->write_seq);
+ trace_tls_device_offload_set(sk, TLS_OFFLOAD_CTX_DIR_TX,
+ tcp_sk(sk)->write_seq, rec_seq, rc);
if (rc)
- goto release_netdev;
+ goto release_lock;
tls_device_attach(ctx, sk, netdev);
+ up_read(&device_offload_lock);
/* following this assignment tls_is_sk_tx_device_offloaded
* will return true and the context might be accessed
@@ -981,13 +1060,14 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
*/
smp_store_release(&sk->sk_validate_xmit_skb, tls_validate_xmit_skb);
dev_put(netdev);
- up_read(&device_offload_lock);
- goto out;
-release_netdev:
- dev_put(netdev);
+ return 0;
+
release_lock:
up_read(&device_offload_lock);
+release_netdev:
+ dev_put(netdev);
+disable_cad:
clean_acked_data_disable(inet_csk(sk));
crypto_free_aead(offload_ctx->aead_send);
free_rec_seq:
@@ -999,12 +1079,12 @@ free_offload_ctx:
ctx->priv_ctx_tx = NULL;
free_marker_record:
kfree(start_marker_record);
-out:
return rc;
}
int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx)
{
+ struct tls12_crypto_info_aes_gcm_128 *info;
struct tls_offload_context_rx *context;
struct net_device *netdev;
int rc = 0;
@@ -1012,37 +1092,35 @@ int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx)
if (ctx->crypto_recv.info.version != TLS_1_2_VERSION)
return -EOPNOTSUPP;
- /* We support starting offload on multiple sockets
- * concurrently, so we only need a read lock here.
- * This lock must precede get_netdev_for_sock to prevent races between
- * NETDEV_DOWN and setsockopt.
- */
- down_read(&device_offload_lock);
netdev = get_netdev_for_sock(sk);
if (!netdev) {
pr_err_ratelimited("%s: netdev not found\n", __func__);
- rc = -EINVAL;
- goto release_lock;
+ return -EINVAL;
}
if (!(netdev->features & NETIF_F_HW_TLS_RX)) {
- rc = -ENOTSUPP;
+ rc = -EOPNOTSUPP;
goto release_netdev;
}
/* Avoid offloading if the device is down
* We don't want to offload new flows after
* the NETDEV_DOWN event
+ *
+ * device_offload_lock is taken in tls_devices's NETDEV_DOWN
+ * handler thus protecting from the device going down before
+ * ctx was added to tls_device_list.
*/
+ down_read(&device_offload_lock);
if (!(netdev->flags & IFF_UP)) {
rc = -EINVAL;
- goto release_netdev;
+ goto release_lock;
}
context = kzalloc(TLS_OFFLOAD_CONTEXT_SIZE_RX, GFP_KERNEL);
if (!context) {
rc = -ENOMEM;
- goto release_netdev;
+ goto release_lock;
}
context->resync_nh_reset = 1;
@@ -1054,11 +1132,18 @@ int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx)
rc = netdev->tlsdev_ops->tls_dev_add(netdev, sk, TLS_OFFLOAD_CTX_DIR_RX,
&ctx->crypto_recv.info,
tcp_sk(sk)->copied_seq);
+ info = (void *)&ctx->crypto_recv.info;
+ trace_tls_device_offload_set(sk, TLS_OFFLOAD_CTX_DIR_RX,
+ tcp_sk(sk)->copied_seq, info->rec_seq, rc);
if (rc)
goto free_sw_resources;
tls_device_attach(ctx, sk, netdev);
- goto release_netdev;
+ up_read(&device_offload_lock);
+
+ dev_put(netdev);
+
+ return 0;
free_sw_resources:
up_read(&device_offload_lock);
@@ -1066,10 +1151,10 @@ free_sw_resources:
down_read(&device_offload_lock);
release_ctx:
ctx->priv_ctx_rx = NULL;
-release_netdev:
- dev_put(netdev);
release_lock:
up_read(&device_offload_lock);
+release_netdev:
+ dev_put(netdev);
return rc;
}
diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c
index 9070d68a92a4..28895333701e 100644
--- a/net/tls/tls_device_fallback.c
+++ b/net/tls/tls_device_fallback.c
@@ -273,7 +273,7 @@ static int fill_sg_in(struct scatterlist *sg_in,
__skb_frag_ref(frag);
sg_set_page(sg_in + i, skb_frag_page(frag),
- skb_frag_size(frag), frag->page_offset);
+ skb_frag_size(frag), skb_frag_off(frag));
remaining -= skb_frag_size(frag);
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index 43252a801c3f..94774c0e5ff3 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -39,8 +39,11 @@
#include <linux/netdevice.h>
#include <linux/sched/signal.h>
#include <linux/inetdevice.h>
+#include <linux/inet_diag.h>
+#include <net/snmp.h>
#include <net/tls.h>
+#include <net/tls_toe.h>
MODULE_AUTHOR("Mellanox Technologies");
MODULE_DESCRIPTION("Transport Layer Security Support");
@@ -57,14 +60,12 @@ static struct proto *saved_tcpv6_prot;
static DEFINE_MUTEX(tcpv6_prot_mutex);
static struct proto *saved_tcpv4_prot;
static DEFINE_MUTEX(tcpv4_prot_mutex);
-static LIST_HEAD(device_list);
-static DEFINE_SPINLOCK(device_spinlock);
static struct proto tls_prots[TLS_NUM_PROTS][TLS_NUM_CONFIG][TLS_NUM_CONFIG];
static struct proto_ops tls_sw_proto_ops;
static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG],
struct proto *base);
-static void update_sk_prot(struct sock *sk, struct tls_context *ctx)
+void update_sk_prot(struct sock *sk, struct tls_context *ctx)
{
int ip_ver = sk->sk_family == AF_INET6 ? TLSV6 : TLSV4;
@@ -208,24 +209,15 @@ int tls_push_partial_record(struct sock *sk, struct tls_context *ctx,
return tls_push_sg(sk, ctx, sg, offset, flags);
}
-bool tls_free_partial_record(struct sock *sk, struct tls_context *ctx)
+void tls_free_partial_record(struct sock *sk, struct tls_context *ctx)
{
struct scatterlist *sg;
- sg = ctx->partially_sent_record;
- if (!sg)
- return false;
-
- while (1) {
+ for (sg = ctx->partially_sent_record; sg; sg = sg_next(sg)) {
put_page(sg_page(sg));
sk_mem_uncharge(sk, sg->length);
-
- if (sg_is_last(sg))
- break;
- sg++;
}
ctx->partially_sent_record = NULL;
- return true;
}
static void tls_write_space(struct sock *sk)
@@ -251,14 +243,27 @@ static void tls_write_space(struct sock *sk)
ctx->sk_write_space(sk);
}
-void tls_ctx_free(struct tls_context *ctx)
+/**
+ * tls_ctx_free() - free TLS ULP context
+ * @sk: socket to with @ctx is attached
+ * @ctx: TLS context structure
+ *
+ * Free TLS context. If @sk is %NULL caller guarantees that the socket
+ * to which @ctx was attached has no outstanding references.
+ */
+void tls_ctx_free(struct sock *sk, struct tls_context *ctx)
{
if (!ctx)
return;
memzero_explicit(&ctx->crypto_send, sizeof(ctx->crypto_send));
memzero_explicit(&ctx->crypto_recv, sizeof(ctx->crypto_recv));
- kfree(ctx);
+ mutex_destroy(&ctx->tx_lock);
+
+ if (sk)
+ kfree_rcu(ctx, rcu);
+ else
+ kfree(ctx);
}
static void tls_sk_proto_cleanup(struct sock *sk,
@@ -273,19 +278,19 @@ static void tls_sk_proto_cleanup(struct sock *sk,
kfree(ctx->tx.rec_seq);
kfree(ctx->tx.iv);
tls_sw_release_resources_tx(sk);
-#ifdef CONFIG_TLS_DEVICE
+ TLS_DEC_STATS(sock_net(sk), LINUX_MIB_TLSCURRTXSW);
} else if (ctx->tx_conf == TLS_HW) {
tls_device_free_resources_tx(sk);
-#endif
+ TLS_DEC_STATS(sock_net(sk), LINUX_MIB_TLSCURRTXDEVICE);
}
- if (ctx->rx_conf == TLS_SW)
+ if (ctx->rx_conf == TLS_SW) {
tls_sw_release_resources_rx(sk);
-
-#ifdef CONFIG_TLS_DEVICE
- if (ctx->rx_conf == TLS_HW)
+ TLS_DEC_STATS(sock_net(sk), LINUX_MIB_TLSCURRRXSW);
+ } else if (ctx->rx_conf == TLS_HW) {
tls_device_offload_cleanup_rx(sk);
-#endif
+ TLS_DEC_STATS(sock_net(sk), LINUX_MIB_TLSCURRRXDEVICE);
+ }
}
static void tls_sk_proto_close(struct sock *sk, long timeout)
@@ -306,7 +311,7 @@ static void tls_sk_proto_close(struct sock *sk, long timeout)
write_lock_bh(&sk->sk_callback_lock);
if (free_ctx)
- icsk->icsk_ulp_data = NULL;
+ rcu_assign_pointer(icsk->icsk_ulp_data, NULL);
sk->sk_prot = ctx->sk_proto;
if (sk->sk_write_space == tls_write_space)
sk->sk_write_space = ctx->sk_write_space;
@@ -318,10 +323,10 @@ static void tls_sk_proto_close(struct sock *sk, long timeout)
tls_sw_strparser_done(ctx);
if (ctx->rx_conf == TLS_SW)
tls_sw_free_ctx_rx(ctx);
- ctx->sk_proto_close(sk, timeout);
+ ctx->sk_proto->close(sk, timeout);
if (free_ctx)
- tls_ctx_free(ctx);
+ tls_ctx_free(sk, ctx);
}
static int do_tls_getsockopt_tx(struct sock *sk, char __user *optval,
@@ -438,7 +443,8 @@ static int tls_getsockopt(struct sock *sk, int level, int optname,
struct tls_context *ctx = tls_get_ctx(sk);
if (level != SOL_TLS)
- return ctx->getsockopt(sk, level, optname, optval, optlen);
+ return ctx->sk_proto->getsockopt(sk, level,
+ optname, optval, optlen);
return do_tls_getsockopt(sk, optname, optval, optlen);
}
@@ -481,7 +487,7 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval,
/* check version */
if (crypto_info->version != TLS_1_2_VERSION &&
crypto_info->version != TLS_1_3_VERSION) {
- rc = -ENOTSUPP;
+ rc = -EINVAL;
goto err_crypto_info;
}
@@ -523,29 +529,31 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval,
}
if (tx) {
-#ifdef CONFIG_TLS_DEVICE
rc = tls_set_device_offload(sk, ctx);
conf = TLS_HW;
- if (rc) {
-#else
- {
-#endif
+ if (!rc) {
+ TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSTXDEVICE);
+ TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSCURRTXDEVICE);
+ } else {
rc = tls_set_sw_offload(sk, ctx, 1);
if (rc)
goto err_crypto_info;
+ TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSTXSW);
+ TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSCURRTXSW);
conf = TLS_SW;
}
} else {
-#ifdef CONFIG_TLS_DEVICE
rc = tls_set_device_offload_rx(sk, ctx);
conf = TLS_HW;
- if (rc) {
-#else
- {
-#endif
+ if (!rc) {
+ TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSRXDEVICE);
+ TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSCURRRXDEVICE);
+ } else {
rc = tls_set_sw_offload(sk, ctx, 0);
if (rc)
goto err_crypto_info;
+ TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSRXSW);
+ TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSCURRRXSW);
conf = TLS_SW;
}
tls_sw_strparser_arm(sk, ctx);
@@ -596,12 +604,13 @@ static int tls_setsockopt(struct sock *sk, int level, int optname,
struct tls_context *ctx = tls_get_ctx(sk);
if (level != SOL_TLS)
- return ctx->setsockopt(sk, level, optname, optval, optlen);
+ return ctx->sk_proto->setsockopt(sk, level, optname, optval,
+ optlen);
return do_tls_setsockopt(sk, optname, optval, optlen);
}
-static struct tls_context *create_ctx(struct sock *sk)
+struct tls_context *tls_ctx_create(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tls_context *ctx;
@@ -610,11 +619,9 @@ static struct tls_context *create_ctx(struct sock *sk)
if (!ctx)
return NULL;
- icsk->icsk_ulp_data = ctx;
- ctx->setsockopt = sk->sk_prot->setsockopt;
- ctx->getsockopt = sk->sk_prot->getsockopt;
- ctx->sk_proto_close = sk->sk_prot->close;
- ctx->unhash = sk->sk_prot->unhash;
+ mutex_init(&ctx->tx_lock);
+ rcu_assign_pointer(icsk->icsk_ulp_data, ctx);
+ ctx->sk_proto = sk->sk_prot;
return ctx;
}
@@ -644,93 +651,6 @@ static void tls_build_proto(struct sock *sk)
}
}
-static void tls_hw_sk_destruct(struct sock *sk)
-{
- struct tls_context *ctx = tls_get_ctx(sk);
- struct inet_connection_sock *icsk = inet_csk(sk);
-
- ctx->sk_destruct(sk);
- /* Free ctx */
- tls_ctx_free(ctx);
- icsk->icsk_ulp_data = NULL;
-}
-
-static int tls_hw_prot(struct sock *sk)
-{
- struct tls_context *ctx;
- struct tls_device *dev;
- int rc = 0;
-
- spin_lock_bh(&device_spinlock);
- list_for_each_entry(dev, &device_list, dev_list) {
- if (dev->feature && dev->feature(dev)) {
- ctx = create_ctx(sk);
- if (!ctx)
- goto out;
-
- spin_unlock_bh(&device_spinlock);
- tls_build_proto(sk);
- ctx->hash = sk->sk_prot->hash;
- ctx->unhash = sk->sk_prot->unhash;
- ctx->sk_proto_close = sk->sk_prot->close;
- ctx->sk_destruct = sk->sk_destruct;
- sk->sk_destruct = tls_hw_sk_destruct;
- ctx->rx_conf = TLS_HW_RECORD;
- ctx->tx_conf = TLS_HW_RECORD;
- update_sk_prot(sk, ctx);
- spin_lock_bh(&device_spinlock);
- rc = 1;
- break;
- }
- }
-out:
- spin_unlock_bh(&device_spinlock);
- return rc;
-}
-
-static void tls_hw_unhash(struct sock *sk)
-{
- struct tls_context *ctx = tls_get_ctx(sk);
- struct tls_device *dev;
-
- spin_lock_bh(&device_spinlock);
- list_for_each_entry(dev, &device_list, dev_list) {
- if (dev->unhash) {
- kref_get(&dev->kref);
- spin_unlock_bh(&device_spinlock);
- dev->unhash(dev, sk);
- kref_put(&dev->kref, dev->release);
- spin_lock_bh(&device_spinlock);
- }
- }
- spin_unlock_bh(&device_spinlock);
- ctx->unhash(sk);
-}
-
-static int tls_hw_hash(struct sock *sk)
-{
- struct tls_context *ctx = tls_get_ctx(sk);
- struct tls_device *dev;
- int err;
-
- err = ctx->hash(sk);
- spin_lock_bh(&device_spinlock);
- list_for_each_entry(dev, &device_list, dev_list) {
- if (dev->hash) {
- kref_get(&dev->kref);
- spin_unlock_bh(&device_spinlock);
- err |= dev->hash(dev, sk);
- kref_put(&dev->kref, dev->release);
- spin_lock_bh(&device_spinlock);
- }
- }
- spin_unlock_bh(&device_spinlock);
-
- if (err)
- tls_hw_unhash(sk);
- return err;
-}
-
static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG],
struct proto *base)
{
@@ -768,10 +688,11 @@ static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG],
prot[TLS_HW][TLS_HW] = prot[TLS_HW][TLS_SW];
#endif
-
+#ifdef CONFIG_TLS_TOE
prot[TLS_HW_RECORD][TLS_HW_RECORD] = *base;
- prot[TLS_HW_RECORD][TLS_HW_RECORD].hash = tls_hw_hash;
- prot[TLS_HW_RECORD][TLS_HW_RECORD].unhash = tls_hw_unhash;
+ prot[TLS_HW_RECORD][TLS_HW_RECORD].hash = tls_toe_hash;
+ prot[TLS_HW_RECORD][TLS_HW_RECORD].unhash = tls_toe_unhash;
+#endif
}
static int tls_init(struct sock *sk)
@@ -779,8 +700,12 @@ static int tls_init(struct sock *sk)
struct tls_context *ctx;
int rc = 0;
- if (tls_hw_prot(sk))
+ tls_build_proto(sk);
+
+#ifdef CONFIG_TLS_TOE
+ if (tls_toe_bypass(sk))
return 0;
+#endif
/* The TLS ulp is currently supported only for TCP sockets
* in ESTABLISHED state.
@@ -789,13 +714,11 @@ static int tls_init(struct sock *sk)
* share the ulp context.
*/
if (sk->sk_state != TCP_ESTABLISHED)
- return -ENOTSUPP;
-
- tls_build_proto(sk);
+ return -ENOTCONN;
/* allocate tls context */
write_lock_bh(&sk->sk_callback_lock);
- ctx = create_ctx(sk);
+ ctx = tls_ctx_create(sk);
if (!ctx) {
rc = -ENOMEM;
goto out;
@@ -803,57 +726,139 @@ static int tls_init(struct sock *sk)
ctx->tx_conf = TLS_BASE;
ctx->rx_conf = TLS_BASE;
- ctx->sk_proto = sk->sk_prot;
update_sk_prot(sk, ctx);
out:
write_unlock_bh(&sk->sk_callback_lock);
return rc;
}
-static void tls_update(struct sock *sk, struct proto *p)
+static void tls_update(struct sock *sk, struct proto *p,
+ void (*write_space)(struct sock *sk))
{
struct tls_context *ctx;
ctx = tls_get_ctx(sk);
if (likely(ctx)) {
- ctx->sk_proto_close = p->close;
+ ctx->sk_write_space = write_space;
ctx->sk_proto = p;
} else {
sk->sk_prot = p;
+ sk->sk_write_space = write_space;
}
}
-void tls_register_device(struct tls_device *device)
+static int tls_get_info(const struct sock *sk, struct sk_buff *skb)
{
- spin_lock_bh(&device_spinlock);
- list_add_tail(&device->dev_list, &device_list);
- spin_unlock_bh(&device_spinlock);
+ u16 version, cipher_type;
+ struct tls_context *ctx;
+ struct nlattr *start;
+ int err;
+
+ start = nla_nest_start_noflag(skb, INET_ULP_INFO_TLS);
+ if (!start)
+ return -EMSGSIZE;
+
+ rcu_read_lock();
+ ctx = rcu_dereference(inet_csk(sk)->icsk_ulp_data);
+ if (!ctx) {
+ err = 0;
+ goto nla_failure;
+ }
+ version = ctx->prot_info.version;
+ if (version) {
+ err = nla_put_u16(skb, TLS_INFO_VERSION, version);
+ if (err)
+ goto nla_failure;
+ }
+ cipher_type = ctx->prot_info.cipher_type;
+ if (cipher_type) {
+ err = nla_put_u16(skb, TLS_INFO_CIPHER, cipher_type);
+ if (err)
+ goto nla_failure;
+ }
+ err = nla_put_u16(skb, TLS_INFO_TXCONF, tls_user_config(ctx, true));
+ if (err)
+ goto nla_failure;
+
+ err = nla_put_u16(skb, TLS_INFO_RXCONF, tls_user_config(ctx, false));
+ if (err)
+ goto nla_failure;
+
+ rcu_read_unlock();
+ nla_nest_end(skb, start);
+ return 0;
+
+nla_failure:
+ rcu_read_unlock();
+ nla_nest_cancel(skb, start);
+ return err;
}
-EXPORT_SYMBOL(tls_register_device);
-void tls_unregister_device(struct tls_device *device)
+static size_t tls_get_info_size(const struct sock *sk)
{
- spin_lock_bh(&device_spinlock);
- list_del(&device->dev_list);
- spin_unlock_bh(&device_spinlock);
+ size_t size = 0;
+
+ size += nla_total_size(0) + /* INET_ULP_INFO_TLS */
+ nla_total_size(sizeof(u16)) + /* TLS_INFO_VERSION */
+ nla_total_size(sizeof(u16)) + /* TLS_INFO_CIPHER */
+ nla_total_size(sizeof(u16)) + /* TLS_INFO_RXCONF */
+ nla_total_size(sizeof(u16)) + /* TLS_INFO_TXCONF */
+ 0;
+
+ return size;
}
-EXPORT_SYMBOL(tls_unregister_device);
+
+static int __net_init tls_init_net(struct net *net)
+{
+ int err;
+
+ net->mib.tls_statistics = alloc_percpu(struct linux_tls_mib);
+ if (!net->mib.tls_statistics)
+ return -ENOMEM;
+
+ err = tls_proc_init(net);
+ if (err)
+ goto err_free_stats;
+
+ return 0;
+err_free_stats:
+ free_percpu(net->mib.tls_statistics);
+ return err;
+}
+
+static void __net_exit tls_exit_net(struct net *net)
+{
+ tls_proc_fini(net);
+ free_percpu(net->mib.tls_statistics);
+}
+
+static struct pernet_operations tls_proc_ops = {
+ .init = tls_init_net,
+ .exit = tls_exit_net,
+};
static struct tcp_ulp_ops tcp_tls_ulp_ops __read_mostly = {
.name = "tls",
.owner = THIS_MODULE,
.init = tls_init,
.update = tls_update,
+ .get_info = tls_get_info,
+ .get_info_size = tls_get_info_size,
};
static int __init tls_register(void)
{
+ int err;
+
+ err = register_pernet_subsys(&tls_proc_ops);
+ if (err)
+ return err;
+
tls_sw_proto_ops = inet_stream_ops;
tls_sw_proto_ops.splice_read = tls_sw_splice_read;
+ tls_sw_proto_ops.sendpage_locked = tls_sw_sendpage_locked,
-#ifdef CONFIG_TLS_DEVICE
tls_device_init();
-#endif
tcp_register_ulp(&tcp_tls_ulp_ops);
return 0;
@@ -862,9 +867,8 @@ static int __init tls_register(void)
static void __exit tls_unregister(void)
{
tcp_unregister_ulp(&tcp_tls_ulp_ops);
-#ifdef CONFIG_TLS_DEVICE
tls_device_cleanup();
-#endif
+ unregister_pernet_subsys(&tls_proc_ops);
}
module_init(tls_register);
diff --git a/net/tls/tls_proc.c b/net/tls/tls_proc.c
new file mode 100644
index 000000000000..3a5dd1e07233
--- /dev/null
+++ b/net/tls/tls_proc.c
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+/* Copyright (C) 2019 Netronome Systems, Inc. */
+
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <net/snmp.h>
+#include <net/tls.h>
+
+#ifdef CONFIG_PROC_FS
+static const struct snmp_mib tls_mib_list[] = {
+ SNMP_MIB_ITEM("TlsCurrTxSw", LINUX_MIB_TLSCURRTXSW),
+ SNMP_MIB_ITEM("TlsCurrRxSw", LINUX_MIB_TLSCURRRXSW),
+ SNMP_MIB_ITEM("TlsCurrTxDevice", LINUX_MIB_TLSCURRTXDEVICE),
+ SNMP_MIB_ITEM("TlsCurrRxDevice", LINUX_MIB_TLSCURRRXDEVICE),
+ SNMP_MIB_ITEM("TlsTxSw", LINUX_MIB_TLSTXSW),
+ SNMP_MIB_ITEM("TlsRxSw", LINUX_MIB_TLSRXSW),
+ SNMP_MIB_ITEM("TlsTxDevice", LINUX_MIB_TLSTXDEVICE),
+ SNMP_MIB_ITEM("TlsRxDevice", LINUX_MIB_TLSRXDEVICE),
+ SNMP_MIB_ITEM("TlsDecryptError", LINUX_MIB_TLSDECRYPTERROR),
+ SNMP_MIB_ITEM("TlsRxDeviceResync", LINUX_MIB_TLSRXDEVICERESYNC),
+ SNMP_MIB_SENTINEL
+};
+
+static int tls_statistics_seq_show(struct seq_file *seq, void *v)
+{
+ unsigned long buf[LINUX_MIB_TLSMAX] = {};
+ struct net *net = seq->private;
+ int i;
+
+ snmp_get_cpu_field_batch(buf, tls_mib_list, net->mib.tls_statistics);
+ for (i = 0; tls_mib_list[i].name; i++)
+ seq_printf(seq, "%-32s\t%lu\n", tls_mib_list[i].name, buf[i]);
+
+ return 0;
+}
+#endif
+
+int __net_init tls_proc_init(struct net *net)
+{
+ if (!proc_create_net_single("tls_stat", 0444, net->proc_net,
+ tls_statistics_seq_show, NULL))
+ return -ENOMEM;
+ return 0;
+}
+
+void __net_exit tls_proc_fini(struct net *net)
+{
+ remove_proc_entry("tls_stat", net->proc_net);
+}
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 91d21b048a9b..c98e602a1a2d 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -168,6 +168,9 @@ static void tls_decrypt_done(struct crypto_async_request *req, int err)
/* Propagate if there was an err */
if (err) {
+ if (err == -EBADMSG)
+ TLS_INC_STATS(sock_net(skb->sk),
+ LINUX_MIB_TLSDECRYPTERROR);
ctx->async_wait.err = err;
tls_err_abort(skb->sk, err);
} else {
@@ -677,12 +680,32 @@ static int tls_push_record(struct sock *sk, int flags,
split_point = msg_pl->apply_bytes;
split = split_point && split_point < msg_pl->sg.size;
+ if (unlikely((!split &&
+ msg_pl->sg.size +
+ prot->overhead_size > msg_en->sg.size) ||
+ (split &&
+ split_point +
+ prot->overhead_size > msg_en->sg.size))) {
+ split = true;
+ split_point = msg_en->sg.size;
+ }
if (split) {
rc = tls_split_open_record(sk, rec, &tmp, msg_pl, msg_en,
split_point, prot->overhead_size,
&orig_end);
if (rc < 0)
return rc;
+ /* This can happen if above tls_split_open_record allocates
+ * a single large encryption buffer instead of two smaller
+ * ones. In this case adjust pointers and continue without
+ * split.
+ */
+ if (!msg_pl->sg.size) {
+ tls_merge_open_record(sk, rec, tmp, orig_end);
+ msg_pl = &rec->msg_plaintext;
+ msg_en = &rec->msg_encrypted;
+ split = false;
+ }
sk_msg_trim(sk, msg_en, msg_pl->sg.size +
prot->overhead_size);
}
@@ -704,9 +727,14 @@ static int tls_push_record(struct sock *sk, int flags,
sg_mark_end(sk_msg_elem(msg_pl, i));
}
+ if (msg_pl->sg.end < msg_pl->sg.start) {
+ sg_chain(&msg_pl->sg.data[msg_pl->sg.start],
+ MAX_SKB_FRAGS - msg_pl->sg.start + 1,
+ msg_pl->sg.data);
+ }
+
i = msg_pl->sg.start;
- sg_chain(rec->sg_aead_in, 2, rec->inplace_crypto ?
- &msg_en->sg.data[i] : &msg_pl->sg.data[i]);
+ sg_chain(rec->sg_aead_in, 2, &msg_pl->sg.data[i]);
i = msg_en->sg.end;
sk_msg_iter_var_prev(i);
@@ -766,17 +794,20 @@ static int bpf_exec_tx_verdict(struct sk_msg *msg, struct sock *sk,
policy = !(flags & MSG_SENDPAGE_NOPOLICY);
psock = sk_psock_get(sk);
- if (!psock || !policy)
- return tls_push_record(sk, flags, record_type);
+ if (!psock || !policy) {
+ err = tls_push_record(sk, flags, record_type);
+ if (err && err != -EINPROGRESS) {
+ *copied -= sk_msg_free(sk, msg);
+ tls_free_open_rec(sk);
+ }
+ return err;
+ }
more_data:
enospc = sk_msg_full(msg);
if (psock->eval == __SK_NONE) {
delta = msg->sg.size;
psock->eval = sk_psock_msg_verdict(sk, psock, msg);
- if (delta < msg->sg.size)
- delta -= msg->sg.size;
- else
- delta = 0;
+ delta -= msg->sg.size;
}
if (msg->cork_bytes && msg->cork_bytes > msg->sg.size &&
!enospc && !full_record) {
@@ -791,7 +822,7 @@ more_data:
switch (psock->eval) {
case __SK_PASS:
err = tls_push_record(sk, flags, record_type);
- if (err < 0) {
+ if (err && err != -EINPROGRESS) {
*copied -= sk_msg_free(sk, msg);
tls_free_open_rec(sk);
goto out_err;
@@ -895,17 +926,11 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
int ret = 0;
if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL))
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
+ mutex_lock(&tls_ctx->tx_lock);
lock_sock(sk);
- /* Wait till there is any pending write on socket */
- if (unlikely(sk->sk_write_pending)) {
- ret = wait_on_pending_writer(sk, &timeo);
- if (unlikely(ret))
- goto send_end;
- }
-
if (unlikely(msg->msg_controllen)) {
ret = tls_proccess_cmsg(sk, msg, &record_type);
if (ret) {
@@ -971,8 +996,6 @@ alloc_encrypted:
if (ret)
goto fallback_to_reg_send;
- rec->inplace_crypto = 0;
-
num_zc++;
copied += try_to_copy;
@@ -985,7 +1008,7 @@ alloc_encrypted:
num_async++;
else if (ret == -ENOMEM)
goto wait_for_memory;
- else if (ret == -ENOSPC)
+ else if (ctx->open_rec && ret == -ENOSPC)
goto rollback_iter;
else if (ret != -EAGAIN)
goto send_end;
@@ -1054,11 +1077,12 @@ wait_for_memory:
ret = sk_stream_wait_memory(sk, &timeo);
if (ret) {
trim_sgl:
- tls_trim_both_msgs(sk, orig_size);
+ if (ctx->open_rec)
+ tls_trim_both_msgs(sk, orig_size);
goto send_end;
}
- if (msg_en->sg.size < required_size)
+ if (ctx->open_rec && msg_en->sg.size < required_size)
goto alloc_encrypted;
}
@@ -1091,6 +1115,7 @@ send_end:
ret = sk_stream_error(sk, msg->msg_flags, ret);
release_sock(sk);
+ mutex_unlock(&tls_ctx->tx_lock);
return copied ? copied : ret;
}
@@ -1114,13 +1139,6 @@ static int tls_sw_do_sendpage(struct sock *sk, struct page *page,
eor = !(flags & (MSG_MORE | MSG_SENDPAGE_NOTLAST));
sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
- /* Wait till there is any pending write on socket */
- if (unlikely(sk->sk_write_pending)) {
- ret = wait_on_pending_writer(sk, &timeo);
- if (unlikely(ret))
- goto sendpage_end;
- }
-
/* Call the sk_stream functions to manage the sndbuf mem. */
while (size > 0) {
size_t copy, required_size;
@@ -1176,7 +1194,6 @@ alloc_payload:
tls_ctx->pending_open_record_frags = true;
if (full_record || eor || sk_msg_full(msg_pl)) {
- rec->inplace_crypto = 0;
ret = bpf_exec_tx_verdict(msg_pl, sk, full_record,
record_type, &copied, flags);
if (ret) {
@@ -1197,11 +1214,13 @@ wait_for_sndbuf:
wait_for_memory:
ret = sk_stream_wait_memory(sk, &timeo);
if (ret) {
- tls_trim_both_msgs(sk, msg_pl->sg.size);
+ if (ctx->open_rec)
+ tls_trim_both_msgs(sk, msg_pl->sg.size);
goto sendpage_end;
}
- goto alloc_payload;
+ if (ctx->open_rec)
+ goto alloc_payload;
}
if (num_async) {
@@ -1216,18 +1235,32 @@ sendpage_end:
return copied ? copied : ret;
}
+int tls_sw_sendpage_locked(struct sock *sk, struct page *page,
+ int offset, size_t size, int flags)
+{
+ if (flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL |
+ MSG_SENDPAGE_NOTLAST | MSG_SENDPAGE_NOPOLICY |
+ MSG_NO_SHARED_FRAGS))
+ return -EOPNOTSUPP;
+
+ return tls_sw_do_sendpage(sk, page, offset, size, flags);
+}
+
int tls_sw_sendpage(struct sock *sk, struct page *page,
int offset, size_t size, int flags)
{
+ struct tls_context *tls_ctx = tls_get_ctx(sk);
int ret;
if (flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL |
MSG_SENDPAGE_NOTLAST | MSG_SENDPAGE_NOPOLICY))
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
+ mutex_lock(&tls_ctx->tx_lock);
lock_sock(sk);
ret = tls_sw_do_sendpage(sk, page, offset, size, flags);
release_sock(sk);
+ mutex_unlock(&tls_ctx->tx_lock);
return ret;
}
@@ -1489,13 +1522,12 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb,
int pad, err = 0;
if (!ctx->decrypted) {
-#ifdef CONFIG_TLS_DEVICE
if (tls_ctx->rx_conf == TLS_HW) {
- err = tls_device_decrypted(sk, skb);
+ err = tls_device_decrypted(sk, tls_ctx, skb, rxm);
if (err < 0)
return err;
}
-#endif
+
/* Still not decrypted after tls_device */
if (!ctx->decrypted) {
err = decrypt_internal(sk, skb, dest, NULL, chunk, zc,
@@ -1504,7 +1536,9 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb,
if (err == -EINPROGRESS)
tls_advance_record_sn(sk, prot,
&tls_ctx->rx);
-
+ else if (err == -EBADMSG)
+ TLS_INC_STATS(sock_net(sk),
+ LINUX_MIB_TLSDECRYPTERROR);
return err;
}
} else {
@@ -1519,7 +1553,7 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb,
rxm->offset += prot->prepend_size;
rxm->full_len -= prot->overhead_size;
tls_advance_record_sn(sk, prot, &tls_ctx->rx);
- ctx->decrypted = true;
+ ctx->decrypted = 1;
ctx->saved_data_ready(sk);
} else {
*zc = false;
@@ -1921,7 +1955,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos,
/* splice does not support reading control messages */
if (ctx->control != TLS_RECORD_TYPE_DATA) {
- err = -ENOTSUPP;
+ err = -EINVAL;
goto splice_read_end;
}
@@ -1929,7 +1963,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos,
tls_err_abort(sk, EBADMSG);
goto splice_read_end;
}
- ctx->decrypted = true;
+ ctx->decrypted = 1;
}
rxm = strp_msg(skb);
@@ -2014,10 +2048,9 @@ static int tls_read_size(struct strparser *strp, struct sk_buff *skb)
ret = -EINVAL;
goto read_failure;
}
-#ifdef CONFIG_TLS_DEVICE
+
tls_device_rx_resync_new_rec(strp->sk, data_len + TLS_HEADER_SIZE,
TCP_SKB_CB(skb)->seq + rxm->offset);
-#endif
return data_len + TLS_HEADER_SIZE;
read_failure:
@@ -2031,7 +2064,7 @@ static void tls_queue(struct strparser *strp, struct sk_buff *skb)
struct tls_context *tls_ctx = tls_get_ctx(strp->sk);
struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
- ctx->decrypted = false;
+ ctx->decrypted = 0;
ctx->recv_pkt = skb;
strp_pause(strp);
@@ -2079,7 +2112,8 @@ void tls_sw_release_resources_tx(struct sock *sk)
/* Free up un-sent records in tx_list. First, free
* the partially sent record if any at head of tx_list.
*/
- if (tls_free_partial_record(sk, tls_ctx)) {
+ if (tls_ctx->partially_sent_record) {
+ tls_free_partial_record(sk, tls_ctx);
rec = list_first_entry(&ctx->tx_list,
struct tls_rec, list);
list_del(&rec->list);
@@ -2172,9 +2206,11 @@ static void tx_work_handler(struct work_struct *work)
if (!test_and_clear_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask))
return;
+ mutex_lock(&tls_ctx->tx_lock);
lock_sock(sk);
tls_tx_records(sk, -1);
release_sock(sk);
+ mutex_unlock(&tls_ctx->tx_lock);
}
void tls_sw_write_space(struct sock *sk, struct tls_context *ctx)
@@ -2182,12 +2218,9 @@ void tls_sw_write_space(struct sock *sk, struct tls_context *ctx)
struct tls_sw_context_tx *tx_ctx = tls_sw_ctx_tx(ctx);
/* Schedule the transmission if tx list is ready */
- if (is_tx_ready(tx_ctx) && !sk->sk_write_pending) {
- /* Schedule the transmission */
- if (!test_and_set_bit(BIT_TX_SCHEDULED,
- &tx_ctx->tx_bitmask))
- schedule_delayed_work(&tx_ctx->tx_work.work, 0);
- }
+ if (is_tx_ready(tx_ctx) &&
+ !test_and_set_bit(BIT_TX_SCHEDULED, &tx_ctx->tx_bitmask))
+ schedule_delayed_work(&tx_ctx->tx_work.work, 0);
}
void tls_sw_strparser_arm(struct sock *sk, struct tls_context *tls_ctx)
@@ -2388,10 +2421,11 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
tfm = crypto_aead_tfm(sw_ctx_rx->aead_recv);
if (crypto_info->version == TLS_1_3_VERSION)
- sw_ctx_rx->async_capable = false;
+ sw_ctx_rx->async_capable = 0;
else
sw_ctx_rx->async_capable =
- tfm->__crt_alg->cra_flags & CRYPTO_ALG_ASYNC;
+ !!(tfm->__crt_alg->cra_flags &
+ CRYPTO_ALG_ASYNC);
/* Set up strparser */
memset(&cb, 0, sizeof(cb));
diff --git a/net/tls/tls_toe.c b/net/tls/tls_toe.c
new file mode 100644
index 000000000000..7e1330f19165
--- /dev/null
+++ b/net/tls/tls_toe.c
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2016-2017, Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016-2017, Dave Watson <davejwatson@fb.com>. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/list.h>
+#include <linux/rcupdate.h>
+#include <linux/spinlock.h>
+#include <net/inet_connection_sock.h>
+#include <net/tls.h>
+#include <net/tls_toe.h>
+
+static LIST_HEAD(device_list);
+static DEFINE_SPINLOCK(device_spinlock);
+
+static void tls_toe_sk_destruct(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tls_context *ctx = tls_get_ctx(sk);
+
+ ctx->sk_destruct(sk);
+ /* Free ctx */
+ rcu_assign_pointer(icsk->icsk_ulp_data, NULL);
+ tls_ctx_free(sk, ctx);
+}
+
+int tls_toe_bypass(struct sock *sk)
+{
+ struct tls_toe_device *dev;
+ struct tls_context *ctx;
+ int rc = 0;
+
+ spin_lock_bh(&device_spinlock);
+ list_for_each_entry(dev, &device_list, dev_list) {
+ if (dev->feature && dev->feature(dev)) {
+ ctx = tls_ctx_create(sk);
+ if (!ctx)
+ goto out;
+
+ ctx->sk_destruct = sk->sk_destruct;
+ sk->sk_destruct = tls_toe_sk_destruct;
+ ctx->rx_conf = TLS_HW_RECORD;
+ ctx->tx_conf = TLS_HW_RECORD;
+ update_sk_prot(sk, ctx);
+ rc = 1;
+ break;
+ }
+ }
+out:
+ spin_unlock_bh(&device_spinlock);
+ return rc;
+}
+
+void tls_toe_unhash(struct sock *sk)
+{
+ struct tls_context *ctx = tls_get_ctx(sk);
+ struct tls_toe_device *dev;
+
+ spin_lock_bh(&device_spinlock);
+ list_for_each_entry(dev, &device_list, dev_list) {
+ if (dev->unhash) {
+ kref_get(&dev->kref);
+ spin_unlock_bh(&device_spinlock);
+ dev->unhash(dev, sk);
+ kref_put(&dev->kref, dev->release);
+ spin_lock_bh(&device_spinlock);
+ }
+ }
+ spin_unlock_bh(&device_spinlock);
+ ctx->sk_proto->unhash(sk);
+}
+
+int tls_toe_hash(struct sock *sk)
+{
+ struct tls_context *ctx = tls_get_ctx(sk);
+ struct tls_toe_device *dev;
+ int err;
+
+ err = ctx->sk_proto->hash(sk);
+ spin_lock_bh(&device_spinlock);
+ list_for_each_entry(dev, &device_list, dev_list) {
+ if (dev->hash) {
+ kref_get(&dev->kref);
+ spin_unlock_bh(&device_spinlock);
+ err |= dev->hash(dev, sk);
+ kref_put(&dev->kref, dev->release);
+ spin_lock_bh(&device_spinlock);
+ }
+ }
+ spin_unlock_bh(&device_spinlock);
+
+ if (err)
+ tls_toe_unhash(sk);
+ return err;
+}
+
+void tls_toe_register_device(struct tls_toe_device *device)
+{
+ spin_lock_bh(&device_spinlock);
+ list_add_tail(&device->dev_list, &device_list);
+ spin_unlock_bh(&device_spinlock);
+}
+EXPORT_SYMBOL(tls_toe_register_device);
+
+void tls_toe_unregister_device(struct tls_toe_device *device)
+{
+ spin_lock_bh(&device_spinlock);
+ list_del(&device->dev_list);
+ spin_unlock_bh(&device_spinlock);
+}
+EXPORT_SYMBOL(tls_toe_unregister_device);
diff --git a/net/tls/trace.c b/net/tls/trace.c
new file mode 100644
index 000000000000..e374913cf9c9
--- /dev/null
+++ b/net/tls/trace.c
@@ -0,0 +1,10 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+/* Copyright (C) 2019 Netronome Systems, Inc. */
+
+#include <linux/module.h>
+
+#ifndef __CHECKER__
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+
+#endif
diff --git a/net/tls/trace.h b/net/tls/trace.h
new file mode 100644
index 000000000000..9ba5f600ea43
--- /dev/null
+++ b/net/tls/trace.h
@@ -0,0 +1,202 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/* Copyright (C) 2019 Netronome Systems, Inc. */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM tls
+
+#if !defined(_TLS_TRACE_H_) || defined(TRACE_HEADER_MULTI_READ)
+#define _TLS_TRACE_H_
+
+#include <asm/unaligned.h>
+#include <linux/tracepoint.h>
+
+struct sock;
+
+TRACE_EVENT(tls_device_offload_set,
+
+ TP_PROTO(struct sock *sk, int dir, u32 tcp_seq, u8 *rec_no, int ret),
+
+ TP_ARGS(sk, dir, tcp_seq, rec_no, ret),
+
+ TP_STRUCT__entry(
+ __field( struct sock *, sk )
+ __field( u64, rec_no )
+ __field( int, dir )
+ __field( u32, tcp_seq )
+ __field( int, ret )
+ ),
+
+ TP_fast_assign(
+ __entry->sk = sk;
+ __entry->rec_no = get_unaligned_be64(rec_no);
+ __entry->dir = dir;
+ __entry->tcp_seq = tcp_seq;
+ __entry->ret = ret;
+ ),
+
+ TP_printk(
+ "sk=%p direction=%d tcp_seq=%u rec_no=%llu ret=%d",
+ __entry->sk, __entry->dir, __entry->tcp_seq, __entry->rec_no,
+ __entry->ret
+ )
+);
+
+TRACE_EVENT(tls_device_decrypted,
+
+ TP_PROTO(struct sock *sk, u32 tcp_seq, u8 *rec_no, u32 rec_len,
+ bool encrypted, bool decrypted),
+
+ TP_ARGS(sk, tcp_seq, rec_no, rec_len, encrypted, decrypted),
+
+ TP_STRUCT__entry(
+ __field( struct sock *, sk )
+ __field( u64, rec_no )
+ __field( u32, tcp_seq )
+ __field( u32, rec_len )
+ __field( bool, encrypted )
+ __field( bool, decrypted )
+ ),
+
+ TP_fast_assign(
+ __entry->sk = sk;
+ __entry->rec_no = get_unaligned_be64(rec_no);
+ __entry->tcp_seq = tcp_seq;
+ __entry->rec_len = rec_len;
+ __entry->encrypted = encrypted;
+ __entry->decrypted = decrypted;
+ ),
+
+ TP_printk(
+ "sk=%p tcp_seq=%u rec_no=%llu len=%u encrypted=%d decrypted=%d",
+ __entry->sk, __entry->tcp_seq,
+ __entry->rec_no, __entry->rec_len,
+ __entry->encrypted, __entry->decrypted
+ )
+);
+
+TRACE_EVENT(tls_device_rx_resync_send,
+
+ TP_PROTO(struct sock *sk, u32 tcp_seq, u8 *rec_no, int sync_type),
+
+ TP_ARGS(sk, tcp_seq, rec_no, sync_type),
+
+ TP_STRUCT__entry(
+ __field( struct sock *, sk )
+ __field( u64, rec_no )
+ __field( u32, tcp_seq )
+ __field( int, sync_type )
+ ),
+
+ TP_fast_assign(
+ __entry->sk = sk;
+ __entry->rec_no = get_unaligned_be64(rec_no);
+ __entry->tcp_seq = tcp_seq;
+ __entry->sync_type = sync_type;
+ ),
+
+ TP_printk(
+ "sk=%p tcp_seq=%u rec_no=%llu sync_type=%d",
+ __entry->sk, __entry->tcp_seq, __entry->rec_no,
+ __entry->sync_type
+ )
+);
+
+TRACE_EVENT(tls_device_rx_resync_nh_schedule,
+
+ TP_PROTO(struct sock *sk),
+
+ TP_ARGS(sk),
+
+ TP_STRUCT__entry(
+ __field( struct sock *, sk )
+ ),
+
+ TP_fast_assign(
+ __entry->sk = sk;
+ ),
+
+ TP_printk(
+ "sk=%p", __entry->sk
+ )
+);
+
+TRACE_EVENT(tls_device_rx_resync_nh_delay,
+
+ TP_PROTO(struct sock *sk, u32 sock_data, u32 rec_len),
+
+ TP_ARGS(sk, sock_data, rec_len),
+
+ TP_STRUCT__entry(
+ __field( struct sock *, sk )
+ __field( u32, sock_data )
+ __field( u32, rec_len )
+ ),
+
+ TP_fast_assign(
+ __entry->sk = sk;
+ __entry->sock_data = sock_data;
+ __entry->rec_len = rec_len;
+ ),
+
+ TP_printk(
+ "sk=%p sock_data=%u rec_len=%u",
+ __entry->sk, __entry->sock_data, __entry->rec_len
+ )
+);
+
+TRACE_EVENT(tls_device_tx_resync_req,
+
+ TP_PROTO(struct sock *sk, u32 tcp_seq, u32 exp_tcp_seq),
+
+ TP_ARGS(sk, tcp_seq, exp_tcp_seq),
+
+ TP_STRUCT__entry(
+ __field( struct sock *, sk )
+ __field( u32, tcp_seq )
+ __field( u32, exp_tcp_seq )
+ ),
+
+ TP_fast_assign(
+ __entry->sk = sk;
+ __entry->tcp_seq = tcp_seq;
+ __entry->exp_tcp_seq = exp_tcp_seq;
+ ),
+
+ TP_printk(
+ "sk=%p tcp_seq=%u exp_tcp_seq=%u",
+ __entry->sk, __entry->tcp_seq, __entry->exp_tcp_seq
+ )
+);
+
+TRACE_EVENT(tls_device_tx_resync_send,
+
+ TP_PROTO(struct sock *sk, u32 tcp_seq, u8 *rec_no),
+
+ TP_ARGS(sk, tcp_seq, rec_no),
+
+ TP_STRUCT__entry(
+ __field( struct sock *, sk )
+ __field( u64, rec_no )
+ __field( u32, tcp_seq )
+ ),
+
+ TP_fast_assign(
+ __entry->sk = sk;
+ __entry->rec_no = get_unaligned_be64(rec_no);
+ __entry->tcp_seq = tcp_seq;
+ ),
+
+ TP_printk(
+ "sk=%p tcp_seq=%u rec_no=%llu",
+ __entry->sk, __entry->tcp_seq, __entry->rec_no
+ )
+);
+
+#endif /* _TLS_TRACE_H_ */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+
+#include <trace/define_trace.h>
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 67e87db5877f..62c12cb5763e 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -189,11 +189,17 @@ static inline int unix_may_send(struct sock *sk, struct sock *osk)
return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
}
-static inline int unix_recvq_full(struct sock const *sk)
+static inline int unix_recvq_full(const struct sock *sk)
{
return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
}
+static inline int unix_recvq_full_lockless(const struct sock *sk)
+{
+ return skb_queue_len_lockless(&sk->sk_receive_queue) >
+ READ_ONCE(sk->sk_max_ack_backlog);
+}
+
struct sock *unix_peer_get(struct sock *s)
{
struct sock *peer;
@@ -284,11 +290,9 @@ static struct sock *__unix_find_socket_byname(struct net *net,
if (u->addr->len == len &&
!memcmp(u->addr->name, sunname, len))
- goto found;
+ return s;
}
- s = NULL;
-found:
- return s;
+ return NULL;
}
static inline struct sock *unix_find_socket_byname(struct net *net,
@@ -646,6 +650,9 @@ static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
static __poll_t unix_dgram_poll(struct file *, struct socket *,
poll_table *);
static int unix_ioctl(struct socket *, unsigned int, unsigned long);
+#ifdef CONFIG_COMPAT
+static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
+#endif
static int unix_shutdown(struct socket *, int);
static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
@@ -675,6 +682,16 @@ static int unix_set_peek_off(struct sock *sk, int val)
return 0;
}
+static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ struct unix_sock *u;
+
+ if (sk) {
+ u = unix_sk(sock->sk);
+ seq_printf(m, "scm_fds: %u\n", READ_ONCE(u->scm_stat.nr_fds));
+ }
+}
static const struct proto_ops unix_stream_ops = {
.family = PF_UNIX,
@@ -687,6 +704,9 @@ static const struct proto_ops unix_stream_ops = {
.getname = unix_getname,
.poll = unix_poll,
.ioctl = unix_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = unix_compat_ioctl,
+#endif
.listen = unix_listen,
.shutdown = unix_shutdown,
.setsockopt = sock_no_setsockopt,
@@ -697,6 +717,7 @@ static const struct proto_ops unix_stream_ops = {
.sendpage = unix_stream_sendpage,
.splice_read = unix_stream_splice_read,
.set_peek_off = unix_set_peek_off,
+ .show_fdinfo = unix_show_fdinfo,
};
static const struct proto_ops unix_dgram_ops = {
@@ -710,6 +731,9 @@ static const struct proto_ops unix_dgram_ops = {
.getname = unix_getname,
.poll = unix_dgram_poll,
.ioctl = unix_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = unix_compat_ioctl,
+#endif
.listen = sock_no_listen,
.shutdown = unix_shutdown,
.setsockopt = sock_no_setsockopt,
@@ -719,6 +743,7 @@ static const struct proto_ops unix_dgram_ops = {
.mmap = sock_no_mmap,
.sendpage = sock_no_sendpage,
.set_peek_off = unix_set_peek_off,
+ .show_fdinfo = unix_show_fdinfo,
};
static const struct proto_ops unix_seqpacket_ops = {
@@ -732,6 +757,9 @@ static const struct proto_ops unix_seqpacket_ops = {
.getname = unix_getname,
.poll = unix_dgram_poll,
.ioctl = unix_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = unix_compat_ioctl,
+#endif
.listen = unix_listen,
.shutdown = unix_shutdown,
.setsockopt = sock_no_setsockopt,
@@ -741,6 +769,7 @@ static const struct proto_ops unix_seqpacket_ops = {
.mmap = sock_no_mmap,
.sendpage = sock_no_sendpage,
.set_peek_off = unix_set_peek_off,
+ .show_fdinfo = unix_show_fdinfo,
};
static struct proto unix_proto = {
@@ -778,6 +807,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
mutex_init(&u->bindlock); /* single task binding lock */
init_waitqueue_head(&u->peer_wait);
init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
+ memset(&u->scm_stat, 0, sizeof(struct scm_stat));
unix_insert_socket(unix_sockets_unbound(sk), sk);
out:
if (sk == NULL)
@@ -1562,6 +1592,28 @@ static bool unix_skb_scm_eq(struct sk_buff *skb,
unix_secdata_eq(scm, skb);
}
+static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
+{
+ struct scm_fp_list *fp = UNIXCB(skb).fp;
+ struct unix_sock *u = unix_sk(sk);
+
+ lockdep_assert_held(&sk->sk_receive_queue.lock);
+
+ if (unlikely(fp && fp->count))
+ u->scm_stat.nr_fds += fp->count;
+}
+
+static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
+{
+ struct scm_fp_list *fp = UNIXCB(skb).fp;
+ struct unix_sock *u = unix_sk(sk);
+
+ lockdep_assert_held(&sk->sk_receive_queue.lock);
+
+ if (unlikely(fp && fp->count))
+ u->scm_stat.nr_fds -= fp->count;
+}
+
/*
* Send AF_UNIX data.
*/
@@ -1712,7 +1764,8 @@ restart_locked:
* - unix_peer(sk) == sk by time of get but disconnected before lock
*/
if (other != sk &&
- unlikely(unix_peer(other) != sk && unix_recvq_full(other))) {
+ unlikely(unix_peer(other) != sk &&
+ unix_recvq_full_lockless(other))) {
if (timeo) {
timeo = unix_wait_for_peer(other, timeo);
@@ -1747,7 +1800,10 @@ restart_locked:
if (sock_flag(other, SOCK_RCVTSTAMP))
__net_timestamp(skb);
maybe_add_creds(skb, sock, other);
- skb_queue_tail(&other->sk_receive_queue, skb);
+ spin_lock(&other->sk_receive_queue.lock);
+ scm_stat_add(other, skb);
+ __skb_queue_tail(&other->sk_receive_queue, skb);
+ spin_unlock(&other->sk_receive_queue.lock);
unix_state_unlock(other);
other->sk_data_ready(other);
sock_put(other);
@@ -1849,7 +1905,10 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
goto pipe_err_free;
maybe_add_creds(skb, sock, other);
- skb_queue_tail(&other->sk_receive_queue, skb);
+ spin_lock(&other->sk_receive_queue.lock);
+ scm_stat_add(other, skb);
+ __skb_queue_tail(&other->sk_receive_queue, skb);
+ spin_unlock(&other->sk_receive_queue.lock);
unix_state_unlock(other);
other->sk_data_ready(other);
sent += size;
@@ -2048,8 +2107,8 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
mutex_lock(&u->iolock);
skip = sk_peek_offset(sk, flags);
- skb = __skb_try_recv_datagram(sk, flags, NULL, &skip, &err,
- &last);
+ skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
+ scm_stat_del, &skip, &err, &last);
if (skb)
break;
@@ -2058,7 +2117,8 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
if (err != -EAGAIN)
break;
} while (timeo &&
- !__skb_wait_for_more_packets(sk, &err, &timeo, last));
+ !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
+ &err, &timeo, last));
if (!skb) { /* implies iolock unlocked */
unix_state_lock(sk);
@@ -2343,8 +2403,12 @@ unlock:
sk_peek_offset_bwd(sk, chunk);
- if (UNIXCB(skb).fp)
+ if (UNIXCB(skb).fp) {
+ spin_lock(&sk->sk_receive_queue.lock);
+ scm_stat_del(sk, skb);
+ spin_unlock(&sk->sk_receive_queue.lock);
unix_detach_fds(&scm, skb);
+ }
if (unix_skb_len(skb))
break;
@@ -2582,6 +2646,13 @@ static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
return err;
}
+#ifdef CONFIG_COMPAT
+static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
+}
+#endif
+
static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
{
struct sock *sk = sock->sk;
@@ -2599,7 +2670,7 @@ static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wa
mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
/* readable? */
- if (!skb_queue_empty(&sk->sk_receive_queue))
+ if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
mask |= EPOLLIN | EPOLLRDNORM;
/* Connection-based need to check for termination and startup */
@@ -2628,7 +2699,7 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
mask = 0;
/* exceptional events? */
- if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
+ if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
mask |= EPOLLERR |
(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
@@ -2638,7 +2709,7 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
mask |= EPOLLHUP;
/* readable? */
- if (!skb_queue_empty(&sk->sk_receive_queue))
+ if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
mask |= EPOLLIN | EPOLLRDNORM;
/* Connection-based need to check for termination and startup */
@@ -2848,7 +2919,7 @@ static int __init af_unix_init(void)
{
int rc = -1;
- BUILD_BUG_ON(sizeof(struct unix_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb));
+ BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
rc = proto_register(&unix_proto, 1);
if (rc != 0) {
diff --git a/net/vmw_vsock/Kconfig b/net/vmw_vsock/Kconfig
index 8abcb815af2d..56356d2980c8 100644
--- a/net/vmw_vsock/Kconfig
+++ b/net/vmw_vsock/Kconfig
@@ -26,6 +26,18 @@ config VSOCKETS_DIAG
Enable this module so userspace applications can query open sockets.
+config VSOCKETS_LOOPBACK
+ tristate "Virtual Sockets loopback transport"
+ depends on VSOCKETS
+ default y
+ select VIRTIO_VSOCKETS_COMMON
+ help
+ This module implements a loopback transport for Virtual Sockets,
+ using vmw_vsock_virtio_transport_common.
+
+ To compile this driver as a module, choose M here: the module
+ will be called vsock_loopback. If unsure, say N.
+
config VMWARE_VMCI_VSOCKETS
tristate "VMware VMCI transport for Virtual Sockets"
depends on VSOCKETS && VMWARE_VMCI
diff --git a/net/vmw_vsock/Makefile b/net/vmw_vsock/Makefile
index 7c6f9a0b67b0..6a943ec95c4a 100644
--- a/net/vmw_vsock/Makefile
+++ b/net/vmw_vsock/Makefile
@@ -5,6 +5,7 @@ obj-$(CONFIG_VMWARE_VMCI_VSOCKETS) += vmw_vsock_vmci_transport.o
obj-$(CONFIG_VIRTIO_VSOCKETS) += vmw_vsock_virtio_transport.o
obj-$(CONFIG_VIRTIO_VSOCKETS_COMMON) += vmw_vsock_virtio_transport_common.o
obj-$(CONFIG_HYPERV_VSOCKETS) += hv_sock.o
+obj-$(CONFIG_VSOCKETS_LOOPBACK) += vsock_loopback.o
vsock-y += af_vsock.o af_vsock_tap.o vsock_addr.o
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index ab47bf3ab66e..9c5b2a91baad 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -126,19 +126,20 @@ static struct proto vsock_proto = {
*/
#define VSOCK_DEFAULT_CONNECT_TIMEOUT (2 * HZ)
-static const struct vsock_transport *transport;
+#define VSOCK_DEFAULT_BUFFER_SIZE (1024 * 256)
+#define VSOCK_DEFAULT_BUFFER_MAX_SIZE (1024 * 256)
+#define VSOCK_DEFAULT_BUFFER_MIN_SIZE 128
+
+/* Transport used for host->guest communication */
+static const struct vsock_transport *transport_h2g;
+/* Transport used for guest->host communication */
+static const struct vsock_transport *transport_g2h;
+/* Transport used for DGRAM communication */
+static const struct vsock_transport *transport_dgram;
+/* Transport used for local communication */
+static const struct vsock_transport *transport_local;
static DEFINE_MUTEX(vsock_register_mutex);
-/**** EXPORTS ****/
-
-/* Get the ID of the local context. This is transport dependent. */
-
-int vm_sockets_get_local_cid(void)
-{
- return transport->get_local_cid();
-}
-EXPORT_SYMBOL_GPL(vm_sockets_get_local_cid);
-
/**** UTILS ****/
/* Each bound VSocket is stored in the bind hash table and each connected
@@ -188,7 +189,7 @@ static int vsock_auto_bind(struct vsock_sock *vsk)
return __vsock_bind(sk, &local_addr);
}
-static int __init vsock_init_tables(void)
+static void vsock_init_tables(void)
{
int i;
@@ -197,7 +198,6 @@ static int __init vsock_init_tables(void)
for (i = 0; i < ARRAY_SIZE(vsock_connected_table); i++)
INIT_LIST_HEAD(&vsock_connected_table[i]);
- return 0;
}
static void __vsock_insert_bound(struct list_head *list,
@@ -230,10 +230,16 @@ static struct sock *__vsock_find_bound_socket(struct sockaddr_vm *addr)
{
struct vsock_sock *vsk;
- list_for_each_entry(vsk, vsock_bound_sockets(addr), bound_table)
- if (addr->svm_port == vsk->local_addr.svm_port)
+ list_for_each_entry(vsk, vsock_bound_sockets(addr), bound_table) {
+ if (vsock_addr_equals_addr(addr, &vsk->local_addr))
return sk_vsock(vsk);
+ if (addr->svm_port == vsk->local_addr.svm_port &&
+ (vsk->local_addr.svm_cid == VMADDR_CID_ANY ||
+ addr->svm_cid == VMADDR_CID_ANY))
+ return sk_vsock(vsk);
+ }
+
return NULL;
}
@@ -382,6 +388,106 @@ void vsock_enqueue_accept(struct sock *listener, struct sock *connected)
}
EXPORT_SYMBOL_GPL(vsock_enqueue_accept);
+static bool vsock_use_local_transport(unsigned int remote_cid)
+{
+ if (!transport_local)
+ return false;
+
+ if (remote_cid == VMADDR_CID_LOCAL)
+ return true;
+
+ if (transport_g2h) {
+ return remote_cid == transport_g2h->get_local_cid();
+ } else {
+ return remote_cid == VMADDR_CID_HOST;
+ }
+}
+
+static void vsock_deassign_transport(struct vsock_sock *vsk)
+{
+ if (!vsk->transport)
+ return;
+
+ vsk->transport->destruct(vsk);
+ module_put(vsk->transport->module);
+ vsk->transport = NULL;
+}
+
+/* Assign a transport to a socket and call the .init transport callback.
+ *
+ * Note: for stream socket this must be called when vsk->remote_addr is set
+ * (e.g. during the connect() or when a connection request on a listener
+ * socket is received).
+ * The vsk->remote_addr is used to decide which transport to use:
+ * - remote CID == VMADDR_CID_LOCAL or g2h->local_cid or VMADDR_CID_HOST if
+ * g2h is not loaded, will use local transport;
+ * - remote CID <= VMADDR_CID_HOST will use guest->host transport;
+ * - remote CID > VMADDR_CID_HOST will use host->guest transport;
+ */
+int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk)
+{
+ const struct vsock_transport *new_transport;
+ struct sock *sk = sk_vsock(vsk);
+ unsigned int remote_cid = vsk->remote_addr.svm_cid;
+ int ret;
+
+ switch (sk->sk_type) {
+ case SOCK_DGRAM:
+ new_transport = transport_dgram;
+ break;
+ case SOCK_STREAM:
+ if (vsock_use_local_transport(remote_cid))
+ new_transport = transport_local;
+ else if (remote_cid <= VMADDR_CID_HOST)
+ new_transport = transport_g2h;
+ else
+ new_transport = transport_h2g;
+ break;
+ default:
+ return -ESOCKTNOSUPPORT;
+ }
+
+ if (vsk->transport) {
+ if (vsk->transport == new_transport)
+ return 0;
+
+ vsk->transport->release(vsk);
+ vsock_deassign_transport(vsk);
+ }
+
+ /* We increase the module refcnt to prevent the transport unloading
+ * while there are open sockets assigned to it.
+ */
+ if (!new_transport || !try_module_get(new_transport->module))
+ return -ENODEV;
+
+ ret = new_transport->init(vsk, psk);
+ if (ret) {
+ module_put(new_transport->module);
+ return ret;
+ }
+
+ vsk->transport = new_transport;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(vsock_assign_transport);
+
+bool vsock_find_cid(unsigned int cid)
+{
+ if (transport_g2h && cid == transport_g2h->get_local_cid())
+ return true;
+
+ if (transport_h2g && cid == VMADDR_CID_HOST)
+ return true;
+
+ if (transport_local && cid == VMADDR_CID_LOCAL)
+ return true;
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(vsock_find_cid);
+
static struct sock *vsock_dequeue_accept(struct sock *listener)
{
struct vsock_sock *vlistener;
@@ -418,7 +524,12 @@ static bool vsock_is_pending(struct sock *sk)
static int vsock_send_shutdown(struct sock *sk, int mode)
{
- return transport->shutdown(vsock_sk(sk), mode);
+ struct vsock_sock *vsk = vsock_sk(sk);
+
+ if (!vsk->transport)
+ return -ENODEV;
+
+ return vsk->transport->shutdown(vsk, mode);
}
static void vsock_pending_work(struct work_struct *work)
@@ -439,7 +550,7 @@ static void vsock_pending_work(struct work_struct *work)
if (vsock_is_pending(sk)) {
vsock_remove_pending(listener, sk);
- listener->sk_ack_backlog--;
+ sk_acceptq_removed(listener);
} else if (!vsk->rejected) {
/* We are not on the pending list and accept() did not reject
* us, so we must have been accepted by our user process. We
@@ -528,13 +639,12 @@ static int __vsock_bind_stream(struct vsock_sock *vsk,
static int __vsock_bind_dgram(struct vsock_sock *vsk,
struct sockaddr_vm *addr)
{
- return transport->dgram_bind(vsk, addr);
+ return vsk->transport->dgram_bind(vsk, addr);
}
static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr)
{
struct vsock_sock *vsk = vsock_sk(sk);
- u32 cid;
int retval;
/* First ensure this socket isn't already bound. */
@@ -544,10 +654,9 @@ static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr)
/* Now bind to the provided address or select appropriate values if
* none are provided (VMADDR_CID_ANY and VMADDR_PORT_ANY). Note that
* like AF_INET prevents binding to a non-local IP address (in most
- * cases), we only allow binding to the local CID.
+ * cases), we only allow binding to a local CID.
*/
- cid = transport->get_local_cid();
- if (addr->svm_cid != cid && addr->svm_cid != VMADDR_CID_ANY)
+ if (addr->svm_cid != VMADDR_CID_ANY && !vsock_find_cid(addr->svm_cid))
return -EADDRNOTAVAIL;
switch (sk->sk_socket->type) {
@@ -571,12 +680,12 @@ static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr)
static void vsock_connect_timeout(struct work_struct *work);
-struct sock *__vsock_create(struct net *net,
- struct socket *sock,
- struct sock *parent,
- gfp_t priority,
- unsigned short type,
- int kern)
+static struct sock *__vsock_create(struct net *net,
+ struct socket *sock,
+ struct sock *parent,
+ gfp_t priority,
+ unsigned short type,
+ int kern)
{
struct sock *sk;
struct vsock_sock *psk;
@@ -620,46 +729,52 @@ struct sock *__vsock_create(struct net *net,
vsk->trusted = psk->trusted;
vsk->owner = get_cred(psk->owner);
vsk->connect_timeout = psk->connect_timeout;
+ vsk->buffer_size = psk->buffer_size;
+ vsk->buffer_min_size = psk->buffer_min_size;
+ vsk->buffer_max_size = psk->buffer_max_size;
} else {
vsk->trusted = capable(CAP_NET_ADMIN);
vsk->owner = get_current_cred();
vsk->connect_timeout = VSOCK_DEFAULT_CONNECT_TIMEOUT;
+ vsk->buffer_size = VSOCK_DEFAULT_BUFFER_SIZE;
+ vsk->buffer_min_size = VSOCK_DEFAULT_BUFFER_MIN_SIZE;
+ vsk->buffer_max_size = VSOCK_DEFAULT_BUFFER_MAX_SIZE;
}
- if (transport->init(vsk, psk) < 0) {
- sk_free(sk);
- return NULL;
- }
-
- if (sock)
- vsock_insert_unbound(vsk);
-
return sk;
}
-EXPORT_SYMBOL_GPL(__vsock_create);
-static void __vsock_release(struct sock *sk)
+static void __vsock_release(struct sock *sk, int level)
{
if (sk) {
- struct sk_buff *skb;
struct sock *pending;
struct vsock_sock *vsk;
vsk = vsock_sk(sk);
pending = NULL; /* Compiler warning. */
- transport->release(vsk);
-
- lock_sock(sk);
+ /* The release call is supposed to use lock_sock_nested()
+ * rather than lock_sock(), if a sock lock should be acquired.
+ */
+ if (vsk->transport)
+ vsk->transport->release(vsk);
+ else if (sk->sk_type == SOCK_STREAM)
+ vsock_remove_sock(vsk);
+
+ /* When "level" is SINGLE_DEPTH_NESTING, use the nested
+ * version to avoid the warning "possible recursive locking
+ * detected". When "level" is 0, lock_sock_nested(sk, level)
+ * is the same as lock_sock(sk).
+ */
+ lock_sock_nested(sk, level);
sock_orphan(sk);
sk->sk_shutdown = SHUTDOWN_MASK;
- while ((skb = skb_dequeue(&sk->sk_receive_queue)))
- kfree_skb(skb);
+ skb_queue_purge(&sk->sk_receive_queue);
/* Clean up any sockets that never were accepted. */
while ((pending = vsock_dequeue_accept(sk)) != NULL) {
- __vsock_release(pending);
+ __vsock_release(pending, SINGLE_DEPTH_NESTING);
sock_put(pending);
}
@@ -672,7 +787,7 @@ static void vsock_sk_destruct(struct sock *sk)
{
struct vsock_sock *vsk = vsock_sk(sk);
- transport->destruct(vsk);
+ vsock_deassign_transport(vsk);
/* When clearing these addresses, there's no need to set the family and
* possibly register the address family with the kernel.
@@ -694,21 +809,28 @@ static int vsock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
return err;
}
+struct sock *vsock_create_connected(struct sock *parent)
+{
+ return __vsock_create(sock_net(parent), NULL, parent, GFP_KERNEL,
+ parent->sk_type, 0);
+}
+EXPORT_SYMBOL_GPL(vsock_create_connected);
+
s64 vsock_stream_has_data(struct vsock_sock *vsk)
{
- return transport->stream_has_data(vsk);
+ return vsk->transport->stream_has_data(vsk);
}
EXPORT_SYMBOL_GPL(vsock_stream_has_data);
s64 vsock_stream_has_space(struct vsock_sock *vsk)
{
- return transport->stream_has_space(vsk);
+ return vsk->transport->stream_has_space(vsk);
}
EXPORT_SYMBOL_GPL(vsock_stream_has_space);
static int vsock_release(struct socket *sock)
{
- __vsock_release(sock->sk);
+ __vsock_release(sock->sk, 0);
sock->sk = NULL;
sock->state = SS_FREE;
@@ -862,7 +984,7 @@ static __poll_t vsock_poll(struct file *file, struct socket *sock,
* the queue and write as long as the socket isn't shutdown for
* sending.
*/
- if (!skb_queue_empty(&sk->sk_receive_queue) ||
+ if (!skb_queue_empty_lockless(&sk->sk_receive_queue) ||
(sk->sk_shutdown & RCV_SHUTDOWN)) {
mask |= EPOLLIN | EPOLLRDNORM;
}
@@ -871,6 +993,7 @@ static __poll_t vsock_poll(struct file *file, struct socket *sock,
mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
} else if (sock->type == SOCK_STREAM) {
+ const struct vsock_transport *transport = vsk->transport;
lock_sock(sk);
/* Listening sockets that have connections in their accept
@@ -881,7 +1004,7 @@ static __poll_t vsock_poll(struct file *file, struct socket *sock,
mask |= EPOLLIN | EPOLLRDNORM;
/* If there is something in the queue then we can read. */
- if (transport->stream_is_active(vsk) &&
+ if (transport && transport->stream_is_active(vsk) &&
!(sk->sk_shutdown & RCV_SHUTDOWN)) {
bool data_ready_now = false;
int ret = transport->notify_poll_in(
@@ -946,6 +1069,7 @@ static int vsock_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
struct sock *sk;
struct vsock_sock *vsk;
struct sockaddr_vm *remote_addr;
+ const struct vsock_transport *transport;
if (msg->msg_flags & MSG_OOB)
return -EOPNOTSUPP;
@@ -954,6 +1078,7 @@ static int vsock_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
err = 0;
sk = sock->sk;
vsk = vsock_sk(sk);
+ transport = vsk->transport;
lock_sock(sk);
@@ -1038,8 +1163,8 @@ static int vsock_dgram_connect(struct socket *sock,
if (err)
goto out;
- if (!transport->dgram_allow(remote_addr->svm_cid,
- remote_addr->svm_port)) {
+ if (!vsk->transport->dgram_allow(remote_addr->svm_cid,
+ remote_addr->svm_port)) {
err = -EINVAL;
goto out;
}
@@ -1055,7 +1180,9 @@ out:
static int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
size_t len, int flags)
{
- return transport->dgram_dequeue(vsock_sk(sock->sk), msg, len, flags);
+ struct vsock_sock *vsk = vsock_sk(sock->sk);
+
+ return vsk->transport->dgram_dequeue(vsk, msg, len, flags);
}
static const struct proto_ops vsock_dgram_ops = {
@@ -1081,6 +1208,8 @@ static const struct proto_ops vsock_dgram_ops = {
static int vsock_transport_cancel_pkt(struct vsock_sock *vsk)
{
+ const struct vsock_transport *transport = vsk->transport;
+
if (!transport->cancel_pkt)
return -EOPNOTSUPP;
@@ -1117,6 +1246,7 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr,
int err;
struct sock *sk;
struct vsock_sock *vsk;
+ const struct vsock_transport *transport;
struct sockaddr_vm *remote_addr;
long timeout;
DEFINE_WAIT(wait);
@@ -1151,19 +1281,26 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr,
goto out;
}
+ /* Set the remote address that we are connecting to. */
+ memcpy(&vsk->remote_addr, remote_addr,
+ sizeof(vsk->remote_addr));
+
+ err = vsock_assign_transport(vsk, NULL);
+ if (err)
+ goto out;
+
+ transport = vsk->transport;
+
/* The hypervisor and well-known contexts do not have socket
* endpoints.
*/
- if (!transport->stream_allow(remote_addr->svm_cid,
+ if (!transport ||
+ !transport->stream_allow(remote_addr->svm_cid,
remote_addr->svm_port)) {
err = -ENETUNREACH;
goto out;
}
- /* Set the remote address that we are connecting to. */
- memcpy(&vsk->remote_addr, remote_addr,
- sizeof(vsk->remote_addr));
-
err = vsock_auto_bind(vsk);
if (err)
goto out;
@@ -1293,7 +1430,7 @@ static int vsock_accept(struct socket *sock, struct socket *newsock, int flags,
err = -listener->sk_err;
if (connected) {
- listener->sk_ack_backlog--;
+ sk_acceptq_removed(listener);
lock_sock_nested(connected, SINGLE_DEPTH_NESTING);
vconnected = vsock_sk(connected);
@@ -1358,6 +1495,23 @@ out:
return err;
}
+static void vsock_update_buffer_size(struct vsock_sock *vsk,
+ const struct vsock_transport *transport,
+ u64 val)
+{
+ if (val > vsk->buffer_max_size)
+ val = vsk->buffer_max_size;
+
+ if (val < vsk->buffer_min_size)
+ val = vsk->buffer_min_size;
+
+ if (val != vsk->buffer_size &&
+ transport && transport->notify_buffer_size)
+ transport->notify_buffer_size(vsk, &val);
+
+ vsk->buffer_size = val;
+}
+
static int vsock_stream_setsockopt(struct socket *sock,
int level,
int optname,
@@ -1367,6 +1521,7 @@ static int vsock_stream_setsockopt(struct socket *sock,
int err;
struct sock *sk;
struct vsock_sock *vsk;
+ const struct vsock_transport *transport;
u64 val;
if (level != AF_VSOCK)
@@ -1387,23 +1542,26 @@ static int vsock_stream_setsockopt(struct socket *sock,
err = 0;
sk = sock->sk;
vsk = vsock_sk(sk);
+ transport = vsk->transport;
lock_sock(sk);
switch (optname) {
case SO_VM_SOCKETS_BUFFER_SIZE:
COPY_IN(val);
- transport->set_buffer_size(vsk, val);
+ vsock_update_buffer_size(vsk, transport, val);
break;
case SO_VM_SOCKETS_BUFFER_MAX_SIZE:
COPY_IN(val);
- transport->set_max_buffer_size(vsk, val);
+ vsk->buffer_max_size = val;
+ vsock_update_buffer_size(vsk, transport, vsk->buffer_size);
break;
case SO_VM_SOCKETS_BUFFER_MIN_SIZE:
COPY_IN(val);
- transport->set_min_buffer_size(vsk, val);
+ vsk->buffer_min_size = val;
+ vsock_update_buffer_size(vsk, transport, vsk->buffer_size);
break;
case SO_VM_SOCKETS_CONNECT_TIMEOUT: {
@@ -1470,17 +1628,17 @@ static int vsock_stream_getsockopt(struct socket *sock,
switch (optname) {
case SO_VM_SOCKETS_BUFFER_SIZE:
- val = transport->get_buffer_size(vsk);
+ val = vsk->buffer_size;
COPY_OUT(val);
break;
case SO_VM_SOCKETS_BUFFER_MAX_SIZE:
- val = transport->get_max_buffer_size(vsk);
+ val = vsk->buffer_max_size;
COPY_OUT(val);
break;
case SO_VM_SOCKETS_BUFFER_MIN_SIZE:
- val = transport->get_min_buffer_size(vsk);
+ val = vsk->buffer_min_size;
COPY_OUT(val);
break;
@@ -1511,6 +1669,7 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg,
{
struct sock *sk;
struct vsock_sock *vsk;
+ const struct vsock_transport *transport;
ssize_t total_written;
long timeout;
int err;
@@ -1519,6 +1678,7 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg,
sk = sock->sk;
vsk = vsock_sk(sk);
+ transport = vsk->transport;
total_written = 0;
err = 0;
@@ -1540,7 +1700,7 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg,
goto out;
}
- if (sk->sk_state != TCP_ESTABLISHED ||
+ if (!transport || sk->sk_state != TCP_ESTABLISHED ||
!vsock_addr_bound(&vsk->local_addr)) {
err = -ENOTCONN;
goto out;
@@ -1650,6 +1810,7 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
{
struct sock *sk;
struct vsock_sock *vsk;
+ const struct vsock_transport *transport;
int err;
size_t target;
ssize_t copied;
@@ -1660,11 +1821,12 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
sk = sock->sk;
vsk = vsock_sk(sk);
+ transport = vsk->transport;
err = 0;
lock_sock(sk);
- if (sk->sk_state != TCP_ESTABLISHED) {
+ if (!transport || sk->sk_state != TCP_ESTABLISHED) {
/* Recvmsg is supposed to return 0 if a peer performs an
* orderly shutdown. Differentiate between that case and when a
* peer has not connected or a local shutdown occured with the
@@ -1838,6 +2000,10 @@ static const struct proto_ops vsock_stream_ops = {
static int vsock_create(struct net *net, struct socket *sock,
int protocol, int kern)
{
+ struct vsock_sock *vsk;
+ struct sock *sk;
+ int ret;
+
if (!sock)
return -EINVAL;
@@ -1857,7 +2023,23 @@ static int vsock_create(struct net *net, struct socket *sock,
sock->state = SS_UNCONNECTED;
- return __vsock_create(net, sock, NULL, GFP_KERNEL, 0, kern) ? 0 : -ENOMEM;
+ sk = __vsock_create(net, sock, NULL, GFP_KERNEL, 0, kern);
+ if (!sk)
+ return -ENOMEM;
+
+ vsk = vsock_sk(sk);
+
+ if (sock->type == SOCK_DGRAM) {
+ ret = vsock_assign_transport(vsk, NULL);
+ if (ret < 0) {
+ sock_put(sk);
+ return ret;
+ }
+ }
+
+ vsock_insert_unbound(vsk);
+
+ return 0;
}
static const struct net_proto_family vsock_family_ops = {
@@ -1870,11 +2052,20 @@ static long vsock_dev_do_ioctl(struct file *filp,
unsigned int cmd, void __user *ptr)
{
u32 __user *p = ptr;
+ u32 cid = VMADDR_CID_ANY;
int retval = 0;
switch (cmd) {
case IOCTL_VM_SOCKETS_GET_LOCAL_CID:
- if (put_user(transport->get_local_cid(), p) != 0)
+ /* To be compatible with the VMCI behavior, we prioritize the
+ * guest CID instead of well-know host CID (VMADDR_CID_HOST).
+ */
+ if (transport_g2h)
+ cid = transport_g2h->get_local_cid();
+ else if (transport_h2g)
+ cid = transport_h2g->get_local_cid();
+
+ if (put_user(cid, p) != 0)
retval = -EFAULT;
break;
@@ -1914,24 +2105,13 @@ static struct miscdevice vsock_device = {
.fops = &vsock_device_ops,
};
-int __vsock_core_init(const struct vsock_transport *t, struct module *owner)
+static int __init vsock_init(void)
{
- int err = mutex_lock_interruptible(&vsock_register_mutex);
+ int err = 0;
- if (err)
- return err;
-
- if (transport) {
- err = -EBUSY;
- goto err_busy;
- }
-
- /* Transport must be the owner of the protocol so that it can't
- * unload while there are open sockets.
- */
- vsock_proto.owner = owner;
- transport = t;
+ vsock_init_tables();
+ vsock_proto.owner = THIS_MODULE;
vsock_device.minor = MISC_DYNAMIC_MINOR;
err = misc_register(&vsock_device);
if (err) {
@@ -1952,7 +2132,6 @@ int __vsock_core_init(const struct vsock_transport *t, struct module *owner)
goto err_unregister_proto;
}
- mutex_unlock(&vsock_register_mutex);
return 0;
err_unregister_proto:
@@ -1960,44 +2139,99 @@ err_unregister_proto:
err_deregister_misc:
misc_deregister(&vsock_device);
err_reset_transport:
- transport = NULL;
-err_busy:
- mutex_unlock(&vsock_register_mutex);
return err;
}
-EXPORT_SYMBOL_GPL(__vsock_core_init);
-void vsock_core_exit(void)
+static void __exit vsock_exit(void)
{
- mutex_lock(&vsock_register_mutex);
-
misc_deregister(&vsock_device);
sock_unregister(AF_VSOCK);
proto_unregister(&vsock_proto);
-
- /* We do not want the assignment below re-ordered. */
- mb();
- transport = NULL;
-
- mutex_unlock(&vsock_register_mutex);
}
-EXPORT_SYMBOL_GPL(vsock_core_exit);
-const struct vsock_transport *vsock_core_get_transport(void)
+const struct vsock_transport *vsock_core_get_transport(struct vsock_sock *vsk)
{
- /* vsock_register_mutex not taken since only the transport uses this
- * function and only while registered.
- */
- return transport;
+ return vsk->transport;
}
EXPORT_SYMBOL_GPL(vsock_core_get_transport);
-static void __exit vsock_exit(void)
+int vsock_core_register(const struct vsock_transport *t, int features)
+{
+ const struct vsock_transport *t_h2g, *t_g2h, *t_dgram, *t_local;
+ int err = mutex_lock_interruptible(&vsock_register_mutex);
+
+ if (err)
+ return err;
+
+ t_h2g = transport_h2g;
+ t_g2h = transport_g2h;
+ t_dgram = transport_dgram;
+ t_local = transport_local;
+
+ if (features & VSOCK_TRANSPORT_F_H2G) {
+ if (t_h2g) {
+ err = -EBUSY;
+ goto err_busy;
+ }
+ t_h2g = t;
+ }
+
+ if (features & VSOCK_TRANSPORT_F_G2H) {
+ if (t_g2h) {
+ err = -EBUSY;
+ goto err_busy;
+ }
+ t_g2h = t;
+ }
+
+ if (features & VSOCK_TRANSPORT_F_DGRAM) {
+ if (t_dgram) {
+ err = -EBUSY;
+ goto err_busy;
+ }
+ t_dgram = t;
+ }
+
+ if (features & VSOCK_TRANSPORT_F_LOCAL) {
+ if (t_local) {
+ err = -EBUSY;
+ goto err_busy;
+ }
+ t_local = t;
+ }
+
+ transport_h2g = t_h2g;
+ transport_g2h = t_g2h;
+ transport_dgram = t_dgram;
+ transport_local = t_local;
+
+err_busy:
+ mutex_unlock(&vsock_register_mutex);
+ return err;
+}
+EXPORT_SYMBOL_GPL(vsock_core_register);
+
+void vsock_core_unregister(const struct vsock_transport *t)
{
- /* Do nothing. This function makes this module removable. */
+ mutex_lock(&vsock_register_mutex);
+
+ if (transport_h2g == t)
+ transport_h2g = NULL;
+
+ if (transport_g2h == t)
+ transport_g2h = NULL;
+
+ if (transport_dgram == t)
+ transport_dgram = NULL;
+
+ if (transport_local == t)
+ transport_local = NULL;
+
+ mutex_unlock(&vsock_register_mutex);
}
+EXPORT_SYMBOL_GPL(vsock_core_unregister);
-module_init(vsock_init_tables);
+module_init(vsock_init);
module_exit(vsock_exit);
MODULE_AUTHOR("VMware, Inc.");
diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c
index 9d864ebeb7b3..3492c021925f 100644
--- a/net/vmw_vsock/hyperv_transport.c
+++ b/net/vmw_vsock/hyperv_transport.c
@@ -13,15 +13,16 @@
#include <linux/hyperv.h>
#include <net/sock.h>
#include <net/af_vsock.h>
+#include <asm/hyperv-tlfs.h>
/* Older (VMBUS version 'VERSION_WIN10' or before) Windows hosts have some
- * stricter requirements on the hv_sock ring buffer size of six 4K pages. Newer
- * hosts don't have this limitation; but, keep the defaults the same for compat.
+ * stricter requirements on the hv_sock ring buffer size of six 4K pages.
+ * hyperv-tlfs defines HV_HYP_PAGE_SIZE as 4K. Newer hosts don't have this
+ * limitation; but, keep the defaults the same for compat.
*/
-#define PAGE_SIZE_4K 4096
-#define RINGBUFFER_HVS_RCV_SIZE (PAGE_SIZE_4K * 6)
-#define RINGBUFFER_HVS_SND_SIZE (PAGE_SIZE_4K * 6)
-#define RINGBUFFER_HVS_MAX_SIZE (PAGE_SIZE_4K * 64)
+#define RINGBUFFER_HVS_RCV_SIZE (HV_HYP_PAGE_SIZE * 6)
+#define RINGBUFFER_HVS_SND_SIZE (HV_HYP_PAGE_SIZE * 6)
+#define RINGBUFFER_HVS_MAX_SIZE (HV_HYP_PAGE_SIZE * 64)
/* The MTU is 16KB per the host side's design */
#define HVS_MTU_SIZE (1024 * 16)
@@ -54,7 +55,8 @@ struct hvs_recv_buf {
* ringbuffer APIs that allow us to directly copy data from userspace buffer
* to VMBus ringbuffer.
*/
-#define HVS_SEND_BUF_SIZE (PAGE_SIZE_4K - sizeof(struct vmpipe_proto_header))
+#define HVS_SEND_BUF_SIZE \
+ (HV_HYP_PAGE_SIZE - sizeof(struct vmpipe_proto_header))
struct hvs_send_buf {
/* The header before the payload data */
@@ -77,11 +79,11 @@ struct hvs_send_buf {
VMBUS_PKT_TRAILER_SIZE)
union hvs_service_id {
- uuid_le srv_id;
+ guid_t srv_id;
struct {
unsigned int svm_port;
- unsigned char b[sizeof(uuid_le) - sizeof(unsigned int)];
+ unsigned char b[sizeof(guid_t) - sizeof(unsigned int)];
};
};
@@ -89,8 +91,8 @@ union hvs_service_id {
struct hvsock {
struct vsock_sock *vsk;
- uuid_le vm_srv_id;
- uuid_le host_srv_id;
+ guid_t vm_srv_id;
+ guid_t host_srv_id;
struct vmbus_channel *chan;
struct vmpacket_descriptor *recv_desc;
@@ -136,77 +138,39 @@ struct hvsock {
****************************************************************************
* The only valid Service GUIDs, from the perspectives of both the host and *
* Linux VM, that can be connected by the other end, must conform to this *
- * format: <port>-facb-11e6-bd58-64006a7986d3, and the "port" must be in *
- * this range [0, 0x7FFFFFFF]. *
+ * format: <port>-facb-11e6-bd58-64006a7986d3. *
****************************************************************************
*
* When we write apps on the host to connect(), the GUID ServiceID is used.
* When we write apps in Linux VM to connect(), we only need to specify the
* port and the driver will form the GUID and use that to request the host.
*
- * From the perspective of Linux VM:
- * 1. the local ephemeral port (i.e. the local auto-bound port when we call
- * connect() without explicit bind()) is generated by __vsock_bind_stream(),
- * and the range is [1024, 0xFFFFFFFF).
- * 2. the remote ephemeral port (i.e. the auto-generated remote port for
- * a connect request initiated by the host's connect()) is generated by
- * hvs_remote_addr_init() and the range is [0x80000000, 0xFFFFFFFF).
*/
-#define MAX_LISTEN_PORT ((u32)0x7FFFFFFF)
-#define MAX_VM_LISTEN_PORT MAX_LISTEN_PORT
-#define MAX_HOST_LISTEN_PORT MAX_LISTEN_PORT
-#define MIN_HOST_EPHEMERAL_PORT (MAX_HOST_LISTEN_PORT + 1)
-
/* 00000000-facb-11e6-bd58-64006a7986d3 */
-static const uuid_le srv_id_template =
- UUID_LE(0x00000000, 0xfacb, 0x11e6, 0xbd, 0x58,
- 0x64, 0x00, 0x6a, 0x79, 0x86, 0xd3);
+static const guid_t srv_id_template =
+ GUID_INIT(0x00000000, 0xfacb, 0x11e6, 0xbd, 0x58,
+ 0x64, 0x00, 0x6a, 0x79, 0x86, 0xd3);
+
+static bool hvs_check_transport(struct vsock_sock *vsk);
-static bool is_valid_srv_id(const uuid_le *id)
+static bool is_valid_srv_id(const guid_t *id)
{
- return !memcmp(&id->b[4], &srv_id_template.b[4], sizeof(uuid_le) - 4);
+ return !memcmp(&id->b[4], &srv_id_template.b[4], sizeof(guid_t) - 4);
}
-static unsigned int get_port_by_srv_id(const uuid_le *svr_id)
+static unsigned int get_port_by_srv_id(const guid_t *svr_id)
{
return *((unsigned int *)svr_id);
}
-static void hvs_addr_init(struct sockaddr_vm *addr, const uuid_le *svr_id)
+static void hvs_addr_init(struct sockaddr_vm *addr, const guid_t *svr_id)
{
unsigned int port = get_port_by_srv_id(svr_id);
vsock_addr_init(addr, VMADDR_CID_ANY, port);
}
-static void hvs_remote_addr_init(struct sockaddr_vm *remote,
- struct sockaddr_vm *local)
-{
- static u32 host_ephemeral_port = MIN_HOST_EPHEMERAL_PORT;
- struct sock *sk;
-
- vsock_addr_init(remote, VMADDR_CID_ANY, VMADDR_PORT_ANY);
-
- while (1) {
- /* Wrap around ? */
- if (host_ephemeral_port < MIN_HOST_EPHEMERAL_PORT ||
- host_ephemeral_port == VMADDR_PORT_ANY)
- host_ephemeral_port = MIN_HOST_EPHEMERAL_PORT;
-
- remote->svm_port = host_ephemeral_port++;
-
- sk = vsock_find_connected_socket(remote, local);
- if (!sk) {
- /* Found an available ephemeral port */
- return;
- }
-
- /* Release refcnt got in vsock_find_connected_socket */
- sock_put(sk);
- }
-}
-
static void hvs_set_channel_pending_send_size(struct vmbus_channel *chan)
{
set_channel_pending_send_size(chan,
@@ -321,7 +285,7 @@ static void hvs_close_connection(struct vmbus_channel *chan)
static void hvs_open_connection(struct vmbus_channel *chan)
{
- uuid_le *if_instance, *if_type;
+ guid_t *if_instance, *if_type;
unsigned char conn_from_host;
struct sockaddr_vm addr;
@@ -336,12 +300,7 @@ static void hvs_open_connection(struct vmbus_channel *chan)
if_type = &chan->offermsg.offer.if_type;
if_instance = &chan->offermsg.offer.if_instance;
conn_from_host = chan->offermsg.offer.u.pipe.user_def[0];
-
- /* The host or the VM should only listen on a port in
- * [0, MAX_LISTEN_PORT]
- */
- if (!is_valid_srv_id(if_type) ||
- get_port_by_srv_id(if_type) > MAX_LISTEN_PORT)
+ if (!is_valid_srv_id(if_type))
return;
hvs_addr_init(&addr, conn_from_host ? if_type : if_instance);
@@ -358,13 +317,27 @@ static void hvs_open_connection(struct vmbus_channel *chan)
if (sk->sk_ack_backlog >= sk->sk_max_ack_backlog)
goto out;
- new = __vsock_create(sock_net(sk), NULL, sk, GFP_KERNEL,
- sk->sk_type, 0);
+ new = vsock_create_connected(sk);
if (!new)
goto out;
new->sk_state = TCP_SYN_SENT;
vnew = vsock_sk(new);
+
+ hvs_addr_init(&vnew->local_addr, if_type);
+
+ /* Remote peer is always the host */
+ vsock_addr_init(&vnew->remote_addr,
+ VMADDR_CID_HOST, VMADDR_PORT_ANY);
+ vnew->remote_addr.svm_port = get_port_by_srv_id(if_instance);
+ ret = vsock_assign_transport(vnew, vsock_sk(sk));
+ /* Transport assigned (looking at remote_addr) must be the
+ * same where we received the request.
+ */
+ if (ret || !hvs_check_transport(vnew)) {
+ sock_put(new);
+ goto out;
+ }
hvs_new = vnew->trans;
hvs_new->chan = chan;
} else {
@@ -393,10 +366,10 @@ static void hvs_open_connection(struct vmbus_channel *chan)
} else {
sndbuf = max_t(int, sk->sk_sndbuf, RINGBUFFER_HVS_SND_SIZE);
sndbuf = min_t(int, sndbuf, RINGBUFFER_HVS_MAX_SIZE);
- sndbuf = ALIGN(sndbuf, PAGE_SIZE);
+ sndbuf = ALIGN(sndbuf, HV_HYP_PAGE_SIZE);
rcvbuf = max_t(int, sk->sk_rcvbuf, RINGBUFFER_HVS_RCV_SIZE);
rcvbuf = min_t(int, rcvbuf, RINGBUFFER_HVS_MAX_SIZE);
- rcvbuf = ALIGN(rcvbuf, PAGE_SIZE);
+ rcvbuf = ALIGN(rcvbuf, HV_HYP_PAGE_SIZE);
}
ret = vmbus_open(chan, sndbuf, rcvbuf, NULL, 0, hvs_channel_cb,
@@ -426,10 +399,7 @@ static void hvs_open_connection(struct vmbus_channel *chan)
if (conn_from_host) {
new->sk_state = TCP_ESTABLISHED;
- sk->sk_ack_backlog++;
-
- hvs_addr_init(&vnew->local_addr, if_type);
- hvs_remote_addr_init(&vnew->remote_addr, &vnew->local_addr);
+ sk_acceptq_added(sk);
hvs_new->vm_srv_id = *if_type;
hvs_new->host_srv_id = *if_instance;
@@ -559,7 +529,7 @@ static void hvs_release(struct vsock_sock *vsk)
struct sock *sk = sk_vsock(vsk);
bool remove_sock;
- lock_sock(sk);
+ lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
remove_sock = hvs_close_lock_held(vsk);
release_sock(sk);
if (remove_sock)
@@ -670,7 +640,7 @@ static ssize_t hvs_stream_enqueue(struct vsock_sock *vsk, struct msghdr *msg,
ssize_t ret = 0;
ssize_t bytes_written = 0;
- BUILD_BUG_ON(sizeof(*send_buf) != PAGE_SIZE_4K);
+ BUILD_BUG_ON(sizeof(*send_buf) != HV_HYP_PAGE_SIZE);
send_buf = kmalloc(sizeof(*send_buf), GFP_KERNEL);
if (!send_buf)
@@ -753,16 +723,6 @@ static bool hvs_stream_is_active(struct vsock_sock *vsk)
static bool hvs_stream_allow(u32 cid, u32 port)
{
- /* The host's port range [MIN_HOST_EPHEMERAL_PORT, 0xFFFFFFFF) is
- * reserved as ephemeral ports, which are used as the host's ports
- * when the host initiates connections.
- *
- * Perform this check in the guest so an immediate error is produced
- * instead of a timeout.
- */
- if (port > MAX_HOST_LISTEN_PORT)
- return false;
-
if (cid == VMADDR_CID_HOST)
return true;
@@ -843,37 +803,9 @@ int hvs_notify_send_post_enqueue(struct vsock_sock *vsk, ssize_t written,
return 0;
}
-static void hvs_set_buffer_size(struct vsock_sock *vsk, u64 val)
-{
- /* Ignored. */
-}
-
-static void hvs_set_min_buffer_size(struct vsock_sock *vsk, u64 val)
-{
- /* Ignored. */
-}
-
-static void hvs_set_max_buffer_size(struct vsock_sock *vsk, u64 val)
-{
- /* Ignored. */
-}
-
-static u64 hvs_get_buffer_size(struct vsock_sock *vsk)
-{
- return -ENOPROTOOPT;
-}
-
-static u64 hvs_get_min_buffer_size(struct vsock_sock *vsk)
-{
- return -ENOPROTOOPT;
-}
-
-static u64 hvs_get_max_buffer_size(struct vsock_sock *vsk)
-{
- return -ENOPROTOOPT;
-}
-
static struct vsock_transport hvs_transport = {
+ .module = THIS_MODULE,
+
.get_local_cid = hvs_get_local_cid,
.init = hvs_sock_init,
@@ -906,14 +838,13 @@ static struct vsock_transport hvs_transport = {
.notify_send_pre_enqueue = hvs_notify_send_pre_enqueue,
.notify_send_post_enqueue = hvs_notify_send_post_enqueue,
- .set_buffer_size = hvs_set_buffer_size,
- .set_min_buffer_size = hvs_set_min_buffer_size,
- .set_max_buffer_size = hvs_set_max_buffer_size,
- .get_buffer_size = hvs_get_buffer_size,
- .get_min_buffer_size = hvs_get_min_buffer_size,
- .get_max_buffer_size = hvs_get_max_buffer_size,
};
+static bool hvs_check_transport(struct vsock_sock *vsk)
+{
+ return vsk->transport == &hvs_transport;
+}
+
static int hvs_probe(struct hv_device *hdev,
const struct hv_vmbus_device_id *dev_id)
{
@@ -938,6 +869,24 @@ static int hvs_remove(struct hv_device *hdev)
return 0;
}
+/* hv_sock connections can not persist across hibernation, and all the hv_sock
+ * channels are forced to be rescinded before hibernation: see
+ * vmbus_bus_suspend(). Here the dummy hvs_suspend() and hvs_resume()
+ * are only needed because hibernation requires that every vmbus device's
+ * driver should have a .suspend and .resume callback: see vmbus_suspend().
+ */
+static int hvs_suspend(struct hv_device *hv_dev)
+{
+ /* Dummy */
+ return 0;
+}
+
+static int hvs_resume(struct hv_device *dev)
+{
+ /* Dummy */
+ return 0;
+}
+
/* This isn't really used. See vmbus_match() and vmbus_probe() */
static const struct hv_vmbus_device_id id_table[] = {
{},
@@ -949,6 +898,8 @@ static struct hv_driver hvs_drv = {
.id_table = id_table,
.probe = hvs_probe,
.remove = hvs_remove,
+ .suspend = hvs_suspend,
+ .resume = hvs_resume,
};
static int __init hvs_init(void)
@@ -962,7 +913,7 @@ static int __init hvs_init(void)
if (ret != 0)
return ret;
- ret = vsock_core_init(&hvs_transport);
+ ret = vsock_core_register(&hvs_transport, VSOCK_TRANSPORT_F_G2H);
if (ret) {
vmbus_driver_unregister(&hvs_drv);
return ret;
@@ -973,7 +924,7 @@ static int __init hvs_init(void)
static void __exit hvs_exit(void)
{
- vsock_core_exit();
+ vsock_core_unregister(&hvs_transport);
vmbus_driver_unregister(&hvs_drv);
}
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 0815d1357861..dfbaf6bd8b1c 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -44,10 +44,6 @@ struct virtio_vsock {
spinlock_t send_pkt_list_lock;
struct list_head send_pkt_list;
- struct work_struct loopback_work;
- spinlock_t loopback_list_lock; /* protects loopback_list */
- struct list_head loopback_list;
-
atomic_t queued_replies;
/* The following fields are protected by rx_lock. vqs[VSOCK_VQ_RX]
@@ -86,47 +82,6 @@ out_rcu:
return ret;
}
-static void virtio_transport_loopback_work(struct work_struct *work)
-{
- struct virtio_vsock *vsock =
- container_of(work, struct virtio_vsock, loopback_work);
- LIST_HEAD(pkts);
-
- spin_lock_bh(&vsock->loopback_list_lock);
- list_splice_init(&vsock->loopback_list, &pkts);
- spin_unlock_bh(&vsock->loopback_list_lock);
-
- mutex_lock(&vsock->rx_lock);
-
- if (!vsock->rx_run)
- goto out;
-
- while (!list_empty(&pkts)) {
- struct virtio_vsock_pkt *pkt;
-
- pkt = list_first_entry(&pkts, struct virtio_vsock_pkt, list);
- list_del_init(&pkt->list);
-
- virtio_transport_recv_pkt(pkt);
- }
-out:
- mutex_unlock(&vsock->rx_lock);
-}
-
-static int virtio_transport_send_pkt_loopback(struct virtio_vsock *vsock,
- struct virtio_vsock_pkt *pkt)
-{
- int len = pkt->len;
-
- spin_lock_bh(&vsock->loopback_list_lock);
- list_add_tail(&pkt->list, &vsock->loopback_list);
- spin_unlock_bh(&vsock->loopback_list_lock);
-
- queue_work(virtio_vsock_workqueue, &vsock->loopback_work);
-
- return len;
-}
-
static void
virtio_transport_send_pkt_work(struct work_struct *work)
{
@@ -221,7 +176,8 @@ virtio_transport_send_pkt(struct virtio_vsock_pkt *pkt)
}
if (le64_to_cpu(pkt->hdr.dst_cid) == vsock->guest_cid) {
- len = virtio_transport_send_pkt_loopback(vsock, pkt);
+ virtio_transport_free_pkt(pkt);
+ len = -ENODEV;
goto out_rcu;
}
@@ -307,6 +263,7 @@ static void virtio_vsock_rx_fill(struct virtio_vsock *vsock)
break;
}
+ pkt->buf_len = buf_len;
pkt->len = buf_len;
sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr));
@@ -369,59 +326,6 @@ static bool virtio_transport_more_replies(struct virtio_vsock *vsock)
return val < virtqueue_get_vring_size(vq);
}
-static void virtio_transport_rx_work(struct work_struct *work)
-{
- struct virtio_vsock *vsock =
- container_of(work, struct virtio_vsock, rx_work);
- struct virtqueue *vq;
-
- vq = vsock->vqs[VSOCK_VQ_RX];
-
- mutex_lock(&vsock->rx_lock);
-
- if (!vsock->rx_run)
- goto out;
-
- do {
- virtqueue_disable_cb(vq);
- for (;;) {
- struct virtio_vsock_pkt *pkt;
- unsigned int len;
-
- if (!virtio_transport_more_replies(vsock)) {
- /* Stop rx until the device processes already
- * pending replies. Leave rx virtqueue
- * callbacks disabled.
- */
- goto out;
- }
-
- pkt = virtqueue_get_buf(vq, &len);
- if (!pkt) {
- break;
- }
-
- vsock->rx_buf_nr--;
-
- /* Drop short/long packets */
- if (unlikely(len < sizeof(pkt->hdr) ||
- len > sizeof(pkt->hdr) + pkt->len)) {
- virtio_transport_free_pkt(pkt);
- continue;
- }
-
- pkt->len = len - sizeof(pkt->hdr);
- virtio_transport_deliver_tap_pkt(pkt);
- virtio_transport_recv_pkt(pkt);
- }
- } while (!virtqueue_enable_cb(vq));
-
-out:
- if (vsock->rx_buf_nr < vsock->rx_buf_max_nr / 2)
- virtio_vsock_rx_fill(vsock);
- mutex_unlock(&vsock->rx_lock);
-}
-
/* event_lock must be held */
static int virtio_vsock_event_fill_one(struct virtio_vsock *vsock,
struct virtio_vsock_event *event)
@@ -541,6 +445,8 @@ static void virtio_vsock_rx_done(struct virtqueue *vq)
static struct virtio_transport virtio_transport = {
.transport = {
+ .module = THIS_MODULE,
+
.get_local_cid = virtio_transport_get_local_cid,
.init = virtio_transport_do_socket_init,
@@ -573,18 +479,65 @@ static struct virtio_transport virtio_transport = {
.notify_send_pre_block = virtio_transport_notify_send_pre_block,
.notify_send_pre_enqueue = virtio_transport_notify_send_pre_enqueue,
.notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue,
-
- .set_buffer_size = virtio_transport_set_buffer_size,
- .set_min_buffer_size = virtio_transport_set_min_buffer_size,
- .set_max_buffer_size = virtio_transport_set_max_buffer_size,
- .get_buffer_size = virtio_transport_get_buffer_size,
- .get_min_buffer_size = virtio_transport_get_min_buffer_size,
- .get_max_buffer_size = virtio_transport_get_max_buffer_size,
+ .notify_buffer_size = virtio_transport_notify_buffer_size,
},
.send_pkt = virtio_transport_send_pkt,
};
+static void virtio_transport_rx_work(struct work_struct *work)
+{
+ struct virtio_vsock *vsock =
+ container_of(work, struct virtio_vsock, rx_work);
+ struct virtqueue *vq;
+
+ vq = vsock->vqs[VSOCK_VQ_RX];
+
+ mutex_lock(&vsock->rx_lock);
+
+ if (!vsock->rx_run)
+ goto out;
+
+ do {
+ virtqueue_disable_cb(vq);
+ for (;;) {
+ struct virtio_vsock_pkt *pkt;
+ unsigned int len;
+
+ if (!virtio_transport_more_replies(vsock)) {
+ /* Stop rx until the device processes already
+ * pending replies. Leave rx virtqueue
+ * callbacks disabled.
+ */
+ goto out;
+ }
+
+ pkt = virtqueue_get_buf(vq, &len);
+ if (!pkt) {
+ break;
+ }
+
+ vsock->rx_buf_nr--;
+
+ /* Drop short/long packets */
+ if (unlikely(len < sizeof(pkt->hdr) ||
+ len > sizeof(pkt->hdr) + pkt->len)) {
+ virtio_transport_free_pkt(pkt);
+ continue;
+ }
+
+ pkt->len = len - sizeof(pkt->hdr);
+ virtio_transport_deliver_tap_pkt(pkt);
+ virtio_transport_recv_pkt(&virtio_transport, pkt);
+ }
+ } while (!virtqueue_enable_cb(vq));
+
+out:
+ if (vsock->rx_buf_nr < vsock->rx_buf_max_nr / 2)
+ virtio_vsock_rx_fill(vsock);
+ mutex_unlock(&vsock->rx_lock);
+}
+
static int virtio_vsock_probe(struct virtio_device *vdev)
{
vq_callback_t *callbacks[] = {
@@ -636,13 +589,10 @@ static int virtio_vsock_probe(struct virtio_device *vdev)
mutex_init(&vsock->event_lock);
spin_lock_init(&vsock->send_pkt_list_lock);
INIT_LIST_HEAD(&vsock->send_pkt_list);
- spin_lock_init(&vsock->loopback_list_lock);
- INIT_LIST_HEAD(&vsock->loopback_list);
INIT_WORK(&vsock->rx_work, virtio_transport_rx_work);
INIT_WORK(&vsock->tx_work, virtio_transport_tx_work);
INIT_WORK(&vsock->event_work, virtio_transport_event_work);
INIT_WORK(&vsock->send_pkt_work, virtio_transport_send_pkt_work);
- INIT_WORK(&vsock->loopback_work, virtio_transport_loopback_work);
mutex_lock(&vsock->tx_lock);
vsock->tx_run = true;
@@ -723,22 +673,12 @@ static void virtio_vsock_remove(struct virtio_device *vdev)
}
spin_unlock_bh(&vsock->send_pkt_list_lock);
- spin_lock_bh(&vsock->loopback_list_lock);
- while (!list_empty(&vsock->loopback_list)) {
- pkt = list_first_entry(&vsock->loopback_list,
- struct virtio_vsock_pkt, list);
- list_del(&pkt->list);
- virtio_transport_free_pkt(pkt);
- }
- spin_unlock_bh(&vsock->loopback_list_lock);
-
/* Delete virtqueues and flush outstanding callbacks if any */
vdev->config->del_vqs(vdev);
/* Other works can be queued before 'config->del_vqs()', so we flush
* all works before to free the vsock object to avoid use after free.
*/
- flush_work(&vsock->loopback_work);
flush_work(&vsock->rx_work);
flush_work(&vsock->tx_work);
flush_work(&vsock->event_work);
@@ -775,7 +715,8 @@ static int __init virtio_vsock_init(void)
if (!virtio_vsock_workqueue)
return -ENOMEM;
- ret = vsock_core_init(&virtio_transport.transport);
+ ret = vsock_core_register(&virtio_transport.transport,
+ VSOCK_TRANSPORT_F_G2H);
if (ret)
goto out_wq;
@@ -786,7 +727,7 @@ static int __init virtio_vsock_init(void)
return 0;
out_vci:
- vsock_core_exit();
+ vsock_core_unregister(&virtio_transport.transport);
out_wq:
destroy_workqueue(virtio_vsock_workqueue);
return ret;
@@ -795,7 +736,7 @@ out_wq:
static void __exit virtio_vsock_exit(void)
{
unregister_virtio_driver(&virtio_vsock_driver);
- vsock_core_exit();
+ vsock_core_unregister(&virtio_transport.transport);
destroy_workqueue(virtio_vsock_workqueue);
}
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index 6f1a8aff65c5..d9f0c9c5425a 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -11,9 +11,6 @@
#include <linux/sched/signal.h>
#include <linux/ctype.h>
#include <linux/list.h>
-#include <linux/virtio.h>
-#include <linux/virtio_ids.h>
-#include <linux/virtio_config.h>
#include <linux/virtio_vsock.h>
#include <uapi/linux/vsockmon.h>
@@ -26,9 +23,16 @@
/* How long to wait for graceful shutdown of a connection */
#define VSOCK_CLOSE_TIMEOUT (8 * HZ)
-static const struct virtio_transport *virtio_transport_get_ops(void)
+/* Threshold for detecting small packets to copy */
+#define GOOD_COPY_LEN 128
+
+static const struct virtio_transport *
+virtio_transport_get_ops(struct vsock_sock *vsk)
{
- const struct vsock_transport *t = vsock_core_get_transport();
+ const struct vsock_transport *t = vsock_core_get_transport(vsk);
+
+ if (WARN_ON(!t))
+ return NULL;
return container_of(t, struct virtio_transport, transport);
}
@@ -64,6 +68,9 @@ virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info,
pkt->buf = kmalloc(len, GFP_KERNEL);
if (!pkt->buf)
goto out_pkt;
+
+ pkt->buf_len = len;
+
err = memcpy_from_msg(pkt->buf, info->msg, len);
if (err)
goto out;
@@ -91,8 +98,17 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque)
struct virtio_vsock_pkt *pkt = opaque;
struct af_vsockmon_hdr *hdr;
struct sk_buff *skb;
+ size_t payload_len;
+ void *payload_buf;
- skb = alloc_skb(sizeof(*hdr) + sizeof(pkt->hdr) + pkt->len,
+ /* A packet could be split to fit the RX buffer, so we can retrieve
+ * the payload length from the header and the buffer pointer taking
+ * care of the offset in the original packet.
+ */
+ payload_len = le32_to_cpu(pkt->hdr.len);
+ payload_buf = pkt->buf + pkt->off;
+
+ skb = alloc_skb(sizeof(*hdr) + sizeof(pkt->hdr) + payload_len,
GFP_ATOMIC);
if (!skb)
return NULL;
@@ -132,8 +148,8 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque)
skb_put_data(skb, &pkt->hdr, sizeof(pkt->hdr));
- if (pkt->len) {
- skb_put_data(skb, pkt->buf, pkt->len);
+ if (payload_len) {
+ skb_put_data(skb, payload_buf, payload_len);
}
return skb;
@@ -145,15 +161,25 @@ void virtio_transport_deliver_tap_pkt(struct virtio_vsock_pkt *pkt)
}
EXPORT_SYMBOL_GPL(virtio_transport_deliver_tap_pkt);
+/* This function can only be used on connecting/connected sockets,
+ * since a socket assigned to a transport is required.
+ *
+ * Do not use on listener sockets!
+ */
static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
struct virtio_vsock_pkt_info *info)
{
u32 src_cid, src_port, dst_cid, dst_port;
+ const struct virtio_transport *t_ops;
struct virtio_vsock_sock *vvs;
struct virtio_vsock_pkt *pkt;
u32 pkt_len = info->pkt_len;
- src_cid = vm_sockets_get_local_cid();
+ t_ops = virtio_transport_get_ops(vsk);
+ if (unlikely(!t_ops))
+ return -EFAULT;
+
+ src_cid = t_ops->transport.get_local_cid();
src_port = vsk->local_addr.svm_port;
if (!info->remote_cid) {
dst_cid = vsk->remote_addr.svm_cid;
@@ -166,8 +192,8 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
vvs = vsk->trans;
/* we can send less than pkt_len bytes */
- if (pkt_len > VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE)
- pkt_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE;
+ if (pkt_len > VIRTIO_VSOCK_MAX_PKT_BUF_SIZE)
+ pkt_len = VIRTIO_VSOCK_MAX_PKT_BUF_SIZE;
/* virtio_transport_get_credit might return less than pkt_len credit */
pkt_len = virtio_transport_get_credit(vvs, pkt_len);
@@ -186,13 +212,17 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
virtio_transport_inc_tx_pkt(vvs, pkt);
- return virtio_transport_get_ops()->send_pkt(pkt);
+ return t_ops->send_pkt(pkt);
}
-static void virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs,
+static bool virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs,
struct virtio_vsock_pkt *pkt)
{
+ if (vvs->rx_bytes + pkt->len > vvs->buf_alloc)
+ return false;
+
vvs->rx_bytes += pkt->len;
+ return true;
}
static void virtio_transport_dec_rx_pkt(struct virtio_vsock_sock *vvs,
@@ -204,10 +234,11 @@ static void virtio_transport_dec_rx_pkt(struct virtio_vsock_sock *vvs,
void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct virtio_vsock_pkt *pkt)
{
- spin_lock_bh(&vvs->tx_lock);
+ spin_lock_bh(&vvs->rx_lock);
+ vvs->last_fwd_cnt = vvs->fwd_cnt;
pkt->hdr.fwd_cnt = cpu_to_le32(vvs->fwd_cnt);
pkt->hdr.buf_alloc = cpu_to_le32(vvs->buf_alloc);
- spin_unlock_bh(&vvs->tx_lock);
+ spin_unlock_bh(&vvs->rx_lock);
}
EXPORT_SYMBOL_GPL(virtio_transport_inc_tx_pkt);
@@ -248,6 +279,55 @@ static int virtio_transport_send_credit_update(struct vsock_sock *vsk,
}
static ssize_t
+virtio_transport_stream_do_peek(struct vsock_sock *vsk,
+ struct msghdr *msg,
+ size_t len)
+{
+ struct virtio_vsock_sock *vvs = vsk->trans;
+ struct virtio_vsock_pkt *pkt;
+ size_t bytes, total = 0, off;
+ int err = -EFAULT;
+
+ spin_lock_bh(&vvs->rx_lock);
+
+ list_for_each_entry(pkt, &vvs->rx_queue, list) {
+ off = pkt->off;
+
+ if (total == len)
+ break;
+
+ while (total < len && off < pkt->len) {
+ bytes = len - total;
+ if (bytes > pkt->len - off)
+ bytes = pkt->len - off;
+
+ /* sk_lock is held by caller so no one else can dequeue.
+ * Unlock rx_lock since memcpy_to_msg() may sleep.
+ */
+ spin_unlock_bh(&vvs->rx_lock);
+
+ err = memcpy_to_msg(msg, pkt->buf + off, bytes);
+ if (err)
+ goto out;
+
+ spin_lock_bh(&vvs->rx_lock);
+
+ total += bytes;
+ off += bytes;
+ }
+ }
+
+ spin_unlock_bh(&vvs->rx_lock);
+
+ return total;
+
+out:
+ if (total)
+ err = total;
+ return err;
+}
+
+static ssize_t
virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
struct msghdr *msg,
size_t len)
@@ -255,6 +335,7 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
struct virtio_vsock_sock *vvs = vsk->trans;
struct virtio_vsock_pkt *pkt;
size_t bytes, total = 0;
+ u32 free_space;
int err = -EFAULT;
spin_lock_bh(&vvs->rx_lock);
@@ -285,11 +366,24 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
virtio_transport_free_pkt(pkt);
}
}
+
+ free_space = vvs->buf_alloc - (vvs->fwd_cnt - vvs->last_fwd_cnt);
+
spin_unlock_bh(&vvs->rx_lock);
- /* Send a credit pkt to peer */
- virtio_transport_send_credit_update(vsk, VIRTIO_VSOCK_TYPE_STREAM,
- NULL);
+ /* To reduce the number of credit update messages,
+ * don't update credits as long as lots of space is available.
+ * Note: the limit chosen here is arbitrary. Setting the limit
+ * too high causes extra messages. Too low causes transmitter
+ * stalls. As stalls are in theory more expensive than extra
+ * messages, we set the limit to a high value. TODO: experiment
+ * with different values.
+ */
+ if (free_space < VIRTIO_VSOCK_MAX_PKT_BUF_SIZE) {
+ virtio_transport_send_credit_update(vsk,
+ VIRTIO_VSOCK_TYPE_STREAM,
+ NULL);
+ }
return total;
@@ -305,9 +399,9 @@ virtio_transport_stream_dequeue(struct vsock_sock *vsk,
size_t len, int flags)
{
if (flags & MSG_PEEK)
- return -EOPNOTSUPP;
-
- return virtio_transport_stream_do_dequeue(vsk, msg, len);
+ return virtio_transport_stream_do_peek(vsk, msg, len);
+ else
+ return virtio_transport_stream_do_dequeue(vsk, msg, len);
}
EXPORT_SYMBOL_GPL(virtio_transport_stream_dequeue);
@@ -369,20 +463,16 @@ int virtio_transport_do_socket_init(struct vsock_sock *vsk,
vsk->trans = vvs;
vvs->vsk = vsk;
- if (psk) {
+ if (psk && psk->trans) {
struct virtio_vsock_sock *ptrans = psk->trans;
- vvs->buf_size = ptrans->buf_size;
- vvs->buf_size_min = ptrans->buf_size_min;
- vvs->buf_size_max = ptrans->buf_size_max;
vvs->peer_buf_alloc = ptrans->peer_buf_alloc;
- } else {
- vvs->buf_size = VIRTIO_VSOCK_DEFAULT_BUF_SIZE;
- vvs->buf_size_min = VIRTIO_VSOCK_DEFAULT_MIN_BUF_SIZE;
- vvs->buf_size_max = VIRTIO_VSOCK_DEFAULT_MAX_BUF_SIZE;
}
- vvs->buf_alloc = vvs->buf_size;
+ if (vsk->buffer_size > VIRTIO_VSOCK_MAX_BUF_SIZE)
+ vsk->buffer_size = VIRTIO_VSOCK_MAX_BUF_SIZE;
+
+ vvs->buf_alloc = vsk->buffer_size;
spin_lock_init(&vvs->rx_lock);
spin_lock_init(&vvs->tx_lock);
@@ -392,68 +482,20 @@ int virtio_transport_do_socket_init(struct vsock_sock *vsk,
}
EXPORT_SYMBOL_GPL(virtio_transport_do_socket_init);
-u64 virtio_transport_get_buffer_size(struct vsock_sock *vsk)
+/* sk_lock held by the caller */
+void virtio_transport_notify_buffer_size(struct vsock_sock *vsk, u64 *val)
{
struct virtio_vsock_sock *vvs = vsk->trans;
- return vvs->buf_size;
-}
-EXPORT_SYMBOL_GPL(virtio_transport_get_buffer_size);
-
-u64 virtio_transport_get_min_buffer_size(struct vsock_sock *vsk)
-{
- struct virtio_vsock_sock *vvs = vsk->trans;
+ if (*val > VIRTIO_VSOCK_MAX_BUF_SIZE)
+ *val = VIRTIO_VSOCK_MAX_BUF_SIZE;
- return vvs->buf_size_min;
-}
-EXPORT_SYMBOL_GPL(virtio_transport_get_min_buffer_size);
-
-u64 virtio_transport_get_max_buffer_size(struct vsock_sock *vsk)
-{
- struct virtio_vsock_sock *vvs = vsk->trans;
-
- return vvs->buf_size_max;
-}
-EXPORT_SYMBOL_GPL(virtio_transport_get_max_buffer_size);
-
-void virtio_transport_set_buffer_size(struct vsock_sock *vsk, u64 val)
-{
- struct virtio_vsock_sock *vvs = vsk->trans;
-
- if (val > VIRTIO_VSOCK_MAX_BUF_SIZE)
- val = VIRTIO_VSOCK_MAX_BUF_SIZE;
- if (val < vvs->buf_size_min)
- vvs->buf_size_min = val;
- if (val > vvs->buf_size_max)
- vvs->buf_size_max = val;
- vvs->buf_size = val;
- vvs->buf_alloc = val;
-}
-EXPORT_SYMBOL_GPL(virtio_transport_set_buffer_size);
-
-void virtio_transport_set_min_buffer_size(struct vsock_sock *vsk, u64 val)
-{
- struct virtio_vsock_sock *vvs = vsk->trans;
-
- if (val > VIRTIO_VSOCK_MAX_BUF_SIZE)
- val = VIRTIO_VSOCK_MAX_BUF_SIZE;
- if (val > vvs->buf_size)
- vvs->buf_size = val;
- vvs->buf_size_min = val;
-}
-EXPORT_SYMBOL_GPL(virtio_transport_set_min_buffer_size);
-
-void virtio_transport_set_max_buffer_size(struct vsock_sock *vsk, u64 val)
-{
- struct virtio_vsock_sock *vvs = vsk->trans;
+ vvs->buf_alloc = *val;
- if (val > VIRTIO_VSOCK_MAX_BUF_SIZE)
- val = VIRTIO_VSOCK_MAX_BUF_SIZE;
- if (val < vvs->buf_size)
- vvs->buf_size = val;
- vvs->buf_size_max = val;
+ virtio_transport_send_credit_update(vsk, VIRTIO_VSOCK_TYPE_STREAM,
+ NULL);
}
-EXPORT_SYMBOL_GPL(virtio_transport_set_max_buffer_size);
+EXPORT_SYMBOL_GPL(virtio_transport_notify_buffer_size);
int
virtio_transport_notify_poll_in(struct vsock_sock *vsk,
@@ -545,9 +587,7 @@ EXPORT_SYMBOL_GPL(virtio_transport_notify_send_post_enqueue);
u64 virtio_transport_stream_rcvhiwat(struct vsock_sock *vsk)
{
- struct virtio_vsock_sock *vvs = vsk->trans;
-
- return vvs->buf_size;
+ return vsk->buffer_size;
}
EXPORT_SYMBOL_GPL(virtio_transport_stream_rcvhiwat);
@@ -659,9 +699,9 @@ static int virtio_transport_reset(struct vsock_sock *vsk,
/* Normally packets are associated with a socket. There may be no socket if an
* attempt was made to connect to a socket that does not exist.
*/
-static int virtio_transport_reset_no_sock(struct virtio_vsock_pkt *pkt)
+static int virtio_transport_reset_no_sock(const struct virtio_transport *t,
+ struct virtio_vsock_pkt *pkt)
{
- const struct virtio_transport *t;
struct virtio_vsock_pkt *reply;
struct virtio_vsock_pkt_info info = {
.op = VIRTIO_VSOCK_OP_RST,
@@ -681,7 +721,6 @@ static int virtio_transport_reset_no_sock(struct virtio_vsock_pkt *pkt)
if (!reply)
return -ENOMEM;
- t = virtio_transport_get_ops();
if (!t) {
virtio_transport_free_pkt(reply);
return -ENOTCONN;
@@ -790,7 +829,7 @@ void virtio_transport_release(struct vsock_sock *vsk)
struct sock *sk = &vsk->sk;
bool remove_sock = true;
- lock_sock(sk);
+ lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
if (sk->sk_type == SOCK_STREAM)
remove_sock = virtio_transport_close(vsk);
@@ -841,24 +880,64 @@ destroy:
return err;
}
+static void
+virtio_transport_recv_enqueue(struct vsock_sock *vsk,
+ struct virtio_vsock_pkt *pkt)
+{
+ struct virtio_vsock_sock *vvs = vsk->trans;
+ bool can_enqueue, free_pkt = false;
+
+ pkt->len = le32_to_cpu(pkt->hdr.len);
+ pkt->off = 0;
+
+ spin_lock_bh(&vvs->rx_lock);
+
+ can_enqueue = virtio_transport_inc_rx_pkt(vvs, pkt);
+ if (!can_enqueue) {
+ free_pkt = true;
+ goto out;
+ }
+
+ /* Try to copy small packets into the buffer of last packet queued,
+ * to avoid wasting memory queueing the entire buffer with a small
+ * payload.
+ */
+ if (pkt->len <= GOOD_COPY_LEN && !list_empty(&vvs->rx_queue)) {
+ struct virtio_vsock_pkt *last_pkt;
+
+ last_pkt = list_last_entry(&vvs->rx_queue,
+ struct virtio_vsock_pkt, list);
+
+ /* If there is space in the last packet queued, we copy the
+ * new packet in its buffer.
+ */
+ if (pkt->len <= last_pkt->buf_len - last_pkt->len) {
+ memcpy(last_pkt->buf + last_pkt->len, pkt->buf,
+ pkt->len);
+ last_pkt->len += pkt->len;
+ free_pkt = true;
+ goto out;
+ }
+ }
+
+ list_add_tail(&pkt->list, &vvs->rx_queue);
+
+out:
+ spin_unlock_bh(&vvs->rx_lock);
+ if (free_pkt)
+ virtio_transport_free_pkt(pkt);
+}
+
static int
virtio_transport_recv_connected(struct sock *sk,
struct virtio_vsock_pkt *pkt)
{
struct vsock_sock *vsk = vsock_sk(sk);
- struct virtio_vsock_sock *vvs = vsk->trans;
int err = 0;
switch (le16_to_cpu(pkt->hdr.op)) {
case VIRTIO_VSOCK_OP_RW:
- pkt->len = le32_to_cpu(pkt->hdr.len);
- pkt->off = 0;
-
- spin_lock_bh(&vvs->rx_lock);
- virtio_transport_inc_rx_pkt(vvs, pkt);
- list_add_tail(&pkt->list, &vvs->rx_queue);
- spin_unlock_bh(&vvs->rx_lock);
-
+ virtio_transport_recv_enqueue(vsk, pkt);
sk->sk_data_ready(sk);
return err;
case VIRTIO_VSOCK_OP_CREDIT_UPDATE:
@@ -870,9 +949,11 @@ virtio_transport_recv_connected(struct sock *sk,
if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SHUTDOWN_SEND)
vsk->peer_shutdown |= SEND_SHUTDOWN;
if (vsk->peer_shutdown == SHUTDOWN_MASK &&
- vsock_stream_has_data(vsk) <= 0) {
- sock_set_flag(sk, SOCK_DONE);
- sk->sk_state = TCP_CLOSING;
+ vsock_stream_has_data(vsk) <= 0 &&
+ !sock_flag(sk, SOCK_DONE)) {
+ (void)virtio_transport_reset(vsk, NULL);
+
+ virtio_transport_do_close(vsk, true);
}
if (le32_to_cpu(pkt->hdr.flags))
sk->sk_state_change(sk);
@@ -915,32 +996,57 @@ virtio_transport_send_response(struct vsock_sock *vsk,
return virtio_transport_send_pkt_info(vsk, &info);
}
+static bool virtio_transport_space_update(struct sock *sk,
+ struct virtio_vsock_pkt *pkt)
+{
+ struct vsock_sock *vsk = vsock_sk(sk);
+ struct virtio_vsock_sock *vvs = vsk->trans;
+ bool space_available;
+
+ /* Listener sockets are not associated with any transport, so we are
+ * not able to take the state to see if there is space available in the
+ * remote peer, but since they are only used to receive requests, we
+ * can assume that there is always space available in the other peer.
+ */
+ if (!vvs)
+ return true;
+
+ /* buf_alloc and fwd_cnt is always included in the hdr */
+ spin_lock_bh(&vvs->tx_lock);
+ vvs->peer_buf_alloc = le32_to_cpu(pkt->hdr.buf_alloc);
+ vvs->peer_fwd_cnt = le32_to_cpu(pkt->hdr.fwd_cnt);
+ space_available = virtio_transport_has_space(vsk);
+ spin_unlock_bh(&vvs->tx_lock);
+ return space_available;
+}
+
/* Handle server socket */
static int
-virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt)
+virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt,
+ struct virtio_transport *t)
{
struct vsock_sock *vsk = vsock_sk(sk);
struct vsock_sock *vchild;
struct sock *child;
+ int ret;
if (le16_to_cpu(pkt->hdr.op) != VIRTIO_VSOCK_OP_REQUEST) {
- virtio_transport_reset(vsk, pkt);
+ virtio_transport_reset_no_sock(t, pkt);
return -EINVAL;
}
if (sk_acceptq_is_full(sk)) {
- virtio_transport_reset(vsk, pkt);
+ virtio_transport_reset_no_sock(t, pkt);
return -ENOMEM;
}
- child = __vsock_create(sock_net(sk), NULL, sk, GFP_KERNEL,
- sk->sk_type, 0);
+ child = vsock_create_connected(sk);
if (!child) {
- virtio_transport_reset(vsk, pkt);
+ virtio_transport_reset_no_sock(t, pkt);
return -ENOMEM;
}
- sk->sk_ack_backlog++;
+ sk_acceptq_added(sk);
lock_sock_nested(child, SINGLE_DEPTH_NESTING);
@@ -952,6 +1058,20 @@ virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt)
vsock_addr_init(&vchild->remote_addr, le64_to_cpu(pkt->hdr.src_cid),
le32_to_cpu(pkt->hdr.src_port));
+ ret = vsock_assign_transport(vchild, vsk);
+ /* Transport assigned (looking at remote_addr) must be the same
+ * where we received the request.
+ */
+ if (ret || vchild->transport != &t->transport) {
+ release_sock(child);
+ virtio_transport_reset_no_sock(t, pkt);
+ sock_put(child);
+ return ret;
+ }
+
+ if (virtio_transport_space_update(child, pkt))
+ child->sk_write_space(child);
+
vsock_insert_connected(vchild);
vsock_enqueue_accept(sk, child);
virtio_transport_send_response(vchild, pkt);
@@ -962,26 +1082,11 @@ virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt)
return 0;
}
-static bool virtio_transport_space_update(struct sock *sk,
- struct virtio_vsock_pkt *pkt)
-{
- struct vsock_sock *vsk = vsock_sk(sk);
- struct virtio_vsock_sock *vvs = vsk->trans;
- bool space_available;
-
- /* buf_alloc and fwd_cnt is always included in the hdr */
- spin_lock_bh(&vvs->tx_lock);
- vvs->peer_buf_alloc = le32_to_cpu(pkt->hdr.buf_alloc);
- vvs->peer_fwd_cnt = le32_to_cpu(pkt->hdr.fwd_cnt);
- space_available = virtio_transport_has_space(vsk);
- spin_unlock_bh(&vvs->tx_lock);
- return space_available;
-}
-
/* We are under the virtio-vsock's vsock->rx_lock or vhost-vsock's vq->mutex
* lock.
*/
-void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt)
+void virtio_transport_recv_pkt(struct virtio_transport *t,
+ struct virtio_vsock_pkt *pkt)
{
struct sockaddr_vm src, dst;
struct vsock_sock *vsk;
@@ -1003,7 +1108,7 @@ void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt)
le32_to_cpu(pkt->hdr.fwd_cnt));
if (le16_to_cpu(pkt->hdr.type) != VIRTIO_VSOCK_TYPE_STREAM) {
- (void)virtio_transport_reset_no_sock(pkt);
+ (void)virtio_transport_reset_no_sock(t, pkt);
goto free_pkt;
}
@@ -1014,7 +1119,7 @@ void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt)
if (!sk) {
sk = vsock_find_bound_socket(&dst);
if (!sk) {
- (void)virtio_transport_reset_no_sock(pkt);
+ (void)virtio_transport_reset_no_sock(t, pkt);
goto free_pkt;
}
}
@@ -1033,7 +1138,7 @@ void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt)
switch (sk->sk_state) {
case TCP_LISTEN:
- virtio_transport_recv_listen(sk, pkt);
+ virtio_transport_recv_listen(sk, pkt, t);
virtio_transport_free_pkt(pkt);
break;
case TCP_SYN_SENT:
@@ -1051,6 +1156,7 @@ void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt)
virtio_transport_free_pkt(pkt);
break;
}
+
release_sock(sk);
/* Release refcnt obtained when we fetched this socket out of the
diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c
index 8c9c4ed90fa7..4b8b1150a738 100644
--- a/net/vmw_vsock/vmci_transport.c
+++ b/net/vmw_vsock/vmci_transport.c
@@ -57,6 +57,7 @@ static bool vmci_transport_old_proto_override(bool *old_pkt_proto);
static u16 vmci_transport_new_proto_supported_versions(void);
static bool vmci_transport_proto_to_notify_struct(struct sock *sk, u16 *proto,
bool old_pkt_proto);
+static bool vmci_check_transport(struct vsock_sock *vsk);
struct vmci_transport_recv_pkt_info {
struct work_struct work;
@@ -74,15 +75,6 @@ static u32 vmci_transport_qp_resumed_sub_id = VMCI_INVALID_ID;
static int PROTOCOL_OVERRIDE = -1;
-#define VMCI_TRANSPORT_DEFAULT_QP_SIZE_MIN 128
-#define VMCI_TRANSPORT_DEFAULT_QP_SIZE 262144
-#define VMCI_TRANSPORT_DEFAULT_QP_SIZE_MAX 262144
-
-/* The default peer timeout indicates how long we will wait for a peer response
- * to a control message.
- */
-#define VSOCK_DEFAULT_CONNECT_TIMEOUT (2 * HZ)
-
/* Helper function to convert from a VMCI error code to a VSock error code. */
static s32 vmci_transport_error_to_vsock_error(s32 vmci_error)
@@ -656,7 +648,7 @@ static int vmci_transport_recv_dgram_cb(void *data, struct vmci_datagram *dg)
static bool vmci_transport_stream_allow(u32 cid, u32 port)
{
static const u32 non_socket_contexts[] = {
- VMADDR_CID_RESERVED,
+ VMADDR_CID_LOCAL,
};
int i;
@@ -1013,8 +1005,7 @@ static int vmci_transport_recv_listen(struct sock *sk,
return -ECONNREFUSED;
}
- pending = __vsock_create(sock_net(sk), NULL, sk, GFP_KERNEL,
- sk->sk_type, 0);
+ pending = vsock_create_connected(sk);
if (!pending) {
vmci_transport_send_reset(sk, pkt);
return -ENOMEM;
@@ -1027,14 +1018,24 @@ static int vmci_transport_recv_listen(struct sock *sk,
vsock_addr_init(&vpending->remote_addr, pkt->dg.src.context,
pkt->src_port);
+ err = vsock_assign_transport(vpending, vsock_sk(sk));
+ /* Transport assigned (looking at remote_addr) must be the same
+ * where we received the request.
+ */
+ if (err || !vmci_check_transport(vpending)) {
+ vmci_transport_send_reset(sk, pkt);
+ sock_put(pending);
+ return err;
+ }
+
/* If the proposed size fits within our min/max, accept it. Otherwise
* propose our own size.
*/
- if (pkt->u.size >= vmci_trans(vpending)->queue_pair_min_size &&
- pkt->u.size <= vmci_trans(vpending)->queue_pair_max_size) {
+ if (pkt->u.size >= vpending->buffer_min_size &&
+ pkt->u.size <= vpending->buffer_max_size) {
qp_size = pkt->u.size;
} else {
- qp_size = vmci_trans(vpending)->queue_pair_size;
+ qp_size = vpending->buffer_size;
}
/* Figure out if we are using old or new requests based on the
@@ -1098,12 +1099,12 @@ static int vmci_transport_recv_listen(struct sock *sk,
}
vsock_add_pending(sk, pending);
- sk->sk_ack_backlog++;
+ sk_acceptq_added(sk);
pending->sk_state = TCP_SYN_SENT;
vmci_trans(vpending)->produce_size =
vmci_trans(vpending)->consume_size = qp_size;
- vmci_trans(vpending)->queue_pair_size = qp_size;
+ vpending->buffer_size = qp_size;
vmci_trans(vpending)->notify_ops->process_request(pending);
@@ -1397,8 +1398,8 @@ static int vmci_transport_recv_connecting_client_negotiate(
vsk->ignore_connecting_rst = false;
/* Verify that we're OK with the proposed queue pair size */
- if (pkt->u.size < vmci_trans(vsk)->queue_pair_min_size ||
- pkt->u.size > vmci_trans(vsk)->queue_pair_max_size) {
+ if (pkt->u.size < vsk->buffer_min_size ||
+ pkt->u.size > vsk->buffer_max_size) {
err = -EINVAL;
goto destroy;
}
@@ -1503,8 +1504,7 @@ vmci_transport_recv_connecting_client_invalid(struct sock *sk,
vsk->sent_request = false;
vsk->ignore_connecting_rst = true;
- err = vmci_transport_send_conn_request(
- sk, vmci_trans(vsk)->queue_pair_size);
+ err = vmci_transport_send_conn_request(sk, vsk->buffer_size);
if (err < 0)
err = vmci_transport_error_to_vsock_error(err);
else
@@ -1588,21 +1588,6 @@ static int vmci_transport_socket_init(struct vsock_sock *vsk,
INIT_LIST_HEAD(&vmci_trans(vsk)->elem);
vmci_trans(vsk)->sk = &vsk->sk;
spin_lock_init(&vmci_trans(vsk)->lock);
- if (psk) {
- vmci_trans(vsk)->queue_pair_size =
- vmci_trans(psk)->queue_pair_size;
- vmci_trans(vsk)->queue_pair_min_size =
- vmci_trans(psk)->queue_pair_min_size;
- vmci_trans(vsk)->queue_pair_max_size =
- vmci_trans(psk)->queue_pair_max_size;
- } else {
- vmci_trans(vsk)->queue_pair_size =
- VMCI_TRANSPORT_DEFAULT_QP_SIZE;
- vmci_trans(vsk)->queue_pair_min_size =
- VMCI_TRANSPORT_DEFAULT_QP_SIZE_MIN;
- vmci_trans(vsk)->queue_pair_max_size =
- VMCI_TRANSPORT_DEFAULT_QP_SIZE_MAX;
- }
return 0;
}
@@ -1818,8 +1803,7 @@ static int vmci_transport_connect(struct vsock_sock *vsk)
if (vmci_transport_old_proto_override(&old_pkt_proto) &&
old_pkt_proto) {
- err = vmci_transport_send_conn_request(
- sk, vmci_trans(vsk)->queue_pair_size);
+ err = vmci_transport_send_conn_request(sk, vsk->buffer_size);
if (err < 0) {
sk->sk_state = TCP_CLOSE;
return err;
@@ -1827,8 +1811,7 @@ static int vmci_transport_connect(struct vsock_sock *vsk)
} else {
int supported_proto_versions =
vmci_transport_new_proto_supported_versions();
- err = vmci_transport_send_conn_request2(
- sk, vmci_trans(vsk)->queue_pair_size,
+ err = vmci_transport_send_conn_request2(sk, vsk->buffer_size,
supported_proto_versions);
if (err < 0) {
sk->sk_state = TCP_CLOSE;
@@ -1881,46 +1864,6 @@ static bool vmci_transport_stream_is_active(struct vsock_sock *vsk)
return !vmci_handle_is_invalid(vmci_trans(vsk)->qp_handle);
}
-static u64 vmci_transport_get_buffer_size(struct vsock_sock *vsk)
-{
- return vmci_trans(vsk)->queue_pair_size;
-}
-
-static u64 vmci_transport_get_min_buffer_size(struct vsock_sock *vsk)
-{
- return vmci_trans(vsk)->queue_pair_min_size;
-}
-
-static u64 vmci_transport_get_max_buffer_size(struct vsock_sock *vsk)
-{
- return vmci_trans(vsk)->queue_pair_max_size;
-}
-
-static void vmci_transport_set_buffer_size(struct vsock_sock *vsk, u64 val)
-{
- if (val < vmci_trans(vsk)->queue_pair_min_size)
- vmci_trans(vsk)->queue_pair_min_size = val;
- if (val > vmci_trans(vsk)->queue_pair_max_size)
- vmci_trans(vsk)->queue_pair_max_size = val;
- vmci_trans(vsk)->queue_pair_size = val;
-}
-
-static void vmci_transport_set_min_buffer_size(struct vsock_sock *vsk,
- u64 val)
-{
- if (val > vmci_trans(vsk)->queue_pair_size)
- vmci_trans(vsk)->queue_pair_size = val;
- vmci_trans(vsk)->queue_pair_min_size = val;
-}
-
-static void vmci_transport_set_max_buffer_size(struct vsock_sock *vsk,
- u64 val)
-{
- if (val < vmci_trans(vsk)->queue_pair_size)
- vmci_trans(vsk)->queue_pair_size = val;
- vmci_trans(vsk)->queue_pair_max_size = val;
-}
-
static int vmci_transport_notify_poll_in(
struct vsock_sock *vsk,
size_t target,
@@ -2076,7 +2019,8 @@ static u32 vmci_transport_get_local_cid(void)
return vmci_get_context_id();
}
-static const struct vsock_transport vmci_transport = {
+static struct vsock_transport vmci_transport = {
+ .module = THIS_MODULE,
.init = vmci_transport_socket_init,
.destruct = vmci_transport_destruct,
.release = vmci_transport_release,
@@ -2103,15 +2047,26 @@ static const struct vsock_transport vmci_transport = {
.notify_send_pre_enqueue = vmci_transport_notify_send_pre_enqueue,
.notify_send_post_enqueue = vmci_transport_notify_send_post_enqueue,
.shutdown = vmci_transport_shutdown,
- .set_buffer_size = vmci_transport_set_buffer_size,
- .set_min_buffer_size = vmci_transport_set_min_buffer_size,
- .set_max_buffer_size = vmci_transport_set_max_buffer_size,
- .get_buffer_size = vmci_transport_get_buffer_size,
- .get_min_buffer_size = vmci_transport_get_min_buffer_size,
- .get_max_buffer_size = vmci_transport_get_max_buffer_size,
.get_local_cid = vmci_transport_get_local_cid,
};
+static bool vmci_check_transport(struct vsock_sock *vsk)
+{
+ return vsk->transport == &vmci_transport;
+}
+
+void vmci_vsock_transport_cb(bool is_host)
+{
+ int features;
+
+ if (is_host)
+ features = VSOCK_TRANSPORT_F_H2G;
+ else
+ features = VSOCK_TRANSPORT_F_G2H;
+
+ vsock_core_register(&vmci_transport, features);
+}
+
static int __init vmci_transport_init(void)
{
int err;
@@ -2128,7 +2083,6 @@ static int __init vmci_transport_init(void)
pr_err("Unable to create datagram handle. (%d)\n", err);
return vmci_transport_error_to_vsock_error(err);
}
-
err = vmci_event_subscribe(VMCI_EVENT_QP_RESUMED,
vmci_transport_qp_resumed_cb,
NULL, &vmci_transport_qp_resumed_sub_id);
@@ -2139,12 +2093,21 @@ static int __init vmci_transport_init(void)
goto err_destroy_stream_handle;
}
- err = vsock_core_init(&vmci_transport);
+ /* Register only with dgram feature, other features (H2G, G2H) will be
+ * registered when the first host or guest becomes active.
+ */
+ err = vsock_core_register(&vmci_transport, VSOCK_TRANSPORT_F_DGRAM);
if (err < 0)
goto err_unsubscribe;
+ err = vmci_register_vsock_callback(vmci_vsock_transport_cb);
+ if (err < 0)
+ goto err_unregister;
+
return 0;
+err_unregister:
+ vsock_core_unregister(&vmci_transport);
err_unsubscribe:
vmci_event_unsubscribe(vmci_transport_qp_resumed_sub_id);
err_destroy_stream_handle:
@@ -2170,7 +2133,8 @@ static void __exit vmci_transport_exit(void)
vmci_transport_qp_resumed_sub_id = VMCI_INVALID_ID;
}
- vsock_core_exit();
+ vmci_register_vsock_callback(NULL);
+ vsock_core_unregister(&vmci_transport);
}
module_exit(vmci_transport_exit);
diff --git a/net/vmw_vsock/vmci_transport.h b/net/vmw_vsock/vmci_transport.h
index 1ca1e8640b31..b7b072194282 100644
--- a/net/vmw_vsock/vmci_transport.h
+++ b/net/vmw_vsock/vmci_transport.h
@@ -108,9 +108,6 @@ struct vmci_transport {
struct vmci_qp *qpair;
u64 produce_size;
u64 consume_size;
- u64 queue_pair_size;
- u64 queue_pair_min_size;
- u64 queue_pair_max_size;
u32 detach_sub_id;
union vmci_transport_notify notify;
const struct vmci_transport_notify_ops *notify_ops;
diff --git a/net/vmw_vsock/vmci_transport_notify.h b/net/vmw_vsock/vmci_transport_notify.h
index 7843f08d4290..a1aa5a998c0e 100644
--- a/net/vmw_vsock/vmci_transport_notify.h
+++ b/net/vmw_vsock/vmci_transport_notify.h
@@ -11,7 +11,6 @@
#include <linux/types.h>
#include <linux/vmw_vmci_defs.h>
#include <linux/vmw_vmci_api.h>
-#include <linux/vm_sockets.h>
#include "vmci_transport.h"
diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c
new file mode 100644
index 000000000000..a45f7ffca8c5
--- /dev/null
+++ b/net/vmw_vsock/vsock_loopback.c
@@ -0,0 +1,180 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* loopback transport for vsock using virtio_transport_common APIs
+ *
+ * Copyright (C) 2013-2019 Red Hat, Inc.
+ * Authors: Asias He <asias@redhat.com>
+ * Stefan Hajnoczi <stefanha@redhat.com>
+ * Stefano Garzarella <sgarzare@redhat.com>
+ *
+ */
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/virtio_vsock.h>
+
+struct vsock_loopback {
+ struct workqueue_struct *workqueue;
+
+ spinlock_t pkt_list_lock; /* protects pkt_list */
+ struct list_head pkt_list;
+ struct work_struct pkt_work;
+};
+
+static struct vsock_loopback the_vsock_loopback;
+
+static u32 vsock_loopback_get_local_cid(void)
+{
+ return VMADDR_CID_LOCAL;
+}
+
+static int vsock_loopback_send_pkt(struct virtio_vsock_pkt *pkt)
+{
+ struct vsock_loopback *vsock = &the_vsock_loopback;
+ int len = pkt->len;
+
+ spin_lock_bh(&vsock->pkt_list_lock);
+ list_add_tail(&pkt->list, &vsock->pkt_list);
+ spin_unlock_bh(&vsock->pkt_list_lock);
+
+ queue_work(vsock->workqueue, &vsock->pkt_work);
+
+ return len;
+}
+
+static int vsock_loopback_cancel_pkt(struct vsock_sock *vsk)
+{
+ struct vsock_loopback *vsock = &the_vsock_loopback;
+ struct virtio_vsock_pkt *pkt, *n;
+ LIST_HEAD(freeme);
+
+ spin_lock_bh(&vsock->pkt_list_lock);
+ list_for_each_entry_safe(pkt, n, &vsock->pkt_list, list) {
+ if (pkt->vsk != vsk)
+ continue;
+ list_move(&pkt->list, &freeme);
+ }
+ spin_unlock_bh(&vsock->pkt_list_lock);
+
+ list_for_each_entry_safe(pkt, n, &freeme, list) {
+ list_del(&pkt->list);
+ virtio_transport_free_pkt(pkt);
+ }
+
+ return 0;
+}
+
+static struct virtio_transport loopback_transport = {
+ .transport = {
+ .module = THIS_MODULE,
+
+ .get_local_cid = vsock_loopback_get_local_cid,
+
+ .init = virtio_transport_do_socket_init,
+ .destruct = virtio_transport_destruct,
+ .release = virtio_transport_release,
+ .connect = virtio_transport_connect,
+ .shutdown = virtio_transport_shutdown,
+ .cancel_pkt = vsock_loopback_cancel_pkt,
+
+ .dgram_bind = virtio_transport_dgram_bind,
+ .dgram_dequeue = virtio_transport_dgram_dequeue,
+ .dgram_enqueue = virtio_transport_dgram_enqueue,
+ .dgram_allow = virtio_transport_dgram_allow,
+
+ .stream_dequeue = virtio_transport_stream_dequeue,
+ .stream_enqueue = virtio_transport_stream_enqueue,
+ .stream_has_data = virtio_transport_stream_has_data,
+ .stream_has_space = virtio_transport_stream_has_space,
+ .stream_rcvhiwat = virtio_transport_stream_rcvhiwat,
+ .stream_is_active = virtio_transport_stream_is_active,
+ .stream_allow = virtio_transport_stream_allow,
+
+ .notify_poll_in = virtio_transport_notify_poll_in,
+ .notify_poll_out = virtio_transport_notify_poll_out,
+ .notify_recv_init = virtio_transport_notify_recv_init,
+ .notify_recv_pre_block = virtio_transport_notify_recv_pre_block,
+ .notify_recv_pre_dequeue = virtio_transport_notify_recv_pre_dequeue,
+ .notify_recv_post_dequeue = virtio_transport_notify_recv_post_dequeue,
+ .notify_send_init = virtio_transport_notify_send_init,
+ .notify_send_pre_block = virtio_transport_notify_send_pre_block,
+ .notify_send_pre_enqueue = virtio_transport_notify_send_pre_enqueue,
+ .notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue,
+ .notify_buffer_size = virtio_transport_notify_buffer_size,
+ },
+
+ .send_pkt = vsock_loopback_send_pkt,
+};
+
+static void vsock_loopback_work(struct work_struct *work)
+{
+ struct vsock_loopback *vsock =
+ container_of(work, struct vsock_loopback, pkt_work);
+ LIST_HEAD(pkts);
+
+ spin_lock_bh(&vsock->pkt_list_lock);
+ list_splice_init(&vsock->pkt_list, &pkts);
+ spin_unlock_bh(&vsock->pkt_list_lock);
+
+ while (!list_empty(&pkts)) {
+ struct virtio_vsock_pkt *pkt;
+
+ pkt = list_first_entry(&pkts, struct virtio_vsock_pkt, list);
+ list_del_init(&pkt->list);
+
+ virtio_transport_deliver_tap_pkt(pkt);
+ virtio_transport_recv_pkt(&loopback_transport, pkt);
+ }
+}
+
+static int __init vsock_loopback_init(void)
+{
+ struct vsock_loopback *vsock = &the_vsock_loopback;
+ int ret;
+
+ vsock->workqueue = alloc_workqueue("vsock-loopback", 0, 0);
+ if (!vsock->workqueue)
+ return -ENOMEM;
+
+ spin_lock_init(&vsock->pkt_list_lock);
+ INIT_LIST_HEAD(&vsock->pkt_list);
+ INIT_WORK(&vsock->pkt_work, vsock_loopback_work);
+
+ ret = vsock_core_register(&loopback_transport.transport,
+ VSOCK_TRANSPORT_F_LOCAL);
+ if (ret)
+ goto out_wq;
+
+ return 0;
+
+out_wq:
+ destroy_workqueue(vsock->workqueue);
+ return ret;
+}
+
+static void __exit vsock_loopback_exit(void)
+{
+ struct vsock_loopback *vsock = &the_vsock_loopback;
+ struct virtio_vsock_pkt *pkt;
+
+ vsock_core_unregister(&loopback_transport.transport);
+
+ flush_work(&vsock->pkt_work);
+
+ spin_lock_bh(&vsock->pkt_list_lock);
+ while (!list_empty(&vsock->pkt_list)) {
+ pkt = list_first_entry(&vsock->pkt_list,
+ struct virtio_vsock_pkt, list);
+ list_del(&pkt->list);
+ virtio_transport_free_pkt(pkt);
+ }
+ spin_unlock_bh(&vsock->pkt_list_lock);
+
+ destroy_workqueue(vsock->workqueue);
+}
+
+module_init(vsock_loopback_init);
+module_exit(vsock_loopback_exit);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Stefano Garzarella <sgarzare@redhat.com>");
+MODULE_DESCRIPTION("loopback transport for vsock");
+MODULE_ALIAS_NETPROTO(PF_VSOCK);
diff --git a/net/wimax/debugfs.c b/net/wimax/debugfs.c
index 1af56df30276..3c54bb6b925a 100644
--- a/net/wimax/debugfs.c
+++ b/net/wimax/debugfs.c
@@ -13,49 +13,23 @@
#define D_SUBMODULE debugfs
#include "debug-levels.h"
-
-#define __debugfs_register(prefix, name, parent) \
-do { \
- result = d_level_register_debugfs(prefix, name, parent); \
- if (result < 0) \
- goto error; \
-} while (0)
-
-
-int wimax_debugfs_add(struct wimax_dev *wimax_dev)
+void wimax_debugfs_add(struct wimax_dev *wimax_dev)
{
- int result;
struct net_device *net_dev = wimax_dev->net_dev;
- struct device *dev = net_dev->dev.parent;
struct dentry *dentry;
char buf[128];
snprintf(buf, sizeof(buf), "wimax:%s", net_dev->name);
dentry = debugfs_create_dir(buf, NULL);
- result = PTR_ERR(dentry);
- if (IS_ERR(dentry)) {
- if (result == -ENODEV)
- result = 0; /* No debugfs support */
- else
- dev_err(dev, "Can't create debugfs dentry: %d\n",
- result);
- goto out;
- }
wimax_dev->debugfs_dentry = dentry;
- __debugfs_register("wimax_dl_", debugfs, dentry);
- __debugfs_register("wimax_dl_", id_table, dentry);
- __debugfs_register("wimax_dl_", op_msg, dentry);
- __debugfs_register("wimax_dl_", op_reset, dentry);
- __debugfs_register("wimax_dl_", op_rfkill, dentry);
- __debugfs_register("wimax_dl_", op_state_get, dentry);
- __debugfs_register("wimax_dl_", stack, dentry);
- result = 0;
-out:
- return result;
-error:
- debugfs_remove_recursive(wimax_dev->debugfs_dentry);
- return result;
+ d_level_register_debugfs("wimax_dl_", debugfs, dentry);
+ d_level_register_debugfs("wimax_dl_", id_table, dentry);
+ d_level_register_debugfs("wimax_dl_", op_msg, dentry);
+ d_level_register_debugfs("wimax_dl_", op_reset, dentry);
+ d_level_register_debugfs("wimax_dl_", op_rfkill, dentry);
+ d_level_register_debugfs("wimax_dl_", op_state_get, dentry);
+ d_level_register_debugfs("wimax_dl_", stack, dentry);
}
void wimax_debugfs_rm(struct wimax_dev *wimax_dev)
diff --git a/net/wimax/stack.c b/net/wimax/stack.c
index 1ba99d65feca..4b9b1c5e8f3a 100644
--- a/net/wimax/stack.c
+++ b/net/wimax/stack.c
@@ -481,12 +481,7 @@ int wimax_dev_add(struct wimax_dev *wimax_dev, struct net_device *net_dev)
/* Set up user-space interaction */
mutex_lock(&wimax_dev->mutex);
wimax_id_table_add(wimax_dev);
- result = wimax_debugfs_add(wimax_dev);
- if (result < 0) {
- dev_err(dev, "cannot initialize debugfs: %d\n",
- result);
- goto error_debugfs_add;
- }
+ wimax_debugfs_add(wimax_dev);
__wimax_state_set(wimax_dev, WIMAX_ST_DOWN);
mutex_unlock(&wimax_dev->mutex);
@@ -498,10 +493,6 @@ int wimax_dev_add(struct wimax_dev *wimax_dev, struct net_device *net_dev)
d_fnend(3, dev, "(wimax_dev %p net_dev %p) = 0\n", wimax_dev, net_dev);
return 0;
-error_debugfs_add:
- wimax_id_table_rm(wimax_dev);
- mutex_unlock(&wimax_dev->mutex);
- wimax_rfkill_rm(wimax_dev);
error_rfkill_add:
d_fnend(3, dev, "(wimax_dev %p net_dev %p) = %d\n",
wimax_dev, net_dev, result);
diff --git a/net/wimax/wimax-internal.h b/net/wimax/wimax-internal.h
index e819a09337ee..40751207296c 100644
--- a/net/wimax/wimax-internal.h
+++ b/net/wimax/wimax-internal.h
@@ -57,13 +57,10 @@ void __wimax_state_set(struct wimax_dev *wimax_dev, enum wimax_st state)
void __wimax_state_change(struct wimax_dev *, enum wimax_st);
#ifdef CONFIG_DEBUG_FS
-int wimax_debugfs_add(struct wimax_dev *);
+void wimax_debugfs_add(struct wimax_dev *);
void wimax_debugfs_rm(struct wimax_dev *);
#else
-static inline int wimax_debugfs_add(struct wimax_dev *wimax_dev)
-{
- return 0;
-}
+static inline void wimax_debugfs_add(struct wimax_dev *wimax_dev) {}
static inline void wimax_debugfs_rm(struct wimax_dev *wimax_dev) {}
#endif
diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig
index 67f8360dfcee..63cf7131f601 100644
--- a/net/wireless/Kconfig
+++ b/net/wireless/Kconfig
@@ -217,6 +217,8 @@ config LIB80211_CRYPT_WEP
config LIB80211_CRYPT_CCMP
tristate
+ select CRYPTO_AES
+ select CRYPTO_CCM
config LIB80211_CRYPT_TKIP
tristate
diff --git a/net/wireless/chan.c b/net/wireless/chan.c
index 7dc1bbd0888f..fcac5c6366e1 100644
--- a/net/wireless/chan.c
+++ b/net/wireless/chan.c
@@ -14,6 +14,11 @@
#include "core.h"
#include "rdev-ops.h"
+static bool cfg80211_valid_60g_freq(u32 freq)
+{
+ return freq >= 58320 && freq <= 70200;
+}
+
void cfg80211_chandef_create(struct cfg80211_chan_def *chandef,
struct ieee80211_channel *chan,
enum nl80211_channel_type chan_type)
@@ -23,6 +28,8 @@ void cfg80211_chandef_create(struct cfg80211_chan_def *chandef,
chandef->chan = chan;
chandef->center_freq2 = 0;
+ chandef->edmg.bw_config = 0;
+ chandef->edmg.channels = 0;
switch (chan_type) {
case NL80211_CHAN_NO_HT:
@@ -47,6 +54,91 @@ void cfg80211_chandef_create(struct cfg80211_chan_def *chandef,
}
EXPORT_SYMBOL(cfg80211_chandef_create);
+static bool cfg80211_edmg_chandef_valid(const struct cfg80211_chan_def *chandef)
+{
+ int max_contiguous = 0;
+ int num_of_enabled = 0;
+ int contiguous = 0;
+ int i;
+
+ if (!chandef->edmg.channels || !chandef->edmg.bw_config)
+ return false;
+
+ if (!cfg80211_valid_60g_freq(chandef->chan->center_freq))
+ return false;
+
+ for (i = 0; i < 6; i++) {
+ if (chandef->edmg.channels & BIT(i)) {
+ contiguous++;
+ num_of_enabled++;
+ } else {
+ contiguous = 0;
+ }
+
+ max_contiguous = max(contiguous, max_contiguous);
+ }
+ /* basic verification of edmg configuration according to
+ * IEEE P802.11ay/D4.0 section 9.4.2.251
+ */
+ /* check bw_config against contiguous edmg channels */
+ switch (chandef->edmg.bw_config) {
+ case IEEE80211_EDMG_BW_CONFIG_4:
+ case IEEE80211_EDMG_BW_CONFIG_8:
+ case IEEE80211_EDMG_BW_CONFIG_12:
+ if (max_contiguous < 1)
+ return false;
+ break;
+ case IEEE80211_EDMG_BW_CONFIG_5:
+ case IEEE80211_EDMG_BW_CONFIG_9:
+ case IEEE80211_EDMG_BW_CONFIG_13:
+ if (max_contiguous < 2)
+ return false;
+ break;
+ case IEEE80211_EDMG_BW_CONFIG_6:
+ case IEEE80211_EDMG_BW_CONFIG_10:
+ case IEEE80211_EDMG_BW_CONFIG_14:
+ if (max_contiguous < 3)
+ return false;
+ break;
+ case IEEE80211_EDMG_BW_CONFIG_7:
+ case IEEE80211_EDMG_BW_CONFIG_11:
+ case IEEE80211_EDMG_BW_CONFIG_15:
+ if (max_contiguous < 4)
+ return false;
+ break;
+
+ default:
+ return false;
+ }
+
+ /* check bw_config against aggregated (non contiguous) edmg channels */
+ switch (chandef->edmg.bw_config) {
+ case IEEE80211_EDMG_BW_CONFIG_4:
+ case IEEE80211_EDMG_BW_CONFIG_5:
+ case IEEE80211_EDMG_BW_CONFIG_6:
+ case IEEE80211_EDMG_BW_CONFIG_7:
+ break;
+ case IEEE80211_EDMG_BW_CONFIG_8:
+ case IEEE80211_EDMG_BW_CONFIG_9:
+ case IEEE80211_EDMG_BW_CONFIG_10:
+ case IEEE80211_EDMG_BW_CONFIG_11:
+ if (num_of_enabled < 2)
+ return false;
+ break;
+ case IEEE80211_EDMG_BW_CONFIG_12:
+ case IEEE80211_EDMG_BW_CONFIG_13:
+ case IEEE80211_EDMG_BW_CONFIG_14:
+ case IEEE80211_EDMG_BW_CONFIG_15:
+ if (num_of_enabled < 4 || max_contiguous < 2)
+ return false;
+ break;
+ default:
+ return false;
+ }
+
+ return true;
+}
+
bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef)
{
u32 control_freq;
@@ -112,6 +204,15 @@ bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef)
return false;
}
+ /* channel 14 is only for IEEE 802.11b */
+ if (chandef->center_freq1 == 2484 &&
+ chandef->width != NL80211_CHAN_WIDTH_20_NOHT)
+ return false;
+
+ if (cfg80211_chandef_is_edmg(chandef) &&
+ !cfg80211_edmg_chandef_valid(chandef))
+ return false;
+
return true;
}
EXPORT_SYMBOL(cfg80211_chandef_valid);
@@ -721,12 +822,66 @@ static bool cfg80211_secondary_chans_ok(struct wiphy *wiphy,
return true;
}
+/* check if the operating channels are valid and supported */
+static bool cfg80211_edmg_usable(struct wiphy *wiphy, u8 edmg_channels,
+ enum ieee80211_edmg_bw_config edmg_bw_config,
+ int primary_channel,
+ struct ieee80211_edmg *edmg_cap)
+{
+ struct ieee80211_channel *chan;
+ int i, freq;
+ int channels_counter = 0;
+
+ if (!edmg_channels && !edmg_bw_config)
+ return true;
+
+ if ((!edmg_channels && edmg_bw_config) ||
+ (edmg_channels && !edmg_bw_config))
+ return false;
+
+ if (!(edmg_channels & BIT(primary_channel - 1)))
+ return false;
+
+ /* 60GHz channels 1..6 */
+ for (i = 0; i < 6; i++) {
+ if (!(edmg_channels & BIT(i)))
+ continue;
+
+ if (!(edmg_cap->channels & BIT(i)))
+ return false;
+
+ channels_counter++;
+
+ freq = ieee80211_channel_to_frequency(i + 1,
+ NL80211_BAND_60GHZ);
+ chan = ieee80211_get_channel(wiphy, freq);
+ if (!chan || chan->flags & IEEE80211_CHAN_DISABLED)
+ return false;
+ }
+
+ /* IEEE802.11 allows max 4 channels */
+ if (channels_counter > 4)
+ return false;
+
+ /* check bw_config is a subset of what driver supports
+ * (see IEEE P802.11ay/D4.0 section 9.4.2.251, Table 13)
+ */
+ if ((edmg_bw_config % 4) > (edmg_cap->bw_config % 4))
+ return false;
+
+ if (edmg_bw_config > edmg_cap->bw_config)
+ return false;
+
+ return true;
+}
+
bool cfg80211_chandef_usable(struct wiphy *wiphy,
const struct cfg80211_chan_def *chandef,
u32 prohibited_flags)
{
struct ieee80211_sta_ht_cap *ht_cap;
struct ieee80211_sta_vht_cap *vht_cap;
+ struct ieee80211_edmg *edmg_cap;
u32 width, control_freq, cap;
if (WARN_ON(!cfg80211_chandef_valid(chandef)))
@@ -734,6 +889,15 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy,
ht_cap = &wiphy->bands[chandef->chan->band]->ht_cap;
vht_cap = &wiphy->bands[chandef->chan->band]->vht_cap;
+ edmg_cap = &wiphy->bands[chandef->chan->band]->edmg_cap;
+
+ if (edmg_cap->channels &&
+ !cfg80211_edmg_usable(wiphy,
+ chandef->edmg.channels,
+ chandef->edmg.bw_config,
+ chandef->chan->hw_value,
+ edmg_cap))
+ return false;
control_freq = chandef->chan->center_freq;
@@ -894,7 +1058,8 @@ static bool cfg80211_ir_permissive_chan(struct wiphy *wiphy,
if (chan == other_chan)
return true;
- if (chan->band != NL80211_BAND_5GHZ)
+ if (chan->band != NL80211_BAND_5GHZ &&
+ chan->band != NL80211_BAND_6GHZ)
continue;
r1 = cfg80211_get_unii(chan->center_freq);
diff --git a/net/wireless/core.c b/net/wireless/core.c
index 32b3c719fdfc..3e25229a059d 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -5,7 +5,7 @@
* Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
* Copyright 2015-2017 Intel Deutschland GmbH
- * Copyright (C) 2018 Intel Corporation
+ * Copyright (C) 2018-2019 Intel Corporation
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -142,12 +142,10 @@ int cfg80211_dev_rename(struct cfg80211_registered_device *rdev,
if (result)
return result;
- if (rdev->wiphy.debugfsdir &&
- !debugfs_rename(rdev->wiphy.debugfsdir->d_parent,
- rdev->wiphy.debugfsdir,
- rdev->wiphy.debugfsdir->d_parent,
- newname))
- pr_err("failed to rename debugfs dir to %s!\n", newname);
+ if (rdev->wiphy.debugfsdir)
+ debugfs_rename(rdev->wiphy.debugfsdir->d_parent,
+ rdev->wiphy.debugfsdir,
+ rdev->wiphy.debugfsdir->d_parent, newname);
nl80211_notify_wiphy(rdev, NL80211_CMD_NEW_WIPHY);
@@ -302,12 +300,13 @@ static int cfg80211_rfkill_set_block(void *data, bool blocked)
return 0;
}
-static void cfg80211_rfkill_sync_work(struct work_struct *work)
+static void cfg80211_rfkill_block_work(struct work_struct *work)
{
struct cfg80211_registered_device *rdev;
- rdev = container_of(work, struct cfg80211_registered_device, rfkill_sync);
- cfg80211_rfkill_set_block(rdev, rfkill_blocked(rdev->rfkill));
+ rdev = container_of(work, struct cfg80211_registered_device,
+ rfkill_block);
+ cfg80211_rfkill_set_block(rdev, true);
}
static void cfg80211_event_work(struct work_struct *work)
@@ -518,7 +517,7 @@ use_default_name:
return NULL;
}
- INIT_WORK(&rdev->rfkill_sync, cfg80211_rfkill_sync_work);
+ INIT_WORK(&rdev->rfkill_block, cfg80211_rfkill_block_work);
INIT_WORK(&rdev->conn_work, cfg80211_conn_work);
INIT_WORK(&rdev->event_work, cfg80211_event_work);
@@ -899,11 +898,8 @@ int wiphy_register(struct wiphy *wiphy)
cfg80211_rdev_list_generation++;
/* add to debugfs */
- rdev->wiphy.debugfsdir =
- debugfs_create_dir(wiphy_name(&rdev->wiphy),
- ieee80211_debugfs_dir);
- if (IS_ERR(rdev->wiphy.debugfsdir))
- rdev->wiphy.debugfsdir = NULL;
+ rdev->wiphy.debugfsdir = debugfs_create_dir(wiphy_name(&rdev->wiphy),
+ ieee80211_debugfs_dir);
cfg80211_debugfs_rdev_add(rdev);
nl80211_notify_wiphy(rdev, NL80211_CMD_NEW_WIPHY);
@@ -1066,7 +1062,7 @@ void wiphy_rfkill_set_hw_state(struct wiphy *wiphy, bool blocked)
struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
if (rfkill_set_hw_state(rdev->rfkill, blocked))
- schedule_work(&rdev->rfkill_sync);
+ schedule_work(&rdev->rfkill_block);
}
EXPORT_SYMBOL(wiphy_rfkill_set_hw_state);
@@ -1106,6 +1102,7 @@ static void __cfg80211_unregister_wdev(struct wireless_dev *wdev, bool sync)
#ifdef CONFIG_CFG80211_WEXT
kzfree(wdev->wext.keys);
+ wdev->wext.keys = NULL;
#endif
/* only initialized if we have a netdev */
if (wdev->netdev)
diff --git a/net/wireless/core.h b/net/wireless/core.h
index ee8388fe4a92..ed487e324571 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -28,7 +28,7 @@ struct cfg80211_registered_device {
/* rfkill support */
struct rfkill_ops rfkill_ops;
struct rfkill *rfkill;
- struct work_struct rfkill_sync;
+ struct work_struct rfkill_block;
/* ISO / IEC 3166 alpha2 for which this device is receiving
* country IEs on, this can help disregard country IEs from APs
@@ -306,6 +306,8 @@ void ieee80211_set_bitrate_flags(struct wiphy *wiphy);
void cfg80211_bss_expire(struct cfg80211_registered_device *rdev);
void cfg80211_bss_age(struct cfg80211_registered_device *rdev,
unsigned long age_secs);
+void cfg80211_update_assoc_bss_entry(struct wireless_dev *wdev,
+ struct ieee80211_channel *channel);
/* IBSS */
int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev,
diff --git a/net/wireless/ethtool.c b/net/wireless/ethtool.c
index a9c0f368db5d..24e18405cdb4 100644
--- a/net/wireless/ethtool.c
+++ b/net/wireless/ethtool.c
@@ -7,9 +7,13 @@
void cfg80211_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
{
struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct device *pdev = wiphy_dev(wdev->wiphy);
- strlcpy(info->driver, wiphy_dev(wdev->wiphy)->driver->name,
- sizeof(info->driver));
+ if (pdev->driver)
+ strlcpy(info->driver, pdev->driver->name,
+ sizeof(info->driver));
+ else
+ strlcpy(info->driver, "N/A", sizeof(info->driver));
strlcpy(info->version, init_utsname()->release, sizeof(info->version));
diff --git a/net/wireless/ibss.c b/net/wireless/ibss.c
index d1743e6abc34..ae8fe66a9bb8 100644
--- a/net/wireless/ibss.c
+++ b/net/wireless/ibss.c
@@ -104,13 +104,19 @@ int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev,
* use the mandatory rate set for 11b or
* 11a for maximum compatibility.
*/
- struct ieee80211_supported_band *sband =
- rdev->wiphy.bands[params->chandef.chan->band];
+ struct ieee80211_supported_band *sband;
+ enum nl80211_band band;
+ u32 flag;
int j;
- u32 flag = params->chandef.chan->band == NL80211_BAND_5GHZ ?
- IEEE80211_RATE_MANDATORY_A :
- IEEE80211_RATE_MANDATORY_B;
+ band = params->chandef.chan->band;
+ if (band == NL80211_BAND_5GHZ ||
+ band == NL80211_BAND_6GHZ)
+ flag = IEEE80211_RATE_MANDATORY_A;
+ else
+ flag = IEEE80211_RATE_MANDATORY_B;
+
+ sband = rdev->wiphy.bands[band];
for (j = 0; j < sband->n_bitrates; j++) {
if (sband->bitrates[j].flags & flag)
params->basic_rates |= BIT(j);
diff --git a/net/wireless/lib80211_crypt_ccmp.c b/net/wireless/lib80211_crypt_ccmp.c
index 7e8ff9d7dcfa..6a5f08f7491e 100644
--- a/net/wireless/lib80211_crypt_ccmp.c
+++ b/net/wireless/lib80211_crypt_ccmp.c
@@ -22,6 +22,7 @@
#include <linux/ieee80211.h>
#include <linux/crypto.h>
+#include <crypto/aead.h>
#include <net/lib80211.h>
@@ -48,20 +49,13 @@ struct lib80211_ccmp_data {
int key_idx;
- struct crypto_cipher *tfm;
+ struct crypto_aead *tfm;
/* scratch buffers for virt_to_page() (crypto API) */
- u8 tx_b0[AES_BLOCK_LEN], tx_b[AES_BLOCK_LEN],
- tx_e[AES_BLOCK_LEN], tx_s0[AES_BLOCK_LEN];
- u8 rx_b0[AES_BLOCK_LEN], rx_b[AES_BLOCK_LEN], rx_a[AES_BLOCK_LEN];
+ u8 tx_aad[2 * AES_BLOCK_LEN];
+ u8 rx_aad[2 * AES_BLOCK_LEN];
};
-static inline void lib80211_ccmp_aes_encrypt(struct crypto_cipher *tfm,
- const u8 pt[16], u8 ct[16])
-{
- crypto_cipher_encrypt_one(tfm, ct, pt);
-}
-
static void *lib80211_ccmp_init(int key_idx)
{
struct lib80211_ccmp_data *priv;
@@ -71,7 +65,7 @@ static void *lib80211_ccmp_init(int key_idx)
goto fail;
priv->key_idx = key_idx;
- priv->tfm = crypto_alloc_cipher("aes", 0, 0);
+ priv->tfm = crypto_alloc_aead("ccm(aes)", 0, CRYPTO_ALG_ASYNC);
if (IS_ERR(priv->tfm)) {
priv->tfm = NULL;
goto fail;
@@ -82,7 +76,7 @@ static void *lib80211_ccmp_init(int key_idx)
fail:
if (priv) {
if (priv->tfm)
- crypto_free_cipher(priv->tfm);
+ crypto_free_aead(priv->tfm);
kfree(priv);
}
@@ -93,25 +87,16 @@ static void lib80211_ccmp_deinit(void *priv)
{
struct lib80211_ccmp_data *_priv = priv;
if (_priv && _priv->tfm)
- crypto_free_cipher(_priv->tfm);
+ crypto_free_aead(_priv->tfm);
kfree(priv);
}
-static inline void xor_block(u8 * b, u8 * a, size_t len)
-{
- int i;
- for (i = 0; i < len; i++)
- b[i] ^= a[i];
-}
-
-static void ccmp_init_blocks(struct crypto_cipher *tfm,
- struct ieee80211_hdr *hdr,
- u8 * pn, size_t dlen, u8 * b0, u8 * auth, u8 * s0)
+static int ccmp_init_iv_and_aad(const struct ieee80211_hdr *hdr,
+ const u8 *pn, u8 *iv, u8 *aad)
{
u8 *pos, qc = 0;
size_t aad_len;
int a4_included, qc_included;
- u8 aad[2 * AES_BLOCK_LEN];
a4_included = ieee80211_has_a4(hdr->frame_control);
qc_included = ieee80211_is_data_qos(hdr->frame_control);
@@ -127,17 +112,19 @@ static void ccmp_init_blocks(struct crypto_cipher *tfm,
aad_len += 2;
}
- /* CCM Initial Block:
- * Flag (Include authentication header, M=3 (8-octet MIC),
- * L=1 (2-octet Dlen))
- * Nonce: 0x00 | A2 | PN
- * Dlen */
- b0[0] = 0x59;
- b0[1] = qc;
- memcpy(b0 + 2, hdr->addr2, ETH_ALEN);
- memcpy(b0 + 8, pn, CCMP_PN_LEN);
- b0[14] = (dlen >> 8) & 0xff;
- b0[15] = dlen & 0xff;
+ /* In CCM, the initial vectors (IV) used for CTR mode encryption and CBC
+ * mode authentication are not allowed to collide, yet both are derived
+ * from the same vector. We only set L := 1 here to indicate that the
+ * data size can be represented in (L+1) bytes. The CCM layer will take
+ * care of storing the data length in the top (L+1) bytes and setting
+ * and clearing the other bits as is required to derive the two IVs.
+ */
+ iv[0] = 0x1;
+
+ /* Nonce: QC | A2 | PN */
+ iv[1] = qc;
+ memcpy(iv + 2, hdr->addr2, ETH_ALEN);
+ memcpy(iv + 8, pn, CCMP_PN_LEN);
/* AAD:
* FC with bits 4..6 and 11..13 masked to zero; 14 is always one
@@ -147,31 +134,20 @@ static void ccmp_init_blocks(struct crypto_cipher *tfm,
* QC (if present)
*/
pos = (u8 *) hdr;
- aad[0] = 0; /* aad_len >> 8 */
- aad[1] = aad_len & 0xff;
- aad[2] = pos[0] & 0x8f;
- aad[3] = pos[1] & 0xc7;
- memcpy(aad + 4, hdr->addr1, 3 * ETH_ALEN);
+ aad[0] = pos[0] & 0x8f;
+ aad[1] = pos[1] & 0xc7;
+ memcpy(aad + 2, hdr->addr1, 3 * ETH_ALEN);
pos = (u8 *) & hdr->seq_ctrl;
- aad[22] = pos[0] & 0x0f;
- aad[23] = 0; /* all bits masked */
- memset(aad + 24, 0, 8);
+ aad[20] = pos[0] & 0x0f;
+ aad[21] = 0; /* all bits masked */
+ memset(aad + 22, 0, 8);
if (a4_included)
- memcpy(aad + 24, hdr->addr4, ETH_ALEN);
+ memcpy(aad + 22, hdr->addr4, ETH_ALEN);
if (qc_included) {
- aad[a4_included ? 30 : 24] = qc;
+ aad[a4_included ? 28 : 22] = qc;
/* rest of QC masked */
}
-
- /* Start with the first block and AAD */
- lib80211_ccmp_aes_encrypt(tfm, b0, auth);
- xor_block(auth, aad, AES_BLOCK_LEN);
- lib80211_ccmp_aes_encrypt(tfm, auth, auth);
- xor_block(auth, &aad[AES_BLOCK_LEN], AES_BLOCK_LEN);
- lib80211_ccmp_aes_encrypt(tfm, auth, auth);
- b0[0] &= 0x07;
- b0[14] = b0[15] = 0;
- lib80211_ccmp_aes_encrypt(tfm, b0, s0);
+ return aad_len;
}
static int lib80211_ccmp_hdr(struct sk_buff *skb, int hdr_len,
@@ -214,13 +190,13 @@ static int lib80211_ccmp_hdr(struct sk_buff *skb, int hdr_len,
static int lib80211_ccmp_encrypt(struct sk_buff *skb, int hdr_len, void *priv)
{
struct lib80211_ccmp_data *key = priv;
- int data_len, i, blocks, last, len;
- u8 *pos, *mic;
struct ieee80211_hdr *hdr;
- u8 *b0 = key->tx_b0;
- u8 *b = key->tx_b;
- u8 *e = key->tx_e;
- u8 *s0 = key->tx_s0;
+ struct aead_request *req;
+ struct scatterlist sg[2];
+ u8 *aad = key->tx_aad;
+ u8 iv[AES_BLOCK_LEN];
+ int len, data_len, aad_len;
+ int ret;
if (skb_tailroom(skb) < CCMP_MIC_LEN || skb->len < hdr_len)
return -1;
@@ -230,31 +206,28 @@ static int lib80211_ccmp_encrypt(struct sk_buff *skb, int hdr_len, void *priv)
if (len < 0)
return -1;
- pos = skb->data + hdr_len + CCMP_HDR_LEN;
+ req = aead_request_alloc(key->tfm, GFP_ATOMIC);
+ if (!req)
+ return -ENOMEM;
+
hdr = (struct ieee80211_hdr *)skb->data;
- ccmp_init_blocks(key->tfm, hdr, key->tx_pn, data_len, b0, b, s0);
-
- blocks = DIV_ROUND_UP(data_len, AES_BLOCK_LEN);
- last = data_len % AES_BLOCK_LEN;
-
- for (i = 1; i <= blocks; i++) {
- len = (i == blocks && last) ? last : AES_BLOCK_LEN;
- /* Authentication */
- xor_block(b, pos, len);
- lib80211_ccmp_aes_encrypt(key->tfm, b, b);
- /* Encryption, with counter */
- b0[14] = (i >> 8) & 0xff;
- b0[15] = i & 0xff;
- lib80211_ccmp_aes_encrypt(key->tfm, b0, e);
- xor_block(pos, e, len);
- pos += len;
- }
+ aad_len = ccmp_init_iv_and_aad(hdr, key->tx_pn, iv, aad);
- mic = skb_put(skb, CCMP_MIC_LEN);
- for (i = 0; i < CCMP_MIC_LEN; i++)
- mic[i] = b[i] ^ s0[i];
+ skb_put(skb, CCMP_MIC_LEN);
- return 0;
+ sg_init_table(sg, 2);
+ sg_set_buf(&sg[0], aad, aad_len);
+ sg_set_buf(&sg[1], skb->data + hdr_len + CCMP_HDR_LEN,
+ data_len + CCMP_MIC_LEN);
+
+ aead_request_set_callback(req, 0, NULL, NULL);
+ aead_request_set_ad(req, aad_len);
+ aead_request_set_crypt(req, sg, sg, data_len, iv);
+
+ ret = crypto_aead_encrypt(req);
+ aead_request_free(req);
+
+ return ret;
}
/*
@@ -283,13 +256,13 @@ static int lib80211_ccmp_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
struct lib80211_ccmp_data *key = priv;
u8 keyidx, *pos;
struct ieee80211_hdr *hdr;
- u8 *b0 = key->rx_b0;
- u8 *b = key->rx_b;
- u8 *a = key->rx_a;
+ struct aead_request *req;
+ struct scatterlist sg[2];
+ u8 *aad = key->rx_aad;
+ u8 iv[AES_BLOCK_LEN];
u8 pn[6];
- int i, blocks, last, len;
- size_t data_len = skb->len - hdr_len - CCMP_HDR_LEN - CCMP_MIC_LEN;
- u8 *mic = skb->data + skb->len - CCMP_MIC_LEN;
+ int aad_len, ret;
+ size_t data_len = skb->len - hdr_len - CCMP_HDR_LEN;
if (skb->len < hdr_len + CCMP_HDR_LEN + CCMP_MIC_LEN) {
key->dot11RSNAStatsCCMPFormatErrors++;
@@ -337,28 +310,26 @@ static int lib80211_ccmp_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
return -4;
}
- ccmp_init_blocks(key->tfm, hdr, pn, data_len, b0, a, b);
- xor_block(mic, b, CCMP_MIC_LEN);
-
- blocks = DIV_ROUND_UP(data_len, AES_BLOCK_LEN);
- last = data_len % AES_BLOCK_LEN;
-
- for (i = 1; i <= blocks; i++) {
- len = (i == blocks && last) ? last : AES_BLOCK_LEN;
- /* Decrypt, with counter */
- b0[14] = (i >> 8) & 0xff;
- b0[15] = i & 0xff;
- lib80211_ccmp_aes_encrypt(key->tfm, b0, b);
- xor_block(pos, b, len);
- /* Authentication */
- xor_block(a, pos, len);
- lib80211_ccmp_aes_encrypt(key->tfm, a, a);
- pos += len;
- }
+ req = aead_request_alloc(key->tfm, GFP_ATOMIC);
+ if (!req)
+ return -ENOMEM;
- if (memcmp(mic, a, CCMP_MIC_LEN) != 0) {
- net_dbg_ratelimited("CCMP: decrypt failed: STA=%pM\n",
- hdr->addr2);
+ aad_len = ccmp_init_iv_and_aad(hdr, pn, iv, aad);
+
+ sg_init_table(sg, 2);
+ sg_set_buf(&sg[0], aad, aad_len);
+ sg_set_buf(&sg[1], pos, data_len);
+
+ aead_request_set_callback(req, 0, NULL, NULL);
+ aead_request_set_ad(req, aad_len);
+ aead_request_set_crypt(req, sg, sg, data_len, iv);
+
+ ret = crypto_aead_decrypt(req);
+ aead_request_free(req);
+
+ if (ret) {
+ net_dbg_ratelimited("CCMP: decrypt failed: STA=%pM (%d)\n",
+ hdr->addr2, ret);
key->dot11RSNAStatsCCMPDecryptErrors++;
return -5;
}
@@ -377,7 +348,7 @@ static int lib80211_ccmp_set_key(void *key, int len, u8 * seq, void *priv)
{
struct lib80211_ccmp_data *data = priv;
int keyidx;
- struct crypto_cipher *tfm = data->tfm;
+ struct crypto_aead *tfm = data->tfm;
keyidx = data->key_idx;
memset(data, 0, sizeof(*data));
@@ -394,7 +365,9 @@ static int lib80211_ccmp_set_key(void *key, int len, u8 * seq, void *priv)
data->rx_pn[4] = seq[1];
data->rx_pn[5] = seq[0];
}
- crypto_cipher_setkey(data->tfm, data->key, CCMP_TK_LEN);
+ if (crypto_aead_setauthsize(data->tfm, CCMP_MIC_LEN) ||
+ crypto_aead_setkey(data->tfm, data->key, CCMP_TK_LEN))
+ return -1;
} else if (len == 0)
data->key_set = 0;
else
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index fd05ae1437a9..cedf17d4933f 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -201,6 +201,38 @@ cfg80211_get_dev_from_info(struct net *netns, struct genl_info *info)
return __cfg80211_rdev_from_attrs(netns, info->attrs);
}
+static int validate_beacon_head(const struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ const u8 *data = nla_data(attr);
+ unsigned int len = nla_len(attr);
+ const struct element *elem;
+ const struct ieee80211_mgmt *mgmt = (void *)data;
+ unsigned int fixedlen = offsetof(struct ieee80211_mgmt,
+ u.beacon.variable);
+
+ if (len < fixedlen)
+ goto err;
+
+ if (ieee80211_hdrlen(mgmt->frame_control) !=
+ offsetof(struct ieee80211_mgmt, u.beacon))
+ goto err;
+
+ data += fixedlen;
+ len -= fixedlen;
+
+ for_each_element(elem, data, len) {
+ /* nothing */
+ }
+
+ if (for_each_element_completed(elem, data, len))
+ return 0;
+
+err:
+ NL_SET_ERR_MSG_ATTR(extack, attr, "malformed beacon head");
+ return -EINVAL;
+}
+
static int validate_ie_attr(const struct nlattr *attr,
struct netlink_ext_ack *extack)
{
@@ -281,7 +313,16 @@ nl80211_pmsr_attr_policy[NL80211_PMSR_ATTR_MAX + 1] = {
NLA_POLICY_NESTED_ARRAY(nl80211_psmr_peer_attr_policy),
};
+static const struct nla_policy
+he_obss_pd_policy[NL80211_HE_OBSS_PD_ATTR_MAX + 1] = {
+ [NL80211_HE_OBSS_PD_ATTR_MIN_OFFSET] =
+ NLA_POLICY_RANGE(NLA_U8, 1, 20),
+ [NL80211_HE_OBSS_PD_ATTR_MAX_OFFSET] =
+ NLA_POLICY_RANGE(NLA_U8, 1, 20),
+};
+
const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
+ [0] = { .strict_start_type = NL80211_ATTR_HE_OBSS_PD },
[NL80211_ATTR_WIPHY] = { .type = NLA_U32 },
[NL80211_ATTR_WIPHY_NAME] = { .type = NLA_NUL_STRING,
.len = 20-1 },
@@ -289,6 +330,13 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
[NL80211_ATTR_WIPHY_FREQ] = { .type = NLA_U32 },
[NL80211_ATTR_WIPHY_CHANNEL_TYPE] = { .type = NLA_U32 },
+ [NL80211_ATTR_WIPHY_EDMG_CHANNELS] = NLA_POLICY_RANGE(NLA_U8,
+ NL80211_EDMG_CHANNELS_MIN,
+ NL80211_EDMG_CHANNELS_MAX),
+ [NL80211_ATTR_WIPHY_EDMG_BW_CONFIG] = NLA_POLICY_RANGE(NLA_U8,
+ NL80211_EDMG_BW_CONFIG_MIN,
+ NL80211_EDMG_BW_CONFIG_MAX),
+
[NL80211_ATTR_CHANNEL_WIDTH] = { .type = NLA_U32 },
[NL80211_ATTR_CENTER_FREQ1] = { .type = NLA_U32 },
[NL80211_ATTR_CENTER_FREQ2] = { .type = NLA_U32 },
@@ -322,8 +370,9 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
[NL80211_ATTR_BEACON_INTERVAL] = { .type = NLA_U32 },
[NL80211_ATTR_DTIM_PERIOD] = { .type = NLA_U32 },
- [NL80211_ATTR_BEACON_HEAD] = { .type = NLA_BINARY,
- .len = IEEE80211_MAX_DATA_LEN },
+ [NL80211_ATTR_BEACON_HEAD] =
+ NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_beacon_head,
+ IEEE80211_MAX_DATA_LEN),
[NL80211_ATTR_BEACON_TAIL] =
NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_ie_attr,
IEEE80211_MAX_DATA_LEN),
@@ -344,7 +393,7 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
[NL80211_ATTR_MNTR_FLAGS] = { /* NLA_NESTED can't be empty */ },
[NL80211_ATTR_MESH_ID] = { .type = NLA_BINARY,
.len = IEEE80211_MAX_MESH_ID_LEN },
- [NL80211_ATTR_MPATH_NEXT_HOP] = { .type = NLA_U32 },
+ [NL80211_ATTR_MPATH_NEXT_HOP] = NLA_POLICY_ETH_ADDR_COMPAT,
[NL80211_ATTR_REG_ALPHA2] = { .type = NLA_STRING, .len = 2 },
[NL80211_ATTR_REG_RULES] = { .type = NLA_NESTED },
@@ -388,6 +437,7 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
[NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT] = { .type = NLA_FLAG },
[NL80211_ATTR_CONTROL_PORT_OVER_NL80211] = { .type = NLA_FLAG },
[NL80211_ATTR_PRIVACY] = { .type = NLA_FLAG },
+ [NL80211_ATTR_STATUS_CODE] = { .type = NLA_U16 },
[NL80211_ATTR_CIPHER_SUITE_GROUP] = { .type = NLA_U32 },
[NL80211_ATTR_WPA_VERSIONS] = { .type = NLA_U32 },
[NL80211_ATTR_PID] = { .type = NLA_U32 },
@@ -574,6 +624,8 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
[NL80211_ATTR_SAE_PASSWORD] = { .type = NLA_BINARY,
.len = SAE_PASSWORD_MAX_LEN },
[NL80211_ATTR_TWT_RESPONDER] = { .type = NLA_FLAG },
+ [NL80211_ATTR_HE_OBSS_PD] = NLA_POLICY_NESTED(he_obss_pd_policy),
+ [NL80211_ATTR_VLAN_ID] = NLA_POLICY_RANGE(NLA_U16, 1, VLAN_N_VID - 2),
};
/* policy for the key attributes */
@@ -667,6 +719,7 @@ static const struct nla_policy
nl80211_match_band_rssi_policy[NUM_NL80211_BANDS] = {
[NL80211_BAND_2GHZ] = { .type = NLA_S32 },
[NL80211_BAND_5GHZ] = { .type = NLA_S32 },
+ [NL80211_BAND_6GHZ] = { .type = NLA_S32 },
[NL80211_BAND_60GHZ] = { .type = NLA_S32 },
};
@@ -749,17 +802,25 @@ int nl80211_prepare_wdev_dump(struct netlink_callback *cb,
int err;
if (!cb->args[0]) {
+ struct nlattr **attrbuf;
+
+ attrbuf = kcalloc(NUM_NL80211_ATTR, sizeof(*attrbuf),
+ GFP_KERNEL);
+ if (!attrbuf)
+ return -ENOMEM;
+
err = nlmsg_parse_deprecated(cb->nlh,
GENL_HDRLEN + nl80211_fam.hdrsize,
- genl_family_attrbuf(&nl80211_fam),
- nl80211_fam.maxattr,
+ attrbuf, nl80211_fam.maxattr,
nl80211_policy, NULL);
- if (err)
+ if (err) {
+ kfree(attrbuf);
return err;
+ }
- *wdev = __cfg80211_wdev_from_attrs(
- sock_net(cb->skb->sk),
- genl_family_attrbuf(&nl80211_fam));
+ *wdev = __cfg80211_wdev_from_attrs(sock_net(cb->skb->sk),
+ attrbuf);
+ kfree(attrbuf);
if (IS_ERR(*wdev))
return PTR_ERR(*wdev);
*rdev = wiphy_to_rdev((*wdev)->wiphy);
@@ -1555,6 +1616,15 @@ static int nl80211_send_band_rateinfo(struct sk_buff *msg,
nla_nest_end(msg, nl_iftype_data);
}
+ /* add EDMG info */
+ if (sband->edmg_cap.channels &&
+ (nla_put_u8(msg, NL80211_BAND_ATTR_EDMG_CHANNELS,
+ sband->edmg_cap.channels) ||
+ nla_put_u8(msg, NL80211_BAND_ATTR_EDMG_BW_CONFIG,
+ sband->edmg_cap.bw_config)))
+
+ return -ENOBUFS;
+
/* add bitrates */
nl_rates = nla_nest_start_noflag(msg, NL80211_BAND_ATTR_RATES);
if (!nl_rates)
@@ -2065,6 +2135,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
CMD(add_tx_ts, ADD_TX_TS);
CMD(set_multicast_to_unicast, SET_MULTICAST_TO_UNICAST);
CMD(update_connect_params, UPDATE_CONNECT_PARAMS);
+ CMD(update_ft_ies, UPDATE_FT_IES);
}
#undef CMD
@@ -2172,6 +2243,30 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
rdev->wiphy.vht_capa_mod_mask))
goto nla_put_failure;
+ if (nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN,
+ rdev->wiphy.perm_addr))
+ goto nla_put_failure;
+
+ if (!is_zero_ether_addr(rdev->wiphy.addr_mask) &&
+ nla_put(msg, NL80211_ATTR_MAC_MASK, ETH_ALEN,
+ rdev->wiphy.addr_mask))
+ goto nla_put_failure;
+
+ if (rdev->wiphy.n_addresses > 1) {
+ void *attr;
+
+ attr = nla_nest_start(msg, NL80211_ATTR_MAC_ADDRS);
+ if (!attr)
+ goto nla_put_failure;
+
+ for (i = 0; i < rdev->wiphy.n_addresses; i++)
+ if (nla_put(msg, i + 1, ETH_ALEN,
+ rdev->wiphy.addresses[i].addr))
+ goto nla_put_failure;
+
+ nla_nest_end(msg, attr);
+ }
+
state->split_start++;
break;
case 10:
@@ -2366,14 +2461,21 @@ static int nl80211_dump_wiphy_parse(struct sk_buff *skb,
struct netlink_callback *cb,
struct nl80211_dump_wiphy_state *state)
{
- struct nlattr **tb = genl_family_attrbuf(&nl80211_fam);
- int ret = nlmsg_parse_deprecated(cb->nlh,
- GENL_HDRLEN + nl80211_fam.hdrsize,
- tb, nl80211_fam.maxattr,
- nl80211_policy, NULL);
+ struct nlattr **tb = kcalloc(NUM_NL80211_ATTR, sizeof(*tb), GFP_KERNEL);
+ int ret;
+
+ if (!tb)
+ return -ENOMEM;
+
+ ret = nlmsg_parse_deprecated(cb->nlh,
+ GENL_HDRLEN + nl80211_fam.hdrsize,
+ tb, nl80211_fam.maxattr,
+ nl80211_policy, NULL);
/* ignore parse errors for backward compatibility */
- if (ret)
- return 0;
+ if (ret) {
+ ret = 0;
+ goto out;
+ }
state->split = tb[NL80211_ATTR_SPLIT_WIPHY_DUMP];
if (tb[NL80211_ATTR_WIPHY])
@@ -2386,8 +2488,10 @@ static int nl80211_dump_wiphy_parse(struct sk_buff *skb,
int ifidx = nla_get_u32(tb[NL80211_ATTR_IFINDEX]);
netdev = __dev_get_by_index(sock_net(skb->sk), ifidx);
- if (!netdev)
- return -ENODEV;
+ if (!netdev) {
+ ret = -ENODEV;
+ goto out;
+ }
if (netdev->ieee80211_ptr) {
rdev = wiphy_to_rdev(
netdev->ieee80211_ptr->wiphy);
@@ -2395,7 +2499,10 @@ static int nl80211_dump_wiphy_parse(struct sk_buff *skb,
}
}
- return 0;
+ ret = 0;
+out:
+ kfree(tb);
+ return ret;
}
static int nl80211_dump_wiphy(struct sk_buff *skb, struct netlink_callback *cb)
@@ -2564,6 +2671,8 @@ int nl80211_parse_chandef(struct cfg80211_registered_device *rdev,
control_freq = nla_get_u32(attrs[NL80211_ATTR_WIPHY_FREQ]);
+ memset(chandef, 0, sizeof(*chandef));
+
chandef->chan = ieee80211_get_channel(&rdev->wiphy, control_freq);
chandef->width = NL80211_CHAN_WIDTH_20_NOHT;
chandef->center_freq1 = control_freq;
@@ -2622,6 +2731,18 @@ int nl80211_parse_chandef(struct cfg80211_registered_device *rdev,
nla_get_u32(attrs[NL80211_ATTR_CENTER_FREQ2]);
}
+ if (info->attrs[NL80211_ATTR_WIPHY_EDMG_CHANNELS]) {
+ chandef->edmg.channels =
+ nla_get_u8(info->attrs[NL80211_ATTR_WIPHY_EDMG_CHANNELS]);
+
+ if (info->attrs[NL80211_ATTR_WIPHY_EDMG_BW_CONFIG])
+ chandef->edmg.bw_config =
+ nla_get_u8(info->attrs[NL80211_ATTR_WIPHY_EDMG_BW_CONFIG]);
+ } else {
+ chandef->edmg.bw_config = 0;
+ chandef->edmg.channels = 0;
+ }
+
if (!cfg80211_chandef_valid(chandef)) {
NL_SET_ERR_MSG(extack, "invalid channel definition");
return -EINVAL;
@@ -3092,7 +3213,7 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag
if (rdev->ops->get_channel) {
int ret;
- struct cfg80211_chan_def chandef;
+ struct cfg80211_chan_def chandef = {};
ret = rdev_get_channel(rdev, wdev, &chandef);
if (ret == 0) {
@@ -3821,6 +3942,10 @@ static int nl80211_new_key(struct sk_buff *skb, struct genl_info *info)
key.type != NL80211_KEYTYPE_GROUP)
return -EINVAL;
+ if (key.type == NL80211_KEYTYPE_GROUP &&
+ info->attrs[NL80211_ATTR_VLAN_ID])
+ key.p.vlan_id = nla_get_u16(info->attrs[NL80211_ATTR_VLAN_ID]);
+
if (!rdev->ops->add_key)
return -EOPNOTSUPP;
@@ -4359,6 +4484,34 @@ static int nl80211_parse_beacon(struct cfg80211_registered_device *rdev,
return 0;
}
+static int nl80211_parse_he_obss_pd(struct nlattr *attrs,
+ struct ieee80211_he_obss_pd *he_obss_pd)
+{
+ struct nlattr *tb[NL80211_HE_OBSS_PD_ATTR_MAX + 1];
+ int err;
+
+ err = nla_parse_nested(tb, NL80211_HE_OBSS_PD_ATTR_MAX, attrs,
+ he_obss_pd_policy, NULL);
+ if (err)
+ return err;
+
+ if (!tb[NL80211_HE_OBSS_PD_ATTR_MIN_OFFSET] ||
+ !tb[NL80211_HE_OBSS_PD_ATTR_MAX_OFFSET])
+ return -EINVAL;
+
+ he_obss_pd->min_offset =
+ nla_get_u32(tb[NL80211_HE_OBSS_PD_ATTR_MIN_OFFSET]);
+ he_obss_pd->max_offset =
+ nla_get_u32(tb[NL80211_HE_OBSS_PD_ATTR_MAX_OFFSET]);
+
+ if (he_obss_pd->min_offset >= he_obss_pd->max_offset)
+ return -EINVAL;
+
+ he_obss_pd->enable = true;
+
+ return 0;
+}
+
static void nl80211_check_ap_rate_selectors(struct cfg80211_ap_settings *params,
const u8 *rates)
{
@@ -4643,6 +4796,14 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
params.twt_responder =
nla_get_flag(info->attrs[NL80211_ATTR_TWT_RESPONDER]);
+ if (info->attrs[NL80211_ATTR_HE_OBSS_PD]) {
+ err = nl80211_parse_he_obss_pd(
+ info->attrs[NL80211_ATTR_HE_OBSS_PD],
+ &params.he_obss_pd);
+ if (err)
+ return err;
+ }
+
nl80211_calculate_ap_params(&params);
if (info->attrs[NL80211_ATTR_EXTERNAL_AUTH_SUPPORT])
@@ -4941,6 +5102,7 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid,
PUT_SINFO(CONNECTED_TIME, connected_time, u32);
PUT_SINFO(INACTIVE_TIME, inactive_time, u32);
+ PUT_SINFO_U64(ASSOC_AT_BOOTTIME, assoc_at);
if (sinfo->filled & (BIT_ULL(NL80211_STA_INFO_RX_BYTES) |
BIT_ULL(NL80211_STA_INFO_RX_BYTES64)) &&
@@ -5555,6 +5717,9 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info)
if (info->attrs[NL80211_ATTR_STA_AID])
params.aid = nla_get_u16(info->attrs[NL80211_ATTR_STA_AID]);
+ if (info->attrs[NL80211_ATTR_VLAN_ID])
+ params.vlan_id = nla_get_u16(info->attrs[NL80211_ATTR_VLAN_ID]);
+
if (info->attrs[NL80211_ATTR_STA_LISTEN_INTERVAL])
params.listen_interval =
nla_get_u16(info->attrs[NL80211_ATTR_STA_LISTEN_INTERVAL]);
@@ -5700,6 +5865,9 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
params.listen_interval =
nla_get_u16(info->attrs[NL80211_ATTR_STA_LISTEN_INTERVAL]);
+ if (info->attrs[NL80211_ATTR_VLAN_ID])
+ params.vlan_id = nla_get_u16(info->attrs[NL80211_ATTR_VLAN_ID]);
+
if (info->attrs[NL80211_ATTR_STA_SUPPORT_P2P_PS]) {
params.support_p2p_ps =
nla_get_u8(info->attrs[NL80211_ATTR_STA_SUPPORT_P2P_PS]);
@@ -6149,6 +6317,9 @@ static int nl80211_del_mpath(struct sk_buff *skb, struct genl_info *info)
if (!rdev->ops->del_mpath)
return -EOPNOTSUPP;
+ if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT)
+ return -EOPNOTSUPP;
+
return rdev_del_mpath(rdev, dev, dst);
}
@@ -8106,10 +8277,8 @@ static int nl80211_start_sched_scan(struct sk_buff *skb,
/* leave request id zero for legacy request
* or if driver does not support multi-scheduled scan
*/
- if (want_multi && rdev->wiphy.max_sched_scan_reqs > 1) {
- while (!sched_scan_req->reqid)
- sched_scan_req->reqid = cfg80211_assign_cookie(rdev);
- }
+ if (want_multi && rdev->wiphy.max_sched_scan_reqs > 1)
+ sched_scan_req->reqid = cfg80211_assign_cookie(rdev);
err = rdev_sched_scan_start(rdev, dev, sched_scan_req);
if (err)
@@ -8685,6 +8854,10 @@ static int nl80211_send_survey(struct sk_buff *msg, u32 portid, u32 seq,
nla_put_u64_64bit(msg, NL80211_SURVEY_INFO_TIME_SCAN,
survey->time_scan, NL80211_SURVEY_INFO_PAD))
goto nla_put_failure;
+ if ((survey->filled & SURVEY_INFO_TIME_BSS_RX) &&
+ nla_put_u64_64bit(msg, NL80211_SURVEY_INFO_TIME_BSS_RX,
+ survey->time_bss_rx, NL80211_SURVEY_INFO_PAD))
+ goto nla_put_failure;
nla_nest_end(msg, infoattr);
@@ -8698,7 +8871,7 @@ static int nl80211_send_survey(struct sk_buff *msg, u32 portid, u32 seq,
static int nl80211_dump_survey(struct sk_buff *skb, struct netlink_callback *cb)
{
- struct nlattr **attrbuf = genl_family_attrbuf(&nl80211_fam);
+ struct nlattr **attrbuf;
struct survey_info survey;
struct cfg80211_registered_device *rdev;
struct wireless_dev *wdev;
@@ -8706,6 +8879,10 @@ static int nl80211_dump_survey(struct sk_buff *skb, struct netlink_callback *cb)
int res;
bool radio_stats;
+ attrbuf = kcalloc(NUM_NL80211_ATTR, sizeof(*attrbuf), GFP_KERNEL);
+ if (!attrbuf)
+ return -ENOMEM;
+
rtnl_lock();
res = nl80211_prepare_wdev_dump(cb, &rdev, &wdev);
if (res)
@@ -8750,6 +8927,7 @@ static int nl80211_dump_survey(struct sk_buff *skb, struct netlink_callback *cb)
cb->args[2] = survey_idx;
res = skb->len;
out_err:
+ kfree(attrbuf);
rtnl_unlock();
return res;
}
@@ -9609,6 +9787,7 @@ static int nl80211_testmode_dump(struct sk_buff *skb,
struct netlink_callback *cb)
{
struct cfg80211_registered_device *rdev;
+ struct nlattr **attrbuf = NULL;
int err;
long phy_idx;
void *data = NULL;
@@ -9629,7 +9808,12 @@ static int nl80211_testmode_dump(struct sk_buff *skb,
goto out_err;
}
} else {
- struct nlattr **attrbuf = genl_family_attrbuf(&nl80211_fam);
+ attrbuf = kcalloc(NUM_NL80211_ATTR, sizeof(*attrbuf),
+ GFP_KERNEL);
+ if (!attrbuf) {
+ err = -ENOMEM;
+ goto out_err;
+ }
err = nlmsg_parse_deprecated(cb->nlh,
GENL_HDRLEN + nl80211_fam.hdrsize,
@@ -9696,6 +9880,7 @@ static int nl80211_testmode_dump(struct sk_buff *skb,
/* see above */
cb->args[0] = phy_idx + 1;
out_err:
+ kfree(attrbuf);
rtnl_unlock();
return err;
}
@@ -9790,6 +9975,15 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info)
return -EINVAL;
}
+ if (info->attrs[NL80211_ATTR_WIPHY_EDMG_CHANNELS]) {
+ connect.edmg.channels =
+ nla_get_u8(info->attrs[NL80211_ATTR_WIPHY_EDMG_CHANNELS]);
+
+ if (info->attrs[NL80211_ATTR_WIPHY_EDMG_BW_CONFIG])
+ connect.edmg.bw_config =
+ nla_get_u8(info->attrs[NL80211_ATTR_WIPHY_EDMG_BW_CONFIG]);
+ }
+
if (connect.privacy && info->attrs[NL80211_ATTR_KEYS]) {
connkeys = nl80211_parse_connkeys(rdev, info, NULL);
if (IS_ERR(connkeys))
@@ -10650,6 +10844,7 @@ static int cfg80211_cqm_rssi_update(struct cfg80211_registered_device *rdev,
if (err)
return err;
+ cfg80211_sinfo_release_content(&sinfo);
if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_BEACON_SIGNAL_AVG))
wdev->cqm_config->last_rssi_event_value =
(s8) sinfo.rx_beacon_signal_avg;
@@ -10659,9 +10854,11 @@ static int cfg80211_cqm_rssi_update(struct cfg80211_registered_device *rdev,
hyst = wdev->cqm_config->rssi_hyst;
n = wdev->cqm_config->n_rssi_thresholds;
- for (i = 0; i < n; i++)
+ for (i = 0; i < n; i++) {
+ i = array_index_nospec(i, n);
if (last < wdev->cqm_config->rssi_thresholds[i])
break;
+ }
low_index = i - 1;
if (low_index >= 0) {
@@ -12705,8 +12902,7 @@ static int nl80211_vendor_check_policy(const struct wiphy_vendor_command *vcmd,
return -EINVAL;
}
- return nl80211_validate_nested(attr, vcmd->maxattr, vcmd->policy,
- extack);
+ return nla_validate_nested(attr, vcmd->maxattr, vcmd->policy, extack);
}
static int nl80211_vendor_cmd(struct sk_buff *skb, struct genl_info *info)
@@ -12789,7 +12985,7 @@ static int nl80211_prepare_vendor_dump(struct sk_buff *skb,
struct cfg80211_registered_device **rdev,
struct wireless_dev **wdev)
{
- struct nlattr **attrbuf = genl_family_attrbuf(&nl80211_fam);
+ struct nlattr **attrbuf;
u32 vid, subcmd;
unsigned int i;
int vcmd_idx = -1;
@@ -12820,24 +13016,32 @@ static int nl80211_prepare_vendor_dump(struct sk_buff *skb,
return 0;
}
+ attrbuf = kcalloc(NUM_NL80211_ATTR, sizeof(*attrbuf), GFP_KERNEL);
+ if (!attrbuf)
+ return -ENOMEM;
+
err = nlmsg_parse_deprecated(cb->nlh,
GENL_HDRLEN + nl80211_fam.hdrsize,
attrbuf, nl80211_fam.maxattr,
nl80211_policy, NULL);
if (err)
- return err;
+ goto out;
if (!attrbuf[NL80211_ATTR_VENDOR_ID] ||
- !attrbuf[NL80211_ATTR_VENDOR_SUBCMD])
- return -EINVAL;
+ !attrbuf[NL80211_ATTR_VENDOR_SUBCMD]) {
+ err = -EINVAL;
+ goto out;
+ }
*wdev = __cfg80211_wdev_from_attrs(sock_net(skb->sk), attrbuf);
if (IS_ERR(*wdev))
*wdev = NULL;
*rdev = __cfg80211_rdev_from_attrs(sock_net(skb->sk), attrbuf);
- if (IS_ERR(*rdev))
- return PTR_ERR(*rdev);
+ if (IS_ERR(*rdev)) {
+ err = PTR_ERR(*rdev);
+ goto out;
+ }
vid = nla_get_u32(attrbuf[NL80211_ATTR_VENDOR_ID]);
subcmd = nla_get_u32(attrbuf[NL80211_ATTR_VENDOR_SUBCMD]);
@@ -12850,15 +13054,19 @@ static int nl80211_prepare_vendor_dump(struct sk_buff *skb,
if (vcmd->info.vendor_id != vid || vcmd->info.subcmd != subcmd)
continue;
- if (!vcmd->dumpit)
- return -EOPNOTSUPP;
+ if (!vcmd->dumpit) {
+ err = -EOPNOTSUPP;
+ goto out;
+ }
vcmd_idx = i;
break;
}
- if (vcmd_idx < 0)
- return -EOPNOTSUPP;
+ if (vcmd_idx < 0) {
+ err = -EOPNOTSUPP;
+ goto out;
+ }
if (attrbuf[NL80211_ATTR_VENDOR_DATA]) {
data = nla_data(attrbuf[NL80211_ATTR_VENDOR_DATA]);
@@ -12869,7 +13077,7 @@ static int nl80211_prepare_vendor_dump(struct sk_buff *skb,
attrbuf[NL80211_ATTR_VENDOR_DATA],
cb->extack);
if (err)
- return err;
+ goto out;
}
/* 0 is the first index - add 1 to parse only once */
@@ -12881,7 +13089,10 @@ static int nl80211_prepare_vendor_dump(struct sk_buff *skb,
cb->args[4] = data_len;
/* keep rtnl locked in successful case */
- return 0;
+ err = 0;
+out:
+ kfree(attrbuf);
+ return err;
}
static int nl80211_vendor_cmd_dump(struct sk_buff *skb,
@@ -13481,7 +13692,7 @@ static int nl80211_get_ftm_responder_stats(struct sk_buff *skb,
hdr = nl80211hdr_put(msg, info->snd_portid, info->snd_seq, 0,
NL80211_CMD_GET_FTM_RESPONDER_STATS);
if (!hdr)
- return -ENOBUFS;
+ goto nla_put_failure;
if (nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex))
goto nla_put_failure;
@@ -13586,6 +13797,8 @@ static int nl80211_probe_mesh_link(struct sk_buff *skb, struct genl_info *info)
if (err)
return err;
+ cfg80211_sinfo_release_content(&sinfo);
+
return rdev_probe_mesh_link(rdev, dev, dest, buf, len);
}
@@ -14559,6 +14772,7 @@ static struct genl_family nl80211_fam __ro_after_init = {
.n_ops = ARRAY_SIZE(nl80211_ops),
.mcgrps = nl80211_mcgrps,
.n_mcgrps = ARRAY_SIZE(nl80211_mcgrps),
+ .parallel_ops = true,
};
/* notification functions */
@@ -14835,12 +15049,10 @@ void nl80211_common_reg_change_event(enum nl80211_commands cmd_id,
return;
hdr = nl80211hdr_put(msg, 0, 0, 0, cmd_id);
- if (!hdr) {
- nlmsg_free(msg);
- return;
- }
+ if (!hdr)
+ goto nla_put_failure;
- if (nl80211_reg_change_event_fill(msg, request) == false)
+ if (!nl80211_reg_change_event_fill(msg, request))
goto nla_put_failure;
genlmsg_end(msg, hdr);
@@ -16090,7 +16302,9 @@ void cfg80211_ch_switch_notify(struct net_device *dev,
if (wdev->iftype == NL80211_IFTYPE_STATION &&
!WARN_ON(!wdev->current_bss))
- wdev->current_bss->pub.channel = chandef->chan;
+ cfg80211_update_assoc_bss_entry(wdev, chandef->chan);
+
+ cfg80211_sched_dfs_chan_update(rdev);
nl80211_ch_switch_notify(rdev, dev, chandef, GFP_KERNEL,
NL80211_CMD_CH_SWITCH_NOTIFY, 0);
diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h
index e853a4fe6f97..e0d34f796d0b 100644
--- a/net/wireless/rdev-ops.h
+++ b/net/wireless/rdev-ops.h
@@ -538,6 +538,10 @@ static inline int
rdev_set_wiphy_params(struct cfg80211_registered_device *rdev, u32 changed)
{
int ret;
+
+ if (!rdev->ops->set_wiphy_params)
+ return -EOPNOTSUPP;
+
trace_rdev_set_wiphy_params(&rdev->wiphy, changed);
ret = rdev->ops->set_wiphy_params(&rdev->wiphy, changed);
trace_rdev_return_int(&rdev->wiphy, ret);
@@ -1167,6 +1171,16 @@ rdev_start_radar_detection(struct cfg80211_registered_device *rdev,
return ret;
}
+static inline void
+rdev_end_cac(struct cfg80211_registered_device *rdev,
+ struct net_device *dev)
+{
+ trace_rdev_end_cac(&rdev->wiphy, dev);
+ if (rdev->ops->end_cac)
+ rdev->ops->end_cac(&rdev->wiphy, dev);
+ trace_rdev_return_void(&rdev->wiphy);
+}
+
static inline int
rdev_set_mcast_rate(struct cfg80211_registered_device *rdev,
struct net_device *dev,
diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index 4831ad745f91..fff9a74891fc 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -2108,7 +2108,7 @@ static void reg_call_notifier(struct wiphy *wiphy,
static bool reg_wdev_chan_valid(struct wiphy *wiphy, struct wireless_dev *wdev)
{
- struct cfg80211_chan_def chandef;
+ struct cfg80211_chan_def chandef = {};
struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
enum nl80211_iftype iftype;
@@ -2261,14 +2261,15 @@ static void update_all_wiphy_regulatory(enum nl80211_reg_initiator initiator)
static void handle_channel_custom(struct wiphy *wiphy,
struct ieee80211_channel *chan,
- const struct ieee80211_regdomain *regd)
+ const struct ieee80211_regdomain *regd,
+ u32 min_bw)
{
u32 bw_flags = 0;
const struct ieee80211_reg_rule *reg_rule = NULL;
const struct ieee80211_power_rule *power_rule = NULL;
u32 bw;
- for (bw = MHZ_TO_KHZ(20); bw >= MHZ_TO_KHZ(5); bw = bw / 2) {
+ for (bw = MHZ_TO_KHZ(20); bw >= min_bw; bw = bw / 2) {
reg_rule = freq_reg_info_regd(MHZ_TO_KHZ(chan->center_freq),
regd, bw);
if (!IS_ERR(reg_rule))
@@ -2324,8 +2325,14 @@ static void handle_band_custom(struct wiphy *wiphy,
if (!sband)
return;
+ /*
+ * We currently assume that you always want at least 20 MHz,
+ * otherwise channel 12 might get enabled if this rule is
+ * compatible to US, which permits 2402 - 2472 MHz.
+ */
for (i = 0; i < sband->n_channels; i++)
- handle_channel_custom(wiphy, &sband->channels[i], regd);
+ handle_channel_custom(wiphy, &sband->channels[i], regd,
+ MHZ_TO_KHZ(20));
}
/* Used by drivers prior to wiphy registration */
@@ -2788,7 +2795,7 @@ static void reg_process_pending_hints(void)
/* When last_request->processed becomes true this will be rescheduled */
if (lr && !lr->processed) {
- reg_process_hint(lr);
+ pr_debug("Pending regulatory request, waiting for it to be processed...\n");
return;
}
@@ -3806,8 +3813,9 @@ void wiphy_regulatory_deregister(struct wiphy *wiphy)
}
/*
- * See http://www.fcc.gov/document/5-ghz-unlicensed-spectrum-unii, for
- * UNII band definitions
+ * See FCC notices for UNII band definitions
+ * 5GHz: https://www.fcc.gov/document/5-ghz-unlicensed-spectrum-unii
+ * 6GHz: https://www.fcc.gov/document/fcc-proposes-more-spectrum-unlicensed-use-0
*/
int cfg80211_get_unii(int freq)
{
@@ -3831,6 +3839,22 @@ int cfg80211_get_unii(int freq)
if (freq > 5725 && freq <= 5825)
return 4;
+ /* UNII-5 */
+ if (freq > 5925 && freq <= 6425)
+ return 5;
+
+ /* UNII-6 */
+ if (freq > 6425 && freq <= 6525)
+ return 6;
+
+ /* UNII-7 */
+ if (freq > 6525 && freq <= 6875)
+ return 7;
+
+ /* UNII-8 */
+ if (freq > 6875 && freq <= 7125)
+ return 8;
+
return -EINVAL;
}
@@ -3866,6 +3890,26 @@ bool regulatory_pre_cac_allowed(struct wiphy *wiphy)
return pre_cac_allowed;
}
+EXPORT_SYMBOL(regulatory_pre_cac_allowed);
+
+static void cfg80211_check_and_end_cac(struct cfg80211_registered_device *rdev)
+{
+ struct wireless_dev *wdev;
+ /* If we finished CAC or received radar, we should end any
+ * CAC running on the same channels.
+ * the check !cfg80211_chandef_dfs_usable contain 2 options:
+ * either all channels are available - those the CAC_FINISHED
+ * event has effected another wdev state, or there is a channel
+ * in unavailable state in wdev chandef - those the RADAR_DETECTED
+ * event has effected another wdev state.
+ * In both cases we should end the CAC on the wdev.
+ */
+ list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) {
+ if (wdev->cac_started &&
+ !cfg80211_chandef_dfs_usable(&rdev->wiphy, &wdev->chandef))
+ rdev_end_cac(rdev, wdev->netdev);
+ }
+}
void regulatory_propagate_dfs_state(struct wiphy *wiphy,
struct cfg80211_chan_def *chandef,
@@ -3893,8 +3937,10 @@ void regulatory_propagate_dfs_state(struct wiphy *wiphy,
cfg80211_set_dfs_state(&rdev->wiphy, chandef, dfs_state);
if (event == NL80211_RADAR_DETECTED ||
- event == NL80211_RADAR_CAC_FINISHED)
+ event == NL80211_RADAR_CAC_FINISHED) {
cfg80211_sched_dfs_chan_update(rdev);
+ cfg80211_check_and_end_cac(rdev);
+ }
nl80211_radar_notify(rdev, chandef, event, NULL, GFP_KERNEL);
}
diff --git a/net/wireless/reg.h b/net/wireless/reg.h
index 504133d76de4..f9e83031a40a 100644
--- a/net/wireless/reg.h
+++ b/net/wireless/reg.h
@@ -114,7 +114,7 @@ void regulatory_hint_country_ie(struct wiphy *wiphy,
u8 country_ie_len);
/**
- * regulatory_hint_disconnect - informs all devices have been disconneted
+ * regulatory_hint_disconnect - informs all devices have been disconnected
*
* Regulotory rules can be enhanced further upon scanning and upon
* connection to an AP. These rules become stale if we disconnect
@@ -156,14 +156,6 @@ bool regulatory_indoor_allowed(void);
#define REG_PRE_CAC_EXPIRY_GRACE_MS 2000
/**
- * regulatory_pre_cac_allowed - if pre-CAC allowed in the current dfs domain
- * @wiphy: wiphy for which pre-CAC capability is checked.
-
- * Pre-CAC is allowed only in ETSI domain.
- */
-bool regulatory_pre_cac_allowed(struct wiphy *wiphy);
-
-/**
* regulatory_propagate_dfs_state - Propagate DFS channel state to other wiphys
* @wiphy - wiphy on which radar is detected and the event will be propagated
* to other available wiphys having the same DFS domain
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index d66e6d4b7555..aef240fdf8df 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -1091,6 +1091,93 @@ struct cfg80211_non_tx_bss {
u8 bssid_index;
};
+static bool
+cfg80211_update_known_bss(struct cfg80211_registered_device *rdev,
+ struct cfg80211_internal_bss *known,
+ struct cfg80211_internal_bss *new,
+ bool signal_valid)
+{
+ lockdep_assert_held(&rdev->bss_lock);
+
+ /* Update IEs */
+ if (rcu_access_pointer(new->pub.proberesp_ies)) {
+ const struct cfg80211_bss_ies *old;
+
+ old = rcu_access_pointer(known->pub.proberesp_ies);
+
+ rcu_assign_pointer(known->pub.proberesp_ies,
+ new->pub.proberesp_ies);
+ /* Override possible earlier Beacon frame IEs */
+ rcu_assign_pointer(known->pub.ies,
+ new->pub.proberesp_ies);
+ if (old)
+ kfree_rcu((struct cfg80211_bss_ies *)old, rcu_head);
+ } else if (rcu_access_pointer(new->pub.beacon_ies)) {
+ const struct cfg80211_bss_ies *old;
+ struct cfg80211_internal_bss *bss;
+
+ if (known->pub.hidden_beacon_bss &&
+ !list_empty(&known->hidden_list)) {
+ const struct cfg80211_bss_ies *f;
+
+ /* The known BSS struct is one of the probe
+ * response members of a group, but we're
+ * receiving a beacon (beacon_ies in the new
+ * bss is used). This can only mean that the
+ * AP changed its beacon from not having an
+ * SSID to showing it, which is confusing so
+ * drop this information.
+ */
+
+ f = rcu_access_pointer(new->pub.beacon_ies);
+ kfree_rcu((struct cfg80211_bss_ies *)f, rcu_head);
+ return false;
+ }
+
+ old = rcu_access_pointer(known->pub.beacon_ies);
+
+ rcu_assign_pointer(known->pub.beacon_ies, new->pub.beacon_ies);
+
+ /* Override IEs if they were from a beacon before */
+ if (old == rcu_access_pointer(known->pub.ies))
+ rcu_assign_pointer(known->pub.ies, new->pub.beacon_ies);
+
+ /* Assign beacon IEs to all sub entries */
+ list_for_each_entry(bss, &known->hidden_list, hidden_list) {
+ const struct cfg80211_bss_ies *ies;
+
+ ies = rcu_access_pointer(bss->pub.beacon_ies);
+ WARN_ON(ies != old);
+
+ rcu_assign_pointer(bss->pub.beacon_ies,
+ new->pub.beacon_ies);
+ }
+
+ if (old)
+ kfree_rcu((struct cfg80211_bss_ies *)old, rcu_head);
+ }
+
+ known->pub.beacon_interval = new->pub.beacon_interval;
+
+ /* don't update the signal if beacon was heard on
+ * adjacent channel.
+ */
+ if (signal_valid)
+ known->pub.signal = new->pub.signal;
+ known->pub.capability = new->pub.capability;
+ known->ts = new->ts;
+ known->ts_boottime = new->ts_boottime;
+ known->parent_tsf = new->parent_tsf;
+ known->pub.chains = new->pub.chains;
+ memcpy(known->pub.chain_signal, new->pub.chain_signal,
+ IEEE80211_MAX_CHAINS);
+ ether_addr_copy(known->parent_bssid, new->parent_bssid);
+ known->pub.max_bssid_indicator = new->pub.max_bssid_indicator;
+ known->pub.bssid_index = new->pub.bssid_index;
+
+ return true;
+}
+
/* Returned bss is reference counted and must be cleaned up appropriately. */
struct cfg80211_internal_bss *
cfg80211_bss_update(struct cfg80211_registered_device *rdev,
@@ -1114,88 +1201,8 @@ cfg80211_bss_update(struct cfg80211_registered_device *rdev,
found = rb_find_bss(rdev, tmp, BSS_CMP_REGULAR);
if (found) {
- /* Update IEs */
- if (rcu_access_pointer(tmp->pub.proberesp_ies)) {
- const struct cfg80211_bss_ies *old;
-
- old = rcu_access_pointer(found->pub.proberesp_ies);
-
- rcu_assign_pointer(found->pub.proberesp_ies,
- tmp->pub.proberesp_ies);
- /* Override possible earlier Beacon frame IEs */
- rcu_assign_pointer(found->pub.ies,
- tmp->pub.proberesp_ies);
- if (old)
- kfree_rcu((struct cfg80211_bss_ies *)old,
- rcu_head);
- } else if (rcu_access_pointer(tmp->pub.beacon_ies)) {
- const struct cfg80211_bss_ies *old;
- struct cfg80211_internal_bss *bss;
-
- if (found->pub.hidden_beacon_bss &&
- !list_empty(&found->hidden_list)) {
- const struct cfg80211_bss_ies *f;
-
- /*
- * The found BSS struct is one of the probe
- * response members of a group, but we're
- * receiving a beacon (beacon_ies in the tmp
- * bss is used). This can only mean that the
- * AP changed its beacon from not having an
- * SSID to showing it, which is confusing so
- * drop this information.
- */
-
- f = rcu_access_pointer(tmp->pub.beacon_ies);
- kfree_rcu((struct cfg80211_bss_ies *)f,
- rcu_head);
- goto drop;
- }
-
- old = rcu_access_pointer(found->pub.beacon_ies);
-
- rcu_assign_pointer(found->pub.beacon_ies,
- tmp->pub.beacon_ies);
-
- /* Override IEs if they were from a beacon before */
- if (old == rcu_access_pointer(found->pub.ies))
- rcu_assign_pointer(found->pub.ies,
- tmp->pub.beacon_ies);
-
- /* Assign beacon IEs to all sub entries */
- list_for_each_entry(bss, &found->hidden_list,
- hidden_list) {
- const struct cfg80211_bss_ies *ies;
-
- ies = rcu_access_pointer(bss->pub.beacon_ies);
- WARN_ON(ies != old);
-
- rcu_assign_pointer(bss->pub.beacon_ies,
- tmp->pub.beacon_ies);
- }
-
- if (old)
- kfree_rcu((struct cfg80211_bss_ies *)old,
- rcu_head);
- }
-
- found->pub.beacon_interval = tmp->pub.beacon_interval;
- /*
- * don't update the signal if beacon was heard on
- * adjacent channel.
- */
- if (signal_valid)
- found->pub.signal = tmp->pub.signal;
- found->pub.capability = tmp->pub.capability;
- found->ts = tmp->ts;
- found->ts_boottime = tmp->ts_boottime;
- found->parent_tsf = tmp->parent_tsf;
- found->pub.chains = tmp->pub.chains;
- memcpy(found->pub.chain_signal, tmp->pub.chain_signal,
- IEEE80211_MAX_CHAINS);
- ether_addr_copy(found->parent_bssid, tmp->parent_bssid);
- found->pub.max_bssid_indicator = tmp->pub.max_bssid_indicator;
- found->pub.bssid_index = tmp->pub.bssid_index;
+ if (!cfg80211_update_known_bss(rdev, found, tmp, signal_valid))
+ goto drop;
} else {
struct cfg80211_internal_bss *new;
struct cfg80211_internal_bss *hidden;
@@ -1368,6 +1375,7 @@ cfg80211_inform_single_bss_data(struct wiphy *wiphy,
struct cfg80211_internal_bss tmp = {}, *res;
int bss_type;
bool signal_valid;
+ unsigned long ts;
if (WARN_ON(!wiphy))
return NULL;
@@ -1390,8 +1398,11 @@ cfg80211_inform_single_bss_data(struct wiphy *wiphy,
tmp.ts_boottime = data->boottime_ns;
if (non_tx_data) {
tmp.pub.transmitted_bss = non_tx_data->tx_bss;
+ ts = bss_from_pub(non_tx_data->tx_bss)->ts;
tmp.pub.bssid_index = non_tx_data->bssid_index;
tmp.pub.max_bssid_indicator = non_tx_data->max_bssid_indicator;
+ } else {
+ ts = jiffies;
}
/*
@@ -1425,8 +1436,7 @@ cfg80211_inform_single_bss_data(struct wiphy *wiphy,
signal_valid = abs(data->chan->center_freq - channel->center_freq) <=
wiphy->max_adj_channel_rssi_comp;
- res = cfg80211_bss_update(wiphy_to_rdev(wiphy), &tmp, signal_valid,
- jiffies);
+ res = cfg80211_bss_update(wiphy_to_rdev(wiphy), &tmp, signal_valid, ts);
if (!res)
return NULL;
@@ -1440,7 +1450,7 @@ cfg80211_inform_single_bss_data(struct wiphy *wiphy,
regulatory_hint_found_beacon(wiphy, channel, gfp);
}
- if (non_tx_data && non_tx_data->tx_bss) {
+ if (non_tx_data) {
/* this is a nontransmitting bss, we need to add it to
* transmitting bss' list if it is not there
*/
@@ -1659,6 +1669,8 @@ cfg80211_inform_bss_data(struct wiphy *wiphy,
res = cfg80211_inform_single_bss_data(wiphy, data, ftype, bssid, tsf,
capability, beacon_interval, ie,
ielen, NULL, gfp);
+ if (!res)
+ return NULL;
non_tx_data.tx_bss = res;
cfg80211_parse_mbssid_data(wiphy, data, ftype, bssid, tsf,
beacon_interval, ie, ielen, &non_tx_data,
@@ -1691,8 +1703,7 @@ cfg80211_parse_mbssid_frame_data(struct wiphy *wiphy,
static void
cfg80211_update_notlisted_nontrans(struct wiphy *wiphy,
struct cfg80211_bss *nontrans_bss,
- struct ieee80211_mgmt *mgmt, size_t len,
- gfp_t gfp)
+ struct ieee80211_mgmt *mgmt, size_t len)
{
u8 *ie, *new_ie, *pos;
const u8 *nontrans_ssid, *trans_ssid, *mbssid;
@@ -1703,6 +1714,8 @@ cfg80211_update_notlisted_nontrans(struct wiphy *wiphy,
const struct cfg80211_bss_ies *old;
u8 cpy_len;
+ lockdep_assert_held(&wiphy_to_rdev(wiphy)->bss_lock);
+
ie = mgmt->u.probe_resp.variable;
new_ie_len = ielen;
@@ -1711,26 +1724,30 @@ cfg80211_update_notlisted_nontrans(struct wiphy *wiphy,
return;
new_ie_len -= trans_ssid[1];
mbssid = cfg80211_find_ie(WLAN_EID_MULTIPLE_BSSID, ie, ielen);
- if (!mbssid)
+ /*
+ * It's not valid to have the MBSSID element before SSID
+ * ignore if that happens - the code below assumes it is
+ * after (while copying things inbetween).
+ */
+ if (!mbssid || mbssid < trans_ssid)
return;
new_ie_len -= mbssid[1];
- rcu_read_lock();
+
nontrans_ssid = ieee80211_bss_get_ie(nontrans_bss, WLAN_EID_SSID);
- if (!nontrans_ssid) {
- rcu_read_unlock();
+ if (!nontrans_ssid)
return;
- }
+
new_ie_len += nontrans_ssid[1];
- rcu_read_unlock();
/* generate new ie for nontrans BSS
* 1. replace SSID with nontrans BSS' SSID
* 2. skip MBSSID IE
*/
- new_ie = kzalloc(new_ie_len, gfp);
+ new_ie = kzalloc(new_ie_len, GFP_ATOMIC);
if (!new_ie)
return;
- new_ies = kzalloc(sizeof(*new_ies) + new_ie_len, gfp);
+
+ new_ies = kzalloc(sizeof(*new_ies) + new_ie_len, GFP_ATOMIC);
if (!new_ies)
goto out_free;
@@ -1776,7 +1793,6 @@ static struct cfg80211_bss *
cfg80211_inform_single_bss_frame_data(struct wiphy *wiphy,
struct cfg80211_inform_bss *data,
struct ieee80211_mgmt *mgmt, size_t len,
- struct cfg80211_non_tx_bss *non_tx_data,
gfp_t gfp)
{
struct cfg80211_internal_bss tmp = {}, *res;
@@ -1835,11 +1851,6 @@ cfg80211_inform_single_bss_frame_data(struct wiphy *wiphy,
tmp.pub.chains = data->chains;
memcpy(tmp.pub.chain_signal, data->chain_signal, IEEE80211_MAX_CHAINS);
ether_addr_copy(tmp.parent_bssid, data->parent_bssid);
- if (non_tx_data) {
- tmp.pub.transmitted_bss = non_tx_data->tx_bss;
- tmp.pub.bssid_index = non_tx_data->bssid_index;
- tmp.pub.max_bssid_indicator = non_tx_data->max_bssid_indicator;
- }
signal_valid = abs(data->chan->center_freq - channel->center_freq) <=
wiphy->max_adj_channel_rssi_comp;
@@ -1877,7 +1888,7 @@ cfg80211_inform_bss_frame_data(struct wiphy *wiphy,
struct cfg80211_non_tx_bss non_tx_data;
res = cfg80211_inform_single_bss_frame_data(wiphy, data, mgmt,
- len, NULL, gfp);
+ len, gfp);
if (!res || !wiphy->support_mbssid ||
!cfg80211_find_ie(WLAN_EID_MULTIPLE_BSSID, ie, ielen))
return res;
@@ -1890,6 +1901,8 @@ cfg80211_inform_bss_frame_data(struct wiphy *wiphy,
cfg80211_parse_mbssid_frame_data(wiphy, data, mgmt, len,
&non_tx_data, gfp);
+ spin_lock_bh(&wiphy_to_rdev(wiphy)->bss_lock);
+
/* check if the res has other nontransmitting bss which is not
* in MBSSID IE
*/
@@ -1904,8 +1917,9 @@ cfg80211_inform_bss_frame_data(struct wiphy *wiphy,
ies2 = rcu_access_pointer(tmp_bss->ies);
if (ies2->tsf < ies1->tsf)
cfg80211_update_notlisted_nontrans(wiphy, tmp_bss,
- mgmt, len, gfp);
+ mgmt, len);
}
+ spin_unlock_bh(&wiphy_to_rdev(wiphy)->bss_lock);
return res;
}
@@ -1995,6 +2009,85 @@ void cfg80211_bss_iter(struct wiphy *wiphy,
}
EXPORT_SYMBOL(cfg80211_bss_iter);
+void cfg80211_update_assoc_bss_entry(struct wireless_dev *wdev,
+ struct ieee80211_channel *chan)
+{
+ struct wiphy *wiphy = wdev->wiphy;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+ struct cfg80211_internal_bss *cbss = wdev->current_bss;
+ struct cfg80211_internal_bss *new = NULL;
+ struct cfg80211_internal_bss *bss;
+ struct cfg80211_bss *nontrans_bss;
+ struct cfg80211_bss *tmp;
+
+ spin_lock_bh(&rdev->bss_lock);
+
+ if (WARN_ON(cbss->pub.channel == chan))
+ goto done;
+
+ /* use transmitting bss */
+ if (cbss->pub.transmitted_bss)
+ cbss = container_of(cbss->pub.transmitted_bss,
+ struct cfg80211_internal_bss,
+ pub);
+
+ cbss->pub.channel = chan;
+
+ list_for_each_entry(bss, &rdev->bss_list, list) {
+ if (!cfg80211_bss_type_match(bss->pub.capability,
+ bss->pub.channel->band,
+ wdev->conn_bss_type))
+ continue;
+
+ if (bss == cbss)
+ continue;
+
+ if (!cmp_bss(&bss->pub, &cbss->pub, BSS_CMP_REGULAR)) {
+ new = bss;
+ break;
+ }
+ }
+
+ if (new) {
+ /* to save time, update IEs for transmitting bss only */
+ if (cfg80211_update_known_bss(rdev, cbss, new, false)) {
+ new->pub.proberesp_ies = NULL;
+ new->pub.beacon_ies = NULL;
+ }
+
+ list_for_each_entry_safe(nontrans_bss, tmp,
+ &new->pub.nontrans_list,
+ nontrans_list) {
+ bss = container_of(nontrans_bss,
+ struct cfg80211_internal_bss, pub);
+ if (__cfg80211_unlink_bss(rdev, bss))
+ rdev->bss_generation++;
+ }
+
+ WARN_ON(atomic_read(&new->hold));
+ if (!WARN_ON(!__cfg80211_unlink_bss(rdev, new)))
+ rdev->bss_generation++;
+ }
+
+ rb_erase(&cbss->rbn, &rdev->bss_tree);
+ rb_insert_bss(rdev, cbss);
+ rdev->bss_generation++;
+
+ list_for_each_entry_safe(nontrans_bss, tmp,
+ &cbss->pub.nontrans_list,
+ nontrans_list) {
+ bss = container_of(nontrans_bss,
+ struct cfg80211_internal_bss, pub);
+ bss->pub.channel = chan;
+ rb_erase(&bss->rbn, &rdev->bss_tree);
+ rb_insert_bss(rdev, bss);
+ rdev->bss_generation++;
+ }
+
+done:
+ spin_unlock_bh(&rdev->bss_lock);
+}
+
#ifdef CONFIG_CFG80211_WEXT
static struct cfg80211_registered_device *
cfg80211_get_dev_from_ifindex(struct net *net, int ifindex)
diff --git a/net/wireless/sme.c b/net/wireless/sme.c
index 7a6c38ddc65a..d32a2ec4d96a 100644
--- a/net/wireless/sme.c
+++ b/net/wireless/sme.c
@@ -1307,14 +1307,14 @@ void cfg80211_autodisconnect_wk(struct work_struct *work)
if (wdev->conn_owner_nlportid) {
switch (wdev->iftype) {
case NL80211_IFTYPE_ADHOC:
- cfg80211_leave_ibss(rdev, wdev->netdev, false);
+ __cfg80211_leave_ibss(rdev, wdev->netdev, false);
break;
case NL80211_IFTYPE_AP:
case NL80211_IFTYPE_P2P_GO:
- cfg80211_stop_ap(rdev, wdev->netdev, false);
+ __cfg80211_stop_ap(rdev, wdev->netdev, false);
break;
case NL80211_IFTYPE_MESH_POINT:
- cfg80211_leave_mesh(rdev, wdev->netdev);
+ __cfg80211_leave_mesh(rdev, wdev->netdev);
break;
case NL80211_IFTYPE_STATION:
case NL80211_IFTYPE_P2P_CLIENT:
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index 4fbb91a511ae..3ef1679b0e66 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -646,6 +646,11 @@ DEFINE_EVENT(wiphy_netdev_evt, rdev_flush_pmksa,
TP_ARGS(wiphy, netdev)
);
+DEFINE_EVENT(wiphy_netdev_evt, rdev_end_cac,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev),
+ TP_ARGS(wiphy, netdev)
+);
+
DECLARE_EVENT_CLASS(station_add_change,
TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 *mac,
struct station_parameters *params),
@@ -2009,7 +2014,7 @@ TRACE_EVENT(rdev_start_nan,
WIPHY_ENTRY
WDEV_ENTRY
__field(u8, master_pref)
- __field(u8, bands);
+ __field(u8, bands)
),
TP_fast_assign(
WIPHY_ASSIGN;
@@ -2031,8 +2036,8 @@ TRACE_EVENT(rdev_nan_change_conf,
WIPHY_ENTRY
WDEV_ENTRY
__field(u8, master_pref)
- __field(u8, bands);
- __field(u32, changes);
+ __field(u8, bands)
+ __field(u32, changes)
),
TP_fast_assign(
WIPHY_ASSIGN;
@@ -2446,10 +2451,11 @@ TRACE_EVENT(rdev_set_mcast_rate,
sizeof(int) * NUM_NL80211_BANDS);
),
TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", "
- "mcast_rates [2.4GHz=0x%x, 5.2GHz=0x%x, 60GHz=0x%x]",
+ "mcast_rates [2.4GHz=0x%x, 5.2GHz=0x%x, 6GHz=0x%x, 60GHz=0x%x]",
WIPHY_PR_ARG, NETDEV_PR_ARG,
__entry->mcast_rate[NL80211_BAND_2GHZ],
__entry->mcast_rate[NL80211_BAND_5GHZ],
+ __entry->mcast_rate[NL80211_BAND_6GHZ],
__entry->mcast_rate[NL80211_BAND_60GHZ])
);
diff --git a/net/wireless/util.c b/net/wireless/util.c
index d0e35b7b9e35..8481e9ac33da 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -91,6 +91,11 @@ int ieee80211_channel_to_frequency(int chan, enum nl80211_band band)
else
return 5000 + chan * 5;
break;
+ case NL80211_BAND_6GHZ:
+ /* see 802.11ax D4.1 27.3.22.2 */
+ if (chan <= 253)
+ return 5940 + chan * 5;
+ break;
case NL80211_BAND_60GHZ:
if (chan < 7)
return 56160 + chan * 2160;
@@ -111,8 +116,11 @@ int ieee80211_frequency_to_channel(int freq)
return (freq - 2407) / 5;
else if (freq >= 4910 && freq <= 4980)
return (freq - 4000) / 5;
- else if (freq <= 45000) /* DMG band lower limit */
+ else if (freq < 5945)
return (freq - 5000) / 5;
+ else if (freq <= 45000) /* DMG band lower limit */
+ /* see 802.11ax D4.1 27.3.22.2 */
+ return (freq - 5940) / 5;
else if (freq >= 58320 && freq <= 70200)
return (freq - 56160) / 2160;
else
@@ -148,6 +156,7 @@ static void set_mandatory_flags_band(struct ieee80211_supported_band *sband)
switch (sband->band) {
case NL80211_BAND_5GHZ:
+ case NL80211_BAND_6GHZ:
want = 3;
for (i = 0; i < sband->n_bitrates; i++) {
if (sband->bitrates[i].bitrate == 60 ||
@@ -233,25 +242,30 @@ int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev,
switch (params->cipher) {
case WLAN_CIPHER_SUITE_TKIP:
+ /* Extended Key ID can only be used with CCMP/GCMP ciphers */
+ if ((pairwise && key_idx) ||
+ params->mode != NL80211_KEY_RX_TX)
+ return -EINVAL;
+ break;
case WLAN_CIPHER_SUITE_CCMP:
case WLAN_CIPHER_SUITE_CCMP_256:
case WLAN_CIPHER_SUITE_GCMP:
case WLAN_CIPHER_SUITE_GCMP_256:
- /* IEEE802.11-2016 allows only 0 and - when using Extended Key
- * ID - 1 as index for pairwise keys.
+ /* IEEE802.11-2016 allows only 0 and - when supporting
+ * Extended Key ID - 1 as index for pairwise keys.
* @NL80211_KEY_NO_TX is only allowed for pairwise keys when
* the driver supports Extended Key ID.
* @NL80211_KEY_SET_TX can't be set when installing and
* validating a key.
*/
- if (params->mode == NL80211_KEY_NO_TX) {
- if (!wiphy_ext_feature_isset(&rdev->wiphy,
- NL80211_EXT_FEATURE_EXT_KEY_ID))
- return -EINVAL;
- else if (!pairwise || key_idx < 0 || key_idx > 1)
+ if ((params->mode == NL80211_KEY_NO_TX && !pairwise) ||
+ params->mode == NL80211_KEY_SET_TX)
+ return -EINVAL;
+ if (wiphy_ext_feature_isset(&rdev->wiphy,
+ NL80211_EXT_FEATURE_EXT_KEY_ID)) {
+ if (pairwise && (key_idx < 0 || key_idx > 1))
return -EINVAL;
- } else if ((pairwise && key_idx) ||
- params->mode == NL80211_KEY_SET_TX) {
+ } else if (pairwise && key_idx) {
return -EINVAL;
}
break;
@@ -550,7 +564,7 @@ __frame_add_frag(struct sk_buff *skb, struct page *page,
struct skb_shared_info *sh = skb_shinfo(skb);
int page_offset;
- page_ref_inc(page);
+ get_page(page);
page_offset = ptr - page_address(page);
skb_add_rx_frag(skb, sh->nr_frags, page, page_offset, len, size);
}
@@ -955,6 +969,7 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev,
}
cfg80211_process_rdev_events(rdev);
+ cfg80211_mlme_purge_registrations(dev->ieee80211_ptr);
}
err = rdev_change_virtual_intf(rdev, dev, ntype, params);
@@ -1034,7 +1049,7 @@ static u32 cfg80211_calculate_bitrate_ht(struct rate_info *rate)
return (bitrate + 50000) / 100000;
}
-static u32 cfg80211_calculate_bitrate_60g(struct rate_info *rate)
+static u32 cfg80211_calculate_bitrate_dmg(struct rate_info *rate)
{
static const u32 __mcs2bitrate[] = {
/* control PHY */
@@ -1081,6 +1096,40 @@ static u32 cfg80211_calculate_bitrate_60g(struct rate_info *rate)
return __mcs2bitrate[rate->mcs];
}
+static u32 cfg80211_calculate_bitrate_edmg(struct rate_info *rate)
+{
+ static const u32 __mcs2bitrate[] = {
+ /* control PHY */
+ [0] = 275,
+ /* SC PHY */
+ [1] = 3850,
+ [2] = 7700,
+ [3] = 9625,
+ [4] = 11550,
+ [5] = 12512, /* 1251.25 mbps */
+ [6] = 13475,
+ [7] = 15400,
+ [8] = 19250,
+ [9] = 23100,
+ [10] = 25025,
+ [11] = 26950,
+ [12] = 30800,
+ [13] = 38500,
+ [14] = 46200,
+ [15] = 50050,
+ [16] = 53900,
+ [17] = 57750,
+ [18] = 69300,
+ [19] = 75075,
+ [20] = 80850,
+ };
+
+ if (WARN_ON_ONCE(rate->mcs >= ARRAY_SIZE(__mcs2bitrate)))
+ return 0;
+
+ return __mcs2bitrate[rate->mcs] * rate->n_bonded_ch;
+}
+
static u32 cfg80211_calculate_bitrate_vht(struct rate_info *rate)
{
static const u32 base[4][10] = {
@@ -1253,8 +1302,10 @@ u32 cfg80211_calculate_bitrate(struct rate_info *rate)
{
if (rate->flags & RATE_INFO_FLAGS_MCS)
return cfg80211_calculate_bitrate_ht(rate);
- if (rate->flags & RATE_INFO_FLAGS_60G)
- return cfg80211_calculate_bitrate_60g(rate);
+ if (rate->flags & RATE_INFO_FLAGS_DMG)
+ return cfg80211_calculate_bitrate_dmg(rate);
+ if (rate->flags & RATE_INFO_FLAGS_EDMG)
+ return cfg80211_calculate_bitrate_edmg(rate);
if (rate->flags & RATE_INFO_FLAGS_VHT_MCS)
return cfg80211_calculate_bitrate_vht(rate);
if (rate->flags & RATE_INFO_FLAGS_HE_MCS)
@@ -1466,6 +1517,9 @@ bool ieee80211_operating_class_to_band(u8 operating_class,
case 128 ... 130:
*band = NL80211_BAND_5GHZ;
return true;
+ case 131 ... 135:
+ *band = NL80211_BAND_6GHZ;
+ return true;
case 81:
case 82:
case 83:
@@ -1505,7 +1559,8 @@ bool ieee80211_chandef_to_operating_class(struct cfg80211_chan_def *chandef,
}
if (freq == 2484) {
- if (chandef->width > NL80211_CHAN_WIDTH_40)
+ /* channel 14 is only for IEEE 802.11b */
+ if (chandef->width != NL80211_CHAN_WIDTH_20_NOHT)
return false;
*op_class = 82; /* channel 14 */
diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c
index 46e4d69db845..cac9e28d852b 100644
--- a/net/wireless/wext-compat.c
+++ b/net/wireless/wext-compat.c
@@ -7,6 +7,7 @@
* we directly assign the wireless handlers of wireless interfaces.
*
* Copyright 2008-2009 Johannes Berg <johannes@sipsolutions.net>
+ * Copyright (C) 2019 Intel Corporation
*/
#include <linux/export.h>
@@ -797,7 +798,7 @@ static int cfg80211_wext_giwfreq(struct net_device *dev,
{
struct wireless_dev *wdev = dev->ieee80211_ptr;
struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
- struct cfg80211_chan_def chandef;
+ struct cfg80211_chan_def chandef = {};
int ret;
switch (wdev->iftype) {
@@ -864,8 +865,8 @@ static int cfg80211_wext_siwtxpower(struct net_device *dev,
}
}
} else {
- rfkill_set_sw_state(rdev->rfkill, true);
- schedule_work(&rdev->rfkill_sync);
+ if (rfkill_set_sw_state(rdev->rfkill, true))
+ schedule_work(&rdev->rfkill_block);
return 0;
}
diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c
index 5e677dac2a0c..69102fda9ebd 100644
--- a/net/wireless/wext-core.c
+++ b/net/wireless/wext-core.c
@@ -657,7 +657,8 @@ struct iw_statistics *get_wireless_stats(struct net_device *dev)
return NULL;
}
-static int iw_handler_get_iwstats(struct net_device * dev,
+/* noinline to avoid a bogus warning with -O3 */
+static noinline int iw_handler_get_iwstats(struct net_device * dev,
struct iw_request_info * info,
union iwreq_data * wrqu,
char * extra)
diff --git a/net/wireless/wext-sme.c b/net/wireless/wext-sme.c
index c67d7a82ab13..73fd0eae08ca 100644
--- a/net/wireless/wext-sme.c
+++ b/net/wireless/wext-sme.c
@@ -202,6 +202,7 @@ int cfg80211_mgd_wext_giwessid(struct net_device *dev,
struct iw_point *data, char *ssid)
{
struct wireless_dev *wdev = dev->ieee80211_ptr;
+ int ret = 0;
/* call only for station! */
if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION))
@@ -219,7 +220,10 @@ int cfg80211_mgd_wext_giwessid(struct net_device *dev,
if (ie) {
data->flags = 1;
data->length = ie[1];
- memcpy(ssid, ie + 2, data->length);
+ if (data->length > IW_ESSID_MAX_SIZE)
+ ret = -EINVAL;
+ else
+ memcpy(ssid, ie + 2, data->length);
}
rcu_read_unlock();
} else if (wdev->wext.connect.ssid && wdev->wext.connect.ssid_len) {
@@ -229,7 +233,7 @@ int cfg80211_mgd_wext_giwessid(struct net_device *dev,
}
wdev_unlock(wdev);
- return 0;
+ return ret;
}
int cfg80211_mgd_wext_siwap(struct net_device *dev,
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index 6aee9f5e8e71..d5b09bbff375 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -659,6 +659,12 @@ static int x25_release(struct socket *sock)
sock_set_flag(sk, SOCK_DEAD);
sock_set_flag(sk, SOCK_DESTROY);
break;
+
+ case X25_STATE_5:
+ x25_write_internal(sk, X25_CLEAR_REQUEST);
+ x25_disconnect(sk, 0, 0, 0);
+ __x25_destroy_socket(sk);
+ goto out;
}
sock_orphan(sk);
@@ -760,6 +766,10 @@ static int x25_connect(struct socket *sock, struct sockaddr *uaddr,
if (sk->sk_state == TCP_ESTABLISHED)
goto out;
+ rc = -EALREADY; /* Do nothing if call is already in progress */
+ if (sk->sk_state == TCP_SYN_SENT)
+ goto out;
+
sk->sk_state = TCP_CLOSE;
sock->state = SS_UNCONNECTED;
@@ -806,7 +816,7 @@ static int x25_connect(struct socket *sock, struct sockaddr *uaddr,
/* Now the loop */
rc = -EINPROGRESS;
if (sk->sk_state != TCP_ESTABLISHED && (flags & O_NONBLOCK))
- goto out_put_neigh;
+ goto out;
rc = x25_wait_for_connection_establishment(sk);
if (rc)
@@ -891,7 +901,7 @@ static int x25_accept(struct socket *sock, struct socket *newsock, int flags,
/* Now attach up the new socket */
skb->sk = NULL;
kfree_skb(skb);
- sk->sk_ack_backlog--;
+ sk_acceptq_removed(sk);
newsock->state = SS_CONNECTED;
rc = 0;
out2:
@@ -1054,6 +1064,8 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb,
if (test_bit(X25_ACCPT_APPRV_FLAG, &makex25->flags)) {
x25_write_internal(make, X25_CALL_ACCEPTED);
makex25->state = X25_STATE_3;
+ } else {
+ makex25->state = X25_STATE_5;
}
/*
@@ -1062,7 +1074,7 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb,
skb_copy_from_linear_data(skb, makex25->calluserdata.cuddata, skb->len);
makex25->calluserdata.cudlength = skb->len;
- sk->sk_ack_backlog++;
+ sk_acceptq_added(sk);
x25_insert_socket(make);
diff --git a/net/x25/x25_dev.c b/net/x25/x25_dev.c
index 5c111bc3c8ea..00e782335cb0 100644
--- a/net/x25/x25_dev.c
+++ b/net/x25/x25_dev.c
@@ -55,7 +55,7 @@ static int x25_receive_data(struct sk_buff *skb, struct x25_neigh *nb)
if (!sock_owned_by_user(sk)) {
queued = x25_process_rx_frame(sk, skb);
} else {
- queued = !sk_add_backlog(sk, skb, sk->sk_rcvbuf);
+ queued = !sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf));
}
bh_unlock_sock(sk);
sock_put(sk);
diff --git a/net/x25/x25_in.c b/net/x25/x25_in.c
index f97c43344e95..4d3bb46aaae0 100644
--- a/net/x25/x25_in.c
+++ b/net/x25/x25_in.c
@@ -382,6 +382,35 @@ out_clear:
return 0;
}
+/*
+ * State machine for state 5, Call Accepted / Call Connected pending (X25_ACCPT_APPRV_FLAG).
+ * The handling of the timer(s) is in file x25_timer.c
+ * Handling of state 0 and connection release is in af_x25.c.
+ */
+static int x25_state5_machine(struct sock *sk, struct sk_buff *skb, int frametype)
+{
+ struct x25_sock *x25 = x25_sk(sk);
+
+ switch (frametype) {
+ case X25_CLEAR_REQUEST:
+ if (!pskb_may_pull(skb, X25_STD_MIN_LEN + 2)) {
+ x25_write_internal(sk, X25_CLEAR_REQUEST);
+ x25->state = X25_STATE_2;
+ x25_start_t23timer(sk);
+ return 0;
+ }
+
+ x25_write_internal(sk, X25_CLEAR_CONFIRMATION);
+ x25_disconnect(sk, 0, skb->data[3], skb->data[4]);
+ break;
+
+ default:
+ break;
+ }
+
+ return 0;
+}
+
/* Higher level upcall for a LAPB frame */
int x25_process_rx_frame(struct sock *sk, struct sk_buff *skb)
{
@@ -406,6 +435,9 @@ int x25_process_rx_frame(struct sock *sk, struct sk_buff *skb)
case X25_STATE_4:
queued = x25_state4_machine(sk, skb, frametype);
break;
+ case X25_STATE_5:
+ queued = x25_state5_machine(sk, skb, frametype);
+ break;
}
x25_kick(sk);
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index 83de74ca729a..fa7bb5e060d0 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -14,6 +14,7 @@
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/idr.h>
+#include <linux/vmalloc.h>
#include "xdp_umem.h"
#include "xsk_queue.h"
@@ -26,6 +27,9 @@ void xdp_add_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
{
unsigned long flags;
+ if (!xs->tx)
+ return;
+
spin_lock_irqsave(&umem->xsk_list_lock, flags);
list_add_rcu(&xs->list, &umem->xsk_list);
spin_unlock_irqrestore(&umem->xsk_list_lock, flags);
@@ -35,6 +39,9 @@ void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
{
unsigned long flags;
+ if (!xs->tx)
+ return;
+
spin_lock_irqsave(&umem->xsk_list_lock, flags);
list_del_rcu(&xs->list);
spin_unlock_irqrestore(&umem->xsk_list_lock, flags);
@@ -105,14 +112,22 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
umem->dev = dev;
umem->queue_id = queue_id;
+ if (flags & XDP_USE_NEED_WAKEUP) {
+ umem->flags |= XDP_UMEM_USES_NEED_WAKEUP;
+ /* Tx needs to be explicitly woken up the first time.
+ * Also for supporting drivers that do not implement this
+ * feature. They will always have to call sendto().
+ */
+ xsk_set_tx_need_wakeup(umem);
+ }
+
dev_hold(dev);
if (force_copy)
/* For copy-mode, we are done. */
return 0;
- if (!dev->netdev_ops->ndo_bpf ||
- !dev->netdev_ops->ndo_xsk_async_xmit) {
+ if (!dev->netdev_ops->ndo_bpf || !dev->netdev_ops->ndo_xsk_wakeup) {
err = -EOPNOTSUPP;
goto err_unreg_umem;
}
@@ -164,17 +179,41 @@ void xdp_umem_clear_dev(struct xdp_umem *umem)
umem->zc = false;
}
-static void xdp_umem_unpin_pages(struct xdp_umem *umem)
+static void xdp_umem_unmap_pages(struct xdp_umem *umem)
{
unsigned int i;
+ for (i = 0; i < umem->npgs; i++)
+ if (PageHighMem(umem->pgs[i]))
+ vunmap(umem->pages[i].addr);
+}
+
+static int xdp_umem_map_pages(struct xdp_umem *umem)
+{
+ unsigned int i;
+ void *addr;
+
for (i = 0; i < umem->npgs; i++) {
- struct page *page = umem->pgs[i];
+ if (PageHighMem(umem->pgs[i]))
+ addr = vmap(&umem->pgs[i], 1, VM_MAP, PAGE_KERNEL);
+ else
+ addr = page_address(umem->pgs[i]);
+
+ if (!addr) {
+ xdp_umem_unmap_pages(umem);
+ return -ENOMEM;
+ }
- set_page_dirty_lock(page);
- put_page(page);
+ umem->pages[i].addr = addr;
}
+ return 0;
+}
+
+static void xdp_umem_unpin_pages(struct xdp_umem *umem)
+{
+ unpin_user_pages_dirty_lock(umem->pgs, umem->npgs, true);
+
kfree(umem->pgs);
umem->pgs = NULL;
}
@@ -207,9 +246,10 @@ static void xdp_umem_release(struct xdp_umem *umem)
xsk_reuseq_destroy(umem);
+ xdp_umem_unmap_pages(umem);
xdp_umem_unpin_pages(umem);
- kfree(umem->pages);
+ kvfree(umem->pages);
umem->pages = NULL;
xdp_umem_unaccount_pages(umem);
@@ -251,7 +291,7 @@ static int xdp_umem_pin_pages(struct xdp_umem *umem)
return -ENOMEM;
down_read(&current->mm->mmap_sem);
- npgs = get_user_pages(umem->address, umem->npgs,
+ npgs = pin_user_pages(umem->address, umem->npgs,
gup_flags | FOLL_LONGTERM, &umem->pgs[0], NULL);
up_read(&current->mm->mmap_sem);
@@ -299,10 +339,11 @@ static int xdp_umem_account_pages(struct xdp_umem *umem)
static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
{
+ bool unaligned_chunks = mr->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG;
u32 chunk_size = mr->chunk_size, headroom = mr->headroom;
unsigned int chunks, chunks_per_page;
u64 addr = mr->addr, size = mr->len;
- int size_chk, err, i;
+ int size_chk, err;
if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > PAGE_SIZE) {
/* Strictly speaking we could support this, if:
@@ -314,7 +355,11 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
return -EINVAL;
}
- if (!is_power_of_2(chunk_size))
+ if (mr->flags & ~(XDP_UMEM_UNALIGNED_CHUNK_FLAG |
+ XDP_UMEM_USES_NEED_WAKEUP))
+ return -EINVAL;
+
+ if (!unaligned_chunks && !is_power_of_2(chunk_size))
return -EINVAL;
if (!PAGE_ALIGNED(addr)) {
@@ -331,24 +376,26 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
if (chunks == 0)
return -EINVAL;
- chunks_per_page = PAGE_SIZE / chunk_size;
- if (chunks < chunks_per_page || chunks % chunks_per_page)
- return -EINVAL;
-
- headroom = ALIGN(headroom, 64);
+ if (!unaligned_chunks) {
+ chunks_per_page = PAGE_SIZE / chunk_size;
+ if (chunks < chunks_per_page || chunks % chunks_per_page)
+ return -EINVAL;
+ }
size_chk = chunk_size - headroom - XDP_PACKET_HEADROOM;
if (size_chk < 0)
return -EINVAL;
umem->address = (unsigned long)addr;
- umem->chunk_mask = ~((u64)chunk_size - 1);
+ umem->chunk_mask = unaligned_chunks ? XSK_UNALIGNED_BUF_ADDR_MASK
+ : ~((u64)chunk_size - 1);
umem->size = size;
umem->headroom = headroom;
umem->chunk_size_nohr = chunk_size - headroom;
umem->npgs = size / PAGE_SIZE;
umem->pgs = NULL;
umem->user = NULL;
+ umem->flags = mr->flags;
INIT_LIST_HEAD(&umem->xsk_list);
spin_lock_init(&umem->xsk_list_lock);
@@ -362,17 +409,21 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
if (err)
goto out_account;
- umem->pages = kcalloc(umem->npgs, sizeof(*umem->pages), GFP_KERNEL);
+ umem->pages = kvcalloc(umem->npgs, sizeof(*umem->pages),
+ GFP_KERNEL_ACCOUNT);
if (!umem->pages) {
err = -ENOMEM;
- goto out_account;
+ goto out_pin;
}
- for (i = 0; i < umem->npgs; i++)
- umem->pages[i].addr = page_address(umem->pgs[i]);
+ err = xdp_umem_map_pages(umem);
+ if (!err)
+ return 0;
- return 0;
+ kvfree(umem->pages);
+out_pin:
+ xdp_umem_unpin_pages(umem);
out_account:
xdp_umem_unaccount_pages(umem);
return err;
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 59b57d708697..df600487a68d 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -31,6 +31,8 @@
#define TX_BATCH_SIZE 16
+static DEFINE_PER_CPU(struct list_head, xskmap_flush_list);
+
bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
{
return READ_ONCE(xs->rx) && READ_ONCE(xs->umem) &&
@@ -39,37 +41,119 @@ bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt)
{
- return xskq_has_addrs(umem->fq, cnt);
+ return xskq_cons_has_entries(umem->fq, cnt);
}
EXPORT_SYMBOL(xsk_umem_has_addrs);
-u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr)
+bool xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr)
{
- return xskq_peek_addr(umem->fq, addr);
+ return xskq_cons_peek_addr(umem->fq, addr, umem);
}
EXPORT_SYMBOL(xsk_umem_peek_addr);
-void xsk_umem_discard_addr(struct xdp_umem *umem)
+void xsk_umem_release_addr(struct xdp_umem *umem)
{
- xskq_discard_addr(umem->fq);
+ xskq_cons_release(umem->fq);
+}
+EXPORT_SYMBOL(xsk_umem_release_addr);
+
+void xsk_set_rx_need_wakeup(struct xdp_umem *umem)
+{
+ if (umem->need_wakeup & XDP_WAKEUP_RX)
+ return;
+
+ umem->fq->ring->flags |= XDP_RING_NEED_WAKEUP;
+ umem->need_wakeup |= XDP_WAKEUP_RX;
+}
+EXPORT_SYMBOL(xsk_set_rx_need_wakeup);
+
+void xsk_set_tx_need_wakeup(struct xdp_umem *umem)
+{
+ struct xdp_sock *xs;
+
+ if (umem->need_wakeup & XDP_WAKEUP_TX)
+ return;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
+ xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
+ }
+ rcu_read_unlock();
+
+ umem->need_wakeup |= XDP_WAKEUP_TX;
+}
+EXPORT_SYMBOL(xsk_set_tx_need_wakeup);
+
+void xsk_clear_rx_need_wakeup(struct xdp_umem *umem)
+{
+ if (!(umem->need_wakeup & XDP_WAKEUP_RX))
+ return;
+
+ umem->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP;
+ umem->need_wakeup &= ~XDP_WAKEUP_RX;
+}
+EXPORT_SYMBOL(xsk_clear_rx_need_wakeup);
+
+void xsk_clear_tx_need_wakeup(struct xdp_umem *umem)
+{
+ struct xdp_sock *xs;
+
+ if (!(umem->need_wakeup & XDP_WAKEUP_TX))
+ return;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
+ xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP;
+ }
+ rcu_read_unlock();
+
+ umem->need_wakeup &= ~XDP_WAKEUP_TX;
+}
+EXPORT_SYMBOL(xsk_clear_tx_need_wakeup);
+
+bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem)
+{
+ return umem->flags & XDP_UMEM_USES_NEED_WAKEUP;
+}
+EXPORT_SYMBOL(xsk_umem_uses_need_wakeup);
+
+/* If a buffer crosses a page boundary, we need to do 2 memcpy's, one for
+ * each page. This is only required in copy mode.
+ */
+static void __xsk_rcv_memcpy(struct xdp_umem *umem, u64 addr, void *from_buf,
+ u32 len, u32 metalen)
+{
+ void *to_buf = xdp_umem_get_data(umem, addr);
+
+ addr = xsk_umem_add_offset_to_addr(addr);
+ if (xskq_cons_crosses_non_contig_pg(umem, addr, len + metalen)) {
+ void *next_pg_addr = umem->pages[(addr >> PAGE_SHIFT) + 1].addr;
+ u64 page_start = addr & ~(PAGE_SIZE - 1);
+ u64 first_len = PAGE_SIZE - (addr - page_start);
+
+ memcpy(to_buf, from_buf, first_len + metalen);
+ memcpy(next_pg_addr, from_buf + first_len, len - first_len);
+
+ return;
+ }
+
+ memcpy(to_buf, from_buf, len + metalen);
}
-EXPORT_SYMBOL(xsk_umem_discard_addr);
static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
- void *to_buf, *from_buf;
+ u64 offset = xs->umem->headroom;
+ u64 addr, memcpy_addr;
+ void *from_buf;
u32 metalen;
- u64 addr;
int err;
- if (!xskq_peek_addr(xs->umem->fq, &addr) ||
+ if (!xskq_cons_peek_addr(xs->umem->fq, &addr, xs->umem) ||
len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
xs->rx_dropped++;
return -ENOSPC;
}
- addr += xs->umem->headroom;
-
if (unlikely(xdp_data_meta_unsupported(xdp))) {
from_buf = xdp->data;
metalen = 0;
@@ -78,12 +162,14 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
metalen = xdp->data - xdp->data_meta;
}
- to_buf = xdp_umem_get_data(xs->umem, addr);
- memcpy(to_buf, from_buf, len + metalen);
- addr += metalen;
- err = xskq_produce_batch_desc(xs->rx, addr, len);
+ memcpy_addr = xsk_umem_adjust_offset(xs->umem, addr, offset);
+ __xsk_rcv_memcpy(xs->umem, memcpy_addr, from_buf, len, metalen);
+
+ offset += metalen;
+ addr = xsk_umem_adjust_offset(xs->umem, addr, offset);
+ err = xskq_prod_reserve_desc(xs->rx, addr, len);
if (!err) {
- xskq_discard_addr(xs->umem->fq);
+ xskq_cons_release(xs->umem->fq);
xdp_return_buff(xdp);
return 0;
}
@@ -94,7 +180,7 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
- int err = xskq_produce_batch_desc(xs->rx, (u64)xdp->handle, len);
+ int err = xskq_prod_reserve_desc(xs->rx, xdp->handle, len);
if (err)
xs->rx_dropped++;
@@ -102,10 +188,23 @@ static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
return err;
}
-int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
+static bool xsk_is_bound(struct xdp_sock *xs)
+{
+ if (READ_ONCE(xs->state) == XSK_BOUND) {
+ /* Matches smp_wmb() in bind(). */
+ smp_rmb();
+ return true;
+ }
+ return false;
+}
+
+static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
u32 len;
+ if (!xsk_is_bound(xs))
+ return -EINVAL;
+
if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
return -EINVAL;
@@ -115,16 +214,17 @@ int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
__xsk_rcv_zc(xs, xdp, len) : __xsk_rcv(xs, xdp, len);
}
-void xsk_flush(struct xdp_sock *xs)
+static void xsk_flush(struct xdp_sock *xs)
{
- xskq_produce_flush_desc(xs->rx);
- xs->sk.sk_data_ready(&xs->sk);
+ xskq_prod_submit(xs->rx);
+ sock_def_readable(&xs->sk);
}
int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
u32 metalen = xdp->data - xdp->data_meta;
u32 len = xdp->data_end - xdp->data;
+ u64 offset = xs->umem->headroom;
void *buffer;
u64 addr;
int err;
@@ -136,23 +236,23 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
goto out_unlock;
}
- if (!xskq_peek_addr(xs->umem->fq, &addr) ||
+ if (!xskq_cons_peek_addr(xs->umem->fq, &addr, xs->umem) ||
len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
err = -ENOSPC;
goto out_drop;
}
- addr += xs->umem->headroom;
-
+ addr = xsk_umem_adjust_offset(xs->umem, addr, offset);
buffer = xdp_umem_get_data(xs->umem, addr);
memcpy(buffer, xdp->data_meta, len + metalen);
- addr += metalen;
- err = xskq_produce_batch_desc(xs->rx, addr, len);
+
+ addr = xsk_umem_adjust_offset(xs->umem, addr, metalen);
+ err = xskq_prod_reserve_desc(xs->rx, addr, len);
if (err)
goto out_drop;
- xskq_discard_addr(xs->umem->fq);
- xskq_produce_flush_desc(xs->rx);
+ xskq_cons_release(xs->umem->fq);
+ xskq_prod_submit(xs->rx);
spin_unlock_bh(&xs->rx_lock);
@@ -166,9 +266,35 @@ out_unlock:
return err;
}
+int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp)
+{
+ struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
+ int err;
+
+ err = xsk_rcv(xs, xdp);
+ if (err)
+ return err;
+
+ if (!xs->flush_node.prev)
+ list_add(&xs->flush_node, flush_list);
+
+ return 0;
+}
+
+void __xsk_map_flush(void)
+{
+ struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
+ struct xdp_sock *xs, *tmp;
+
+ list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
+ xsk_flush(xs);
+ __list_del_clearprev(&xs->flush_node);
+ }
+}
+
void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries)
{
- xskq_produce_flush_addr_n(umem->cq, nb_entries);
+ xskq_prod_submit_n(umem->cq, nb_entries);
}
EXPORT_SYMBOL(xsk_umem_complete_tx);
@@ -190,13 +316,18 @@ bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc)
rcu_read_lock();
list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
- if (!xskq_peek_desc(xs->tx, desc))
+ if (!xskq_cons_peek_desc(xs->tx, desc, umem))
continue;
- if (xskq_produce_addr_lazy(umem->cq, desc->addr))
+ /* This is the backpreassure mechanism for the Tx path.
+ * Reserve space in the completion queue and only proceed
+ * if there is space in it. This avoids having to implement
+ * any buffering in the Tx path.
+ */
+ if (xskq_prod_reserve_addr(umem->cq, desc->addr))
goto out;
- xskq_discard_desc(xs->tx);
+ xskq_cons_release(xs->tx);
rcu_read_unlock();
return true;
}
@@ -207,12 +338,21 @@ out:
}
EXPORT_SYMBOL(xsk_umem_consume_tx);
-static int xsk_zc_xmit(struct sock *sk)
+static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
{
- struct xdp_sock *xs = xdp_sk(sk);
struct net_device *dev = xs->dev;
+ int err;
- return dev->netdev_ops->ndo_xsk_async_xmit(dev, xs->queue_id);
+ rcu_read_lock();
+ err = dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
+ rcu_read_unlock();
+
+ return err;
+}
+
+static int xsk_zc_xmit(struct xdp_sock *xs)
+{
+ return xsk_wakeup(xs, XDP_WAKEUP_TX);
}
static void xsk_destruct_skb(struct sk_buff *skb)
@@ -222,17 +362,16 @@ static void xsk_destruct_skb(struct sk_buff *skb)
unsigned long flags;
spin_lock_irqsave(&xs->tx_completion_lock, flags);
- WARN_ON_ONCE(xskq_produce_addr(xs->umem->cq, addr));
+ xskq_prod_submit_addr(xs->umem->cq, addr);
spin_unlock_irqrestore(&xs->tx_completion_lock, flags);
sock_wfree(skb);
}
-static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
- size_t total_len)
+static int xsk_generic_xmit(struct sock *sk)
{
- u32 max_batch = TX_BATCH_SIZE;
struct xdp_sock *xs = xdp_sk(sk);
+ u32 max_batch = TX_BATCH_SIZE;
bool sent_frame = false;
struct xdp_desc desc;
struct sk_buff *skb;
@@ -243,7 +382,7 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
if (xs->queue_id >= xs->dev->real_num_tx_queues)
goto out;
- while (xskq_peek_desc(xs->tx, &desc)) {
+ while (xskq_cons_peek_desc(xs->tx, &desc, xs->umem)) {
char *buffer;
u64 addr;
u32 len;
@@ -264,7 +403,12 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
addr = desc.addr;
buffer = xdp_umem_get_data(xs->umem, addr);
err = skb_store_bits(skb, 0, buffer, len);
- if (unlikely(err) || xskq_reserve_addr(xs->umem->cq)) {
+ /* This is the backpreassure mechanism for the Tx path.
+ * Reserve space in the completion queue and only proceed
+ * if there is space in it. This avoids having to implement
+ * any buffering in the Tx path.
+ */
+ if (unlikely(err) || xskq_prod_reserve(xs->umem->cq)) {
kfree_skb(skb);
goto out;
}
@@ -272,11 +416,11 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
skb->dev = xs->dev;
skb->priority = sk->sk_priority;
skb->mark = sk->sk_mark;
- skb_shinfo(skb)->destructor_arg = (void *)(long)addr;
+ skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr;
skb->destructor = xsk_destruct_skb;
err = dev_direct_xmit(skb, xs->queue_id);
- xskq_discard_desc(xs->tx);
+ xskq_cons_release(xs->tx);
/* Ignore NET_XMIT_CN as packet might have been sent */
if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) {
/* SKB completed but not sent */
@@ -295,35 +439,57 @@ out:
return err;
}
-static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
+static int __xsk_sendmsg(struct sock *sk)
{
- bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
- struct sock *sk = sock->sk;
struct xdp_sock *xs = xdp_sk(sk);
- if (unlikely(!xs->dev))
- return -ENXIO;
if (unlikely(!(xs->dev->flags & IFF_UP)))
return -ENETDOWN;
if (unlikely(!xs->tx))
return -ENOBUFS;
- if (need_wait)
+
+ return xs->zc ? xsk_zc_xmit(xs) : xsk_generic_xmit(sk);
+}
+
+static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
+{
+ bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
+ struct sock *sk = sock->sk;
+ struct xdp_sock *xs = xdp_sk(sk);
+
+ if (unlikely(!xsk_is_bound(xs)))
+ return -ENXIO;
+ if (unlikely(need_wait))
return -EOPNOTSUPP;
- return (xs->zc) ? xsk_zc_xmit(sk) : xsk_generic_xmit(sk, m, total_len);
+ return __xsk_sendmsg(sk);
}
-static unsigned int xsk_poll(struct file *file, struct socket *sock,
+static __poll_t xsk_poll(struct file *file, struct socket *sock,
struct poll_table_struct *wait)
{
- unsigned int mask = datagram_poll(file, sock, wait);
+ __poll_t mask = datagram_poll(file, sock, wait);
struct sock *sk = sock->sk;
struct xdp_sock *xs = xdp_sk(sk);
+ struct xdp_umem *umem;
- if (xs->rx && !xskq_empty_desc(xs->rx))
- mask |= POLLIN | POLLRDNORM;
- if (xs->tx && !xskq_full_desc(xs->tx))
- mask |= POLLOUT | POLLWRNORM;
+ if (unlikely(!xsk_is_bound(xs)))
+ return mask;
+
+ umem = xs->umem;
+
+ if (umem->need_wakeup) {
+ if (xs->zc)
+ xsk_wakeup(xs, umem->need_wakeup);
+ else
+ /* Poll needs to drive Tx also in copy mode */
+ __xsk_sendmsg(sk);
+ }
+
+ if (xs->rx && !xskq_prod_is_empty(xs->rx))
+ mask |= EPOLLIN | EPOLLRDNORM;
+ if (xs->tx && !xskq_cons_is_full(xs->tx))
+ mask |= EPOLLOUT | EPOLLWRNORM;
return mask;
}
@@ -342,7 +508,7 @@ static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
/* Make sure queue is ready before it can be seen by others */
smp_wmb();
- *queue = q;
+ WRITE_ONCE(*queue, q);
return 0;
}
@@ -350,10 +516,9 @@ static void xsk_unbind_dev(struct xdp_sock *xs)
{
struct net_device *dev = xs->dev;
- if (!dev || xs->state != XSK_BOUND)
+ if (xs->state != XSK_BOUND)
return;
-
- xs->state = XSK_UNBOUND;
+ WRITE_ONCE(xs->state, XSK_UNBOUND);
/* Wait for driver to stop using the xdp socket. */
xdp_del_sk_umem(xs->umem, xs);
@@ -362,6 +527,52 @@ static void xsk_unbind_dev(struct xdp_sock *xs)
dev_put(dev);
}
+static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs,
+ struct xdp_sock ***map_entry)
+{
+ struct xsk_map *map = NULL;
+ struct xsk_map_node *node;
+
+ *map_entry = NULL;
+
+ spin_lock_bh(&xs->map_list_lock);
+ node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node,
+ node);
+ if (node) {
+ WARN_ON(xsk_map_inc(node->map));
+ map = node->map;
+ *map_entry = node->map_entry;
+ }
+ spin_unlock_bh(&xs->map_list_lock);
+ return map;
+}
+
+static void xsk_delete_from_maps(struct xdp_sock *xs)
+{
+ /* This function removes the current XDP socket from all the
+ * maps it resides in. We need to take extra care here, due to
+ * the two locks involved. Each map has a lock synchronizing
+ * updates to the entries, and each socket has a lock that
+ * synchronizes access to the list of maps (map_list). For
+ * deadlock avoidance the locks need to be taken in the order
+ * "map lock"->"socket map list lock". We start off by
+ * accessing the socket map list, and take a reference to the
+ * map to guarantee existence between the
+ * xsk_get_map_list_entry() and xsk_map_try_sock_delete()
+ * calls. Then we ask the map to remove the socket, which
+ * tries to remove the socket from the map. Note that there
+ * might be updates to the map between
+ * xsk_get_map_list_entry() and xsk_map_try_sock_delete().
+ */
+ struct xdp_sock **map_entry = NULL;
+ struct xsk_map *map;
+
+ while ((map = xsk_get_map_list_entry(xs, &map_entry))) {
+ xsk_map_try_sock_delete(map, xs, map_entry);
+ xsk_map_put(map);
+ }
+}
+
static int xsk_release(struct socket *sock)
{
struct sock *sk = sock->sk;
@@ -381,7 +592,10 @@ static int xsk_release(struct socket *sock)
sock_prot_inuse_add(net, sk->sk_prot, -1);
local_bh_enable();
+ xsk_delete_from_maps(xs);
+ mutex_lock(&xs->mutex);
xsk_unbind_dev(xs);
+ mutex_unlock(&xs->mutex);
xskq_destroy(xs->rx);
xskq_destroy(xs->tx);
@@ -412,6 +626,24 @@ static struct socket *xsk_lookup_xsk_from_fd(int fd)
return sock;
}
+/* Check if umem pages are contiguous.
+ * If zero-copy mode, use the DMA address to do the page contiguity check
+ * For all other modes we use addr (kernel virtual address)
+ * Store the result in the low bits of addr.
+ */
+static void xsk_check_page_contiguity(struct xdp_umem *umem, u32 flags)
+{
+ struct xdp_umem_page *pgs = umem->pages;
+ int i, is_contig;
+
+ for (i = 0; i < umem->npgs - 1; i++) {
+ is_contig = (flags & XDP_ZEROCOPY) ?
+ (pgs[i].dma + PAGE_SIZE == pgs[i + 1].dma) :
+ (pgs[i].addr + PAGE_SIZE == pgs[i + 1].addr);
+ pgs[i].addr += is_contig << XSK_NEXT_PG_CONTIG_SHIFT;
+ }
+}
+
static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
{
struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
@@ -427,7 +659,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
return -EINVAL;
flags = sxdp->sxdp_flags;
- if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY))
+ if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY |
+ XDP_USE_NEED_WAKEUP))
return -EINVAL;
rtnl_lock();
@@ -454,7 +687,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
struct xdp_sock *umem_xs;
struct socket *sock;
- if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY)) {
+ if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) ||
+ (flags & XDP_USE_NEED_WAKEUP)) {
/* Cannot specify flags for shared sockets. */
err = -EINVAL;
goto out_unlock;
@@ -473,19 +707,19 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
}
umem_xs = xdp_sk(sock->sk);
- if (!umem_xs->umem) {
- /* No umem to inherit. */
+ if (!xsk_is_bound(umem_xs)) {
err = -EBADF;
sockfd_put(sock);
goto out_unlock;
- } else if (umem_xs->dev != dev || umem_xs->queue_id != qid) {
+ }
+ if (umem_xs->dev != dev || umem_xs->queue_id != qid) {
err = -EINVAL;
sockfd_put(sock);
goto out_unlock;
}
xdp_get_umem(umem_xs->umem);
- xs->umem = umem_xs->umem;
+ WRITE_ONCE(xs->umem, umem_xs->umem);
sockfd_put(sock);
} else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) {
err = -EINVAL;
@@ -500,6 +734,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
err = xdp_umem_assign_dev(xs->umem, dev, qid, flags);
if (err)
goto out_unlock;
+
+ xsk_check_page_contiguity(xs->umem, flags);
}
xs->dev = dev;
@@ -510,16 +746,28 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
xdp_add_sk_umem(xs->umem, xs);
out_unlock:
- if (err)
+ if (err) {
dev_put(dev);
- else
- xs->state = XSK_BOUND;
+ } else {
+ /* Matches smp_rmb() in bind() for shared umem
+ * sockets, and xsk_is_bound().
+ */
+ smp_wmb();
+ WRITE_ONCE(xs->state, XSK_BOUND);
+ }
out_release:
mutex_unlock(&xs->mutex);
rtnl_unlock();
return err;
}
+struct xdp_umem_reg_v1 {
+ __u64 addr; /* Start of packet data area */
+ __u64 len; /* Length of packet data area */
+ __u32 chunk_size;
+ __u32 headroom;
+};
+
static int xsk_setsockopt(struct socket *sock, int level, int optname,
char __user *optval, unsigned int optlen)
{
@@ -549,15 +797,24 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
}
q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
err = xsk_init_queue(entries, q, false);
+ if (!err && optname == XDP_TX_RING)
+ /* Tx needs to be explicitly woken up the first time */
+ xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
mutex_unlock(&xs->mutex);
return err;
}
case XDP_UMEM_REG:
{
- struct xdp_umem_reg mr;
+ size_t mr_size = sizeof(struct xdp_umem_reg);
+ struct xdp_umem_reg mr = {};
struct xdp_umem *umem;
- if (copy_from_user(&mr, optval, sizeof(mr)))
+ if (optlen < sizeof(struct xdp_umem_reg_v1))
+ return -EINVAL;
+ else if (optlen < sizeof(mr))
+ mr_size = sizeof(struct xdp_umem_reg_v1);
+
+ if (copy_from_user(&mr, optval, mr_size))
return -EFAULT;
mutex_lock(&xs->mutex);
@@ -574,7 +831,7 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
/* Make sure umem is ready before it can be seen by others */
smp_wmb();
- xs->umem = umem;
+ WRITE_ONCE(xs->umem, umem);
mutex_unlock(&xs->mutex);
return 0;
}
@@ -610,6 +867,20 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
return -ENOPROTOOPT;
}
+static void xsk_enter_rxtx_offsets(struct xdp_ring_offset_v1 *ring)
+{
+ ring->producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
+ ring->consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
+ ring->desc = offsetof(struct xdp_rxtx_ring, desc);
+}
+
+static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring)
+{
+ ring->producer = offsetof(struct xdp_umem_ring, ptrs.producer);
+ ring->consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
+ ring->desc = offsetof(struct xdp_umem_ring, desc);
+}
+
static int xsk_getsockopt(struct socket *sock, int level, int optname,
char __user *optval, int __user *optlen)
{
@@ -649,26 +920,49 @@ static int xsk_getsockopt(struct socket *sock, int level, int optname,
case XDP_MMAP_OFFSETS:
{
struct xdp_mmap_offsets off;
+ struct xdp_mmap_offsets_v1 off_v1;
+ bool flags_supported = true;
+ void *to_copy;
- if (len < sizeof(off))
+ if (len < sizeof(off_v1))
return -EINVAL;
+ else if (len < sizeof(off))
+ flags_supported = false;
+
+ if (flags_supported) {
+ /* xdp_ring_offset is identical to xdp_ring_offset_v1
+ * except for the flags field added to the end.
+ */
+ xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
+ &off.rx);
+ xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
+ &off.tx);
+ xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
+ &off.fr);
+ xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
+ &off.cr);
+ off.rx.flags = offsetof(struct xdp_rxtx_ring,
+ ptrs.flags);
+ off.tx.flags = offsetof(struct xdp_rxtx_ring,
+ ptrs.flags);
+ off.fr.flags = offsetof(struct xdp_umem_ring,
+ ptrs.flags);
+ off.cr.flags = offsetof(struct xdp_umem_ring,
+ ptrs.flags);
+
+ len = sizeof(off);
+ to_copy = &off;
+ } else {
+ xsk_enter_rxtx_offsets(&off_v1.rx);
+ xsk_enter_rxtx_offsets(&off_v1.tx);
+ xsk_enter_umem_offsets(&off_v1.fr);
+ xsk_enter_umem_offsets(&off_v1.cr);
+
+ len = sizeof(off_v1);
+ to_copy = &off_v1;
+ }
- off.rx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
- off.rx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
- off.rx.desc = offsetof(struct xdp_rxtx_ring, desc);
- off.tx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
- off.tx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
- off.tx.desc = offsetof(struct xdp_rxtx_ring, desc);
-
- off.fr.producer = offsetof(struct xdp_umem_ring, ptrs.producer);
- off.fr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
- off.fr.desc = offsetof(struct xdp_umem_ring, desc);
- off.cr.producer = offsetof(struct xdp_umem_ring, ptrs.producer);
- off.cr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
- off.cr.desc = offsetof(struct xdp_umem_ring, desc);
-
- len = sizeof(off);
- if (copy_to_user(optval, &off, len))
+ if (copy_to_user(optval, to_copy, len))
return -EFAULT;
if (put_user(len, optlen))
return -EFAULT;
@@ -713,7 +1007,7 @@ static int xsk_mmap(struct file *file, struct socket *sock,
unsigned long pfn;
struct page *qpg;
- if (xs->state != XSK_READY)
+ if (READ_ONCE(xs->state) != XSK_READY)
return -EBUSY;
if (offset == XDP_PGOFF_RX_RING) {
@@ -739,7 +1033,7 @@ static int xsk_mmap(struct file *file, struct socket *sock,
/* Matches the smp_wmb() in xsk_init_queue */
smp_rmb();
qpg = virt_to_head_page(q->ring);
- if (size > (PAGE_SIZE << compound_order(qpg)))
+ if (size > page_size(qpg))
return -EINVAL;
pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
@@ -855,6 +1149,9 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol,
spin_lock_init(&xs->rx_lock);
spin_lock_init(&xs->tx_completion_lock);
+ INIT_LIST_HEAD(&xs->map_list);
+ spin_lock_init(&xs->map_list_lock);
+
mutex_lock(&net->xdp.lock);
sk_add_node_rcu(sk, &net->xdp.list);
mutex_unlock(&net->xdp.lock);
@@ -895,7 +1192,7 @@ static struct pernet_operations xsk_net_ops = {
static int __init xsk_init(void)
{
- int err;
+ int err, cpu;
err = proto_register(&xsk_proto, 0 /* no slab */);
if (err)
@@ -913,6 +1210,8 @@ static int __init xsk_init(void)
if (err)
goto out_pernet;
+ for_each_possible_cpu(cpu)
+ INIT_LIST_HEAD(&per_cpu(xskmap_flush_list, cpu));
return 0;
out_pernet:
diff --git a/net/xdp/xsk.h b/net/xdp/xsk.h
index ba8120610426..4cfd106bdb53 100644
--- a/net/xdp/xsk.h
+++ b/net/xdp/xsk.h
@@ -4,6 +4,19 @@
#ifndef XSK_H_
#define XSK_H_
+struct xdp_ring_offset_v1 {
+ __u64 producer;
+ __u64 consumer;
+ __u64 desc;
+};
+
+struct xdp_mmap_offsets_v1 {
+ struct xdp_ring_offset_v1 rx;
+ struct xdp_ring_offset_v1 tx;
+ struct xdp_ring_offset_v1 fr;
+ struct xdp_ring_offset_v1 cr;
+};
+
static inline struct xdp_sock *xdp_sk(struct sock *sk)
{
return (struct xdp_sock *)sk;
diff --git a/net/xdp/xsk_diag.c b/net/xdp/xsk_diag.c
index d5e06c8e0cbf..f59791ba43a0 100644
--- a/net/xdp/xsk_diag.c
+++ b/net/xdp/xsk_diag.c
@@ -56,7 +56,7 @@ static int xsk_diag_put_umem(const struct xdp_sock *xs, struct sk_buff *nlskb)
du.id = umem->id;
du.size = umem->size;
du.num_pages = umem->npgs;
- du.chunk_size = (__u32)(~umem->chunk_mask + 1);
+ du.chunk_size = umem->chunk_size_nohr + umem->headroom;
du.headroom = umem->headroom;
du.ifindex = umem->dev ? umem->dev->ifindex : 0;
du.queue_id = umem->queue_id;
@@ -97,6 +97,7 @@ static int xsk_diag_fill(struct sock *sk, struct sk_buff *nlskb,
msg->xdiag_ino = sk_ino;
sock_diag_save_cookie(sk, msg->xdiag_cookie);
+ mutex_lock(&xs->mutex);
if ((req->xdiag_show & XDP_SHOW_INFO) && xsk_diag_put_info(xs, nlskb))
goto out_nlmsg_trim;
@@ -117,10 +118,12 @@ static int xsk_diag_fill(struct sock *sk, struct sk_buff *nlskb,
sock_diag_put_meminfo(sk, nlskb, XDP_DIAG_MEMINFO))
goto out_nlmsg_trim;
+ mutex_unlock(&xs->mutex);
nlmsg_end(nlskb, nlh);
return 0;
out_nlmsg_trim:
+ mutex_unlock(&xs->mutex);
nlmsg_cancel(nlskb, nlh);
return -EMSGSIZE;
}
diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c
index b66504592d9b..c90e9c1e3c63 100644
--- a/net/xdp/xsk_queue.c
+++ b/net/xdp/xsk_queue.c
@@ -18,14 +18,14 @@ void xskq_set_umem(struct xsk_queue *q, u64 size, u64 chunk_mask)
q->chunk_mask = chunk_mask;
}
-static u32 xskq_umem_get_ring_size(struct xsk_queue *q)
+static size_t xskq_get_ring_size(struct xsk_queue *q, bool umem_queue)
{
- return sizeof(struct xdp_umem_ring) + q->nentries * sizeof(u64);
-}
+ struct xdp_umem_ring *umem_ring;
+ struct xdp_rxtx_ring *rxtx_ring;
-static u32 xskq_rxtx_get_ring_size(struct xsk_queue *q)
-{
- return sizeof(struct xdp_ring) + q->nentries * sizeof(struct xdp_desc);
+ if (umem_queue)
+ return struct_size(umem_ring, desc, q->nentries);
+ return struct_size(rxtx_ring, desc, q->nentries);
}
struct xsk_queue *xskq_create(u32 nentries, bool umem_queue)
@@ -43,8 +43,7 @@ struct xsk_queue *xskq_create(u32 nentries, bool umem_queue)
gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN |
__GFP_COMP | __GFP_NORETRY;
- size = umem_queue ? xskq_umem_get_ring_size(q) :
- xskq_rxtx_get_ring_size(q);
+ size = xskq_get_ring_size(q, umem_queue);
q->ring = (struct xdp_ring *)__get_free_pages(gfp_flags,
get_order(size));
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 909c5168ed0f..bec2af11853a 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -10,12 +10,10 @@
#include <linux/if_xdp.h>
#include <net/xdp_sock.h>
-#define RX_BATCH_SIZE 16
-#define LAZY_UPDATE_THRESHOLD 128
-
struct xdp_ring {
u32 producer ____cacheline_aligned_in_smp;
u32 consumer ____cacheline_aligned_in_smp;
+ u32 flags;
};
/* Used for the RX and TX queues for packets */
@@ -35,10 +33,8 @@ struct xsk_queue {
u64 size;
u32 ring_mask;
u32 nentries;
- u32 prod_head;
- u32 prod_tail;
- u32 cons_head;
- u32 cons_tail;
+ u32 cached_prod;
+ u32 cached_cons;
struct xdp_ring *ring;
u64 invalid_descs;
};
@@ -85,57 +81,116 @@ struct xsk_queue {
* now and again after circling through the ring.
*/
-/* Common functions operating for both RXTX and umem queues */
+/* The operations on the rings are the following:
+ *
+ * producer consumer
+ *
+ * RESERVE entries PEEK in the ring for entries
+ * WRITE data into the ring READ data from the ring
+ * SUBMIT entries RELEASE entries
+ *
+ * The producer reserves one or more entries in the ring. It can then
+ * fill in these entries and finally submit them so that they can be
+ * seen and read by the consumer.
+ *
+ * The consumer peeks into the ring to see if the producer has written
+ * any new entries. If so, the producer can then read these entries
+ * and when it is done reading them release them back to the producer
+ * so that the producer can use these slots to fill in new entries.
+ *
+ * The function names below reflect these operations.
+ */
+
+/* Functions that read and validate content from consumer rings. */
-static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q)
+static inline bool xskq_cons_crosses_non_contig_pg(struct xdp_umem *umem,
+ u64 addr,
+ u64 length)
{
- return q ? q->invalid_descs : 0;
+ bool cross_pg = (addr & (PAGE_SIZE - 1)) + length > PAGE_SIZE;
+ bool next_pg_contig =
+ (unsigned long)umem->pages[(addr >> PAGE_SHIFT)].addr &
+ XSK_NEXT_PG_CONTIG_MASK;
+
+ return cross_pg && !next_pg_contig;
}
-static inline u32 xskq_nb_avail(struct xsk_queue *q, u32 dcnt)
+static inline bool xskq_cons_is_valid_unaligned(struct xsk_queue *q,
+ u64 addr,
+ u64 length,
+ struct xdp_umem *umem)
{
- u32 entries = q->prod_tail - q->cons_tail;
+ u64 base_addr = xsk_umem_extract_addr(addr);
- if (entries == 0) {
- /* Refresh the local pointer */
- q->prod_tail = READ_ONCE(q->ring->producer);
- entries = q->prod_tail - q->cons_tail;
+ addr = xsk_umem_add_offset_to_addr(addr);
+ if (base_addr >= q->size || addr >= q->size ||
+ xskq_cons_crosses_non_contig_pg(umem, addr, length)) {
+ q->invalid_descs++;
+ return false;
}
- return (entries > dcnt) ? dcnt : entries;
+ return true;
}
-static inline u32 xskq_nb_free(struct xsk_queue *q, u32 producer, u32 dcnt)
+static inline bool xskq_cons_is_valid_addr(struct xsk_queue *q, u64 addr)
{
- u32 free_entries = q->nentries - (producer - q->cons_tail);
-
- if (free_entries >= dcnt)
- return free_entries;
+ if (addr >= q->size) {
+ q->invalid_descs++;
+ return false;
+ }
- /* Refresh the local tail pointer */
- q->cons_tail = READ_ONCE(q->ring->consumer);
- return q->nentries - (producer - q->cons_tail);
+ return true;
}
-static inline bool xskq_has_addrs(struct xsk_queue *q, u32 cnt)
+static inline bool xskq_cons_read_addr(struct xsk_queue *q, u64 *addr,
+ struct xdp_umem *umem)
{
- u32 entries = q->prod_tail - q->cons_tail;
+ struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
- if (entries >= cnt)
- return true;
+ while (q->cached_cons != q->cached_prod) {
+ u32 idx = q->cached_cons & q->ring_mask;
- /* Refresh the local pointer. */
- q->prod_tail = READ_ONCE(q->ring->producer);
- entries = q->prod_tail - q->cons_tail;
+ *addr = ring->desc[idx] & q->chunk_mask;
- return entries >= cnt;
-}
+ if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) {
+ if (xskq_cons_is_valid_unaligned(q, *addr,
+ umem->chunk_size_nohr,
+ umem))
+ return true;
+ goto out;
+ }
-/* UMEM queue */
+ if (xskq_cons_is_valid_addr(q, *addr))
+ return true;
-static inline bool xskq_is_valid_addr(struct xsk_queue *q, u64 addr)
+out:
+ q->cached_cons++;
+ }
+
+ return false;
+}
+
+static inline bool xskq_cons_is_valid_desc(struct xsk_queue *q,
+ struct xdp_desc *d,
+ struct xdp_umem *umem)
{
- if (addr >= q->size) {
+ if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) {
+ if (!xskq_cons_is_valid_unaligned(q, d->addr, d->len, umem))
+ return false;
+
+ if (d->len > umem->chunk_size_nohr || d->options) {
+ q->invalid_descs++;
+ return false;
+ }
+
+ return true;
+ }
+
+ if (!xskq_cons_is_valid_addr(q, d->addr))
+ return false;
+
+ if (((d->addr + d->len) & q->chunk_mask) != (d->addr & q->chunk_mask) ||
+ d->options) {
q->invalid_descs++;
return false;
}
@@ -143,177 +198,184 @@ static inline bool xskq_is_valid_addr(struct xsk_queue *q, u64 addr)
return true;
}
-static inline u64 *xskq_validate_addr(struct xsk_queue *q, u64 *addr)
+static inline bool xskq_cons_read_desc(struct xsk_queue *q,
+ struct xdp_desc *desc,
+ struct xdp_umem *umem)
{
- while (q->cons_tail != q->cons_head) {
- struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
- unsigned int idx = q->cons_tail & q->ring_mask;
+ while (q->cached_cons != q->cached_prod) {
+ struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring;
+ u32 idx = q->cached_cons & q->ring_mask;
- *addr = READ_ONCE(ring->desc[idx]) & q->chunk_mask;
- if (xskq_is_valid_addr(q, *addr))
- return addr;
+ *desc = ring->desc[idx];
+ if (xskq_cons_is_valid_desc(q, desc, umem))
+ return true;
- q->cons_tail++;
+ q->cached_cons++;
}
- return NULL;
+ return false;
}
-static inline u64 *xskq_peek_addr(struct xsk_queue *q, u64 *addr)
-{
- if (q->cons_tail == q->cons_head) {
- smp_mb(); /* D, matches A */
- WRITE_ONCE(q->ring->consumer, q->cons_tail);
- q->cons_head = q->cons_tail + xskq_nb_avail(q, RX_BATCH_SIZE);
-
- /* Order consumer and data */
- smp_rmb();
- }
+/* Functions for consumers */
- return xskq_validate_addr(q, addr);
+static inline void __xskq_cons_release(struct xsk_queue *q)
+{
+ smp_mb(); /* D, matches A */
+ WRITE_ONCE(q->ring->consumer, q->cached_cons);
}
-static inline void xskq_discard_addr(struct xsk_queue *q)
+static inline void __xskq_cons_peek(struct xsk_queue *q)
{
- q->cons_tail++;
+ /* Refresh the local pointer */
+ q->cached_prod = READ_ONCE(q->ring->producer);
+ smp_rmb(); /* C, matches B */
}
-static inline int xskq_produce_addr(struct xsk_queue *q, u64 addr)
+static inline void xskq_cons_get_entries(struct xsk_queue *q)
{
- struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
+ __xskq_cons_release(q);
+ __xskq_cons_peek(q);
+}
- if (xskq_nb_free(q, q->prod_tail, 1) == 0)
- return -ENOSPC;
+static inline bool xskq_cons_has_entries(struct xsk_queue *q, u32 cnt)
+{
+ u32 entries = q->cached_prod - q->cached_cons;
- /* A, matches D */
- ring->desc[q->prod_tail++ & q->ring_mask] = addr;
+ if (entries >= cnt)
+ return true;
- /* Order producer and data */
- smp_wmb(); /* B, matches C */
+ __xskq_cons_peek(q);
+ entries = q->cached_prod - q->cached_cons;
- WRITE_ONCE(q->ring->producer, q->prod_tail);
- return 0;
+ return entries >= cnt;
}
-static inline int xskq_produce_addr_lazy(struct xsk_queue *q, u64 addr)
+static inline bool xskq_cons_peek_addr(struct xsk_queue *q, u64 *addr,
+ struct xdp_umem *umem)
{
- struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
-
- if (xskq_nb_free(q, q->prod_head, LAZY_UPDATE_THRESHOLD) == 0)
- return -ENOSPC;
-
- /* A, matches D */
- ring->desc[q->prod_head++ & q->ring_mask] = addr;
- return 0;
+ if (q->cached_prod == q->cached_cons)
+ xskq_cons_get_entries(q);
+ return xskq_cons_read_addr(q, addr, umem);
}
-static inline void xskq_produce_flush_addr_n(struct xsk_queue *q,
- u32 nb_entries)
+static inline bool xskq_cons_peek_desc(struct xsk_queue *q,
+ struct xdp_desc *desc,
+ struct xdp_umem *umem)
{
- /* Order producer and data */
- smp_wmb(); /* B, matches C */
-
- q->prod_tail += nb_entries;
- WRITE_ONCE(q->ring->producer, q->prod_tail);
+ if (q->cached_prod == q->cached_cons)
+ xskq_cons_get_entries(q);
+ return xskq_cons_read_desc(q, desc, umem);
}
-static inline int xskq_reserve_addr(struct xsk_queue *q)
+static inline void xskq_cons_release(struct xsk_queue *q)
{
- if (xskq_nb_free(q, q->prod_head, 1) == 0)
- return -ENOSPC;
+ /* To improve performance, only update local state here.
+ * Reflect this to global state when we get new entries
+ * from the ring in xskq_cons_get_entries().
+ */
+ q->cached_cons++;
+}
- /* A, matches D */
- q->prod_head++;
- return 0;
+static inline bool xskq_cons_is_full(struct xsk_queue *q)
+{
+ /* No barriers needed since data is not accessed */
+ return READ_ONCE(q->ring->producer) - READ_ONCE(q->ring->consumer) ==
+ q->nentries;
}
-/* Rx/Tx queue */
+/* Functions for producers */
-static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d)
+static inline bool xskq_prod_is_full(struct xsk_queue *q)
{
- if (!xskq_is_valid_addr(q, d->addr))
- return false;
+ u32 free_entries = q->nentries - (q->cached_prod - q->cached_cons);
- if (((d->addr + d->len) & q->chunk_mask) != (d->addr & q->chunk_mask) ||
- d->options) {
- q->invalid_descs++;
+ if (free_entries)
return false;
- }
- return true;
+ /* Refresh the local tail pointer */
+ q->cached_cons = READ_ONCE(q->ring->consumer);
+ free_entries = q->nentries - (q->cached_prod - q->cached_cons);
+
+ return !free_entries;
}
-static inline struct xdp_desc *xskq_validate_desc(struct xsk_queue *q,
- struct xdp_desc *desc)
+static inline int xskq_prod_reserve(struct xsk_queue *q)
{
- while (q->cons_tail != q->cons_head) {
- struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring;
- unsigned int idx = q->cons_tail & q->ring_mask;
-
- *desc = READ_ONCE(ring->desc[idx]);
- if (xskq_is_valid_desc(q, desc))
- return desc;
-
- q->cons_tail++;
- }
+ if (xskq_prod_is_full(q))
+ return -ENOSPC;
- return NULL;
+ /* A, matches D */
+ q->cached_prod++;
+ return 0;
}
-static inline struct xdp_desc *xskq_peek_desc(struct xsk_queue *q,
- struct xdp_desc *desc)
+static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 addr)
{
- if (q->cons_tail == q->cons_head) {
- smp_mb(); /* D, matches A */
- WRITE_ONCE(q->ring->consumer, q->cons_tail);
- q->cons_head = q->cons_tail + xskq_nb_avail(q, RX_BATCH_SIZE);
-
- /* Order consumer and data */
- smp_rmb(); /* C, matches B */
- }
+ struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
- return xskq_validate_desc(q, desc);
-}
+ if (xskq_prod_is_full(q))
+ return -ENOSPC;
-static inline void xskq_discard_desc(struct xsk_queue *q)
-{
- q->cons_tail++;
+ /* A, matches D */
+ ring->desc[q->cached_prod++ & q->ring_mask] = addr;
+ return 0;
}
-static inline int xskq_produce_batch_desc(struct xsk_queue *q,
- u64 addr, u32 len)
+static inline int xskq_prod_reserve_desc(struct xsk_queue *q,
+ u64 addr, u32 len)
{
struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring;
- unsigned int idx;
+ u32 idx;
- if (xskq_nb_free(q, q->prod_head, 1) == 0)
+ if (xskq_prod_is_full(q))
return -ENOSPC;
/* A, matches D */
- idx = (q->prod_head++) & q->ring_mask;
+ idx = q->cached_prod++ & q->ring_mask;
ring->desc[idx].addr = addr;
ring->desc[idx].len = len;
return 0;
}
-static inline void xskq_produce_flush_desc(struct xsk_queue *q)
+static inline void __xskq_prod_submit(struct xsk_queue *q, u32 idx)
{
- /* Order producer and data */
smp_wmb(); /* B, matches C */
- q->prod_tail = q->prod_head;
- WRITE_ONCE(q->ring->producer, q->prod_tail);
+ WRITE_ONCE(q->ring->producer, idx);
+}
+
+static inline void xskq_prod_submit(struct xsk_queue *q)
+{
+ __xskq_prod_submit(q, q->cached_prod);
+}
+
+static inline void xskq_prod_submit_addr(struct xsk_queue *q, u64 addr)
+{
+ struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
+ u32 idx = q->ring->producer;
+
+ ring->desc[idx++ & q->ring_mask] = addr;
+
+ __xskq_prod_submit(q, idx);
}
-static inline bool xskq_full_desc(struct xsk_queue *q)
+static inline void xskq_prod_submit_n(struct xsk_queue *q, u32 nb_entries)
{
- return xskq_nb_avail(q, q->nentries) == q->nentries;
+ __xskq_prod_submit(q, q->ring->producer + nb_entries);
}
-static inline bool xskq_empty_desc(struct xsk_queue *q)
+static inline bool xskq_prod_is_empty(struct xsk_queue *q)
{
- return xskq_nb_free(q, q->prod_tail, q->nentries) == q->nentries;
+ /* No barriers needed since data is not accessed */
+ return READ_ONCE(q->ring->consumer) == READ_ONCE(q->ring->producer);
+}
+
+/* For both producers and consumers */
+
+static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q)
+{
+ return q ? q->invalid_descs : 0;
}
void xskq_set_umem(struct xsk_queue *q, u64 size, u64 chunk_mask);
diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig
index 51bb6018f3bf..6921a18201a0 100644
--- a/net/xfrm/Kconfig
+++ b/net/xfrm/Kconfig
@@ -3,20 +3,20 @@
# XFRM configuration
#
config XFRM
- bool
- depends on INET
- select GRO_CELLS
- select SKB_EXTENSIONS
+ bool
+ depends on INET
+ select GRO_CELLS
+ select SKB_EXTENSIONS
config XFRM_OFFLOAD
- bool
+ bool
config XFRM_ALGO
tristate
select XFRM
select CRYPTO
select CRYPTO_HASH
- select CRYPTO_BLKCIPHER
+ select CRYPTO_SKCIPHER
if INET
config XFRM_USER
diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile
index fbc4552d17b8..212a4fcb4a88 100644
--- a/net/xfrm/Makefile
+++ b/net/xfrm/Makefile
@@ -11,3 +11,4 @@ obj-$(CONFIG_XFRM_ALGO) += xfrm_algo.o
obj-$(CONFIG_XFRM_USER) += xfrm_user.o
obj-$(CONFIG_XFRM_IPCOMP) += xfrm_ipcomp.o
obj-$(CONFIG_XFRM_INTERFACE) += xfrm_interface.o
+obj-$(CONFIG_INET_ESPINTCP) += espintcp.o
diff --git a/net/xfrm/espintcp.c b/net/xfrm/espintcp.c
new file mode 100644
index 000000000000..f15d6a564b0e
--- /dev/null
+++ b/net/xfrm/espintcp.c
@@ -0,0 +1,509 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <net/tcp.h>
+#include <net/strparser.h>
+#include <net/xfrm.h>
+#include <net/esp.h>
+#include <net/espintcp.h>
+#include <linux/skmsg.h>
+#include <net/inet_common.h>
+
+static void handle_nonesp(struct espintcp_ctx *ctx, struct sk_buff *skb,
+ struct sock *sk)
+{
+ if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf ||
+ !sk_rmem_schedule(sk, skb, skb->truesize)) {
+ kfree_skb(skb);
+ return;
+ }
+
+ skb_set_owner_r(skb, sk);
+
+ memset(skb->cb, 0, sizeof(skb->cb));
+ skb_queue_tail(&ctx->ike_queue, skb);
+ ctx->saved_data_ready(sk);
+}
+
+static void handle_esp(struct sk_buff *skb, struct sock *sk)
+{
+ skb_reset_transport_header(skb);
+ memset(skb->cb, 0, sizeof(skb->cb));
+
+ rcu_read_lock();
+ skb->dev = dev_get_by_index_rcu(sock_net(sk), skb->skb_iif);
+ local_bh_disable();
+ xfrm4_rcv_encap(skb, IPPROTO_ESP, 0, TCP_ENCAP_ESPINTCP);
+ local_bh_enable();
+ rcu_read_unlock();
+}
+
+static void espintcp_rcv(struct strparser *strp, struct sk_buff *skb)
+{
+ struct espintcp_ctx *ctx = container_of(strp, struct espintcp_ctx,
+ strp);
+ struct strp_msg *rxm = strp_msg(skb);
+ u32 nonesp_marker;
+ int err;
+
+ err = skb_copy_bits(skb, rxm->offset + 2, &nonesp_marker,
+ sizeof(nonesp_marker));
+ if (err < 0) {
+ kfree_skb(skb);
+ return;
+ }
+
+ /* remove header, leave non-ESP marker/SPI */
+ if (!__pskb_pull(skb, rxm->offset + 2)) {
+ kfree_skb(skb);
+ return;
+ }
+
+ if (pskb_trim(skb, rxm->full_len - 2) != 0) {
+ kfree_skb(skb);
+ return;
+ }
+
+ if (nonesp_marker == 0)
+ handle_nonesp(ctx, skb, strp->sk);
+ else
+ handle_esp(skb, strp->sk);
+}
+
+static int espintcp_parse(struct strparser *strp, struct sk_buff *skb)
+{
+ struct strp_msg *rxm = strp_msg(skb);
+ __be16 blen;
+ u16 len;
+ int err;
+
+ if (skb->len < rxm->offset + 2)
+ return 0;
+
+ err = skb_copy_bits(skb, rxm->offset, &blen, sizeof(blen));
+ if (err < 0)
+ return err;
+
+ len = be16_to_cpu(blen);
+ if (len < 6)
+ return -EINVAL;
+
+ return len;
+}
+
+static int espintcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+ int nonblock, int flags, int *addr_len)
+{
+ struct espintcp_ctx *ctx = espintcp_getctx(sk);
+ struct sk_buff *skb;
+ int err = 0;
+ int copied;
+ int off = 0;
+
+ flags |= nonblock ? MSG_DONTWAIT : 0;
+
+ skb = __skb_recv_datagram(sk, &ctx->ike_queue, flags, NULL, &off, &err);
+ if (!skb)
+ return err;
+
+ copied = len;
+ if (copied > skb->len)
+ copied = skb->len;
+ else if (copied < skb->len)
+ msg->msg_flags |= MSG_TRUNC;
+
+ err = skb_copy_datagram_msg(skb, 0, msg, copied);
+ if (unlikely(err)) {
+ kfree_skb(skb);
+ return err;
+ }
+
+ if (flags & MSG_TRUNC)
+ copied = skb->len;
+ kfree_skb(skb);
+ return copied;
+}
+
+int espintcp_queue_out(struct sock *sk, struct sk_buff *skb)
+{
+ struct espintcp_ctx *ctx = espintcp_getctx(sk);
+
+ if (skb_queue_len(&ctx->out_queue) >= netdev_max_backlog)
+ return -ENOBUFS;
+
+ __skb_queue_tail(&ctx->out_queue, skb);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(espintcp_queue_out);
+
+/* espintcp length field is 2B and length includes the length field's size */
+#define MAX_ESPINTCP_MSG (((1 << 16) - 1) - 2)
+
+static int espintcp_sendskb_locked(struct sock *sk, struct espintcp_msg *emsg,
+ int flags)
+{
+ do {
+ int ret;
+
+ ret = skb_send_sock_locked(sk, emsg->skb,
+ emsg->offset, emsg->len);
+ if (ret < 0)
+ return ret;
+
+ emsg->len -= ret;
+ emsg->offset += ret;
+ } while (emsg->len > 0);
+
+ kfree_skb(emsg->skb);
+ memset(emsg, 0, sizeof(*emsg));
+
+ return 0;
+}
+
+static int espintcp_sendskmsg_locked(struct sock *sk,
+ struct espintcp_msg *emsg, int flags)
+{
+ struct sk_msg *skmsg = &emsg->skmsg;
+ struct scatterlist *sg;
+ int done = 0;
+ int ret;
+
+ flags |= MSG_SENDPAGE_NOTLAST;
+ sg = &skmsg->sg.data[skmsg->sg.start];
+ do {
+ size_t size = sg->length - emsg->offset;
+ int offset = sg->offset + emsg->offset;
+ struct page *p;
+
+ emsg->offset = 0;
+
+ if (sg_is_last(sg))
+ flags &= ~MSG_SENDPAGE_NOTLAST;
+
+ p = sg_page(sg);
+retry:
+ ret = do_tcp_sendpages(sk, p, offset, size, flags);
+ if (ret < 0) {
+ emsg->offset = offset - sg->offset;
+ skmsg->sg.start += done;
+ return ret;
+ }
+
+ if (ret != size) {
+ offset += ret;
+ size -= ret;
+ goto retry;
+ }
+
+ done++;
+ put_page(p);
+ sk_mem_uncharge(sk, sg->length);
+ sg = sg_next(sg);
+ } while (sg);
+
+ memset(emsg, 0, sizeof(*emsg));
+
+ return 0;
+}
+
+static int espintcp_push_msgs(struct sock *sk)
+{
+ struct espintcp_ctx *ctx = espintcp_getctx(sk);
+ struct espintcp_msg *emsg = &ctx->partial;
+ int err;
+
+ if (!emsg->len)
+ return 0;
+
+ if (ctx->tx_running)
+ return -EAGAIN;
+ ctx->tx_running = 1;
+
+ if (emsg->skb)
+ err = espintcp_sendskb_locked(sk, emsg, 0);
+ else
+ err = espintcp_sendskmsg_locked(sk, emsg, 0);
+ if (err == -EAGAIN) {
+ ctx->tx_running = 0;
+ return 0;
+ }
+ if (!err)
+ memset(emsg, 0, sizeof(*emsg));
+
+ ctx->tx_running = 0;
+
+ return err;
+}
+
+int espintcp_push_skb(struct sock *sk, struct sk_buff *skb)
+{
+ struct espintcp_ctx *ctx = espintcp_getctx(sk);
+ struct espintcp_msg *emsg = &ctx->partial;
+ unsigned int len;
+ int offset;
+
+ if (sk->sk_state != TCP_ESTABLISHED) {
+ kfree_skb(skb);
+ return -ECONNRESET;
+ }
+
+ offset = skb_transport_offset(skb);
+ len = skb->len - offset;
+
+ espintcp_push_msgs(sk);
+
+ if (emsg->len) {
+ kfree_skb(skb);
+ return -ENOBUFS;
+ }
+
+ skb_set_owner_w(skb, sk);
+
+ emsg->offset = offset;
+ emsg->len = len;
+ emsg->skb = skb;
+
+ espintcp_push_msgs(sk);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(espintcp_push_skb);
+
+static int espintcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
+{
+ long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
+ struct espintcp_ctx *ctx = espintcp_getctx(sk);
+ struct espintcp_msg *emsg = &ctx->partial;
+ struct iov_iter pfx_iter;
+ struct kvec pfx_iov = {};
+ size_t msglen = size + 2;
+ char buf[2] = {0};
+ int err, end;
+
+ if (msg->msg_flags)
+ return -EOPNOTSUPP;
+
+ if (size > MAX_ESPINTCP_MSG)
+ return -EMSGSIZE;
+
+ if (msg->msg_controllen)
+ return -EOPNOTSUPP;
+
+ lock_sock(sk);
+
+ err = espintcp_push_msgs(sk);
+ if (err < 0) {
+ err = -ENOBUFS;
+ goto unlock;
+ }
+
+ sk_msg_init(&emsg->skmsg);
+ while (1) {
+ /* only -ENOMEM is possible since we don't coalesce */
+ err = sk_msg_alloc(sk, &emsg->skmsg, msglen, 0);
+ if (!err)
+ break;
+
+ err = sk_stream_wait_memory(sk, &timeo);
+ if (err)
+ goto fail;
+ }
+
+ *((__be16 *)buf) = cpu_to_be16(msglen);
+ pfx_iov.iov_base = buf;
+ pfx_iov.iov_len = sizeof(buf);
+ iov_iter_kvec(&pfx_iter, WRITE, &pfx_iov, 1, pfx_iov.iov_len);
+
+ err = sk_msg_memcopy_from_iter(sk, &pfx_iter, &emsg->skmsg,
+ pfx_iov.iov_len);
+ if (err < 0)
+ goto fail;
+
+ err = sk_msg_memcopy_from_iter(sk, &msg->msg_iter, &emsg->skmsg, size);
+ if (err < 0)
+ goto fail;
+
+ end = emsg->skmsg.sg.end;
+ emsg->len = size;
+ sk_msg_iter_var_prev(end);
+ sg_mark_end(sk_msg_elem(&emsg->skmsg, end));
+
+ tcp_rate_check_app_limited(sk);
+
+ err = espintcp_push_msgs(sk);
+ /* this message could be partially sent, keep it */
+ if (err < 0)
+ goto unlock;
+ release_sock(sk);
+
+ return size;
+
+fail:
+ sk_msg_free(sk, &emsg->skmsg);
+ memset(emsg, 0, sizeof(*emsg));
+unlock:
+ release_sock(sk);
+ return err;
+}
+
+static struct proto espintcp_prot __ro_after_init;
+static struct proto_ops espintcp_ops __ro_after_init;
+
+static void espintcp_data_ready(struct sock *sk)
+{
+ struct espintcp_ctx *ctx = espintcp_getctx(sk);
+
+ strp_data_ready(&ctx->strp);
+}
+
+static void espintcp_tx_work(struct work_struct *work)
+{
+ struct espintcp_ctx *ctx = container_of(work,
+ struct espintcp_ctx, work);
+ struct sock *sk = ctx->strp.sk;
+
+ lock_sock(sk);
+ if (!ctx->tx_running)
+ espintcp_push_msgs(sk);
+ release_sock(sk);
+}
+
+static void espintcp_write_space(struct sock *sk)
+{
+ struct espintcp_ctx *ctx = espintcp_getctx(sk);
+
+ schedule_work(&ctx->work);
+ ctx->saved_write_space(sk);
+}
+
+static void espintcp_destruct(struct sock *sk)
+{
+ struct espintcp_ctx *ctx = espintcp_getctx(sk);
+
+ kfree(ctx);
+}
+
+bool tcp_is_ulp_esp(struct sock *sk)
+{
+ return sk->sk_prot == &espintcp_prot;
+}
+EXPORT_SYMBOL_GPL(tcp_is_ulp_esp);
+
+static int espintcp_init_sk(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct strp_callbacks cb = {
+ .rcv_msg = espintcp_rcv,
+ .parse_msg = espintcp_parse,
+ };
+ struct espintcp_ctx *ctx;
+ int err;
+
+ /* sockmap is not compatible with espintcp */
+ if (sk->sk_user_data)
+ return -EBUSY;
+
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ err = strp_init(&ctx->strp, sk, &cb);
+ if (err)
+ goto free;
+
+ __sk_dst_reset(sk);
+
+ strp_check_rcv(&ctx->strp);
+ skb_queue_head_init(&ctx->ike_queue);
+ skb_queue_head_init(&ctx->out_queue);
+ sk->sk_prot = &espintcp_prot;
+ sk->sk_socket->ops = &espintcp_ops;
+ ctx->saved_data_ready = sk->sk_data_ready;
+ ctx->saved_write_space = sk->sk_write_space;
+ sk->sk_data_ready = espintcp_data_ready;
+ sk->sk_write_space = espintcp_write_space;
+ sk->sk_destruct = espintcp_destruct;
+ rcu_assign_pointer(icsk->icsk_ulp_data, ctx);
+ INIT_WORK(&ctx->work, espintcp_tx_work);
+
+ /* avoid using task_frag */
+ sk->sk_allocation = GFP_ATOMIC;
+
+ return 0;
+
+free:
+ kfree(ctx);
+ return err;
+}
+
+static void espintcp_release(struct sock *sk)
+{
+ struct espintcp_ctx *ctx = espintcp_getctx(sk);
+ struct sk_buff_head queue;
+ struct sk_buff *skb;
+
+ __skb_queue_head_init(&queue);
+ skb_queue_splice_init(&ctx->out_queue, &queue);
+
+ while ((skb = __skb_dequeue(&queue)))
+ espintcp_push_skb(sk, skb);
+
+ tcp_release_cb(sk);
+}
+
+static void espintcp_close(struct sock *sk, long timeout)
+{
+ struct espintcp_ctx *ctx = espintcp_getctx(sk);
+ struct espintcp_msg *emsg = &ctx->partial;
+
+ strp_stop(&ctx->strp);
+
+ sk->sk_prot = &tcp_prot;
+ barrier();
+
+ cancel_work_sync(&ctx->work);
+ strp_done(&ctx->strp);
+
+ skb_queue_purge(&ctx->out_queue);
+ skb_queue_purge(&ctx->ike_queue);
+
+ if (emsg->len) {
+ if (emsg->skb)
+ kfree_skb(emsg->skb);
+ else
+ sk_msg_free(sk, &emsg->skmsg);
+ }
+
+ tcp_close(sk, timeout);
+}
+
+static __poll_t espintcp_poll(struct file *file, struct socket *sock,
+ poll_table *wait)
+{
+ __poll_t mask = datagram_poll(file, sock, wait);
+ struct sock *sk = sock->sk;
+ struct espintcp_ctx *ctx = espintcp_getctx(sk);
+
+ if (!skb_queue_empty(&ctx->ike_queue))
+ mask |= EPOLLIN | EPOLLRDNORM;
+
+ return mask;
+}
+
+static struct tcp_ulp_ops espintcp_ulp __read_mostly = {
+ .name = "espintcp",
+ .owner = THIS_MODULE,
+ .init = espintcp_init_sk,
+};
+
+void __init espintcp_init(void)
+{
+ memcpy(&espintcp_prot, &tcp_prot, sizeof(tcp_prot));
+ memcpy(&espintcp_ops, &inet_stream_ops, sizeof(inet_stream_ops));
+ espintcp_prot.sendmsg = espintcp_sendmsg;
+ espintcp_prot.recvmsg = espintcp_recvmsg;
+ espintcp_prot.close = espintcp_close;
+ espintcp_prot.release_cb = espintcp_release;
+ espintcp_ops.poll = espintcp_poll;
+
+ tcp_register_ulp(&espintcp_ulp);
+}
diff --git a/net/xfrm/xfrm_algo.c b/net/xfrm/xfrm_algo.c
index 32a378e7011f..4dae3ab8d030 100644
--- a/net/xfrm/xfrm_algo.c
+++ b/net/xfrm/xfrm_algo.c
@@ -626,8 +626,8 @@ static const struct xfrm_algo_list xfrm_aalg_list = {
static const struct xfrm_algo_list xfrm_ealg_list = {
.algs = ealg_list,
.entries = ARRAY_SIZE(ealg_list),
- .type = CRYPTO_ALG_TYPE_BLKCIPHER,
- .mask = CRYPTO_ALG_TYPE_BLKCIPHER_MASK,
+ .type = CRYPTO_ALG_TYPE_SKCIPHER,
+ .mask = CRYPTO_ALG_TYPE_MASK,
};
static const struct xfrm_algo_list xfrm_calg_list = {
diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
index 189ef15acbbc..50f567a88f45 100644
--- a/net/xfrm/xfrm_device.c
+++ b/net/xfrm/xfrm_device.c
@@ -78,7 +78,7 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur
int err;
unsigned long flags;
struct xfrm_state *x;
- struct sk_buff *skb2;
+ struct sk_buff *skb2, *nskb;
struct softnet_data *sd;
netdev_features_t esp_features = features;
struct xfrm_offload *xo = xfrm_offload(skb);
@@ -148,11 +148,7 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur
return skb;
}
- skb2 = skb;
-
- do {
- struct sk_buff *nskb = skb2->next;
-
+ skb_list_walk_safe(skb, skb2, nskb) {
esp_features |= skb->dev->gso_partial_features;
skb_mark_not_on_list(skb2);
@@ -176,14 +172,11 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur
if (!skb)
return NULL;
- goto skip_push;
+ continue;
}
skb_push(skb2, skb2->data - skb_mac_header(skb2));
-
-skip_push:
- skb2 = nskb;
- } while (skb2);
+ }
return skb;
}
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index 6088bc2dc11e..aa35f23c4912 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -36,6 +36,7 @@ struct xfrm_trans_cb {
#endif
} header;
int (*finish)(struct net *net, struct sock *sk, struct sk_buff *skb);
+ struct net *net;
};
#define XFRM_TRANS_SKB_CB(__skb) ((struct xfrm_trans_cb *)&((__skb)->cb[0]))
@@ -480,6 +481,9 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
else
XFRM_INC_STATS(net,
LINUX_MIB_XFRMINSTATEINVALID);
+
+ if (encap_type == -1)
+ dev_put(skb->dev);
goto drop;
}
@@ -706,7 +710,7 @@ resume:
if (err)
goto drop;
- nf_reset(skb);
+ nf_reset_ct(skb);
if (decaps) {
sp = skb_sec_path(skb);
@@ -763,12 +767,13 @@ static void xfrm_trans_reinject(unsigned long data)
skb_queue_splice_init(&trans->queue, &queue);
while ((skb = __skb_dequeue(&queue)))
- XFRM_TRANS_SKB_CB(skb)->finish(dev_net(skb->dev), NULL, skb);
+ XFRM_TRANS_SKB_CB(skb)->finish(XFRM_TRANS_SKB_CB(skb)->net,
+ NULL, skb);
}
-int xfrm_trans_queue(struct sk_buff *skb,
- int (*finish)(struct net *, struct sock *,
- struct sk_buff *))
+int xfrm_trans_queue_net(struct net *net, struct sk_buff *skb,
+ int (*finish)(struct net *, struct sock *,
+ struct sk_buff *))
{
struct xfrm_trans_tasklet *trans;
@@ -777,11 +782,22 @@ int xfrm_trans_queue(struct sk_buff *skb,
if (skb_queue_len(&trans->queue) >= netdev_max_backlog)
return -ENOBUFS;
+ BUILD_BUG_ON(sizeof(struct xfrm_trans_cb) > sizeof(skb->cb));
+
XFRM_TRANS_SKB_CB(skb)->finish = finish;
+ XFRM_TRANS_SKB_CB(skb)->net = net;
__skb_queue_tail(&trans->queue, skb);
tasklet_schedule(&trans->tasklet);
return 0;
}
+EXPORT_SYMBOL(xfrm_trans_queue_net);
+
+int xfrm_trans_queue(struct sk_buff *skb,
+ int (*finish)(struct net *, struct sock *,
+ struct sk_buff *))
+{
+ return xfrm_trans_queue_net(dev_net(skb->dev), skb, finish);
+}
EXPORT_SYMBOL(xfrm_trans_queue);
void __init xfrm_input_init(void)
diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c
index 74868f9d81fb..3361e3ac5714 100644
--- a/net/xfrm/xfrm_interface.c
+++ b/net/xfrm/xfrm_interface.c
@@ -145,8 +145,6 @@ static int xfrmi_create(struct net_device *dev)
if (err < 0)
goto out;
- strcpy(xi->p.name, dev->name);
-
dev_hold(dev);
xfrmi_link(xfrmn, xi);
@@ -177,7 +175,6 @@ static void xfrmi_dev_uninit(struct net_device *dev)
struct xfrmi_net *xfrmn = net_generic(xi->net, xfrmi_net_id);
xfrmi_unlink(xfrmn, xi);
- dev_put(xi->phydev);
dev_put(dev);
}
@@ -188,7 +185,7 @@ static void xfrmi_scrub_packet(struct sk_buff *skb, bool xnet)
skb->skb_iif = 0;
skb->ignore_df = 0;
skb_dst_drop(skb);
- nf_reset(skb);
+ nf_reset_ct(skb);
nf_reset_trace(skb);
if (!xnet)
@@ -271,9 +268,6 @@ xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl)
int err = -1;
int mtu;
- if (!dst)
- goto tx_err_link_failure;
-
dst_hold(dst);
dst = xfrm_lookup_with_ifid(xi->net, dst, fl, NULL, 0, xi->p.if_id);
if (IS_ERR(dst)) {
@@ -294,22 +288,22 @@ xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl)
if (tdev == dev) {
stats->collisions++;
net_warn_ratelimited("%s: Local routing loop detected!\n",
- xi->p.name);
+ dev->name);
goto tx_err_dst_release;
}
mtu = dst_mtu(dst);
if (!skb->ignore_df && skb->len > mtu) {
- skb_dst_update_pmtu(skb, mtu);
+ skb_dst_update_pmtu_no_confirm(skb, mtu);
if (skb->protocol == htons(ETH_P_IPV6)) {
if (mtu < IPV6_MIN_MTU)
mtu = IPV6_MIN_MTU;
- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
} else {
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
- htonl(mtu));
+ icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
+ htonl(mtu));
}
dst_release(dst);
@@ -346,6 +340,7 @@ static netdev_tx_t xfrmi_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct xfrm_if *xi = netdev_priv(dev);
struct net_device_stats *stats = &xi->dev->stats;
+ struct dst_entry *dst = skb_dst(skb);
struct flowi fl;
int ret;
@@ -355,16 +350,39 @@ static netdev_tx_t xfrmi_xmit(struct sk_buff *skb, struct net_device *dev)
case htons(ETH_P_IPV6):
xfrm_decode_session(skb, &fl, AF_INET6);
memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));
+ if (!dst) {
+ fl.u.ip6.flowi6_oif = dev->ifindex;
+ fl.u.ip6.flowi6_flags |= FLOWI_FLAG_ANYSRC;
+ dst = ip6_route_output(dev_net(dev), NULL, &fl.u.ip6);
+ if (dst->error) {
+ dst_release(dst);
+ stats->tx_carrier_errors++;
+ goto tx_err;
+ }
+ skb_dst_set(skb, dst);
+ }
break;
case htons(ETH_P_IP):
xfrm_decode_session(skb, &fl, AF_INET);
memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
+ if (!dst) {
+ struct rtable *rt;
+
+ fl.u.ip4.flowi4_oif = dev->ifindex;
+ fl.u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC;
+ rt = __ip_route_output_key(dev_net(dev), &fl.u.ip4);
+ if (IS_ERR(rt)) {
+ stats->tx_carrier_errors++;
+ goto tx_err;
+ }
+ skb_dst_set(skb, &rt->dst);
+ }
break;
default:
goto tx_err;
}
- fl.flowi_oif = xi->phydev->ifindex;
+ fl.flowi_oif = xi->p.link;
ret = xfrmi_xmit2(skb, dev, &fl);
if (ret < 0)
@@ -505,7 +523,7 @@ static int xfrmi_change(struct xfrm_if *xi, const struct xfrm_if_parms *p)
static int xfrmi_update(struct xfrm_if *xi, struct xfrm_if_parms *p)
{
- struct net *net = dev_net(xi->dev);
+ struct net *net = xi->net;
struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id);
int err;
@@ -550,7 +568,7 @@ static int xfrmi_get_iflink(const struct net_device *dev)
{
struct xfrm_if *xi = netdev_priv(dev);
- return xi->phydev->ifindex;
+ return xi->p.link;
}
@@ -566,22 +584,21 @@ static void xfrmi_dev_setup(struct net_device *dev)
{
dev->netdev_ops = &xfrmi_netdev_ops;
dev->type = ARPHRD_NONE;
- dev->hard_header_len = ETH_HLEN;
- dev->min_header_len = ETH_HLEN;
dev->mtu = ETH_DATA_LEN;
dev->min_mtu = ETH_MIN_MTU;
- dev->max_mtu = ETH_DATA_LEN;
- dev->addr_len = ETH_ALEN;
+ dev->max_mtu = IP_MAX_MTU;
dev->flags = IFF_NOARP;
dev->needs_free_netdev = true;
dev->priv_destructor = xfrmi_dev_free;
netif_keep_dst(dev);
+
+ eth_broadcast_addr(dev->broadcast);
}
static int xfrmi_dev_init(struct net_device *dev)
{
struct xfrm_if *xi = netdev_priv(dev);
- struct net_device *phydev = xi->phydev;
+ struct net_device *phydev = __dev_get_by_index(xi->net, xi->p.link);
int err;
dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
@@ -596,13 +613,19 @@ static int xfrmi_dev_init(struct net_device *dev)
dev->features |= NETIF_F_LLTX;
- dev->needed_headroom = phydev->needed_headroom;
- dev->needed_tailroom = phydev->needed_tailroom;
+ if (phydev) {
+ dev->needed_headroom = phydev->needed_headroom;
+ dev->needed_tailroom = phydev->needed_tailroom;
- if (is_zero_ether_addr(dev->dev_addr))
- eth_hw_addr_inherit(dev, phydev);
- if (is_zero_ether_addr(dev->broadcast))
- memcpy(dev->broadcast, phydev->broadcast, dev->addr_len);
+ if (is_zero_ether_addr(dev->dev_addr))
+ eth_hw_addr_inherit(dev, phydev);
+ if (is_zero_ether_addr(dev->broadcast))
+ memcpy(dev->broadcast, phydev->broadcast,
+ dev->addr_len);
+ } else {
+ eth_hw_addr_random(dev);
+ eth_broadcast_addr(dev->broadcast);
+ }
return 0;
}
@@ -638,12 +661,6 @@ static int xfrmi_newlink(struct net *src_net, struct net_device *dev,
int err;
xfrmi_netlink_parms(data, &p);
-
- if (!tb[IFLA_IFNAME])
- return -EINVAL;
-
- nla_strlcpy(p.name, tb[IFLA_IFNAME], IFNAMSIZ);
-
xi = xfrmi_locate(net, &p);
if (xi)
return -EEXIST;
@@ -652,13 +669,8 @@ static int xfrmi_newlink(struct net *src_net, struct net_device *dev,
xi->p = p;
xi->net = net;
xi->dev = dev;
- xi->phydev = dev_get_by_index(net, p.link);
- if (!xi->phydev)
- return -ENODEV;
err = xfrmi_create(dev);
- if (err < 0)
- dev_put(xi->phydev);
return err;
}
@@ -672,11 +684,11 @@ static int xfrmi_changelink(struct net_device *dev, struct nlattr *tb[],
struct netlink_ext_ack *extack)
{
struct xfrm_if *xi = netdev_priv(dev);
- struct net *net = dev_net(dev);
-
- xfrmi_netlink_parms(data, &xi->p);
+ struct net *net = xi->net;
+ struct xfrm_if_parms p;
- xi = xfrmi_locate(net, &xi->p);
+ xfrmi_netlink_parms(data, &p);
+ xi = xfrmi_locate(net, &p);
if (!xi) {
xi = netdev_priv(dev);
} else {
@@ -684,7 +696,7 @@ static int xfrmi_changelink(struct net_device *dev, struct nlattr *tb[],
return -EEXIST;
}
- return xfrmi_update(xi, &xi->p);
+ return xfrmi_update(xi, &p);
}
static size_t xfrmi_get_size(const struct net_device *dev)
@@ -715,7 +727,7 @@ static struct net *xfrmi_get_link_net(const struct net_device *dev)
{
struct xfrm_if *xi = netdev_priv(dev);
- return dev_net(xi->phydev);
+ return xi->net;
}
static const struct nla_policy xfrmi_policy[IFLA_XFRM_MAX + 1] = {
@@ -738,30 +750,7 @@ static struct rtnl_link_ops xfrmi_link_ops __read_mostly = {
.get_link_net = xfrmi_get_link_net,
};
-static void __net_exit xfrmi_destroy_interfaces(struct xfrmi_net *xfrmn)
-{
- struct xfrm_if *xi;
- LIST_HEAD(list);
-
- xi = rtnl_dereference(xfrmn->xfrmi[0]);
- if (!xi)
- return;
-
- unregister_netdevice_queue(xi->dev, &list);
- unregister_netdevice_many(&list);
-}
-
-static void __net_exit xfrmi_exit_net(struct net *net)
-{
- struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id);
-
- rtnl_lock();
- xfrmi_destroy_interfaces(xfrmn);
- rtnl_unlock();
-}
-
static struct pernet_operations xfrmi_net_ops = {
- .exit = xfrmi_exit_net,
.id = &xfrmi_net_id,
.size = sizeof(struct xfrmi_net),
};
diff --git a/net/xfrm/xfrm_ipcomp.c b/net/xfrm/xfrm_ipcomp.c
index 32c364d3bfb3..4d422447aadc 100644
--- a/net/xfrm/xfrm_ipcomp.c
+++ b/net/xfrm/xfrm_ipcomp.c
@@ -85,7 +85,7 @@ static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb)
if (dlen < len)
len = dlen;
- frag->page_offset = 0;
+ skb_frag_off_set(frag, 0);
skb_frag_size_set(frag, len);
memcpy(skb_frag_address(frag), scratch, len);
diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c
index 9499b35feb92..fafc7aba705f 100644
--- a/net/xfrm/xfrm_output.c
+++ b/net/xfrm/xfrm_output.c
@@ -502,7 +502,7 @@ int xfrm_output_resume(struct sk_buff *skb, int err)
struct net *net = xs_net(skb_dst(skb)->xfrm);
while (likely((err = xfrm_output_one(skb, err)) == 0)) {
- nf_reset(skb);
+ nf_reset_ct(skb);
err = skb_dst(skb)->ops->local_out(net, skb->sk, skb);
if (unlikely(err != 1))
@@ -533,7 +533,7 @@ static int xfrm_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
static int xfrm_output_gso(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- struct sk_buff *segs;
+ struct sk_buff *segs, *nskb;
BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_SGO_CB_OFFSET);
BUILD_BUG_ON(sizeof(*IP6CB(skb)) > SKB_SGO_CB_OFFSET);
@@ -544,8 +544,7 @@ static int xfrm_output_gso(struct net *net, struct sock *sk, struct sk_buff *skb
if (segs == NULL)
return -EINVAL;
- do {
- struct sk_buff *nskb = segs->next;
+ skb_list_walk_safe(segs, segs, nskb) {
int err;
skb_mark_not_on_list(segs);
@@ -555,9 +554,7 @@ static int xfrm_output_gso(struct net *net, struct sock *sk, struct sk_buff *skb
kfree_skb_list(nskb);
return err;
}
-
- segs = nskb;
- } while (segs);
+ }
return 0;
}
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 8ca637a72697..dbda08ec566e 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -39,6 +39,9 @@
#ifdef CONFIG_XFRM_STATISTICS
#include <net/snmp.h>
#endif
+#ifdef CONFIG_INET_ESPINTCP
+#include <net/espintcp.h>
+#endif
#include "xfrm_hash.h"
@@ -912,6 +915,7 @@ restart:
} else if (delta > 0) {
p = &parent->rb_right;
} else {
+ bool same_prefixlen = node->prefixlen == n->prefixlen;
struct xfrm_policy *tmp;
hlist_for_each_entry(tmp, &n->hhead, bydst) {
@@ -919,9 +923,11 @@ restart:
hlist_del_rcu(&tmp->bydst);
}
+ node->prefixlen = prefixlen;
+
xfrm_policy_inexact_list_reinsert(net, node, family);
- if (node->prefixlen == n->prefixlen) {
+ if (same_prefixlen) {
kfree_rcu(n, rcu);
return;
}
@@ -929,7 +935,6 @@ restart:
rb_erase(*p, new);
kfree_rcu(n, rcu);
n = node;
- n->prefixlen = prefixlen;
goto restart;
}
}
@@ -2806,7 +2811,7 @@ static void xfrm_policy_queue_process(struct timer_list *t)
continue;
}
- nf_reset(skb);
+ nf_reset_ct(skb);
skb_dst_drop(skb);
skb_dst_set(skb, dst);
@@ -3184,7 +3189,7 @@ struct dst_entry *xfrm_lookup_route(struct net *net, struct dst_entry *dst_orig,
flags | XFRM_LOOKUP_QUEUE |
XFRM_LOOKUP_KEEP_DST_REF);
- if (IS_ERR(dst) && PTR_ERR(dst) == -EREMOTE)
+ if (PTR_ERR(dst) == -EREMOTE)
return make_blackhole(net, dst_orig->ops->family, dst_orig);
if (IS_ERR(dst))
@@ -3269,7 +3274,7 @@ decode_session4(struct sk_buff *skb, struct flowi *fl, bool reverse)
struct flowi4 *fl4 = &fl->u.ip4;
int oif = 0;
- if (skb_dst(skb))
+ if (skb_dst(skb) && skb_dst(skb)->dev)
oif = skb_dst(skb)->dev->ifindex;
memset(fl4, 0, sizeof(struct flowi4));
@@ -3387,7 +3392,7 @@ decode_session6(struct sk_buff *skb, struct flowi *fl, bool reverse)
nexthdr = nh[nhoff];
- if (skb_dst(skb))
+ if (skb_dst(skb) && skb_dst(skb)->dev)
oif = skb_dst(skb)->dev->ifindex;
memset(fl6, 0, sizeof(struct flowi6));
@@ -4155,6 +4160,10 @@ void __init xfrm_init(void)
seqcount_init(&xfrm_policy_hash_generation);
xfrm_input_init();
+#ifdef CONFIG_INET_ESPINTCP
+ espintcp_init();
+#endif
+
RCU_INIT_POINTER(xfrm_if_cb, NULL);
synchronize_rcu();
}
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index c6f3c4a1bd99..170d6e7f31d3 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -495,6 +495,8 @@ static void ___xfrm_state_destroy(struct xfrm_state *x)
x->type->destructor(x);
xfrm_put_type(x->type);
}
+ if (x->xfrag.page)
+ put_page(x->xfrag.page);
xfrm_dev_state_free(x);
security_xfrm_state_free(x);
xfrm_state_free(x);
@@ -668,6 +670,9 @@ int __xfrm_state_delete(struct xfrm_state *x)
net->xfrm.state_num--;
spin_unlock(&net->xfrm.xfrm_state_lock);
+ if (x->encap_sk)
+ sock_put(rcu_dereference_raw(x->encap_sk));
+
xfrm_dev_state_delete(x);
/* All xfrm_state objects are created by xfrm_state_alloc.
OpenPOWER on IntegriCloud